==================================================Ascend ============================= test session starts ============================== platform linux -- Python 3.9.19, pytest-6.2.5, py-1.11.0, pluggy-1.5.0 rootdir: /home/jenkins/mindspore/testcases/testcases/tests/st/ops/allcases_onecard, configfile: ../../../../../../../sault/virtual_test/virtualenv_002/sault/config/pytest.ini plugins: mock-3.14.0, hydra-core-1.3.2, forked-1.6.0, anyio-4.9.0, xdist-1.32.0 collected 1 item test_ops_group_cases.py platform: ascend910b, max workers: 8, memory threshold: 51200M level: level0, group_name: ops Start group testing... ops group_cases_0 with 8 cases start to running, all cases are below: case: (, 0) case: (, 1) case: (, 0) case: (, 1) case: (, 'KBK') case: (, 'PYBOOST') case: (, 0) case: (, 1) ops group_cases_0 total running memory: 516M, memory threshold: 51200M TotalTime = 0.995037, [21] [bootstrap]: 0.00113035 [type_inference]: 0.972286 [event_method]: 0.00038469 [auto_monad]: 0.00016831 [graph_reusing]: 6.64999e-06 [inline]: 3.16001e-06 [add_attr]: 0.00926678, [1] [add_attr_with_inline]: 0.00924648, [1] [Cycle 1]: 0.00017525, [2] [tag_attr]: 5.073e-05 [meta_addattr_fg_expand]: 1.6e-05 [parallel-infer-symbol]: 4.03001e-06 [pre_auto_parallel]: 6.736e-05 [insert-virtual-dataset]: 3.04999e-06 [parallel-infer-symbol-second]: 8.80013e-07 [dataset_repeat_opt]: 2.47001e-06 [pipeline_split]: 1.73997e-06 [optimize]: 0.0106897, [53] [py_interpret_to_execute]: 8.77999e-06 [rewriter_before_opt_a]: 0.00037639 [opt_a]: 0.00744777, [2] [Cycle 1]: 0.00597586, [45] [expand_dump_flag]: 4e-06 [switch_simplify]: 9.077e-05 [loop_unroll]: 4.395e-05 [a_1]: 0.00078436 [with_stream_mark]: 2.279e-05 [recompute_prepare]: 1.338e-05 [updatestate_depend_eliminate]: 1.508e-05 [updatestate_assign_eliminate]: 1.146e-05 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 2.09999e-06 [a_2]: 0.00013605 [accelerated_algorithm]: 1.006e-05 [shard]: 1.96e-06 [meta_shard_fg_expand]: 2.69999e-06 [shard_inline]: 1.019e-05 [merge_send_recv]: 4.205e-05 [auto_parallel]: 9.64e-06 [parallel]: 7.922e-05 [flash_sp]: 3.552e-05 [merge_comm]: 5.32999e-06 [allreduce_fusion]: 1.185e-05 [matmul_add_comm_reduction]: 1.922e-05 [allreduce_slice_to_reducescatter]: 8.40001e-06 [virtual_shard_identity]: 1.635e-05 [virtual_dataset]: 1.114e-05 [get_grad_eliminate_]: 1.04e-05 [virtual_output]: 1.037e-05 [merge_forward]: 4.35999e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 1.878e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.777e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.474e-05 [set_forward_comm_id_for_comm_node_pass]: 1.312e-05 [meta_fg_expand]: 3.91001e-06 [flash_sp_send_recv_attached]: 2.91e-06 [receive_attached]: 1.715e-05 [after_resolve]: 1.873e-05 [a_after_grad]: 1.685e-05 [renormalize]: 0.00393276 [add_forward_monad_depend]: 9.05001e-06 [auto_monad_grad]: 2.79999e-06 [auto_monad_eliminator]: 3.263e-05 [cse]: 6.608e-05 [a_3]: 8.575e-05 [Cycle 2]: 0.00145763, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 1.283e-05 [loop_unroll]: 0.00041381 [a_1]: 0.00025536 [with_stream_mark]: 2.992e-05 [recompute_prepare]: 1.175e-05 [updatestate_depend_eliminate]: 5.52001e-06 [updatestate_assign_eliminate]: 3.98999e-06 [updatestate_loads_eliminate]: 3.82998e-06 [parameter_eliminate]: 1.95001e-06 [a_2]: 0.00012295 [accelerated_algorithm]: 1.052e-05 [shard]: 2.99999e-06 [meta_shard_fg_expand]: 2.71999e-06 [shard_inline]: 9.69999e-06 [merge_send_recv]: 9.84999e-06 [auto_parallel]: 1.296e-05 [parallel]: 1.048e-05 [flash_sp]: 5.10001e-06 [merge_comm]: 4.58999e-06 [allreduce_fusion]: 3.83001e-06 [matmul_add_comm_reduction]: 1.275e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 1.282e-05 [virtual_dataset]: 9.40001e-06 [get_grad_eliminate_]: 9.59e-06 [virtual_output]: 9.52999e-06 [merge_forward]: 5.86998e-06 [cell_reuse_recompute_pass]: 3.01999e-06 [offload_activation]: 1.179e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.129e-05 [merge_recompute_call_nodes]: 1.88997e-06 [before_grad]: 1.434e-05 [set_forward_comm_id_for_comm_node_pass]: 4.67e-06 [meta_fg_expand]: 3.76999e-06 [flash_sp_send_recv_attached]: 1.71e-06 [receive_attached]: 3.4e-06 [after_resolve]: 1.852e-05 [a_after_grad]: 1.583e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 3.00002e-06 [auto_monad_grad]: 1.96e-06 [auto_monad_eliminator]: 1.504e-05 [cse]: 3.954e-05 [a_3]: 6.107e-05 [py_interpret_to_execute_after_opt_a]: 8.05e-06 [slice_cell_reuse_recomputed_activation]: 1.89e-06 [rewriter_after_opt_a]: 3.908e-05 [convert_after_rewriter]: 1.23002e-06 [order_py_execute_after_rewriter]: 1.05999e-06 [mutable_eliminate]: 0.00079806 [opt_b]: 0.00030935, [1] [Cycle 1]: 0.00030027, [7] [b_1]: 0.00019902 [b_2]: 1.199e-05 [updatestate_depend_eliminate]: 1.009e-05 [updatestate_assign_eliminate]: 3.95998e-06 [updatestate_loads_eliminate]: 3.22002e-06 [renormalize]: 8.09989e-07 [cse]: 3.403e-05 [optimize_parallel_all_gather_comm]: 3.155e-05 [overlap_param_gather]: 1.033e-05 [cconv]: 4.101e-05 [loop_unroll]: 0.00050324 [opt_after_cconv]: 0.00019244, [1] [Cycle 1]: 0.00018544, [7] [c_1]: 9.488e-05 [parameter_eliminate]: 4.84e-06 [updatestate_depend_eliminate]: 8.11002e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.26999e-06 [cse]: 3.176e-05 [renormalize]: 4.60015e-07 [remove_dup_value]: 2.173e-05 [tuple_transform]: 0.00011807, [1] [Cycle 1]: 0.00011205, [4] [d_1]: 7.687e-05 [none_parameter_eliminate]: 1.76998e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 1.25e-05 [partial_unused_args_eliminate]: 2.09e-06 [add_recomputation]: 7.747e-05 [cse_after_recomputation]: 3.018e-05, [1] [Cycle 1]: 2.464e-05, [1] [cse]: 1.865e-05 [environ_conv]: 3.131e-05 [swap_dp_allreduce_reducescatter]: 2.482e-05 [bias_add_comm_swap]: 1.208e-05 [label_micro_interleaved_index]: 1.282e-05 [label_fine_grained_interleaved_index]: 2.79001e-06 [merge_cast_opt]: 1.55999e-06 [slice_recompute_activation]: 2.22999e-06 [micro_interleaved_order_control]: 2.71999e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 1.42e-06 [remove_cast_before_assign_add]: 8.59e-06 [full_micro_interleaved_order_control]: 9.77999e-06 [reorder_send_recv_between_fp_bp]: 2.50002e-06 [comm_op_add_attrs]: 1.10999e-06 [add_comm_op_reuse_tag]: 1.27e-06 [interleave_split_concat_branches]: 1.34e-06 [interleave_parallel_branches]: 8.85001e-06 [overlap_opt_shard_in_pipeline]: 2.777e-05 [overlap_opt_shard_grad_in_pipeline]: 1.89e-06 [control_data_broadcast_order]: 1.813e-05 [grouped_pairwise_exchange_alltoall]: 1.57001e-06 [offloading_packed_experts]: 4.57e-06 [overlap_recompute_and_grad_model_parallel]: 1.343e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.44e-06 [overlap_recompute_allgather_and_fa_grad]: 1.44e-06 [overlap_recompute_comm]: 2.86e-06 [overlap_grad_ring_attention]: 1.948e-05 [overlap_grad_flash_sp]: 5.344e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 9.86e-06 [split_layernorm_comm]: 2.04e-06 [handle_group_info]: 1.14e-06 [symbol_engine_optimizer]: 0.00010353, [1] [Cycle 1]: 9.792e-05, [6] [build]: 4.16001e-06 [elim_shapecalc]: 1.65e-05 [elim_not_effective]: 1.903e-05 [opt_reshape]: 1.104e-05 [fold_const_symbol]: 1.398e-05 [renormalize]: 3.50003e-07 [detach_backward]: 2.96999e-06 [pipeline_parallel_scheduler]: 2.15002e-06 [auto_monad_reorder]: 3.05e-05 [get_jit_bprop_graph]: 2.04999e-06 [rewriter_after_jit_bprop_graph]: 6.41e-06 [opt_after_jit_grad]: 0.00063548 [validate]: 0.00010145 Sums bootstrap : 0.001130s : 0.11% type_inference : 0.972286s : 98.75% event_method : 0.000385s : 0.04% auto_monad : 0.000168s : 0.02% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000051s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000016s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000067s : 0.01% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000009s : 0.00% optimize.rewriter_before_opt_a : 0.000376s : 0.04% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000104s : 0.01% optimize.opt_a.loop_unroll : 0.000458s : 0.05% optimize.opt_a.a_1 : 0.001040s : 0.11% optimize.opt_a.with_stream_mark : 0.000053s : 0.01% optimize.opt_a.recompute_prepare : 0.000025s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000021s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000015s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000259s : 0.03% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.00% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000020s : 0.00% optimize.opt_a.merge_send_recv : 0.000052s : 0.01% optimize.opt_a.auto_parallel : 0.000023s : 0.00% optimize.opt_a.parallel : 0.000090s : 0.01% optimize.opt_a.flash_sp : 0.000041s : 0.00% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000016s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000032s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000029s : 0.00% optimize.opt_a.virtual_dataset : 0.000021s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000020s : 0.00% optimize.opt_a.virtual_output : 0.000020s : 0.00% optimize.opt_a.merge_forward : 0.000010s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000031s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000049s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000029s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000018s : 0.00% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000021s : 0.00% optimize.opt_a.after_resolve : 0.000037s : 0.00% optimize.opt_a.a_after_grad : 0.000033s : 0.00% optimize.opt_a.renormalize : 0.003933s : 0.40% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000048s : 0.00% optimize.opt_a.cse : 0.000106s : 0.01% optimize.opt_a.a_3 : 0.000147s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000039s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000798s : 0.08% optimize.opt_b.b_1 : 0.000199s : 0.02% optimize.opt_b.b_2 : 0.000012s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000034s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000032s : 0.00% optimize.overlap_param_gather : 0.000010s : 0.00% optimize.cconv : 0.000041s : 0.00% optimize.loop_unroll : 0.000503s : 0.05% optimize.opt_after_cconv.c_1 : 0.000095s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000032s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000022s : 0.00% optimize.tuple_transform.d_1 : 0.000077s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000013s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000077s : 0.01% optimize.cse_after_recomputation.cse : 0.000019s : 0.00% optimize.environ_conv : 0.000031s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000025s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000028s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.00% optimize.overlap_grad_flash_sp : 0.000053s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000030s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000635s : 0.06% validate : 0.000101s : 0.01% Time group info: ------[substitution.] 0.000276 33 0.85% : 0.000002s : 2: substitution.elim_not_effective 0.70% : 0.000002s : 2: substitution.fold_const_symbol 3.47% : 0.000010s : 8: substitution.graph_param_transform 79.46% : 0.000219s : 4: substitution.inline 1.81% : 0.000005s : 4: substitution.j_node_and_user_rematch 4.94% : 0.000014s : 4: substitution.remove_not_recompute_node 3.21% : 0.000009s : 6: substitution.replace_old_param 5.56% : 0.000015s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.972110 2 99.70% : 0.969210s : 1: type_inference.infer 0.30% : 0.002899s : 1: type_inference.specialize ------[replace.] 0.000072 7 69.19% : 0.000050s : 4: replace.inline 30.81% : 0.000022s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000230 7 94.10% : 0.000217s : 4: match.inline 5.90% : 0.000014s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000284 2146 0.84% : 0.000002s : 20: predicate.accumulaten_eliminater 1.00% : 0.000003s : 8: predicate.ad_related_special_op_eliminate 0.60% : 0.000002s : 16: predicate.addn_check_dump 0.84% : 0.000002s : 20: predicate.addn_zero_filter 0.76% : 0.000002s : 20: predicate.adjust_all_reduce_mul_add 2.27% : 0.000006s : 36: predicate.arithmetic_simplify 0.86% : 0.000002s : 20: predicate.cast_eliminate 0.68% : 0.000002s : 16: predicate.check_bprop_eliminate 0.60% : 0.000002s : 16: predicate.compare_switch_simplify 0.26% : 0.000001s : 8: predicate.const_output_eliminate 0.65% : 0.000002s : 16: predicate.depend_value_elim 0.84% : 0.000002s : 20: predicate.dict_get_item_const_eliminator 0.99% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 20: predicate.dict_set_item_eliminator 1.64% : 0.000005s : 16: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 8: predicate.elim_not_effective 0.43% : 0.000001s : 8: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000003s : 28: predicate.environ_add_const_eliminate 1.02% : 0.000003s : 28: predicate.environ_get_add_eliminate 1.03% : 0.000003s : 28: predicate.environ_get_depend_swap 1.71% : 0.000005s : 44: predicate.environ_get_eliminate 1.03% : 0.000003s : 28: predicate.environ_get_set_eliminate 1.07% : 0.000003s : 27: predicate.exchange_switch_depend_value 1.99% : 0.000006s : 27: predicate.float_depend_g_call 0.64% : 0.000002s : 16: predicate.float_environ_get_switch 0.96% : 0.000003s : 24: predicate.float_tuple_getitem_switch 0.24% : 0.000001s : 8: predicate.fold_const_symbol 0.83% : 0.000002s : 16: predicate.get_grad_eliminate 0.31% : 0.000001s : 8: predicate.graph_param_transform 0.60% : 0.000002s : 16: predicate.incorporate_call 0.53% : 0.000002s : 16: predicate.incorporate_call_switch 5.92% : 0.000017s : 95: predicate.inline 1.02% : 0.000003s : 16: predicate.inline_without_move 0.42% : 0.000001s : 16: predicate.j_node_and_user_rematch 0.92% : 0.000003s : 16: predicate.less_batch_normalization 1.66% : 0.000005s : 39: predicate.list_to_tuple_eliminator_ 2.25% : 0.000006s : 59: predicate.load_eliminater 0.74% : 0.000002s : 8: predicate.loop_unroll_after_grad 2.91% : 0.000008s : 55: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 36: predicate.make_slice_get_slice_eliminator 0.66% : 0.000002s : 16: predicate.merge_addn 0.68% : 0.000002s : 16: predicate.micro_step_allgather_replace 0.67% : 0.000002s : 16: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 20: predicate.minmaximum_grad 1.53% : 0.000004s : 8: predicate.mutable_eliminate 0.34% : 0.000001s : 8: predicate.opt_reshape 0.41% : 0.000001s : 8: predicate.parallel_virtual_node 1.80% : 0.000005s : 27: predicate.partial_defer_inline 1.32% : 0.000004s : 31: predicate.partial_eliminate 0.77% : 0.000002s : 20: predicate.print_const_string_wrapper 0.70% : 0.000002s : 16: predicate.reduce_all_const_elim 1.06% : 0.000003s : 20: predicate.reduce_eliminate 2.22% : 0.000006s : 59: predicate.redundant_stop_gradient_eliminater 0.66% : 0.000002s : 16: predicate.remove_not_recompute_node 1.54% : 0.000004s : 39: predicate.replace_applicator 0.55% : 0.000002s : 16: predicate.replace_old_param 0.39% : 0.000001s : 8: predicate.reset_defer_inline 0.90% : 0.000003s : 20: predicate.reshape_eliminate 0.71% : 0.000002s : 16: predicate.row_tensor_add_zeros_like 0.53% : 0.000002s : 8: predicate.row_tensor_eliminate 0.94% : 0.000003s : 16: predicate.same_eliminate 0.67% : 0.000002s : 16: predicate.set_cell_output_no_recompute 1.11% : 0.000003s : 16: predicate.shard_identity_eliminate 0.71% : 0.000002s : 16: predicate.special_op_eliminate 0.74% : 0.000002s : 16: predicate.specialize_transform 1.08% : 0.000003s : 16: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000003s : 16: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 8: predicate.switch_call_monad_eliminater 1.19% : 0.000003s : 27: predicate.switch_defer_inline 1.90% : 0.000005s : 43: predicate.switch_layer_defer_inline 4.99% : 0.000014s : 106: predicate.switch_simplify 0.94% : 0.000003s : 20: predicate.tile_eliminate 0.79% : 0.000002s : 20: predicate.transpose_eliminate 1.39% : 0.000004s : 36: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000004s : 36: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000004s : 36: predicate.tuple_list_get_item_depend_reorder 3.05% : 0.000009s : 55: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 36: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000006s : 52: predicate.tuple_list_set_item_eliminator 1.66% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.13% : 0.000006s : 59: predicate.updatestate_pure_node_eliminater 2.90% : 0.000008s : 75: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 8: predicate.value_based_eliminate 0.75% : 0.000002s : 16: predicate.virtual_dataset_eliminate 0.91% : 0.000003s : 16: predicate.virtual_output_eliminate 0.28% : 0.000001s : 8: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 8: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.149369 41 99.29% : 0.148304s : 35: func_graph_cloner_run.FuncGraphClonerGraph 0.71% : 0.001065s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 1.021399 192 0.00% : 0.000004s : 1: ForceFp32Comm 0.91% : 0.009274s : 1: add_attr 0.91% : 0.009251s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000082s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000179s : 1: auto_monad 0.00% : 0.000036s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.12% : 0.001188s : 1: bootstrap 0.00% : 0.000045s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000022s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000036s : 1: environ_conv 0.04% : 0.000403s : 1: event_method 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000012s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.05% : 0.000512s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.08% : 0.000811s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000027s : 1: opt.transform.mutable_eliminate 0.22% : 0.002198s : 78: opt.transform.opt_a 0.01% : 0.000093s : 1: opt.transform.opt_after_cconv 0.00% : 0.000047s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000182s : 28: opt.transform.opt_b 0.01% : 0.000087s : 2: opt.transform.opt_trans_graph 0.01% : 0.000056s : 4: opt.transform.symbol_engine_opt 0.73% : 0.007451s : 1: opt_a 0.02% : 0.000196s : 1: opt_after_cconv 0.06% : 0.000650s : 1: opt_after_jit_grad 0.03% : 0.000313s : 1: opt_b 1.05% : 0.010697s : 1: optimize 0.00% : 0.000036s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000059s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000041s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000032s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000072s : 1: pre_auto_parallel 0.00% : 0.000013s : 1: py_interpret_to_execute 0.00% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.00% : 0.000026s : 1: remove_dup_value 0.27% : 0.002775s : 1: renormalize.infer 0.11% : 0.001144s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000045s : 1: rewriter_after_opt_a 0.04% : 0.000384s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000028s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000107s : 1: symbol_engine_optimizer 0.01% : 0.000121s : 1: tuple_transform 95.19% : 0.972318s : 1: type_inference TotalTime = 0.0977308, [21] [bootstrap]: 0.0009145 [type_inference]: 0.0471451 [event_method]: 2.743e-05 [auto_monad]: 0.000154 [graph_reusing]: 7.82e-06 [inline]: 2.68998e-06 [add_attr]: 0.041536, [1] [add_attr_with_inline]: 0.0415158, [1] [Cycle 1]: 0.00017114, [2] [tag_attr]: 4.176e-05 [meta_addattr_fg_expand]: 1.438e-05 [parallel-infer-symbol]: 4.10998e-06 [pre_auto_parallel]: 6.568e-05 [insert-virtual-dataset]: 2.75997e-06 [parallel-infer-symbol-second]: 7.99977e-07 [dataset_repeat_opt]: 2.42001e-06 [pipeline_split]: 1.72999e-06 [optimize]: 0.00686432, [53] [py_interpret_to_execute]: 9.85002e-06 [rewriter_before_opt_a]: 0.00028316 [opt_a]: 0.00388834, [2] [Cycle 1]: 0.00316407, [45] [expand_dump_flag]: 3.7e-06 [switch_simplify]: 8.187e-05 [loop_unroll]: 3.752e-05 [a_1]: 0.00082517 [with_stream_mark]: 2.694e-05 [recompute_prepare]: 1.393e-05 [updatestate_depend_eliminate]: 1.641e-05 [updatestate_assign_eliminate]: 1.609e-05 [updatestate_loads_eliminate]: 3.32997e-06 [parameter_eliminate]: 3.25e-06 [a_2]: 8.674e-05 [accelerated_algorithm]: 8.50001e-06 [shard]: 2.68003e-06 [meta_shard_fg_expand]: 2.24999e-06 [shard_inline]: 6.29001e-06 [merge_send_recv]: 4.471e-05 [auto_parallel]: 1.256e-05 [parallel]: 8.946e-05 [flash_sp]: 3.529e-05 [merge_comm]: 8.15e-06 [allreduce_fusion]: 1.132e-05 [matmul_add_comm_reduction]: 1.772e-05 [allreduce_slice_to_reducescatter]: 8.51002e-06 [virtual_shard_identity]: 1.245e-05 [virtual_dataset]: 6.31e-06 [get_grad_eliminate_]: 6.07999e-06 [virtual_output]: 5.81e-06 [merge_forward]: 5.22999e-06 [cell_reuse_recompute_pass]: 1.86e-06 [offload_activation]: 1.931e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.612e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.094e-05 [set_forward_comm_id_for_comm_node_pass]: 1.318e-05 [meta_fg_expand]: 3.45e-06 [flash_sp_send_recv_attached]: 3.04999e-06 [receive_attached]: 1.635e-05 [after_resolve]: 1.659e-05 [a_after_grad]: 1.107e-05 [renormalize]: 0.00107825 [add_forward_monad_depend]: 8.27998e-06 [auto_monad_grad]: 2.99001e-06 [auto_monad_eliminator]: 2.94e-05 [cse]: 5.564e-05 [a_3]: 5.626e-05 [Cycle 2]: 0.00071049, [45] [expand_dump_flag]: 2.79001e-06 [switch_simplify]: 8.85001e-06 [loop_unroll]: 1.226e-05 [a_1]: 0.00013891 [with_stream_mark]: 2.17e-05 [recompute_prepare]: 8.08001e-06 [updatestate_depend_eliminate]: 3.77998e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 5.40001e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 6.777e-05 [accelerated_algorithm]: 6.04001e-06 [shard]: 2.11998e-06 [meta_shard_fg_expand]: 1.77001e-06 [shard_inline]: 6.02001e-06 [merge_send_recv]: 7.75e-06 [auto_parallel]: 8.78001e-06 [parallel]: 7.66999e-06 [flash_sp]: 3.9e-06 [merge_comm]: 3.83001e-06 [allreduce_fusion]: 3.28998e-06 [matmul_add_comm_reduction]: 8.45999e-06 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 7.61999e-06 [virtual_dataset]: 5.81e-06 [get_grad_eliminate_]: 5.79e-06 [virtual_output]: 5.59998e-06 [merge_forward]: 4.22e-06 [cell_reuse_recompute_pass]: 2.53e-06 [offload_activation]: 9.81e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.645e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 9.56e-06 [set_forward_comm_id_for_comm_node_pass]: 3.54002e-06 [meta_fg_expand]: 1.87001e-06 [flash_sp_send_recv_attached]: 1.34003e-06 [receive_attached]: 1.86003e-06 [after_resolve]: 1.193e-05 [a_after_grad]: 9.07999e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.54001e-06 [auto_monad_grad]: 1.79e-06 [auto_monad_eliminator]: 1.003e-05 [cse]: 1.934e-05 [a_3]: 3.44e-05 [py_interpret_to_execute_after_opt_a]: 8.48999e-06 [slice_cell_reuse_recomputed_activation]: 1.80001e-06 [rewriter_after_opt_a]: 3.62e-05 [convert_after_rewriter]: 1.51998e-06 [order_py_execute_after_rewriter]: 1.57999e-06 [mutable_eliminate]: 0.00083051 [opt_b]: 0.0002192, [1] [Cycle 1]: 0.00020925, [7] [b_1]: 0.00011544 [b_2]: 8e-06 [updatestate_depend_eliminate]: 1.133e-05 [updatestate_assign_eliminate]: 2.91999e-06 [updatestate_loads_eliminate]: 2.74999e-06 [renormalize]: 1.02e-06 [cse]: 3.02e-05 [optimize_parallel_all_gather_comm]: 6.641e-05 [overlap_param_gather]: 1.061e-05 [cconv]: 3.963e-05 [loop_unroll]: 0.00053013 [opt_after_cconv]: 0.00011363, [1] [Cycle 1]: 0.0001054, [7] [c_1]: 2.894e-05 [parameter_eliminate]: 6.26998e-06 [updatestate_depend_eliminate]: 6.97002e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.37999e-06 [cse]: 2.332e-05 [renormalize]: 5.50004e-07 [remove_dup_value]: 1.525e-05 [tuple_transform]: 7.959e-05, [1] [Cycle 1]: 7.437e-05, [4] [d_1]: 4.779e-05 [none_parameter_eliminate]: 1.72999e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 6.92002e-06 [partial_unused_args_eliminate]: 1.71998e-06 [add_recomputation]: 7.647e-05 [cse_after_recomputation]: 2.409e-05, [1] [Cycle 1]: 1.935e-05, [1] [cse]: 1.326e-05 [environ_conv]: 2.759e-05 [swap_dp_allreduce_reducescatter]: 2.49e-05 [bias_add_comm_swap]: 1.113e-05 [label_micro_interleaved_index]: 1.286e-05 [label_fine_grained_interleaved_index]: 2.56e-06 [merge_cast_opt]: 1.45001e-06 [slice_recompute_activation]: 2.26e-06 [micro_interleaved_order_control]: 2.61e-06 [assign_add_opt]: 1.17999e-06 [ForceFp32Comm]: 8.89995e-07 [remove_cast_before_assign_add]: 9.70002e-06 [full_micro_interleaved_order_control]: 1.036e-05 [reorder_send_recv_between_fp_bp]: 2.87002e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.32e-06 [interleave_parallel_branches]: 8.3e-06 [overlap_opt_shard_in_pipeline]: 3.261e-05 [overlap_opt_shard_grad_in_pipeline]: 1.84998e-06 [control_data_broadcast_order]: 1.67e-05 [grouped_pairwise_exchange_alltoall]: 1.57999e-06 [offloading_packed_experts]: 3.58e-06 [overlap_recompute_and_grad_model_parallel]: 1.283e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.30999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.43998e-06 [overlap_grad_ring_attention]: 1.967e-05 [overlap_grad_flash_sp]: 4.936e-05 [begin_end_overlap_inline]: 7.39994e-07 [split_matmul_comm_elemetwise]: 1.06e-05 [split_layernorm_comm]: 1.60999e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 8.649e-05, [1] [Cycle 1]: 8.172e-05, [6] [build]: 4.17e-06 [elim_shapecalc]: 1.426e-05 [elim_not_effective]: 1.468e-05 [opt_reshape]: 6.64001e-06 [fold_const_symbol]: 1.056e-05 [renormalize]: 5.00004e-07 [detach_backward]: 2.94001e-06 [pipeline_parallel_scheduler]: 1.42999e-06 [auto_monad_reorder]: 2.455e-05 [get_jit_bprop_graph]: 1.77999e-06 [rewriter_after_jit_bprop_graph]: 5.79e-06 [opt_after_jit_grad]: 0.00056746 [validate]: 0.00019363 Sums bootstrap : 0.000915s : 1.66% type_inference : 0.047145s : 85.73% event_method : 0.000027s : 0.05% auto_monad : 0.000154s : 0.28% graph_reusing : 0.000008s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000042s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.03% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000066s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000010s : 0.02% optimize.rewriter_before_opt_a : 0.000283s : 0.51% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000091s : 0.16% optimize.opt_a.loop_unroll : 0.000050s : 0.09% optimize.opt_a.a_1 : 0.000964s : 1.75% optimize.opt_a.with_stream_mark : 0.000049s : 0.09% optimize.opt_a.recompute_prepare : 0.000022s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000020s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000019s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000155s : 0.28% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.03% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000012s : 0.02% optimize.opt_a.merge_send_recv : 0.000052s : 0.10% optimize.opt_a.auto_parallel : 0.000021s : 0.04% optimize.opt_a.parallel : 0.000097s : 0.18% optimize.opt_a.flash_sp : 0.000039s : 0.07% optimize.opt_a.merge_comm : 0.000012s : 0.02% optimize.opt_a.allreduce_fusion : 0.000015s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.02% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.04% optimize.opt_a.virtual_dataset : 0.000012s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.02% optimize.opt_a.virtual_output : 0.000011s : 0.02% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000029s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000043s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000017s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000018s : 0.03% optimize.opt_a.after_resolve : 0.000029s : 0.05% optimize.opt_a.a_after_grad : 0.000020s : 0.04% optimize.opt_a.renormalize : 0.001078s : 1.96% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.07% optimize.opt_a.cse : 0.000075s : 0.14% optimize.opt_a.a_3 : 0.000091s : 0.16% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000036s : 0.07% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000831s : 1.51% optimize.opt_b.b_1 : 0.000115s : 0.21% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000066s : 0.12% optimize.overlap_param_gather : 0.000011s : 0.02% optimize.cconv : 0.000040s : 0.07% optimize.loop_unroll : 0.000530s : 0.96% optimize.opt_after_cconv.c_1 : 0.000029s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000023s : 0.04% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.03% optimize.tuple_transform.d_1 : 0.000048s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000076s : 0.14% optimize.cse_after_recomputation.cse : 0.000013s : 0.02% optimize.environ_conv : 0.000028s : 0.05% optimize.swap_dp_allreduce_reducescatter : 0.000025s : 0.05% optimize.bias_add_comm_swap : 0.000011s : 0.02% optimize.label_micro_interleaved_index : 0.000013s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.02% optimize.full_micro_interleaved_order_control : 0.000010s : 0.02% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.02% optimize.overlap_opt_shard_in_pipeline : 0.000033s : 0.06% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000020s : 0.04% optimize.overlap_grad_flash_sp : 0.000049s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.02% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000025s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000567s : 1.03% validate : 0.000194s : 0.35% Time group info: ------[substitution.] 0.000307 30 0.75% : 0.000002s : 2: substitution.elim_not_effective 0.60% : 0.000002s : 2: substitution.fold_const_symbol 2.14% : 0.000007s : 4: substitution.graph_param_transform 79.70% : 0.000245s : 6: substitution.inline 1.47% : 0.000005s : 4: substitution.j_node_and_user_rematch 4.20% : 0.000013s : 4: substitution.remove_not_recompute_node 2.87% : 0.000009s : 4: substitution.replace_old_param 8.28% : 0.000025s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.047047 2 96.97% : 0.045620s : 1: type_inference.infer 3.03% : 0.001427s : 1: type_inference.specialize ------[replace.] 0.000089 10 71.71% : 0.000064s : 6: replace.inline 28.29% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000265 10 91.16% : 0.000241s : 6: match.inline 8.84% : 0.000023s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000225 1408 0.90% : 0.000002s : 15: predicate.accumulaten_eliminater 0.97% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000002s : 15: predicate.addn_zero_filter 0.79% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.18% : 0.000005s : 23: predicate.arithmetic_simplify 0.86% : 0.000002s : 15: predicate.cast_eliminate 0.53% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.53% : 0.000001s : 8: predicate.depend_value_elim 0.95% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 0.98% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.99% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.57% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 19: predicate.environ_add_const_eliminate 1.01% : 0.000002s : 19: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 19: predicate.environ_get_depend_swap 1.52% : 0.000003s : 27: predicate.environ_get_eliminate 0.99% : 0.000002s : 19: predicate.environ_get_set_eliminate 1.47% : 0.000003s : 25: predicate.exchange_switch_depend_value 2.40% : 0.000005s : 25: predicate.float_depend_g_call 0.43% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.55% : 0.000001s : 8: predicate.get_grad_eliminate 0.26% : 0.000001s : 4: predicate.graph_param_transform 0.72% : 0.000002s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 6.07% : 0.000014s : 64: predicate.inline 0.69% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 8: predicate.less_batch_normalization 1.62% : 0.000004s : 27: predicate.list_to_tuple_eliminator_ 2.52% : 0.000006s : 42: predicate.load_eliminater 1.18% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.81% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.43% : 0.000003s : 23: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.47% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 15: predicate.minmaximum_grad 1.77% : 0.000004s : 4: predicate.mutable_eliminate 0.27% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 2.35% : 0.000005s : 25: predicate.partial_defer_inline 1.45% : 0.000003s : 23: predicate.partial_eliminate 0.90% : 0.000002s : 15: predicate.print_const_string_wrapper 0.53% : 0.000001s : 8: predicate.reduce_all_const_elim 1.15% : 0.000003s : 15: predicate.reduce_eliminate 2.50% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.54% : 0.000003s : 27: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.38% : 0.000001s : 4: predicate.reset_defer_inline 0.85% : 0.000002s : 15: predicate.reshape_eliminate 0.70% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.92% : 0.000002s : 8: predicate.same_eliminate 0.73% : 0.000002s : 8: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 8: predicate.shard_identity_eliminate 0.57% : 0.000001s : 8: predicate.special_op_eliminate 0.69% : 0.000002s : 8: predicate.specialize_transform 1.26% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 1.22% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.59% : 0.000004s : 25: predicate.switch_defer_inline 2.04% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.63% : 0.000013s : 83: predicate.switch_simplify 0.93% : 0.000002s : 15: predicate.tile_eliminate 0.85% : 0.000002s : 15: predicate.transpose_eliminate 1.36% : 0.000003s : 23: predicate.tuple_list_convert_item_index_to_positive 1.40% : 0.000003s : 23: predicate.tuple_list_get_item_const_eliminator 1.31% : 0.000003s : 23: predicate.tuple_list_get_item_depend_reorder 3.55% : 0.000008s : 35: predicate.tuple_list_get_item_eliminator 1.31% : 0.000003s : 23: predicate.tuple_list_get_set_item_eliminator 1.97% : 0.000004s : 31: predicate.tuple_list_set_item_eliminator 1.55% : 0.000003s : 27: predicate.tuple_to_list_eliminator_ 2.24% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 3.00% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.57% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.68% : 0.000002s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000857 13 45.61% : 0.000391s : 5: func_graph_cloner_run.FuncGraphClonerGraph 54.39% : 0.000466s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.148638 192 0.00% : 0.000004s : 1: ForceFp32Comm 27.95% : 0.041543s : 1: add_attr 27.93% : 0.041521s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000082s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.11% : 0.000160s : 1: auto_monad 0.02% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000014s : 1: bias_add_comm_swap 0.65% : 0.000970s : 1: bootstrap 0.03% : 0.000044s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.02% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.02% : 0.000031s : 1: environ_conv 0.02% : 0.000035s : 1: event_method 0.01% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000012s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.01% : 0.000016s : 1: label_micro_interleaved_index 0.36% : 0.000541s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.57% : 0.000845s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000026s : 1: opt.transform.mutable_eliminate 0.98% : 0.001464s : 78: opt.transform.opt_a 0.02% : 0.000028s : 1: opt.transform.opt_after_cconv 0.02% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000095s : 28: opt.transform.opt_b 0.04% : 0.000053s : 2: opt.transform.opt_trans_graph 0.03% : 0.000041s : 4: opt.transform.symbol_engine_opt 2.62% : 0.003892s : 1: opt_a 0.08% : 0.000118s : 1: opt_after_cconv 0.39% : 0.000578s : 1: opt_after_jit_grad 0.15% : 0.000223s : 1: opt_b 4.62% : 0.006870s : 1: optimize 0.05% : 0.000072s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000005s : 1: order_py_execute_after_rewriter 0.04% : 0.000054s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000023s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000037s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000071s : 1: pre_auto_parallel 0.01% : 0.000014s : 1: py_interpret_to_execute 0.01% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000013s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.38% : 0.000567s : 1: renormalize.infer 0.33% : 0.000496s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000041s : 1: rewriter_after_opt_a 0.20% : 0.000291s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000014s : 1: split_matmul_comm_elemetwise 0.02% : 0.000028s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000089s : 1: symbol_engine_optimizer 0.06% : 0.000082s : 1: tuple_transform 31.73% : 0.047167s : 1: type_inference TotalTime = 0.38441, [21] [bootstrap]: 0.00092305 [type_inference]: 0.358628 [event_method]: 0.00056175 [auto_monad]: 0.00027473 [graph_reusing]: 8.3e-06 [inline]: 4.853e-05 [add_attr]: 0.0105455, [1] [add_attr_with_inline]: 0.0104537, [1] [Cycle 1]: 0.00033682, [2] [tag_attr]: 5.486e-05 [meta_addattr_fg_expand]: 1.665e-05 [parallel-infer-symbol]: 4.12e-06 [pre_auto_parallel]: 9.192e-05 [insert-virtual-dataset]: 2.85002e-06 [parallel-infer-symbol-second]: 1.07e-06 [dataset_repeat_opt]: 2.01998e-06 [pipeline_split]: 1.87001e-06 [optimize]: 0.0121387, [53] [py_interpret_to_execute]: 1.054e-05 [rewriter_before_opt_a]: 0.00045171 [opt_a]: 0.00839813, [2] [Cycle 1]: 0.0074277, [45] [expand_dump_flag]: 4.94e-06 [switch_simplify]: 9.509e-05 [loop_unroll]: 3.901e-05 [a_1]: 0.00094314 [with_stream_mark]: 3.202e-05 [recompute_prepare]: 1.601e-05 [updatestate_depend_eliminate]: 1.77e-05 [updatestate_assign_eliminate]: 1.325e-05 [updatestate_loads_eliminate]: 3.56001e-06 [parameter_eliminate]: 2.57001e-06 [a_2]: 0.00012332 [accelerated_algorithm]: 1.007e-05 [shard]: 2.36998e-06 [meta_shard_fg_expand]: 4.57e-06 [shard_inline]: 9.96998e-06 [merge_send_recv]: 4.387e-05 [auto_parallel]: 1.311e-05 [parallel]: 0.0003111 [flash_sp]: 4.155e-05 [merge_comm]: 9.13002e-06 [allreduce_fusion]: 1.209e-05 [matmul_add_comm_reduction]: 1.991e-05 [allreduce_slice_to_reducescatter]: 7.92e-06 [virtual_shard_identity]: 2.187e-05 [virtual_dataset]: 1.546e-05 [get_grad_eliminate_]: 9.42999e-06 [virtual_output]: 9.07999e-06 [merge_forward]: 6.94001e-06 [cell_reuse_recompute_pass]: 2.56998e-06 [offload_activation]: 2.04e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.009e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 1.443e-05 [set_forward_comm_id_for_comm_node_pass]: 1.409e-05 [meta_fg_expand]: 5.57001e-06 [flash_sp_send_recv_attached]: 4.1e-06 [receive_attached]: 1.822e-05 [after_resolve]: 1.757e-05 [a_after_grad]: 1.486e-05 [renormalize]: 0.00454622 [add_forward_monad_depend]: 1.269e-05 [auto_monad_grad]: 3.16999e-06 [auto_monad_eliminator]: 4.084e-05 [cse]: 9.875e-05 [a_3]: 8.162e-05 [Cycle 2]: 0.00095251, [45] [expand_dump_flag]: 2.96999e-06 [switch_simplify]: 1.104e-05 [loop_unroll]: 8.33999e-06 [a_1]: 0.00022554 [with_stream_mark]: 2.535e-05 [recompute_prepare]: 1.033e-05 [updatestate_depend_eliminate]: 5.54e-06 [updatestate_assign_eliminate]: 4.09002e-06 [updatestate_loads_eliminate]: 4.05998e-06 [parameter_eliminate]: 2.53e-06 [a_2]: 0.00010212 [accelerated_algorithm]: 9.66e-06 [shard]: 2.49999e-06 [meta_shard_fg_expand]: 2.81999e-06 [shard_inline]: 8.74998e-06 [merge_send_recv]: 9.63002e-06 [auto_parallel]: 1.033e-05 [parallel]: 1.029e-05 [flash_sp]: 4.32e-06 [merge_comm]: 4.51002e-06 [allreduce_fusion]: 4.72998e-06 [matmul_add_comm_reduction]: 1.168e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 1.249e-05 [virtual_dataset]: 8.38001e-06 [get_grad_eliminate_]: 8.28999e-06 [virtual_output]: 8.05999e-06 [merge_forward]: 5.34998e-06 [cell_reuse_recompute_pass]: 2.86999e-06 [offload_activation]: 1.131e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.118e-05 [merge_recompute_call_nodes]: 1.75001e-06 [before_grad]: 1.161e-05 [set_forward_comm_id_for_comm_node_pass]: 4.61002e-06 [meta_fg_expand]: 3.78001e-06 [flash_sp_send_recv_attached]: 2.36998e-06 [receive_attached]: 2.84001e-06 [after_resolve]: 1.487e-05 [a_after_grad]: 1.21e-05 [renormalize]: 1.50001e-07 [add_forward_monad_depend]: 2.14999e-06 [auto_monad_grad]: 1.87001e-06 [auto_monad_eliminator]: 1.28e-05 [cse]: 3.712e-05 [a_3]: 5.135e-05 [py_interpret_to_execute_after_opt_a]: 1.169e-05 [slice_cell_reuse_recomputed_activation]: 1.95001e-06 [rewriter_after_opt_a]: 4.045e-05 [convert_after_rewriter]: 1.38002e-06 [order_py_execute_after_rewriter]: 1.12999e-06 [mutable_eliminate]: 0.00093461 [opt_b]: 0.00042516, [1] [Cycle 1]: 0.0004136, [7] [b_1]: 0.00025635 [b_2]: 1.256e-05 [updatestate_depend_eliminate]: 1.375e-05 [updatestate_assign_eliminate]: 5.20001e-06 [updatestate_loads_eliminate]: 5.53002e-06 [renormalize]: 5.69999e-07 [cse]: 4.606e-05 [optimize_parallel_all_gather_comm]: 3.434e-05 [overlap_param_gather]: 1.464e-05 [cconv]: 3.93e-05 [loop_unroll]: 0.00052701 [opt_after_cconv]: 0.0001329, [1] [Cycle 1]: 0.00012593, [7] [c_1]: 4.188e-05 [parameter_eliminate]: 5.61998e-06 [updatestate_depend_eliminate]: 7.07002e-06 [updatestate_assign_eliminate]: 3.29001e-06 [updatestate_loads_eliminate]: 2.93e-06 [cse]: 2.999e-05 [renormalize]: 7.2e-07 [remove_dup_value]: 6.049e-05 [tuple_transform]: 9.418e-05, [1] [Cycle 1]: 8.907e-05, [4] [d_1]: 5.79e-05 [none_parameter_eliminate]: 1.92999e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 8.59e-06 [partial_unused_args_eliminate]: 1.88002e-06 [add_recomputation]: 0.00010723 [cse_after_recomputation]: 3.317e-05, [1] [Cycle 1]: 2.718e-05, [1] [cse]: 2.127e-05 [environ_conv]: 3.082e-05 [swap_dp_allreduce_reducescatter]: 2.478e-05 [bias_add_comm_swap]: 1.121e-05 [label_micro_interleaved_index]: 1.317e-05 [label_fine_grained_interleaved_index]: 2.61999e-06 [merge_cast_opt]: 1.55001e-06 [slice_recompute_activation]: 2.06998e-06 [micro_interleaved_order_control]: 2.53e-06 [assign_add_opt]: 1.57001e-06 [ForceFp32Comm]: 9.89996e-07 [remove_cast_before_assign_add]: 9.06998e-06 [full_micro_interleaved_order_control]: 9.83002e-06 [reorder_send_recv_between_fp_bp]: 2.44001e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 8.76997e-06 [overlap_opt_shard_in_pipeline]: 0.00012302 [overlap_opt_shard_grad_in_pipeline]: 2.39999e-06 [control_data_broadcast_order]: 1.672e-05 [grouped_pairwise_exchange_alltoall]: 1.47001e-06 [offloading_packed_experts]: 4.97999e-06 [overlap_recompute_and_grad_model_parallel]: 1.402e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 2.16e-06 [overlap_grad_ring_attention]: 1.97e-05 [overlap_grad_flash_sp]: 5.163e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 9.76998e-06 [split_layernorm_comm]: 1.67001e-06 [handle_group_info]: 1.00999e-06 [symbol_engine_optimizer]: 9.266e-05, [1] [Cycle 1]: 8.732e-05, [6] [build]: 4.79998e-06 [elim_shapecalc]: 1.664e-05 [elim_not_effective]: 1.546e-05 [opt_reshape]: 8.84998e-06 [fold_const_symbol]: 1.184e-05 [renormalize]: 1.69995e-07 [detach_backward]: 2.61e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 2.693e-05 [get_jit_bprop_graph]: 1.97999e-06 [rewriter_after_jit_bprop_graph]: 5.94e-06 [opt_after_jit_grad]: 0.00068897 [validate]: 0.00018844 Sums bootstrap : 0.000923s : 0.25% type_inference : 0.358628s : 96.35% event_method : 0.000562s : 0.15% auto_monad : 0.000275s : 0.07% graph_reusing : 0.000008s : 0.00% inline : 0.000049s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000055s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000017s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000092s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000011s : 0.00% optimize.rewriter_before_opt_a : 0.000452s : 0.12% optimize.opt_a.expand_dump_flag : 0.000008s : 0.00% optimize.opt_a.switch_simplify : 0.000106s : 0.03% optimize.opt_a.loop_unroll : 0.000047s : 0.01% optimize.opt_a.a_1 : 0.001169s : 0.31% optimize.opt_a.with_stream_mark : 0.000057s : 0.02% optimize.opt_a.recompute_prepare : 0.000026s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000023s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000017s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000225s : 0.06% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.00% optimize.opt_a.shard_inline : 0.000019s : 0.01% optimize.opt_a.merge_send_recv : 0.000054s : 0.01% optimize.opt_a.auto_parallel : 0.000023s : 0.01% optimize.opt_a.parallel : 0.000321s : 0.09% optimize.opt_a.flash_sp : 0.000046s : 0.01% optimize.opt_a.merge_comm : 0.000014s : 0.00% optimize.opt_a.allreduce_fusion : 0.000017s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000032s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000034s : 0.01% optimize.opt_a.virtual_dataset : 0.000024s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.00% optimize.opt_a.virtual_output : 0.000017s : 0.00% optimize.opt_a.merge_forward : 0.000012s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000032s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000051s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000026s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000019s : 0.01% optimize.opt_a.meta_fg_expand : 0.000009s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000021s : 0.01% optimize.opt_a.after_resolve : 0.000032s : 0.01% optimize.opt_a.a_after_grad : 0.000027s : 0.01% optimize.opt_a.renormalize : 0.004546s : 1.22% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000054s : 0.01% optimize.opt_a.cse : 0.000136s : 0.04% optimize.opt_a.a_3 : 0.000133s : 0.04% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000040s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000935s : 0.25% optimize.opt_b.b_1 : 0.000256s : 0.07% optimize.opt_b.b_2 : 0.000013s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000046s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000034s : 0.01% optimize.overlap_param_gather : 0.000015s : 0.00% optimize.cconv : 0.000039s : 0.01% optimize.loop_unroll : 0.000527s : 0.14% optimize.opt_after_cconv.c_1 : 0.000042s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000030s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000060s : 0.02% optimize.tuple_transform.d_1 : 0.000058s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000107s : 0.03% optimize.cse_after_recomputation.cse : 0.000021s : 0.01% optimize.environ_conv : 0.000031s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000025s : 0.01% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000123s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000014s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000020s : 0.01% optimize.overlap_grad_flash_sp : 0.000052s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000027s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000689s : 0.19% validate : 0.000188s : 0.05% Time group info: ------[substitution.] 0.000327 27 0.65% : 0.000002s : 2: substitution.elim_not_effective 0.50% : 0.000002s : 2: substitution.fold_const_symbol 2.22% : 0.000007s : 5: substitution.graph_param_transform 82.65% : 0.000270s : 6: substitution.inline 1.70% : 0.000006s : 4: substitution.j_node_and_user_rematch 4.19% : 0.000014s : 4: substitution.remove_not_recompute_node 2.55% : 0.000008s : 2: substitution.replace_old_param 5.53% : 0.000018s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.358187 2 98.81% : 0.353919s : 1: type_inference.infer 1.19% : 0.004268s : 1: type_inference.specialize ------[replace.] 0.000110 8 70.17% : 0.000077s : 6: replace.inline 29.83% : 0.000033s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000282 8 94.29% : 0.000266s : 6: match.inline 5.71% : 0.000016s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000283 1750 0.77% : 0.000002s : 18: predicate.accumulaten_eliminater 0.87% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.53% : 0.000002s : 12: predicate.addn_check_dump 0.89% : 0.000003s : 18: predicate.addn_zero_filter 0.73% : 0.000002s : 18: predicate.adjust_all_reduce_mul_add 2.17% : 0.000006s : 30: predicate.arithmetic_simplify 0.85% : 0.000002s : 18: predicate.cast_eliminate 0.52% : 0.000001s : 12: predicate.check_bprop_eliminate 0.47% : 0.000001s : 12: predicate.compare_switch_simplify 0.23% : 0.000001s : 6: predicate.const_output_eliminate 0.68% : 0.000002s : 12: predicate.depend_value_elim 0.78% : 0.000002s : 18: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 18: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 18: predicate.dict_set_item_eliminator 0.91% : 0.000003s : 11: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 24: predicate.environ_add_const_eliminate 0.90% : 0.000003s : 24: predicate.environ_get_add_eliminate 0.94% : 0.000003s : 24: predicate.environ_get_depend_swap 1.52% : 0.000004s : 36: predicate.environ_get_eliminate 0.90% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.25% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.40% : 0.000007s : 26: predicate.float_depend_g_call 0.52% : 0.000001s : 12: predicate.float_environ_get_switch 0.69% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.59% : 0.000002s : 12: predicate.get_grad_eliminate 0.29% : 0.000001s : 5: predicate.graph_param_transform 0.49% : 0.000001s : 12: predicate.incorporate_call 0.45% : 0.000001s : 12: predicate.incorporate_call_switch 5.64% : 0.000016s : 80: predicate.inline 0.89% : 0.000003s : 12: predicate.inline_without_move 0.31% : 0.000001s : 12: predicate.j_node_and_user_rematch 1.06% : 0.000003s : 12: predicate.less_batch_normalization 1.48% : 0.000004s : 31: predicate.list_to_tuple_eliminator_ 2.10% : 0.000006s : 50: predicate.load_eliminater 0.78% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.10% : 0.000006s : 43: predicate.loop_unroll_before_grad 1.54% : 0.000004s : 30: predicate.make_slice_get_slice_eliminator 0.72% : 0.000002s : 12: predicate.merge_addn 0.51% : 0.000001s : 12: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 12: predicate.mini_step_allgather_replace 0.74% : 0.000002s : 18: predicate.minmaximum_grad 1.45% : 0.000004s : 6: predicate.mutable_eliminate 0.31% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 6: predicate.parallel_virtual_node 2.28% : 0.000006s : 26: predicate.partial_defer_inline 1.18% : 0.000003s : 26: predicate.partial_eliminate 0.85% : 0.000002s : 18: predicate.print_const_string_wrapper 0.57% : 0.000002s : 12: predicate.reduce_all_const_elim 1.15% : 0.000003s : 18: predicate.reduce_eliminate 2.07% : 0.000006s : 50: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000002s : 12: predicate.remove_not_recompute_node 1.55% : 0.000004s : 32: predicate.replace_applicator 0.56% : 0.000002s : 12: predicate.replace_old_param 0.44% : 0.000001s : 6: predicate.reset_defer_inline 0.89% : 0.000003s : 18: predicate.reshape_eliminate 0.57% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 6: predicate.row_tensor_eliminate 0.89% : 0.000003s : 12: predicate.same_eliminate 0.45% : 0.000001s : 12: predicate.set_cell_output_no_recompute 1.43% : 0.000004s : 12: predicate.shard_identity_eliminate 0.64% : 0.000002s : 11: predicate.special_op_eliminate 0.58% : 0.000002s : 12: predicate.specialize_transform 1.06% : 0.000003s : 12: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.30% : 0.000004s : 26: predicate.switch_defer_inline 1.76% : 0.000005s : 38: predicate.switch_layer_defer_inline 5.30% : 0.000015s : 86: predicate.switch_simplify 0.82% : 0.000002s : 18: predicate.tile_eliminate 0.85% : 0.000002s : 18: predicate.transpose_eliminate 1.47% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.24% : 0.000003s : 29: predicate.tuple_list_get_item_depend_reorder 2.67% : 0.000008s : 43: predicate.tuple_list_get_item_eliminator 1.43% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 7.55% : 0.000021s : 41: predicate.tuple_list_set_item_eliminator 1.70% : 0.000005s : 31: predicate.tuple_to_list_eliminator_ 1.99% : 0.000006s : 50: predicate.updatestate_pure_node_eliminater 2.99% : 0.000008s : 62: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 6: predicate.value_based_eliminate 0.83% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.68% : 0.000002s : 12: predicate.virtual_output_eliminate 0.21% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004567 30 69.75% : 0.003186s : 22: func_graph_cloner_run.FuncGraphClonerGraph 30.25% : 0.001382s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.413484 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.55% : 0.010553s : 1: add_attr 2.53% : 0.010458s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000113s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.07% : 0.000287s : 1: auto_monad 0.01% : 0.000031s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.24% : 0.000979s : 1: bootstrap 0.01% : 0.000043s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000020s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000037s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000008s : 1: detach_backward 0.01% : 0.000034s : 1: environ_conv 0.14% : 0.000589s : 1: event_method 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000052s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000012s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.13% : 0.000536s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.23% : 0.000950s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000027s : 1: opt.transform.mutable_eliminate 0.45% : 0.001851s : 78: opt.transform.opt_a 0.01% : 0.000040s : 1: opt.transform.opt_after_cconv 0.01% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000237s : 28: opt.transform.opt_b 0.02% : 0.000064s : 2: opt.transform.opt_trans_graph 0.01% : 0.000048s : 4: opt.transform.symbol_engine_opt 2.03% : 0.008403s : 1: opt_a 0.03% : 0.000137s : 1: opt_after_cconv 0.17% : 0.000702s : 1: opt_after_jit_grad 0.10% : 0.000430s : 1: opt_b 2.94% : 0.012145s : 1: optimize 0.01% : 0.000038s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000055s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000023s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000127s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000019s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000098s : 1: pre_auto_parallel 0.00% : 0.000015s : 1: py_interpret_to_execute 0.00% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.02% : 0.000065s : 1: remove_dup_value 0.74% : 0.003071s : 1: renormalize.infer 0.35% : 0.001457s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000046s : 1: rewriter_after_opt_a 0.11% : 0.000464s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.01% : 0.000028s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000096s : 1: symbol_engine_optimizer 0.02% : 0.000097s : 1: tuple_transform 86.74% : 0.358676s : 1: type_inference TotalTime = 2.23653, [21] [bootstrap]: 0.00103895 [type_inference]: 2.1019 [event_method]: 0.00165547 [auto_monad]: 0.00029352 [graph_reusing]: 8.72998e-06 [inline]: 6.54999e-06 [add_attr]: 0.109721, [1] [add_attr_with_inline]: 0.109704, [1] [Cycle 1]: 0.00019459, [2] [tag_attr]: 7.718e-05 [meta_addattr_fg_expand]: 1.948e-05 [parallel-infer-symbol]: 3.85998e-06 [pre_auto_parallel]: 9.056e-05 [insert-virtual-dataset]: 2.69999e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 1.70001e-06 [optimize]: 0.0206951, [53] [py_interpret_to_execute]: 9.93002e-06 [rewriter_before_opt_a]: 0.00041302 [opt_a]: 0.0169772, [2] [Cycle 1]: 0.0158835, [45] [expand_dump_flag]: 4.74998e-06 [switch_simplify]: 0.00020695 [loop_unroll]: 5.802e-05 [a_1]: 0.00144812 [with_stream_mark]: 2.714e-05 [recompute_prepare]: 1.643e-05 [updatestate_depend_eliminate]: 1.6e-05 [updatestate_assign_eliminate]: 1.183e-05 [updatestate_loads_eliminate]: 3.86999e-06 [parameter_eliminate]: 2.39001e-06 [a_2]: 0.00016869 [accelerated_algorithm]: 1.178e-05 [shard]: 2.56998e-06 [meta_shard_fg_expand]: 3.83001e-06 [shard_inline]: 1.205e-05 [merge_send_recv]: 4.316e-05 [auto_parallel]: 1.23e-05 [parallel]: 0.00014355 [flash_sp]: 3.86e-05 [merge_comm]: 7.16999e-06 [allreduce_fusion]: 1.232e-05 [matmul_add_comm_reduction]: 1.983e-05 [allreduce_slice_to_reducescatter]: 7.59002e-06 [virtual_shard_identity]: 3.343e-05 [virtual_dataset]: 1.195e-05 [get_grad_eliminate_]: 1.231e-05 [virtual_output]: 1.088e-05 [merge_forward]: 5.72999e-06 [cell_reuse_recompute_pass]: 1.74e-06 [offload_activation]: 2.087e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.142e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.576e-05 [set_forward_comm_id_for_comm_node_pass]: 1.329e-05 [meta_fg_expand]: 5.61e-06 [flash_sp_send_recv_attached]: 2.74999e-06 [receive_attached]: 1.743e-05 [after_resolve]: 1.941e-05 [a_after_grad]: 1.871e-05 [renormalize]: 0.0126306 [add_forward_monad_depend]: 1.373e-05 [auto_monad_grad]: 2.78e-06 [auto_monad_eliminator]: 4.291e-05 [cse]: 0.00023758 [a_3]: 0.00010043 [Cycle 2]: 0.00107814, [45] [expand_dump_flag]: 3.33998e-06 [switch_simplify]: 1.488e-05 [loop_unroll]: 1.042e-05 [a_1]: 0.00027156 [with_stream_mark]: 2.613e-05 [recompute_prepare]: 1.147e-05 [updatestate_depend_eliminate]: 5.25999e-06 [updatestate_assign_eliminate]: 4.63001e-06 [updatestate_loads_eliminate]: 3.97e-06 [parameter_eliminate]: 1.80001e-06 [a_2]: 0.00013197 [accelerated_algorithm]: 1.075e-05 [shard]: 2.45002e-06 [meta_shard_fg_expand]: 2.34999e-06 [shard_inline]: 1.001e-05 [merge_send_recv]: 1.017e-05 [auto_parallel]: 1.015e-05 [parallel]: 9.31e-06 [flash_sp]: 3.90998e-06 [merge_comm]: 4.25e-06 [allreduce_fusion]: 4.24002e-06 [matmul_add_comm_reduction]: 1.238e-05 [allreduce_slice_to_reducescatter]: 1.15001e-06 [virtual_shard_identity]: 1.253e-05 [virtual_dataset]: 1.129e-05 [get_grad_eliminate_]: 1.065e-05 [virtual_output]: 9.59e-06 [merge_forward]: 5.15001e-06 [cell_reuse_recompute_pass]: 3.41999e-06 [offload_activation]: 1.223e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.509e-05 [merge_recompute_call_nodes]: 1.86e-06 [before_grad]: 1.611e-05 [set_forward_comm_id_for_comm_node_pass]: 5.14998e-06 [meta_fg_expand]: 3.31999e-06 [flash_sp_send_recv_attached]: 1.64e-06 [receive_attached]: 2.66999e-06 [after_resolve]: 1.877e-05 [a_after_grad]: 1.802e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 3.08998e-06 [auto_monad_grad]: 2.04999e-06 [auto_monad_eliminator]: 1.344e-05 [cse]: 4.635e-05 [a_3]: 6.622e-05 [py_interpret_to_execute_after_opt_a]: 1.016e-05 [slice_cell_reuse_recomputed_activation]: 2.27001e-06 [rewriter_after_opt_a]: 4.122e-05 [convert_after_rewriter]: 1.47999e-06 [order_py_execute_after_rewriter]: 1.05999e-06 [mutable_eliminate]: 0.0008108 [opt_b]: 0.00037956, [1] [Cycle 1]: 0.00037018, [7] [b_1]: 0.00023846 [b_2]: 1.202e-05 [updatestate_depend_eliminate]: 1.127e-05 [updatestate_assign_eliminate]: 3.68999e-06 [updatestate_loads_eliminate]: 3.53999e-06 [renormalize]: 6.99976e-07 [cse]: 6.063e-05 [optimize_parallel_all_gather_comm]: 3.386e-05 [overlap_param_gather]: 1.078e-05 [cconv]: 3.64e-05 [loop_unroll]: 0.00054697 [opt_after_cconv]: 0.00016814, [1] [Cycle 1]: 0.00016067, [7] [c_1]: 5.591e-05 [parameter_eliminate]: 5.00001e-06 [updatestate_depend_eliminate]: 8.51002e-06 [updatestate_assign_eliminate]: 3.36999e-06 [updatestate_loads_eliminate]: 3.3e-06 [cse]: 4.795e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 6.092e-05 [tuple_transform]: 0.0001086, [1] [Cycle 1]: 0.00010339, [4] [d_1]: 6.764e-05 [none_parameter_eliminate]: 2.66e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 1.022e-05 [partial_unused_args_eliminate]: 2.30002e-06 [add_recomputation]: 0.00019909 [cse_after_recomputation]: 4.166e-05, [1] [Cycle 1]: 3.521e-05, [1] [cse]: 2.646e-05 [environ_conv]: 3.491e-05 [swap_dp_allreduce_reducescatter]: 2.533e-05 [bias_add_comm_swap]: 1.141e-05 [label_micro_interleaved_index]: 1.328e-05 [label_fine_grained_interleaved_index]: 2.80002e-06 [merge_cast_opt]: 1.68002e-06 [slice_recompute_activation]: 1.91e-06 [micro_interleaved_order_control]: 2.41e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 9.00007e-07 [remove_cast_before_assign_add]: 8.90999e-06 [full_micro_interleaved_order_control]: 9.72999e-06 [reorder_send_recv_between_fp_bp]: 3.51999e-06 [comm_op_add_attrs]: 1.29e-06 [add_comm_op_reuse_tag]: 1.74e-06 [interleave_split_concat_branches]: 1.32999e-06 [interleave_parallel_branches]: 8.52e-06 [overlap_opt_shard_in_pipeline]: 2.907e-05 [overlap_opt_shard_grad_in_pipeline]: 1.91e-06 [control_data_broadcast_order]: 2.359e-05 [grouped_pairwise_exchange_alltoall]: 2.19001e-06 [offloading_packed_experts]: 4.43999e-06 [overlap_recompute_and_grad_model_parallel]: 1.317e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.44e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42999e-06 [overlap_recompute_comm]: 2.26998e-06 [overlap_grad_ring_attention]: 1.878e-05 [overlap_grad_flash_sp]: 5.565e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 1.054e-05 [split_layernorm_comm]: 1.81998e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 0.00011942, [1] [Cycle 1]: 0.00011253, [6] [build]: 5.76998e-06 [elim_shapecalc]: 2.234e-05 [elim_not_effective]: 2.037e-05 [opt_reshape]: 1.247e-05 [fold_const_symbol]: 1.573e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.60997e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 3.091e-05 [get_jit_bprop_graph]: 2.08002e-06 [rewriter_after_jit_bprop_graph]: 5.85002e-06 [opt_after_jit_grad]: 0.00069026 [validate]: 0.00010371 Sums bootstrap : 0.001039s : 0.05% type_inference : 2.101903s : 98.89% event_method : 0.001655s : 0.08% auto_monad : 0.000294s : 0.01% graph_reusing : 0.000009s : 0.00% inline : 0.000007s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000077s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000019s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000091s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000010s : 0.00% optimize.rewriter_before_opt_a : 0.000413s : 0.02% optimize.opt_a.expand_dump_flag : 0.000008s : 0.00% optimize.opt_a.switch_simplify : 0.000222s : 0.01% optimize.opt_a.loop_unroll : 0.000068s : 0.00% optimize.opt_a.a_1 : 0.001720s : 0.08% optimize.opt_a.with_stream_mark : 0.000053s : 0.00% optimize.opt_a.recompute_prepare : 0.000028s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000021s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000301s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000023s : 0.00% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000022s : 0.00% optimize.opt_a.merge_send_recv : 0.000053s : 0.00% optimize.opt_a.auto_parallel : 0.000022s : 0.00% optimize.opt_a.parallel : 0.000153s : 0.01% optimize.opt_a.flash_sp : 0.000043s : 0.00% optimize.opt_a.merge_comm : 0.000011s : 0.00% optimize.opt_a.allreduce_fusion : 0.000017s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000032s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000046s : 0.00% optimize.opt_a.virtual_dataset : 0.000023s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000023s : 0.00% optimize.opt_a.virtual_output : 0.000020s : 0.00% optimize.opt_a.merge_forward : 0.000011s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000033s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000057s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000032s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000018s : 0.00% optimize.opt_a.meta_fg_expand : 0.000009s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000020s : 0.00% optimize.opt_a.after_resolve : 0.000038s : 0.00% optimize.opt_a.a_after_grad : 0.000037s : 0.00% optimize.opt_a.renormalize : 0.012631s : 0.59% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000056s : 0.00% optimize.opt_a.cse : 0.000284s : 0.01% optimize.opt_a.a_3 : 0.000167s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000041s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000811s : 0.04% optimize.opt_b.b_1 : 0.000238s : 0.01% optimize.opt_b.b_2 : 0.000012s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000061s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000034s : 0.00% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000036s : 0.00% optimize.loop_unroll : 0.000547s : 0.03% optimize.opt_after_cconv.c_1 : 0.000056s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000048s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000061s : 0.00% optimize.tuple_transform.d_1 : 0.000068s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000199s : 0.01% optimize.cse_after_recomputation.cse : 0.000026s : 0.00% optimize.environ_conv : 0.000035s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000025s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000004s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000002s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000029s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000024s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.00% optimize.overlap_grad_flash_sp : 0.000056s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000006s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000022s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000031s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000690s : 0.03% validate : 0.000104s : 0.00% Time group info: ------[substitution.] 0.000468 43 0.46% : 0.000002s : 2: substitution.elim_not_effective 0.37% : 0.000002s : 2: substitution.fold_const_symbol 1.58% : 0.000007s : 7: substitution.graph_param_transform 82.13% : 0.000385s : 10: substitution.inline 1.10% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.17% : 0.000015s : 4: substitution.remove_not_recompute_node 1.72% : 0.000008s : 6: substitution.replace_old_param 5.22% : 0.000024s : 4: substitution.switch_simplify 4.26% : 0.000020s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 2.101746 2 99.56% : 2.092593s : 1: type_inference.infer 0.44% : 0.009153s : 1: type_inference.specialize ------[replace.] 0.000203 18 50.77% : 0.000103s : 10: replace.inline 31.85% : 0.000065s : 4: replace.switch_simplify 17.38% : 0.000035s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000415 18 90.90% : 0.000377s : 10: match.inline 5.09% : 0.000021s : 4: match.switch_simplify 4.01% : 0.000017s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000372 2798 1.06% : 0.000004s : 30: predicate.accumulaten_eliminater 0.82% : 0.000003s : 7: predicate.ad_related_special_op_eliminate 0.56% : 0.000002s : 18: predicate.addn_check_dump 1.02% : 0.000004s : 30: predicate.addn_zero_filter 0.90% : 0.000003s : 30: predicate.adjust_all_reduce_mul_add 2.27% : 0.000008s : 48: predicate.arithmetic_simplify 0.96% : 0.000004s : 30: predicate.cast_eliminate 0.53% : 0.000002s : 18: predicate.check_bprop_eliminate 0.48% : 0.000002s : 18: predicate.compare_switch_simplify 0.23% : 0.000001s : 9: predicate.const_output_eliminate 0.53% : 0.000002s : 18: predicate.depend_value_elim 1.02% : 0.000004s : 30: predicate.dict_get_item_const_eliminator 1.18% : 0.000004s : 30: predicate.dict_get_item_eliminator 0.97% : 0.000004s : 30: predicate.dict_set_item_eliminator 0.95% : 0.000004s : 16: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 7: predicate.elim_not_effective 0.36% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000004s : 39: predicate.environ_add_const_eliminate 1.13% : 0.000004s : 39: predicate.environ_get_add_eliminate 1.10% : 0.000004s : 39: predicate.environ_get_depend_swap 1.64% : 0.000006s : 57: predicate.environ_get_eliminate 1.09% : 0.000004s : 39: predicate.environ_get_set_eliminate 1.45% : 0.000005s : 44: predicate.exchange_switch_depend_value 2.39% : 0.000009s : 44: predicate.float_depend_g_call 0.59% : 0.000002s : 18: predicate.float_environ_get_switch 0.75% : 0.000003s : 27: predicate.float_tuple_getitem_switch 0.16% : 0.000001s : 7: predicate.fold_const_symbol 0.77% : 0.000003s : 18: predicate.get_grad_eliminate 0.17% : 0.000001s : 7: predicate.graph_param_transform 0.50% : 0.000002s : 18: predicate.incorporate_call 0.45% : 0.000002s : 18: predicate.incorporate_call_switch 5.86% : 0.000022s : 128: predicate.inline 0.73% : 0.000003s : 18: predicate.inline_without_move 0.37% : 0.000001s : 18: predicate.j_node_and_user_rematch 0.78% : 0.000003s : 18: predicate.less_batch_normalization 1.83% : 0.000007s : 50: predicate.list_to_tuple_eliminator_ 2.45% : 0.000009s : 82: predicate.load_eliminater 0.90% : 0.000003s : 9: predicate.loop_unroll_after_grad 2.58% : 0.000010s : 71: predicate.loop_unroll_before_grad 1.59% : 0.000006s : 48: predicate.make_slice_get_slice_eliminator 0.52% : 0.000002s : 18: predicate.merge_addn 0.54% : 0.000002s : 18: predicate.micro_step_allgather_replace 0.52% : 0.000002s : 18: predicate.mini_step_allgather_replace 0.95% : 0.000004s : 30: predicate.minmaximum_grad 0.97% : 0.000004s : 9: predicate.mutable_eliminate 0.34% : 0.000001s : 7: predicate.opt_reshape 0.29% : 0.000001s : 9: predicate.parallel_virtual_node 2.31% : 0.000009s : 44: predicate.partial_defer_inline 1.47% : 0.000005s : 43: predicate.partial_eliminate 0.97% : 0.000004s : 30: predicate.print_const_string_wrapper 0.57% : 0.000002s : 18: predicate.reduce_all_const_elim 1.22% : 0.000005s : 30: predicate.reduce_eliminate 2.42% : 0.000009s : 82: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000002s : 18: predicate.remove_not_recompute_node 1.62% : 0.000006s : 52: predicate.replace_applicator 0.55% : 0.000002s : 18: predicate.replace_old_param 0.30% : 0.000001s : 9: predicate.reset_defer_inline 1.07% : 0.000004s : 30: predicate.reshape_eliminate 0.57% : 0.000002s : 18: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 9: predicate.row_tensor_eliminate 0.93% : 0.000003s : 18: predicate.same_eliminate 0.47% : 0.000002s : 18: predicate.set_cell_output_no_recompute 0.78% : 0.000003s : 18: predicate.shard_identity_eliminate 0.59% : 0.000002s : 16: predicate.special_op_eliminate 0.59% : 0.000002s : 18: predicate.specialize_transform 0.91% : 0.000003s : 18: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000003s : 18: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 9: predicate.switch_call_monad_eliminater 1.68% : 0.000006s : 44: predicate.switch_defer_inline 2.24% : 0.000008s : 62: predicate.switch_layer_defer_inline 5.77% : 0.000021s : 148: predicate.switch_simplify 0.99% : 0.000004s : 30: predicate.tile_eliminate 0.97% : 0.000004s : 30: predicate.transpose_eliminate 1.50% : 0.000006s : 46: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000006s : 46: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000005s : 46: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000012s : 68: predicate.tuple_list_get_item_eliminator 1.48% : 0.000006s : 46: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000009s : 64: predicate.tuple_list_set_item_eliminator 1.55% : 0.000006s : 50: predicate.tuple_to_list_eliminator_ 2.35% : 0.000009s : 82: predicate.updatestate_pure_node_eliminater 3.03% : 0.000011s : 100: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 9: predicate.value_based_eliminate 0.64% : 0.000002s : 18: predicate.virtual_dataset_eliminate 0.56% : 0.000002s : 18: predicate.virtual_output_eliminate 0.22% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 9: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.109044 128 96.14% : 0.104830s : 115: func_graph_cloner_run.FuncGraphClonerGraph 3.86% : 0.004214s : 13: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.382549 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.61% : 0.109729s : 1: add_attr 4.60% : 0.109709s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.01% : 0.000208s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000310s : 1: auto_monad 0.00% : 0.000037s : 1: auto_monad_reorder 0.00% : 0.000027s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.05% : 0.001098s : 1: bootstrap 0.00% : 0.000040s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000027s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000045s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000040s : 1: environ_conv 0.07% : 0.001690s : 1: event_method 0.01% : 0.000132s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000010s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000012s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000017s : 1: label_micro_interleaved_index 0.02% : 0.000557s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.03% : 0.000823s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000024s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000027s : 1: opt.transform.mutable_eliminate 0.11% : 0.002708s : 78: opt.transform.opt_a 0.00% : 0.000055s : 1: opt.transform.opt_after_cconv 0.00% : 0.000043s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000221s : 28: opt.transform.opt_b 0.00% : 0.000075s : 2: opt.transform.opt_trans_graph 0.00% : 0.000065s : 4: opt.transform.symbol_engine_opt 0.71% : 0.016981s : 1: opt_a 0.01% : 0.000173s : 1: opt_after_cconv 0.03% : 0.000704s : 1: opt_after_jit_grad 0.02% : 0.000383s : 1: opt_b 0.87% : 0.020702s : 1: optimize 0.00% : 0.000038s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000060s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000023s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000034s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000096s : 1: pre_auto_parallel 0.00% : 0.000015s : 1: py_interpret_to_execute 0.00% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000066s : 1: remove_dup_value 0.36% : 0.008547s : 1: renormalize.infer 0.17% : 0.004065s : 1: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000047s : 1: rewriter_after_opt_a 0.02% : 0.000422s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000014s : 1: split_matmul_comm_elemetwise 0.00% : 0.000029s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000123s : 1: symbol_engine_optimizer 0.00% : 0.000111s : 1: tuple_transform 88.22% : 2.101936s : 1: type_inference TotalTime = 1.68799, [21] [bootstrap]: 0.00059496 [type_inference]: 1.35374 [event_method]: 0.00088488 [auto_monad]: 0.00030632 [graph_reusing]: 1.271e-05 [inline]: 3.56001e-06 [add_attr]: 0.00756507, [1] [add_attr_with_inline]: 0.00750126, [1] [Cycle 1]: 0.00025276, [2] [tag_attr]: 9.483e-05 [meta_addattr_fg_expand]: 2.274e-05 [parallel-infer-symbol]: 4.11001e-06 [pre_auto_parallel]: 0.00010179 [insert-virtual-dataset]: 3.13e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.47001e-06 [pipeline_split]: 2.21e-06 [optimize]: 0.311243, [53] [py_interpret_to_execute]: 1.087e-05 [rewriter_before_opt_a]: 0.00089655 [opt_a]: 0.306149, [3] [Cycle 1]: 0.295383, [45] [expand_dump_flag]: 7.70998e-06 [switch_simplify]: 0.00030203 [loop_unroll]: 0.00015233 [a_1]: 0.0487368 [with_stream_mark]: 5.313e-05 [recompute_prepare]: 4.514e-05 [updatestate_depend_eliminate]: 1.37e-05 [updatestate_assign_eliminate]: 1.065e-05 [updatestate_loads_eliminate]: 1.022e-05 [parameter_eliminate]: 4.60001e-06 [a_2]: 0.00039225 [accelerated_algorithm]: 0.00011381 [shard]: 2.39001e-06 [meta_shard_fg_expand]: 1.103e-05 [shard_inline]: 2.726e-05 [merge_send_recv]: 2.444e-05 [auto_parallel]: 2.161e-05 [parallel]: 0.00015777 [flash_sp]: 1.691e-05 [merge_comm]: 1.555e-05 [allreduce_fusion]: 1.127e-05 [matmul_add_comm_reduction]: 4.175e-05 [allreduce_slice_to_reducescatter]: 1.54e-06 [virtual_shard_identity]: 3.38e-05 [virtual_dataset]: 2.864e-05 [get_grad_eliminate_]: 2.57e-05 [virtual_output]: 2.507e-05 [merge_forward]: 1.43e-05 [cell_reuse_recompute_pass]: 2.11998e-06 [offload_activation]: 2.231e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.628e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 4.086e-05 [set_forward_comm_id_for_comm_node_pass]: 1.389e-05 [meta_fg_expand]: 0.00584386 [flash_sp_send_recv_attached]: 1.049e-05 [receive_attached]: 2.36e-06 [after_resolve]: 0.00015457 [a_after_grad]: 0.00016136 [renormalize]: 0.236953 [add_forward_monad_depend]: 3.693e-05 [auto_monad_grad]: 1.474e-05 [auto_monad_eliminator]: 0.00012894 [cse]: 0.00039006 [a_3]: 0.00076155 [Cycle 2]: 0.00886677, [45] [expand_dump_flag]: 5.08002e-06 [switch_simplify]: 0.0005136 [loop_unroll]: 9.201e-05 [a_1]: 0.00309617 [with_stream_mark]: 4.395e-05 [recompute_prepare]: 2.557e-05 [updatestate_depend_eliminate]: 9.87999e-06 [updatestate_assign_eliminate]: 7.63999e-06 [updatestate_loads_eliminate]: 6.64999e-06 [parameter_eliminate]: 3.2e-06 [a_2]: 0.00025048 [accelerated_algorithm]: 2.578e-05 [shard]: 2.71999e-06 [meta_shard_fg_expand]: 6.96999e-06 [shard_inline]: 1.79e-05 [merge_send_recv]: 1.547e-05 [auto_parallel]: 1.672e-05 [parallel]: 1.034e-05 [flash_sp]: 4.77e-06 [merge_comm]: 8.07e-06 [allreduce_fusion]: 7.08e-06 [matmul_add_comm_reduction]: 1.602e-05 [allreduce_slice_to_reducescatter]: 9.80013e-07 [virtual_shard_identity]: 2.065e-05 [virtual_dataset]: 1.632e-05 [get_grad_eliminate_]: 1.879e-05 [virtual_output]: 1.697e-05 [merge_forward]: 1.012e-05 [cell_reuse_recompute_pass]: 1.94e-06 [offload_activation]: 1.968e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.062e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 2.486e-05 [set_forward_comm_id_for_comm_node_pass]: 9.17999e-06 [meta_fg_expand]: 0.00034578 [flash_sp_send_recv_attached]: 3.23998e-06 [receive_attached]: 3.33e-06 [after_resolve]: 3.814e-05 [a_after_grad]: 2.779e-05 [renormalize]: 0.00320103 [add_forward_monad_depend]: 1.274e-05 [auto_monad_grad]: 3.16001e-06 [auto_monad_eliminator]: 3.356e-05 [cse]: 0.00024963 [a_3]: 0.0001388 [Cycle 3]: 0.00182198, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 3.733e-05 [loop_unroll]: 1.667e-05 [a_1]: 0.00058905 [with_stream_mark]: 5.523e-05 [recompute_prepare]: 1.954e-05 [updatestate_depend_eliminate]: 9.48002e-06 [updatestate_assign_eliminate]: 7.26999e-06 [updatestate_loads_eliminate]: 6.94001e-06 [parameter_eliminate]: 2.89999e-06 [a_2]: 0.00023894 [accelerated_algorithm]: 2.245e-05 [shard]: 2.51998e-06 [meta_shard_fg_expand]: 5.15001e-06 [shard_inline]: 1.67e-05 [merge_send_recv]: 1.385e-05 [auto_parallel]: 1.517e-05 [parallel]: 9.79999e-06 [flash_sp]: 1.71e-06 [merge_comm]: 7.93999e-06 [allreduce_fusion]: 6.93998e-06 [matmul_add_comm_reduction]: 1.554e-05 [allreduce_slice_to_reducescatter]: 9.5999e-07 [virtual_shard_identity]: 1.801e-05 [virtual_dataset]: 1.663e-05 [get_grad_eliminate_]: 1.608e-05 [virtual_output]: 1.657e-05 [merge_forward]: 8.24002e-06 [cell_reuse_recompute_pass]: 3.54002e-06 [offload_activation]: 1.558e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.068e-05 [merge_recompute_call_nodes]: 1.58002e-06 [before_grad]: 2.537e-05 [set_forward_comm_id_for_comm_node_pass]: 9.72999e-06 [meta_fg_expand]: 6.42001e-06 [flash_sp_send_recv_attached]: 1.83002e-06 [receive_attached]: 2.63e-06 [after_resolve]: 2.507e-05 [a_after_grad]: 2.53e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.04999e-06 [auto_monad_grad]: 2.74999e-06 [auto_monad_eliminator]: 2.19e-05 [cse]: 7.59e-05 [a_3]: 0.00010854 [py_interpret_to_execute_after_opt_a]: 1.044e-05 [slice_cell_reuse_recomputed_activation]: 2.12001e-06 [rewriter_after_opt_a]: 5.12e-05 [convert_after_rewriter]: 1.27e-06 [order_py_execute_after_rewriter]: 1.36002e-06 [mutable_eliminate]: 0.00091337 [opt_b]: 0.00088006, [1] [Cycle 1]: 0.0008684, [7] [b_1]: 0.00064177 [b_2]: 2.815e-05 [updatestate_depend_eliminate]: 1.746e-05 [updatestate_assign_eliminate]: 6.71999e-06 [updatestate_loads_eliminate]: 6.74001e-06 [renormalize]: 1.30001e-06 [cse]: 8.487e-05 [optimize_parallel_all_gather_comm]: 3.707e-05 [overlap_param_gather]: 2.34001e-06 [cconv]: 4.252e-05 [loop_unroll]: 0.00066511 [opt_after_cconv]: 0.00027009, [1] [Cycle 1]: 0.00026051, [7] [c_1]: 0.00012107 [parameter_eliminate]: 6.71e-06 [updatestate_depend_eliminate]: 1.261e-05 [updatestate_assign_eliminate]: 6.63e-06 [updatestate_loads_eliminate]: 6.34001e-06 [cse]: 6.364e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 0.0001026 [tuple_transform]: 0.00023319, [1] [Cycle 1]: 0.00022577, [4] [d_1]: 0.0001729 [none_parameter_eliminate]: 2.51998e-06 [renormalize]: 3.7998e-07 [switch_simplify]: 2.141e-05 [partial_unused_args_eliminate]: 2.69001e-06 [add_recomputation]: 0.00010607 [cse_after_recomputation]: 6.815e-05, [1] [Cycle 1]: 6.16e-05, [1] [cse]: 5.343e-05 [environ_conv]: 2.024e-05 [swap_dp_allreduce_reducescatter]: 1.357e-05 [bias_add_comm_swap]: 4.03001e-06 [label_micro_interleaved_index]: 6.89001e-06 [label_fine_grained_interleaved_index]: 2.50997e-06 [merge_cast_opt]: 1.64e-06 [slice_recompute_activation]: 2.35002e-06 [micro_interleaved_order_control]: 2.77002e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 1.15001e-06 [remove_cast_before_assign_add]: 1.11997e-06 [full_micro_interleaved_order_control]: 2.40002e-06 [reorder_send_recv_between_fp_bp]: 2.90002e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.09e-06 [interleave_split_concat_branches]: 1.40999e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 6.456e-05 [overlap_opt_shard_grad_in_pipeline]: 2.69999e-06 [control_data_broadcast_order]: 3.535e-05 [grouped_pairwise_exchange_alltoall]: 1.67001e-06 [offloading_packed_experts]: 7.9e-06 [overlap_recompute_and_grad_model_parallel]: 8.22998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.38002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.22999e-06 [overlap_grad_ring_attention]: 7.80998e-06 [overlap_grad_flash_sp]: 6.397e-05 [begin_end_overlap_inline]: 5.90022e-07 [split_matmul_comm_elemetwise]: 2.39999e-06 [split_layernorm_comm]: 2.02001e-06 [handle_group_info]: 1.39e-06 [symbol_engine_optimizer]: 0.0001712, [1] [Cycle 1]: 0.00016478, [6] [build]: 1.824e-05 [elim_shapecalc]: 3.076e-05 [elim_not_effective]: 3.214e-05 [opt_reshape]: 1.803e-05 [fold_const_symbol]: 2.383e-05 [renormalize]: 2.59985e-07 [detach_backward]: 2.303e-05 [pipeline_parallel_scheduler]: 1.86003e-06 [auto_monad_reorder]: 4.482e-05 [get_jit_bprop_graph]: 2.08998e-06 [rewriter_after_jit_bprop_graph]: 8.52998e-06 [opt_after_jit_grad]: 0.0129888 [validate]: 0.00018685 Sums bootstrap : 0.000595s : 0.04% type_inference : 1.353736s : 80.66% event_method : 0.000885s : 0.05% auto_monad : 0.000306s : 0.02% graph_reusing : 0.000013s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000095s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000023s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000102s : 0.01% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000011s : 0.00% optimize.rewriter_before_opt_a : 0.000897s : 0.05% optimize.opt_a.expand_dump_flag : 0.000016s : 0.00% optimize.opt_a.switch_simplify : 0.000853s : 0.05% optimize.opt_a.loop_unroll : 0.000261s : 0.02% optimize.opt_a.a_1 : 0.052422s : 3.12% optimize.opt_a.with_stream_mark : 0.000152s : 0.01% optimize.opt_a.recompute_prepare : 0.000090s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000033s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000026s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000024s : 0.00% optimize.opt_a.parameter_eliminate : 0.000011s : 0.00% optimize.opt_a.a_2 : 0.000882s : 0.05% optimize.opt_a.accelerated_algorithm : 0.000162s : 0.01% optimize.opt_a.shard : 0.000008s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000023s : 0.00% optimize.opt_a.shard_inline : 0.000062s : 0.00% optimize.opt_a.merge_send_recv : 0.000054s : 0.00% optimize.opt_a.auto_parallel : 0.000053s : 0.00% optimize.opt_a.parallel : 0.000178s : 0.01% optimize.opt_a.flash_sp : 0.000023s : 0.00% optimize.opt_a.merge_comm : 0.000032s : 0.00% optimize.opt_a.allreduce_fusion : 0.000025s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000073s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000072s : 0.00% optimize.opt_a.virtual_dataset : 0.000062s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000061s : 0.00% optimize.opt_a.virtual_output : 0.000059s : 0.00% optimize.opt_a.merge_forward : 0.000033s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000008s : 0.00% optimize.opt_a.offload_activation : 0.000058s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000128s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000091s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000033s : 0.00% optimize.opt_a.meta_fg_expand : 0.006196s : 0.37% optimize.opt_a.flash_sp_send_recv_attached : 0.000016s : 0.00% optimize.opt_a.receive_attached : 0.000008s : 0.00% optimize.opt_a.after_resolve : 0.000218s : 0.01% optimize.opt_a.a_after_grad : 0.000214s : 0.01% optimize.opt_a.renormalize : 0.240154s : 14.31% optimize.opt_a.add_forward_monad_depend : 0.000053s : 0.00% optimize.opt_a.auto_monad_grad : 0.000021s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000184s : 0.01% optimize.opt_a.cse : 0.000716s : 0.04% optimize.opt_a.a_3 : 0.001009s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000051s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000913s : 0.05% optimize.opt_b.b_1 : 0.000642s : 0.04% optimize.opt_b.b_2 : 0.000028s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000017s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000085s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000037s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000043s : 0.00% optimize.loop_unroll : 0.000665s : 0.04% optimize.opt_after_cconv.c_1 : 0.000121s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000013s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.cse : 0.000064s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000103s : 0.01% optimize.tuple_transform.d_1 : 0.000173s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000021s : 0.00% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000106s : 0.01% optimize.cse_after_recomputation.cse : 0.000053s : 0.00% optimize.environ_conv : 0.000020s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000014s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000065s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000035s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.00% optimize.overlap_grad_flash_sp : 0.000064s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000018s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000031s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000032s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000018s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000024s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000023s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000045s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000009s : 0.00% opt_after_jit_grad : 0.012989s : 0.77% validate : 0.000187s : 0.01% Time group info: ------[substitution.] 0.002408 383 0.16% : 0.000004s : 6: substitution.elim_not_effective 0.68% : 0.000016s : 14: substitution.float_depend_g_call 0.88% : 0.000021s : 13: substitution.float_tuple_getitem_switch 0.13% : 0.000003s : 6: substitution.fold_const_symbol 0.48% : 0.000012s : 13: substitution.graph_param_transform 0.20% : 0.000005s : 2: substitution.incorporate_call 0.09% : 0.000002s : 2: substitution.incorporate_call_switch 60.82% : 0.001465s : 26: substitution.inline 1.24% : 0.000030s : 2: substitution.inline_without_move 0.71% : 0.000017s : 23: substitution.j_node_and_user_rematch 3.41% : 0.000082s : 3: substitution.less_batch_normalization 4.15% : 0.000100s : 22: substitution.minmaximum_grad 1.19% : 0.000029s : 14: substitution.partial_eliminate 0.83% : 0.000020s : 23: substitution.remove_not_recompute_node 2.02% : 0.000049s : 11: substitution.replace_applicator 0.87% : 0.000021s : 25: substitution.replace_old_param 0.18% : 0.000004s : 1: substitution.set_cell_output_no_recompute 1.98% : 0.000048s : 6: substitution.switch_simplify 4.81% : 0.000116s : 28: substitution.tuple_list_convert_item_index_to_positive 1.58% : 0.000038s : 28: substitution.tuple_list_get_item_const_eliminator 2.50% : 0.000060s : 28: substitution.tuple_list_get_item_depend_reorder 8.84% : 0.000213s : 59: substitution.tuple_list_get_item_eliminator 2.27% : 0.000055s : 28: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 1.353510 2 99.49% : 1.346657s : 1: type_inference.infer 0.51% : 0.006853s : 1: type_inference.specialize ------[replace.] 0.000898 53 44.33% : 0.000398s : 26: replace.inline 31.10% : 0.000279s : 6: replace.switch_simplify 24.58% : 0.000221s : 21: replace.tuple_list_get_item_eliminator ------[match.] 0.001553 53 92.85% : 0.001442s : 26: match.inline 2.75% : 0.000043s : 6: match.switch_simplify 4.39% : 0.000068s : 21: match.tuple_list_get_item_eliminator ------[predicate.] 0.001342 9325 1.05% : 0.000014s : 109: predicate.accumulaten_eliminater 0.59% : 0.000008s : 13: predicate.ad_related_special_op_eliminate 0.49% : 0.000007s : 52: predicate.addn_check_dump 1.02% : 0.000014s : 109: predicate.addn_zero_filter 1.01% : 0.000014s : 109: predicate.adjust_all_reduce_mul_add 1.98% : 0.000027s : 161: predicate.arithmetic_simplify 0.99% : 0.000013s : 109: predicate.cast_eliminate 1.14% : 0.000015s : 114: predicate.check_bprop_eliminate 0.45% : 0.000006s : 52: predicate.compare_switch_simplify 0.12% : 0.000002s : 14: predicate.const_output_eliminate 0.47% : 0.000006s : 52: predicate.depend_value_elim 1.06% : 0.000014s : 109: predicate.dict_get_item_const_eliminator 1.17% : 0.000016s : 109: predicate.dict_get_item_eliminator 0.95% : 0.000013s : 109: predicate.dict_set_item_eliminator 0.55% : 0.000007s : 27: predicate.dumpgradient_eliminate 0.10% : 0.000001s : 13: predicate.elim_not_effective 0.19% : 0.000003s : 13: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000014s : 123: predicate.environ_add_const_eliminate 1.15% : 0.000015s : 123: predicate.environ_get_add_eliminate 1.08% : 0.000014s : 123: predicate.environ_get_depend_swap 1.62% : 0.000022s : 175: predicate.environ_get_eliminate 1.04% : 0.000014s : 123: predicate.environ_get_set_eliminate 1.45% : 0.000019s : 156: predicate.exchange_switch_depend_value 2.62% : 0.000035s : 156: predicate.float_depend_g_call 0.48% : 0.000006s : 52: predicate.float_environ_get_switch 0.66% : 0.000009s : 66: predicate.float_tuple_getitem_switch 0.08% : 0.000001s : 13: predicate.fold_const_symbol 0.55% : 0.000007s : 52: predicate.get_grad_eliminate 0.11% : 0.000001s : 13: predicate.graph_param_transform 0.45% : 0.000006s : 52: predicate.incorporate_call 0.43% : 0.000006s : 52: predicate.incorporate_call_switch 5.35% : 0.000072s : 397: predicate.inline 1.34% : 0.000018s : 89: predicate.inline_without_move 0.28% : 0.000004s : 52: predicate.j_node_and_user_rematch 0.72% : 0.000010s : 52: predicate.less_batch_normalization 1.56% : 0.000021s : 157: predicate.list_to_tuple_eliminator_ 2.43% : 0.000033s : 267: predicate.load_eliminater 0.40% : 0.000005s : 14: predicate.loop_unroll_after_grad 2.59% : 0.000035s : 242: predicate.loop_unroll_before_grad 1.30% : 0.000017s : 137: predicate.make_slice_get_slice_eliminator 0.51% : 0.000007s : 52: predicate.merge_addn 2.68% : 0.000036s : 114: predicate.micro_step_allgather_replace 1.09% : 0.000015s : 114: predicate.mini_step_allgather_replace 0.98% : 0.000013s : 109: predicate.minmaximum_grad 0.42% : 0.000006s : 14: predicate.mutable_eliminate 0.14% : 0.000002s : 13: predicate.opt_reshape 0.15% : 0.000002s : 14: predicate.parallel_virtual_node 2.75% : 0.000037s : 156: predicate.partial_defer_inline 1.48% : 0.000020s : 144: predicate.partial_eliminate 0.99% : 0.000013s : 109: predicate.print_const_string_wrapper 0.49% : 0.000007s : 52: predicate.reduce_all_const_elim 1.32% : 0.000018s : 109: predicate.reduce_eliminate 2.34% : 0.000031s : 267: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000004s : 52: predicate.remove_not_recompute_node 1.96% : 0.000026s : 244: predicate.replace_applicator 0.61% : 0.000008s : 89: predicate.replace_old_param 0.09% : 0.000001s : 14: predicate.reset_defer_inline 0.97% : 0.000013s : 109: predicate.reshape_eliminate 1.19% : 0.000016s : 114: predicate.row_tensor_add_zeros_like 0.17% : 0.000002s : 14: predicate.row_tensor_eliminate 1.36% : 0.000018s : 114: predicate.same_eliminate 0.46% : 0.000006s : 52: predicate.set_cell_output_no_recompute 0.62% : 0.000008s : 52: predicate.shard_identity_eliminate 0.43% : 0.000006s : 27: predicate.special_op_eliminate 0.51% : 0.000007s : 52: predicate.specialize_transform 1.29% : 0.000017s : 114: predicate.split_environ_get_set_with_tuple_value 1.13% : 0.000015s : 89: predicate.stack_unstack_eliminate 0.15% : 0.000002s : 14: predicate.switch_call_monad_eliminater 1.61% : 0.000022s : 156: predicate.switch_defer_inline 2.69% : 0.000036s : 270: predicate.switch_layer_defer_inline 5.28% : 0.000071s : 475: predicate.switch_simplify 1.72% : 0.000023s : 109: predicate.tile_eliminate 1.01% : 0.000014s : 109: predicate.transpose_eliminate 1.40% : 0.000019s : 136: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000021s : 136: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000018s : 136: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000042s : 209: predicate.tuple_list_get_item_eliminator 1.44% : 0.000019s : 136: predicate.tuple_list_get_set_item_eliminator 1.96% : 0.000026s : 188: predicate.tuple_list_set_item_eliminator 1.54% : 0.000021s : 157: predicate.tuple_to_list_eliminator_ 2.26% : 0.000030s : 267: predicate.updatestate_pure_node_eliminater 2.83% : 0.000038s : 319: predicate.updatestate_useless_node_eliminater 0.13% : 0.000002s : 14: predicate.value_based_eliminate 0.57% : 0.000008s : 52: predicate.virtual_dataset_eliminate 0.55% : 0.000007s : 52: predicate.virtual_output_eliminate 0.12% : 0.000002s : 13: predicate.virtual_view_grad_eliminate 0.21% : 0.000003s : 14: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.016006 88 84.74% : 0.013564s : 58: func_graph_cloner_run.FuncGraphClonerGraph 15.26% : 0.002442s : 30: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.304161 233 0.00% : 0.000004s : 1: ForceFp32Comm 0.33% : 0.007574s : 1: add_attr 0.33% : 0.007507s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000112s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000325s : 1: auto_monad 0.00% : 0.000049s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.03% : 0.000632s : 1: bootstrap 0.00% : 0.000046s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000041s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000072s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000030s : 1: detach_backward 0.00% : 0.000024s : 1: environ_conv 0.04% : 0.000907s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000018s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.03% : 0.000677s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.04% : 0.000930s : 1: mutable_eliminate 0.00% : 0.000012s : 1: offloading_packed_experts 0.00% : 0.000034s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000040s : 1: opt.transform.mutable_eliminate 2.45% : 0.056423s : 117: opt.transform.opt_a 0.01% : 0.000119s : 1: opt.transform.opt_after_cconv 0.01% : 0.000142s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000623s : 28: opt.transform.opt_b 0.01% : 0.000191s : 2: opt.transform.opt_trans_graph 0.00% : 0.000097s : 4: opt.transform.symbol_engine_opt 13.29% : 0.306153s : 1: opt_a 0.01% : 0.000274s : 1: opt_after_cconv 0.57% : 0.013019s : 1: opt_after_jit_grad 0.04% : 0.000886s : 1: opt_b 13.51% : 0.311251s : 1: optimize 0.00% : 0.000042s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000073s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000072s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000107s : 1: pre_auto_parallel 0.00% : 0.000016s : 1: py_interpret_to_execute 0.00% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000110s : 1: remove_dup_value 5.88% : 0.135595s : 2: renormalize.infer 4.54% : 0.104524s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000056s : 1: rewriter_after_opt_a 0.04% : 0.000910s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000017s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000174s : 1: symbol_engine_optimizer 0.01% : 0.000237s : 1: tuple_transform 58.75% : 1.353773s : 1: type_inference TotalTime = 0.919538, [21] [bootstrap]: 0.00069334 [type_inference]: 0.79373 [event_method]: 2.935e-05 [auto_monad]: 8.26e-05 [graph_reusing]: 6.24999e-06 [inline]: 2.59001e-06 [add_attr]: 0.111802, [1] [add_attr_with_inline]: 0.111787, [1] [Cycle 1]: 9.521e-05, [2] [tag_attr]: 3.981e-05 [meta_addattr_fg_expand]: 6.46e-06 [parallel-infer-symbol]: 5.07e-06 [pre_auto_parallel]: 6.205e-05 [insert-virtual-dataset]: 2.76e-06 [parallel-infer-symbol-second]: 1.00999e-06 [dataset_repeat_opt]: 2.12001e-06 [pipeline_split]: 1.61002e-06 [optimize]: 0.0120741, [53] [py_interpret_to_execute]: 1.254e-05 [rewriter_before_opt_a]: 0.00038012 [opt_a]: 0.00868435, [2] [Cycle 1]: 0.00756626, [45] [expand_dump_flag]: 3.73999e-06 [switch_simplify]: 5.815e-05 [loop_unroll]: 4.085e-05 [a_1]: 0.00079138 [with_stream_mark]: 2.565e-05 [recompute_prepare]: 1.819e-05 [updatestate_depend_eliminate]: 4.85001e-06 [updatestate_assign_eliminate]: 3.86001e-06 [updatestate_loads_eliminate]: 3.46001e-06 [parameter_eliminate]: 2.02999e-06 [a_2]: 0.00013186 [accelerated_algorithm]: 1.166e-05 [shard]: 2.11e-06 [meta_shard_fg_expand]: 2.91e-06 [shard_inline]: 1.087e-05 [merge_send_recv]: 1.084e-05 [auto_parallel]: 1.038e-05 [parallel]: 2.327e-05 [flash_sp]: 1.179e-05 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 3.80998e-06 [matmul_add_comm_reduction]: 1.108e-05 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 1.387e-05 [virtual_dataset]: 1.077e-05 [get_grad_eliminate_]: 1.009e-05 [virtual_output]: 1.028e-05 [merge_forward]: 5.74e-06 [cell_reuse_recompute_pass]: 1.62999e-06 [offload_activation]: 1.16e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.078e-05 [merge_recompute_call_nodes]: 2.04e-06 [before_grad]: 1.599e-05 [set_forward_comm_id_for_comm_node_pass]: 4.80999e-06 [meta_fg_expand]: 3.88999e-06 [flash_sp_send_recv_attached]: 2.63998e-06 [receive_attached]: 2.05002e-06 [after_resolve]: 1.691e-05 [a_after_grad]: 1.642e-05 [renormalize]: 0.00570359 [add_forward_monad_depend]: 1.347e-05 [auto_monad_grad]: 3.31999e-06 [auto_monad_eliminator]: 2.971e-05 [cse]: 5.298e-05 [a_3]: 9.641e-05 [Cycle 2]: 0.00110103, [45] [expand_dump_flag]: 3.35e-06 [switch_simplify]: 1.311e-05 [loop_unroll]: 1.017e-05 [a_1]: 0.00027861 [with_stream_mark]: 2.834e-05 [recompute_prepare]: 1.303e-05 [updatestate_depend_eliminate]: 6.53e-06 [updatestate_assign_eliminate]: 4.22998e-06 [updatestate_loads_eliminate]: 4.41002e-06 [parameter_eliminate]: 2.54999e-06 [a_2]: 0.00012433 [accelerated_algorithm]: 1.138e-05 [shard]: 2.35002e-06 [meta_shard_fg_expand]: 2.93998e-06 [shard_inline]: 1.038e-05 [merge_send_recv]: 1.11e-05 [auto_parallel]: 1.11e-05 [parallel]: 1.181e-05 [flash_sp]: 4.09002e-06 [merge_comm]: 3.97e-06 [allreduce_fusion]: 4.15e-06 [matmul_add_comm_reduction]: 1.21e-05 [allreduce_slice_to_reducescatter]: 1.15001e-06 [virtual_shard_identity]: 1.241e-05 [virtual_dataset]: 1.001e-05 [get_grad_eliminate_]: 9.71e-06 [virtual_output]: 1.077e-05 [merge_forward]: 6.33002e-06 [cell_reuse_recompute_pass]: 4.08999e-06 [offload_activation]: 1.348e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.598e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.497e-05 [set_forward_comm_id_for_comm_node_pass]: 5.39998e-06 [meta_fg_expand]: 4.31002e-06 [flash_sp_send_recv_attached]: 2.02999e-06 [receive_attached]: 2.68e-06 [after_resolve]: 1.702e-05 [a_after_grad]: 1.703e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 3.70998e-06 [auto_monad_grad]: 2.09999e-06 [auto_monad_eliminator]: 1.441e-05 [cse]: 3.746e-05 [a_3]: 7.086e-05 [py_interpret_to_execute_after_opt_a]: 1.276e-05 [slice_cell_reuse_recomputed_activation]: 2.68998e-06 [rewriter_after_opt_a]: 3.167e-05 [convert_after_rewriter]: 1.64e-06 [order_py_execute_after_rewriter]: 1.42e-06 [mutable_eliminate]: 0.00090637 [opt_b]: 0.00036866, [1] [Cycle 1]: 0.00035763, [7] [b_1]: 0.00023675 [b_2]: 1.276e-05 [updatestate_depend_eliminate]: 1.249e-05 [updatestate_assign_eliminate]: 3.9e-06 [updatestate_loads_eliminate]: 3.3e-06 [renormalize]: 1.17e-06 [cse]: 4.429e-05 [optimize_parallel_all_gather_comm]: 2.318e-05 [overlap_param_gather]: 2.49001e-06 [cconv]: 4.024e-05 [loop_unroll]: 0.00058847 [opt_after_cconv]: 0.00015051, [1] [Cycle 1]: 0.00014073, [7] [c_1]: 5.11e-05 [parameter_eliminate]: 6.61999e-06 [updatestate_depend_eliminate]: 8.45999e-06 [updatestate_assign_eliminate]: 3.01999e-06 [updatestate_loads_eliminate]: 2.86999e-06 [cse]: 3.325e-05 [renormalize]: 7.2e-07 [remove_dup_value]: 2.809e-05 [tuple_transform]: 0.00015736, [1] [Cycle 1]: 0.00015243, [4] [d_1]: 0.00011178 [none_parameter_eliminate]: 2.89001e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 1.158e-05 [partial_unused_args_eliminate]: 1.97001e-06 [add_recomputation]: 6.662e-05 [cse_after_recomputation]: 3.64e-05, [1] [Cycle 1]: 3.097e-05, [1] [cse]: 2.431e-05 [environ_conv]: 1.417e-05 [swap_dp_allreduce_reducescatter]: 6.56e-06 [bias_add_comm_swap]: 3.67998e-06 [label_micro_interleaved_index]: 5.05001e-06 [label_fine_grained_interleaved_index]: 2.83e-06 [merge_cast_opt]: 1.45999e-06 [slice_recompute_activation]: 2.74001e-06 [micro_interleaved_order_control]: 2.58e-06 [assign_add_opt]: 1.99e-06 [ForceFp32Comm]: 1.02e-06 [remove_cast_before_assign_add]: 1.08001e-06 [full_micro_interleaved_order_control]: 2.54001e-06 [reorder_send_recv_between_fp_bp]: 3.08e-06 [comm_op_add_attrs]: 1.17999e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.40999e-06 [interleave_parallel_branches]: 1.21997e-06 [overlap_opt_shard_in_pipeline]: 1.60999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.21e-06 [control_data_broadcast_order]: 1.741e-05 [grouped_pairwise_exchange_alltoall]: 2.02001e-06 [offloading_packed_experts]: 4.18999e-06 [overlap_recompute_and_grad_model_parallel]: 5.39e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.90001e-06 [overlap_recompute_comm]: 3.27002e-06 [overlap_grad_ring_attention]: 4.54998e-06 [overlap_grad_flash_sp]: 2.74e-05 [begin_end_overlap_inline]: 7.60017e-07 [split_matmul_comm_elemetwise]: 2.46e-06 [split_layernorm_comm]: 2.01998e-06 [handle_group_info]: 1.47999e-06 [symbol_engine_optimizer]: 0.00017907, [1] [Cycle 1]: 0.00017311, [6] [build]: 4.84998e-06 [elim_shapecalc]: 1.687e-05 [elim_not_effective]: 8.261e-05 [opt_reshape]: 1.295e-05 [fold_const_symbol]: 1.745e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.45002e-06 [pipeline_parallel_scheduler]: 1.57001e-06 [auto_monad_reorder]: 2.382e-05 [get_jit_bprop_graph]: 3.31001e-06 [rewriter_after_jit_bprop_graph]: 7.56001e-06 [opt_after_jit_grad]: 0.00070984 [validate]: 8.798e-05 Sums bootstrap : 0.000693s : 0.09% type_inference : 0.793730s : 98.41% event_method : 0.000029s : 0.00% auto_monad : 0.000083s : 0.01% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000040s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000062s : 0.01% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000013s : 0.00% optimize.rewriter_before_opt_a : 0.000380s : 0.05% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000071s : 0.01% optimize.opt_a.loop_unroll : 0.000051s : 0.01% optimize.opt_a.a_1 : 0.001070s : 0.13% optimize.opt_a.with_stream_mark : 0.000054s : 0.01% optimize.opt_a.recompute_prepare : 0.000031s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000256s : 0.03% optimize.opt_a.accelerated_algorithm : 0.000023s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000021s : 0.00% optimize.opt_a.merge_send_recv : 0.000022s : 0.00% optimize.opt_a.auto_parallel : 0.000021s : 0.00% optimize.opt_a.parallel : 0.000035s : 0.00% optimize.opt_a.flash_sp : 0.000016s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000026s : 0.00% optimize.opt_a.virtual_dataset : 0.000021s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000020s : 0.00% optimize.opt_a.virtual_output : 0.000021s : 0.00% optimize.opt_a.merge_forward : 0.000012s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000025s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000047s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000031s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.00% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000034s : 0.00% optimize.opt_a.a_after_grad : 0.000033s : 0.00% optimize.opt_a.renormalize : 0.005704s : 0.71% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000044s : 0.01% optimize.opt_a.cse : 0.000090s : 0.01% optimize.opt_a.a_3 : 0.000167s : 0.02% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000032s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000906s : 0.11% optimize.opt_b.b_1 : 0.000237s : 0.03% optimize.opt_b.b_2 : 0.000013s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000044s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000040s : 0.00% optimize.loop_unroll : 0.000588s : 0.07% optimize.opt_after_cconv.c_1 : 0.000051s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000033s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000028s : 0.00% optimize.tuple_transform.d_1 : 0.000112s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000012s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000067s : 0.01% optimize.cse_after_recomputation.cse : 0.000024s : 0.00% optimize.environ_conv : 0.000014s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000027s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000083s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000013s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000017s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.00% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000710s : 0.09% validate : 0.000088s : 0.01% Time group info: ------[substitution.] 0.000336 29 19.49% : 0.000065s : 2: substitution.elim_not_effective 0.62% : 0.000002s : 2: substitution.fold_const_symbol 2.57% : 0.000009s : 7: substitution.graph_param_transform 66.71% : 0.000224s : 4: substitution.inline 1.89% : 0.000006s : 4: substitution.j_node_and_user_rematch 1.97% : 0.000007s : 4: substitution.remove_not_recompute_node 2.45% : 0.000008s : 4: substitution.replace_old_param 4.31% : 0.000014s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.793574 2 99.53% : 0.789842s : 1: type_inference.infer 0.47% : 0.003733s : 1: type_inference.specialize ------[replace.] 0.000079 6 75.28% : 0.000060s : 4: replace.inline 24.72% : 0.000020s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000233 6 94.81% : 0.000221s : 4: match.inline 5.19% : 0.000012s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000293 1896 1.04% : 0.000003s : 18: predicate.accumulaten_eliminater 0.84% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.62% : 0.000002s : 14: predicate.addn_check_dump 0.80% : 0.000002s : 18: predicate.addn_zero_filter 0.70% : 0.000002s : 18: predicate.adjust_all_reduce_mul_add 2.33% : 0.000007s : 32: predicate.arithmetic_simplify 0.94% : 0.000003s : 18: predicate.cast_eliminate 0.63% : 0.000002s : 14: predicate.check_bprop_eliminate 0.57% : 0.000002s : 14: predicate.compare_switch_simplify 0.25% : 0.000001s : 7: predicate.const_output_eliminate 0.57% : 0.000002s : 14: predicate.depend_value_elim 0.83% : 0.000002s : 18: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 18: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 18: predicate.dict_set_item_eliminator 1.07% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 7: predicate.elim_not_effective 0.44% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.02% : 0.000003s : 25: predicate.environ_add_const_eliminate 1.25% : 0.000004s : 25: predicate.environ_get_add_eliminate 0.94% : 0.000003s : 25: predicate.environ_get_depend_swap 1.64% : 0.000005s : 39: predicate.environ_get_eliminate 0.95% : 0.000003s : 25: predicate.environ_get_set_eliminate 1.10% : 0.000003s : 24: predicate.exchange_switch_depend_value 2.05% : 0.000006s : 24: predicate.float_depend_g_call 0.58% : 0.000002s : 14: predicate.float_environ_get_switch 0.86% : 0.000003s : 21: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 7: predicate.fold_const_symbol 0.84% : 0.000002s : 14: predicate.get_grad_eliminate 0.30% : 0.000001s : 7: predicate.graph_param_transform 0.62% : 0.000002s : 14: predicate.incorporate_call 0.48% : 0.000001s : 14: predicate.incorporate_call_switch 5.45% : 0.000016s : 84: predicate.inline 1.09% : 0.000003s : 14: predicate.inline_without_move 0.37% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.96% : 0.000003s : 14: predicate.less_batch_normalization 1.77% : 0.000005s : 34: predicate.list_to_tuple_eliminator_ 2.17% : 0.000006s : 52: predicate.load_eliminater 1.23% : 0.000004s : 7: predicate.loop_unroll_after_grad 2.42% : 0.000007s : 49: predicate.loop_unroll_before_grad 1.72% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.83% : 0.000002s : 14: predicate.merge_addn 0.69% : 0.000002s : 14: predicate.micro_step_allgather_replace 0.95% : 0.000003s : 14: predicate.mini_step_allgather_replace 0.67% : 0.000002s : 18: predicate.minmaximum_grad 1.65% : 0.000005s : 7: predicate.mutable_eliminate 0.39% : 0.000001s : 7: predicate.opt_reshape 0.62% : 0.000002s : 7: predicate.parallel_virtual_node 1.90% : 0.000006s : 24: predicate.partial_defer_inline 1.19% : 0.000003s : 27: predicate.partial_eliminate 0.82% : 0.000002s : 18: predicate.print_const_string_wrapper 0.71% : 0.000002s : 14: predicate.reduce_all_const_elim 1.04% : 0.000003s : 18: predicate.reduce_eliminate 2.32% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.67% : 0.000002s : 14: predicate.remove_not_recompute_node 1.65% : 0.000005s : 34: predicate.replace_applicator 0.60% : 0.000002s : 14: predicate.replace_old_param 0.47% : 0.000001s : 7: predicate.reset_defer_inline 0.98% : 0.000003s : 18: predicate.reshape_eliminate 0.78% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 7: predicate.row_tensor_eliminate 0.89% : 0.000003s : 14: predicate.same_eliminate 0.57% : 0.000002s : 14: predicate.set_cell_output_no_recompute 0.96% : 0.000003s : 14: predicate.shard_identity_eliminate 0.65% : 0.000002s : 14: predicate.special_op_eliminate 0.64% : 0.000002s : 14: predicate.specialize_transform 1.16% : 0.000003s : 14: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 14: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.20% : 0.000004s : 24: predicate.switch_defer_inline 1.74% : 0.000005s : 38: predicate.switch_layer_defer_inline 5.03% : 0.000015s : 94: predicate.switch_simplify 0.78% : 0.000002s : 18: predicate.tile_eliminate 1.00% : 0.000003s : 18: predicate.transpose_eliminate 1.56% : 0.000005s : 32: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.07% : 0.000009s : 48: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 32: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000007s : 46: predicate.tuple_list_set_item_eliminator 1.62% : 0.000005s : 34: predicate.tuple_to_list_eliminator_ 2.41% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 2.52% : 0.000007s : 66: predicate.updatestate_useless_node_eliminater 0.52% : 0.000002s : 7: predicate.value_based_eliminate 0.96% : 0.000003s : 14: predicate.virtual_dataset_eliminate 0.70% : 0.000002s : 14: predicate.virtual_output_eliminate 0.28% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.007106 41 76.66% : 0.005448s : 35: func_graph_cloner_run.FuncGraphClonerGraph 23.34% : 0.001659s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 1.051318 192 0.00% : 0.000005s : 1: ForceFp32Comm 10.64% : 0.111810s : 1: add_attr 10.63% : 0.111792s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.01% : 0.000071s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000088s : 1: auto_monad 0.00% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.07% : 0.000729s : 1: bootstrap 0.00% : 0.000044s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.00% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000040s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000008s : 1: detach_backward 0.00% : 0.000019s : 1: environ_conv 0.00% : 0.000037s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000007s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.06% : 0.000600s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.09% : 0.000922s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000037s : 1: opt.transform.mutable_eliminate 0.17% : 0.001802s : 78: opt.transform.opt_a 0.00% : 0.000050s : 1: opt.transform.opt_after_cconv 0.00% : 0.000048s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000215s : 28: opt.transform.opt_b 0.01% : 0.000121s : 2: opt.transform.opt_trans_graph 0.01% : 0.000124s : 4: opt.transform.symbol_engine_opt 0.83% : 0.008689s : 1: opt_a 0.01% : 0.000155s : 1: opt_after_cconv 0.07% : 0.000726s : 1: opt_after_jit_grad 0.04% : 0.000373s : 1: opt_b 1.15% : 0.012081s : 1: optimize 0.00% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000031s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.01% : 0.000068s : 1: pre_auto_parallel 0.00% : 0.000016s : 1: py_interpret_to_execute 0.00% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000032s : 1: remove_dup_value 0.35% : 0.003654s : 1: renormalize.infer 0.19% : 0.002029s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000036s : 1: rewriter_after_opt_a 0.04% : 0.000389s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000182s : 1: symbol_engine_optimizer 0.02% : 0.000160s : 1: tuple_transform 75.50% : 0.793763s : 1: type_inference TotalTime = 2.27597, [21] [bootstrap]: 0.00070266 [type_inference]: 1.63394 [event_method]: 0.0147161 [auto_monad]: 0.00036276 [graph_reusing]: 1.302e-05 [inline]: 3.23e-06 [add_attr]: 0.0152399, [1] [add_attr_with_inline]: 0.0152251, [1] [Cycle 1]: 0.0001947, [2] [tag_attr]: 0.00011834 [meta_addattr_fg_expand]: 1.963e-05 [parallel-infer-symbol]: 5.79e-06 [pre_auto_parallel]: 0.00011169 [insert-virtual-dataset]: 3.60998e-06 [parallel-infer-symbol-second]: 1.02e-06 [dataset_repeat_opt]: 2.08002e-06 [pipeline_split]: 3.28e-06 [optimize]: 0.60981, [53] [py_interpret_to_execute]: 1.377e-05 [rewriter_before_opt_a]: 0.0009028 [opt_a]: 0.533866, [3] [Cycle 1]: 0.476607, [45] [expand_dump_flag]: 8.46002e-06 [switch_simplify]: 0.00027607 [loop_unroll]: 0.00011792 [a_1]: 0.0494873 [with_stream_mark]: 5.525e-05 [recompute_prepare]: 4.987e-05 [updatestate_depend_eliminate]: 1.531e-05 [updatestate_assign_eliminate]: 1.151e-05 [updatestate_loads_eliminate]: 9.64e-06 [parameter_eliminate]: 3.83001e-06 [a_2]: 0.00036462 [accelerated_algorithm]: 5.972e-05 [shard]: 3.09001e-06 [meta_shard_fg_expand]: 1.576e-05 [shard_inline]: 3.362e-05 [merge_send_recv]: 2.577e-05 [auto_parallel]: 1.972e-05 [parallel]: 4.806e-05 [flash_sp]: 5.336e-05 [merge_comm]: 3.926e-05 [allreduce_fusion]: 1.204e-05 [matmul_add_comm_reduction]: 0.00104183 [allreduce_slice_to_reducescatter]: 0.00012479 [virtual_shard_identity]: 7.595e-05 [virtual_dataset]: 3.671e-05 [get_grad_eliminate_]: 3.371e-05 [virtual_output]: 3.035e-05 [merge_forward]: 1.739e-05 [cell_reuse_recompute_pass]: 3.60998e-06 [offload_activation]: 2.602e-05 [cell_reuse_handle_not_recompute_node_pass]: 6.199e-05 [merge_recompute_call_nodes]: 2.49001e-06 [before_grad]: 5.041e-05 [set_forward_comm_id_for_comm_node_pass]: 1.713e-05 [meta_fg_expand]: 0.00292296 [flash_sp_send_recv_attached]: 9.37001e-06 [receive_attached]: 2.96999e-06 [after_resolve]: 0.00011118 [a_after_grad]: 0.00012582 [renormalize]: 0.402642 [add_forward_monad_depend]: 3.731e-05 [auto_monad_grad]: 2.066e-05 [auto_monad_eliminator]: 0.00017605 [cse]: 0.00049111 [a_3]: 0.00065957 [Cycle 2]: 0.0557111, [45] [expand_dump_flag]: 4.84003e-06 [switch_simplify]: 0.00017705 [loop_unroll]: 8.127e-05 [a_1]: 0.0293701 [with_stream_mark]: 4.444e-05 [recompute_prepare]: 2.585e-05 [updatestate_depend_eliminate]: 9.92999e-06 [updatestate_assign_eliminate]: 8.67e-06 [updatestate_loads_eliminate]: 6.23002e-06 [parameter_eliminate]: 2.89001e-06 [a_2]: 0.00022641 [accelerated_algorithm]: 2.266e-05 [shard]: 2.19999e-06 [meta_shard_fg_expand]: 9.97999e-06 [shard_inline]: 1.598e-05 [merge_send_recv]: 1.428e-05 [auto_parallel]: 1.399e-05 [parallel]: 1.3e-05 [flash_sp]: 6.59001e-06 [merge_comm]: 8.17e-06 [allreduce_fusion]: 6.78e-06 [matmul_add_comm_reduction]: 1.524e-05 [allreduce_slice_to_reducescatter]: 1.76998e-06 [virtual_shard_identity]: 1.943e-05 [virtual_dataset]: 1.537e-05 [get_grad_eliminate_]: 1.462e-05 [virtual_output]: 1.415e-05 [merge_forward]: 8.82e-06 [cell_reuse_recompute_pass]: 2.08002e-06 [offload_activation]: 1.588e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.885e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 2.409e-05 [set_forward_comm_id_for_comm_node_pass]: 7.46001e-06 [meta_fg_expand]: 0.0004249 [flash_sp_send_recv_attached]: 3.06001e-06 [receive_attached]: 2.90002e-06 [after_resolve]: 3.622e-05 [a_after_grad]: 2.562e-05 [renormalize]: 0.0241788 [add_forward_monad_depend]: 8.62998e-06 [auto_monad_grad]: 3.61999e-06 [auto_monad_eliminator]: 2.845e-05 [cse]: 0.00022588 [a_3]: 0.00012558 [Cycle 3]: 0.00152306, [45] [expand_dump_flag]: 2.79001e-06 [switch_simplify]: 1.848e-05 [loop_unroll]: 1.467e-05 [a_1]: 0.00047559 [with_stream_mark]: 2.458e-05 [recompute_prepare]: 1.556e-05 [updatestate_depend_eliminate]: 8.07e-06 [updatestate_assign_eliminate]: 6.34001e-06 [updatestate_loads_eliminate]: 5.91e-06 [parameter_eliminate]: 1.74e-06 [a_2]: 0.00021021 [accelerated_algorithm]: 2.033e-05 [shard]: 2.38998e-06 [meta_shard_fg_expand]: 3.50998e-06 [shard_inline]: 1.396e-05 [merge_send_recv]: 4.126e-05 [auto_parallel]: 1.511e-05 [parallel]: 9.67001e-06 [flash_sp]: 1.86998e-06 [merge_comm]: 6.88e-06 [allreduce_fusion]: 6.33e-06 [matmul_add_comm_reduction]: 1.338e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 1.776e-05 [virtual_dataset]: 1.615e-05 [get_grad_eliminate_]: 1.546e-05 [virtual_output]: 1.468e-05 [merge_forward]: 7.63001e-06 [cell_reuse_recompute_pass]: 4.41002e-06 [offload_activation]: 1.544e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.473e-05 [merge_recompute_call_nodes]: 1.91998e-06 [before_grad]: 2.118e-05 [set_forward_comm_id_for_comm_node_pass]: 6.89999e-06 [meta_fg_expand]: 5.10001e-06 [flash_sp_send_recv_attached]: 2.04e-06 [receive_attached]: 2.84999e-06 [after_resolve]: 2.207e-05 [a_after_grad]: 2.281e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.49001e-06 [auto_monad_grad]: 3.06001e-06 [auto_monad_eliminator]: 1.786e-05 [cse]: 5.54e-05 [a_3]: 9.446e-05 [py_interpret_to_execute_after_opt_a]: 8.3e-06 [slice_cell_reuse_recomputed_activation]: 2.44999e-06 [rewriter_after_opt_a]: 4.547e-05 [convert_after_rewriter]: 1.30999e-06 [order_py_execute_after_rewriter]: 1.17999e-06 [mutable_eliminate]: 0.072563 [opt_b]: 0.00061372, [1] [Cycle 1]: 0.00060189, [7] [b_1]: 0.00039345 [b_2]: 1.857e-05 [updatestate_depend_eliminate]: 1.934e-05 [updatestate_assign_eliminate]: 6.07001e-06 [updatestate_loads_eliminate]: 5.96998e-06 [renormalize]: 9.70002e-07 [cse]: 0.00011389 [optimize_parallel_all_gather_comm]: 3.202e-05 [overlap_param_gather]: 1.97999e-06 [cconv]: 4.258e-05 [loop_unroll]: 0.00054149 [opt_after_cconv]: 0.00020796, [1] [Cycle 1]: 0.0002013, [7] [c_1]: 7.614e-05 [parameter_eliminate]: 6.35997e-06 [updatestate_depend_eliminate]: 1.019e-05 [updatestate_assign_eliminate]: 5.42999e-06 [updatestate_loads_eliminate]: 5.19e-06 [cse]: 6.04e-05 [renormalize]: 7.2e-07 [remove_dup_value]: 9.026e-05 [tuple_transform]: 0.00018009, [1] [Cycle 1]: 0.00017507, [4] [d_1]: 0.00013519 [none_parameter_eliminate]: 2.25002e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 1.514e-05 [partial_unused_args_eliminate]: 1.97999e-06 [add_recomputation]: 8.388e-05 [cse_after_recomputation]: 4.518e-05, [1] [Cycle 1]: 4.072e-05, [1] [cse]: 3.482e-05 [environ_conv]: 1.962e-05 [swap_dp_allreduce_reducescatter]: 1.017e-05 [bias_add_comm_swap]: 3.31001e-06 [label_micro_interleaved_index]: 6.79001e-06 [label_fine_grained_interleaved_index]: 2.41e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 2.14999e-06 [micro_interleaved_order_control]: 2.76999e-06 [assign_add_opt]: 1.26002e-06 [ForceFp32Comm]: 9.00007e-07 [remove_cast_before_assign_add]: 1.32e-06 [full_micro_interleaved_order_control]: 2.91e-06 [reorder_send_recv_between_fp_bp]: 3.08998e-06 [comm_op_add_attrs]: 9.90025e-07 [add_comm_op_reuse_tag]: 1.12e-06 [interleave_split_concat_branches]: 1.49e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 1.89e-06 [control_data_broadcast_order]: 2.102e-05 [grouped_pairwise_exchange_alltoall]: 1.53002e-06 [offloading_packed_experts]: 7.46999e-06 [overlap_recompute_and_grad_model_parallel]: 7.31001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29003e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47999e-06 [overlap_recompute_comm]: 2.56e-06 [overlap_grad_ring_attention]: 7.25e-06 [overlap_grad_flash_sp]: 3.245e-05 [begin_end_overlap_inline]: 6.39993e-07 [split_matmul_comm_elemetwise]: 2.38998e-06 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 1.09e-06 [symbol_engine_optimizer]: 0.00013035, [1] [Cycle 1]: 0.0001253, [6] [build]: 1.463e-05 [elim_shapecalc]: 1.824e-05 [elim_not_effective]: 2.469e-05 [opt_reshape]: 1.482e-05 [fold_const_symbol]: 2.149e-05 [renormalize]: 1.50001e-07 [detach_backward]: 2.19999e-06 [pipeline_parallel_scheduler]: 1.47999e-06 [auto_monad_reorder]: 3.165e-05 [get_jit_bprop_graph]: 2.06e-06 [rewriter_after_jit_bprop_graph]: 9.09e-06 [opt_after_jit_grad]: 0.00062381 [validate]: 9.221e-05 Sums bootstrap : 0.000703s : 0.03% type_inference : 1.633939s : 72.87% event_method : 0.014716s : 0.66% auto_monad : 0.000363s : 0.02% graph_reusing : 0.000013s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000118s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000020s : 0.00% parallel-infer-symbol : 0.000006s : 0.00% pre_auto_parallel : 0.000112s : 0.00% insert-virtual-dataset : 0.000004s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000003s : 0.00% optimize.py_interpret_to_execute : 0.000014s : 0.00% optimize.rewriter_before_opt_a : 0.000903s : 0.04% optimize.opt_a.expand_dump_flag : 0.000016s : 0.00% optimize.opt_a.switch_simplify : 0.000472s : 0.02% optimize.opt_a.loop_unroll : 0.000214s : 0.01% optimize.opt_a.a_1 : 0.079333s : 3.54% optimize.opt_a.with_stream_mark : 0.000124s : 0.01% optimize.opt_a.recompute_prepare : 0.000091s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000033s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000027s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000022s : 0.00% optimize.opt_a.parameter_eliminate : 0.000008s : 0.00% optimize.opt_a.a_2 : 0.000801s : 0.04% optimize.opt_a.accelerated_algorithm : 0.000103s : 0.00% optimize.opt_a.shard : 0.000008s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000029s : 0.00% optimize.opt_a.shard_inline : 0.000064s : 0.00% optimize.opt_a.merge_send_recv : 0.000081s : 0.00% optimize.opt_a.auto_parallel : 0.000049s : 0.00% optimize.opt_a.parallel : 0.000071s : 0.00% optimize.opt_a.flash_sp : 0.000062s : 0.00% optimize.opt_a.merge_comm : 0.000054s : 0.00% optimize.opt_a.allreduce_fusion : 0.000025s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.001070s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000127s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000113s : 0.01% optimize.opt_a.virtual_dataset : 0.000068s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000064s : 0.00% optimize.opt_a.virtual_output : 0.000059s : 0.00% optimize.opt_a.merge_forward : 0.000034s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000010s : 0.00% optimize.opt_a.offload_activation : 0.000057s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000116s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000006s : 0.00% optimize.opt_a.before_grad : 0.000096s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000031s : 0.00% optimize.opt_a.meta_fg_expand : 0.003353s : 0.15% optimize.opt_a.flash_sp_send_recv_attached : 0.000014s : 0.00% optimize.opt_a.receive_attached : 0.000009s : 0.00% optimize.opt_a.after_resolve : 0.000169s : 0.01% optimize.opt_a.a_after_grad : 0.000174s : 0.01% optimize.opt_a.renormalize : 0.426821s : 19.04% optimize.opt_a.add_forward_monad_depend : 0.000048s : 0.00% optimize.opt_a.auto_monad_grad : 0.000027s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000222s : 0.01% optimize.opt_a.cse : 0.000772s : 0.03% optimize.opt_a.a_3 : 0.000880s : 0.04% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000045s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.072563s : 3.24% optimize.opt_b.b_1 : 0.000393s : 0.02% optimize.opt_b.b_2 : 0.000019s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000019s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000114s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000032s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000043s : 0.00% optimize.loop_unroll : 0.000541s : 0.02% optimize.opt_after_cconv.c_1 : 0.000076s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.cse : 0.000060s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000090s : 0.00% optimize.tuple_transform.d_1 : 0.000135s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000015s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000084s : 0.00% optimize.cse_after_recomputation.cse : 0.000035s : 0.00% optimize.environ_conv : 0.000020s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000021s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000032s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000015s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000025s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000015s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000021s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000032s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000009s : 0.00% opt_after_jit_grad : 0.000624s : 0.03% validate : 0.000092s : 0.00% Time group info: ------[substitution.] 0.003459 309 0.08% : 0.000003s : 5: substitution.elim_not_effective 0.64% : 0.000022s : 13: substitution.float_depend_g_call 0.57% : 0.000020s : 9: substitution.float_tuple_getitem_switch 0.07% : 0.000003s : 5: substitution.fold_const_symbol 0.29% : 0.000010s : 11: substitution.graph_param_transform 0.12% : 0.000004s : 2: substitution.incorporate_call 0.07% : 0.000003s : 2: substitution.incorporate_call_switch 79.54% : 0.002751s : 26: substitution.inline 0.73% : 0.000025s : 2: substitution.inline_without_move 0.45% : 0.000015s : 20: substitution.j_node_and_user_rematch 0.78% : 0.000027s : 3: substitution.less_batch_normalization 0.78% : 0.000027s : 16: substitution.minmaximum_grad 0.50% : 0.000017s : 13: substitution.partial_eliminate 0.59% : 0.000020s : 20: substitution.remove_not_recompute_node 1.33% : 0.000046s : 11: substitution.replace_applicator 0.59% : 0.000020s : 22: substitution.replace_old_param 0.17% : 0.000006s : 1: substitution.set_cell_output_no_recompute 1.01% : 0.000035s : 6: substitution.switch_simplify 2.94% : 0.000102s : 20: substitution.tuple_list_convert_item_index_to_positive 0.84% : 0.000029s : 20: substitution.tuple_list_get_item_const_eliminator 1.47% : 0.000051s : 20: substitution.tuple_list_get_item_depend_reorder 5.18% : 0.000179s : 42: substitution.tuple_list_get_item_eliminator 1.28% : 0.000044s : 20: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 1.607141 2 96.13% : 1.544937s : 1: type_inference.infer 3.87% : 0.062203s : 1: type_inference.specialize ------[replace.] 0.000861 47 58.16% : 0.000501s : 26: replace.inline 15.12% : 0.000130s : 6: replace.switch_simplify 26.72% : 0.000230s : 15: replace.tuple_list_get_item_eliminator ------[match.] 0.002828 47 96.43% : 0.002726s : 26: match.inline 1.05% : 0.000030s : 6: match.switch_simplify 2.52% : 0.000071s : 15: match.tuple_list_get_item_eliminator ------[predicate.] 0.001229 8191 0.99% : 0.000012s : 96: predicate.accumulaten_eliminater 0.31% : 0.000004s : 11: predicate.ad_related_special_op_eliminate 0.44% : 0.000005s : 45: predicate.addn_check_dump 1.03% : 0.000013s : 96: predicate.addn_zero_filter 0.92% : 0.000011s : 96: predicate.adjust_all_reduce_mul_add 2.08% : 0.000026s : 141: predicate.arithmetic_simplify 1.04% : 0.000013s : 96: predicate.cast_eliminate 1.04% : 0.000013s : 103: predicate.check_bprop_eliminate 0.43% : 0.000005s : 45: predicate.compare_switch_simplify 0.09% : 0.000001s : 12: predicate.const_output_eliminate 0.54% : 0.000007s : 45: predicate.depend_value_elim 1.12% : 0.000014s : 96: predicate.dict_get_item_const_eliminator 1.32% : 0.000016s : 96: predicate.dict_get_item_eliminator 0.97% : 0.000012s : 96: predicate.dict_set_item_eliminator 0.35% : 0.000004s : 23: predicate.dumpgradient_eliminate 0.07% : 0.000001s : 11: predicate.elim_not_effective 0.13% : 0.000002s : 11: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000014s : 108: predicate.environ_add_const_eliminate 1.03% : 0.000013s : 108: predicate.environ_get_add_eliminate 1.05% : 0.000013s : 108: predicate.environ_get_depend_swap 1.48% : 0.000018s : 153: predicate.environ_get_eliminate 1.05% : 0.000013s : 108: predicate.environ_get_set_eliminate 1.43% : 0.000018s : 137: predicate.exchange_switch_depend_value 2.70% : 0.000033s : 137: predicate.float_depend_g_call 0.51% : 0.000006s : 45: predicate.float_environ_get_switch 0.68% : 0.000008s : 57: predicate.float_tuple_getitem_switch 0.07% : 0.000001s : 11: predicate.fold_const_symbol 0.59% : 0.000007s : 45: predicate.get_grad_eliminate 0.12% : 0.000002s : 11: predicate.graph_param_transform 0.44% : 0.000005s : 45: predicate.incorporate_call 0.41% : 0.000005s : 45: predicate.incorporate_call_switch 5.23% : 0.000064s : 347: predicate.inline 1.20% : 0.000015s : 83: predicate.inline_without_move 0.32% : 0.000004s : 45: predicate.j_node_and_user_rematch 0.64% : 0.000008s : 45: predicate.less_batch_normalization 1.65% : 0.000020s : 134: predicate.list_to_tuple_eliminator_ 2.25% : 0.000028s : 231: predicate.load_eliminater 0.27% : 0.000003s : 12: predicate.loop_unroll_after_grad 2.71% : 0.000033s : 222: predicate.loop_unroll_before_grad 1.25% : 0.000015s : 120: predicate.make_slice_get_slice_eliminator 0.46% : 0.000006s : 45: predicate.merge_addn 1.04% : 0.000013s : 103: predicate.micro_step_allgather_replace 1.09% : 0.000013s : 103: predicate.mini_step_allgather_replace 0.98% : 0.000012s : 96: predicate.minmaximum_grad 0.77% : 0.000010s : 12: predicate.mutable_eliminate 0.13% : 0.000002s : 11: predicate.opt_reshape 0.16% : 0.000002s : 12: predicate.parallel_virtual_node 4.54% : 0.000056s : 137: predicate.partial_defer_inline 1.46% : 0.000018s : 123: predicate.partial_eliminate 1.04% : 0.000013s : 96: predicate.print_const_string_wrapper 0.47% : 0.000006s : 45: predicate.reduce_all_const_elim 1.35% : 0.000017s : 96: predicate.reduce_eliminate 2.52% : 0.000031s : 231: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000004s : 45: predicate.remove_not_recompute_node 2.15% : 0.000026s : 214: predicate.replace_applicator 0.65% : 0.000008s : 83: predicate.replace_old_param 0.12% : 0.000002s : 12: predicate.reset_defer_inline 0.99% : 0.000012s : 96: predicate.reshape_eliminate 1.13% : 0.000014s : 103: predicate.row_tensor_add_zeros_like 0.14% : 0.000002s : 12: predicate.row_tensor_eliminate 1.30% : 0.000016s : 103: predicate.same_eliminate 0.39% : 0.000005s : 45: predicate.set_cell_output_no_recompute 0.80% : 0.000010s : 45: predicate.shard_identity_eliminate 0.27% : 0.000003s : 23: predicate.special_op_eliminate 0.53% : 0.000007s : 45: predicate.specialize_transform 1.34% : 0.000016s : 103: predicate.split_environ_get_set_with_tuple_value 1.09% : 0.000013s : 83: predicate.stack_unstack_eliminate 0.14% : 0.000002s : 12: predicate.switch_call_monad_eliminater 1.59% : 0.000020s : 137: predicate.switch_defer_inline 2.65% : 0.000033s : 240: predicate.switch_layer_defer_inline 5.63% : 0.000069s : 427: predicate.switch_simplify 0.98% : 0.000012s : 96: predicate.tile_eliminate 1.03% : 0.000013s : 96: predicate.transpose_eliminate 1.39% : 0.000017s : 119: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000020s : 119: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000017s : 119: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000041s : 179: predicate.tuple_list_get_item_eliminator 1.88% : 0.000023s : 119: predicate.tuple_list_get_set_item_eliminator 2.00% : 0.000025s : 164: predicate.tuple_list_set_item_eliminator 1.51% : 0.000019s : 134: predicate.tuple_to_list_eliminator_ 2.15% : 0.000026s : 231: predicate.updatestate_pure_node_eliminater 2.80% : 0.000034s : 276: predicate.updatestate_useless_node_eliminater 0.14% : 0.000002s : 12: predicate.value_based_eliminate 0.68% : 0.000008s : 45: predicate.virtual_dataset_eliminate 0.51% : 0.000006s : 45: predicate.virtual_output_eliminate 0.11% : 0.000001s : 11: predicate.virtual_view_grad_eliminate 0.14% : 0.000002s : 12: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.052407 90 93.58% : 0.049040s : 60: func_graph_cloner_run.FuncGraphClonerGraph 6.42% : 0.003367s : 30: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.410992 233 0.00% : 0.000004s : 1: ForceFp32Comm 0.45% : 0.015248s : 1: add_attr 0.45% : 0.015230s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000088s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000380s : 1: auto_monad 0.00% : 0.000036s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.02% : 0.000748s : 1: bootstrap 0.00% : 0.000046s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.00% : 0.000024s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000048s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000024s : 1: environ_conv 0.43% : 0.014773s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000017s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.02% : 0.000551s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 2.13% : 0.072590s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.00% : 0.000027s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000063s : 1: opt.transform.mutable_eliminate 2.42% : 0.082599s : 117: opt.transform.opt_a 0.00% : 0.000075s : 1: opt.transform.opt_after_cconv 0.00% : 0.000056s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000374s : 28: opt.transform.opt_b 0.00% : 0.000148s : 2: opt.transform.opt_trans_graph 0.00% : 0.000075s : 4: opt.transform.symbol_engine_opt 15.65% : 0.533871s : 1: opt_a 0.01% : 0.000212s : 1: opt_after_cconv 0.02% : 0.000635s : 1: opt_after_jit_grad 0.02% : 0.000620s : 1: opt_b 17.88% : 0.609817s : 1: optimize 0.00% : 0.000036s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000036s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.00% : 0.000117s : 1: pre_auto_parallel 0.00% : 0.000018s : 1: py_interpret_to_execute 0.00% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000095s : 1: remove_dup_value 9.54% : 0.325536s : 2: renormalize.infer 2.97% : 0.101235s : 2: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000049s : 1: rewriter_after_opt_a 0.03% : 0.000916s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.00% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000134s : 1: symbol_engine_optimizer 0.01% : 0.000183s : 1: tuple_transform 47.90% : 1.633991s : 1: type_inference TotalTime = 2.11482, [21] [bootstrap]: 0.00066871 [type_inference]: 1.79792 [event_method]: 0.00049919 [auto_monad]: 8.573e-05 [graph_reusing]: 6.96001e-06 [inline]: 3.16001e-06 [add_attr]: 0.00424643, [1] [add_attr_with_inline]: 0.00423291, [1] [Cycle 1]: 0.0001019, [2] [tag_attr]: 4.907e-05 [meta_addattr_fg_expand]: 5.96e-06 [parallel-infer-symbol]: 5.15999e-06 [pre_auto_parallel]: 4.813e-05 [insert-virtual-dataset]: 3.83001e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 1.83002e-06 [pipeline_split]: 2.74001e-06 [optimize]: 0.304408, [53] [py_interpret_to_execute]: 8.41002e-06 [rewriter_before_opt_a]: 0.00034306 [opt_a]: 0.220554, [2] [Cycle 1]: 0.219512, [45] [expand_dump_flag]: 4e-06 [switch_simplify]: 6.017e-05 [loop_unroll]: 4.14e-05 [a_1]: 0.00082435 [with_stream_mark]: 2.337e-05 [recompute_prepare]: 1.51e-05 [updatestate_depend_eliminate]: 5.41002e-06 [updatestate_assign_eliminate]: 3.65e-06 [updatestate_loads_eliminate]: 3.78001e-06 [parameter_eliminate]: 2.22999e-06 [a_2]: 0.00014647 [accelerated_algorithm]: 1.28e-05 [shard]: 2.15002e-06 [meta_shard_fg_expand]: 3.26001e-06 [shard_inline]: 1.133e-05 [merge_send_recv]: 1.037e-05 [auto_parallel]: 8.53001e-06 [parallel]: 1.881e-05 [flash_sp]: 1.144e-05 [merge_comm]: 4e-06 [allreduce_fusion]: 3.82002e-06 [matmul_add_comm_reduction]: 1.104e-05 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 1.573e-05 [virtual_dataset]: 1.376e-05 [get_grad_eliminate_]: 1.333e-05 [virtual_output]: 1.104e-05 [merge_forward]: 6.17999e-06 [cell_reuse_recompute_pass]: 1.73002e-06 [offload_activation]: 1.099e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.953e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.715e-05 [set_forward_comm_id_for_comm_node_pass]: 4.68999e-06 [meta_fg_expand]: 4.10998e-06 [flash_sp_send_recv_attached]: 2.99999e-06 [receive_attached]: 3.04001e-06 [after_resolve]: 1.889e-05 [a_after_grad]: 1.821e-05 [renormalize]: 0.217607 [add_forward_monad_depend]: 1.352e-05 [auto_monad_grad]: 2.89001e-06 [auto_monad_eliminator]: 2.933e-05 [cse]: 5.57e-05 [a_3]: 9.244e-05 [Cycle 2]: 0.00102533, [45] [expand_dump_flag]: 2.58e-06 [switch_simplify]: 1.331e-05 [loop_unroll]: 1.067e-05 [a_1]: 0.00025922 [with_stream_mark]: 2.34e-05 [recompute_prepare]: 1.17e-05 [updatestate_depend_eliminate]: 4.66002e-06 [updatestate_assign_eliminate]: 3.70998e-06 [updatestate_loads_eliminate]: 4.18999e-06 [parameter_eliminate]: 2.56e-06 [a_2]: 0.00012351 [accelerated_algorithm]: 1.212e-05 [shard]: 2.99999e-06 [meta_shard_fg_expand]: 2.56e-06 [shard_inline]: 1.004e-05 [merge_send_recv]: 1.044e-05 [auto_parallel]: 1.03e-05 [parallel]: 1.111e-05 [flash_sp]: 4.94e-06 [merge_comm]: 3.76999e-06 [allreduce_fusion]: 4.27e-06 [matmul_add_comm_reduction]: 1.101e-05 [allreduce_slice_to_reducescatter]: 1.02e-06 [virtual_shard_identity]: 1.26e-05 [virtual_dataset]: 1.051e-05 [get_grad_eliminate_]: 9.44998e-06 [virtual_output]: 9.79999e-06 [merge_forward]: 5.12999e-06 [cell_reuse_recompute_pass]: 3.76001e-06 [offload_activation]: 1.17e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.245e-05 [merge_recompute_call_nodes]: 1.81998e-06 [before_grad]: 1.44e-05 [set_forward_comm_id_for_comm_node_pass]: 4.68001e-06 [meta_fg_expand]: 3.45998e-06 [flash_sp_send_recv_attached]: 2.11e-06 [receive_attached]: 2.18002e-06 [after_resolve]: 2.003e-05 [a_after_grad]: 1.621e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.31e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 1.216e-05 [cse]: 2.846e-05 [a_3]: 6.232e-05 [py_interpret_to_execute_after_opt_a]: 1.093e-05 [slice_cell_reuse_recomputed_activation]: 2.02001e-06 [rewriter_after_opt_a]: 2.815e-05 [convert_after_rewriter]: 1.27999e-06 [order_py_execute_after_rewriter]: 2.32999e-06 [mutable_eliminate]: 0.00084482 [opt_b]: 0.00034681, [1] [Cycle 1]: 0.0003369, [7] [b_1]: 0.00022247 [b_2]: 1.147e-05 [updatestate_depend_eliminate]: 1.151e-05 [updatestate_assign_eliminate]: 3.88999e-06 [updatestate_loads_eliminate]: 3.09999e-06 [renormalize]: 1.13001e-06 [cse]: 4.207e-05 [optimize_parallel_all_gather_comm]: 2.154e-05 [overlap_param_gather]: 2.27999e-06 [cconv]: 3.607e-05 [loop_unroll]: 0.00052801 [opt_after_cconv]: 0.00015439, [1] [Cycle 1]: 0.00014772, [7] [c_1]: 5.731e-05 [parameter_eliminate]: 5.91e-06 [updatestate_depend_eliminate]: 8.00999e-06 [updatestate_assign_eliminate]: 3.14001e-06 [updatestate_loads_eliminate]: 2.81e-06 [cse]: 3.271e-05 [renormalize]: 6.30011e-07 [remove_dup_value]: 2.41e-05 [tuple_transform]: 0.00012424, [1] [Cycle 1]: 0.00011911, [4] [d_1]: 8.344e-05 [none_parameter_eliminate]: 2.09999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 1.078e-05 [partial_unused_args_eliminate]: 2.27999e-06 [add_recomputation]: 6.921e-05 [cse_after_recomputation]: 3.431e-05, [1] [Cycle 1]: 2.89e-05, [1] [cse]: 2.167e-05 [environ_conv]: 1.462e-05 [swap_dp_allreduce_reducescatter]: 6.84999e-06 [bias_add_comm_swap]: 3.40998e-06 [label_micro_interleaved_index]: 7e-06 [label_fine_grained_interleaved_index]: 3.11001e-06 [merge_cast_opt]: 1.50999e-06 [slice_recompute_activation]: 2.37001e-06 [micro_interleaved_order_control]: 2.91e-06 [assign_add_opt]: 1.40999e-06 [ForceFp32Comm]: 1.36998e-06 [remove_cast_before_assign_add]: 1.33002e-06 [full_micro_interleaved_order_control]: 2.88e-06 [reorder_send_recv_between_fp_bp]: 2.61999e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.14003e-06 [interleave_split_concat_branches]: 1.27e-06 [interleave_parallel_branches]: 1.65001e-06 [overlap_opt_shard_in_pipeline]: 1.57001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.131e-05 [control_data_broadcast_order]: 5.136e-05 [grouped_pairwise_exchange_alltoall]: 2.64999e-06 [offloading_packed_experts]: 6.83998e-06 [overlap_recompute_and_grad_model_parallel]: 6.79001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.61002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.81e-06 [overlap_recompute_comm]: 2.81e-06 [overlap_grad_ring_attention]: 4.84998e-06 [overlap_grad_flash_sp]: 3.791e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 2.14e-06 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 1.13001e-06 [symbol_engine_optimizer]: 0.00017289, [1] [Cycle 1]: 0.0001606, [6] [build]: 1.138e-05 [elim_shapecalc]: 4.277e-05 [elim_not_effective]: 2.907e-05 [opt_reshape]: 1.302e-05 [fold_const_symbol]: 1.677e-05 [renormalize]: 7.30011e-07 [detach_backward]: 2.67001e-06 [pipeline_parallel_scheduler]: 1.67999e-06 [auto_monad_reorder]: 3.046e-05 [get_jit_bprop_graph]: 2.38998e-06 [rewriter_after_jit_bprop_graph]: 9.44e-06 [opt_after_jit_grad]: 0.00657375 [validate]: 9.084e-05 Sums bootstrap : 0.000669s : 0.03% type_inference : 1.797923s : 88.62% event_method : 0.000499s : 0.02% auto_monad : 0.000086s : 0.00% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000049s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000048s : 0.00% insert-virtual-dataset : 0.000004s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000003s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.00% optimize.rewriter_before_opt_a : 0.000343s : 0.02% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000073s : 0.00% optimize.opt_a.loop_unroll : 0.000052s : 0.00% optimize.opt_a.a_1 : 0.001084s : 0.05% optimize.opt_a.with_stream_mark : 0.000047s : 0.00% optimize.opt_a.recompute_prepare : 0.000027s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000270s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000025s : 0.00% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000021s : 0.00% optimize.opt_a.merge_send_recv : 0.000021s : 0.00% optimize.opt_a.auto_parallel : 0.000019s : 0.00% optimize.opt_a.parallel : 0.000030s : 0.00% optimize.opt_a.flash_sp : 0.000016s : 0.00% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000028s : 0.00% optimize.opt_a.virtual_dataset : 0.000024s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000023s : 0.00% optimize.opt_a.virtual_output : 0.000021s : 0.00% optimize.opt_a.merge_forward : 0.000011s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000032s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.00% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000039s : 0.00% optimize.opt_a.a_after_grad : 0.000034s : 0.00% optimize.opt_a.renormalize : 0.217607s : 10.73% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000041s : 0.00% optimize.opt_a.cse : 0.000084s : 0.00% optimize.opt_a.a_3 : 0.000155s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000845s : 0.04% optimize.opt_b.b_1 : 0.000222s : 0.01% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000042s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000036s : 0.00% optimize.loop_unroll : 0.000528s : 0.03% optimize.opt_after_cconv.c_1 : 0.000057s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000033s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000024s : 0.00% optimize.tuple_transform.d_1 : 0.000083s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000069s : 0.00% optimize.cse_after_recomputation.cse : 0.000022s : 0.00% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000002s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000011s : 0.00% optimize.control_data_broadcast_order : 0.000051s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000003s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000038s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000043s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000029s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000013s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000017s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000030s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000009s : 0.00% opt_after_jit_grad : 0.006574s : 0.32% validate : 0.000091s : 0.00% Time group info: ------[substitution.] 0.000288 33 0.84% : 0.000002s : 2: substitution.elim_not_effective 0.65% : 0.000002s : 2: substitution.fold_const_symbol 3.23% : 0.000009s : 8: substitution.graph_param_transform 82.97% : 0.000239s : 4: substitution.inline 1.79% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.48% : 0.000007s : 4: substitution.remove_not_recompute_node 3.12% : 0.000009s : 6: substitution.replace_old_param 4.92% : 0.000014s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 1.797772 2 99.81% : 1.794352s : 1: type_inference.infer 0.19% : 0.003420s : 1: type_inference.specialize ------[replace.] 0.000086 7 68.19% : 0.000058s : 4: replace.inline 31.81% : 0.000027s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000248 7 94.99% : 0.000235s : 4: match.inline 5.01% : 0.000012s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000297 2146 0.77% : 0.000002s : 20: predicate.accumulaten_eliminater 1.57% : 0.000005s : 8: predicate.ad_related_special_op_eliminate 0.57% : 0.000002s : 16: predicate.addn_check_dump 0.94% : 0.000003s : 20: predicate.addn_zero_filter 0.68% : 0.000002s : 20: predicate.adjust_all_reduce_mul_add 2.11% : 0.000006s : 36: predicate.arithmetic_simplify 0.91% : 0.000003s : 20: predicate.cast_eliminate 0.61% : 0.000002s : 16: predicate.check_bprop_eliminate 0.61% : 0.000002s : 16: predicate.compare_switch_simplify 0.25% : 0.000001s : 8: predicate.const_output_eliminate 0.63% : 0.000002s : 16: predicate.depend_value_elim 0.79% : 0.000002s : 20: predicate.dict_get_item_const_eliminator 1.02% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.77% : 0.000002s : 20: predicate.dict_set_item_eliminator 1.69% : 0.000005s : 16: predicate.dumpgradient_eliminate 0.40% : 0.000001s : 8: predicate.elim_not_effective 0.76% : 0.000002s : 8: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000003s : 28: predicate.environ_add_const_eliminate 0.98% : 0.000003s : 28: predicate.environ_get_add_eliminate 0.97% : 0.000003s : 28: predicate.environ_get_depend_swap 1.84% : 0.000005s : 44: predicate.environ_get_eliminate 0.98% : 0.000003s : 28: predicate.environ_get_set_eliminate 1.08% : 0.000003s : 27: predicate.exchange_switch_depend_value 1.86% : 0.000006s : 27: predicate.float_depend_g_call 0.59% : 0.000002s : 16: predicate.float_environ_get_switch 0.90% : 0.000003s : 24: predicate.float_tuple_getitem_switch 0.22% : 0.000001s : 8: predicate.fold_const_symbol 0.74% : 0.000002s : 16: predicate.get_grad_eliminate 0.25% : 0.000001s : 8: predicate.graph_param_transform 0.56% : 0.000002s : 16: predicate.incorporate_call 0.51% : 0.000002s : 16: predicate.incorporate_call_switch 5.39% : 0.000016s : 95: predicate.inline 0.89% : 0.000003s : 16: predicate.inline_without_move 0.40% : 0.000001s : 16: predicate.j_node_and_user_rematch 0.87% : 0.000003s : 16: predicate.less_batch_normalization 1.87% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.07% : 0.000006s : 59: predicate.load_eliminater 0.98% : 0.000003s : 8: predicate.loop_unroll_after_grad 2.36% : 0.000007s : 55: predicate.loop_unroll_before_grad 1.59% : 0.000005s : 36: predicate.make_slice_get_slice_eliminator 0.64% : 0.000002s : 16: predicate.merge_addn 0.60% : 0.000002s : 16: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 16: predicate.mini_step_allgather_replace 0.69% : 0.000002s : 20: predicate.minmaximum_grad 1.46% : 0.000004s : 8: predicate.mutable_eliminate 0.46% : 0.000001s : 8: predicate.opt_reshape 0.43% : 0.000001s : 8: predicate.parallel_virtual_node 1.78% : 0.000005s : 27: predicate.partial_defer_inline 1.27% : 0.000004s : 31: predicate.partial_eliminate 0.78% : 0.000002s : 20: predicate.print_const_string_wrapper 0.74% : 0.000002s : 16: predicate.reduce_all_const_elim 1.23% : 0.000004s : 20: predicate.reduce_eliminate 2.15% : 0.000006s : 59: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000002s : 16: predicate.remove_not_recompute_node 1.52% : 0.000005s : 39: predicate.replace_applicator 0.60% : 0.000002s : 16: predicate.replace_old_param 0.38% : 0.000001s : 8: predicate.reset_defer_inline 0.91% : 0.000003s : 20: predicate.reshape_eliminate 0.70% : 0.000002s : 16: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 8: predicate.row_tensor_eliminate 0.95% : 0.000003s : 16: predicate.same_eliminate 0.51% : 0.000002s : 16: predicate.set_cell_output_no_recompute 0.94% : 0.000003s : 16: predicate.shard_identity_eliminate 0.82% : 0.000002s : 16: predicate.special_op_eliminate 0.64% : 0.000002s : 16: predicate.specialize_transform 1.01% : 0.000003s : 16: predicate.split_environ_get_set_with_tuple_value 0.97% : 0.000003s : 16: predicate.stack_unstack_eliminate 0.43% : 0.000001s : 8: predicate.switch_call_monad_eliminater 1.26% : 0.000004s : 27: predicate.switch_defer_inline 1.75% : 0.000005s : 43: predicate.switch_layer_defer_inline 5.10% : 0.000015s : 106: predicate.switch_simplify 1.18% : 0.000003s : 20: predicate.tile_eliminate 0.73% : 0.000002s : 20: predicate.transpose_eliminate 1.50% : 0.000004s : 36: predicate.tuple_list_convert_item_index_to_positive 1.83% : 0.000005s : 36: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000004s : 36: predicate.tuple_list_get_item_depend_reorder 3.93% : 0.000012s : 55: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 36: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000007s : 52: predicate.tuple_list_set_item_eliminator 1.70% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.15% : 0.000006s : 59: predicate.updatestate_pure_node_eliminater 2.65% : 0.000008s : 75: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 8: predicate.value_based_eliminate 0.97% : 0.000003s : 16: predicate.virtual_dataset_eliminate 0.69% : 0.000002s : 16: predicate.virtual_output_eliminate 0.30% : 0.000001s : 8: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 8: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.179433 41 94.75% : 0.170004s : 35: func_graph_cloner_run.FuncGraphClonerGraph 5.25% : 0.009429s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.643215 192 0.00% : 0.000005s : 1: ForceFp32Comm 0.16% : 0.004253s : 1: add_attr 0.16% : 0.004238s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000075s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000094s : 1: auto_monad 0.00% : 0.000036s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.03% : 0.000696s : 1: bootstrap 0.00% : 0.000040s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000056s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000038s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000018s : 1: environ_conv 0.02% : 0.000516s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.02% : 0.000538s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.03% : 0.000858s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.00% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000030s : 1: opt.transform.mutable_eliminate 0.07% : 0.001836s : 78: opt.transform.opt_a 0.00% : 0.000055s : 1: opt.transform.opt_after_cconv 0.00% : 0.000069s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000195s : 28: opt.transform.opt_b 0.00% : 0.000092s : 2: opt.transform.opt_trans_graph 0.00% : 0.000090s : 4: opt.transform.symbol_engine_opt 8.34% : 0.220559s : 1: opt_a 0.01% : 0.000160s : 1: opt_after_cconv 0.25% : 0.006597s : 1: opt_after_jit_grad 0.01% : 0.000351s : 1: opt_b 11.52% : 0.304416s : 1: optimize 0.00% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000005s : 1: order_py_execute_after_rewriter 0.00% : 0.000042s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000054s : 1: overlap_opt_shard_grad_in_pipeline 3.05% : 0.080577s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000052s : 1: pre_auto_parallel 0.00% : 0.000012s : 1: py_interpret_to_execute 0.00% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.00% : 0.000028s : 1: remove_dup_value 7.86% : 0.207851s : 1: renormalize.infer 0.37% : 0.009725s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000032s : 1: rewriter_after_opt_a 0.01% : 0.000353s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000176s : 1: symbol_engine_optimizer 0.00% : 0.000127s : 1: tuple_transform 68.02% : 1.797951s : 1: type_inference TotalTime = 2.08208, [21] [bootstrap]: 0.0006077 [type_inference]: 1.30499 [event_method]: 0.00097933 [auto_monad]: 0.00023227 [graph_reusing]: 1.353e-05 [inline]: 3.26999e-06 [add_attr]: 0.148892, [1] [add_attr_with_inline]: 0.148874, [1] [Cycle 1]: 0.00018186, [2] [tag_attr]: 0.00010189 [meta_addattr_fg_expand]: 1.959e-05 [parallel-infer-symbol]: 3.71001e-06 [pre_auto_parallel]: 0.00010609 [insert-virtual-dataset]: 2.97002e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.39001e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.625272, [53] [py_interpret_to_execute]: 1.298e-05 [rewriter_before_opt_a]: 0.0214168 [opt_a]: 0.600376, [3] [Cycle 1]: 0.508047, [45] [expand_dump_flag]: 1.557e-05 [switch_simplify]: 0.00029736 [loop_unroll]: 0.00012712 [a_1]: 0.00322882 [with_stream_mark]: 5.676e-05 [recompute_prepare]: 5.37e-05 [updatestate_depend_eliminate]: 1.516e-05 [updatestate_assign_eliminate]: 1.075e-05 [updatestate_loads_eliminate]: 1.045e-05 [parameter_eliminate]: 5.88998e-06 [a_2]: 0.00038511 [accelerated_algorithm]: 6.23e-05 [shard]: 2.94999e-06 [meta_shard_fg_expand]: 1.218e-05 [shard_inline]: 2.772e-05 [merge_send_recv]: 2.545e-05 [auto_parallel]: 2.113e-05 [parallel]: 2.647e-05 [flash_sp]: 1.565e-05 [merge_comm]: 1.311e-05 [allreduce_fusion]: 1.162e-05 [matmul_add_comm_reduction]: 4.166e-05 [allreduce_slice_to_reducescatter]: 9.70002e-07 [virtual_shard_identity]: 3.423e-05 [virtual_dataset]: 2.535e-05 [get_grad_eliminate_]: 2.473e-05 [virtual_output]: 2.416e-05 [merge_forward]: 1.542e-05 [cell_reuse_recompute_pass]: 2.16e-06 [offload_activation]: 2.392e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.588e-05 [merge_recompute_call_nodes]: 1.99999e-06 [before_grad]: 5.177e-05 [set_forward_comm_id_for_comm_node_pass]: 1.438e-05 [meta_fg_expand]: 0.0039994 [flash_sp_send_recv_attached]: 1.309e-05 [receive_attached]: 2.86e-06 [after_resolve]: 0.00013468 [a_after_grad]: 0.00017197 [renormalize]: 0.40764 [add_forward_monad_depend]: 1.838e-05 [auto_monad_grad]: 1.259e-05 [auto_monad_eliminator]: 0.00010651 [cse]: 0.0899859 [a_3]: 0.00070564 [Cycle 2]: 0.0905803, [45] [expand_dump_flag]: 4.47e-06 [switch_simplify]: 0.0117938 [loop_unroll]: 9.092e-05 [a_1]: 0.00278852 [with_stream_mark]: 3.765e-05 [recompute_prepare]: 2.228e-05 [updatestate_depend_eliminate]: 1.079e-05 [updatestate_assign_eliminate]: 7.50998e-06 [updatestate_loads_eliminate]: 6.39999e-06 [parameter_eliminate]: 2.43e-06 [a_2]: 0.0002485 [accelerated_algorithm]: 2.695e-05 [shard]: 2.22001e-06 [meta_shard_fg_expand]: 7.37002e-06 [shard_inline]: 1.641e-05 [merge_send_recv]: 1.545e-05 [auto_parallel]: 1.726e-05 [parallel]: 1.181e-05 [flash_sp]: 4.38001e-06 [merge_comm]: 8.08999e-06 [allreduce_fusion]: 6.81001e-06 [matmul_add_comm_reduction]: 1.547e-05 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 1.871e-05 [virtual_dataset]: 1.647e-05 [get_grad_eliminate_]: 1.74e-05 [virtual_output]: 1.622e-05 [merge_forward]: 7.75e-06 [cell_reuse_recompute_pass]: 1.97001e-06 [offload_activation]: 2.025e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.809e-05 [merge_recompute_call_nodes]: 1.81003e-06 [before_grad]: 2.685e-05 [set_forward_comm_id_for_comm_node_pass]: 9.07001e-06 [meta_fg_expand]: 0.00037175 [flash_sp_send_recv_attached]: 3.28e-06 [receive_attached]: 3.20998e-06 [after_resolve]: 4.282e-05 [a_after_grad]: 2.794e-05 [renormalize]: 0.0739512 [add_forward_monad_depend]: 1.499e-05 [auto_monad_grad]: 2.93003e-06 [auto_monad_eliminator]: 3.844e-05 [cse]: 0.00025917 [a_3]: 0.00014383 [Cycle 3]: 0.00171683, [45] [expand_dump_flag]: 3.76999e-06 [switch_simplify]: 3.675e-05 [loop_unroll]: 1.737e-05 [a_1]: 0.00057 [with_stream_mark]: 2.891e-05 [recompute_prepare]: 1.849e-05 [updatestate_depend_eliminate]: 8.99998e-06 [updatestate_assign_eliminate]: 7.63999e-06 [updatestate_loads_eliminate]: 7.26001e-06 [parameter_eliminate]: 2.68998e-06 [a_2]: 0.00023378 [accelerated_algorithm]: 2.327e-05 [shard]: 2.66999e-06 [meta_shard_fg_expand]: 5.35999e-06 [shard_inline]: 1.593e-05 [merge_send_recv]: 1.412e-05 [auto_parallel]: 1.532e-05 [parallel]: 1.191e-05 [flash_sp]: 3.69002e-06 [merge_comm]: 7.01001e-06 [allreduce_fusion]: 7.43999e-06 [matmul_add_comm_reduction]: 1.667e-05 [allreduce_slice_to_reducescatter]: 1.59e-06 [virtual_shard_identity]: 1.84e-05 [virtual_dataset]: 1.598e-05 [get_grad_eliminate_]: 1.553e-05 [virtual_output]: 1.485e-05 [merge_forward]: 8.12e-06 [cell_reuse_recompute_pass]: 2.89001e-06 [offload_activation]: 1.67e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.93e-05 [merge_recompute_call_nodes]: 1.62999e-06 [before_grad]: 2.562e-05 [set_forward_comm_id_for_comm_node_pass]: 7.71001e-06 [meta_fg_expand]: 6.84001e-06 [flash_sp_send_recv_attached]: 1.83002e-06 [receive_attached]: 2.90998e-06 [after_resolve]: 2.695e-05 [a_after_grad]: 2.62e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 2.64999e-06 [auto_monad_grad]: 2.79001e-06 [auto_monad_eliminator]: 1.994e-05 [cse]: 6.674e-05 [a_3]: 0.00010779 [py_interpret_to_execute_after_opt_a]: 1.367e-05 [slice_cell_reuse_recomputed_activation]: 2.14e-06 [rewriter_after_opt_a]: 5.316e-05 [convert_after_rewriter]: 1.39e-06 [order_py_execute_after_rewriter]: 1.28002e-06 [mutable_eliminate]: 0.00091372 [opt_b]: 0.0005966, [1] [Cycle 1]: 0.00058733, [7] [b_1]: 0.00043066 [b_2]: 1.837e-05 [updatestate_depend_eliminate]: 1.384e-05 [updatestate_assign_eliminate]: 6.31e-06 [updatestate_loads_eliminate]: 6.12999e-06 [renormalize]: 6.19999e-07 [cse]: 7.071e-05 [optimize_parallel_all_gather_comm]: 3.13e-05 [overlap_param_gather]: 2.08002e-06 [cconv]: 3.947e-05 [loop_unroll]: 0.00053355 [opt_after_cconv]: 0.00023613, [1] [Cycle 1]: 0.00022907, [7] [c_1]: 0.00010285 [parameter_eliminate]: 5.89e-06 [updatestate_depend_eliminate]: 1.153e-05 [updatestate_assign_eliminate]: 5.97999e-06 [updatestate_loads_eliminate]: 5.87999e-06 [cse]: 5.859e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 9.937e-05 [tuple_transform]: 0.00020551, [1] [Cycle 1]: 0.00020014, [4] [d_1]: 0.00015805 [none_parameter_eliminate]: 2.26e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 1.888e-05 [partial_unused_args_eliminate]: 1.90001e-06 [add_recomputation]: 8.67e-05 [cse_after_recomputation]: 4.594e-05, [1] [Cycle 1]: 4.033e-05, [1] [cse]: 3.471e-05 [environ_conv]: 1.666e-05 [swap_dp_allreduce_reducescatter]: 1.208e-05 [bias_add_comm_swap]: 3.55e-06 [label_micro_interleaved_index]: 5.44998e-06 [label_fine_grained_interleaved_index]: 3.69002e-06 [merge_cast_opt]: 1.30999e-06 [slice_recompute_activation]: 2.19001e-06 [micro_interleaved_order_control]: 2.95998e-06 [assign_add_opt]: 1.46002e-06 [ForceFp32Comm]: 1.20001e-06 [remove_cast_before_assign_add]: 1.25001e-06 [full_micro_interleaved_order_control]: 2.56e-06 [reorder_send_recv_between_fp_bp]: 2.98e-06 [comm_op_add_attrs]: 1.41002e-06 [add_comm_op_reuse_tag]: 1.58002e-06 [interleave_split_concat_branches]: 1.37e-06 [interleave_parallel_branches]: 1.15001e-06 [overlap_opt_shard_in_pipeline]: 1.25999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.19999e-06 [control_data_broadcast_order]: 2.47e-05 [grouped_pairwise_exchange_alltoall]: 1.96e-06 [offloading_packed_experts]: 8.22e-06 [overlap_recompute_and_grad_model_parallel]: 7.38e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.52001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.59e-06 [overlap_recompute_comm]: 2.32999e-06 [overlap_grad_ring_attention]: 6.52001e-06 [overlap_grad_flash_sp]: 3.432e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.64001e-06 [split_layernorm_comm]: 1.84998e-06 [handle_group_info]: 1.12e-06 [symbol_engine_optimizer]: 0.00014411, [1] [Cycle 1]: 0.00013932, [6] [build]: 1.647e-05 [elim_shapecalc]: 2.154e-05 [elim_not_effective]: 2.976e-05 [opt_reshape]: 1.677e-05 [fold_const_symbol]: 2.347e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.31e-06 [pipeline_parallel_scheduler]: 2.19001e-06 [auto_monad_reorder]: 3.328e-05 [get_jit_bprop_graph]: 2.27001e-06 [rewriter_after_jit_bprop_graph]: 7.76001e-06 [opt_after_jit_grad]: 0.00059224 [validate]: 9.83e-05 Sums bootstrap : 0.000608s : 0.03% type_inference : 1.304992s : 67.57% event_method : 0.000979s : 0.05% auto_monad : 0.000232s : 0.01% graph_reusing : 0.000014s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000102s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000020s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000106s : 0.01% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000013s : 0.00% optimize.rewriter_before_opt_a : 0.021417s : 1.11% optimize.opt_a.expand_dump_flag : 0.000024s : 0.00% optimize.opt_a.switch_simplify : 0.012128s : 0.63% optimize.opt_a.loop_unroll : 0.000235s : 0.01% optimize.opt_a.a_1 : 0.006587s : 0.34% optimize.opt_a.with_stream_mark : 0.000123s : 0.01% optimize.opt_a.recompute_prepare : 0.000094s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000035s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000026s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000024s : 0.00% optimize.opt_a.parameter_eliminate : 0.000011s : 0.00% optimize.opt_a.a_2 : 0.000867s : 0.04% optimize.opt_a.accelerated_algorithm : 0.000113s : 0.01% optimize.opt_a.shard : 0.000008s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000025s : 0.00% optimize.opt_a.shard_inline : 0.000060s : 0.00% optimize.opt_a.merge_send_recv : 0.000055s : 0.00% optimize.opt_a.auto_parallel : 0.000054s : 0.00% optimize.opt_a.parallel : 0.000050s : 0.00% optimize.opt_a.flash_sp : 0.000024s : 0.00% optimize.opt_a.merge_comm : 0.000028s : 0.00% optimize.opt_a.allreduce_fusion : 0.000026s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000074s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000071s : 0.00% optimize.opt_a.virtual_dataset : 0.000058s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000058s : 0.00% optimize.opt_a.virtual_output : 0.000055s : 0.00% optimize.opt_a.merge_forward : 0.000031s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% optimize.opt_a.offload_activation : 0.000061s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000103s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000104s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000031s : 0.00% optimize.opt_a.meta_fg_expand : 0.004378s : 0.23% optimize.opt_a.flash_sp_send_recv_attached : 0.000018s : 0.00% optimize.opt_a.receive_attached : 0.000009s : 0.00% optimize.opt_a.after_resolve : 0.000204s : 0.01% optimize.opt_a.a_after_grad : 0.000226s : 0.01% optimize.opt_a.renormalize : 0.481591s : 24.93% optimize.opt_a.add_forward_monad_depend : 0.000036s : 0.00% optimize.opt_a.auto_monad_grad : 0.000018s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000165s : 0.01% optimize.opt_a.cse : 0.090312s : 4.68% optimize.opt_a.a_3 : 0.000957s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000053s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000914s : 0.05% optimize.opt_b.b_1 : 0.000431s : 0.02% optimize.opt_b.b_2 : 0.000018s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000071s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000039s : 0.00% optimize.loop_unroll : 0.000534s : 0.03% optimize.opt_after_cconv.c_1 : 0.000103s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.cse : 0.000059s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000099s : 0.01% optimize.tuple_transform.d_1 : 0.000158s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000019s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000087s : 0.00% optimize.cse_after_recomputation.cse : 0.000035s : 0.00% optimize.environ_conv : 0.000017s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000012s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000004s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000002s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000025s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000034s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000016s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000022s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000030s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000017s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000023s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000033s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000592s : 0.03% validate : 0.000098s : 0.01% Time group info: ------[substitution.] 0.002072 383 0.18% : 0.000004s : 6: substitution.elim_not_effective 0.60% : 0.000012s : 14: substitution.float_depend_g_call 1.02% : 0.000021s : 13: substitution.float_tuple_getitem_switch 0.20% : 0.000004s : 6: substitution.fold_const_symbol 0.55% : 0.000011s : 13: substitution.graph_param_transform 0.18% : 0.000004s : 2: substitution.incorporate_call 0.12% : 0.000002s : 2: substitution.incorporate_call_switch 63.05% : 0.001306s : 26: substitution.inline 1.45% : 0.000030s : 2: substitution.inline_without_move 0.76% : 0.000016s : 23: substitution.j_node_and_user_rematch 1.67% : 0.000035s : 3: substitution.less_batch_normalization 1.61% : 0.000033s : 22: substitution.minmaximum_grad 0.64% : 0.000013s : 14: substitution.partial_eliminate 0.97% : 0.000020s : 23: substitution.remove_not_recompute_node 2.36% : 0.000049s : 11: substitution.replace_applicator 1.12% : 0.000023s : 25: substitution.replace_old_param 0.33% : 0.000007s : 1: substitution.set_cell_output_no_recompute 2.03% : 0.000042s : 6: substitution.switch_simplify 5.84% : 0.000121s : 28: substitution.tuple_list_convert_item_index_to_positive 1.79% : 0.000037s : 28: substitution.tuple_list_get_item_const_eliminator 2.66% : 0.000055s : 28: substitution.tuple_list_get_item_depend_reorder 8.38% : 0.000174s : 59: substitution.tuple_list_get_item_eliminator 2.49% : 0.000052s : 28: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 1.304754 2 98.47% : 1.284847s : 1: type_inference.infer 1.53% : 0.019907s : 1: type_inference.specialize ------[replace.] 0.000774 53 47.28% : 0.000366s : 26: replace.inline 23.09% : 0.000179s : 6: replace.switch_simplify 29.64% : 0.000229s : 21: replace.tuple_list_get_item_eliminator ------[match.] 0.001381 53 93.07% : 0.001285s : 26: match.inline 2.65% : 0.000037s : 6: match.switch_simplify 4.29% : 0.000059s : 21: match.tuple_list_get_item_eliminator ------[predicate.] 0.001326 9325 1.09% : 0.000014s : 109: predicate.accumulaten_eliminater 0.23% : 0.000003s : 13: predicate.ad_related_special_op_eliminate 0.45% : 0.000006s : 52: predicate.addn_check_dump 0.98% : 0.000013s : 109: predicate.addn_zero_filter 0.91% : 0.000012s : 109: predicate.adjust_all_reduce_mul_add 1.88% : 0.000025s : 161: predicate.arithmetic_simplify 1.00% : 0.000013s : 109: predicate.cast_eliminate 1.06% : 0.000014s : 114: predicate.check_bprop_eliminate 0.46% : 0.000006s : 52: predicate.compare_switch_simplify 0.09% : 0.000001s : 14: predicate.const_output_eliminate 0.46% : 0.000006s : 52: predicate.depend_value_elim 1.04% : 0.000014s : 109: predicate.dict_get_item_const_eliminator 1.21% : 0.000016s : 109: predicate.dict_get_item_eliminator 0.98% : 0.000013s : 109: predicate.dict_set_item_eliminator 0.38% : 0.000005s : 27: predicate.dumpgradient_eliminate 0.08% : 0.000001s : 13: predicate.elim_not_effective 0.17% : 0.000002s : 13: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000015s : 123: predicate.environ_add_const_eliminate 1.15% : 0.000015s : 123: predicate.environ_get_add_eliminate 1.11% : 0.000015s : 123: predicate.environ_get_depend_swap 1.63% : 0.000022s : 175: predicate.environ_get_eliminate 1.12% : 0.000015s : 123: predicate.environ_get_set_eliminate 1.49% : 0.000020s : 156: predicate.exchange_switch_depend_value 2.27% : 0.000030s : 156: predicate.float_depend_g_call 0.54% : 0.000007s : 52: predicate.float_environ_get_switch 0.68% : 0.000009s : 66: predicate.float_tuple_getitem_switch 0.08% : 0.000001s : 13: predicate.fold_const_symbol 0.52% : 0.000007s : 52: predicate.get_grad_eliminate 0.10% : 0.000001s : 13: predicate.graph_param_transform 0.47% : 0.000006s : 52: predicate.incorporate_call 0.42% : 0.000006s : 52: predicate.incorporate_call_switch 5.10% : 0.000068s : 397: predicate.inline 1.21% : 0.000016s : 89: predicate.inline_without_move 0.29% : 0.000004s : 52: predicate.j_node_and_user_rematch 0.77% : 0.000010s : 52: predicate.less_batch_normalization 1.56% : 0.000021s : 157: predicate.list_to_tuple_eliminator_ 2.36% : 0.000031s : 267: predicate.load_eliminater 0.29% : 0.000004s : 14: predicate.loop_unroll_after_grad 2.69% : 0.000036s : 242: predicate.loop_unroll_before_grad 1.28% : 0.000017s : 137: predicate.make_slice_get_slice_eliminator 0.50% : 0.000007s : 52: predicate.merge_addn 1.05% : 0.000014s : 114: predicate.micro_step_allgather_replace 1.11% : 0.000015s : 114: predicate.mini_step_allgather_replace 0.97% : 0.000013s : 109: predicate.minmaximum_grad 0.45% : 0.000006s : 14: predicate.mutable_eliminate 0.14% : 0.000002s : 13: predicate.opt_reshape 0.14% : 0.000002s : 14: predicate.parallel_virtual_node 2.21% : 0.000029s : 156: predicate.partial_defer_inline 1.49% : 0.000020s : 144: predicate.partial_eliminate 0.99% : 0.000013s : 109: predicate.print_const_string_wrapper 0.54% : 0.000007s : 52: predicate.reduce_all_const_elim 1.62% : 0.000021s : 109: predicate.reduce_eliminate 2.51% : 0.000033s : 267: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000004s : 52: predicate.remove_not_recompute_node 1.92% : 0.000026s : 244: predicate.replace_applicator 0.60% : 0.000008s : 89: predicate.replace_old_param 0.12% : 0.000002s : 14: predicate.reset_defer_inline 1.06% : 0.000014s : 109: predicate.reshape_eliminate 1.11% : 0.000015s : 114: predicate.row_tensor_add_zeros_like 0.15% : 0.000002s : 14: predicate.row_tensor_eliminate 1.78% : 0.000024s : 114: predicate.same_eliminate 0.39% : 0.000005s : 52: predicate.set_cell_output_no_recompute 0.63% : 0.000008s : 52: predicate.shard_identity_eliminate 0.26% : 0.000004s : 27: predicate.special_op_eliminate 0.51% : 0.000007s : 52: predicate.specialize_transform 1.30% : 0.000017s : 114: predicate.split_environ_get_set_with_tuple_value 1.23% : 0.000016s : 89: predicate.stack_unstack_eliminate 0.15% : 0.000002s : 14: predicate.switch_call_monad_eliminater 1.64% : 0.000022s : 156: predicate.switch_defer_inline 2.67% : 0.000035s : 270: predicate.switch_layer_defer_inline 7.57% : 0.000100s : 475: predicate.switch_simplify 0.97% : 0.000013s : 109: predicate.tile_eliminate 0.98% : 0.000013s : 109: predicate.transpose_eliminate 1.42% : 0.000019s : 136: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000020s : 136: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000019s : 136: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000040s : 209: predicate.tuple_list_get_item_eliminator 1.41% : 0.000019s : 136: predicate.tuple_list_get_set_item_eliminator 2.03% : 0.000027s : 188: predicate.tuple_list_set_item_eliminator 1.65% : 0.000022s : 157: predicate.tuple_to_list_eliminator_ 2.28% : 0.000030s : 267: predicate.updatestate_pure_node_eliminater 3.81% : 0.000050s : 319: predicate.updatestate_useless_node_eliminater 0.14% : 0.000002s : 14: predicate.value_based_eliminate 0.59% : 0.000008s : 52: predicate.virtual_dataset_eliminate 0.51% : 0.000007s : 52: predicate.virtual_output_eliminate 0.13% : 0.000002s : 13: predicate.virtual_view_grad_eliminate 0.18% : 0.000002s : 14: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.032946 88 90.09% : 0.029680s : 58: func_graph_cloner_run.FuncGraphClonerGraph 9.91% : 0.003265s : 30: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.348751 233 0.00% : 0.000004s : 1: ForceFp32Comm 4.45% : 0.148900s : 1: add_attr 4.45% : 0.148881s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.00% : 0.000091s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.01% : 0.000246s : 1: auto_monad 0.00% : 0.000038s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.02% : 0.000643s : 1: bootstrap 0.00% : 0.000043s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000028s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000049s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000020s : 1: environ_conv 0.03% : 0.001002s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000018s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.02% : 0.000544s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.03% : 0.000927s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.00% : 0.000031s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000043s : 1: opt.transform.mutable_eliminate 0.31% : 0.010280s : 117: opt.transform.opt_a 0.00% : 0.000101s : 1: opt.transform.opt_after_cconv 0.00% : 0.000058s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000411s : 28: opt.transform.opt_b 0.01% : 0.000174s : 2: opt.transform.opt_trans_graph 0.00% : 0.000088s : 4: opt.transform.symbol_engine_opt 17.93% : 0.600382s : 1: opt_a 0.01% : 0.000240s : 1: opt_after_cconv 0.02% : 0.000602s : 1: opt_after_jit_grad 0.02% : 0.000601s : 1: opt_b 18.67% : 0.625280s : 1: optimize 0.00% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000038s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000112s : 1: pre_auto_parallel 0.00% : 0.000019s : 1: py_interpret_to_execute 0.00% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000105s : 1: remove_dup_value 14.16% : 0.474218s : 2: renormalize.infer 0.22% : 0.007325s : 2: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000058s : 1: rewriter_after_opt_a 0.64% : 0.021457s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000015s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000147s : 1: symbol_engine_optimizer 0.01% : 0.000209s : 1: tuple_transform 38.97% : 1.305027s : 1: type_inference TotalTime = 1.81042, [21] [bootstrap]: 0.106587 [type_inference]: 1.43212 [event_method]: 3.136e-05 [auto_monad]: 0.00095235 [graph_reusing]: 6.138e-05 [inline]: 4.03001e-06 [add_attr]: 0.154237, [1] [add_attr_with_inline]: 0.154164, [1] [Cycle 1]: 0.00015462, [2] [tag_attr]: 3.636e-05 [meta_addattr_fg_expand]: 8.64e-06 [parallel-infer-symbol]: 3.81001e-06 [pre_auto_parallel]: 5.584e-05 [insert-virtual-dataset]: 2.71999e-06 [parallel-infer-symbol-second]: 1.32e-06 [dataset_repeat_opt]: 2.46e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.112569, [53] [py_interpret_to_execute]: 1.579e-05 [rewriter_before_opt_a]: 0.00046371 [opt_a]: 0.0998502, [2] [Cycle 1]: 0.0990762, [45] [expand_dump_flag]: 0.00011461 [switch_simplify]: 0.00010048 [loop_unroll]: 8.706e-05 [a_1]: 0.0937027 [with_stream_mark]: 0.00011488 [recompute_prepare]: 2.127e-05 [updatestate_depend_eliminate]: 5.90002e-06 [updatestate_assign_eliminate]: 4.92e-06 [updatestate_loads_eliminate]: 3.50998e-06 [parameter_eliminate]: 2.67001e-06 [a_2]: 0.00015585 [accelerated_algorithm]: 8.77999e-06 [shard]: 4.31002e-06 [meta_shard_fg_expand]: 6.02001e-06 [shard_inline]: 7.08e-06 [merge_send_recv]: 1.24e-05 [auto_parallel]: 1.301e-05 [parallel]: 0.00020262 [flash_sp]: 1.676e-05 [merge_comm]: 4.52e-06 [allreduce_fusion]: 3.75e-06 [matmul_add_comm_reduction]: 1.266e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 1.162e-05 [virtual_dataset]: 9.17001e-06 [get_grad_eliminate_]: 6.606e-05 [virtual_output]: 5.5e-05 [merge_forward]: 6.06e-06 [cell_reuse_recompute_pass]: 3.73999e-06 [offload_activation]: 1.31e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.684e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.239e-05 [set_forward_comm_id_for_comm_node_pass]: 4.21001e-06 [meta_fg_expand]: 4.56002e-06 [flash_sp_send_recv_attached]: 3.74002e-06 [receive_attached]: 3.68e-06 [after_resolve]: 6.655e-05 [a_after_grad]: 0.00010784 [renormalize]: 0.00316336 [add_forward_monad_depend]: 1.369e-05 [auto_monad_grad]: 2.68998e-06 [auto_monad_eliminator]: 2.909e-05 [cse]: 3.71e-05 [a_3]: 7.095e-05 [Cycle 2]: 0.00075399, [45] [expand_dump_flag]: 3.21999e-06 [switch_simplify]: 9.12999e-06 [loop_unroll]: 7.03998e-06 [a_1]: 0.00015025 [with_stream_mark]: 2.385e-05 [recompute_prepare]: 7.2e-06 [updatestate_depend_eliminate]: 3.95998e-06 [updatestate_assign_eliminate]: 3.96001e-06 [updatestate_loads_eliminate]: 3.65e-06 [parameter_eliminate]: 2.55002e-06 [a_2]: 7.34e-05 [accelerated_algorithm]: 6.90002e-06 [shard]: 3.09999e-06 [meta_shard_fg_expand]: 3.34001e-06 [shard_inline]: 6.09001e-06 [merge_send_recv]: 8.67e-06 [auto_parallel]: 1.017e-05 [parallel]: 9.17001e-06 [flash_sp]: 4.45999e-06 [merge_comm]: 3.4e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 1.106e-05 [allreduce_slice_to_reducescatter]: 1.14e-06 [virtual_shard_identity]: 8.03001e-06 [virtual_dataset]: 6.54999e-06 [get_grad_eliminate_]: 6.41e-06 [virtual_output]: 6.61e-06 [merge_forward]: 4.42e-06 [cell_reuse_recompute_pass]: 3.60998e-06 [offload_activation]: 1.25e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.893e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.093e-05 [set_forward_comm_id_for_comm_node_pass]: 4.05998e-06 [meta_fg_expand]: 3.86999e-06 [flash_sp_send_recv_attached]: 2.16998e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.363e-05 [a_after_grad]: 1.063e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.87999e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 9.24e-06 [cse]: 1.521e-05 [a_3]: 3.792e-05 [py_interpret_to_execute_after_opt_a]: 1.344e-05 [slice_cell_reuse_recomputed_activation]: 2.76e-06 [rewriter_after_opt_a]: 2.21e-05 [convert_after_rewriter]: 1.77999e-06 [order_py_execute_after_rewriter]: 1.17e-06 [mutable_eliminate]: 0.00205124 [opt_b]: 0.00188168, [1] [Cycle 1]: 0.00148262, [7] [b_1]: 0.00039518 [b_2]: 1.289e-05 [updatestate_depend_eliminate]: 1.434e-05 [updatestate_assign_eliminate]: 5.04e-06 [updatestate_loads_eliminate]: 0.00011762 [renormalize]: 9.79984e-07 [cse]: 0.00010802 [optimize_parallel_all_gather_comm]: 0.00025651 [overlap_param_gather]: 3.07997e-06 [cconv]: 0.00010354 [loop_unroll]: 0.00220435 [opt_after_cconv]: 0.00094615, [1] [Cycle 1]: 0.00087951, [7] [c_1]: 4.603e-05 [parameter_eliminate]: 8.64e-06 [updatestate_depend_eliminate]: 1.479e-05 [updatestate_assign_eliminate]: 5.27001e-06 [updatestate_loads_eliminate]: 4.1e-06 [cse]: 0.00010114 [renormalize]: 1.17999e-06 [remove_dup_value]: 1.954e-05 [tuple_transform]: 0.00060364, [1] [Cycle 1]: 0.00042526, [4] [d_1]: 0.0001236 [none_parameter_eliminate]: 3.17002e-06 [renormalize]: 3.10014e-07 [switch_simplify]: 6.974e-05 [partial_unused_args_eliminate]: 3.46999e-06 [add_recomputation]: 0.0005635 [cse_after_recomputation]: 0.00026852, [1] [Cycle 1]: 0.00015201, [1] [cse]: 2.999e-05 [environ_conv]: 1.148e-05 [swap_dp_allreduce_reducescatter]: 6.433e-05 [bias_add_comm_swap]: 5.42001e-06 [label_micro_interleaved_index]: 1.24e-05 [label_fine_grained_interleaved_index]: 2.81999e-06 [merge_cast_opt]: 1.87999e-06 [slice_recompute_activation]: 3.18e-06 [micro_interleaved_order_control]: 5.25001e-06 [assign_add_opt]: 1.86e-06 [ForceFp32Comm]: 1.67999e-06 [remove_cast_before_assign_add]: 1.27999e-06 [full_micro_interleaved_order_control]: 6.126e-05 [reorder_send_recv_between_fp_bp]: 3.32002e-06 [comm_op_add_attrs]: 1.37999e-06 [add_comm_op_reuse_tag]: 1.34e-06 [interleave_split_concat_branches]: 1.76998e-06 [interleave_parallel_branches]: 2.46e-06 [overlap_opt_shard_in_pipeline]: 6.042e-05 [overlap_opt_shard_grad_in_pipeline]: 2.47001e-06 [control_data_broadcast_order]: 7.902e-05 [grouped_pairwise_exchange_alltoall]: 2.46e-06 [offloading_packed_experts]: 5.49e-06 [overlap_recompute_and_grad_model_parallel]: 6.521e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.96e-06 [overlap_recompute_allgather_and_fa_grad]: 1.52001e-06 [overlap_recompute_comm]: 2.89999e-06 [overlap_grad_ring_attention]: 7.012e-05 [overlap_grad_flash_sp]: 3.534e-05 [begin_end_overlap_inline]: 1.74998e-06 [split_matmul_comm_elemetwise]: 2.51e-06 [split_layernorm_comm]: 2.74999e-06 [handle_group_info]: 1.42e-06 [symbol_engine_optimizer]: 0.00087893, [1] [Cycle 1]: 0.00075534, [6] [build]: 9.52999e-06 [elim_shapecalc]: 3.037e-05 [elim_not_effective]: 7.02e-05 [opt_reshape]: 1.163e-05 [fold_const_symbol]: 1.635e-05 [renormalize]: 3.9002e-07 [detach_backward]: 3.29001e-06 [pipeline_parallel_scheduler]: 1.72999e-06 [auto_monad_reorder]: 3.014e-05 [get_jit_bprop_graph]: 2.50002e-06 [rewriter_after_jit_bprop_graph]: 1.306e-05 [opt_after_jit_grad]: 0.00268127 [validate]: 0.00021586 Sums bootstrap : 0.106587s : 6.46% type_inference : 1.432121s : 86.84% event_method : 0.000031s : 0.00% auto_monad : 0.000952s : 0.06% graph_reusing : 0.000061s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000036s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000056s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000016s : 0.00% optimize.rewriter_before_opt_a : 0.000464s : 0.03% optimize.opt_a.expand_dump_flag : 0.000118s : 0.01% optimize.opt_a.switch_simplify : 0.000110s : 0.01% optimize.opt_a.loop_unroll : 0.000094s : 0.01% optimize.opt_a.a_1 : 0.093853s : 5.69% optimize.opt_a.with_stream_mark : 0.000139s : 0.01% optimize.opt_a.recompute_prepare : 0.000028s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000229s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.00% optimize.opt_a.shard : 0.000007s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000009s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000021s : 0.00% optimize.opt_a.auto_parallel : 0.000023s : 0.00% optimize.opt_a.parallel : 0.000212s : 0.01% optimize.opt_a.flash_sp : 0.000021s : 0.00% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.00% optimize.opt_a.virtual_dataset : 0.000016s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000072s : 0.00% optimize.opt_a.virtual_output : 0.000062s : 0.00% optimize.opt_a.merge_forward : 0.000010s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000046s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000080s : 0.00% optimize.opt_a.a_after_grad : 0.000118s : 0.01% optimize.opt_a.renormalize : 0.003163s : 0.19% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.00% optimize.opt_a.cse : 0.000052s : 0.00% optimize.opt_a.a_3 : 0.000109s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000022s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.002051s : 0.12% optimize.opt_b.b_1 : 0.000395s : 0.02% optimize.opt_b.b_2 : 0.000013s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000118s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000108s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000257s : 0.02% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000104s : 0.01% optimize.loop_unroll : 0.002204s : 0.13% optimize.opt_after_cconv.c_1 : 0.000046s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000101s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.00% optimize.tuple_transform.d_1 : 0.000124s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000070s : 0.00% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000563s : 0.03% optimize.cse_after_recomputation.cse : 0.000030s : 0.00% optimize.environ_conv : 0.000011s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000064s : 0.00% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000012s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000005s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000002s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000061s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000002s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000060s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000079s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000065s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000070s : 0.00% optimize.overlap_grad_flash_sp : 0.000035s : 0.00% optimize.begin_end_overlap_inline : 0.000002s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000003s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000030s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000070s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000030s : 0.00% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000013s : 0.00% opt_after_jit_grad : 0.002681s : 0.16% validate : 0.000216s : 0.01% Time group info: ------[substitution.] 0.000682 29 0.31% : 0.000002s : 2: substitution.elim_not_effective 0.26% : 0.000002s : 2: substitution.fold_const_symbol 1.19% : 0.000008s : 4: substitution.graph_param_transform 91.58% : 0.000625s : 5: substitution.inline 0.75% : 0.000005s : 4: substitution.j_node_and_user_rematch 1.01% : 0.000007s : 4: substitution.remove_not_recompute_node 1.24% : 0.000008s : 4: substitution.replace_old_param 3.67% : 0.000025s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 1.431481 2 90.09% : 1.289676s : 1: type_inference.infer 9.91% : 0.141805s : 1: type_inference.specialize ------[replace.] 0.000549 9 91.72% : 0.000504s : 5: replace.inline 8.28% : 0.000045s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000586 9 96.37% : 0.000564s : 5: match.inline 3.63% : 0.000021s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000292 1345 1.01% : 0.000003s : 14: predicate.accumulaten_eliminater 1.44% : 0.000004s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 1.14% : 0.000003s : 14: predicate.addn_zero_filter 0.66% : 0.000002s : 14: predicate.adjust_all_reduce_mul_add 3.10% : 0.000009s : 22: predicate.arithmetic_simplify 0.92% : 0.000003s : 14: predicate.cast_eliminate 0.42% : 0.000001s : 8: predicate.check_bprop_eliminate 0.45% : 0.000001s : 8: predicate.compare_switch_simplify 0.14% : 0.000000s : 4: predicate.const_output_eliminate 0.78% : 0.000002s : 8: predicate.depend_value_elim 0.76% : 0.000002s : 14: predicate.dict_get_item_const_eliminator 1.16% : 0.000003s : 14: predicate.dict_get_item_eliminator 0.72% : 0.000002s : 14: predicate.dict_set_item_eliminator 1.21% : 0.000004s : 8: predicate.dumpgradient_eliminate 0.48% : 0.000001s : 4: predicate.elim_not_effective 0.73% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 2.40% : 0.000007s : 18: predicate.environ_add_const_eliminate 0.93% : 0.000003s : 18: predicate.environ_get_add_eliminate 0.82% : 0.000002s : 18: predicate.environ_get_depend_swap 1.34% : 0.000004s : 26: predicate.environ_get_eliminate 0.83% : 0.000002s : 18: predicate.environ_get_set_eliminate 1.46% : 0.000004s : 23: predicate.exchange_switch_depend_value 2.85% : 0.000008s : 23: predicate.float_depend_g_call 0.58% : 0.000002s : 8: predicate.float_environ_get_switch 0.68% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000002s : 8: predicate.get_grad_eliminate 0.34% : 0.000001s : 4: predicate.graph_param_transform 0.38% : 0.000001s : 8: predicate.incorporate_call 0.31% : 0.000001s : 8: predicate.incorporate_call_switch 6.78% : 0.000020s : 61: predicate.inline 0.56% : 0.000002s : 8: predicate.inline_without_move 0.21% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 8: predicate.less_batch_normalization 1.76% : 0.000005s : 26: predicate.list_to_tuple_eliminator_ 1.97% : 0.000006s : 40: predicate.load_eliminater 2.26% : 0.000007s : 4: predicate.loop_unroll_after_grad 2.22% : 0.000006s : 41: predicate.loop_unroll_before_grad 1.36% : 0.000004s : 22: predicate.make_slice_get_slice_eliminator 0.65% : 0.000002s : 8: predicate.merge_addn 0.42% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.70% : 0.000002s : 14: predicate.minmaximum_grad 1.63% : 0.000005s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.28% : 0.000001s : 4: predicate.parallel_virtual_node 2.40% : 0.000007s : 23: predicate.partial_defer_inline 1.07% : 0.000003s : 22: predicate.partial_eliminate 0.97% : 0.000003s : 14: predicate.print_const_string_wrapper 0.43% : 0.000001s : 8: predicate.reduce_all_const_elim 1.35% : 0.000004s : 14: predicate.reduce_eliminate 2.44% : 0.000007s : 40: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 8: predicate.remove_not_recompute_node 1.43% : 0.000004s : 26: predicate.replace_applicator 0.38% : 0.000001s : 8: predicate.replace_old_param 0.40% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000003s : 14: predicate.reshape_eliminate 0.62% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 4: predicate.row_tensor_eliminate 0.61% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.61% : 0.000002s : 8: predicate.shard_identity_eliminate 0.86% : 0.000003s : 8: predicate.special_op_eliminate 0.58% : 0.000002s : 8: predicate.specialize_transform 0.92% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000003s : 8: predicate.stack_unstack_eliminate 0.23% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.15% : 0.000003s : 23: predicate.switch_defer_inline 1.90% : 0.000006s : 31: predicate.switch_layer_defer_inline 4.62% : 0.000014s : 76: predicate.switch_simplify 0.74% : 0.000002s : 14: predicate.tile_eliminate 0.94% : 0.000003s : 14: predicate.transpose_eliminate 1.57% : 0.000005s : 22: predicate.tuple_list_convert_item_index_to_positive 1.17% : 0.000003s : 22: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 22: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000009s : 34: predicate.tuple_list_get_item_eliminator 1.48% : 0.000004s : 22: predicate.tuple_list_get_set_item_eliminator 2.48% : 0.000007s : 30: predicate.tuple_list_set_item_eliminator 2.14% : 0.000006s : 26: predicate.tuple_to_list_eliminator_ 1.75% : 0.000005s : 40: predicate.updatestate_pure_node_eliminater 2.59% : 0.000008s : 48: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000002s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.94% : 0.000003s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.128135 12 90.00% : 0.115316s : 5: func_graph_cloner_run.FuncGraphClonerGraph 10.00% : 0.012819s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.175451 192 0.00% : 0.000006s : 1: ForceFp32Comm 7.09% : 0.154244s : 1: add_attr 7.09% : 0.154169s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.03% : 0.000634s : 1: add_recomputation 0.00% : 0.000059s : 1: assign_add_opt 0.04% : 0.000968s : 1: auto_monad 0.00% : 0.000036s : 1: auto_monad_reorder 0.00% : 0.000063s : 1: begin_end_overlap_inline 0.00% : 0.000080s : 1: bias_add_comm_swap 4.91% : 0.106903s : 1: bootstrap 0.01% : 0.000158s : 1: cconv 0.00% : 0.000058s : 1: comm_op_add_attrs 0.01% : 0.000145s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.02% : 0.000337s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000008s : 1: detach_backward 0.00% : 0.000017s : 1: environ_conv 0.00% : 0.000039s : 1: event_method 0.00% : 0.000068s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000123s : 1: graph_reusing 0.00% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000061s : 1: interleave_parallel_branches 0.01% : 0.000123s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000080s : 1: label_micro_interleaved_index 0.10% : 0.002228s : 1: loop_unroll 0.00% : 0.000055s : 1: merge_cast_opt 0.00% : 0.000013s : 1: micro_interleaved_order_control 0.10% : 0.002129s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.00% : 0.000091s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000090s : 1: opt.transform.mutable_eliminate 4.36% : 0.094769s : 78: opt.transform.opt_a 0.00% : 0.000044s : 1: opt.transform.opt_after_cconv 0.00% : 0.000055s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000359s : 28: opt.transform.opt_b 0.01% : 0.000187s : 2: opt.transform.opt_trans_graph 0.01% : 0.000121s : 4: opt.transform.symbol_engine_opt 4.59% : 0.099856s : 1: opt_a 0.04% : 0.000952s : 1: opt_after_cconv 0.12% : 0.002708s : 1: opt_after_jit_grad 0.09% : 0.001889s : 1: opt_b 5.18% : 0.112580s : 1: optimize 0.01% : 0.000325s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000102s : 1: overlap_grad_flash_sp 0.00% : 0.000007s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000125s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000068s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000056s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000165s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000110s : 1: parallel-infer-symbol 0.00% : 0.000007s : 1: parallel-infer-symbol-second 0.00% : 0.000007s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000062s : 1: pre_auto_parallel 0.00% : 0.000021s : 1: py_interpret_to_execute 0.00% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.00% : 0.000023s : 1: remove_dup_value 0.03% : 0.000646s : 1: renormalize.infer 0.11% : 0.002496s : 1: renormalize.specialize 0.00% : 0.000067s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000018s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000026s : 1: rewriter_after_opt_a 0.02% : 0.000527s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000060s : 1: slice_recompute_activation 0.00% : 0.000008s : 1: split_layernorm_comm 0.00% : 0.000069s : 1: split_matmul_comm_elemetwise 0.01% : 0.000127s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000934s : 1: symbol_engine_optimizer 0.03% : 0.000609s : 1: tuple_transform 65.83% : 1.432154s : 1: type_inference TotalTime = 0.59879, [21] [bootstrap]: 0.127535 [type_inference]: 0.245489 [event_method]: 2.84e-05 [auto_monad]: 0.00013268 [graph_reusing]: 6.39999e-06 [inline]: 3.85e-06 [add_attr]: 0.124597, [1] [add_attr_with_inline]: 0.124579, [1] [Cycle 1]: 0.00010714, [2] [tag_attr]: 3.165e-05 [meta_addattr_fg_expand]: 7.21999e-06 [parallel-infer-symbol]: 3.93001e-06 [pre_auto_parallel]: 5.493e-05 [insert-virtual-dataset]: 2.69001e-06 [parallel-infer-symbol-second]: 9.29984e-07 [dataset_repeat_opt]: 2.53998e-06 [pipeline_split]: 2.24001e-06 [optimize]: 0.0814951, [53] [py_interpret_to_execute]: 1.543e-05 [rewriter_before_opt_a]: 0.00025827 [opt_a]: 0.0336813, [2] [Cycle 1]: 0.0328311, [45] [expand_dump_flag]: 3.76999e-06 [switch_simplify]: 5.066e-05 [loop_unroll]: 3.526e-05 [a_1]: 0.00080394 [with_stream_mark]: 2.84e-05 [recompute_prepare]: 1.605e-05 [updatestate_depend_eliminate]: 5.37001e-06 [updatestate_assign_eliminate]: 4.12998e-06 [updatestate_loads_eliminate]: 3.49001e-06 [parameter_eliminate]: 2.19001e-06 [a_2]: 0.00010127 [accelerated_algorithm]: 9.22001e-06 [shard]: 2.51998e-06 [meta_shard_fg_expand]: 3.13e-06 [shard_inline]: 7.45998e-06 [merge_send_recv]: 1.12e-05 [auto_parallel]: 8.67e-06 [parallel]: 3.149e-05 [flash_sp]: 1.221e-05 [merge_comm]: 5.81e-06 [allreduce_fusion]: 3.72002e-06 [matmul_add_comm_reduction]: 1.214e-05 [allreduce_slice_to_reducescatter]: 1.07e-06 [virtual_shard_identity]: 1.486e-05 [virtual_dataset]: 1.027e-05 [get_grad_eliminate_]: 7.65e-06 [virtual_output]: 8.08999e-06 [merge_forward]: 6.16e-06 [cell_reuse_recompute_pass]: 1.73002e-06 [offload_activation]: 1.279e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.803e-05 [merge_recompute_call_nodes]: 1.66998e-06 [before_grad]: 1.34e-05 [set_forward_comm_id_for_comm_node_pass]: 4.51002e-06 [meta_fg_expand]: 3.21001e-06 [flash_sp_send_recv_attached]: 2.91e-06 [receive_attached]: 2.71999e-06 [after_resolve]: 1.787e-05 [a_after_grad]: 1.408e-05 [renormalize]: 0.0310192 [add_forward_monad_depend]: 1.183e-05 [auto_monad_grad]: 2.58e-06 [auto_monad_eliminator]: 2.626e-05 [cse]: 7.26e-05 [a_3]: 7.661e-05 [Cycle 2]: 0.00083277, [45] [expand_dump_flag]: 2.61999e-06 [switch_simplify]: 1.03e-05 [loop_unroll]: 9.56998e-06 [a_1]: 0.00017467 [with_stream_mark]: 2.269e-05 [recompute_prepare]: 8.55999e-06 [updatestate_depend_eliminate]: 4.90001e-06 [updatestate_assign_eliminate]: 4.17e-06 [updatestate_loads_eliminate]: 3.9e-06 [parameter_eliminate]: 2.00002e-06 [a_2]: 9.105e-05 [accelerated_algorithm]: 8.08999e-06 [shard]: 2.74001e-06 [meta_shard_fg_expand]: 3.21999e-06 [shard_inline]: 7.82e-06 [merge_send_recv]: 1.017e-05 [auto_parallel]: 1.002e-05 [parallel]: 1.037e-05 [flash_sp]: 4.3e-06 [merge_comm]: 3.98999e-06 [allreduce_fusion]: 4.11001e-06 [matmul_add_comm_reduction]: 1.168e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 1.096e-05 [virtual_dataset]: 8.65999e-06 [get_grad_eliminate_]: 7.91001e-06 [virtual_output]: 8.47998e-06 [merge_forward]: 5.62999e-06 [cell_reuse_recompute_pass]: 3.26999e-06 [offload_activation]: 1.216e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.903e-05 [merge_recompute_call_nodes]: 2.04e-06 [before_grad]: 1.152e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94e-06 [meta_fg_expand]: 3.40998e-06 [flash_sp_send_recv_attached]: 1.99999e-06 [receive_attached]: 2.91e-06 [after_resolve]: 1.72e-05 [a_after_grad]: 1.219e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.16998e-06 [auto_monad_grad]: 2.02001e-06 [auto_monad_eliminator]: 8.43999e-06 [cse]: 2.365e-05 [a_3]: 4.283e-05 [py_interpret_to_execute_after_opt_a]: 1.193e-05 [slice_cell_reuse_recomputed_activation]: 2.18002e-06 [rewriter_after_opt_a]: 2.774e-05 [convert_after_rewriter]: 1.42999e-06 [order_py_execute_after_rewriter]: 1.22999e-06 [mutable_eliminate]: 0.00092 [opt_b]: 0.00027492, [1] [Cycle 1]: 0.00026524, [7] [b_1]: 0.00015635 [b_2]: 1.042e-05 [updatestate_depend_eliminate]: 1.341e-05 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 4.34997e-06 [renormalize]: 1.14998e-06 [cse]: 3.639e-05 [optimize_parallel_all_gather_comm]: 2.055e-05 [overlap_param_gather]: 2.94999e-06 [cconv]: 3.575e-05 [loop_unroll]: 0.00058708 [opt_after_cconv]: 0.00012623, [1] [Cycle 1]: 0.00011954, [7] [c_1]: 3.753e-05 [parameter_eliminate]: 5.24e-06 [updatestate_depend_eliminate]: 7.85e-06 [updatestate_assign_eliminate]: 3.36999e-06 [updatestate_loads_eliminate]: 2.51998e-06 [cse]: 2.726e-05 [renormalize]: 9.70002e-07 [remove_dup_value]: 4.511e-05 [tuple_transform]: 8.719e-05, [1] [Cycle 1]: 8.271e-05, [4] [d_1]: 5.29e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 7.98001e-06 [partial_unused_args_eliminate]: 2.17001e-06 [add_recomputation]: 5.662e-05 [cse_after_recomputation]: 2.547e-05, [1] [Cycle 1]: 2.01e-05, [1] [cse]: 1.495e-05 [environ_conv]: 6.06003e-06 [swap_dp_allreduce_reducescatter]: 5.91e-06 [bias_add_comm_swap]: 3.57997e-06 [label_micro_interleaved_index]: 6.09001e-06 [label_fine_grained_interleaved_index]: 2.60002e-06 [merge_cast_opt]: 1.97999e-06 [slice_recompute_activation]: 2.07001e-06 [micro_interleaved_order_control]: 3.14999e-06 [assign_add_opt]: 1.25001e-06 [ForceFp32Comm]: 8.29983e-07 [remove_cast_before_assign_add]: 1.34e-06 [full_micro_interleaved_order_control]: 2.44001e-06 [reorder_send_recv_between_fp_bp]: 2.96001e-06 [comm_op_add_attrs]: 1.34e-06 [add_comm_op_reuse_tag]: 1.17e-06 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.22e-06 [overlap_opt_shard_in_pipeline]: 7.97e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92999e-06 [control_data_broadcast_order]: 1.418e-05 [grouped_pairwise_exchange_alltoall]: 1.92999e-06 [offloading_packed_experts]: 4.35999e-06 [overlap_recompute_and_grad_model_parallel]: 4.57998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.49e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36002e-06 [overlap_recompute_comm]: 1.96e-06 [overlap_grad_ring_attention]: 4.27e-06 [overlap_grad_flash_sp]: 2.347e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.29999e-06 [split_layernorm_comm]: 1.84998e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 0.0449141, [1] [Cycle 1]: 0.0449034, [6] [build]: 3.97e-06 [elim_shapecalc]: 1.127e-05 [elim_not_effective]: 1.331e-05 [opt_reshape]: 7.82e-06 [fold_const_symbol]: 1.181e-05 [renormalize]: 2.16e-06 [detach_backward]: 6.51e-06 [pipeline_parallel_scheduler]: 2.50002e-06 [auto_monad_reorder]: 5.88e-05 [get_jit_bprop_graph]: 2.64001e-06 [rewriter_after_jit_bprop_graph]: 1.9e-05 [opt_after_jit_grad]: 0.0189254 [validate]: 7.16e-05 Sums bootstrap : 0.127535s : 29.79% type_inference : 0.245489s : 57.35% event_method : 0.000028s : 0.01% auto_monad : 0.000133s : 0.03% graph_reusing : 0.000006s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000032s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000055s : 0.01% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000015s : 0.00% optimize.rewriter_before_opt_a : 0.000258s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000061s : 0.01% optimize.opt_a.loop_unroll : 0.000045s : 0.01% optimize.opt_a.a_1 : 0.000979s : 0.23% optimize.opt_a.with_stream_mark : 0.000051s : 0.01% optimize.opt_a.recompute_prepare : 0.000025s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000192s : 0.04% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.00% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000015s : 0.00% optimize.opt_a.merge_send_recv : 0.000021s : 0.00% optimize.opt_a.auto_parallel : 0.000019s : 0.00% optimize.opt_a.parallel : 0.000042s : 0.01% optimize.opt_a.flash_sp : 0.000017s : 0.00% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000026s : 0.01% optimize.opt_a.virtual_dataset : 0.000019s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.00% optimize.opt_a.virtual_output : 0.000017s : 0.00% optimize.opt_a.merge_forward : 0.000012s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000025s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000025s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000035s : 0.01% optimize.opt_a.a_after_grad : 0.000026s : 0.01% optimize.opt_a.renormalize : 0.031019s : 7.25% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.01% optimize.opt_a.cse : 0.000096s : 0.02% optimize.opt_a.a_3 : 0.000119s : 0.03% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000920s : 0.21% optimize.opt_b.b_1 : 0.000156s : 0.04% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000036s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.00% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000036s : 0.01% optimize.loop_unroll : 0.000587s : 0.14% optimize.opt_after_cconv.c_1 : 0.000038s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000045s : 0.01% optimize.tuple_transform.d_1 : 0.000053s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000057s : 0.01% optimize.cse_after_recomputation.cse : 0.000015s : 0.00% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000008s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000023s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000002s : 0.00% detach_backward : 0.000007s : 0.00% pipeline_parallel_scheduler : 0.000003s : 0.00% auto_monad_reorder : 0.000059s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000019s : 0.00% opt_after_jit_grad : 0.018925s : 4.42% validate : 0.000072s : 0.02% Time group info: ------[substitution.] 0.000243 33 0.73% : 0.000002s : 2: substitution.elim_not_effective 0.54% : 0.000001s : 2: substitution.fold_const_symbol 2.54% : 0.000006s : 5: substitution.graph_param_transform 79.67% : 0.000194s : 4: substitution.inline 2.40% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.76% : 0.000007s : 4: substitution.remove_not_recompute_node 3.50% : 0.000009s : 6: substitution.replace_old_param 7.86% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.245386 2 99.25% : 0.243533s : 1: type_inference.infer 0.75% : 0.001853s : 1: type_inference.specialize ------[replace.] 0.000085 10 57.49% : 0.000049s : 4: replace.inline 42.51% : 0.000036s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000207 10 92.02% : 0.000191s : 4: match.inline 7.98% : 0.000017s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000257 1594 0.83% : 0.000002s : 16: predicate.accumulaten_eliminater 1.99% : 0.000005s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 1.02% : 0.000003s : 16: predicate.addn_zero_filter 0.75% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.37% : 0.000006s : 26: predicate.arithmetic_simplify 1.04% : 0.000003s : 16: predicate.cast_eliminate 0.57% : 0.000001s : 10: predicate.check_bprop_eliminate 0.49% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.63% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 16: predicate.dict_set_item_eliminator 1.55% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 5: predicate.elim_not_effective 0.31% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 21: predicate.environ_get_depend_swap 1.57% : 0.000004s : 31: predicate.environ_get_eliminate 1.11% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.52% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.36% : 0.000006s : 26: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.77% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.74% : 0.000002s : 10: predicate.get_grad_eliminate 0.17% : 0.000000s : 5: predicate.graph_param_transform 0.48% : 0.000001s : 10: predicate.incorporate_call 0.41% : 0.000001s : 10: predicate.incorporate_call_switch 5.38% : 0.000014s : 72: predicate.inline 0.80% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.91% : 0.000002s : 10: predicate.less_batch_normalization 1.79% : 0.000005s : 32: predicate.list_to_tuple_eliminator_ 2.26% : 0.000006s : 48: predicate.load_eliminater 0.90% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.29% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.44% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 10: predicate.merge_addn 0.49% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 16: predicate.minmaximum_grad 1.39% : 0.000004s : 5: predicate.mutable_eliminate 0.31% : 0.000001s : 5: predicate.opt_reshape 0.50% : 0.000001s : 5: predicate.parallel_virtual_node 1.87% : 0.000005s : 26: predicate.partial_defer_inline 1.39% : 0.000004s : 27: predicate.partial_eliminate 0.89% : 0.000002s : 16: predicate.print_const_string_wrapper 0.61% : 0.000002s : 10: predicate.reduce_all_const_elim 1.33% : 0.000003s : 16: predicate.reduce_eliminate 2.49% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 10: predicate.remove_not_recompute_node 1.65% : 0.000004s : 32: predicate.replace_applicator 0.53% : 0.000001s : 10: predicate.replace_old_param 0.33% : 0.000001s : 5: predicate.reset_defer_inline 0.97% : 0.000002s : 16: predicate.reshape_eliminate 0.56% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 5: predicate.row_tensor_eliminate 1.11% : 0.000003s : 10: predicate.same_eliminate 0.47% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.94% : 0.000002s : 10: predicate.shard_identity_eliminate 0.87% : 0.000002s : 10: predicate.special_op_eliminate 0.71% : 0.000002s : 10: predicate.specialize_transform 0.92% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.44% : 0.000004s : 26: predicate.switch_defer_inline 1.98% : 0.000005s : 36: predicate.switch_layer_defer_inline 4.99% : 0.000013s : 83: predicate.switch_simplify 0.85% : 0.000002s : 16: predicate.tile_eliminate 0.85% : 0.000002s : 16: predicate.transpose_eliminate 1.53% : 0.000004s : 26: predicate.tuple_list_convert_item_index_to_positive 1.41% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000004s : 26: predicate.tuple_list_get_item_depend_reorder 3.45% : 0.000009s : 42: predicate.tuple_list_get_item_eliminator 1.48% : 0.000004s : 26: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000006s : 36: predicate.tuple_list_set_item_eliminator 1.78% : 0.000005s : 32: predicate.tuple_to_list_eliminator_ 2.21% : 0.000006s : 48: predicate.updatestate_pure_node_eliminater 2.76% : 0.000007s : 58: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.86% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 10: predicate.virtual_output_eliminate 0.21% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001231 11 31.08% : 0.000383s : 5: func_graph_cloner_run.FuncGraphClonerGraph 68.92% : 0.000849s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.837595 192 0.00% : 0.000004s : 1: ForceFp32Comm 14.88% : 0.124605s : 1: add_attr 14.87% : 0.124584s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000061s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000141s : 1: auto_monad 0.01% : 0.000064s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 15.24% : 0.127691s : 1: bootstrap 0.00% : 0.000040s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.00% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000011s : 1: detach_backward 0.00% : 0.000010s : 1: environ_conv 0.00% : 0.000036s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.07% : 0.000599s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.11% : 0.000934s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000027s : 1: opt.transform.mutable_eliminate 0.18% : 0.001545s : 78: opt.transform.opt_a 0.00% : 0.000036s : 1: opt.transform.opt_after_cconv 0.01% : 0.000061s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000134s : 28: opt.transform.opt_b 0.01% : 0.000059s : 2: opt.transform.opt_trans_graph 0.00% : 0.000040s : 4: opt.transform.symbol_engine_opt 4.02% : 0.033686s : 1: opt_a 0.02% : 0.000130s : 1: opt_after_cconv 2.26% : 0.018954s : 1: opt_after_jit_grad 0.03% : 0.000278s : 1: opt_b 9.73% : 0.081507s : 1: optimize 0.00% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000005s : 1: order_py_execute_after_rewriter 0.00% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000011s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000008s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000060s : 1: pre_auto_parallel 0.00% : 0.000020s : 1: py_interpret_to_execute 0.00% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000050s : 1: remove_dup_value 3.58% : 0.029982s : 1: renormalize.infer 0.12% : 0.001017s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000022s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000032s : 1: rewriter_after_opt_a 0.03% : 0.000269s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 5.36% : 0.044927s : 1: symbol_engine_optimizer 0.01% : 0.000090s : 1: tuple_transform 29.31% : 0.245521s : 1: type_inference TotalTime = 0.175409, [21] [bootstrap]: 0.00048902 [type_inference]: 0.163839 [event_method]: 2.447e-05 [auto_monad]: 7.032e-05 [graph_reusing]: 6.31e-06 [inline]: 3.3e-06 [add_attr]: 0.00408189, [1] [add_attr_with_inline]: 0.00406715, [1] [Cycle 1]: 8.111e-05, [2] [tag_attr]: 2.76e-05 [meta_addattr_fg_expand]: 6.59001e-06 [parallel-infer-symbol]: 3.68e-06 [pre_auto_parallel]: 4.544e-05 [insert-virtual-dataset]: 2.78998e-06 [parallel-infer-symbol-second]: 8.90024e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00603595, [53] [py_interpret_to_execute]: 8.43001e-06 [rewriter_before_opt_a]: 0.00023107 [opt_a]: 0.00342072, [2] [Cycle 1]: 0.00271699, [45] [expand_dump_flag]: 3.20002e-06 [switch_simplify]: 4.836e-05 [loop_unroll]: 3.467e-05 [a_1]: 0.00072639 [with_stream_mark]: 2.263e-05 [recompute_prepare]: 1.155e-05 [updatestate_depend_eliminate]: 4.12998e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 3.02002e-06 [parameter_eliminate]: 1.92999e-06 [a_2]: 7.916e-05 [accelerated_algorithm]: 7.47998e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 3.3e-06 [shard_inline]: 7.14001e-06 [merge_send_recv]: 1.122e-05 [auto_parallel]: 8.52998e-06 [parallel]: 2.063e-05 [flash_sp]: 1.072e-05 [merge_comm]: 3.98001e-06 [allreduce_fusion]: 3.93001e-06 [matmul_add_comm_reduction]: 3.404e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 9.47999e-06 [virtual_dataset]: 8.58001e-06 [get_grad_eliminate_]: 6.86999e-06 [virtual_output]: 7.36999e-06 [merge_forward]: 4.71002e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 1.206e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.646e-05 [merge_recompute_call_nodes]: 2.02999e-06 [before_grad]: 1.092e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45999e-06 [meta_fg_expand]: 3.51999e-06 [flash_sp_send_recv_attached]: 3.13e-06 [receive_attached]: 2.29999e-06 [after_resolve]: 1.258e-05 [a_after_grad]: 1.064e-05 [renormalize]: 0.00115515 [add_forward_monad_depend]: 9.05999e-06 [auto_monad_grad]: 2.73e-06 [auto_monad_eliminator]: 2.254e-05 [cse]: 3.355e-05 [a_3]: 5.332e-05 [Cycle 2]: 0.00068772, [45] [expand_dump_flag]: 2.29001e-06 [switch_simplify]: 8.05999e-06 [loop_unroll]: 6.50002e-06 [a_1]: 0.00013002 [with_stream_mark]: 1.997e-05 [recompute_prepare]: 6.46999e-06 [updatestate_depend_eliminate]: 4.25999e-06 [updatestate_assign_eliminate]: 3.65998e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 1.37e-06 [a_2]: 6.743e-05 [accelerated_algorithm]: 5.89999e-06 [shard]: 1.81998e-06 [meta_shard_fg_expand]: 2.33998e-06 [shard_inline]: 6.58e-06 [merge_send_recv]: 8.08999e-06 [auto_parallel]: 8.69e-06 [parallel]: 7.77998e-06 [flash_sp]: 3.75e-06 [merge_comm]: 3.28998e-06 [allreduce_fusion]: 3.35e-06 [matmul_add_comm_reduction]: 8.95001e-06 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 7.18998e-06 [virtual_dataset]: 5.79e-06 [get_grad_eliminate_]: 6.44999e-06 [virtual_output]: 5.61998e-06 [merge_forward]: 3.94002e-06 [cell_reuse_recompute_pass]: 2.56e-06 [offload_activation]: 9.24998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.567e-05 [merge_recompute_call_nodes]: 8.00006e-07 [before_grad]: 9.29e-06 [set_forward_comm_id_for_comm_node_pass]: 3.8e-06 [meta_fg_expand]: 2.22999e-06 [flash_sp_send_recv_attached]: 1.40999e-06 [receive_attached]: 2.21e-06 [after_resolve]: 1.129e-05 [a_after_grad]: 8.82999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.39e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 8.15999e-06 [cse]: 1.706e-05 [a_3]: 3.741e-05 [py_interpret_to_execute_after_opt_a]: 7.84997e-06 [slice_cell_reuse_recomputed_activation]: 2.53e-06 [rewriter_after_opt_a]: 2.129e-05 [convert_after_rewriter]: 1.35999e-06 [order_py_execute_after_rewriter]: 1.27e-06 [mutable_eliminate]: 0.00081079 [opt_b]: 0.00020261, [1] [Cycle 1]: 0.00019474, [7] [b_1]: 0.00011637 [b_2]: 7.97998e-06 [updatestate_depend_eliminate]: 8.48999e-06 [updatestate_assign_eliminate]: 2.26e-06 [updatestate_loads_eliminate]: 2.58e-06 [renormalize]: 5.8001e-07 [cse]: 2.228e-05 [optimize_parallel_all_gather_comm]: 1.917e-05 [overlap_param_gather]: 2.31e-06 [cconv]: 3.34e-05 [loop_unroll]: 0.00051608 [opt_after_cconv]: 0.0001547, [1] [Cycle 1]: 0.0001476, [7] [c_1]: 3.03e-05 [parameter_eliminate]: 4.63999e-06 [updatestate_depend_eliminate]: 5.74e-06 [updatestate_assign_eliminate]: 2.50002e-06 [updatestate_loads_eliminate]: 2.34001e-06 [cse]: 2.014e-05 [renormalize]: 2.59985e-07 [remove_dup_value]: 1.536e-05 [tuple_transform]: 8.223e-05, [1] [Cycle 1]: 7.654e-05, [4] [d_1]: 4.762e-05 [none_parameter_eliminate]: 2.09e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.09001e-06 [partial_unused_args_eliminate]: 2.49001e-06 [add_recomputation]: 5.48e-05 [cse_after_recomputation]: 2.018e-05, [1] [Cycle 1]: 1.525e-05, [1] [cse]: 1.022e-05 [environ_conv]: 6.12999e-06 [swap_dp_allreduce_reducescatter]: 5.30001e-06 [bias_add_comm_swap]: 3.43999e-06 [label_micro_interleaved_index]: 5.04e-06 [label_fine_grained_interleaved_index]: 2.81999e-06 [merge_cast_opt]: 1.62999e-06 [slice_recompute_activation]: 1.99e-06 [micro_interleaved_order_control]: 2.73e-06 [assign_add_opt]: 1.68002e-06 [ForceFp32Comm]: 9.89996e-07 [remove_cast_before_assign_add]: 1.25001e-06 [full_micro_interleaved_order_control]: 2.54001e-06 [reorder_send_recv_between_fp_bp]: 2.74001e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.50001e-06 [overlap_opt_shard_in_pipeline]: 1.32e-06 [overlap_opt_shard_grad_in_pipeline]: 2.06998e-06 [control_data_broadcast_order]: 1.212e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 3.87998e-06 [overlap_recompute_and_grad_model_parallel]: 5.29e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.54e-06 [overlap_recompute_comm]: 2.41e-06 [overlap_grad_ring_attention]: 4.37998e-06 [overlap_grad_flash_sp]: 2.18e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 2.49001e-06 [split_layernorm_comm]: 2.31e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 8.185e-05, [1] [Cycle 1]: 7.734e-05, [6] [build]: 3.85e-06 [elim_shapecalc]: 1.188e-05 [elim_not_effective]: 1.335e-05 [opt_reshape]: 8.00999e-06 [fold_const_symbol]: 1.04e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.01e-06 [pipeline_parallel_scheduler]: 1.62001e-06 [auto_monad_reorder]: 1.758e-05 [get_jit_bprop_graph]: 2.58003e-06 [rewriter_after_jit_bprop_graph]: 5.52001e-06 [opt_after_jit_grad]: 0.00055329 [validate]: 4.678e-05 Sums bootstrap : 0.000489s : 0.29% type_inference : 0.163839s : 96.23% event_method : 0.000024s : 0.01% auto_monad : 0.000070s : 0.04% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.00% optimize.rewriter_before_opt_a : 0.000231s : 0.14% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000056s : 0.03% optimize.opt_a.loop_unroll : 0.000041s : 0.02% optimize.opt_a.a_1 : 0.000856s : 0.50% optimize.opt_a.with_stream_mark : 0.000043s : 0.03% optimize.opt_a.recompute_prepare : 0.000018s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000147s : 0.09% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.01% optimize.opt_a.auto_parallel : 0.000017s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000043s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.01% optimize.opt_a.a_after_grad : 0.000019s : 0.01% optimize.opt_a.renormalize : 0.001155s : 0.68% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.02% optimize.opt_a.cse : 0.000051s : 0.03% optimize.opt_a.a_3 : 0.000091s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000021s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000811s : 0.48% optimize.opt_b.b_1 : 0.000116s : 0.07% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.02% optimize.loop_unroll : 0.000516s : 0.30% optimize.opt_after_cconv.c_1 : 0.000030s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000048s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000055s : 0.03% optimize.cse_after_recomputation.cse : 0.000010s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000002s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000022s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000553s : 0.32% validate : 0.000047s : 0.03% Time group info: ------[substitution.] 0.000227 29 0.74% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000002s : 2: substitution.fold_const_symbol 2.53% : 0.000006s : 4: substitution.graph_param_transform 81.24% : 0.000184s : 5: substitution.inline 1.70% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.76% : 0.000006s : 4: substitution.remove_not_recompute_node 2.34% : 0.000005s : 4: substitution.replace_old_param 7.95% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.163754 2 99.33% : 0.162655s : 1: type_inference.infer 0.67% : 0.001099s : 1: type_inference.specialize ------[replace.] 0.000080 9 66.17% : 0.000053s : 5: replace.inline 33.83% : 0.000027s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000197 9 92.16% : 0.000181s : 5: match.inline 7.84% : 0.000015s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000213 1345 1.06% : 0.000002s : 14: predicate.accumulaten_eliminater 0.91% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.57% : 0.000001s : 8: predicate.addn_check_dump 1.02% : 0.000002s : 14: predicate.addn_zero_filter 0.90% : 0.000002s : 14: predicate.adjust_all_reduce_mul_add 2.09% : 0.000004s : 22: predicate.arithmetic_simplify 0.90% : 0.000002s : 14: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.44% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.96% : 0.000002s : 14: predicate.dict_get_item_const_eliminator 1.25% : 0.000003s : 14: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 14: predicate.dict_set_item_eliminator 1.18% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 18: predicate.environ_add_const_eliminate 1.16% : 0.000002s : 18: predicate.environ_get_add_eliminate 1.00% : 0.000002s : 18: predicate.environ_get_depend_swap 1.54% : 0.000003s : 26: predicate.environ_get_eliminate 1.02% : 0.000002s : 18: predicate.environ_get_set_eliminate 1.46% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.73% : 0.000006s : 23: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.70% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 1.06% : 0.000002s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.49% : 0.000001s : 8: predicate.incorporate_call 0.46% : 0.000001s : 8: predicate.incorporate_call_switch 6.11% : 0.000013s : 61: predicate.inline 0.62% : 0.000001s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 8: predicate.less_batch_normalization 2.07% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.42% : 0.000005s : 40: predicate.load_eliminater 0.91% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.85% : 0.000006s : 41: predicate.loop_unroll_before_grad 1.45% : 0.000003s : 22: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 8: predicate.merge_addn 0.51% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 14: predicate.minmaximum_grad 1.35% : 0.000003s : 4: predicate.mutable_eliminate 0.64% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 2.10% : 0.000004s : 23: predicate.partial_defer_inline 1.40% : 0.000003s : 22: predicate.partial_eliminate 0.94% : 0.000002s : 14: predicate.print_const_string_wrapper 0.54% : 0.000001s : 8: predicate.reduce_all_const_elim 1.08% : 0.000002s : 14: predicate.reduce_eliminate 2.26% : 0.000005s : 40: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.42% : 0.000003s : 26: predicate.replace_applicator 0.45% : 0.000001s : 8: predicate.replace_old_param 0.24% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000002s : 14: predicate.reshape_eliminate 0.68% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 4: predicate.row_tensor_eliminate 0.87% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.90% : 0.000002s : 8: predicate.shard_identity_eliminate 0.67% : 0.000001s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 1.14% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.56% : 0.000003s : 23: predicate.switch_defer_inline 2.09% : 0.000004s : 31: predicate.switch_layer_defer_inline 5.40% : 0.000011s : 76: predicate.switch_simplify 0.86% : 0.000002s : 14: predicate.tile_eliminate 1.01% : 0.000002s : 14: predicate.transpose_eliminate 1.34% : 0.000003s : 22: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000003s : 22: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000003s : 22: predicate.tuple_list_get_item_depend_reorder 3.52% : 0.000007s : 34: predicate.tuple_list_get_item_eliminator 1.51% : 0.000003s : 22: predicate.tuple_list_get_set_item_eliminator 2.12% : 0.000005s : 30: predicate.tuple_list_set_item_eliminator 1.83% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.31% : 0.000005s : 40: predicate.updatestate_pure_node_eliminater 2.84% : 0.000006s : 48: predicate.updatestate_useless_node_eliminater 0.29% : 0.000001s : 4: predicate.value_based_eliminate 0.67% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000826 12 39.01% : 0.000322s : 5: func_graph_cloner_run.FuncGraphClonerGraph 60.99% : 0.000504s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.188101 192 0.00% : 0.000004s : 1: ForceFp32Comm 2.17% : 0.004089s : 1: add_attr 2.17% : 0.004072s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.03% : 0.000059s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000076s : 1: auto_monad 0.01% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.28% : 0.000517s : 1: bootstrap 0.02% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000023s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.02% : 0.000031s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.28% : 0.000527s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.44% : 0.000823s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.69% : 0.001294s : 78: opt.transform.opt_a 0.02% : 0.000029s : 1: opt.transform.opt_after_cconv 0.01% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000096s : 28: opt.transform.opt_b 0.03% : 0.000053s : 2: opt.transform.opt_trans_graph 0.02% : 0.000040s : 4: opt.transform.symbol_engine_opt 1.82% : 0.003425s : 1: opt_a 0.08% : 0.000159s : 1: opt_after_cconv 0.30% : 0.000566s : 1: opt_after_jit_grad 0.11% : 0.000207s : 1: opt_b 3.21% : 0.006042s : 1: optimize 0.01% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000050s : 1: pre_auto_parallel 0.01% : 0.000012s : 1: py_interpret_to_execute 0.01% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.30% : 0.000560s : 1: renormalize.infer 0.31% : 0.000581s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000025s : 1: rewriter_after_opt_a 0.13% : 0.000239s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000085s : 1: symbol_engine_optimizer 0.05% : 0.000085s : 1: tuple_transform 87.12% : 0.163867s : 1: type_inference TotalTime = 14.4854, [21] [bootstrap]: 0.00060927 [type_inference]: 11.802 [event_method]: 0.00103045 [auto_monad]: 0.00029671 [graph_reusing]: 1.179e-05 [inline]: 3.58999e-06 [add_attr]: 0.00541606, [1] [add_attr_with_inline]: 0.00540427, [1] [Cycle 1]: 0.00016296, [2] [tag_attr]: 5.909e-05 [meta_addattr_fg_expand]: 1.547e-05 [parallel-infer-symbol]: 3.61001e-06 [pre_auto_parallel]: 7.753e-05 [insert-virtual-dataset]: 3.34001e-06 [parallel-infer-symbol-second]: 1.06002e-06 [dataset_repeat_opt]: 2.24999e-06 [pipeline_split]: 1.71e-06 [optimize]: 2.67496, [53] [py_interpret_to_execute]: 5.47001e-06 [rewriter_before_opt_a]: 0.0005139 [opt_a]: 2.67138, [3] [Cycle 1]: 2.66521, [45] [expand_dump_flag]: 6.71e-06 [switch_simplify]: 0.00018617 [loop_unroll]: 7.552e-05 [a_1]: 0.00178378 [with_stream_mark]: 3.42e-05 [recompute_prepare]: 2.707e-05 [updatestate_depend_eliminate]: 1.113e-05 [updatestate_assign_eliminate]: 8.67e-06 [updatestate_loads_eliminate]: 8.74e-06 [parameter_eliminate]: 3.31999e-06 [a_2]: 0.00027019 [accelerated_algorithm]: 1.817e-05 [shard]: 1.84998e-06 [meta_shard_fg_expand]: 8.53001e-06 [shard_inline]: 1.748e-05 [merge_send_recv]: 1.978e-05 [auto_parallel]: 1.508e-05 [parallel]: 0.864715 [flash_sp]: 4.239e-05 [merge_comm]: 3.372e-05 [allreduce_fusion]: 1.004e-05 [matmul_add_comm_reduction]: 4.456e-05 [allreduce_slice_to_reducescatter]: 1.33002e-06 [virtual_shard_identity]: 5.571e-05 [virtual_dataset]: 2.135e-05 [get_grad_eliminate_]: 2.006e-05 [virtual_output]: 1.805e-05 [merge_forward]: 1.33e-05 [cell_reuse_recompute_pass]: 3.41001e-06 [offload_activation]: 2.211e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.79e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 3.338e-05 [set_forward_comm_id_for_comm_node_pass]: 1.346e-05 [meta_fg_expand]: 0.00357704 [flash_sp_send_recv_attached]: 5.32001e-06 [receive_attached]: 2.79999e-06 [after_resolve]: 0.00010422 [a_after_grad]: 0.00012279 [renormalize]: 1.79218 [add_forward_monad_depend]: 1.83e-05 [auto_monad_grad]: 8.77e-06 [auto_monad_eliminator]: 7.829e-05 [cse]: 0.00058976 [a_3]: 0.00047184 [Cycle 2]: 0.00529967, [45] [expand_dump_flag]: 3.86001e-06 [switch_simplify]: 6.002e-05 [loop_unroll]: 5.338e-05 [a_1]: 0.00218191 [with_stream_mark]: 2.876e-05 [recompute_prepare]: 1.443e-05 [updatestate_depend_eliminate]: 6.01e-06 [updatestate_assign_eliminate]: 4.07e-06 [updatestate_loads_eliminate]: 3.71999e-06 [parameter_eliminate]: 2.37001e-06 [a_2]: 0.00011229 [accelerated_algorithm]: 5.567e-05 [shard]: 2.12999e-06 [meta_shard_fg_expand]: 4.15e-06 [shard_inline]: 9.31e-06 [merge_send_recv]: 1.136e-05 [auto_parallel]: 1.16e-05 [parallel]: 9.76e-06 [flash_sp]: 4.15e-06 [merge_comm]: 4.15e-06 [allreduce_fusion]: 3.95e-06 [matmul_add_comm_reduction]: 9.92999e-06 [allreduce_slice_to_reducescatter]: 1.37999e-06 [virtual_shard_identity]: 4.85e-05 [virtual_dataset]: 1.024e-05 [get_grad_eliminate_]: 8.92999e-06 [virtual_output]: 8.72e-06 [merge_forward]: 5.48002e-06 [cell_reuse_recompute_pass]: 1.62999e-06 [offload_activation]: 1.261e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.821e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.383e-05 [set_forward_comm_id_for_comm_node_pass]: 4.52e-06 [meta_fg_expand]: 0.00021882 [flash_sp_send_recv_attached]: 1.93002e-06 [receive_attached]: 3.13e-06 [after_resolve]: 1.882e-05 [a_after_grad]: 1.424e-05 [renormalize]: 0.00188529 [add_forward_monad_depend]: 5.77999e-06 [auto_monad_grad]: 2.35002e-06 [auto_monad_eliminator]: 1.645e-05 [cse]: 3.93e-05 [a_3]: 6.866e-05 [Cycle 3]: 0.00084143, [45] [expand_dump_flag]: 2.60997e-06 [switch_simplify]: 1.061e-05 [loop_unroll]: 9.87999e-06 [a_1]: 0.00019103 [with_stream_mark]: 1.249e-05 [recompute_prepare]: 8.55001e-06 [updatestate_depend_eliminate]: 4.27e-06 [updatestate_assign_eliminate]: 3.31001e-06 [updatestate_loads_eliminate]: 3.24001e-06 [parameter_eliminate]: 1.15999e-06 [a_2]: 0.00010582 [accelerated_algorithm]: 8.46002e-06 [shard]: 1.15001e-06 [meta_shard_fg_expand]: 2.44999e-06 [shard_inline]: 8.86997e-06 [merge_send_recv]: 6.49001e-06 [auto_parallel]: 7.95e-06 [parallel]: 5.94e-06 [flash_sp]: 9.70002e-07 [merge_comm]: 3.98001e-06 [allreduce_fusion]: 3.70998e-06 [matmul_add_comm_reduction]: 6.63e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 9.72999e-06 [virtual_dataset]: 7.84002e-06 [get_grad_eliminate_]: 9.77001e-06 [virtual_output]: 8.93002e-06 [merge_forward]: 4.53001e-06 [cell_reuse_recompute_pass]: 1.71998e-06 [offload_activation]: 9.72999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.63e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 1.202e-05 [set_forward_comm_id_for_comm_node_pass]: 4.58999e-06 [meta_fg_expand]: 2.70002e-06 [flash_sp_send_recv_attached]: 1.17999e-06 [receive_attached]: 1.41002e-06 [after_resolve]: 1.172e-05 [a_after_grad]: 1.316e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.57001e-06 [auto_monad_grad]: 1.11002e-06 [auto_monad_eliminator]: 8.69e-06 [cse]: 2.769e-05 [a_3]: 5.515e-05 [py_interpret_to_execute_after_opt_a]: 7.26999e-06 [slice_cell_reuse_recomputed_activation]: 2.00002e-06 [rewriter_after_opt_a]: 2.744e-05 [convert_after_rewriter]: 1.66e-06 [order_py_execute_after_rewriter]: 1.15001e-06 [mutable_eliminate]: 0.00107195 [opt_b]: 0.00028764, [1] [Cycle 1]: 0.00027817, [7] [b_1]: 0.00017993 [b_2]: 1.098e-05 [updatestate_depend_eliminate]: 8.55999e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 3.26999e-06 [renormalize]: 5.8001e-07 [cse]: 3.471e-05 [optimize_parallel_all_gather_comm]: 2.029e-05 [overlap_param_gather]: 2.89999e-06 [cconv]: 2.949e-05 [loop_unroll]: 0.0005362 [opt_after_cconv]: 0.00013361, [1] [Cycle 1]: 0.0001271, [7] [c_1]: 4.391e-05 [parameter_eliminate]: 3.2e-06 [updatestate_depend_eliminate]: 7.51001e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 3.13e-06 [cse]: 3.014e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 5.141e-05 [tuple_transform]: 0.00017599, [1] [Cycle 1]: 0.00017092, [4] [d_1]: 0.00013151 [none_parameter_eliminate]: 2.44999e-06 [renormalize]: 3.00002e-07 [switch_simplify]: 1.132e-05 [partial_unused_args_eliminate]: 2.04e-06 [add_recomputation]: 5.562e-05 [cse_after_recomputation]: 3.612e-05, [1] [Cycle 1]: 3.011e-05, [1] [cse]: 2.405e-05 [environ_conv]: 1.055e-05 [swap_dp_allreduce_reducescatter]: 6.01e-06 [bias_add_comm_swap]: 2.89999e-06 [label_micro_interleaved_index]: 4.65999e-06 [label_fine_grained_interleaved_index]: 2.73998e-06 [merge_cast_opt]: 1.49e-06 [slice_recompute_activation]: 2.27999e-06 [micro_interleaved_order_control]: 2.61e-06 [assign_add_opt]: 1.43002e-06 [ForceFp32Comm]: 1.34998e-06 [remove_cast_before_assign_add]: 1.14e-06 [full_micro_interleaved_order_control]: 2.39001e-06 [reorder_send_recv_between_fp_bp]: 2.80002e-06 [comm_op_add_attrs]: 1.12e-06 [add_comm_op_reuse_tag]: 1.19e-06 [interleave_split_concat_branches]: 1.49998e-06 [interleave_parallel_branches]: 1.10001e-06 [overlap_opt_shard_in_pipeline]: 4.542e-05 [overlap_opt_shard_grad_in_pipeline]: 2.26e-06 [control_data_broadcast_order]: 1.799e-05 [grouped_pairwise_exchange_alltoall]: 1.70001e-06 [offloading_packed_experts]: 4.55001e-06 [overlap_recompute_and_grad_model_parallel]: 5.82999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47999e-06 [overlap_recompute_comm]: 2.92002e-06 [overlap_grad_ring_attention]: 4.58001e-06 [overlap_grad_flash_sp]: 4.515e-05 [begin_end_overlap_inline]: 8.50006e-07 [split_matmul_comm_elemetwise]: 2.59001e-06 [split_layernorm_comm]: 1.76003e-06 [handle_group_info]: 1.15999e-06 [symbol_engine_optimizer]: 9.717e-05, [1] [Cycle 1]: 9.16e-05, [6] [build]: 3.25e-06 [elim_shapecalc]: 1.511e-05 [elim_not_effective]: 1.685e-05 [opt_reshape]: 1.037e-05 [fold_const_symbol]: 1.404e-05 [renormalize]: 3.09985e-07 [detach_backward]: 2.24001e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 2.141e-05 [get_jit_bprop_graph]: 1.68002e-06 [rewriter_after_jit_bprop_graph]: 5.26002e-06 [opt_after_jit_grad]: 0.0005908 [validate]: 0.00013093 Sums bootstrap : 0.000609s : 0.00% type_inference : 11.801985s : 81.51% event_method : 0.001030s : 0.01% auto_monad : 0.000297s : 0.00% graph_reusing : 0.000012s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000059s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000015s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000078s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000514s : 0.00% optimize.opt_a.expand_dump_flag : 0.000013s : 0.00% optimize.opt_a.switch_simplify : 0.000257s : 0.00% optimize.opt_a.loop_unroll : 0.000139s : 0.00% optimize.opt_a.a_1 : 0.004157s : 0.03% optimize.opt_a.with_stream_mark : 0.000075s : 0.00% optimize.opt_a.recompute_prepare : 0.000050s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000021s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000016s : 0.00% optimize.opt_a.parameter_eliminate : 0.000007s : 0.00% optimize.opt_a.a_2 : 0.000488s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000082s : 0.00% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000015s : 0.00% optimize.opt_a.shard_inline : 0.000036s : 0.00% optimize.opt_a.merge_send_recv : 0.000038s : 0.00% optimize.opt_a.auto_parallel : 0.000035s : 0.00% optimize.opt_a.parallel : 0.864730s : 5.97% optimize.opt_a.flash_sp : 0.000048s : 0.00% optimize.opt_a.merge_comm : 0.000042s : 0.00% optimize.opt_a.allreduce_fusion : 0.000018s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000061s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000114s : 0.00% optimize.opt_a.virtual_dataset : 0.000039s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000039s : 0.00% optimize.opt_a.virtual_output : 0.000036s : 0.00% optimize.opt_a.merge_forward : 0.000023s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% optimize.opt_a.offload_activation : 0.000044s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000082s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000059s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000023s : 0.00% optimize.opt_a.meta_fg_expand : 0.003799s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.00% optimize.opt_a.receive_attached : 0.000007s : 0.00% optimize.opt_a.after_resolve : 0.000135s : 0.00% optimize.opt_a.a_after_grad : 0.000150s : 0.00% optimize.opt_a.renormalize : 1.794070s : 12.39% optimize.opt_a.add_forward_monad_depend : 0.000026s : 0.00% optimize.opt_a.auto_monad_grad : 0.000012s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000103s : 0.00% optimize.opt_a.cse : 0.000657s : 0.00% optimize.opt_a.a_3 : 0.000596s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000027s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.001072s : 0.01% optimize.opt_b.b_1 : 0.000180s : 0.00% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000035s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.00% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000029s : 0.00% optimize.loop_unroll : 0.000536s : 0.00% optimize.opt_after_cconv.c_1 : 0.000044s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000030s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000051s : 0.00% optimize.tuple_transform.d_1 : 0.000132s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000056s : 0.00% optimize.cse_after_recomputation.cse : 0.000024s : 0.00% optimize.environ_conv : 0.000011s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000045s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000045s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000591s : 0.00% validate : 0.000131s : 0.00% Time group info: ------[substitution.] 0.001037 167 0.22% : 0.000002s : 2: substitution.elim_not_effective 0.91% : 0.000009s : 11: substitution.float_depend_g_call 0.38% : 0.000004s : 2: substitution.float_tuple_getitem_switch 0.15% : 0.000002s : 2: substitution.fold_const_symbol 0.70% : 0.000007s : 5: substitution.graph_param_transform 0.30% : 0.000003s : 2: substitution.incorporate_call 0.24% : 0.000002s : 2: substitution.incorporate_call_switch 69.21% : 0.000718s : 21: substitution.inline 2.51% : 0.000026s : 2: substitution.inline_without_move 1.13% : 0.000012s : 12: substitution.j_node_and_user_rematch 1.18% : 0.000012s : 7: substitution.minmaximum_grad 2.08% : 0.000022s : 11: substitution.partial_eliminate 1.27% : 0.000013s : 12: substitution.remove_not_recompute_node 3.21% : 0.000033s : 9: substitution.replace_applicator 1.54% : 0.000016s : 19: substitution.replace_old_param 0.31% : 0.000003s : 1: substitution.set_cell_output_no_recompute 2.69% : 0.000028s : 3: substitution.switch_simplify 2.22% : 0.000023s : 7: substitution.tuple_list_convert_item_index_to_positive 1.04% : 0.000011s : 7: substitution.tuple_list_get_item_const_eliminator 1.58% : 0.000016s : 7: substitution.tuple_list_get_item_depend_reorder 5.62% : 0.000058s : 16: substitution.tuple_list_get_item_eliminator 1.50% : 0.000016s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 11.801828 2 92.29% : 10.891702s : 1: type_inference.infer 7.71% : 0.910126s : 1: type_inference.specialize ------[replace.] 0.000394 31 57.97% : 0.000228s : 21: replace.inline 11.74% : 0.000046s : 3: replace.switch_simplify 30.30% : 0.000119s : 7: replace.tuple_list_get_item_eliminator ------[match.] 0.000754 31 93.44% : 0.000705s : 21: match.inline 3.34% : 0.000025s : 3: match.switch_simplify 3.22% : 0.000024s : 7: match.tuple_list_get_item_eliminator ------[predicate.] 0.001329 5056 0.62% : 0.000008s : 61: predicate.accumulaten_eliminater 0.16% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.25% : 0.000003s : 26: predicate.addn_check_dump 0.64% : 0.000008s : 61: predicate.addn_zero_filter 0.54% : 0.000007s : 61: predicate.adjust_all_reduce_mul_add 3.52% : 0.000047s : 87: predicate.arithmetic_simplify 0.61% : 0.000008s : 61: predicate.cast_eliminate 0.63% : 0.000008s : 65: predicate.check_bprop_eliminate 0.26% : 0.000003s : 26: predicate.compare_switch_simplify 0.04% : 0.000001s : 6: predicate.const_output_eliminate 0.27% : 0.000004s : 26: predicate.depend_value_elim 0.58% : 0.000008s : 61: predicate.dict_get_item_const_eliminator 0.68% : 0.000009s : 61: predicate.dict_get_item_eliminator 0.54% : 0.000007s : 61: predicate.dict_set_item_eliminator 0.16% : 0.000002s : 11: predicate.dumpgradient_eliminate 0.06% : 0.000001s : 5: predicate.elim_not_effective 0.07% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 0.63% : 0.000008s : 67: predicate.environ_add_const_eliminate 0.56% : 0.000007s : 67: predicate.environ_get_add_eliminate 0.59% : 0.000008s : 67: predicate.environ_get_depend_swap 0.84% : 0.000011s : 93: predicate.environ_get_eliminate 0.59% : 0.000008s : 67: predicate.environ_get_set_eliminate 0.86% : 0.000011s : 89: predicate.exchange_switch_depend_value 1.30% : 0.000017s : 89: predicate.float_depend_g_call 0.25% : 0.000003s : 26: predicate.float_environ_get_switch 0.34% : 0.000005s : 32: predicate.float_tuple_getitem_switch 0.03% : 0.000000s : 5: predicate.fold_const_symbol 0.33% : 0.000004s : 26: predicate.get_grad_eliminate 0.07% : 0.000001s : 5: predicate.graph_param_transform 0.25% : 0.000003s : 26: predicate.incorporate_call 0.22% : 0.000003s : 26: predicate.incorporate_call_switch 2.87% : 0.000038s : 214: predicate.inline 0.84% : 0.000011s : 65: predicate.inline_without_move 0.15% : 0.000002s : 26: predicate.j_node_and_user_rematch 3.77% : 0.000050s : 26: predicate.less_batch_normalization 0.92% : 0.000012s : 79: predicate.list_to_tuple_eliminator_ 1.27% : 0.000017s : 141: predicate.load_eliminater 0.16% : 0.000002s : 6: predicate.loop_unroll_after_grad 1.45% : 0.000019s : 138: predicate.loop_unroll_before_grad 0.75% : 0.000010s : 73: predicate.make_slice_get_slice_eliminator 0.29% : 0.000004s : 26: predicate.merge_addn 0.66% : 0.000009s : 65: predicate.micro_step_allgather_replace 0.66% : 0.000009s : 65: predicate.mini_step_allgather_replace 0.57% : 0.000008s : 61: predicate.minmaximum_grad 0.22% : 0.000003s : 6: predicate.mutable_eliminate 0.07% : 0.000001s : 5: predicate.opt_reshape 0.07% : 0.000001s : 6: predicate.parallel_virtual_node 1.28% : 0.000017s : 89: predicate.partial_defer_inline 0.76% : 0.000010s : 74: predicate.partial_eliminate 0.59% : 0.000008s : 61: predicate.print_const_string_wrapper 0.27% : 0.000004s : 26: predicate.reduce_all_const_elim 34.76% : 0.000462s : 61: predicate.reduce_eliminate 1.29% : 0.000017s : 141: predicate.redundant_stop_gradient_eliminater 0.23% : 0.000003s : 26: predicate.remove_not_recompute_node 1.07% : 0.000014s : 133: predicate.replace_applicator 0.43% : 0.000006s : 65: predicate.replace_old_param 0.05% : 0.000001s : 6: predicate.reset_defer_inline 0.63% : 0.000008s : 61: predicate.reshape_eliminate 0.80% : 0.000011s : 65: predicate.row_tensor_add_zeros_like 0.07% : 0.000001s : 6: predicate.row_tensor_eliminate 0.90% : 0.000012s : 65: predicate.same_eliminate 0.20% : 0.000003s : 26: predicate.set_cell_output_no_recompute 3.12% : 0.000042s : 26: predicate.shard_identity_eliminate 0.13% : 0.000002s : 11: predicate.special_op_eliminate 0.28% : 0.000004s : 26: predicate.specialize_transform 0.86% : 0.000011s : 65: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000012s : 65: predicate.stack_unstack_eliminate 0.05% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.01% : 0.000013s : 89: predicate.switch_defer_inline 1.62% : 0.000022s : 154: predicate.switch_layer_defer_inline 2.91% : 0.000039s : 264: predicate.switch_simplify 0.71% : 0.000009s : 61: predicate.tile_eliminate 0.66% : 0.000009s : 61: predicate.transpose_eliminate 0.72% : 0.000010s : 72: predicate.tuple_list_convert_item_index_to_positive 0.79% : 0.000011s : 72: predicate.tuple_list_get_item_const_eliminator 0.74% : 0.000010s : 72: predicate.tuple_list_get_item_depend_reorder 1.32% : 0.000018s : 105: predicate.tuple_list_get_item_eliminator 0.77% : 0.000010s : 72: predicate.tuple_list_get_set_item_eliminator 1.11% : 0.000015s : 98: predicate.tuple_list_set_item_eliminator 4.23% : 0.000056s : 79: predicate.tuple_to_list_eliminator_ 1.19% : 0.000016s : 141: predicate.updatestate_pure_node_eliminater 1.50% : 0.000020s : 167: predicate.updatestate_useless_node_eliminater 0.06% : 0.000001s : 6: predicate.value_based_eliminate 0.30% : 0.000004s : 26: predicate.virtual_dataset_eliminate 0.32% : 0.000004s : 26: predicate.virtual_output_eliminate 0.04% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.08% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 1.822000 69 50.32% : 0.916905s : 44: func_graph_cloner_run.FuncGraphClonerGraph 49.68% : 0.905095s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 18.966249 233 0.00% : 0.000004s : 1: ForceFp32Comm 0.03% : 0.005422s : 1: add_attr 0.03% : 0.005408s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.00% : 0.000060s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000307s : 1: auto_monad 0.00% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.00% : 0.000644s : 1: bootstrap 0.00% : 0.000033s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000039s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000014s : 1: environ_conv 0.01% : 0.001055s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000016s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.00% : 0.000547s : 1: loop_unroll 0.00% : 0.000044s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.01% : 0.001083s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000023s : 1: opt.transform.mutable_eliminate 0.03% : 0.006279s : 117: opt.transform.opt_a 0.00% : 0.000043s : 1: opt.transform.opt_after_cconv 0.00% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000159s : 28: opt.transform.opt_b 0.00% : 0.000140s : 2: opt.transform.opt_trans_graph 0.00% : 0.000051s : 4: opt.transform.symbol_engine_opt 14.09% : 2.671431s : 1: opt_a 0.00% : 0.000137s : 1: opt_after_cconv 0.00% : 0.000602s : 1: opt_after_jit_grad 0.00% : 0.000291s : 1: opt_b 14.10% : 2.674970s : 1: optimize 0.00% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000051s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000049s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.00% : 0.000082s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000056s : 1: remove_dup_value 4.95% : 0.938937s : 2: renormalize.infer 4.51% : 0.855051s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000031s : 1: rewriter_after_opt_a 0.00% : 0.000523s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000100s : 1: symbol_engine_optimizer 0.00% : 0.000179s : 1: tuple_transform 62.23% : 11.802016s : 1: type_inference TotalTime = 1.34133, [21] [bootstrap]: 0.00046556 [type_inference]: 1.21847 [event_method]: 0.00081507 [auto_monad]: 0.00010256 [graph_reusing]: 6.186e-05 [inline]: 8.28999e-06 [add_attr]: 0.00580593, [1] [add_attr_with_inline]: 0.005795, [1] [Cycle 1]: 9.211e-05, [2] [tag_attr]: 3.803e-05 [meta_addattr_fg_expand]: 6.63e-06 [parallel-infer-symbol]: 3.55e-06 [pre_auto_parallel]: 8.065e-05 [insert-virtual-dataset]: 3.55e-06 [parallel-infer-symbol-second]: 1.27e-06 [dataset_repeat_opt]: 2.41998e-06 [pipeline_split]: 1.87999e-06 [optimize]: 0.114442, [53] [py_interpret_to_execute]: 6.84999e-06 [rewriter_before_opt_a]: 0.0002902 [opt_a]: 0.111307, [2] [Cycle 1]: 0.00617583, [45] [expand_dump_flag]: 4.03999e-06 [switch_simplify]: 5.317e-05 [loop_unroll]: 3.67e-05 [a_1]: 0.00076236 [with_stream_mark]: 2.003e-05 [recompute_prepare]: 1.059e-05 [updatestate_depend_eliminate]: 4.90001e-06 [updatestate_assign_eliminate]: 4.41002e-06 [updatestate_loads_eliminate]: 3.81001e-06 [parameter_eliminate]: 1.81e-06 [a_2]: 0.00011196 [accelerated_algorithm]: 8.75001e-06 [shard]: 1.80001e-06 [meta_shard_fg_expand]: 2.34001e-06 [shard_inline]: 8.45001e-06 [merge_send_recv]: 9.81e-06 [auto_parallel]: 6.79999e-06 [parallel]: 1.954e-05 [flash_sp]: 9.03002e-06 [merge_comm]: 4.28999e-06 [allreduce_fusion]: 3.81001e-06 [matmul_add_comm_reduction]: 1.079e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 1.068e-05 [virtual_dataset]: 9.07001e-06 [get_grad_eliminate_]: 8.64003e-06 [virtual_output]: 8.52e-06 [merge_forward]: 4.82e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 1.24e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.483e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.254e-05 [set_forward_comm_id_for_comm_node_pass]: 4.23999e-06 [meta_fg_expand]: 3.36001e-06 [flash_sp_send_recv_attached]: 2.60002e-06 [receive_attached]: 2.11998e-06 [after_resolve]: 1.271e-05 [a_after_grad]: 1.291e-05 [renormalize]: 0.00454589 [add_forward_monad_depend]: 7.33999e-06 [auto_monad_grad]: 3.07002e-06 [auto_monad_eliminator]: 2.128e-05 [cse]: 4.932e-05 [a_3]: 7.052e-05 [Cycle 2]: 0.105117, [45] [expand_dump_flag]: 1.67001e-06 [switch_simplify]: 9.94001e-06 [loop_unroll]: 8.81997e-06 [a_1]: 0.0002092 [with_stream_mark]: 1.724e-05 [recompute_prepare]: 8.82999e-06 [updatestate_depend_eliminate]: 4.60999e-06 [updatestate_assign_eliminate]: 3.97e-06 [updatestate_loads_eliminate]: 4.02e-06 [parameter_eliminate]: 1.47001e-06 [a_2]: 0.104056 [accelerated_algorithm]: 2.137e-05 [shard]: 6.58e-06 [meta_shard_fg_expand]: 4.70001e-06 [shard_inline]: 9.54e-06 [merge_send_recv]: 4.518e-05 [auto_parallel]: 1.89e-05 [parallel]: 1.346e-05 [flash_sp]: 6.74999e-06 [merge_comm]: 4.94e-06 [allreduce_fusion]: 4.28001e-06 [matmul_add_comm_reduction]: 1.541e-05 [allreduce_slice_to_reducescatter]: 1.59e-06 [virtual_shard_identity]: 2.451e-05 [virtual_dataset]: 9.46e-06 [get_grad_eliminate_]: 8.61002e-06 [virtual_output]: 8.65001e-06 [merge_forward]: 5.44e-06 [cell_reuse_recompute_pass]: 3.86999e-06 [offload_activation]: 1.372e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.588e-05 [merge_recompute_call_nodes]: 1.80001e-06 [before_grad]: 1.381e-05 [set_forward_comm_id_for_comm_node_pass]: 4.11001e-06 [meta_fg_expand]: 3.68e-06 [flash_sp_send_recv_attached]: 1.59e-06 [receive_attached]: 2.68e-06 [after_resolve]: 1.773e-05 [a_after_grad]: 1.298e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 8.36002e-06 [auto_monad_grad]: 3.18998e-06 [auto_monad_eliminator]: 2.225e-05 [cse]: 6.375e-05 [a_3]: 5.622e-05 [py_interpret_to_execute_after_opt_a]: 1.161e-05 [slice_cell_reuse_recomputed_activation]: 2.31e-06 [rewriter_after_opt_a]: 3.096e-05 [convert_after_rewriter]: 1.22e-06 [order_py_execute_after_rewriter]: 1.17e-06 [mutable_eliminate]: 0.00086813 [opt_b]: 0.0002883, [1] [Cycle 1]: 0.00027738, [7] [b_1]: 0.00018157 [b_2]: 1.092e-05 [updatestate_depend_eliminate]: 7.4e-06 [updatestate_assign_eliminate]: 4.16001e-06 [updatestate_loads_eliminate]: 3.74002e-06 [renormalize]: 1.21997e-06 [cse]: 3.353e-05 [optimize_parallel_all_gather_comm]: 1.81e-05 [overlap_param_gather]: 2.44999e-06 [cconv]: 3.992e-05 [loop_unroll]: 0.00050874 [opt_after_cconv]: 0.00012983, [1] [Cycle 1]: 0.00012329, [7] [c_1]: 4.32e-05 [parameter_eliminate]: 4.49998e-06 [updatestate_depend_eliminate]: 6.12999e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 2.98998e-06 [cse]: 2.808e-05 [renormalize]: 9.49978e-07 [remove_dup_value]: 6.122e-05 [tuple_transform]: 9.138e-05, [1] [Cycle 1]: 8.633e-05, [4] [d_1]: 5.7e-05 [none_parameter_eliminate]: 1.61002e-06 [renormalize]: 1.30007e-07 [switch_simplify]: 9.84001e-06 [partial_unused_args_eliminate]: 2.09999e-06 [add_recomputation]: 5.946e-05 [cse_after_recomputation]: 0.00025862, [1] [Cycle 1]: 2.52e-05, [1] [cse]: 1.966e-05 [environ_conv]: 1.295e-05 [swap_dp_allreduce_reducescatter]: 7.48999e-06 [bias_add_comm_swap]: 3.23e-06 [label_micro_interleaved_index]: 5.10999e-06 [label_fine_grained_interleaved_index]: 3.08e-06 [merge_cast_opt]: 1.37999e-06 [slice_recompute_activation]: 2.39999e-06 [micro_interleaved_order_control]: 2.57001e-06 [assign_add_opt]: 1.35999e-06 [ForceFp32Comm]: 8.50006e-07 [remove_cast_before_assign_add]: 1.15999e-06 [full_micro_interleaved_order_control]: 2.74001e-06 [reorder_send_recv_between_fp_bp]: 3.02002e-06 [comm_op_add_attrs]: 1.15001e-06 [add_comm_op_reuse_tag]: 1.08001e-06 [interleave_split_concat_branches]: 1.14998e-06 [interleave_parallel_branches]: 1.14e-06 [overlap_opt_shard_in_pipeline]: 1.74e-06 [overlap_opt_shard_grad_in_pipeline]: 2.15002e-06 [control_data_broadcast_order]: 1.551e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 4.28001e-06 [overlap_recompute_and_grad_model_parallel]: 5.51002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.38002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47001e-06 [overlap_recompute_comm]: 2.78e-06 [overlap_grad_ring_attention]: 4.32998e-06 [overlap_grad_flash_sp]: 2.412e-05 [begin_end_overlap_inline]: 6.59988e-07 [split_matmul_comm_elemetwise]: 2.39001e-06 [split_layernorm_comm]: 2.02999e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 9.018e-05, [1] [Cycle 1]: 8.425e-05, [6] [build]: 4.43001e-06 [elim_shapecalc]: 1.378e-05 [elim_not_effective]: 1.564e-05 [opt_reshape]: 9.20001e-06 [fold_const_symbol]: 1.244e-05 [renormalize]: 2.09984e-07 [detach_backward]: 2.67001e-06 [pipeline_parallel_scheduler]: 1.67001e-06 [auto_monad_reorder]: 2.022e-05 [get_jit_bprop_graph]: 2.34001e-06 [rewriter_after_jit_bprop_graph]: 4.87998e-06 [opt_after_jit_grad]: 0.0006046 [validate]: 0.00013808 Sums bootstrap : 0.000466s : 0.03% type_inference : 1.218468s : 91.34% event_method : 0.000815s : 0.06% auto_monad : 0.000103s : 0.01% graph_reusing : 0.000062s : 0.00% inline : 0.000008s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000038s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000081s : 0.01% insert-virtual-dataset : 0.000004s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.00% optimize.rewriter_before_opt_a : 0.000290s : 0.02% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000063s : 0.00% optimize.opt_a.loop_unroll : 0.000046s : 0.00% optimize.opt_a.a_1 : 0.000972s : 0.07% optimize.opt_a.with_stream_mark : 0.000037s : 0.00% optimize.opt_a.recompute_prepare : 0.000019s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.104168s : 7.81% optimize.opt_a.accelerated_algorithm : 0.000030s : 0.00% optimize.opt_a.shard : 0.000008s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.00% optimize.opt_a.shard_inline : 0.000018s : 0.00% optimize.opt_a.merge_send_recv : 0.000055s : 0.00% optimize.opt_a.auto_parallel : 0.000026s : 0.00% optimize.opt_a.parallel : 0.000033s : 0.00% optimize.opt_a.flash_sp : 0.000016s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000035s : 0.00% optimize.opt_a.virtual_dataset : 0.000019s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.00% optimize.opt_a.virtual_output : 0.000017s : 0.00% optimize.opt_a.merge_forward : 0.000010s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000026s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000030s : 0.00% optimize.opt_a.a_after_grad : 0.000026s : 0.00% optimize.opt_a.renormalize : 0.004546s : 0.34% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.00% optimize.opt_a.auto_monad_grad : 0.000006s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000044s : 0.00% optimize.opt_a.cse : 0.000113s : 0.01% optimize.opt_a.a_3 : 0.000127s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000031s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000868s : 0.07% optimize.opt_b.b_1 : 0.000182s : 0.01% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000034s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000040s : 0.00% optimize.loop_unroll : 0.000509s : 0.04% optimize.opt_after_cconv.c_1 : 0.000043s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000028s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000061s : 0.00% optimize.tuple_transform.d_1 : 0.000057s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000059s : 0.00% optimize.cse_after_recomputation.cse : 0.000020s : 0.00% optimize.environ_conv : 0.000013s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000024s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000605s : 0.05% validate : 0.000138s : 0.01% Time group info: ------[substitution.] 0.000234 27 0.98% : 0.000002s : 2: substitution.elim_not_effective 0.77% : 0.000002s : 2: substitution.fold_const_symbol 3.03% : 0.000007s : 5: substitution.graph_param_transform 81.44% : 0.000190s : 6: substitution.inline 2.43% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.46% : 0.000006s : 4: substitution.remove_not_recompute_node 3.31% : 0.000008s : 2: substitution.replace_old_param 5.58% : 0.000013s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 1.218292 2 99.52% : 1.212436s : 1: type_inference.infer 0.48% : 0.005856s : 1: type_inference.specialize ------[replace.] 0.000079 8 79.57% : 0.000063s : 6: replace.inline 20.43% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000198 8 94.05% : 0.000186s : 6: match.inline 5.95% : 0.000012s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000275 1750 0.77% : 0.000002s : 18: predicate.accumulaten_eliminater 0.90% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 12: predicate.addn_check_dump 0.83% : 0.000002s : 18: predicate.addn_zero_filter 0.73% : 0.000002s : 18: predicate.adjust_all_reduce_mul_add 2.34% : 0.000006s : 30: predicate.arithmetic_simplify 0.90% : 0.000002s : 18: predicate.cast_eliminate 0.64% : 0.000002s : 12: predicate.check_bprop_eliminate 0.51% : 0.000001s : 12: predicate.compare_switch_simplify 0.19% : 0.000001s : 6: predicate.const_output_eliminate 0.63% : 0.000002s : 12: predicate.depend_value_elim 0.89% : 0.000002s : 18: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 18: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 18: predicate.dict_set_item_eliminator 0.78% : 0.000002s : 11: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 5: predicate.elim_not_effective 0.44% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 24: predicate.environ_add_const_eliminate 1.17% : 0.000003s : 24: predicate.environ_get_add_eliminate 1.06% : 0.000003s : 24: predicate.environ_get_depend_swap 1.62% : 0.000004s : 36: predicate.environ_get_eliminate 0.98% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.20% : 0.000003s : 26: predicate.exchange_switch_depend_value 1.93% : 0.000005s : 26: predicate.float_depend_g_call 1.10% : 0.000003s : 12: predicate.float_environ_get_switch 0.79% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000002s : 12: predicate.get_grad_eliminate 0.22% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 12: predicate.incorporate_call 0.47% : 0.000001s : 12: predicate.incorporate_call_switch 7.54% : 0.000021s : 80: predicate.inline 0.63% : 0.000002s : 12: predicate.inline_without_move 0.32% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.98% : 0.000003s : 12: predicate.less_batch_normalization 1.76% : 0.000005s : 31: predicate.list_to_tuple_eliminator_ 2.06% : 0.000006s : 50: predicate.load_eliminater 0.77% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.14% : 0.000006s : 43: predicate.loop_unroll_before_grad 1.59% : 0.000004s : 30: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 12: predicate.merge_addn 0.56% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.61% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.73% : 0.000002s : 18: predicate.minmaximum_grad 1.09% : 0.000003s : 6: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.43% : 0.000001s : 6: predicate.parallel_virtual_node 1.55% : 0.000004s : 26: predicate.partial_defer_inline 1.19% : 0.000003s : 26: predicate.partial_eliminate 0.86% : 0.000002s : 18: predicate.print_const_string_wrapper 0.65% : 0.000002s : 12: predicate.reduce_all_const_elim 1.10% : 0.000003s : 18: predicate.reduce_eliminate 2.17% : 0.000006s : 50: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 12: predicate.remove_not_recompute_node 1.39% : 0.000004s : 32: predicate.replace_applicator 0.57% : 0.000002s : 12: predicate.replace_old_param 0.28% : 0.000001s : 6: predicate.reset_defer_inline 0.85% : 0.000002s : 18: predicate.reshape_eliminate 0.68% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 6: predicate.row_tensor_eliminate 0.78% : 0.000002s : 12: predicate.same_eliminate 0.46% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 12: predicate.shard_identity_eliminate 0.64% : 0.000002s : 11: predicate.special_op_eliminate 0.60% : 0.000002s : 12: predicate.specialize_transform 0.90% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.33% : 0.000004s : 26: predicate.switch_defer_inline 1.87% : 0.000005s : 38: predicate.switch_layer_defer_inline 4.50% : 0.000012s : 86: predicate.switch_simplify 0.78% : 0.000002s : 18: predicate.tile_eliminate 0.84% : 0.000002s : 18: predicate.transpose_eliminate 1.50% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 2.89% : 0.000008s : 43: predicate.tuple_list_get_item_eliminator 1.56% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000006s : 41: predicate.tuple_list_set_item_eliminator 1.50% : 0.000004s : 31: predicate.tuple_to_list_eliminator_ 6.18% : 0.000017s : 50: predicate.updatestate_pure_node_eliminater 3.40% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 6: predicate.value_based_eliminate 0.72% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.67% : 0.000002s : 12: predicate.virtual_output_eliminate 0.18% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.48% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004261 30 62.97% : 0.002683s : 22: func_graph_cloner_run.FuncGraphClonerGraph 37.03% : 0.001578s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 1.571701 192 0.00% : 0.000004s : 1: ForceFp32Comm 0.37% : 0.005813s : 1: add_attr 0.37% : 0.005799s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000063s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000111s : 1: auto_monad 0.00% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.03% : 0.000496s : 1: bootstrap 0.00% : 0.000043s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000263s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000017s : 1: environ_conv 0.05% : 0.000841s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000068s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000012s : 1: inline 0.00% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.03% : 0.000518s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.06% : 0.000880s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000021s : 1: opt.transform.mutable_eliminate 6.71% : 0.105505s : 78: opt.transform.opt_a 0.00% : 0.000042s : 1: opt.transform.opt_after_cconv 0.00% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000158s : 28: opt.transform.opt_b 0.00% : 0.000065s : 2: opt.transform.opt_trans_graph 0.00% : 0.000047s : 4: opt.transform.symbol_engine_opt 7.08% : 0.111312s : 1: opt_a 0.01% : 0.000133s : 1: opt_after_cconv 0.04% : 0.000617s : 1: opt_after_jit_grad 0.02% : 0.000292s : 1: opt_b 7.28% : 0.114449s : 1: optimize 0.00% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000087s : 1: pre_auto_parallel 0.00% : 0.000012s : 1: py_interpret_to_execute 0.00% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000065s : 1: remove_dup_value 0.19% : 0.002953s : 1: renormalize.infer 0.10% : 0.001580s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000034s : 1: rewriter_after_opt_a 0.02% : 0.000297s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000093s : 1: symbol_engine_optimizer 0.01% : 0.000094s : 1: tuple_transform 77.53% : 1.218529s : 1: type_inference TotalTime = 2.56867, [21] [bootstrap]: 0.00071286 [type_inference]: 1.86259 [event_method]: 6.125e-05 [auto_monad]: 0.00017297 [graph_reusing]: 9.62001e-06 [inline]: 2.96001e-06 [add_attr]: 0.00416374, [1] [add_attr_with_inline]: 0.00415176, [1] [Cycle 1]: 0.00011415, [2] [tag_attr]: 5.898e-05 [meta_addattr_fg_expand]: 1.264e-05 [parallel-infer-symbol]: 4.01001e-06 [pre_auto_parallel]: 7.238e-05 [insert-virtual-dataset]: 2.42001e-06 [parallel-infer-symbol-second]: 1.25001e-06 [dataset_repeat_opt]: 2.39999e-06 [pipeline_split]: 1.90001e-06 [optimize]: 0.70002, [53] [py_interpret_to_execute]: 6.95002e-06 [rewriter_before_opt_a]: 0.00043565 [opt_a]: 0.648797, [3] [Cycle 1]: 0.566561, [45] [expand_dump_flag]: 5.57001e-06 [switch_simplify]: 0.00016755 [loop_unroll]: 7.292e-05 [a_1]: 0.00173881 [with_stream_mark]: 3.614e-05 [recompute_prepare]: 2.743e-05 [updatestate_depend_eliminate]: 1.102e-05 [updatestate_assign_eliminate]: 8.82999e-06 [updatestate_loads_eliminate]: 8.23999e-06 [parameter_eliminate]: 2.84999e-06 [a_2]: 0.00028281 [accelerated_algorithm]: 0.278387 [shard]: 7.08998e-06 [meta_shard_fg_expand]: 1.787e-05 [shard_inline]: 3.497e-05 [merge_send_recv]: 4.053e-05 [auto_parallel]: 2.382e-05 [parallel]: 2.651e-05 [flash_sp]: 1.973e-05 [merge_comm]: 1.066e-05 [allreduce_fusion]: 9.72001e-06 [matmul_add_comm_reduction]: 4.097e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 2.538e-05 [virtual_dataset]: 2.047e-05 [get_grad_eliminate_]: 1.904e-05 [virtual_output]: 1.893e-05 [merge_forward]: 1.104e-05 [cell_reuse_recompute_pass]: 3.32002e-06 [offload_activation]: 2.12e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.62e-05 [merge_recompute_call_nodes]: 2.19001e-06 [before_grad]: 3.159e-05 [set_forward_comm_id_for_comm_node_pass]: 1.016e-05 [meta_fg_expand]: 0.00324793 [flash_sp_send_recv_attached]: 6.97002e-06 [receive_attached]: 3.3e-06 [after_resolve]: 0.00011243 [a_after_grad]: 0.00012356 [renormalize]: 0.280282 [add_forward_monad_depend]: 1.608e-05 [auto_monad_grad]: 9.41003e-06 [auto_monad_eliminator]: 7.578e-05 [cse]: 0.00057302 [a_3]: 0.00047771 [Cycle 2]: 0.0811949, [45] [expand_dump_flag]: 3.85e-06 [switch_simplify]: 6.079e-05 [loop_unroll]: 5.711e-05 [a_1]: 0.00155188 [with_stream_mark]: 2.48e-05 [recompute_prepare]: 1.162e-05 [updatestate_depend_eliminate]: 5.17e-06 [updatestate_assign_eliminate]: 4.15e-06 [updatestate_loads_eliminate]: 3.65998e-06 [parameter_eliminate]: 0.0745366 [a_2]: 0.00021986 [accelerated_algorithm]: 1.53e-05 [shard]: 5.57001e-06 [meta_shard_fg_expand]: 1.304e-05 [shard_inline]: 9.47999e-06 [merge_send_recv]: 2.379e-05 [auto_parallel]: 1.864e-05 [parallel]: 1.278e-05 [flash_sp]: 6.79999e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 4.23999e-06 [matmul_add_comm_reduction]: 1.322e-05 [allreduce_slice_to_reducescatter]: 1.45001e-06 [virtual_shard_identity]: 1.675e-05 [virtual_dataset]: 1.259e-05 [get_grad_eliminate_]: 9.10001e-06 [virtual_output]: 9.14e-06 [merge_forward]: 5.32001e-06 [cell_reuse_recompute_pass]: 3.77998e-06 [offload_activation]: 1.326e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.283e-05 [merge_recompute_call_nodes]: 1.35999e-06 [before_grad]: 1.571e-05 [set_forward_comm_id_for_comm_node_pass]: 4.70001e-06 [meta_fg_expand]: 0.00024978 [flash_sp_send_recv_attached]: 2.21e-06 [receive_attached]: 2.89001e-06 [after_resolve]: 1.538e-05 [a_after_grad]: 1.618e-05 [renormalize]: 0.00351377 [add_forward_monad_depend]: 1.364e-05 [auto_monad_grad]: 3.11001e-06 [auto_monad_eliminator]: 2.856e-05 [cse]: 5.969e-05 [a_3]: 9.356e-05 [Cycle 3]: 0.00101623, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 1.336e-05 [loop_unroll]: 1.012e-05 [a_1]: 0.00020406 [with_stream_mark]: 2.372e-05 [recompute_prepare]: 1.106e-05 [updatestate_depend_eliminate]: 5.29e-06 [updatestate_assign_eliminate]: 4.42e-06 [updatestate_loads_eliminate]: 3.95e-06 [parameter_eliminate]: 2.04e-06 [a_2]: 0.00010704 [accelerated_algorithm]: 1.218e-05 [shard]: 3.48e-06 [meta_shard_fg_expand]: 4.91002e-06 [shard_inline]: 9.72001e-06 [merge_send_recv]: 1.206e-05 [auto_parallel]: 1.342e-05 [parallel]: 1.163e-05 [flash_sp]: 1.97999e-06 [merge_comm]: 4.67e-06 [allreduce_fusion]: 3.73001e-06 [matmul_add_comm_reduction]: 1.35e-05 [allreduce_slice_to_reducescatter]: 1.17e-06 [virtual_shard_identity]: 1.419e-05 [virtual_dataset]: 8.92999e-06 [get_grad_eliminate_]: 9.89001e-06 [virtual_output]: 9.86e-06 [merge_forward]: 5.16002e-06 [cell_reuse_recompute_pass]: 4.37e-06 [offload_activation]: 1.251e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.849e-05 [merge_recompute_call_nodes]: 2.17001e-06 [before_grad]: 1.583e-05 [set_forward_comm_id_for_comm_node_pass]: 4.33001e-06 [meta_fg_expand]: 6.45002e-06 [flash_sp_send_recv_attached]: 2.40002e-06 [receive_attached]: 2.89999e-06 [after_resolve]: 9.71e-06 [a_after_grad]: 1.432e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.59002e-06 [auto_monad_grad]: 2.31e-06 [auto_monad_eliminator]: 1.659e-05 [cse]: 4.032e-05 [a_3]: 5.401e-05 [py_interpret_to_execute_after_opt_a]: 1.613e-05 [slice_cell_reuse_recomputed_activation]: 2.78e-06 [rewriter_after_opt_a]: 3.449e-05 [convert_after_rewriter]: 2.73e-06 [order_py_execute_after_rewriter]: 1.24e-06 [mutable_eliminate]: 0.0487997 [opt_b]: 0.00034865, [1] [Cycle 1]: 0.00033703, [7] [b_1]: 0.00019889 [b_2]: 1.338e-05 [updatestate_depend_eliminate]: 1.127e-05 [updatestate_assign_eliminate]: 4.33001e-06 [updatestate_loads_eliminate]: 4.00998e-06 [renormalize]: 1.39e-06 [cse]: 6.071e-05 [optimize_parallel_all_gather_comm]: 2.902e-05 [overlap_param_gather]: 1.96e-06 [cconv]: 4.002e-05 [loop_unroll]: 0.00054799 [opt_after_cconv]: 0.00014516, [1] [Cycle 1]: 0.00013693, [7] [c_1]: 3.86e-05 [parameter_eliminate]: 6.06998e-06 [updatestate_depend_eliminate]: 8.82999e-06 [updatestate_assign_eliminate]: 3.51001e-06 [updatestate_loads_eliminate]: 3.08998e-06 [cse]: 3.696e-05 [renormalize]: 4.10015e-07 [remove_dup_value]: 3.256e-05 [tuple_transform]: 0.0001057, [1] [Cycle 1]: 0.00010098, [4] [d_1]: 6.621e-05 [none_parameter_eliminate]: 2.04999e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 1.085e-05 [partial_unused_args_eliminate]: 2.46e-06 [add_recomputation]: 5.584e-05 [cse_after_recomputation]: 3.362e-05, [1] [Cycle 1]: 2.873e-05, [1] [cse]: 2.047e-05 [environ_conv]: 1.421e-05 [swap_dp_allreduce_reducescatter]: 7.41001e-06 [bias_add_comm_swap]: 3.98001e-06 [label_micro_interleaved_index]: 6.06e-06 [label_fine_grained_interleaved_index]: 3.33e-06 [merge_cast_opt]: 1.59998e-06 [slice_recompute_activation]: 2.34999e-06 [micro_interleaved_order_control]: 2.58e-06 [assign_add_opt]: 1.17999e-06 [ForceFp32Comm]: 9.20001e-07 [remove_cast_before_assign_add]: 1.40999e-06 [full_micro_interleaved_order_control]: 2.43e-06 [reorder_send_recv_between_fp_bp]: 2.76e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 1.11002e-06 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.17999e-06 [overlap_opt_shard_in_pipeline]: 1.32999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.96e-06 [control_data_broadcast_order]: 1.476e-05 [grouped_pairwise_exchange_alltoall]: 1.58002e-06 [offloading_packed_experts]: 4.32e-06 [overlap_recompute_and_grad_model_parallel]: 5.98998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.72001e-06 [overlap_recompute_comm]: 2.67001e-06 [overlap_grad_ring_attention]: 4.43001e-06 [overlap_grad_flash_sp]: 2.265e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 2.20002e-06 [split_layernorm_comm]: 1.84e-06 [handle_group_info]: 1.07998e-06 [symbol_engine_optimizer]: 0.0001777, [1] [Cycle 1]: 0.00017288, [6] [build]: 4.05e-06 [elim_shapecalc]: 1.685e-05 [elim_not_effective]: 1.583e-05 [opt_reshape]: 9.77001e-06 [fold_const_symbol]: 9.228e-05 [renormalize]: 2.89991e-07 [detach_backward]: 2.46e-06 [pipeline_parallel_scheduler]: 1.82001e-06 [auto_monad_reorder]: 2.36e-05 [get_jit_bprop_graph]: 2.61999e-06 [rewriter_after_jit_bprop_graph]: 7.55e-06 [opt_after_jit_grad]: 0.00055025 [validate]: 7.064e-05 Sums bootstrap : 0.000713s : 0.03% type_inference : 1.862588s : 72.68% event_method : 0.000061s : 0.00% auto_monad : 0.000173s : 0.01% graph_reusing : 0.000010s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000059s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000072s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.00% optimize.rewriter_before_opt_a : 0.000436s : 0.02% optimize.opt_a.expand_dump_flag : 0.000012s : 0.00% optimize.opt_a.switch_simplify : 0.000242s : 0.01% optimize.opt_a.loop_unroll : 0.000140s : 0.01% optimize.opt_a.a_1 : 0.003495s : 0.14% optimize.opt_a.with_stream_mark : 0.000085s : 0.00% optimize.opt_a.recompute_prepare : 0.000050s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000021s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000017s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000016s : 0.00% optimize.opt_a.parameter_eliminate : 0.074541s : 2.91% optimize.opt_a.a_2 : 0.000610s : 0.02% optimize.opt_a.accelerated_algorithm : 0.278414s : 10.86% optimize.opt_a.shard : 0.000016s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000036s : 0.00% optimize.opt_a.shard_inline : 0.000054s : 0.00% optimize.opt_a.merge_send_recv : 0.000076s : 0.00% optimize.opt_a.auto_parallel : 0.000056s : 0.00% optimize.opt_a.parallel : 0.000051s : 0.00% optimize.opt_a.flash_sp : 0.000029s : 0.00% optimize.opt_a.merge_comm : 0.000020s : 0.00% optimize.opt_a.allreduce_fusion : 0.000018s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000068s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000056s : 0.00% optimize.opt_a.virtual_dataset : 0.000042s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000038s : 0.00% optimize.opt_a.virtual_output : 0.000038s : 0.00% optimize.opt_a.merge_forward : 0.000022s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000011s : 0.00% optimize.opt_a.offload_activation : 0.000047s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000108s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000006s : 0.00% optimize.opt_a.before_grad : 0.000063s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000019s : 0.00% optimize.opt_a.meta_fg_expand : 0.003504s : 0.14% optimize.opt_a.flash_sp_send_recv_attached : 0.000012s : 0.00% optimize.opt_a.receive_attached : 0.000009s : 0.00% optimize.opt_a.after_resolve : 0.000138s : 0.01% optimize.opt_a.a_after_grad : 0.000154s : 0.01% optimize.opt_a.renormalize : 0.283796s : 11.07% optimize.opt_a.add_forward_monad_depend : 0.000033s : 0.00% optimize.opt_a.auto_monad_grad : 0.000015s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000121s : 0.00% optimize.opt_a.cse : 0.000673s : 0.03% optimize.opt_a.a_3 : 0.000625s : 0.02% optimize.py_interpret_to_execute_after_opt_a : 0.000016s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000034s : 0.00% optimize.convert_after_rewriter : 0.000003s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.048800s : 1.90% optimize.opt_b.b_1 : 0.000199s : 0.01% optimize.opt_b.b_2 : 0.000013s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000061s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000040s : 0.00% optimize.loop_unroll : 0.000548s : 0.02% optimize.opt_after_cconv.c_1 : 0.000039s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000037s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000033s : 0.00% optimize.tuple_transform.d_1 : 0.000066s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000056s : 0.00% optimize.cse_after_recomputation.cse : 0.000020s : 0.00% optimize.environ_conv : 0.000014s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000023s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000092s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.00% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000550s : 0.02% validate : 0.000071s : 0.00% Time group info: ------[substitution.] 0.001008 170 0.28% : 0.000003s : 2: substitution.elim_not_effective 0.97% : 0.000010s : 12: substitution.float_depend_g_call 0.45% : 0.000005s : 2: substitution.float_tuple_getitem_switch 7.59% : 0.000076s : 2: substitution.fold_const_symbol 0.74% : 0.000007s : 5: substitution.graph_param_transform 0.30% : 0.000003s : 2: substitution.incorporate_call 0.21% : 0.000002s : 2: substitution.incorporate_call_switch 63.47% : 0.000640s : 19: substitution.inline 2.48% : 0.000025s : 2: substitution.inline_without_move 1.36% : 0.000014s : 12: substitution.j_node_and_user_rematch 1.17% : 0.000012s : 7: substitution.minmaximum_grad 1.08% : 0.000011s : 12: substitution.partial_eliminate 1.48% : 0.000015s : 12: substitution.remove_not_recompute_node 3.01% : 0.000030s : 9: substitution.replace_applicator 1.19% : 0.000012s : 20: substitution.replace_old_param 0.32% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.24% : 0.000012s : 3: substitution.switch_simplify 2.36% : 0.000024s : 7: substitution.tuple_list_convert_item_index_to_positive 1.00% : 0.000010s : 7: substitution.tuple_list_get_item_const_eliminator 1.57% : 0.000016s : 7: substitution.tuple_list_get_item_depend_reorder 6.28% : 0.000063s : 18: substitution.tuple_list_get_item_eliminator 1.47% : 0.000015s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 1.862432 2 99.67% : 1.856221s : 1: type_inference.infer 0.33% : 0.006211s : 1: type_inference.specialize ------[replace.] 0.000343 31 55.56% : 0.000191s : 19: replace.inline 13.25% : 0.000045s : 3: replace.switch_simplify 31.19% : 0.000107s : 9: replace.tuple_list_get_item_eliminator ------[match.] 0.000670 31 93.82% : 0.000629s : 19: match.inline 1.60% : 0.000011s : 3: match.switch_simplify 4.58% : 0.000031s : 9: match.tuple_list_get_item_eliminator ------[predicate.] 0.000791 5164 1.05% : 0.000008s : 63: predicate.accumulaten_eliminater 0.22% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.48% : 0.000004s : 26: predicate.addn_check_dump 1.07% : 0.000008s : 63: predicate.addn_zero_filter 0.92% : 0.000007s : 63: predicate.adjust_all_reduce_mul_add 2.47% : 0.000020s : 89: predicate.arithmetic_simplify 1.03% : 0.000008s : 63: predicate.cast_eliminate 1.18% : 0.000009s : 68: predicate.check_bprop_eliminate 0.48% : 0.000004s : 26: predicate.compare_switch_simplify 0.07% : 0.000001s : 5: predicate.const_output_eliminate 0.60% : 0.000005s : 26: predicate.depend_value_elim 1.04% : 0.000008s : 63: predicate.dict_get_item_const_eliminator 1.25% : 0.000010s : 63: predicate.dict_get_item_eliminator 1.04% : 0.000008s : 63: predicate.dict_set_item_eliminator 0.30% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.07% : 0.000001s : 5: predicate.elim_not_effective 0.12% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000008s : 68: predicate.environ_add_const_eliminate 1.07% : 0.000008s : 68: predicate.environ_get_add_eliminate 1.02% : 0.000008s : 68: predicate.environ_get_depend_swap 1.49% : 0.000012s : 94: predicate.environ_get_eliminate 0.99% : 0.000008s : 68: predicate.environ_get_set_eliminate 1.52% : 0.000012s : 91: predicate.exchange_switch_depend_value 2.17% : 0.000017s : 91: predicate.float_depend_g_call 0.51% : 0.000004s : 26: predicate.float_environ_get_switch 0.68% : 0.000005s : 31: predicate.float_tuple_getitem_switch 0.07% : 0.000001s : 5: predicate.fold_const_symbol 0.62% : 0.000005s : 26: predicate.get_grad_eliminate 0.07% : 0.000001s : 5: predicate.graph_param_transform 0.45% : 0.000004s : 26: predicate.incorporate_call 0.37% : 0.000003s : 26: predicate.incorporate_call_switch 5.13% : 0.000041s : 216: predicate.inline 1.43% : 0.000011s : 67: predicate.inline_without_move 0.28% : 0.000002s : 26: predicate.j_node_and_user_rematch 2.93% : 0.000023s : 26: predicate.less_batch_normalization 1.52% : 0.000012s : 82: predicate.list_to_tuple_eliminator_ 2.36% : 0.000019s : 145: predicate.load_eliminater 0.40% : 0.000003s : 5: predicate.loop_unroll_after_grad 2.48% : 0.000020s : 141: predicate.loop_unroll_before_grad 1.32% : 0.000010s : 73: predicate.make_slice_get_slice_eliminator 0.59% : 0.000005s : 26: predicate.merge_addn 1.16% : 0.000009s : 68: predicate.micro_step_allgather_replace 1.14% : 0.000009s : 68: predicate.mini_step_allgather_replace 0.92% : 0.000007s : 63: predicate.minmaximum_grad 0.73% : 0.000006s : 5: predicate.mutable_eliminate 0.13% : 0.000001s : 5: predicate.opt_reshape 0.14% : 0.000001s : 5: predicate.parallel_virtual_node 2.08% : 0.000016s : 91: predicate.partial_defer_inline 1.33% : 0.000011s : 77: predicate.partial_eliminate 1.10% : 0.000009s : 63: predicate.print_const_string_wrapper 0.55% : 0.000004s : 26: predicate.reduce_all_const_elim 1.28% : 0.000010s : 63: predicate.reduce_eliminate 2.28% : 0.000018s : 145: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000003s : 26: predicate.remove_not_recompute_node 1.96% : 0.000015s : 140: predicate.replace_applicator 0.77% : 0.000006s : 67: predicate.replace_old_param 0.18% : 0.000001s : 5: predicate.reset_defer_inline 1.04% : 0.000008s : 63: predicate.reshape_eliminate 1.38% : 0.000011s : 68: predicate.row_tensor_add_zeros_like 0.19% : 0.000002s : 5: predicate.row_tensor_eliminate 1.50% : 0.000012s : 68: predicate.same_eliminate 0.38% : 0.000003s : 26: predicate.set_cell_output_no_recompute 0.66% : 0.000005s : 26: predicate.shard_identity_eliminate 0.29% : 0.000002s : 10: predicate.special_op_eliminate 0.85% : 0.000007s : 26: predicate.specialize_transform 1.39% : 0.000011s : 68: predicate.split_environ_get_set_with_tuple_value 1.50% : 0.000012s : 67: predicate.stack_unstack_eliminate 0.09% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.67% : 0.000013s : 91: predicate.switch_defer_inline 2.83% : 0.000022s : 159: predicate.switch_layer_defer_inline 6.08% : 0.000048s : 269: predicate.switch_simplify 1.00% : 0.000008s : 63: predicate.tile_eliminate 0.95% : 0.000008s : 63: predicate.transpose_eliminate 1.38% : 0.000011s : 73: predicate.tuple_list_convert_item_index_to_positive 1.38% : 0.000011s : 73: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000011s : 73: predicate.tuple_list_get_item_depend_reorder 2.56% : 0.000020s : 108: predicate.tuple_list_get_item_eliminator 1.50% : 0.000012s : 73: predicate.tuple_list_get_set_item_eliminator 2.06% : 0.000016s : 99: predicate.tuple_list_set_item_eliminator 1.44% : 0.000011s : 82: predicate.tuple_to_list_eliminator_ 2.17% : 0.000017s : 145: predicate.updatestate_pure_node_eliminater 2.66% : 0.000021s : 171: predicate.updatestate_useless_node_eliminater 0.13% : 0.000001s : 5: predicate.value_based_eliminate 0.65% : 0.000005s : 26: predicate.virtual_dataset_eliminate 0.66% : 0.000005s : 26: predicate.virtual_output_eliminate 0.06% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.16% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.007774 70 75.28% : 0.005852s : 47: func_graph_cloner_run.FuncGraphClonerGraph 24.72% : 0.001922s : 23: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.840964 233 0.00% : 0.000004s : 1: ForceFp32Comm 0.11% : 0.004169s : 1: add_attr 0.11% : 0.004157s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.00% : 0.000060s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000180s : 1: auto_monad 0.00% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.02% : 0.000747s : 1: bootstrap 0.00% : 0.000044s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000020s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000037s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000007s : 1: detach_backward 0.00% : 0.000018s : 1: environ_conv 0.00% : 0.000070s : 1: event_method 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000007s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.01% : 0.000559s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 1.27% : 0.048820s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000045s : 1: opt.transform.mutable_eliminate 7.39% : 0.284018s : 117: opt.transform.opt_a 0.00% : 0.000037s : 1: opt.transform.opt_after_cconv 0.00% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000170s : 28: opt.transform.opt_b 0.00% : 0.000073s : 2: opt.transform.opt_trans_graph 0.00% : 0.000131s : 4: opt.transform.symbol_engine_opt 16.89% : 0.648803s : 1: opt_a 0.00% : 0.000149s : 1: opt_after_cconv 0.01% : 0.000562s : 1: opt_after_jit_grad 0.01% : 0.000353s : 1: opt_b 18.23% : 0.700027s : 1: optimize 0.00% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000008s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.00% : 0.000078s : 1: pre_auto_parallel 0.00% : 0.000011s : 1: py_interpret_to_execute 0.00% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000037s : 1: remove_dup_value 7.26% : 0.278979s : 2: renormalize.infer 0.12% : 0.004780s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000039s : 1: rewriter_after_opt_a 0.01% : 0.000444s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000181s : 1: symbol_engine_optimizer 0.00% : 0.000109s : 1: tuple_transform 48.49% : 1.862625s : 1: type_inference TotalTime = 0.385957, [21] [bootstrap]: 0.00485519 [type_inference]: 0.237917 [event_method]: 9.519e-05 [auto_monad]: 0.00022221 [graph_reusing]: 7.8e-06 [inline]: 3.75e-06 [add_attr]: 0.131609, [1] [add_attr_with_inline]: 0.131539, [1] [Cycle 1]: 0.00011739, [2] [tag_attr]: 4.817e-05 [meta_addattr_fg_expand]: 1.595e-05 [parallel-infer-symbol]: 4.83001e-06 [pre_auto_parallel]: 5.438e-05 [insert-virtual-dataset]: 3.11999e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.36e-06 [pipeline_split]: 1.94999e-06 [optimize]: 0.00979566, [53] [py_interpret_to_execute]: 1.575e-05 [rewriter_before_opt_a]: 0.00034519 [opt_a]: 0.00581734, [2] [Cycle 1]: 0.00473241, [45] [expand_dump_flag]: 5.64e-06 [switch_simplify]: 6.711e-05 [loop_unroll]: 4.693e-05 [a_1]: 0.00122507 [with_stream_mark]: 0.00011508 [recompute_prepare]: 2.168e-05 [updatestate_depend_eliminate]: 6.83998e-06 [updatestate_assign_eliminate]: 4.08001e-06 [updatestate_loads_eliminate]: 3.75e-06 [parameter_eliminate]: 3.66999e-06 [a_2]: 0.00012022 [accelerated_algorithm]: 1.346e-05 [shard]: 2.20002e-06 [meta_shard_fg_expand]: 7.46001e-06 [shard_inline]: 9.10999e-06 [merge_send_recv]: 1.201e-05 [auto_parallel]: 1.194e-05 [parallel]: 9.984e-05 [flash_sp]: 3.961e-05 [merge_comm]: 8.57e-06 [allreduce_fusion]: 3.93001e-06 [matmul_add_comm_reduction]: 1.244e-05 [allreduce_slice_to_reducescatter]: 1.48002e-06 [virtual_shard_identity]: 2.555e-05 [virtual_dataset]: 1.192e-05 [get_grad_eliminate_]: 9.94001e-06 [virtual_output]: 1.02e-05 [merge_forward]: 6.27001e-06 [cell_reuse_recompute_pass]: 3.8e-06 [offload_activation]: 1.261e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.544e-05 [merge_recompute_call_nodes]: 1.87001e-06 [before_grad]: 1.61e-05 [set_forward_comm_id_for_comm_node_pass]: 5.42999e-06 [meta_fg_expand]: 6.23e-06 [flash_sp_send_recv_attached]: 3.28e-06 [receive_attached]: 2.69999e-06 [after_resolve]: 2.2e-05 [a_after_grad]: 1.478e-05 [renormalize]: 0.00209974 [add_forward_monad_depend]: 1.338e-05 [auto_monad_grad]: 2.75002e-06 [auto_monad_eliminator]: 2.967e-05 [cse]: 5.05e-05 [a_3]: 8.134e-05 [Cycle 2]: 0.00106473, [45] [expand_dump_flag]: 3.09001e-06 [switch_simplify]: 1.3e-05 [loop_unroll]: 8.80999e-06 [a_1]: 0.00018803 [with_stream_mark]: 2.39e-05 [recompute_prepare]: 1.02e-05 [updatestate_depend_eliminate]: 4.50999e-06 [updatestate_assign_eliminate]: 3.52002e-06 [updatestate_loads_eliminate]: 3.42002e-06 [parameter_eliminate]: 2.66e-06 [a_2]: 0.00015109 [accelerated_algorithm]: 1.144e-05 [shard]: 3.66001e-06 [meta_shard_fg_expand]: 3.36999e-06 [shard_inline]: 7.96001e-06 [merge_send_recv]: 1.191e-05 [auto_parallel]: 1.12e-05 [parallel]: 1.021e-05 [flash_sp]: 4.80999e-06 [merge_comm]: 4.29002e-06 [allreduce_fusion]: 4.03999e-06 [matmul_add_comm_reduction]: 1.22e-05 [allreduce_slice_to_reducescatter]: 9.39996e-07 [virtual_shard_identity]: 1.27e-05 [virtual_dataset]: 9.29e-06 [get_grad_eliminate_]: 1.069e-05 [virtual_output]: 8.68001e-06 [merge_forward]: 5.71e-06 [cell_reuse_recompute_pass]: 3.95e-06 [offload_activation]: 1.269e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.663e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 1.538e-05 [set_forward_comm_id_for_comm_node_pass]: 5.25999e-06 [meta_fg_expand]: 3.25998e-06 [flash_sp_send_recv_attached]: 2.19999e-06 [receive_attached]: 2.76e-06 [after_resolve]: 2.109e-05 [a_after_grad]: 1.373e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.5e-06 [auto_monad_grad]: 2.32001e-06 [auto_monad_eliminator]: 1.299e-05 [cse]: 3.411e-05 [a_3]: 5.507e-05 [py_interpret_to_execute_after_opt_a]: 1.444e-05 [slice_cell_reuse_recomputed_activation]: 2.56998e-06 [rewriter_after_opt_a]: 2.938e-05 [convert_after_rewriter]: 1.47999e-06 [order_py_execute_after_rewriter]: 1.77001e-06 [mutable_eliminate]: 0.0010487 [opt_b]: 0.00029551, [1] [Cycle 1]: 0.00028483, [7] [b_1]: 0.00017048 [b_2]: 1.1e-05 [updatestate_depend_eliminate]: 9.33002e-06 [updatestate_assign_eliminate]: 3.66999e-06 [updatestate_loads_eliminate]: 2.95998e-06 [renormalize]: 1.64e-06 [cse]: 4.298e-05 [optimize_parallel_all_gather_comm]: 0.00013175 [overlap_param_gather]: 3.98999e-06 [cconv]: 4.098e-05 [loop_unroll]: 0.00087459 [opt_after_cconv]: 0.00019699, [1] [Cycle 1]: 0.00018723, [7] [c_1]: 4.477e-05 [parameter_eliminate]: 2.417e-05 [updatestate_depend_eliminate]: 1.418e-05 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 3.11001e-06 [cse]: 5.209e-05 [renormalize]: 8.70001e-07 [remove_dup_value]: 5.659e-05 [tuple_transform]: 0.00010976, [1] [Cycle 1]: 0.00010308, [4] [d_1]: 7.167e-05 [none_parameter_eliminate]: 2.11e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 9.52999e-06 [partial_unused_args_eliminate]: 2.21998e-06 [add_recomputation]: 0.00014952 [cse_after_recomputation]: 3.609e-05, [1] [Cycle 1]: 2.953e-05, [1] [cse]: 2.163e-05 [environ_conv]: 8.57998e-06 [swap_dp_allreduce_reducescatter]: 6.84999e-06 [bias_add_comm_swap]: 3.95e-06 [label_micro_interleaved_index]: 1.043e-05 [label_fine_grained_interleaved_index]: 2.81999e-06 [merge_cast_opt]: 1.34998e-06 [slice_recompute_activation]: 2.26998e-06 [micro_interleaved_order_control]: 2.65002e-06 [assign_add_opt]: 1.47001e-06 [ForceFp32Comm]: 1.06002e-06 [remove_cast_before_assign_add]: 1.43002e-06 [full_micro_interleaved_order_control]: 2.27999e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.21997e-06 [overlap_opt_shard_in_pipeline]: 1.91e-06 [overlap_opt_shard_grad_in_pipeline]: 1.82999e-06 [control_data_broadcast_order]: 1.658e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 4.59998e-06 [overlap_recompute_and_grad_model_parallel]: 6.33998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.44999e-06 [overlap_grad_ring_attention]: 5.00001e-06 [overlap_grad_flash_sp]: 2.655e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.31e-06 [split_layernorm_comm]: 1.69998e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 0.00011312, [1] [Cycle 1]: 0.00010632, [6] [build]: 5.19998e-06 [elim_shapecalc]: 2.018e-05 [elim_not_effective]: 1.905e-05 [opt_reshape]: 1.07e-05 [fold_const_symbol]: 1.316e-05 [renormalize]: 4.09986e-07 [detach_backward]: 2.20002e-06 [pipeline_parallel_scheduler]: 1.69998e-06 [auto_monad_reorder]: 2.411e-05 [get_jit_bprop_graph]: 3.42002e-06 [rewriter_after_jit_bprop_graph]: 7.99002e-06 [opt_after_jit_grad]: 0.00092623 [validate]: 0.00014014 Sums bootstrap : 0.004855s : 1.92% type_inference : 0.237917s : 94.11% event_method : 0.000095s : 0.04% auto_monad : 0.000222s : 0.09% graph_reusing : 0.000008s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000048s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000016s : 0.01% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000054s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000016s : 0.01% optimize.rewriter_before_opt_a : 0.000345s : 0.14% optimize.opt_a.expand_dump_flag : 0.000009s : 0.00% optimize.opt_a.switch_simplify : 0.000080s : 0.03% optimize.opt_a.loop_unroll : 0.000056s : 0.02% optimize.opt_a.a_1 : 0.001413s : 0.56% optimize.opt_a.with_stream_mark : 0.000139s : 0.05% optimize.opt_a.recompute_prepare : 0.000032s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000006s : 0.00% optimize.opt_a.a_2 : 0.000271s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000025s : 0.01% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000011s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.01% optimize.opt_a.merge_send_recv : 0.000024s : 0.01% optimize.opt_a.auto_parallel : 0.000023s : 0.01% optimize.opt_a.parallel : 0.000110s : 0.04% optimize.opt_a.flash_sp : 0.000044s : 0.02% optimize.opt_a.merge_comm : 0.000013s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000038s : 0.02% optimize.opt_a.virtual_dataset : 0.000021s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000021s : 0.01% optimize.opt_a.virtual_output : 0.000019s : 0.01% optimize.opt_a.merge_forward : 0.000012s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000008s : 0.00% optimize.opt_a.offload_activation : 0.000025s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000052s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000031s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.00% optimize.opt_a.meta_fg_expand : 0.000009s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000043s : 0.02% optimize.opt_a.a_after_grad : 0.000029s : 0.01% optimize.opt_a.renormalize : 0.002100s : 0.83% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000043s : 0.02% optimize.opt_a.cse : 0.000085s : 0.03% optimize.opt_a.a_3 : 0.000136s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000029s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.001049s : 0.41% optimize.opt_b.b_1 : 0.000170s : 0.07% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000002s : 0.00% optimize.opt_b.cse : 0.000043s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000132s : 0.05% optimize.overlap_param_gather : 0.000004s : 0.00% optimize.cconv : 0.000041s : 0.02% optimize.loop_unroll : 0.000875s : 0.35% optimize.opt_after_cconv.c_1 : 0.000045s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000024s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000052s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000057s : 0.02% optimize.tuple_transform.d_1 : 0.000072s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000150s : 0.06% optimize.cse_after_recomputation.cse : 0.000022s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000010s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000027s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000020s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000926s : 0.37% validate : 0.000140s : 0.06% Time group info: ------[substitution.] 0.000470 35 0.61% : 0.000003s : 2: substitution.elim_not_effective 0.39% : 0.000002s : 2: substitution.fold_const_symbol 1.72% : 0.000008s : 5: substitution.graph_param_transform 85.18% : 0.000401s : 6: substitution.inline 1.52% : 0.000007s : 4: substitution.j_node_and_user_rematch 2.25% : 0.000011s : 4: substitution.remove_not_recompute_node 2.36% : 0.000011s : 6: substitution.replace_old_param 5.96% : 0.000028s : 6: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.237706 2 98.70% : 0.234605s : 1: type_inference.infer 1.30% : 0.003101s : 1: type_inference.specialize ------[replace.] 0.000127 12 60.78% : 0.000077s : 6: replace.inline 39.22% : 0.000050s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000420 12 94.25% : 0.000396s : 6: match.inline 5.75% : 0.000024s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000324 1724 0.99% : 0.000003s : 18: predicate.accumulaten_eliminater 0.92% : 0.000003s : 5: predicate.ad_related_special_op_eliminate 0.59% : 0.000002s : 10: predicate.addn_check_dump 1.06% : 0.000003s : 18: predicate.addn_zero_filter 0.82% : 0.000003s : 18: predicate.adjust_all_reduce_mul_add 2.64% : 0.000009s : 28: predicate.arithmetic_simplify 1.17% : 0.000004s : 18: predicate.cast_eliminate 0.52% : 0.000002s : 10: predicate.check_bprop_eliminate 0.58% : 0.000002s : 10: predicate.compare_switch_simplify 0.14% : 0.000000s : 5: predicate.const_output_eliminate 0.92% : 0.000003s : 10: predicate.depend_value_elim 1.01% : 0.000003s : 18: predicate.dict_get_item_const_eliminator 1.15% : 0.000004s : 18: predicate.dict_get_item_eliminator 1.20% : 0.000004s : 18: predicate.dict_set_item_eliminator 1.18% : 0.000004s : 10: predicate.dumpgradient_eliminate 0.41% : 0.000001s : 5: predicate.elim_not_effective 0.37% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.42% : 0.000005s : 23: predicate.environ_add_const_eliminate 0.99% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.02% : 0.000003s : 23: predicate.environ_get_depend_swap 1.51% : 0.000005s : 33: predicate.environ_get_eliminate 1.00% : 0.000003s : 23: predicate.environ_get_set_eliminate 1.41% : 0.000005s : 30: predicate.exchange_switch_depend_value 1.92% : 0.000006s : 30: predicate.float_depend_g_call 0.52% : 0.000002s : 10: predicate.float_environ_get_switch 0.68% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 5: predicate.fold_const_symbol 0.77% : 0.000002s : 10: predicate.get_grad_eliminate 0.20% : 0.000001s : 5: predicate.graph_param_transform 0.37% : 0.000001s : 10: predicate.incorporate_call 0.33% : 0.000001s : 10: predicate.incorporate_call_switch 4.79% : 0.000016s : 78: predicate.inline 0.64% : 0.000002s : 10: predicate.inline_without_move 0.22% : 0.000001s : 10: predicate.j_node_and_user_rematch 1.18% : 0.000004s : 10: predicate.less_batch_normalization 1.91% : 0.000006s : 34: predicate.list_to_tuple_eliminator_ 2.33% : 0.000008s : 52: predicate.load_eliminater 1.40% : 0.000005s : 5: predicate.loop_unroll_after_grad 2.46% : 0.000008s : 54: predicate.loop_unroll_before_grad 1.83% : 0.000006s : 28: predicate.make_slice_get_slice_eliminator 0.75% : 0.000002s : 10: predicate.merge_addn 0.49% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.43% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000003s : 18: predicate.minmaximum_grad 1.27% : 0.000004s : 5: predicate.mutable_eliminate 0.35% : 0.000001s : 5: predicate.opt_reshape 0.35% : 0.000001s : 5: predicate.parallel_virtual_node 2.76% : 0.000009s : 30: predicate.partial_defer_inline 1.22% : 0.000004s : 29: predicate.partial_eliminate 0.80% : 0.000003s : 18: predicate.print_const_string_wrapper 0.63% : 0.000002s : 10: predicate.reduce_all_const_elim 1.48% : 0.000005s : 18: predicate.reduce_eliminate 2.22% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.28% : 0.000004s : 34: predicate.replace_applicator 0.48% : 0.000002s : 10: predicate.replace_old_param 0.35% : 0.000001s : 5: predicate.reset_defer_inline 1.04% : 0.000003s : 18: predicate.reshape_eliminate 0.59% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.30% : 0.000001s : 5: predicate.row_tensor_eliminate 0.72% : 0.000002s : 10: predicate.same_eliminate 0.31% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.99% : 0.000003s : 10: predicate.shard_identity_eliminate 0.54% : 0.000002s : 10: predicate.special_op_eliminate 0.44% : 0.000001s : 10: predicate.specialize_transform 0.78% : 0.000003s : 10: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.24% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.40% : 0.000005s : 30: predicate.switch_defer_inline 2.06% : 0.000007s : 40: predicate.switch_layer_defer_inline 5.44% : 0.000018s : 99: predicate.switch_simplify 0.85% : 0.000003s : 18: predicate.tile_eliminate 1.00% : 0.000003s : 18: predicate.transpose_eliminate 1.74% : 0.000006s : 28: predicate.tuple_list_convert_item_index_to_positive 1.84% : 0.000006s : 28: predicate.tuple_list_get_item_const_eliminator 1.67% : 0.000005s : 28: predicate.tuple_list_get_item_depend_reorder 3.37% : 0.000011s : 44: predicate.tuple_list_get_item_eliminator 1.97% : 0.000006s : 28: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000008s : 38: predicate.tuple_list_set_item_eliminator 1.64% : 0.000005s : 34: predicate.tuple_to_list_eliminator_ 2.24% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 2.69% : 0.000009s : 62: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 5: predicate.value_based_eliminate 0.67% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000002s : 10: predicate.virtual_output_eliminate 0.16% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002229 15 46.23% : 0.001031s : 7: func_graph_cloner_run.FuncGraphClonerGraph 53.77% : 0.001199s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.531588 192 0.00% : 0.000004s : 1: ForceFp32Comm 24.76% : 0.131619s : 1: add_attr 24.75% : 0.131545s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000157s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.04% : 0.000234s : 1: auto_monad 0.01% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.93% : 0.004917s : 1: bootstrap 0.01% : 0.000045s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000040s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.02% : 0.000107s : 1: event_method 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000007s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000014s : 1: label_micro_interleaved_index 0.17% : 0.000896s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.20% : 0.001067s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000030s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000031s : 1: opt.transform.mutable_eliminate 0.40% : 0.002121s : 78: opt.transform.opt_a 0.01% : 0.000043s : 1: opt.transform.opt_after_cconv 0.01% : 0.000048s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000143s : 28: opt.transform.opt_b 0.01% : 0.000077s : 2: opt.transform.opt_trans_graph 0.01% : 0.000058s : 4: opt.transform.symbol_engine_opt 1.10% : 0.005822s : 1: opt_a 0.04% : 0.000202s : 1: opt_after_cconv 0.18% : 0.000948s : 1: opt_after_jit_grad 0.06% : 0.000300s : 1: opt_b 1.84% : 0.009803s : 1: optimize 0.03% : 0.000140s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000059s : 1: pre_auto_parallel 0.00% : 0.000021s : 1: py_interpret_to_execute 0.00% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000063s : 1: remove_dup_value 0.17% : 0.000887s : 1: renormalize.infer 0.22% : 0.001190s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000014s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000034s : 1: rewriter_after_opt_a 0.07% : 0.000358s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000116s : 1: symbol_engine_optimizer 0.02% : 0.000113s : 1: tuple_transform 44.76% : 0.237959s : 1: type_inference TotalTime = 3.36112, [21] [bootstrap]: 0.00420287 [type_inference]: 3.25372 [event_method]: 0.00101162 [auto_monad]: 0.00018325 [graph_reusing]: 8.15999e-06 [inline]: 4.99998e-06 [add_attr]: 0.00727893, [1] [add_attr_with_inline]: 0.00726302, [1] [Cycle 1]: 0.00010863, [2] [tag_attr]: 4.966e-05 [meta_addattr_fg_expand]: 9.56e-06 [parallel-infer-symbol]: 3.91999e-06 [pre_auto_parallel]: 6.637e-05 [insert-virtual-dataset]: 3.11999e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.37001e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.0933367, [53] [py_interpret_to_execute]: 9.05001e-06 [rewriter_before_opt_a]: 0.0807802 [opt_a]: 0.00925394, [2] [Cycle 1]: 0.00818726, [45] [expand_dump_flag]: 7.01001e-06 [switch_simplify]: 9.271e-05 [loop_unroll]: 5.814e-05 [a_1]: 0.00142743 [with_stream_mark]: 3.149e-05 [recompute_prepare]: 1.692e-05 [updatestate_depend_eliminate]: 5.99e-06 [updatestate_assign_eliminate]: 4.28999e-06 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 2.60997e-06 [a_2]: 0.00013925 [accelerated_algorithm]: 1.163e-05 [shard]: 3.19001e-06 [meta_shard_fg_expand]: 3.98001e-06 [shard_inline]: 1.069e-05 [merge_send_recv]: 1.297e-05 [auto_parallel]: 1.266e-05 [parallel]: 2.343e-05 [flash_sp]: 1.314e-05 [merge_comm]: 4.17e-06 [allreduce_fusion]: 3.76999e-06 [matmul_add_comm_reduction]: 1.145e-05 [allreduce_slice_to_reducescatter]: 1.25999e-06 [virtual_shard_identity]: 1.454e-05 [virtual_dataset]: 1.18e-05 [get_grad_eliminate_]: 1.126e-05 [virtual_output]: 1.161e-05 [merge_forward]: 5.97999e-06 [cell_reuse_recompute_pass]: 2.16e-06 [offload_activation]: 1.217e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.01e-05 [merge_recompute_call_nodes]: 1.82001e-06 [before_grad]: 1.896e-05 [set_forward_comm_id_for_comm_node_pass]: 4.59002e-06 [meta_fg_expand]: 5.24e-06 [flash_sp_send_recv_attached]: 3.24001e-06 [receive_attached]: 2.63e-06 [after_resolve]: 2.082e-05 [a_after_grad]: 1.813e-05 [renormalize]: 0.00559258 [add_forward_monad_depend]: 1.471e-05 [auto_monad_grad]: 3.00998e-06 [auto_monad_eliminator]: 3.055e-05 [cse]: 5.202e-05 [a_3]: 0.00010015 [Cycle 2]: 0.00104187, [45] [expand_dump_flag]: 2.63998e-06 [switch_simplify]: 1.26e-05 [loop_unroll]: 1.016e-05 [a_1]: 0.0002728 [with_stream_mark]: 2.294e-05 [recompute_prepare]: 1.257e-05 [updatestate_depend_eliminate]: 4.80999e-06 [updatestate_assign_eliminate]: 4.46002e-06 [updatestate_loads_eliminate]: 4.23001e-06 [parameter_eliminate]: 2.88e-06 [a_2]: 0.00012566 [accelerated_algorithm]: 1.164e-05 [shard]: 2.98998e-06 [meta_shard_fg_expand]: 2.52001e-06 [shard_inline]: 9.66e-06 [merge_send_recv]: 1.191e-05 [auto_parallel]: 1.13e-05 [parallel]: 1.093e-05 [flash_sp]: 4.98001e-06 [merge_comm]: 4.49002e-06 [allreduce_fusion]: 4.15999e-06 [matmul_add_comm_reduction]: 1.112e-05 [allreduce_slice_to_reducescatter]: 1.65001e-06 [virtual_shard_identity]: 1.311e-05 [virtual_dataset]: 1.067e-05 [get_grad_eliminate_]: 1.027e-05 [virtual_output]: 1.076e-05 [merge_forward]: 5.76e-06 [cell_reuse_recompute_pass]: 3.87998e-06 [offload_activation]: 1.231e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.23e-05 [merge_recompute_call_nodes]: 2.11e-06 [before_grad]: 1.525e-05 [set_forward_comm_id_for_comm_node_pass]: 4.95999e-06 [meta_fg_expand]: 5.62001e-06 [flash_sp_send_recv_attached]: 2.02999e-06 [receive_attached]: 2.93e-06 [after_resolve]: 1.805e-05 [a_after_grad]: 1.626e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.88002e-06 [auto_monad_grad]: 1.86e-06 [auto_monad_eliminator]: 1.054e-05 [cse]: 2.964e-05 [a_3]: 6.366e-05 [py_interpret_to_execute_after_opt_a]: 1.38e-05 [slice_cell_reuse_recomputed_activation]: 2.34001e-06 [rewriter_after_opt_a]: 2.792e-05 [convert_after_rewriter]: 2.54999e-06 [order_py_execute_after_rewriter]: 1.14e-06 [mutable_eliminate]: 0.00110263 [opt_b]: 0.00034465, [1] [Cycle 1]: 0.00033309, [7] [b_1]: 0.0002182 [b_2]: 1.352e-05 [updatestate_depend_eliminate]: 1.168e-05 [updatestate_assign_eliminate]: 3.50998e-06 [updatestate_loads_eliminate]: 3.02002e-06 [renormalize]: 1.07e-06 [cse]: 4.189e-05 [optimize_parallel_all_gather_comm]: 2.529e-05 [overlap_param_gather]: 3.02002e-06 [cconv]: 4.069e-05 [loop_unroll]: 0.00064344 [opt_after_cconv]: 0.00015432, [1] [Cycle 1]: 0.00014705, [7] [c_1]: 5.503e-05 [parameter_eliminate]: 6.59001e-06 [updatestate_depend_eliminate]: 8.62e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 3.33e-06 [cse]: 3.385e-05 [renormalize]: 5.39992e-07 [remove_dup_value]: 5.624e-05 [tuple_transform]: 0.00021409, [1] [Cycle 1]: 0.0002079, [4] [d_1]: 0.00016748 [none_parameter_eliminate]: 2.27001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 1.296e-05 [partial_unused_args_eliminate]: 2.11e-06 [add_recomputation]: 6.936e-05 [cse_after_recomputation]: 3.356e-05, [1] [Cycle 1]: 2.869e-05, [1] [cse]: 2.214e-05 [environ_conv]: 1.252e-05 [swap_dp_allreduce_reducescatter]: 6.78e-06 [bias_add_comm_swap]: 4.2e-06 [label_micro_interleaved_index]: 8.26002e-06 [label_fine_grained_interleaved_index]: 2.73e-06 [merge_cast_opt]: 1.55001e-06 [slice_recompute_activation]: 2.06e-06 [micro_interleaved_order_control]: 2.74001e-06 [assign_add_opt]: 1.31002e-06 [ForceFp32Comm]: 1.35001e-06 [remove_cast_before_assign_add]: 1.44998e-06 [full_micro_interleaved_order_control]: 2.63998e-06 [reorder_send_recv_between_fp_bp]: 3.05998e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 1.41002e-06 [interleave_split_concat_branches]: 1.24e-06 [interleave_parallel_branches]: 1.34e-06 [overlap_opt_shard_in_pipeline]: 1.76e-06 [overlap_opt_shard_grad_in_pipeline]: 1.96998e-06 [control_data_broadcast_order]: 1.872e-05 [grouped_pairwise_exchange_alltoall]: 1.94e-06 [offloading_packed_experts]: 4.91997e-06 [overlap_recompute_and_grad_model_parallel]: 5.99e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.35999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42e-06 [overlap_recompute_comm]: 2.56998e-06 [overlap_grad_ring_attention]: 6.04999e-06 [overlap_grad_flash_sp]: 2.494e-05 [begin_end_overlap_inline]: 9.20001e-07 [split_matmul_comm_elemetwise]: 2.64999e-06 [split_layernorm_comm]: 2.12999e-06 [handle_group_info]: 1.77999e-06 [symbol_engine_optimizer]: 9.752e-05, [1] [Cycle 1]: 9.227e-05, [6] [build]: 4.12e-06 [elim_shapecalc]: 1.511e-05 [elim_not_effective]: 1.632e-05 [opt_reshape]: 1.045e-05 [fold_const_symbol]: 1.459e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.67001e-06 [pipeline_parallel_scheduler]: 1.90001e-06 [auto_monad_reorder]: 2.438e-05 [get_jit_bprop_graph]: 3.4e-06 [rewriter_after_jit_bprop_graph]: 7.60998e-06 [opt_after_jit_grad]: 0.00090982 [validate]: 6.765e-05 Sums bootstrap : 0.004203s : 0.13% type_inference : 3.253715s : 97.05% event_method : 0.001012s : 0.03% auto_monad : 0.000183s : 0.01% graph_reusing : 0.000008s : 0.00% inline : 0.000005s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000050s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000010s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000066s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000009s : 0.00% optimize.rewriter_before_opt_a : 0.080780s : 2.41% optimize.opt_a.expand_dump_flag : 0.000010s : 0.00% optimize.opt_a.switch_simplify : 0.000105s : 0.00% optimize.opt_a.loop_unroll : 0.000068s : 0.00% optimize.opt_a.a_1 : 0.001700s : 0.05% optimize.opt_a.with_stream_mark : 0.000054s : 0.00% optimize.opt_a.recompute_prepare : 0.000029s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000265s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000023s : 0.00% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.00% optimize.opt_a.shard_inline : 0.000020s : 0.00% optimize.opt_a.merge_send_recv : 0.000025s : 0.00% optimize.opt_a.auto_parallel : 0.000024s : 0.00% optimize.opt_a.parallel : 0.000034s : 0.00% optimize.opt_a.flash_sp : 0.000018s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000008s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000028s : 0.00% optimize.opt_a.virtual_dataset : 0.000022s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000022s : 0.00% optimize.opt_a.virtual_output : 0.000022s : 0.00% optimize.opt_a.merge_forward : 0.000012s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000034s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.00% optimize.opt_a.meta_fg_expand : 0.000011s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000039s : 0.00% optimize.opt_a.a_after_grad : 0.000034s : 0.00% optimize.opt_a.renormalize : 0.005593s : 0.17% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000041s : 0.00% optimize.opt_a.cse : 0.000082s : 0.00% optimize.opt_a.a_3 : 0.000164s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000014s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.00% optimize.convert_after_rewriter : 0.000003s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.001103s : 0.03% optimize.opt_b.b_1 : 0.000218s : 0.01% optimize.opt_b.b_2 : 0.000014s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000042s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.00% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000041s : 0.00% optimize.loop_unroll : 0.000643s : 0.02% optimize.opt_after_cconv.c_1 : 0.000055s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000034s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000056s : 0.00% optimize.tuple_transform.d_1 : 0.000167s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000013s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000069s : 0.00% optimize.cse_after_recomputation.cse : 0.000022s : 0.00% optimize.environ_conv : 0.000013s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000006s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000002s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.00% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000910s : 0.03% validate : 0.000068s : 0.00% Time group info: ------[substitution.] 0.000420 41 0.55% : 0.000002s : 2: substitution.elim_not_effective 0.43% : 0.000002s : 2: substitution.fold_const_symbol 3.08% : 0.000013s : 7: substitution.graph_param_transform 83.75% : 0.000352s : 7: substitution.inline 1.37% : 0.000006s : 4: substitution.j_node_and_user_rematch 1.85% : 0.000008s : 4: substitution.remove_not_recompute_node 2.00% : 0.000008s : 6: substitution.replace_old_param 6.97% : 0.000029s : 9: substitution.tuple_list_get_item_eliminator ------[type_inference.] 3.253471 2 95.40% : 3.103690s : 1: type_inference.infer 4.60% : 0.149781s : 1: type_inference.specialize ------[replace.] 0.000147 16 55.71% : 0.000082s : 7: replace.inline 44.29% : 0.000065s : 9: replace.tuple_list_get_item_eliminator ------[match.] 0.000371 16 93.20% : 0.000345s : 7: match.inline 6.80% : 0.000025s : 9: match.tuple_list_get_item_eliminator ------[predicate.] 0.000414 2641 1.06% : 0.000004s : 28: predicate.accumulaten_eliminater 0.64% : 0.000003s : 7: predicate.ad_related_special_op_eliminate 0.50% : 0.000002s : 16: predicate.addn_check_dump 1.03% : 0.000004s : 28: predicate.addn_zero_filter 0.86% : 0.000004s : 28: predicate.adjust_all_reduce_mul_add 2.44% : 0.000010s : 44: predicate.arithmetic_simplify 1.02% : 0.000004s : 28: predicate.cast_eliminate 0.50% : 0.000002s : 16: predicate.check_bprop_eliminate 0.51% : 0.000002s : 16: predicate.compare_switch_simplify 0.18% : 0.000001s : 8: predicate.const_output_eliminate 0.62% : 0.000003s : 16: predicate.depend_value_elim 1.09% : 0.000005s : 28: predicate.dict_get_item_const_eliminator 1.16% : 0.000005s : 28: predicate.dict_get_item_eliminator 0.96% : 0.000004s : 28: predicate.dict_set_item_eliminator 0.90% : 0.000004s : 15: predicate.dumpgradient_eliminate 0.15% : 0.000001s : 7: predicate.elim_not_effective 0.35% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.33% : 0.000006s : 36: predicate.environ_add_const_eliminate 1.22% : 0.000005s : 36: predicate.environ_get_add_eliminate 1.01% : 0.000004s : 36: predicate.environ_get_depend_swap 1.58% : 0.000007s : 52: predicate.environ_get_eliminate 1.18% : 0.000005s : 36: predicate.environ_get_set_eliminate 1.46% : 0.000006s : 44: predicate.exchange_switch_depend_value 2.39% : 0.000010s : 44: predicate.float_depend_g_call 0.42% : 0.000002s : 16: predicate.float_environ_get_switch 0.61% : 0.000003s : 24: predicate.float_tuple_getitem_switch 0.15% : 0.000001s : 7: predicate.fold_const_symbol 0.68% : 0.000003s : 16: predicate.get_grad_eliminate 0.23% : 0.000001s : 7: predicate.graph_param_transform 0.41% : 0.000002s : 16: predicate.incorporate_call 0.38% : 0.000002s : 16: predicate.incorporate_call_switch 5.11% : 0.000021s : 120: predicate.inline 0.60% : 0.000002s : 16: predicate.inline_without_move 0.29% : 0.000001s : 16: predicate.j_node_and_user_rematch 0.87% : 0.000004s : 16: predicate.less_batch_normalization 2.02% : 0.000008s : 52: predicate.list_to_tuple_eliminator_ 2.60% : 0.000011s : 81: predicate.load_eliminater 0.83% : 0.000003s : 8: predicate.loop_unroll_after_grad 2.54% : 0.000011s : 69: predicate.loop_unroll_before_grad 1.66% : 0.000007s : 44: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 16: predicate.merge_addn 0.69% : 0.000003s : 16: predicate.micro_step_allgather_replace 0.68% : 0.000003s : 16: predicate.mini_step_allgather_replace 0.84% : 0.000003s : 28: predicate.minmaximum_grad 1.07% : 0.000004s : 8: predicate.mutable_eliminate 0.30% : 0.000001s : 7: predicate.opt_reshape 0.36% : 0.000001s : 8: predicate.parallel_virtual_node 2.14% : 0.000009s : 44: predicate.partial_defer_inline 1.44% : 0.000006s : 45: predicate.partial_eliminate 1.10% : 0.000005s : 28: predicate.print_const_string_wrapper 0.58% : 0.000002s : 16: predicate.reduce_all_const_elim 1.43% : 0.000006s : 28: predicate.reduce_eliminate 2.41% : 0.000010s : 81: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000002s : 16: predicate.remove_not_recompute_node 1.45% : 0.000006s : 53: predicate.replace_applicator 0.45% : 0.000002s : 16: predicate.replace_old_param 0.37% : 0.000002s : 8: predicate.reset_defer_inline 1.21% : 0.000005s : 28: predicate.reshape_eliminate 0.67% : 0.000003s : 16: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 8: predicate.row_tensor_eliminate 0.69% : 0.000003s : 16: predicate.same_eliminate 0.36% : 0.000002s : 16: predicate.set_cell_output_no_recompute 0.81% : 0.000003s : 16: predicate.shard_identity_eliminate 0.56% : 0.000002s : 15: predicate.special_op_eliminate 0.59% : 0.000002s : 16: predicate.specialize_transform 0.85% : 0.000004s : 16: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000004s : 16: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 8: predicate.switch_call_monad_eliminater 1.54% : 0.000006s : 44: predicate.switch_defer_inline 2.17% : 0.000009s : 60: predicate.switch_layer_defer_inline 5.87% : 0.000024s : 136: predicate.switch_simplify 1.18% : 0.000005s : 28: predicate.tile_eliminate 0.97% : 0.000004s : 28: predicate.transpose_eliminate 1.36% : 0.000006s : 43: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000007s : 43: predicate.tuple_list_get_item_const_eliminator 1.81% : 0.000008s : 43: predicate.tuple_list_get_item_depend_reorder 2.87% : 0.000012s : 68: predicate.tuple_list_get_item_eliminator 1.49% : 0.000006s : 43: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000010s : 59: predicate.tuple_list_set_item_eliminator 1.99% : 0.000008s : 52: predicate.tuple_to_list_eliminator_ 2.29% : 0.000009s : 81: predicate.updatestate_pure_node_eliminater 2.93% : 0.000012s : 97: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 8: predicate.value_based_eliminate 0.72% : 0.000003s : 16: predicate.virtual_dataset_eliminate 0.74% : 0.000003s : 16: predicate.virtual_output_eliminate 0.21% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.32% : 0.000001s : 8: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.012954 32 86.88% : 0.011254s : 23: func_graph_cloner_run.FuncGraphClonerGraph 13.12% : 0.001700s : 9: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.470178 192 0.00% : 0.000005s : 1: ForceFp32Comm 0.21% : 0.007287s : 1: add_attr 0.21% : 0.007268s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.00% : 0.000075s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.01% : 0.000195s : 1: auto_monad 0.00% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.12% : 0.004275s : 1: bootstrap 0.00% : 0.000046s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.00% : 0.000022s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000037s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000017s : 1: environ_conv 0.03% : 0.001040s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000007s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000012s : 1: label_micro_interleaved_index 0.02% : 0.000654s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.03% : 0.001119s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000024s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000032s : 1: opt.transform.mutable_eliminate 0.07% : 0.002497s : 78: opt.transform.opt_a 0.00% : 0.000053s : 1: opt.transform.opt_after_cconv 0.00% : 0.000044s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000194s : 28: opt.transform.opt_b 0.01% : 0.000177s : 2: opt.transform.opt_trans_graph 0.00% : 0.000052s : 4: opt.transform.symbol_engine_opt 0.27% : 0.009261s : 1: opt_a 0.00% : 0.000159s : 1: opt_after_cconv 0.03% : 0.000925s : 1: opt_after_jit_grad 0.01% : 0.000348s : 1: opt_b 2.69% : 0.093346s : 1: optimize 0.00% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000072s : 1: pre_auto_parallel 0.00% : 0.000013s : 1: py_interpret_to_execute 0.00% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000061s : 1: remove_dup_value 0.10% : 0.003500s : 1: renormalize.infer 0.06% : 0.002072s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000031s : 1: rewriter_after_opt_a 2.33% : 0.080816s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.00% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000100s : 1: symbol_engine_optimizer 0.01% : 0.000218s : 1: tuple_transform 93.76% : 3.253757s : 1: type_inference group_cases_0 have all been run, results of sub cases are below: case: (1,) {} pass. case: (1,) {} pass. case: (0,) {} pass. case: (1,) {} pass. case: ('PYBOOST',) {} pass. case: (0,) {} pass. case: (0,) {} pass. case: ('KBK',) {} pass. ops group_cases_1 with 8 cases start to running, all cases are below: case: (, 'KBK') case: (, 'PYBOOST') case: (, 0) case: (, 1) case: (, 0) case: (, 1) case: (, 0) case: (, 1) ops group_cases_1 total running memory: 32M, memory threshold: 51200M [WARNING] ME(167381:281473890602800,ForkProcess-12):2026-01-29-17:42:39.521.798 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(167391:281473890602800,ForkProcess-13):2026-01-29-17:42:39.654.430 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(167514:281473890602800,ForkProcess-14):2026-01-29-17:42:39.669.506 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(167309:281473890602800,ForkProcess-11):2026-01-29-17:42:39.852.686 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. TotalTime = 0.16997, [21] [bootstrap]: 0.00120822 [type_inference]: 0.0860649 [event_method]: 1.908e-05 [auto_monad]: 0.00014681 [graph_reusing]: 6.24999e-06 [inline]: 3.06999e-06 [add_attr]: 0.00846666, [1] [add_attr_with_inline]: 0.0084506, [1] [Cycle 1]: 0.00014521, [2] [tag_attr]: 3.602e-05 [meta_addattr_fg_expand]: 1.309e-05 [parallel-infer-symbol]: 3.49001e-06 [pre_auto_parallel]: 5.66e-05 [insert-virtual-dataset]: 2.56998e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.0728877, [53] [py_interpret_to_execute]: 8.02e-06 [rewriter_before_opt_a]: 0.00023017 [opt_a]: 0.00303566, [2] [Cycle 1]: 0.00246062, [45] [expand_dump_flag]: 3.08e-06 [switch_simplify]: 6.982e-05 [loop_unroll]: 2.916e-05 [a_1]: 0.00056363 [with_stream_mark]: 1.928e-05 [recompute_prepare]: 6.98998e-06 [updatestate_depend_eliminate]: 1.146e-05 [updatestate_assign_eliminate]: 1.032e-05 [updatestate_loads_eliminate]: 3.08e-06 [parameter_eliminate]: 2.04e-06 [a_2]: 6.981e-05 [accelerated_algorithm]: 5.92999e-06 [shard]: 2.13998e-06 [meta_shard_fg_expand]: 1.91e-06 [shard_inline]: 5.34998e-06 [merge_send_recv]: 4.297e-05 [auto_parallel]: 7.74002e-06 [parallel]: 0.00013274 [flash_sp]: 3.283e-05 [merge_comm]: 4.93001e-06 [allreduce_fusion]: 1.15e-05 [matmul_add_comm_reduction]: 1.767e-05 [allreduce_slice_to_reducescatter]: 7.35e-06 [virtual_shard_identity]: 1.034e-05 [virtual_dataset]: 5.84999e-06 [get_grad_eliminate_]: 6.16e-06 [virtual_output]: 5.74999e-06 [merge_forward]: 3.71001e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.824e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.417e-05 [merge_recompute_call_nodes]: 1.73002e-06 [before_grad]: 1.002e-05 [set_forward_comm_id_for_comm_node_pass]: 1.15e-05 [meta_fg_expand]: 2.53998e-06 [flash_sp_send_recv_attached]: 2.61e-06 [receive_attached]: 1.767e-05 [after_resolve]: 1.05e-05 [a_after_grad]: 1.452e-05 [renormalize]: 0.00086555 [add_forward_monad_depend]: 7.08e-06 [auto_monad_grad]: 3.03e-06 [auto_monad_eliminator]: 2.583e-05 [cse]: 5.104e-05 [a_3]: 4.161e-05 [Cycle 2]: 0.00056334, [45] [expand_dump_flag]: 1.83002e-06 [switch_simplify]: 6.82002e-06 [loop_unroll]: 4.95999e-06 [a_1]: 9.415e-05 [with_stream_mark]: 1.424e-05 [recompute_prepare]: 5.26998e-06 [updatestate_depend_eliminate]: 3.28998e-06 [updatestate_assign_eliminate]: 2.33002e-06 [updatestate_loads_eliminate]: 2.85002e-06 [parameter_eliminate]: 1.47001e-06 [a_2]: 5.59e-05 [accelerated_algorithm]: 4.92e-06 [shard]: 1.10001e-06 [meta_shard_fg_expand]: 1.89e-06 [shard_inline]: 5.35999e-06 [merge_send_recv]: 6.46e-06 [auto_parallel]: 5.97001e-06 [parallel]: 6.61999e-06 [flash_sp]: 3.42002e-06 [merge_comm]: 3.04001e-06 [allreduce_fusion]: 2.80002e-06 [matmul_add_comm_reduction]: 6.25002e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 5.47999e-06 [virtual_dataset]: 4.68001e-06 [get_grad_eliminate_]: 5.08002e-06 [virtual_output]: 4.82e-06 [merge_forward]: 3.31001e-06 [cell_reuse_recompute_pass]: 2.16e-06 [offload_activation]: 7.45e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.234e-05 [merge_recompute_call_nodes]: 8.80013e-07 [before_grad]: 8.99e-06 [set_forward_comm_id_for_comm_node_pass]: 3.23e-06 [meta_fg_expand]: 2.06e-06 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 1.37e-06 [after_resolve]: 8.36002e-06 [a_after_grad]: 7.12002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.45999e-06 [auto_monad_grad]: 1.25001e-06 [auto_monad_eliminator]: 5.64e-06 [cse]: 1.292e-05 [a_3]: 2.786e-05 [py_interpret_to_execute_after_opt_a]: 6.45002e-06 [slice_cell_reuse_recomputed_activation]: 2.44001e-06 [rewriter_after_opt_a]: 2.682e-05 [convert_after_rewriter]: 1.44e-06 [order_py_execute_after_rewriter]: 1.22e-06 [mutable_eliminate]: 0.0676375 [opt_b]: 0.00024126, [1] [Cycle 1]: 0.00023012, [7] [b_1]: 0.0001165 [b_2]: 8.32e-06 [updatestate_depend_eliminate]: 1.471e-05 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 3.13e-06 [renormalize]: 1.38002e-06 [cse]: 4.385e-05 [optimize_parallel_all_gather_comm]: 4.067e-05 [overlap_param_gather]: 1.131e-05 [cconv]: 4.063e-05 [loop_unroll]: 0.00063754 [opt_after_cconv]: 0.0001176, [1] [Cycle 1]: 0.00010882, [7] [c_1]: 2.79e-05 [parameter_eliminate]: 7.15003e-06 [updatestate_depend_eliminate]: 7.54002e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.32999e-06 [cse]: 2.739e-05 [renormalize]: 5.50004e-07 [remove_dup_value]: 1.653e-05 [tuple_transform]: 7.707e-05, [1] [Cycle 1]: 7.239e-05, [4] [d_1]: 4.499e-05 [none_parameter_eliminate]: 1.85001e-06 [renormalize]: 3.10014e-07 [switch_simplify]: 6.71e-06 [partial_unused_args_eliminate]: 2.12999e-06 [add_recomputation]: 7.892e-05 [cse_after_recomputation]: 2.239e-05, [1] [Cycle 1]: 1.712e-05, [1] [cse]: 1.148e-05 [environ_conv]: 3.545e-05 [swap_dp_allreduce_reducescatter]: 2.489e-05 [bias_add_comm_swap]: 1.165e-05 [label_micro_interleaved_index]: 1.444e-05 [label_fine_grained_interleaved_index]: 2.83998e-06 [merge_cast_opt]: 1.74e-06 [slice_recompute_activation]: 2.34999e-06 [micro_interleaved_order_control]: 2.41e-06 [assign_add_opt]: 1.77001e-06 [ForceFp32Comm]: 9.29984e-07 [remove_cast_before_assign_add]: 8.96002e-06 [full_micro_interleaved_order_control]: 1.028e-05 [reorder_send_recv_between_fp_bp]: 2.82002e-06 [comm_op_add_attrs]: 1.04003e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.38002e-06 [interleave_parallel_branches]: 8.12e-06 [overlap_opt_shard_in_pipeline]: 2.918e-05 [overlap_opt_shard_grad_in_pipeline]: 2.09999e-06 [control_data_broadcast_order]: 1.503e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 3.85998e-06 [overlap_recompute_and_grad_model_parallel]: 1.31e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.42999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.51998e-06 [overlap_recompute_comm]: 2.74001e-06 [overlap_grad_ring_attention]: 1.934e-05 [overlap_grad_flash_sp]: 5.828e-05 [begin_end_overlap_inline]: 5.29981e-07 [split_matmul_comm_elemetwise]: 9.59e-06 [split_layernorm_comm]: 2.07001e-06 [handle_group_info]: 1.15001e-06 [symbol_engine_optimizer]: 8.426e-05, [1] [Cycle 1]: 7.899e-05, [6] [build]: 3.73001e-06 [elim_shapecalc]: 1.254e-05 [elim_not_effective]: 1.462e-05 [opt_reshape]: 6.84999e-06 [fold_const_symbol]: 9.67001e-06 [renormalize]: 2.50002e-07 [detach_backward]: 1.94999e-06 [pipeline_parallel_scheduler]: 1.53002e-06 [auto_monad_reorder]: 2.35e-05 [get_jit_bprop_graph]: 1.85001e-06 [rewriter_after_jit_bprop_graph]: 6.49999e-06 [opt_after_jit_grad]: 0.00077963 [validate]: 7.614e-05 Sums bootstrap : 0.001208s : 0.75% type_inference : 0.086065s : 53.64% event_method : 0.000019s : 0.01% auto_monad : 0.000147s : 0.09% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000036s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000057s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.00% optimize.rewriter_before_opt_a : 0.000230s : 0.14% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000077s : 0.05% optimize.opt_a.loop_unroll : 0.000034s : 0.02% optimize.opt_a.a_1 : 0.000658s : 0.41% optimize.opt_a.with_stream_mark : 0.000034s : 0.02% optimize.opt_a.recompute_prepare : 0.000012s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000015s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000013s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000126s : 0.08% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.01% optimize.opt_a.merge_send_recv : 0.000049s : 0.03% optimize.opt_a.auto_parallel : 0.000014s : 0.01% optimize.opt_a.parallel : 0.000139s : 0.09% optimize.opt_a.flash_sp : 0.000036s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000014s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000008s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.01% optimize.opt_a.virtual_dataset : 0.000011s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.01% optimize.opt_a.virtual_output : 0.000011s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000015s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000019s : 0.01% optimize.opt_a.after_resolve : 0.000019s : 0.01% optimize.opt_a.a_after_grad : 0.000022s : 0.01% optimize.opt_a.renormalize : 0.000866s : 0.54% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.02% optimize.opt_a.cse : 0.000064s : 0.04% optimize.opt_a.a_3 : 0.000069s : 0.04% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000027s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.067637s : 42.16% optimize.opt_b.b_1 : 0.000117s : 0.07% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000015s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000044s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000041s : 0.03% optimize.overlap_param_gather : 0.000011s : 0.01% optimize.cconv : 0.000041s : 0.03% optimize.loop_unroll : 0.000638s : 0.40% optimize.opt_after_cconv.c_1 : 0.000028s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000045s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000079s : 0.05% optimize.cse_after_recomputation.cse : 0.000011s : 0.01% optimize.environ_conv : 0.000035s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000025s : 0.02% optimize.bias_add_comm_swap : 0.000012s : 0.01% optimize.label_micro_interleaved_index : 0.000014s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.01% optimize.full_micro_interleaved_order_control : 0.000010s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000029s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.01% optimize.overlap_grad_flash_sp : 0.000058s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000780s : 0.49% validate : 0.000076s : 0.05% Time group info: ------[substitution.] 0.000202 24 1.23% : 0.000002s : 2: substitution.elim_not_effective 0.65% : 0.000001s : 2: substitution.fold_const_symbol 2.97% : 0.000006s : 3: substitution.graph_param_transform 73.57% : 0.000149s : 5: substitution.inline 2.22% : 0.000004s : 4: substitution.j_node_and_user_rematch 6.72% : 0.000014s : 4: substitution.remove_not_recompute_node 2.18% : 0.000004s : 2: substitution.replace_old_param 10.47% : 0.000021s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.085979 2 98.83% : 0.084970s : 1: type_inference.infer 1.17% : 0.001008s : 1: type_inference.specialize ------[replace.] 0.000061 7 77.33% : 0.000047s : 5: replace.inline 22.67% : 0.000014s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000166 7 88.00% : 0.000146s : 5: match.inline 12.00% : 0.000020s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000186 1031 0.83% : 0.000002s : 11: predicate.accumulaten_eliminater 1.84% : 0.000003s : 3: predicate.ad_related_special_op_eliminate 0.42% : 0.000001s : 6: predicate.addn_check_dump 1.00% : 0.000002s : 11: predicate.addn_zero_filter 0.76% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.18% : 0.000004s : 17: predicate.arithmetic_simplify 0.83% : 0.000002s : 11: predicate.cast_eliminate 0.54% : 0.000001s : 6: predicate.check_bprop_eliminate 0.47% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.46% : 0.000001s : 6: predicate.depend_value_elim 0.88% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 11: predicate.dict_set_item_eliminator 1.23% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 3: predicate.elim_not_effective 0.68% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 14: predicate.environ_get_add_eliminate 0.99% : 0.000002s : 14: predicate.environ_get_depend_swap 1.57% : 0.000003s : 20: predicate.environ_get_eliminate 1.00% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.32% : 0.000002s : 18: predicate.exchange_switch_depend_value 2.47% : 0.000005s : 18: predicate.float_depend_g_call 0.43% : 0.000001s : 6: predicate.float_environ_get_switch 0.86% : 0.000002s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.77% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.46% : 0.000001s : 6: predicate.incorporate_call 0.41% : 0.000001s : 6: predicate.incorporate_call_switch 5.88% : 0.000011s : 47: predicate.inline 0.74% : 0.000001s : 6: predicate.inline_without_move 0.29% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.65% : 0.000001s : 6: predicate.less_batch_normalization 1.87% : 0.000003s : 19: predicate.list_to_tuple_eliminator_ 2.16% : 0.000004s : 30: predicate.load_eliminater 1.32% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.55% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.67% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 6: predicate.merge_addn 0.44% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.75% : 0.000001s : 11: predicate.minmaximum_grad 3.60% : 0.000007s : 3: predicate.mutable_eliminate 0.36% : 0.000001s : 3: predicate.opt_reshape 0.54% : 0.000001s : 3: predicate.parallel_virtual_node 1.79% : 0.000003s : 18: predicate.partial_defer_inline 1.27% : 0.000002s : 16: predicate.partial_eliminate 0.80% : 0.000001s : 11: predicate.print_const_string_wrapper 0.59% : 0.000001s : 6: predicate.reduce_all_const_elim 1.02% : 0.000002s : 11: predicate.reduce_eliminate 2.28% : 0.000004s : 30: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000001s : 6: predicate.remove_not_recompute_node 1.12% : 0.000002s : 19: predicate.replace_applicator 0.48% : 0.000001s : 6: predicate.replace_old_param 0.45% : 0.000001s : 3: predicate.reset_defer_inline 0.82% : 0.000002s : 11: predicate.reshape_eliminate 0.48% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 3: predicate.row_tensor_eliminate 0.95% : 0.000002s : 6: predicate.same_eliminate 0.33% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.78% : 0.000001s : 6: predicate.shard_identity_eliminate 0.99% : 0.000002s : 6: predicate.special_op_eliminate 0.62% : 0.000001s : 6: predicate.specialize_transform 0.89% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.47% : 0.000003s : 18: predicate.switch_defer_inline 1.85% : 0.000003s : 24: predicate.switch_layer_defer_inline 5.07% : 0.000009s : 61: predicate.switch_simplify 0.84% : 0.000002s : 11: predicate.tile_eliminate 0.88% : 0.000002s : 11: predicate.transpose_eliminate 1.44% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.42% : 0.000006s : 25: predicate.tuple_list_get_item_eliminator 1.57% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.62% : 0.000003s : 19: predicate.tuple_to_list_eliminator_ 2.11% : 0.000004s : 30: predicate.updatestate_pure_node_eliminater 2.85% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.48% : 0.000001s : 3: predicate.value_based_eliminate 0.70% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 6: predicate.virtual_output_eliminate 0.23% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.86% : 0.000002s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000754 12 50.48% : 0.000381s : 5: func_graph_cloner_run.FuncGraphClonerGraph 49.52% : 0.000374s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.253346 192 0.00% : 0.000004s : 1: ForceFp32Comm 3.34% : 0.008473s : 1: add_attr 3.34% : 0.008456s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000084s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.06% : 0.000153s : 1: auto_monad 0.01% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000015s : 1: bias_add_comm_swap 0.50% : 0.001272s : 1: bootstrap 0.02% : 0.000045s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.02% : 0.000040s : 1: environ_conv 0.01% : 0.000026s : 1: event_method 0.01% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000018s : 1: label_micro_interleaved_index 0.26% : 0.000651s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 26.71% : 0.067661s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000037s : 1: opt.transform.mutable_eliminate 0.42% : 0.001055s : 78: opt.transform.opt_a 0.01% : 0.000025s : 1: opt.transform.opt_after_cconv 0.01% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000093s : 28: opt.transform.opt_b 0.02% : 0.000049s : 2: opt.transform.opt_trans_graph 0.02% : 0.000039s : 4: opt.transform.symbol_engine_opt 1.20% : 0.003040s : 1: opt_a 0.05% : 0.000122s : 1: opt_after_cconv 0.31% : 0.000795s : 1: opt_after_jit_grad 0.10% : 0.000246s : 1: opt_b 28.77% : 0.072893s : 1: optimize 0.02% : 0.000046s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000062s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000023s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000033s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.02% : 0.000061s : 1: pre_auto_parallel 0.00% : 0.000011s : 1: py_interpret_to_execute 0.00% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.18% : 0.000454s : 1: renormalize.infer 0.16% : 0.000401s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000030s : 1: rewriter_after_opt_a 0.09% : 0.000237s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000012s : 1: split_matmul_comm_elemetwise 0.01% : 0.000028s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000087s : 1: symbol_engine_optimizer 0.03% : 0.000080s : 1: tuple_transform 33.98% : 0.086093s : 1: type_inference TotalTime = 1.0726, [21] [bootstrap]: 0.00097116 [type_inference]: 0.986799 [event_method]: 0.00045149 [auto_monad]: 0.0001465 [graph_reusing]: 6.39001e-06 [inline]: 3.55998e-06 [add_attr]: 0.0729783, [1] [add_attr_with_inline]: 0.0729585, [1] [Cycle 1]: 0.0001627, [2] [tag_attr]: 4.803e-05 [meta_addattr_fg_expand]: 1.566e-05 [parallel-infer-symbol]: 3.43e-06 [pre_auto_parallel]: 6.871e-05 [insert-virtual-dataset]: 2.38998e-06 [parallel-infer-symbol-second]: 1.00001e-06 [dataset_repeat_opt]: 2.34001e-06 [pipeline_split]: 1.76998e-06 [optimize]: 0.010228, [53] [py_interpret_to_execute]: 7.88999e-06 [rewriter_before_opt_a]: 0.00030134 [opt_a]: 0.00698379, [2] [Cycle 1]: 0.00601306, [45] [expand_dump_flag]: 3.23e-06 [switch_simplify]: 8.624e-05 [loop_unroll]: 4.587e-05 [a_1]: 0.00094026 [with_stream_mark]: 2.288e-05 [recompute_prepare]: 1.407e-05 [updatestate_depend_eliminate]: 1.427e-05 [updatestate_assign_eliminate]: 1.224e-05 [updatestate_loads_eliminate]: 3.78999e-06 [parameter_eliminate]: 2.41e-06 [a_2]: 0.00014138 [accelerated_algorithm]: 1.106e-05 [shard]: 1.94999e-06 [meta_shard_fg_expand]: 2.71e-06 [shard_inline]: 1.011e-05 [merge_send_recv]: 4.321e-05 [auto_parallel]: 8.48999e-06 [parallel]: 8.541e-05 [flash_sp]: 3.535e-05 [merge_comm]: 4.83001e-06 [allreduce_fusion]: 1.223e-05 [matmul_add_comm_reduction]: 1.766e-05 [allreduce_slice_to_reducescatter]: 8.22e-06 [virtual_shard_identity]: 1.551e-05 [virtual_dataset]: 1.145e-05 [get_grad_eliminate_]: 1.048e-05 [virtual_output]: 1.029e-05 [merge_forward]: 5.00999e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.86e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.605e-05 [merge_recompute_call_nodes]: 1.41998e-06 [before_grad]: 1.451e-05 [set_forward_comm_id_for_comm_node_pass]: 1.225e-05 [meta_fg_expand]: 3.80998e-06 [flash_sp_send_recv_attached]: 2.89999e-06 [receive_attached]: 1.686e-05 [after_resolve]: 1.955e-05 [a_after_grad]: 1.679e-05 [renormalize]: 0.00379804 [add_forward_monad_depend]: 9.24e-06 [auto_monad_grad]: 2.77002e-06 [auto_monad_eliminator]: 3.032e-05 [cse]: 8.128e-05 [a_3]: 8.583e-05 [Cycle 2]: 0.00095616, [45] [expand_dump_flag]: 2.46e-06 [switch_simplify]: 1.277e-05 [loop_unroll]: 9.57999e-06 [a_1]: 0.00026068 [with_stream_mark]: 1.882e-05 [recompute_prepare]: 1.106e-05 [updatestate_depend_eliminate]: 4.4e-06 [updatestate_assign_eliminate]: 3.9e-06 [updatestate_loads_eliminate]: 3.86001e-06 [parameter_eliminate]: 2.27999e-06 [a_2]: 0.00012496 [accelerated_algorithm]: 1.022e-05 [shard]: 2.69999e-06 [meta_shard_fg_expand]: 2.39999e-06 [shard_inline]: 9.62001e-06 [merge_send_recv]: 8.87e-06 [auto_parallel]: 1.012e-05 [parallel]: 1.021e-05 [flash_sp]: 4.17e-06 [merge_comm]: 3.76999e-06 [allreduce_fusion]: 3.8e-06 [matmul_add_comm_reduction]: 9.66e-06 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.136e-05 [virtual_dataset]: 9.64e-06 [get_grad_eliminate_]: 9.34e-06 [virtual_output]: 9.59e-06 [merge_forward]: 4.48001e-06 [cell_reuse_recompute_pass]: 2.59999e-06 [offload_activation]: 1.032e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.924e-05 [merge_recompute_call_nodes]: 1.70001e-06 [before_grad]: 1.334e-05 [set_forward_comm_id_for_comm_node_pass]: 4.15e-06 [meta_fg_expand]: 2.86e-06 [flash_sp_send_recv_attached]: 1.83002e-06 [receive_attached]: 2.01e-06 [after_resolve]: 1.612e-05 [a_after_grad]: 1.581e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.47001e-06 [auto_monad_grad]: 1.47001e-06 [auto_monad_eliminator]: 7.83999e-06 [cse]: 2.425e-05 [a_3]: 6.248e-05 [py_interpret_to_execute_after_opt_a]: 7.25e-06 [slice_cell_reuse_recomputed_activation]: 2.28998e-06 [rewriter_after_opt_a]: 3.591e-05 [convert_after_rewriter]: 1.64998e-06 [order_py_execute_after_rewriter]: 1.27e-06 [mutable_eliminate]: 0.00083779 [opt_b]: 0.00032847, [1] [Cycle 1]: 0.00032024, [7] [b_1]: 0.00021646 [b_2]: 1.114e-05 [updatestate_depend_eliminate]: 1.007e-05 [updatestate_assign_eliminate]: 3.48999e-06 [updatestate_loads_eliminate]: 3.43999e-06 [renormalize]: 9.79984e-07 [cse]: 3.635e-05 [optimize_parallel_all_gather_comm]: 3.246e-05 [overlap_param_gather]: 1.382e-05 [cconv]: 5.765e-05 [loop_unroll]: 0.00053022 [opt_after_cconv]: 0.00015839, [1] [Cycle 1]: 0.00015162, [7] [c_1]: 6.318e-05 [parameter_eliminate]: 4.20999e-06 [updatestate_depend_eliminate]: 6.65998e-06 [updatestate_assign_eliminate]: 3.65e-06 [updatestate_loads_eliminate]: 3.08e-06 [cse]: 3.261e-05 [renormalize]: 5.89993e-07 [remove_dup_value]: 5.496e-05 [tuple_transform]: 0.00011235, [1] [Cycle 1]: 0.00010707, [4] [d_1]: 7.161e-05 [none_parameter_eliminate]: 2.02001e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 1.081e-05 [partial_unused_args_eliminate]: 1.99999e-06 [add_recomputation]: 7.689e-05 [cse_after_recomputation]: 3.167e-05, [1] [Cycle 1]: 2.636e-05, [1] [cse]: 1.9e-05 [environ_conv]: 2.786e-05 [swap_dp_allreduce_reducescatter]: 2.592e-05 [bias_add_comm_swap]: 1.208e-05 [label_micro_interleaved_index]: 1.485e-05 [label_fine_grained_interleaved_index]: 2.84999e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 2.00002e-06 [micro_interleaved_order_control]: 2.84999e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 1.36002e-06 [remove_cast_before_assign_add]: 9.32001e-06 [full_micro_interleaved_order_control]: 1.117e-05 [reorder_send_recv_between_fp_bp]: 3.36001e-06 [comm_op_add_attrs]: 1.26002e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.67001e-06 [interleave_parallel_branches]: 8.30999e-06 [overlap_opt_shard_in_pipeline]: 2.613e-05 [overlap_opt_shard_grad_in_pipeline]: 1.96e-06 [control_data_broadcast_order]: 1.547e-05 [grouped_pairwise_exchange_alltoall]: 1.53002e-06 [offloading_packed_experts]: 4.87e-06 [overlap_recompute_and_grad_model_parallel]: 1.432e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.24998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.38002e-06 [overlap_recompute_comm]: 2.67001e-06 [overlap_grad_ring_attention]: 1.932e-05 [overlap_grad_flash_sp]: 5.516e-05 [begin_end_overlap_inline]: 6.50005e-07 [split_matmul_comm_elemetwise]: 9.87999e-06 [split_layernorm_comm]: 1.69e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 0.0001024, [1] [Cycle 1]: 9.732e-05, [6] [build]: 4.74998e-06 [elim_shapecalc]: 1.635e-05 [elim_not_effective]: 1.786e-05 [opt_reshape]: 1.078e-05 [fold_const_symbol]: 1.413e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.53998e-06 [pipeline_parallel_scheduler]: 1.62999e-06 [auto_monad_reorder]: 2.501e-05 [get_jit_bprop_graph]: 2.19001e-06 [rewriter_after_jit_bprop_graph]: 3.41999e-06 [opt_after_jit_grad]: 0.00060069 [validate]: 7.706e-05 Sums bootstrap : 0.000971s : 0.10% type_inference : 0.986799s : 98.82% event_method : 0.000451s : 0.05% auto_monad : 0.000147s : 0.01% graph_reusing : 0.000006s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000048s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000016s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000069s : 0.01% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.00% optimize.rewriter_before_opt_a : 0.000301s : 0.03% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000099s : 0.01% optimize.opt_a.loop_unroll : 0.000055s : 0.01% optimize.opt_a.a_1 : 0.001201s : 0.12% optimize.opt_a.with_stream_mark : 0.000042s : 0.00% optimize.opt_a.recompute_prepare : 0.000025s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000266s : 0.03% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.00% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000020s : 0.00% optimize.opt_a.merge_send_recv : 0.000052s : 0.01% optimize.opt_a.auto_parallel : 0.000019s : 0.00% optimize.opt_a.parallel : 0.000096s : 0.01% optimize.opt_a.flash_sp : 0.000040s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000016s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000027s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000027s : 0.00% optimize.opt_a.virtual_dataset : 0.000021s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000020s : 0.00% optimize.opt_a.virtual_output : 0.000020s : 0.00% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000029s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000045s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000028s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000019s : 0.00% optimize.opt_a.after_resolve : 0.000036s : 0.00% optimize.opt_a.a_after_grad : 0.000033s : 0.00% optimize.opt_a.renormalize : 0.003798s : 0.38% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.00% optimize.opt_a.cse : 0.000106s : 0.01% optimize.opt_a.a_3 : 0.000148s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000036s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000838s : 0.08% optimize.opt_b.b_1 : 0.000216s : 0.02% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000036s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000032s : 0.00% optimize.overlap_param_gather : 0.000014s : 0.00% optimize.cconv : 0.000058s : 0.01% optimize.loop_unroll : 0.000530s : 0.05% optimize.opt_after_cconv.c_1 : 0.000063s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000033s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000055s : 0.01% optimize.tuple_transform.d_1 : 0.000072s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000077s : 0.01% optimize.cse_after_recomputation.cse : 0.000019s : 0.00% optimize.environ_conv : 0.000028s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000026s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000015s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000026s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000014s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.00% optimize.overlap_grad_flash_sp : 0.000055s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000025s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000601s : 0.06% validate : 0.000077s : 0.01% Time group info: ------[substitution.] 0.000267 36 0.85% : 0.000002s : 2: substitution.elim_not_effective 0.61% : 0.000002s : 2: substitution.fold_const_symbol 2.92% : 0.000008s : 7: substitution.graph_param_transform 78.98% : 0.000211s : 5: substitution.inline 1.83% : 0.000005s : 4: substitution.j_node_and_user_rematch 4.90% : 0.000013s : 4: substitution.remove_not_recompute_node 2.84% : 0.000008s : 6: substitution.replace_old_param 7.07% : 0.000019s : 6: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.986697 2 99.70% : 0.983751s : 1: type_inference.infer 0.30% : 0.002946s : 1: type_inference.specialize ------[replace.] 0.000089 11 62.93% : 0.000056s : 5: replace.inline 37.07% : 0.000033s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000224 11 92.73% : 0.000207s : 5: match.inline 7.27% : 0.000016s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000312 2527 0.86% : 0.000003s : 25: predicate.accumulaten_eliminater 0.77% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.62% : 0.000002s : 18: predicate.addn_check_dump 1.02% : 0.000003s : 25: predicate.addn_zero_filter 0.82% : 0.000003s : 25: predicate.adjust_all_reduce_mul_add 2.11% : 0.000007s : 43: predicate.arithmetic_simplify 0.97% : 0.000003s : 25: predicate.cast_eliminate 0.61% : 0.000002s : 18: predicate.check_bprop_eliminate 0.64% : 0.000002s : 18: predicate.compare_switch_simplify 0.26% : 0.000001s : 9: predicate.const_output_eliminate 0.57% : 0.000002s : 18: predicate.depend_value_elim 0.92% : 0.000003s : 25: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 25: predicate.dict_get_item_eliminator 0.87% : 0.000003s : 25: predicate.dict_set_item_eliminator 1.05% : 0.000003s : 16: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 7: predicate.elim_not_effective 0.39% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000004s : 34: predicate.environ_add_const_eliminate 1.09% : 0.000003s : 34: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 34: predicate.environ_get_depend_swap 1.72% : 0.000005s : 52: predicate.environ_get_eliminate 1.11% : 0.000003s : 34: predicate.environ_get_set_eliminate 1.36% : 0.000004s : 36: predicate.exchange_switch_depend_value 2.13% : 0.000007s : 36: predicate.float_depend_g_call 0.58% : 0.000002s : 18: predicate.float_environ_get_switch 1.03% : 0.000003s : 27: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 7: predicate.fold_const_symbol 0.76% : 0.000002s : 18: predicate.get_grad_eliminate 0.26% : 0.000001s : 7: predicate.graph_param_transform 0.61% : 0.000002s : 18: predicate.incorporate_call 0.55% : 0.000002s : 18: predicate.incorporate_call_switch 5.81% : 0.000018s : 115: predicate.inline 0.82% : 0.000003s : 18: predicate.inline_without_move 0.45% : 0.000001s : 18: predicate.j_node_and_user_rematch 0.88% : 0.000003s : 18: predicate.less_batch_normalization 1.89% : 0.000006s : 47: predicate.list_to_tuple_eliminator_ 2.55% : 0.000008s : 74: predicate.load_eliminater 0.92% : 0.000003s : 9: predicate.loop_unroll_after_grad 2.27% : 0.000007s : 56: predicate.loop_unroll_before_grad 1.63% : 0.000005s : 43: predicate.make_slice_get_slice_eliminator 0.63% : 0.000002s : 18: predicate.merge_addn 0.63% : 0.000002s : 18: predicate.micro_step_allgather_replace 0.63% : 0.000002s : 18: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 25: predicate.minmaximum_grad 1.08% : 0.000003s : 9: predicate.mutable_eliminate 0.39% : 0.000001s : 7: predicate.opt_reshape 0.35% : 0.000001s : 9: predicate.parallel_virtual_node 1.73% : 0.000005s : 36: predicate.partial_defer_inline 1.62% : 0.000005s : 40: predicate.partial_eliminate 0.85% : 0.000003s : 25: predicate.print_const_string_wrapper 0.76% : 0.000002s : 18: predicate.reduce_all_const_elim 1.11% : 0.000003s : 25: predicate.reduce_eliminate 2.54% : 0.000008s : 74: predicate.redundant_stop_gradient_eliminater 0.55% : 0.000002s : 18: predicate.remove_not_recompute_node 1.71% : 0.000005s : 49: predicate.replace_applicator 0.65% : 0.000002s : 18: predicate.replace_old_param 0.38% : 0.000001s : 9: predicate.reset_defer_inline 0.89% : 0.000003s : 25: predicate.reshape_eliminate 0.82% : 0.000003s : 18: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 9: predicate.row_tensor_eliminate 0.83% : 0.000003s : 18: predicate.same_eliminate 0.58% : 0.000002s : 18: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 18: predicate.shard_identity_eliminate 0.70% : 0.000002s : 16: predicate.special_op_eliminate 0.67% : 0.000002s : 18: predicate.specialize_transform 1.07% : 0.000003s : 18: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 18: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 9: predicate.switch_call_monad_eliminater 1.43% : 0.000004s : 36: predicate.switch_defer_inline 2.01% : 0.000006s : 54: predicate.switch_layer_defer_inline 4.91% : 0.000015s : 117: predicate.switch_simplify 0.93% : 0.000003s : 25: predicate.tile_eliminate 0.89% : 0.000003s : 25: predicate.transpose_eliminate 1.48% : 0.000005s : 41: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000004s : 41: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000004s : 41: predicate.tuple_list_get_item_depend_reorder 3.09% : 0.000010s : 65: predicate.tuple_list_get_item_eliminator 1.42% : 0.000004s : 41: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000007s : 59: predicate.tuple_list_set_item_eliminator 1.74% : 0.000005s : 47: predicate.tuple_to_list_eliminator_ 2.46% : 0.000008s : 74: predicate.updatestate_pure_node_eliminater 3.15% : 0.000010s : 92: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 9: predicate.value_based_eliminate 0.75% : 0.000002s : 18: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 18: predicate.virtual_output_eliminate 0.27% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 9: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003726 30 76.27% : 0.002842s : 23: func_graph_cloner_run.FuncGraphClonerGraph 23.73% : 0.000884s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 1.161849 192 0.00% : 0.000004s : 1: ForceFp32Comm 6.28% : 0.072986s : 1: add_attr 6.28% : 0.072964s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000082s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000157s : 1: auto_monad 0.00% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.09% : 0.001034s : 1: bootstrap 0.01% : 0.000063s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000035s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000032s : 1: environ_conv 0.04% : 0.000468s : 1: event_method 0.00% : 0.000015s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000018s : 1: label_micro_interleaved_index 0.05% : 0.000541s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.07% : 0.000850s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000026s : 1: opt.transform.mutable_eliminate 0.17% : 0.001965s : 78: opt.transform.opt_a 0.01% : 0.000062s : 1: opt.transform.opt_after_cconv 0.00% : 0.000040s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000197s : 28: opt.transform.opt_b 0.01% : 0.000080s : 2: opt.transform.opt_trans_graph 0.00% : 0.000055s : 4: opt.transform.symbol_engine_opt 0.60% : 0.006988s : 1: opt_a 0.01% : 0.000163s : 1: opt_after_cconv 0.05% : 0.000612s : 1: opt_after_jit_grad 0.03% : 0.000332s : 1: opt_b 0.88% : 0.010235s : 1: optimize 0.00% : 0.000037s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000059s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000023s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000030s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000017s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000018s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000073s : 1: pre_auto_parallel 0.00% : 0.000012s : 1: py_interpret_to_execute 0.00% : 0.000010s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.01% : 0.000060s : 1: remove_dup_value 0.24% : 0.002748s : 1: renormalize.infer 0.09% : 0.001036s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000039s : 1: rewriter_after_opt_a 0.03% : 0.000309s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000014s : 1: split_matmul_comm_elemetwise 0.00% : 0.000030s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000105s : 1: symbol_engine_optimizer 0.01% : 0.000116s : 1: tuple_transform 84.94% : 0.986822s : 1: type_inference TotalTime = 0.618883, [21] [bootstrap]: 0.00104889 [type_inference]: 0.505512 [event_method]: 0.00027236 [auto_monad]: 0.00021678 [graph_reusing]: 9.84001e-06 [inline]: 2.98e-06 [add_attr]: 0.0491445, [1] [add_attr_with_inline]: 0.0491254, [1] [Cycle 1]: 0.00020873, [2] [tag_attr]: 7.398e-05 [meta_addattr_fg_expand]: 2.236e-05 [parallel-infer-symbol]: 4.04002e-06 [pre_auto_parallel]: 9.472e-05 [insert-virtual-dataset]: 2.85002e-06 [parallel-infer-symbol-second]: 1.21002e-06 [dataset_repeat_opt]: 2.15002e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.0615568, [53] [py_interpret_to_execute]: 1.08e-05 [rewriter_before_opt_a]: 0.00065684 [opt_a]: 0.0575972, [2] [Cycle 1]: 0.00787249, [45] [expand_dump_flag]: 6.76999e-06 [switch_simplify]: 0.0002232 [loop_unroll]: 7.311e-05 [a_1]: 0.00171914 [with_stream_mark]: 3.695e-05 [recompute_prepare]: 2.304e-05 [updatestate_depend_eliminate]: 1.852e-05 [updatestate_assign_eliminate]: 1.355e-05 [updatestate_loads_eliminate]: 5.34998e-06 [parameter_eliminate]: 2.37999e-06 [a_2]: 0.0002095 [accelerated_algorithm]: 4.601e-05 [shard]: 3.10002e-06 [meta_shard_fg_expand]: 4.45e-06 [shard_inline]: 1.458e-05 [merge_send_recv]: 4.475e-05 [auto_parallel]: 1.663e-05 [parallel]: 0.00010236 [flash_sp]: 3.907e-05 [merge_comm]: 9.69e-06 [allreduce_fusion]: 1.347e-05 [matmul_add_comm_reduction]: 2.182e-05 [allreduce_slice_to_reducescatter]: 7.93001e-06 [virtual_shard_identity]: 2.212e-05 [virtual_dataset]: 1.375e-05 [get_grad_eliminate_]: 1.314e-05 [virtual_output]: 1.322e-05 [merge_forward]: 7.13e-06 [cell_reuse_recompute_pass]: 1.59998e-06 [offload_activation]: 2.443e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.755e-05 [merge_recompute_call_nodes]: 2.11e-06 [before_grad]: 2.093e-05 [set_forward_comm_id_for_comm_node_pass]: 1.579e-05 [meta_fg_expand]: 6.56e-06 [flash_sp_send_recv_attached]: 6.43998e-06 [receive_attached]: 1.682e-05 [after_resolve]: 1.854e-05 [a_after_grad]: 2.154e-05 [renormalize]: 0.00412414 [add_forward_monad_depend]: 1.202e-05 [auto_monad_grad]: 2.86e-06 [auto_monad_eliminator]: 4.857e-05 [cse]: 0.00027677 [a_3]: 0.00011338 [Cycle 2]: 0.0497063, [45] [expand_dump_flag]: 2.90998e-06 [switch_simplify]: 1.748e-05 [loop_unroll]: 1.307e-05 [a_1]: 0.0003714 [with_stream_mark]: 2.828e-05 [recompute_prepare]: 2.078e-05 [updatestate_depend_eliminate]: 7.92003e-06 [updatestate_assign_eliminate]: 6.30002e-06 [updatestate_loads_eliminate]: 5.52999e-06 [parameter_eliminate]: 2.61999e-06 [a_2]: 0.00018004 [accelerated_algorithm]: 2.22e-05 [shard]: 2.12999e-06 [meta_shard_fg_expand]: 3.86999e-06 [shard_inline]: 1.5e-05 [merge_send_recv]: 1.339e-05 [auto_parallel]: 1.529e-05 [parallel]: 1.129e-05 [flash_sp]: 6.08998e-06 [merge_comm]: 7.23e-06 [allreduce_fusion]: 5.72001e-06 [matmul_add_comm_reduction]: 1.457e-05 [allreduce_slice_to_reducescatter]: 9.50007e-07 [virtual_shard_identity]: 1.811e-05 [virtual_dataset]: 1.281e-05 [get_grad_eliminate_]: 1.192e-05 [virtual_output]: 1.299e-05 [merge_forward]: 8.70001e-06 [cell_reuse_recompute_pass]: 4.1e-06 [offload_activation]: 1.543e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.0481596 [merge_recompute_call_nodes]: 5.61e-06 [before_grad]: 3.956e-05 [set_forward_comm_id_for_comm_node_pass]: 2.553e-05 [meta_fg_expand]: 9.79e-06 [flash_sp_send_recv_attached]: 3.38e-06 [receive_attached]: 2.53003e-06 [after_resolve]: 2.1e-05 [a_after_grad]: 2.181e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 8.84998e-06 [auto_monad_grad]: 3.78001e-06 [auto_monad_eliminator]: 3.161e-05 [cse]: 7.921e-05 [a_3]: 9.32e-05 [py_interpret_to_execute_after_opt_a]: 1.204e-05 [slice_cell_reuse_recomputed_activation]: 2.51e-06 [rewriter_after_opt_a]: 6.056e-05 [convert_after_rewriter]: 1.71e-06 [order_py_execute_after_rewriter]: 1.24998e-06 [mutable_eliminate]: 0.00084928 [opt_b]: 0.00047602, [1] [Cycle 1]: 0.00046586, [7] [b_1]: 0.00032468 [b_2]: 1.435e-05 [updatestate_depend_eliminate]: 1.076e-05 [updatestate_assign_eliminate]: 6.02999e-06 [updatestate_loads_eliminate]: 6.07001e-06 [renormalize]: 1.02e-06 [cse]: 6.404e-05 [optimize_parallel_all_gather_comm]: 4.02e-05 [overlap_param_gather]: 1.337e-05 [cconv]: 4.028e-05 [loop_unroll]: 0.00052103 [opt_after_cconv]: 0.0001862, [1] [Cycle 1]: 0.00017938, [7] [c_1]: 7.051e-05 [parameter_eliminate]: 5.79e-06 [updatestate_depend_eliminate]: 8.66997e-06 [updatestate_assign_eliminate]: 5.47999e-06 [updatestate_loads_eliminate]: 5.94e-06 [cse]: 4.725e-05 [renormalize]: 5.09986e-07 [remove_dup_value]: 7.025e-05 [tuple_transform]: 0.00015879, [1] [Cycle 1]: 0.00015349, [4] [d_1]: 0.00011559 [none_parameter_eliminate]: 2.04e-06 [renormalize]: 1.40019e-07 [switch_simplify]: 1.325e-05 [partial_unused_args_eliminate]: 2.24001e-06 [add_recomputation]: 9.786e-05 [cse_after_recomputation]: 4.011e-05, [1] [Cycle 1]: 3.528e-05, [1] [cse]: 2.916e-05 [environ_conv]: 4.322e-05 [swap_dp_allreduce_reducescatter]: 2.759e-05 [bias_add_comm_swap]: 1.079e-05 [label_micro_interleaved_index]: 1.291e-05 [label_fine_grained_interleaved_index]: 3.04999e-06 [merge_cast_opt]: 1.32e-06 [slice_recompute_activation]: 1.92001e-06 [micro_interleaved_order_control]: 2.56e-06 [assign_add_opt]: 1.62999e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 9.09998e-06 [full_micro_interleaved_order_control]: 1.045e-05 [reorder_send_recv_between_fp_bp]: 2.61e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.49978e-07 [interleave_split_concat_branches]: 1.42999e-06 [interleave_parallel_branches]: 8.60001e-06 [overlap_opt_shard_in_pipeline]: 2.754e-05 [overlap_opt_shard_grad_in_pipeline]: 1.67999e-06 [control_data_broadcast_order]: 2.11e-05 [grouped_pairwise_exchange_alltoall]: 2.29001e-06 [offloading_packed_experts]: 7.06999e-06 [overlap_recompute_and_grad_model_parallel]: 1.428e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.40999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.64e-06 [overlap_recompute_comm]: 2.39999e-06 [overlap_grad_ring_attention]: 2.14e-05 [overlap_grad_flash_sp]: 6.072e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 1.037e-05 [split_layernorm_comm]: 1.67001e-06 [handle_group_info]: 1.17e-06 [symbol_engine_optimizer]: 0.00012671, [1] [Cycle 1]: 0.00012053, [6] [build]: 1.457e-05 [elim_shapecalc]: 2.105e-05 [elim_not_effective]: 2.109e-05 [opt_reshape]: 1.303e-05 [fold_const_symbol]: 1.812e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.77002e-06 [pipeline_parallel_scheduler]: 1.72999e-06 [auto_monad_reorder]: 3.523e-05 [get_jit_bprop_graph]: 1.77001e-06 [rewriter_after_jit_bprop_graph]: 2.112e-05 [opt_after_jit_grad]: 0.00059403 [validate]: 0.00010803 Sums bootstrap : 0.001049s : 0.18% type_inference : 0.505512s : 88.94% event_method : 0.000272s : 0.05% auto_monad : 0.000217s : 0.04% graph_reusing : 0.000010s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000074s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000022s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000095s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000011s : 0.00% optimize.rewriter_before_opt_a : 0.000657s : 0.12% optimize.opt_a.expand_dump_flag : 0.000010s : 0.00% optimize.opt_a.switch_simplify : 0.000241s : 0.04% optimize.opt_a.loop_unroll : 0.000086s : 0.02% optimize.opt_a.a_1 : 0.002091s : 0.37% optimize.opt_a.with_stream_mark : 0.000065s : 0.01% optimize.opt_a.recompute_prepare : 0.000044s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000026s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000020s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000390s : 0.07% optimize.opt_a.accelerated_algorithm : 0.000068s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000008s : 0.00% optimize.opt_a.shard_inline : 0.000030s : 0.01% optimize.opt_a.merge_send_recv : 0.000058s : 0.01% optimize.opt_a.auto_parallel : 0.000032s : 0.01% optimize.opt_a.parallel : 0.000114s : 0.02% optimize.opt_a.flash_sp : 0.000045s : 0.01% optimize.opt_a.merge_comm : 0.000017s : 0.00% optimize.opt_a.allreduce_fusion : 0.000019s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000036s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000040s : 0.01% optimize.opt_a.virtual_dataset : 0.000027s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000025s : 0.00% optimize.opt_a.virtual_output : 0.000026s : 0.00% optimize.opt_a.merge_forward : 0.000016s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000040s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.048187s : 8.48% optimize.opt_a.merge_recompute_call_nodes : 0.000008s : 0.00% optimize.opt_a.before_grad : 0.000060s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000041s : 0.01% optimize.opt_a.meta_fg_expand : 0.000016s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000010s : 0.00% optimize.opt_a.receive_attached : 0.000019s : 0.00% optimize.opt_a.after_resolve : 0.000040s : 0.01% optimize.opt_a.a_after_grad : 0.000043s : 0.01% optimize.opt_a.renormalize : 0.004124s : 0.73% optimize.opt_a.add_forward_monad_depend : 0.000021s : 0.00% optimize.opt_a.auto_monad_grad : 0.000007s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000080s : 0.01% optimize.opt_a.cse : 0.000356s : 0.06% optimize.opt_a.a_3 : 0.000207s : 0.04% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000061s : 0.01% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000849s : 0.15% optimize.opt_b.b_1 : 0.000325s : 0.06% optimize.opt_b.b_2 : 0.000014s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000064s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000040s : 0.01% optimize.overlap_param_gather : 0.000013s : 0.00% optimize.cconv : 0.000040s : 0.01% optimize.loop_unroll : 0.000521s : 0.09% optimize.opt_after_cconv.c_1 : 0.000071s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.cse : 0.000047s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000070s : 0.01% optimize.tuple_transform.d_1 : 0.000116s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000013s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000098s : 0.02% optimize.cse_after_recomputation.cse : 0.000029s : 0.01% optimize.environ_conv : 0.000043s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000028s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000028s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000021s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000014s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000021s : 0.00% optimize.overlap_grad_flash_sp : 0.000061s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000015s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000021s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000021s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000013s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000018s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000035s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000021s : 0.00% opt_after_jit_grad : 0.000594s : 0.10% validate : 0.000108s : 0.02% Time group info: ------[substitution.] 0.000789 137 0.34% : 0.000003s : 5: substitution.elim_not_effective 1.73% : 0.000014s : 6: substitution.float_tuple_getitem_switch 0.31% : 0.000002s : 5: substitution.fold_const_symbol 1.20% : 0.000010s : 9: substitution.graph_param_transform 57.27% : 0.000452s : 11: substitution.inline 1.35% : 0.000011s : 10: substitution.j_node_and_user_rematch 3.49% : 0.000027s : 2: substitution.less_batch_normalization 2.72% : 0.000021s : 8: substitution.minmaximum_grad 2.97% : 0.000023s : 10: substitution.remove_not_recompute_node 1.07% : 0.000008s : 2: substitution.replace_old_param 2.82% : 0.000022s : 2: substitution.switch_simplify 5.01% : 0.000040s : 12: substitution.tuple_list_convert_item_index_to_positive 3.75% : 0.000030s : 12: substitution.tuple_list_get_item_const_eliminator 3.29% : 0.000026s : 12: substitution.tuple_list_get_item_depend_reorder 9.31% : 0.000073s : 19: substitution.tuple_list_get_item_eliminator 3.37% : 0.000027s : 12: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.505360 2 99.22% : 0.501410s : 1: type_inference.infer 0.78% : 0.003950s : 1: type_inference.specialize ------[replace.] 0.000240 16 58.41% : 0.000140s : 11: replace.inline 22.17% : 0.000053s : 2: replace.switch_simplify 19.42% : 0.000047s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000470 16 94.09% : 0.000442s : 11: match.inline 4.41% : 0.000021s : 2: match.switch_simplify 1.50% : 0.000007s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000476 3095 0.89% : 0.000004s : 33: predicate.accumulaten_eliminater 0.54% : 0.000003s : 9: predicate.ad_related_special_op_eliminate 0.54% : 0.000003s : 20: predicate.addn_check_dump 1.02% : 0.000005s : 33: predicate.addn_zero_filter 0.88% : 0.000004s : 33: predicate.adjust_all_reduce_mul_add 2.38% : 0.000011s : 53: predicate.arithmetic_simplify 0.96% : 0.000005s : 33: predicate.cast_eliminate 0.58% : 0.000003s : 20: predicate.check_bprop_eliminate 0.56% : 0.000003s : 20: predicate.compare_switch_simplify 0.20% : 0.000001s : 10: predicate.const_output_eliminate 0.53% : 0.000003s : 20: predicate.depend_value_elim 0.91% : 0.000004s : 33: predicate.dict_get_item_const_eliminator 1.03% : 0.000005s : 33: predicate.dict_get_item_eliminator 0.89% : 0.000004s : 33: predicate.dict_set_item_eliminator 0.75% : 0.000004s : 19: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 9: predicate.elim_not_effective 0.33% : 0.000002s : 9: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000006s : 43: predicate.environ_add_const_eliminate 1.07% : 0.000005s : 43: predicate.environ_get_add_eliminate 1.08% : 0.000005s : 43: predicate.environ_get_depend_swap 1.81% : 0.000009s : 63: predicate.environ_get_eliminate 1.15% : 0.000005s : 43: predicate.environ_get_set_eliminate 1.27% : 0.000006s : 47: predicate.exchange_switch_depend_value 2.02% : 0.000010s : 47: predicate.float_depend_g_call 0.59% : 0.000003s : 20: predicate.float_environ_get_switch 1.17% : 0.000006s : 30: predicate.float_tuple_getitem_switch 0.17% : 0.000001s : 9: predicate.fold_const_symbol 0.85% : 0.000004s : 20: predicate.get_grad_eliminate 0.19% : 0.000001s : 9: predicate.graph_param_transform 0.56% : 0.000003s : 20: predicate.incorporate_call 0.49% : 0.000002s : 20: predicate.incorporate_call_switch 5.61% : 0.000027s : 140: predicate.inline 1.06% : 0.000005s : 20: predicate.inline_without_move 0.35% : 0.000002s : 20: predicate.j_node_and_user_rematch 1.11% : 0.000005s : 20: predicate.less_batch_normalization 1.67% : 0.000008s : 55: predicate.list_to_tuple_eliminator_ 2.28% : 0.000011s : 89: predicate.load_eliminater 0.66% : 0.000003s : 10: predicate.loop_unroll_after_grad 2.42% : 0.000012s : 83: predicate.loop_unroll_before_grad 1.72% : 0.000008s : 53: predicate.make_slice_get_slice_eliminator 0.63% : 0.000003s : 20: predicate.merge_addn 0.63% : 0.000003s : 20: predicate.micro_step_allgather_replace 0.59% : 0.000003s : 20: predicate.mini_step_allgather_replace 0.95% : 0.000005s : 33: predicate.minmaximum_grad 1.01% : 0.000005s : 10: predicate.mutable_eliminate 0.37% : 0.000002s : 9: predicate.opt_reshape 0.31% : 0.000001s : 10: predicate.parallel_virtual_node 2.05% : 0.000010s : 47: predicate.partial_defer_inline 1.28% : 0.000006s : 46: predicate.partial_eliminate 0.91% : 0.000004s : 33: predicate.print_const_string_wrapper 0.67% : 0.000003s : 20: predicate.reduce_all_const_elim 1.26% : 0.000006s : 33: predicate.reduce_eliminate 2.47% : 0.000012s : 89: predicate.redundant_stop_gradient_eliminater 0.67% : 0.000003s : 20: predicate.remove_not_recompute_node 1.34% : 0.000006s : 56: predicate.replace_applicator 0.43% : 0.000002s : 20: predicate.replace_old_param 0.19% : 0.000001s : 10: predicate.reset_defer_inline 1.11% : 0.000005s : 33: predicate.reshape_eliminate 0.66% : 0.000003s : 20: predicate.row_tensor_add_zeros_like 0.34% : 0.000002s : 10: predicate.row_tensor_eliminate 0.94% : 0.000004s : 20: predicate.same_eliminate 0.48% : 0.000002s : 20: predicate.set_cell_output_no_recompute 1.03% : 0.000005s : 20: predicate.shard_identity_eliminate 0.61% : 0.000003s : 19: predicate.special_op_eliminate 0.65% : 0.000003s : 20: predicate.specialize_transform 1.08% : 0.000005s : 20: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000004s : 20: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 10: predicate.switch_call_monad_eliminater 1.42% : 0.000007s : 47: predicate.switch_defer_inline 1.93% : 0.000009s : 67: predicate.switch_layer_defer_inline 5.76% : 0.000027s : 163: predicate.switch_simplify 1.00% : 0.000005s : 33: predicate.tile_eliminate 0.84% : 0.000004s : 33: predicate.transpose_eliminate 1.59% : 0.000008s : 52: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000008s : 52: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000007s : 52: predicate.tuple_list_get_item_depend_reorder 3.56% : 0.000017s : 75: predicate.tuple_list_get_item_eliminator 1.51% : 0.000007s : 52: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000011s : 72: predicate.tuple_list_set_item_eliminator 1.80% : 0.000009s : 55: predicate.tuple_to_list_eliminator_ 2.22% : 0.000011s : 89: predicate.updatestate_pure_node_eliminater 2.91% : 0.000014s : 109: predicate.updatestate_useless_node_eliminater 0.34% : 0.000002s : 10: predicate.value_based_eliminate 0.79% : 0.000004s : 20: predicate.virtual_dataset_eliminate 0.72% : 0.000003s : 20: predicate.virtual_output_eliminate 0.25% : 0.000001s : 9: predicate.virtual_view_grad_eliminate 0.39% : 0.000002s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003113 29 56.87% : 0.001771s : 16: func_graph_cloner_run.FuncGraphClonerGraph 43.13% : 0.001343s : 13: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.785577 192 0.00% : 0.000004s : 1: ForceFp32Comm 6.26% : 0.049153s : 1: add_attr 6.25% : 0.049131s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000102s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000229s : 1: auto_monad 0.01% : 0.000040s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.14% : 0.001116s : 1: bootstrap 0.01% : 0.000044s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000025s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.01% : 0.000043s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000047s : 1: environ_conv 0.04% : 0.000289s : 1: event_method 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000012s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.07% : 0.000530s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.11% : 0.000862s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000030s : 1: opt.transform.mutable_eliminate 6.55% : 0.051448s : 78: opt.transform.opt_a 0.01% : 0.000069s : 1: opt.transform.opt_after_cconv 0.01% : 0.000045s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000306s : 28: opt.transform.opt_b 0.02% : 0.000126s : 2: opt.transform.opt_trans_graph 0.01% : 0.000069s : 4: opt.transform.symbol_engine_opt 7.33% : 0.057602s : 1: opt_a 0.02% : 0.000190s : 1: opt_after_cconv 0.08% : 0.000606s : 1: opt_after_jit_grad 0.06% : 0.000480s : 1: opt_b 7.84% : 0.061563s : 1: optimize 0.01% : 0.000045s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000065s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000026s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000032s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000017s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000005s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000102s : 1: pre_auto_parallel 0.00% : 0.000015s : 1: py_interpret_to_execute 0.00% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.01% : 0.000076s : 1: remove_dup_value 0.32% : 0.002488s : 1: renormalize.infer 0.21% : 0.001617s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000025s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000066s : 1: rewriter_after_opt_a 0.09% : 0.000675s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000031s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000130s : 1: symbol_engine_optimizer 0.02% : 0.000162s : 1: tuple_transform 64.35% : 0.505543s : 1: type_inference TotalTime = 0.361731, [21] [bootstrap]: 0.00130371 [type_inference]: 0.187143 [event_method]: 2.452e-05 [auto_monad]: 0.00016666 [graph_reusing]: 6.21998e-06 [inline]: 3.68e-06 [add_attr]: 0.00966013, [1] [add_attr_with_inline]: 0.00963961, [1] [Cycle 1]: 0.00015228, [2] [tag_attr]: 3.462e-05 [meta_addattr_fg_expand]: 1.353e-05 [parallel-infer-symbol]: 5.77001e-06 [pre_auto_parallel]: 5.872e-05 [insert-virtual-dataset]: 2.52001e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 1.79998e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.162287, [53] [py_interpret_to_execute]: 9.05999e-06 [rewriter_before_opt_a]: 0.00022889 [opt_a]: 0.159198, [2] [Cycle 1]: 0.158478, [45] [expand_dump_flag]: 3.46999e-06 [switch_simplify]: 7.279e-05 [loop_unroll]: 2.944e-05 [a_1]: 0.00057694 [with_stream_mark]: 2.312e-05 [recompute_prepare]: 9.14e-06 [updatestate_depend_eliminate]: 1.327e-05 [updatestate_assign_eliminate]: 9.81998e-06 [updatestate_loads_eliminate]: 2.71e-06 [parameter_eliminate]: 2.15002e-06 [a_2]: 7.53e-05 [accelerated_algorithm]: 6.39001e-06 [shard]: 1.95001e-06 [meta_shard_fg_expand]: 2.12999e-06 [shard_inline]: 5.20001e-06 [merge_send_recv]: 4.176e-05 [auto_parallel]: 8.28999e-06 [parallel]: 9.393e-05 [flash_sp]: 3.152e-05 [merge_comm]: 5.60001e-06 [allreduce_fusion]: 1.203e-05 [matmul_add_comm_reduction]: 1.705e-05 [allreduce_slice_to_reducescatter]: 8.03999e-06 [virtual_shard_identity]: 1.124e-05 [virtual_dataset]: 6.05002e-06 [get_grad_eliminate_]: 5.54e-06 [virtual_output]: 6.47001e-06 [merge_forward]: 4.31002e-06 [cell_reuse_recompute_pass]: 1.82999e-06 [offload_activation]: 1.861e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.23e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 9.89001e-06 [set_forward_comm_id_for_comm_node_pass]: 1.259e-05 [meta_fg_expand]: 2.89001e-06 [flash_sp_send_recv_attached]: 3.32002e-06 [receive_attached]: 1.671e-05 [after_resolve]: 1.015e-05 [a_after_grad]: 9.20001e-06 [renormalize]: 0.156807 [add_forward_monad_depend]: 1.179e-05 [auto_monad_grad]: 2.81e-06 [auto_monad_eliminator]: 3.225e-05 [cse]: 5.515e-05 [a_3]: 5.919e-05 [Cycle 2]: 0.00070322, [45] [expand_dump_flag]: 3.45e-06 [switch_simplify]: 8.52e-06 [loop_unroll]: 6.26e-06 [a_1]: 0.00011226 [with_stream_mark]: 2.283e-05 [recompute_prepare]: 5.72001e-06 [updatestate_depend_eliminate]: 4.23999e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.21001e-06 [parameter_eliminate]: 2.49999e-06 [a_2]: 6.042e-05 [accelerated_algorithm]: 6.01998e-06 [shard]: 2.74999e-06 [meta_shard_fg_expand]: 2.56998e-06 [shard_inline]: 5.81e-06 [merge_send_recv]: 8.72998e-06 [auto_parallel]: 9.29998e-06 [parallel]: 9.24e-06 [flash_sp]: 4.13001e-06 [merge_comm]: 3.08998e-06 [allreduce_fusion]: 2.98e-06 [matmul_add_comm_reduction]: 9.47999e-06 [allreduce_slice_to_reducescatter]: 1.27e-06 [virtual_shard_identity]: 9.71e-06 [virtual_dataset]: 5.66e-06 [get_grad_eliminate_]: 5.25999e-06 [virtual_output]: 6.15002e-06 [merge_forward]: 4.60001e-06 [cell_reuse_recompute_pass]: 2.84001e-06 [offload_activation]: 1.132e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.848e-05 [merge_recompute_call_nodes]: 2.01e-06 [before_grad]: 1.014e-05 [set_forward_comm_id_for_comm_node_pass]: 3.35e-06 [meta_fg_expand]: 3.66001e-06 [flash_sp_send_recv_attached]: 2.26e-06 [receive_attached]: 2.81e-06 [after_resolve]: 1.028e-05 [a_after_grad]: 8.22e-06 [renormalize]: 3.60014e-07 [add_forward_monad_depend]: 2.51e-06 [auto_monad_grad]: 1.45001e-06 [auto_monad_eliminator]: 8.50999e-06 [cse]: 1.992e-05 [a_3]: 3.146e-05 [py_interpret_to_execute_after_opt_a]: 1.035e-05 [slice_cell_reuse_recomputed_activation]: 2.18998e-06 [rewriter_after_opt_a]: 3.44e-05 [convert_after_rewriter]: 1.45001e-06 [order_py_execute_after_rewriter]: 1.15999e-06 [mutable_eliminate]: 0.00086975 [opt_b]: 0.00021056, [1] [Cycle 1]: 0.00020107, [7] [b_1]: 0.00010857 [b_2]: 9.19e-06 [updatestate_depend_eliminate]: 7.7e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.94999e-06 [renormalize]: 9.10019e-07 [cse]: 2.769e-05 [optimize_parallel_all_gather_comm]: 3.371e-05 [overlap_param_gather]: 1.173e-05 [cconv]: 3.18e-05 [loop_unroll]: 0.00064373 [opt_after_cconv]: 0.00011428, [1] [Cycle 1]: 0.00010552, [7] [c_1]: 2.768e-05 [parameter_eliminate]: 5.40001e-06 [updatestate_depend_eliminate]: 6.46999e-06 [updatestate_assign_eliminate]: 2.63998e-06 [updatestate_loads_eliminate]: 2.37999e-06 [cse]: 2.459e-05 [renormalize]: 7.09988e-07 [remove_dup_value]: 1.571e-05 [tuple_transform]: 7.542e-05, [1] [Cycle 1]: 7.091e-05, [4] [d_1]: 4.385e-05 [none_parameter_eliminate]: 1.54e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 6.11998e-06 [partial_unused_args_eliminate]: 2.14999e-06 [add_recomputation]: 7.904e-05 [cse_after_recomputation]: 2.373e-05, [1] [Cycle 1]: 1.875e-05, [1] [cse]: 1.18e-05 [environ_conv]: 4.042e-05 [swap_dp_allreduce_reducescatter]: 2.612e-05 [bias_add_comm_swap]: 4.326e-05 [label_micro_interleaved_index]: 1.576e-05 [label_fine_grained_interleaved_index]: 2.86e-06 [merge_cast_opt]: 1.35001e-06 [slice_recompute_activation]: 2.21998e-06 [micro_interleaved_order_control]: 2.66999e-06 [assign_add_opt]: 1.37999e-06 [ForceFp32Comm]: 1.11002e-06 [remove_cast_before_assign_add]: 9.87001e-06 [full_micro_interleaved_order_control]: 1.058e-05 [reorder_send_recv_between_fp_bp]: 3.03998e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 1.10999e-06 [interleave_split_concat_branches]: 1.22999e-06 [interleave_parallel_branches]: 8.57e-06 [overlap_opt_shard_in_pipeline]: 2.389e-05 [overlap_opt_shard_grad_in_pipeline]: 1.82001e-06 [control_data_broadcast_order]: 1.478e-05 [grouped_pairwise_exchange_alltoall]: 1.43002e-06 [offloading_packed_experts]: 4.66002e-06 [overlap_recompute_and_grad_model_parallel]: 1.198e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.29e-06 [overlap_recompute_allgather_and_fa_grad]: 1.53002e-06 [overlap_recompute_comm]: 2.59001e-06 [overlap_grad_ring_attention]: 2.03e-05 [overlap_grad_flash_sp]: 5.693e-05 [begin_end_overlap_inline]: 5.20027e-07 [split_matmul_comm_elemetwise]: 9.05001e-06 [split_layernorm_comm]: 1.66002e-06 [handle_group_info]: 1.17e-06 [symbol_engine_optimizer]: 9.035e-05, [1] [Cycle 1]: 8.38e-05, [6] [build]: 4.27e-06 [elim_shapecalc]: 1.455e-05 [elim_not_effective]: 1.359e-05 [opt_reshape]: 6.74999e-06 [fold_const_symbol]: 9.41998e-06 [renormalize]: 3.29979e-07 [detach_backward]: 2.26e-06 [pipeline_parallel_scheduler]: 1.70001e-06 [auto_monad_reorder]: 2.505e-05 [get_jit_bprop_graph]: 2.46e-06 [rewriter_after_jit_bprop_graph]: 5.61e-06 [opt_after_jit_grad]: 0.00071585 [validate]: 7.201e-05 Sums bootstrap : 0.001304s : 0.37% type_inference : 0.187143s : 53.34% event_method : 0.000025s : 0.01% auto_monad : 0.000167s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000035s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.00% parallel-infer-symbol : 0.000006s : 0.00% pre_auto_parallel : 0.000059s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000009s : 0.00% optimize.rewriter_before_opt_a : 0.000229s : 0.07% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000081s : 0.02% optimize.opt_a.loop_unroll : 0.000036s : 0.01% optimize.opt_a.a_1 : 0.000689s : 0.20% optimize.opt_a.with_stream_mark : 0.000046s : 0.01% optimize.opt_a.recompute_prepare : 0.000015s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000018s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000013s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000136s : 0.04% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.00% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.00% optimize.opt_a.merge_send_recv : 0.000050s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000103s : 0.03% optimize.opt_a.flash_sp : 0.000036s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000015s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000027s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000012s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000030s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000020s : 0.01% optimize.opt_a.after_resolve : 0.000020s : 0.01% optimize.opt_a.a_after_grad : 0.000017s : 0.00% optimize.opt_a.renormalize : 0.156807s : 44.69% optimize.opt_a.add_forward_monad_depend : 0.000014s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000041s : 0.01% optimize.opt_a.cse : 0.000075s : 0.02% optimize.opt_a.a_3 : 0.000091s : 0.03% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000034s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000870s : 0.25% optimize.opt_b.b_1 : 0.000109s : 0.03% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000028s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000034s : 0.01% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000032s : 0.01% optimize.loop_unroll : 0.000644s : 0.18% optimize.opt_after_cconv.c_1 : 0.000028s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000025s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.00% optimize.tuple_transform.d_1 : 0.000044s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000079s : 0.02% optimize.cse_after_recomputation.cse : 0.000012s : 0.00% optimize.environ_conv : 0.000040s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000026s : 0.01% optimize.bias_add_comm_swap : 0.000043s : 0.01% optimize.label_micro_interleaved_index : 0.000016s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000024s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000020s : 0.01% optimize.overlap_grad_flash_sp : 0.000057s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000009s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000025s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000716s : 0.20% validate : 0.000072s : 0.02% Time group info: ------[substitution.] 0.000214 24 1.04% : 0.000002s : 2: substitution.elim_not_effective 0.70% : 0.000002s : 2: substitution.fold_const_symbol 3.10% : 0.000007s : 3: substitution.graph_param_transform 73.66% : 0.000157s : 5: substitution.inline 2.10% : 0.000004s : 4: substitution.j_node_and_user_rematch 6.38% : 0.000014s : 4: substitution.remove_not_recompute_node 2.62% : 0.000006s : 2: substitution.replace_old_param 10.39% : 0.000022s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.187026 2 99.29% : 0.185692s : 1: type_inference.infer 0.71% : 0.001334s : 1: type_inference.specialize ------[replace.] 0.000058 7 76.25% : 0.000044s : 5: replace.inline 23.75% : 0.000014s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000176 7 88.09% : 0.000155s : 5: match.inline 11.91% : 0.000021s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000195 1031 1.03% : 0.000002s : 11: predicate.accumulaten_eliminater 1.39% : 0.000003s : 3: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 6: predicate.addn_check_dump 0.98% : 0.000002s : 11: predicate.addn_zero_filter 0.69% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.69% : 0.000005s : 17: predicate.arithmetic_simplify 1.06% : 0.000002s : 11: predicate.cast_eliminate 0.54% : 0.000001s : 6: predicate.check_bprop_eliminate 0.51% : 0.000001s : 6: predicate.compare_switch_simplify 0.13% : 0.000000s : 3: predicate.const_output_eliminate 0.45% : 0.000001s : 6: predicate.depend_value_elim 0.92% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 0.97% : 0.000002s : 11: predicate.dict_get_item_eliminator 1.18% : 0.000002s : 11: predicate.dict_set_item_eliminator 1.43% : 0.000003s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 3: predicate.elim_not_effective 0.48% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 14: predicate.environ_add_const_eliminate 0.99% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.05% : 0.000002s : 14: predicate.environ_get_depend_swap 1.43% : 0.000003s : 20: predicate.environ_get_eliminate 1.14% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.22% : 0.000002s : 18: predicate.exchange_switch_depend_value 2.30% : 0.000004s : 18: predicate.float_depend_g_call 0.47% : 0.000001s : 6: predicate.float_environ_get_switch 0.82% : 0.000002s : 9: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 3: predicate.fold_const_symbol 0.64% : 0.000001s : 6: predicate.get_grad_eliminate 0.25% : 0.000000s : 3: predicate.graph_param_transform 0.44% : 0.000001s : 6: predicate.incorporate_call 0.41% : 0.000001s : 6: predicate.incorporate_call_switch 5.47% : 0.000011s : 47: predicate.inline 0.85% : 0.000002s : 6: predicate.inline_without_move 0.24% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 6: predicate.less_batch_normalization 2.13% : 0.000004s : 19: predicate.list_to_tuple_eliminator_ 2.17% : 0.000004s : 30: predicate.load_eliminater 1.44% : 0.000003s : 3: predicate.loop_unroll_after_grad 2.81% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.63% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 6: predicate.merge_addn 0.45% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.44% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.73% : 0.000001s : 11: predicate.minmaximum_grad 2.16% : 0.000004s : 3: predicate.mutable_eliminate 0.40% : 0.000001s : 3: predicate.opt_reshape 0.38% : 0.000001s : 3: predicate.parallel_virtual_node 1.61% : 0.000003s : 18: predicate.partial_defer_inline 1.17% : 0.000002s : 16: predicate.partial_eliminate 0.95% : 0.000002s : 11: predicate.print_const_string_wrapper 0.53% : 0.000001s : 6: predicate.reduce_all_const_elim 1.31% : 0.000003s : 11: predicate.reduce_eliminate 2.27% : 0.000004s : 30: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000001s : 6: predicate.remove_not_recompute_node 1.35% : 0.000003s : 19: predicate.replace_applicator 0.69% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000001s : 3: predicate.reset_defer_inline 1.04% : 0.000002s : 11: predicate.reshape_eliminate 0.72% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 3: predicate.row_tensor_eliminate 0.91% : 0.000002s : 6: predicate.same_eliminate 0.33% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.07% : 0.000002s : 6: predicate.shard_identity_eliminate 0.75% : 0.000001s : 6: predicate.special_op_eliminate 0.60% : 0.000001s : 6: predicate.specialize_transform 1.49% : 0.000003s : 6: predicate.split_environ_get_set_with_tuple_value 1.34% : 0.000003s : 6: predicate.stack_unstack_eliminate 0.25% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.42% : 0.000003s : 18: predicate.switch_defer_inline 1.84% : 0.000004s : 24: predicate.switch_layer_defer_inline 5.07% : 0.000010s : 61: predicate.switch_simplify 0.90% : 0.000002s : 11: predicate.tile_eliminate 0.87% : 0.000002s : 11: predicate.transpose_eliminate 1.46% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.40% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000006s : 25: predicate.tuple_list_get_item_eliminator 1.43% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000005s : 23: predicate.tuple_list_set_item_eliminator 1.79% : 0.000003s : 19: predicate.tuple_to_list_eliminator_ 2.01% : 0.000004s : 30: predicate.updatestate_pure_node_eliminater 2.51% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.28% : 0.000001s : 3: predicate.value_based_eliminate 0.77% : 0.000002s : 6: predicate.virtual_dataset_eliminate 0.72% : 0.000001s : 6: predicate.virtual_output_eliminate 0.24% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000872 12 45.28% : 0.000395s : 5: func_graph_cloner_run.FuncGraphClonerGraph 54.72% : 0.000477s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.691674 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.40% : 0.009669s : 1: add_attr 1.39% : 0.009645s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000086s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000178s : 1: auto_monad 0.00% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000048s : 1: bias_add_comm_swap 0.20% : 0.001371s : 1: bootstrap 0.01% : 0.000036s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000045s : 1: environ_conv 0.00% : 0.000032s : 1: event_method 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000012s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000019s : 1: label_micro_interleaved_index 0.09% : 0.000656s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.13% : 0.000889s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000027s : 1: opt.transform.mutable_eliminate 0.16% : 0.001117s : 78: opt.transform.opt_a 0.00% : 0.000026s : 1: opt.transform.opt_after_cconv 0.00% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000087s : 28: opt.transform.opt_b 0.01% : 0.000047s : 2: opt.transform.opt_trans_graph 0.01% : 0.000039s : 4: opt.transform.symbol_engine_opt 23.02% : 0.159202s : 1: opt_a 0.02% : 0.000118s : 1: opt_after_cconv 0.11% : 0.000731s : 1: opt_after_jit_grad 0.03% : 0.000215s : 1: opt_b 23.46% : 0.162294s : 1: optimize 0.01% : 0.000038s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000062s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000023s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000028s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000064s : 1: pre_auto_parallel 0.00% : 0.000013s : 1: py_interpret_to_execute 0.00% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.00% : 0.000019s : 1: remove_dup_value 0.07% : 0.000498s : 1: renormalize.infer 22.60% : 0.156293s : 1: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000040s : 1: rewriter_after_opt_a 0.03% : 0.000236s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000012s : 1: split_matmul_comm_elemetwise 0.00% : 0.000030s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000094s : 1: symbol_engine_optimizer 0.01% : 0.000079s : 1: tuple_transform 27.06% : 0.187180s : 1: type_inference TotalTime = 0.155003, [21] [bootstrap]: 0.00064481 [type_inference]: 0.100425 [event_method]: 2.154e-05 [auto_monad]: 8.027e-05 [graph_reusing]: 6.73998e-06 [inline]: 3.4e-06 [add_attr]: 0.0467213, [1] [add_attr_with_inline]: 0.0467048, [1] [Cycle 1]: 9.668e-05, [2] [tag_attr]: 2.783e-05 [meta_addattr_fg_expand]: 5.27001e-06 [parallel-infer-symbol]: 3.65e-06 [pre_auto_parallel]: 4.704e-05 [insert-virtual-dataset]: 2.74001e-06 [parallel-infer-symbol-second]: 6.59988e-07 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00618975, [53] [py_interpret_to_execute]: 1.233e-05 [rewriter_before_opt_a]: 0.00022534 [opt_a]: 0.0035855, [2] [Cycle 1]: 0.00299025, [45] [expand_dump_flag]: 4.60999e-06 [switch_simplify]: 4.18e-05 [loop_unroll]: 2.908e-05 [a_1]: 0.00068428 [with_stream_mark]: 2.485e-05 [recompute_prepare]: 1.107e-05 [updatestate_depend_eliminate]: 4.22e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.81999e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 7.351e-05 [accelerated_algorithm]: 7.21001e-06 [shard]: 2.17999e-06 [meta_shard_fg_expand]: 2.29999e-06 [shard_inline]: 5.10001e-06 [merge_send_recv]: 9.27999e-06 [auto_parallel]: 7.03e-06 [parallel]: 4.266e-05 [flash_sp]: 9.77001e-06 [merge_comm]: 4.18999e-06 [allreduce_fusion]: 3.49001e-06 [matmul_add_comm_reduction]: 1.059e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 7.85e-06 [virtual_dataset]: 7.05998e-06 [get_grad_eliminate_]: 6.19999e-06 [virtual_output]: 5.82999e-06 [merge_forward]: 4.18999e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 9.87999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.6e-05 [merge_recompute_call_nodes]: 1.62001e-06 [before_grad]: 1.094e-05 [set_forward_comm_id_for_comm_node_pass]: 3.95e-06 [meta_fg_expand]: 2.94999e-06 [flash_sp_send_recv_attached]: 3.23e-06 [receive_attached]: 2.74999e-06 [after_resolve]: 9.78998e-06 [a_after_grad]: 9.31e-06 [renormalize]: 0.00149561 [add_forward_monad_depend]: 8.17998e-06 [auto_monad_grad]: 3.05998e-06 [auto_monad_eliminator]: 2.004e-05 [cse]: 3.43e-05 [a_3]: 4.623e-05 [Cycle 2]: 0.00058172, [45] [expand_dump_flag]: 2.63998e-06 [switch_simplify]: 7.2e-06 [loop_unroll]: 5.09e-06 [a_1]: 9.636e-05 [with_stream_mark]: 1.654e-05 [recompute_prepare]: 5.56998e-06 [updatestate_depend_eliminate]: 3.35e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 2.74001e-06 [parameter_eliminate]: 1.99999e-06 [a_2]: 5.613e-05 [accelerated_algorithm]: 4.99998e-06 [shard]: 2.09e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 4.76002e-06 [merge_send_recv]: 6.09999e-06 [auto_parallel]: 8.09002e-06 [parallel]: 7.43e-06 [flash_sp]: 3.72998e-06 [merge_comm]: 2.64001e-06 [allreduce_fusion]: 2.91999e-06 [matmul_add_comm_reduction]: 7.84997e-06 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 6.71999e-06 [virtual_dataset]: 4.63999e-06 [get_grad_eliminate_]: 4.62998e-06 [virtual_output]: 4.52e-06 [merge_forward]: 3.01001e-06 [cell_reuse_recompute_pass]: 2.49999e-06 [offload_activation]: 8.49002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.491e-05 [merge_recompute_call_nodes]: 1.15999e-06 [before_grad]: 8.42e-06 [set_forward_comm_id_for_comm_node_pass]: 3.14999e-06 [meta_fg_expand]: 2.19001e-06 [flash_sp_send_recv_attached]: 1.19e-06 [receive_attached]: 1.71e-06 [after_resolve]: 1.015e-05 [a_after_grad]: 7.31999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.77999e-06 [auto_monad_grad]: 1.60001e-06 [auto_monad_eliminator]: 6.67002e-06 [cse]: 1.553e-05 [a_3]: 2.875e-05 [py_interpret_to_execute_after_opt_a]: 7.21001e-06 [slice_cell_reuse_recomputed_activation]: 1.91003e-06 [rewriter_after_opt_a]: 1.808e-05 [convert_after_rewriter]: 1.87001e-06 [order_py_execute_after_rewriter]: 1.20999e-06 [mutable_eliminate]: 0.00079608 [opt_b]: 0.0001973, [1] [Cycle 1]: 0.00018868, [7] [b_1]: 0.0001 [b_2]: 7.03e-06 [updatestate_depend_eliminate]: 8.10999e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 2.81e-06 [renormalize]: 8.60018e-07 [cse]: 2.375e-05 [optimize_parallel_all_gather_comm]: 2.037e-05 [overlap_param_gather]: 2.23998e-06 [cconv]: 3.504e-05 [loop_unroll]: 0.00052378 [opt_after_cconv]: 0.00010861, [1] [Cycle 1]: 0.0001013, [7] [c_1]: 2.368e-05 [parameter_eliminate]: 5.47001e-06 [updatestate_depend_eliminate]: 6.91001e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 2.49999e-06 [cse]: 2.088e-05 [renormalize]: 1.05001e-06 [remove_dup_value]: 1.52e-05 [tuple_transform]: 7.372e-05, [1] [Cycle 1]: 6.828e-05, [4] [d_1]: 4.028e-05 [none_parameter_eliminate]: 1.63002e-06 [renormalize]: 3.9002e-07 [switch_simplify]: 5.57001e-06 [partial_unused_args_eliminate]: 2.20002e-06 [add_recomputation]: 6.472e-05 [cse_after_recomputation]: 2.218e-05, [1] [Cycle 1]: 1.689e-05, [1] [cse]: 1.083e-05 [environ_conv]: 5.86998e-06 [swap_dp_allreduce_reducescatter]: 5.49998e-06 [bias_add_comm_swap]: 3.38999e-06 [label_micro_interleaved_index]: 6.56e-06 [label_fine_grained_interleaved_index]: 3.01999e-06 [merge_cast_opt]: 1.45001e-06 [slice_recompute_activation]: 2.23002e-06 [micro_interleaved_order_control]: 2.66999e-06 [assign_add_opt]: 1.45001e-06 [ForceFp32Comm]: 7.2e-07 [remove_cast_before_assign_add]: 1.15999e-06 [full_micro_interleaved_order_control]: 2.46e-06 [reorder_send_recv_between_fp_bp]: 2.74999e-06 [comm_op_add_attrs]: 1.39998e-06 [add_comm_op_reuse_tag]: 1.33002e-06 [interleave_split_concat_branches]: 1.46002e-06 [interleave_parallel_branches]: 1.30001e-06 [overlap_opt_shard_in_pipeline]: 6.28e-06 [overlap_opt_shard_grad_in_pipeline]: 1.62001e-06 [control_data_broadcast_order]: 1.288e-05 [grouped_pairwise_exchange_alltoall]: 1.44998e-06 [offloading_packed_experts]: 3.64002e-06 [overlap_recompute_and_grad_model_parallel]: 5.44e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.45001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49e-06 [overlap_recompute_comm]: 2.78e-06 [overlap_grad_ring_attention]: 5.05999e-06 [overlap_grad_flash_sp]: 2.275e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.53998e-06 [split_layernorm_comm]: 1.71e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 8.077e-05, [1] [Cycle 1]: 7.651e-05, [6] [build]: 4.48001e-06 [elim_shapecalc]: 1.071e-05 [elim_not_effective]: 1.22e-05 [opt_reshape]: 5.97999e-06 [fold_const_symbol]: 8.70001e-06 [renormalize]: 1.69995e-07 [detach_backward]: 2.57001e-06 [pipeline_parallel_scheduler]: 1.44e-06 [auto_monad_reorder]: 1.709e-05 [get_jit_bprop_graph]: 2.14e-06 [rewriter_after_jit_bprop_graph]: 5.77001e-06 [opt_after_jit_grad]: 0.00055416 [validate]: 4.283e-05 Sums bootstrap : 0.000645s : 0.60% type_inference : 0.100425s : 93.73% event_method : 0.000022s : 0.02% auto_monad : 0.000080s : 0.07% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000005s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000047s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000012s : 0.01% optimize.rewriter_before_opt_a : 0.000225s : 0.21% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000049s : 0.05% optimize.opt_a.loop_unroll : 0.000034s : 0.03% optimize.opt_a.a_1 : 0.000781s : 0.73% optimize.opt_a.with_stream_mark : 0.000041s : 0.04% optimize.opt_a.recompute_prepare : 0.000017s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000130s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000010s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.01% optimize.opt_a.auto_parallel : 0.000015s : 0.01% optimize.opt_a.parallel : 0.000050s : 0.05% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.01% optimize.opt_a.virtual_dataset : 0.000012s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.01% optimize.opt_a.virtual_output : 0.000010s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000020s : 0.02% optimize.opt_a.a_after_grad : 0.000017s : 0.02% optimize.opt_a.renormalize : 0.001496s : 1.40% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.02% optimize.opt_a.cse : 0.000050s : 0.05% optimize.opt_a.a_3 : 0.000075s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000018s : 0.02% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000796s : 0.74% optimize.opt_b.b_1 : 0.000100s : 0.09% optimize.opt_b.b_2 : 0.000007s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000035s : 0.03% optimize.loop_unroll : 0.000524s : 0.49% optimize.opt_after_cconv.c_1 : 0.000024s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000040s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000065s : 0.06% optimize.cse_after_recomputation.cse : 0.000011s : 0.01% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000006s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000023s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000017s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000554s : 0.52% validate : 0.000043s : 0.04% Time group info: ------[substitution.] 0.000212 24 1.12% : 0.000002s : 2: substitution.elim_not_effective 0.58% : 0.000001s : 2: substitution.fold_const_symbol 2.59% : 0.000006s : 3: substitution.graph_param_transform 82.32% : 0.000175s : 5: substitution.inline 2.14% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.71% : 0.000006s : 4: substitution.remove_not_recompute_node 2.18% : 0.000005s : 2: substitution.replace_old_param 6.36% : 0.000013s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.100330 2 98.70% : 0.099024s : 1: type_inference.infer 1.30% : 0.001306s : 1: type_inference.specialize ------[replace.] 0.000070 7 77.01% : 0.000054s : 5: replace.inline 22.99% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 7 93.39% : 0.000172s : 5: match.inline 6.61% : 0.000012s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000183 1031 1.06% : 0.000002s : 11: predicate.accumulaten_eliminater 1.01% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 6: predicate.addn_check_dump 1.04% : 0.000002s : 11: predicate.addn_zero_filter 0.77% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.28% : 0.000004s : 17: predicate.arithmetic_simplify 0.99% : 0.000002s : 11: predicate.cast_eliminate 0.55% : 0.000001s : 6: predicate.check_bprop_eliminate 0.45% : 0.000001s : 6: predicate.compare_switch_simplify 0.14% : 0.000000s : 3: predicate.const_output_eliminate 0.65% : 0.000001s : 6: predicate.depend_value_elim 0.87% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.40% : 0.000003s : 11: predicate.dict_get_item_eliminator 1.01% : 0.000002s : 11: predicate.dict_set_item_eliminator 0.97% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 3: predicate.elim_not_effective 0.34% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 14: predicate.environ_get_depend_swap 2.00% : 0.000004s : 20: predicate.environ_get_eliminate 1.02% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.37% : 0.000003s : 18: predicate.exchange_switch_depend_value 2.31% : 0.000004s : 18: predicate.float_depend_g_call 0.52% : 0.000001s : 6: predicate.float_environ_get_switch 0.90% : 0.000002s : 9: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 3: predicate.fold_const_symbol 0.63% : 0.000001s : 6: predicate.get_grad_eliminate 0.29% : 0.000001s : 3: predicate.graph_param_transform 0.48% : 0.000001s : 6: predicate.incorporate_call 0.43% : 0.000001s : 6: predicate.incorporate_call_switch 6.42% : 0.000012s : 47: predicate.inline 0.72% : 0.000001s : 6: predicate.inline_without_move 0.27% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 6: predicate.less_batch_normalization 1.87% : 0.000003s : 19: predicate.list_to_tuple_eliminator_ 2.57% : 0.000005s : 30: predicate.load_eliminater 1.43% : 0.000003s : 3: predicate.loop_unroll_after_grad 2.67% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.69% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.70% : 0.000001s : 6: predicate.merge_addn 0.48% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 11: predicate.minmaximum_grad 1.74% : 0.000003s : 3: predicate.mutable_eliminate 0.37% : 0.000001s : 3: predicate.opt_reshape 0.54% : 0.000001s : 3: predicate.parallel_virtual_node 1.89% : 0.000003s : 18: predicate.partial_defer_inline 1.28% : 0.000002s : 16: predicate.partial_eliminate 0.94% : 0.000002s : 11: predicate.print_const_string_wrapper 0.49% : 0.000001s : 6: predicate.reduce_all_const_elim 1.18% : 0.000002s : 11: predicate.reduce_eliminate 2.59% : 0.000005s : 30: predicate.redundant_stop_gradient_eliminater 0.55% : 0.000001s : 6: predicate.remove_not_recompute_node 1.14% : 0.000002s : 19: predicate.replace_applicator 0.50% : 0.000001s : 6: predicate.replace_old_param 0.22% : 0.000000s : 3: predicate.reset_defer_inline 0.87% : 0.000002s : 11: predicate.reshape_eliminate 0.76% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 3: predicate.row_tensor_eliminate 0.97% : 0.000002s : 6: predicate.same_eliminate 0.49% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.80% : 0.000001s : 6: predicate.shard_identity_eliminate 0.62% : 0.000001s : 6: predicate.special_op_eliminate 0.56% : 0.000001s : 6: predicate.specialize_transform 0.90% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.65% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 18: predicate.switch_defer_inline 2.09% : 0.000004s : 24: predicate.switch_layer_defer_inline 5.31% : 0.000010s : 61: predicate.switch_simplify 0.98% : 0.000002s : 11: predicate.tile_eliminate 0.85% : 0.000002s : 11: predicate.transpose_eliminate 1.45% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.37% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.23% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.60% : 0.000007s : 25: predicate.tuple_list_get_item_eliminator 1.33% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.02% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.66% : 0.000003s : 19: predicate.tuple_to_list_eliminator_ 2.18% : 0.000004s : 30: predicate.updatestate_pure_node_eliminater 2.71% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.42% : 0.000001s : 3: predicate.value_based_eliminate 0.91% : 0.000002s : 6: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 6: predicate.virtual_output_eliminate 0.22% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000925 12 38.94% : 0.000360s : 5: func_graph_cloner_run.FuncGraphClonerGraph 61.06% : 0.000565s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.210626 192 0.00% : 0.000004s : 1: ForceFp32Comm 22.20% : 0.046766s : 1: add_attr 22.18% : 0.046711s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000071s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000086s : 1: auto_monad 0.01% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.32% : 0.000680s : 1: bootstrap 0.02% : 0.000039s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.01% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000009s : 1: environ_conv 0.01% : 0.000028s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.25% : 0.000532s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.38% : 0.000810s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 0.55% : 0.001155s : 78: opt.transform.opt_a 0.01% : 0.000023s : 1: opt.transform.opt_after_cconv 0.01% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000080s : 28: opt.transform.opt_b 0.02% : 0.000044s : 2: opt.transform.opt_trans_graph 0.02% : 0.000033s : 4: opt.transform.symbol_engine_opt 1.70% : 0.003590s : 1: opt_a 0.05% : 0.000112s : 1: opt_after_cconv 0.27% : 0.000565s : 1: opt_after_jit_grad 0.10% : 0.000202s : 1: opt_b 2.94% : 0.006196s : 1: optimize 0.01% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000009s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000052s : 1: pre_auto_parallel 0.01% : 0.000018s : 1: py_interpret_to_execute 0.01% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.36% : 0.000760s : 1: renormalize.infer 0.34% : 0.000724s : 1: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000021s : 1: rewriter_after_opt_a 0.11% : 0.000233s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000084s : 1: symbol_engine_optimizer 0.04% : 0.000077s : 1: tuple_transform 47.70% : 0.100459s : 1: type_inference TotalTime = 0.204576, [21] [bootstrap]: 0.0008193 [type_inference]: 0.132386 [event_method]: 2.276e-05 [auto_monad]: 8.045e-05 [graph_reusing]: 6.24999e-06 [inline]: 3.04001e-06 [add_attr]: 0.0642024, [1] [add_attr_with_inline]: 0.0641855, [1] [Cycle 1]: 0.00014456, [2] [tag_attr]: 2.958e-05 [meta_addattr_fg_expand]: 6.09001e-06 [parallel-infer-symbol]: 4.74998e-06 [pre_auto_parallel]: 4.699e-05 [insert-virtual-dataset]: 3.23e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.56e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.0060714, [53] [py_interpret_to_execute]: 8.13001e-06 [rewriter_before_opt_a]: 0.00022459 [opt_a]: 0.00331108, [2] [Cycle 1]: 0.00271591, [45] [expand_dump_flag]: 4.22e-06 [switch_simplify]: 4.183e-05 [loop_unroll]: 7.011e-05 [a_1]: 0.00067694 [with_stream_mark]: 2.356e-05 [recompute_prepare]: 1.047e-05 [updatestate_depend_eliminate]: 3.73999e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 2.77002e-06 [parameter_eliminate]: 2.31998e-06 [a_2]: 7.147e-05 [accelerated_algorithm]: 7.23999e-06 [shard]: 2.26e-06 [meta_shard_fg_expand]: 2.12001e-06 [shard_inline]: 5.60001e-06 [merge_send_recv]: 1.028e-05 [auto_parallel]: 6.93e-06 [parallel]: 0.0002071 [flash_sp]: 1.224e-05 [merge_comm]: 4.95001e-06 [allreduce_fusion]: 3.76999e-06 [matmul_add_comm_reduction]: 1.184e-05 [allreduce_slice_to_reducescatter]: 8.10018e-07 [virtual_shard_identity]: 1.063e-05 [virtual_dataset]: 6.91001e-06 [get_grad_eliminate_]: 6.34999e-06 [virtual_output]: 6.90998e-06 [merge_forward]: 5.56e-06 [cell_reuse_recompute_pass]: 1.39998e-06 [offload_activation]: 1.18e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.56e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.047e-05 [set_forward_comm_id_for_comm_node_pass]: 3.70998e-06 [meta_fg_expand]: 3.06001e-06 [flash_sp_send_recv_attached]: 2.71999e-06 [receive_attached]: 2.53003e-06 [after_resolve]: 1.037e-05 [a_after_grad]: 9.12001e-06 [renormalize]: 0.001029 [add_forward_monad_depend]: 7.23e-06 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 1.908e-05 [cse]: 3.071e-05 [a_3]: 4.682e-05 [Cycle 2]: 0.00058199, [45] [expand_dump_flag]: 2.49001e-06 [switch_simplify]: 7.53999e-06 [loop_unroll]: 5.30999e-06 [a_1]: 9.71e-05 [with_stream_mark]: 1.422e-05 [recompute_prepare]: 5.62999e-06 [updatestate_depend_eliminate]: 3.30998e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 2.50002e-06 [a_2]: 5.701e-05 [accelerated_algorithm]: 5.09998e-06 [shard]: 1.83002e-06 [meta_shard_fg_expand]: 1.62999e-06 [shard_inline]: 4.55999e-06 [merge_send_recv]: 5.37999e-06 [auto_parallel]: 6.22001e-06 [parallel]: 6.99001e-06 [flash_sp]: 3.38999e-06 [merge_comm]: 2.91999e-06 [allreduce_fusion]: 3.21999e-06 [matmul_add_comm_reduction]: 6.59999e-06 [allreduce_slice_to_reducescatter]: 8.49977e-07 [virtual_shard_identity]: 6.07001e-06 [virtual_dataset]: 5.34998e-06 [get_grad_eliminate_]: 5.20001e-06 [virtual_output]: 5.07e-06 [merge_forward]: 3.58999e-06 [cell_reuse_recompute_pass]: 2.63e-06 [offload_activation]: 7.83001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.274e-05 [merge_recompute_call_nodes]: 1.19998e-06 [before_grad]: 9.57999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 2.54999e-06 [flash_sp_send_recv_attached]: 1.74998e-06 [receive_attached]: 2.01e-06 [after_resolve]: 9.03002e-06 [a_after_grad]: 7.85998e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.73002e-06 [auto_monad_grad]: 8.59989e-07 [auto_monad_eliminator]: 6.74001e-06 [cse]: 1.32e-05 [a_3]: 3.049e-05 [py_interpret_to_execute_after_opt_a]: 6.96001e-06 [slice_cell_reuse_recomputed_activation]: 2.02999e-06 [rewriter_after_opt_a]: 2.062e-05 [convert_after_rewriter]: 1.79e-06 [order_py_execute_after_rewriter]: 1.19998e-06 [mutable_eliminate]: 0.0008089 [opt_b]: 0.00027674, [1] [Cycle 1]: 0.0002676, [7] [b_1]: 0.00010303 [b_2]: 7.61999e-06 [updatestate_depend_eliminate]: 7.696e-05 [updatestate_assign_eliminate]: 3.57002e-06 [updatestate_loads_eliminate]: 2.66e-06 [renormalize]: 6.89994e-07 [cse]: 2.288e-05 [optimize_parallel_all_gather_comm]: 2.421e-05 [overlap_param_gather]: 2.37999e-06 [cconv]: 3.43e-05 [loop_unroll]: 0.00054178 [opt_after_cconv]: 0.00010951, [1] [Cycle 1]: 0.00010262, [7] [c_1]: 2.653e-05 [parameter_eliminate]: 5.56e-06 [updatestate_depend_eliminate]: 6.09999e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.21998e-06 [cse]: 1.951e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 1.413e-05 [tuple_transform]: 7.364e-05, [1] [Cycle 1]: 6.819e-05, [4] [d_1]: 3.938e-05 [none_parameter_eliminate]: 1.79e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 5.94e-06 [partial_unused_args_eliminate]: 2.32001e-06 [add_recomputation]: 0.00010922 [cse_after_recomputation]: 2.253e-05, [1] [Cycle 1]: 1.768e-05, [1] [cse]: 1.163e-05 [environ_conv]: 5.51002e-06 [swap_dp_allreduce_reducescatter]: 5.90002e-06 [bias_add_comm_swap]: 3.93001e-06 [label_micro_interleaved_index]: 5.61e-06 [label_fine_grained_interleaved_index]: 2.96001e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 2.61e-06 [micro_interleaved_order_control]: 2.76999e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 9.70002e-07 [remove_cast_before_assign_add]: 1.30999e-06 [full_micro_interleaved_order_control]: 3.11001e-06 [reorder_send_recv_between_fp_bp]: 2.96001e-06 [comm_op_add_attrs]: 1.09998e-06 [add_comm_op_reuse_tag]: 1.05001e-06 [interleave_split_concat_branches]: 1.49998e-06 [interleave_parallel_branches]: 1.30001e-06 [overlap_opt_shard_in_pipeline]: 4.22e-05 [overlap_opt_shard_grad_in_pipeline]: 2.05002e-06 [control_data_broadcast_order]: 1.529e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 3.97e-06 [overlap_recompute_and_grad_model_parallel]: 5.14e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.67001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.70001e-06 [overlap_recompute_comm]: 2.83e-06 [overlap_grad_ring_attention]: 4.53999e-06 [overlap_grad_flash_sp]: 2.41e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 2.33002e-06 [split_layernorm_comm]: 1.71e-06 [handle_group_info]: 1.79998e-06 [symbol_engine_optimizer]: 7.951e-05, [1] [Cycle 1]: 7.458e-05, [6] [build]: 4.03001e-06 [elim_shapecalc]: 1.031e-05 [elim_not_effective]: 1.169e-05 [opt_reshape]: 6.17999e-06 [fold_const_symbol]: 8.55001e-06 [renormalize]: 1.60013e-07 [detach_backward]: 1.82999e-06 [pipeline_parallel_scheduler]: 1.44e-06 [auto_monad_reorder]: 2.047e-05 [get_jit_bprop_graph]: 2.46e-06 [rewriter_after_jit_bprop_graph]: 5.97999e-06 [opt_after_jit_grad]: 0.00056962 [validate]: 5.034e-05 Sums bootstrap : 0.000819s : 0.59% type_inference : 0.132386s : 95.09% event_method : 0.000023s : 0.02% auto_monad : 0.000080s : 0.06% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000047s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.01% optimize.rewriter_before_opt_a : 0.000225s : 0.16% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000049s : 0.04% optimize.opt_a.loop_unroll : 0.000075s : 0.05% optimize.opt_a.a_1 : 0.000774s : 0.56% optimize.opt_a.with_stream_mark : 0.000038s : 0.03% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000128s : 0.09% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000010s : 0.01% optimize.opt_a.merge_send_recv : 0.000016s : 0.01% optimize.opt_a.auto_parallel : 0.000013s : 0.01% optimize.opt_a.parallel : 0.000214s : 0.15% optimize.opt_a.flash_sp : 0.000016s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.01% optimize.opt_a.virtual_dataset : 0.000012s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.01% optimize.opt_a.a_after_grad : 0.000017s : 0.01% optimize.opt_a.renormalize : 0.001029s : 0.74% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.02% optimize.opt_a.cse : 0.000044s : 0.03% optimize.opt_a.a_3 : 0.000077s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000021s : 0.01% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000809s : 0.58% optimize.opt_b.b_1 : 0.000103s : 0.07% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000077s : 0.06% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.02% optimize.loop_unroll : 0.000542s : 0.39% optimize.opt_after_cconv.c_1 : 0.000027s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.01% optimize.tuple_transform.d_1 : 0.000039s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000109s : 0.08% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000042s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000024s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000002s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000020s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000570s : 0.41% validate : 0.000050s : 0.04% Time group info: ------[substitution.] 0.000280 24 0.62% : 0.000002s : 2: substitution.elim_not_effective 0.49% : 0.000001s : 2: substitution.fold_const_symbol 2.09% : 0.000006s : 3: substitution.graph_param_transform 87.16% : 0.000244s : 5: substitution.inline 1.51% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.94% : 0.000005s : 4: substitution.remove_not_recompute_node 1.57% : 0.000004s : 2: substitution.replace_old_param 4.63% : 0.000013s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.132306 2 99.13% : 0.131152s : 1: type_inference.infer 0.87% : 0.001153s : 1: type_inference.specialize ------[replace.] 0.000068 7 77.88% : 0.000053s : 5: replace.inline 22.12% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000253 7 95.32% : 0.000241s : 5: match.inline 4.68% : 0.000012s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000176 1031 0.90% : 0.000002s : 11: predicate.accumulaten_eliminater 0.99% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 6: predicate.addn_check_dump 0.93% : 0.000002s : 11: predicate.addn_zero_filter 0.87% : 0.000002s : 11: predicate.adjust_all_reduce_mul_add 2.26% : 0.000004s : 17: predicate.arithmetic_simplify 0.92% : 0.000002s : 11: predicate.cast_eliminate 0.55% : 0.000001s : 6: predicate.check_bprop_eliminate 0.49% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.58% : 0.000001s : 6: predicate.depend_value_elim 0.87% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.13% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 11: predicate.dict_set_item_eliminator 1.11% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 3: predicate.elim_not_effective 0.37% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000002s : 14: predicate.environ_add_const_eliminate 0.99% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 14: predicate.environ_get_depend_swap 1.65% : 0.000003s : 20: predicate.environ_get_eliminate 1.09% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.38% : 0.000002s : 18: predicate.exchange_switch_depend_value 2.45% : 0.000004s : 18: predicate.float_depend_g_call 0.51% : 0.000001s : 6: predicate.float_environ_get_switch 0.67% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 3: predicate.fold_const_symbol 0.63% : 0.000001s : 6: predicate.get_grad_eliminate 0.24% : 0.000000s : 3: predicate.graph_param_transform 0.50% : 0.000001s : 6: predicate.incorporate_call 0.42% : 0.000001s : 6: predicate.incorporate_call_switch 6.45% : 0.000011s : 47: predicate.inline 0.72% : 0.000001s : 6: predicate.inline_without_move 0.28% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.82% : 0.000001s : 6: predicate.less_batch_normalization 1.74% : 0.000003s : 19: predicate.list_to_tuple_eliminator_ 2.43% : 0.000004s : 30: predicate.load_eliminater 1.21% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.98% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.51% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 6: predicate.merge_addn 0.49% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.83% : 0.000001s : 11: predicate.minmaximum_grad 1.19% : 0.000002s : 3: predicate.mutable_eliminate 0.66% : 0.000001s : 3: predicate.opt_reshape 0.64% : 0.000001s : 3: predicate.parallel_virtual_node 1.93% : 0.000003s : 18: predicate.partial_defer_inline 1.37% : 0.000002s : 16: predicate.partial_eliminate 0.95% : 0.000002s : 11: predicate.print_const_string_wrapper 0.65% : 0.000001s : 6: predicate.reduce_all_const_elim 1.09% : 0.000002s : 11: predicate.reduce_eliminate 2.29% : 0.000004s : 30: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 6: predicate.remove_not_recompute_node 1.25% : 0.000002s : 19: predicate.replace_applicator 0.53% : 0.000001s : 6: predicate.replace_old_param 0.27% : 0.000000s : 3: predicate.reset_defer_inline 0.94% : 0.000002s : 11: predicate.reshape_eliminate 0.72% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 3: predicate.row_tensor_eliminate 0.99% : 0.000002s : 6: predicate.same_eliminate 0.37% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.88% : 0.000002s : 6: predicate.shard_identity_eliminate 0.67% : 0.000001s : 6: predicate.special_op_eliminate 0.73% : 0.000001s : 6: predicate.specialize_transform 1.03% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.28% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.52% : 0.000003s : 18: predicate.switch_defer_inline 2.00% : 0.000004s : 24: predicate.switch_layer_defer_inline 5.47% : 0.000010s : 61: predicate.switch_simplify 0.90% : 0.000002s : 11: predicate.tile_eliminate 1.07% : 0.000002s : 11: predicate.transpose_eliminate 1.52% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000006s : 25: predicate.tuple_list_get_item_eliminator 1.57% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.14% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.54% : 0.000003s : 19: predicate.tuple_to_list_eliminator_ 2.45% : 0.000004s : 30: predicate.updatestate_pure_node_eliminater 2.88% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 3: predicate.value_based_eliminate 0.87% : 0.000002s : 6: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 6: predicate.virtual_output_eliminate 0.22% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000795 12 45.67% : 0.000363s : 5: func_graph_cloner_run.FuncGraphClonerGraph 54.33% : 0.000432s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.277160 192 0.00% : 0.000004s : 1: ForceFp32Comm 23.17% : 0.064211s : 1: add_attr 23.16% : 0.064192s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000114s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.03% : 0.000087s : 1: auto_monad 0.01% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.34% : 0.000941s : 1: bootstrap 0.01% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.01% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000008s : 1: environ_conv 0.01% : 0.000029s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.20% : 0.000553s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.30% : 0.000822s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000018s : 1: opt.transform.mutable_eliminate 0.43% : 0.001194s : 78: opt.transform.opt_a 0.01% : 0.000025s : 1: opt.transform.opt_after_cconv 0.01% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000082s : 28: opt.transform.opt_b 0.02% : 0.000043s : 2: opt.transform.opt_trans_graph 0.01% : 0.000033s : 4: opt.transform.symbol_engine_opt 1.20% : 0.003314s : 1: opt_a 0.04% : 0.000113s : 1: opt_after_cconv 0.21% : 0.000583s : 1: opt_after_jit_grad 0.10% : 0.000281s : 1: opt_b 2.19% : 0.006078s : 1: optimize 0.01% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000046s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000052s : 1: pre_auto_parallel 0.00% : 0.000011s : 1: py_interpret_to_execute 0.00% : 0.000010s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000017s : 1: remove_dup_value 0.19% : 0.000515s : 1: renormalize.infer 0.18% : 0.000502s : 1: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000024s : 1: rewriter_after_opt_a 0.08% : 0.000232s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000082s : 1: symbol_engine_optimizer 0.03% : 0.000077s : 1: tuple_transform 47.78% : 0.132415s : 1: type_inference TotalTime = 0.0798662, [21] [bootstrap]: 0.00062994 [type_inference]: 0.0132167 [event_method]: 2.298e-05 [auto_monad]: 8.065e-05 [graph_reusing]: 5.87999e-06 [inline]: 3.72002e-06 [add_attr]: 0.0580598, [1] [add_attr_with_inline]: 0.0580434, [1] [Cycle 1]: 9.468e-05, [2] [tag_attr]: 3.061e-05 [meta_addattr_fg_expand]: 5.64998e-06 [parallel-infer-symbol]: 3.96001e-06 [pre_auto_parallel]: 4.973e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 6.09987e-07 [dataset_repeat_opt]: 2.43e-06 [pipeline_split]: 2.01998e-06 [optimize]: 0.00673265, [53] [py_interpret_to_execute]: 8.97e-06 [rewriter_before_opt_a]: 0.00021166 [opt_a]: 0.00384337, [2] [Cycle 1]: 0.00315327, [45] [expand_dump_flag]: 5.91998e-06 [switch_simplify]: 4.382e-05 [loop_unroll]: 2.857e-05 [a_1]: 0.00071447 [with_stream_mark]: 2.392e-05 [recompute_prepare]: 1.285e-05 [updatestate_depend_eliminate]: 4.84998e-06 [updatestate_assign_eliminate]: 3.61999e-06 [updatestate_loads_eliminate]: 2.66e-06 [parameter_eliminate]: 2.56e-06 [a_2]: 7.654e-05 [accelerated_algorithm]: 6.89999e-06 [shard]: 1.90001e-06 [meta_shard_fg_expand]: 2.65002e-06 [shard_inline]: 5.94e-06 [merge_send_recv]: 1.028e-05 [auto_parallel]: 8.07e-06 [parallel]: 2.007e-05 [flash_sp]: 9.81e-06 [merge_comm]: 4.15999e-06 [allreduce_fusion]: 3.93999e-06 [matmul_add_comm_reduction]: 1.092e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 8.66002e-06 [virtual_dataset]: 8.13999e-06 [get_grad_eliminate_]: 6.98998e-06 [virtual_output]: 6.96999e-06 [merge_forward]: 4.55001e-06 [cell_reuse_recompute_pass]: 1.64998e-06 [offload_activation]: 1.32e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.573e-05 [merge_recompute_call_nodes]: 1.76003e-06 [before_grad]: 1.233e-05 [set_forward_comm_id_for_comm_node_pass]: 3.58999e-06 [meta_fg_expand]: 3.44001e-06 [flash_sp_send_recv_attached]: 2.36e-06 [receive_attached]: 3.27002e-06 [after_resolve]: 9.97999e-06 [a_after_grad]: 1.049e-05 [renormalize]: 0.00148596 [add_forward_monad_depend]: 9.41998e-06 [auto_monad_grad]: 3.29001e-06 [auto_monad_eliminator]: 2.069e-05 [cse]: 3.945e-05 [a_3]: 5.671e-05 [Cycle 2]: 0.00067464, [45] [expand_dump_flag]: 1.47e-05 [switch_simplify]: 8.54e-06 [loop_unroll]: 6.02999e-06 [a_1]: 0.00011361 [with_stream_mark]: 1.881e-05 [recompute_prepare]: 5.75001e-06 [updatestate_depend_eliminate]: 3.31999e-06 [updatestate_assign_eliminate]: 2.86e-06 [updatestate_loads_eliminate]: 3.08998e-06 [parameter_eliminate]: 2.21e-06 [a_2]: 6.303e-05 [accelerated_algorithm]: 5.73997e-06 [shard]: 2.16998e-06 [meta_shard_fg_expand]: 2.43e-06 [shard_inline]: 5.14e-06 [merge_send_recv]: 8.12998e-06 [auto_parallel]: 8.04002e-06 [parallel]: 9.75002e-06 [flash_sp]: 4.06001e-06 [merge_comm]: 2.73e-06 [allreduce_fusion]: 3.53e-06 [matmul_add_comm_reduction]: 8.52998e-06 [allreduce_slice_to_reducescatter]: 5.59987e-07 [virtual_shard_identity]: 8.13999e-06 [virtual_dataset]: 5.44998e-06 [get_grad_eliminate_]: 5.10999e-06 [virtual_output]: 5.05999e-06 [merge_forward]: 4.03001e-06 [cell_reuse_recompute_pass]: 3.51001e-06 [offload_activation]: 1.01e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.814e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 9.20999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.3e-06 [meta_fg_expand]: 2.57001e-06 [flash_sp_send_recv_attached]: 1.39e-06 [receive_attached]: 1.87999e-06 [after_resolve]: 1.204e-05 [a_after_grad]: 8.69e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.09e-06 [auto_monad_grad]: 1.82001e-06 [auto_monad_eliminator]: 9.17001e-06 [cse]: 1.826e-05 [a_3]: 3.054e-05 [py_interpret_to_execute_after_opt_a]: 1.081e-05 [slice_cell_reuse_recomputed_activation]: 2.31e-06 [rewriter_after_opt_a]: 2.069e-05 [convert_after_rewriter]: 2.19001e-06 [order_py_execute_after_rewriter]: 1.28002e-06 [mutable_eliminate]: 0.00092398 [opt_b]: 0.00021554, [1] [Cycle 1]: 0.00020573, [7] [b_1]: 0.00011451 [b_2]: 7.26999e-06 [updatestate_depend_eliminate]: 8.47e-06 [updatestate_assign_eliminate]: 2.88e-06 [updatestate_loads_eliminate]: 2.44001e-06 [renormalize]: 8.89995e-07 [cse]: 2.645e-05 [optimize_parallel_all_gather_comm]: 2.016e-05 [overlap_param_gather]: 2.13002e-06 [cconv]: 3.938e-05 [loop_unroll]: 0.00061946 [opt_after_cconv]: 0.0001239, [1] [Cycle 1]: 0.0001148, [7] [c_1]: 2.816e-05 [parameter_eliminate]: 5.50001e-06 [updatestate_depend_eliminate]: 6.93e-06 [updatestate_assign_eliminate]: 2.53003e-06 [updatestate_loads_eliminate]: 2.78e-06 [cse]: 2.449e-05 [renormalize]: 5.69999e-07 [remove_dup_value]: 1.518e-05 [tuple_transform]: 9.496e-05, [1] [Cycle 1]: 8.771e-05, [4] [d_1]: 5.102e-05 [none_parameter_eliminate]: 1.99e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 7.11001e-06 [partial_unused_args_eliminate]: 2.10002e-06 [add_recomputation]: 6.454e-05 [cse_after_recomputation]: 2.465e-05, [1] [Cycle 1]: 1.868e-05, [1] [cse]: 1.099e-05 [environ_conv]: 5.53002e-06 [swap_dp_allreduce_reducescatter]: 6.76e-06 [bias_add_comm_swap]: 3.80998e-06 [label_micro_interleaved_index]: 6.66e-06 [label_fine_grained_interleaved_index]: 3.31999e-06 [merge_cast_opt]: 1.50999e-06 [slice_recompute_activation]: 2.27999e-06 [micro_interleaved_order_control]: 3.13e-06 [assign_add_opt]: 1.28002e-06 [ForceFp32Comm]: 1.20001e-06 [remove_cast_before_assign_add]: 9.09989e-07 [full_micro_interleaved_order_control]: 2.43e-06 [reorder_send_recv_between_fp_bp]: 3.21999e-06 [comm_op_add_attrs]: 1.37e-06 [add_comm_op_reuse_tag]: 1.02998e-06 [interleave_split_concat_branches]: 1.40001e-06 [interleave_parallel_branches]: 1.77001e-06 [overlap_opt_shard_in_pipeline]: 1.69e-06 [overlap_opt_shard_grad_in_pipeline]: 1.75001e-06 [control_data_broadcast_order]: 1.607e-05 [grouped_pairwise_exchange_alltoall]: 1.60999e-06 [offloading_packed_experts]: 4.63001e-06 [overlap_recompute_and_grad_model_parallel]: 6.16e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.49e-06 [overlap_recompute_allgather_and_fa_grad]: 1.72999e-06 [overlap_recompute_comm]: 2.34999e-06 [overlap_grad_ring_attention]: 4.61002e-06 [overlap_grad_flash_sp]: 2.335e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.58998e-06 [split_layernorm_comm]: 2.09e-06 [handle_group_info]: 1.18001e-06 [symbol_engine_optimizer]: 9.786e-05, [1] [Cycle 1]: 9.192e-05, [6] [build]: 4.40999e-06 [elim_shapecalc]: 1.199e-05 [elim_not_effective]: 1.381e-05 [opt_reshape]: 8.10999e-06 [fold_const_symbol]: 1.042e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.19999e-06 [pipeline_parallel_scheduler]: 1.67001e-06 [auto_monad_reorder]: 1.766e-05 [get_jit_bprop_graph]: 2.44999e-06 [rewriter_after_jit_bprop_graph]: 6.81999e-06 [opt_after_jit_grad]: 0.00076871 [validate]: 5.41e-05 Sums bootstrap : 0.000630s : 3.07% type_inference : 0.013217s : 64.38% event_method : 0.000023s : 0.11% auto_monad : 0.000081s : 0.39% graph_reusing : 0.000006s : 0.03% inline : 0.000004s : 0.02% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.15% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.03% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000050s : 0.24% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000009s : 0.04% optimize.rewriter_before_opt_a : 0.000212s : 1.03% optimize.opt_a.expand_dump_flag : 0.000021s : 0.10% optimize.opt_a.switch_simplify : 0.000052s : 0.26% optimize.opt_a.loop_unroll : 0.000035s : 0.17% optimize.opt_a.a_1 : 0.000828s : 4.03% optimize.opt_a.with_stream_mark : 0.000043s : 0.21% optimize.opt_a.recompute_prepare : 0.000019s : 0.09% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000005s : 0.02% optimize.opt_a.a_2 : 0.000140s : 0.68% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.06% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.02% optimize.opt_a.shard_inline : 0.000011s : 0.05% optimize.opt_a.merge_send_recv : 0.000018s : 0.09% optimize.opt_a.auto_parallel : 0.000016s : 0.08% optimize.opt_a.parallel : 0.000030s : 0.15% optimize.opt_a.flash_sp : 0.000014s : 0.07% optimize.opt_a.merge_comm : 0.000007s : 0.03% optimize.opt_a.allreduce_fusion : 0.000007s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.08% optimize.opt_a.virtual_dataset : 0.000014s : 0.07% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.06% optimize.opt_a.virtual_output : 0.000012s : 0.06% optimize.opt_a.merge_forward : 0.000009s : 0.04% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.03% optimize.opt_a.offload_activation : 0.000023s : 0.11% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.16% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.02% optimize.opt_a.before_grad : 0.000022s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.03% optimize.opt_a.meta_fg_expand : 0.000006s : 0.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.03% optimize.opt_a.after_resolve : 0.000022s : 0.11% optimize.opt_a.a_after_grad : 0.000019s : 0.09% optimize.opt_a.renormalize : 0.001486s : 7.24% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.06% optimize.opt_a.auto_monad_grad : 0.000005s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.15% optimize.opt_a.cse : 0.000058s : 0.28% optimize.opt_a.a_3 : 0.000087s : 0.43% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000021s : 0.10% optimize.convert_after_rewriter : 0.000002s : 0.01% optimize.order_py_execute_after_rewriter : 0.000001s : 0.01% optimize.mutable_eliminate : 0.000924s : 4.50% optimize.opt_b.b_1 : 0.000115s : 0.56% optimize.opt_b.b_2 : 0.000007s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.04% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.13% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.10% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000039s : 0.19% optimize.loop_unroll : 0.000619s : 3.02% optimize.opt_after_cconv.c_1 : 0.000028s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.03% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.12% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.07% optimize.tuple_transform.d_1 : 0.000051s : 0.25% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000065s : 0.31% optimize.cse_after_recomputation.cse : 0.000011s : 0.05% optimize.environ_conv : 0.000006s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.03% optimize.bias_add_comm_swap : 0.000004s : 0.02% optimize.label_micro_interleaved_index : 0.000007s : 0.03% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.02% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.01% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.02% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000002s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000016s : 0.08% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000005s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.03% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000023s : 0.11% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.06% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000018s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000007s : 0.03% opt_after_jit_grad : 0.000769s : 3.74% validate : 0.000054s : 0.26% Time group info: ------[substitution.] 0.000224 24 0.84% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000001s : 2: substitution.fold_const_symbol 2.69% : 0.000006s : 3: substitution.graph_param_transform 82.40% : 0.000185s : 5: substitution.inline 2.08% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.58% : 0.000006s : 4: substitution.remove_not_recompute_node 2.65% : 0.000006s : 2: substitution.replace_old_param 6.12% : 0.000014s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.013127 2 90.93% : 0.011937s : 1: type_inference.infer 9.07% : 0.001191s : 1: type_inference.specialize ------[replace.] 0.000075 7 77.51% : 0.000058s : 5: replace.inline 22.49% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000194 7 93.63% : 0.000181s : 5: match.inline 6.37% : 0.000012s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000202 1031 1.22% : 0.000002s : 11: predicate.accumulaten_eliminater 1.26% : 0.000003s : 3: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 6: predicate.addn_check_dump 1.34% : 0.000003s : 11: predicate.addn_zero_filter 0.78% : 0.000002s : 11: predicate.adjust_all_reduce_mul_add 2.28% : 0.000005s : 17: predicate.arithmetic_simplify 1.23% : 0.000002s : 11: predicate.cast_eliminate 0.53% : 0.000001s : 6: predicate.check_bprop_eliminate 0.52% : 0.000001s : 6: predicate.compare_switch_simplify 0.14% : 0.000000s : 3: predicate.const_output_eliminate 0.48% : 0.000001s : 6: predicate.depend_value_elim 0.85% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.16% : 0.000002s : 11: predicate.dict_get_item_eliminator 1.00% : 0.000002s : 11: predicate.dict_set_item_eliminator 1.08% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 3: predicate.elim_not_effective 0.40% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.25% : 0.000003s : 14: predicate.environ_get_depend_swap 1.54% : 0.000003s : 20: predicate.environ_get_eliminate 0.94% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.28% : 0.000003s : 18: predicate.exchange_switch_depend_value 2.50% : 0.000005s : 18: predicate.float_depend_g_call 0.58% : 0.000001s : 6: predicate.float_environ_get_switch 0.93% : 0.000002s : 9: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 3: predicate.fold_const_symbol 0.65% : 0.000001s : 6: predicate.get_grad_eliminate 0.15% : 0.000000s : 3: predicate.graph_param_transform 0.45% : 0.000001s : 6: predicate.incorporate_call 0.39% : 0.000001s : 6: predicate.incorporate_call_switch 5.47% : 0.000011s : 47: predicate.inline 0.56% : 0.000001s : 6: predicate.inline_without_move 0.26% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.89% : 0.000002s : 6: predicate.less_batch_normalization 1.92% : 0.000004s : 19: predicate.list_to_tuple_eliminator_ 2.31% : 0.000005s : 30: predicate.load_eliminater 1.71% : 0.000003s : 3: predicate.loop_unroll_after_grad 2.55% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.89% : 0.000004s : 17: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 6: predicate.merge_addn 0.42% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.74% : 0.000001s : 11: predicate.minmaximum_grad 2.08% : 0.000004s : 3: predicate.mutable_eliminate 0.42% : 0.000001s : 3: predicate.opt_reshape 0.42% : 0.000001s : 3: predicate.parallel_virtual_node 1.80% : 0.000004s : 18: predicate.partial_defer_inline 1.14% : 0.000002s : 16: predicate.partial_eliminate 0.91% : 0.000002s : 11: predicate.print_const_string_wrapper 0.48% : 0.000001s : 6: predicate.reduce_all_const_elim 1.38% : 0.000003s : 11: predicate.reduce_eliminate 2.36% : 0.000005s : 30: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000001s : 6: predicate.remove_not_recompute_node 1.22% : 0.000002s : 19: predicate.replace_applicator 0.43% : 0.000001s : 6: predicate.replace_old_param 0.24% : 0.000000s : 3: predicate.reset_defer_inline 1.12% : 0.000002s : 11: predicate.reshape_eliminate 0.61% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 3: predicate.row_tensor_eliminate 0.77% : 0.000002s : 6: predicate.same_eliminate 0.33% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 6: predicate.shard_identity_eliminate 0.85% : 0.000002s : 6: predicate.special_op_eliminate 0.61% : 0.000001s : 6: predicate.specialize_transform 1.03% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 1.02% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.23% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.40% : 0.000003s : 18: predicate.switch_defer_inline 1.94% : 0.000004s : 24: predicate.switch_layer_defer_inline 5.06% : 0.000010s : 61: predicate.switch_simplify 0.88% : 0.000002s : 11: predicate.tile_eliminate 1.17% : 0.000002s : 11: predicate.transpose_eliminate 1.55% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.38% : 0.000007s : 25: predicate.tuple_list_get_item_eliminator 1.57% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.72% : 0.000003s : 19: predicate.tuple_to_list_eliminator_ 2.05% : 0.000004s : 30: predicate.updatestate_pure_node_eliminater 2.66% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 3: predicate.value_based_eliminate 0.63% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 6: predicate.virtual_output_eliminate 0.23% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000996 12 37.20% : 0.000371s : 5: func_graph_cloner_run.FuncGraphClonerGraph 62.80% : 0.000626s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.147500 192 0.00% : 0.000004s : 1: ForceFp32Comm 39.37% : 0.058068s : 1: add_attr 39.36% : 0.058050s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000070s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.06% : 0.000088s : 1: auto_monad 0.01% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.45% : 0.000667s : 1: bootstrap 0.03% : 0.000044s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.01% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.02% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.02% : 0.000030s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.43% : 0.000633s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.64% : 0.000941s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000025s : 1: opt.transform.mutable_eliminate 0.84% : 0.001232s : 78: opt.transform.opt_a 0.02% : 0.000026s : 1: opt.transform.opt_after_cconv 0.02% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000089s : 28: opt.transform.opt_b 0.04% : 0.000056s : 2: opt.transform.opt_trans_graph 0.03% : 0.000040s : 4: opt.transform.symbol_engine_opt 2.61% : 0.003848s : 1: opt_a 0.09% : 0.000128s : 1: opt_after_cconv 0.53% : 0.000785s : 1: opt_after_jit_grad 0.15% : 0.000220s : 1: opt_b 4.57% : 0.006740s : 1: optimize 0.02% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000003s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000055s : 1: pre_auto_parallel 0.01% : 0.000013s : 1: py_interpret_to_execute 0.01% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.49% : 0.000724s : 1: renormalize.infer 0.51% : 0.000747s : 1: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000024s : 1: rewriter_after_opt_a 0.15% : 0.000219s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000101s : 1: symbol_engine_optimizer 0.07% : 0.000098s : 1: tuple_transform 8.98% : 0.013249s : 1: type_inference TotalTime = 1.89495, [21] [bootstrap]: 0.00099696 [type_inference]: 1.62267 [event_method]: 0.0844182 [auto_monad]: 0.00028814 [graph_reusing]: 1.174e-05 [inline]: 3.8e-06 [add_attr]: 0.00550699, [1] [add_attr_with_inline]: 0.00549134, [1] [Cycle 1]: 0.00016573, [2] [tag_attr]: 6.741e-05 [meta_addattr_fg_expand]: 1.504e-05 [parallel-infer-symbol]: 5.12999e-06 [pre_auto_parallel]: 8.168e-05 [insert-virtual-dataset]: 2.78e-06 [parallel-infer-symbol-second]: 1.02e-06 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.69998e-06 [optimize]: 0.179725, [53] [py_interpret_to_execute]: 8.00999e-06 [rewriter_before_opt_a]: 0.00052337 [opt_a]: 0.175453, [3] [Cycle 1]: 0.114512, [45] [expand_dump_flag]: 5.57999e-06 [switch_simplify]: 0.00020767 [loop_unroll]: 8.545e-05 [a_1]: 0.00259679 [with_stream_mark]: 0.0001055 [recompute_prepare]: 4.193e-05 [updatestate_depend_eliminate]: 1.584e-05 [updatestate_assign_eliminate]: 1.124e-05 [updatestate_loads_eliminate]: 1.07e-05 [parameter_eliminate]: 4.96002e-06 [a_2]: 0.000374 [accelerated_algorithm]: 8.472e-05 [shard]: 2.89999e-06 [meta_shard_fg_expand]: 1.008e-05 [shard_inline]: 2.385e-05 [merge_send_recv]: 2.293e-05 [auto_parallel]: 1.999e-05 [parallel]: 0.00010213 [flash_sp]: 1.638e-05 [merge_comm]: 1.389e-05 [allreduce_fusion]: 1.177e-05 [matmul_add_comm_reduction]: 3.854e-05 [allreduce_slice_to_reducescatter]: 1.05001e-06 [virtual_shard_identity]: 2.933e-05 [virtual_dataset]: 2.302e-05 [get_grad_eliminate_]: 2.26e-05 [virtual_output]: 2.331e-05 [merge_forward]: 1.414e-05 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 2.655e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.118e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 3.871e-05 [set_forward_comm_id_for_comm_node_pass]: 1.292e-05 [meta_fg_expand]: 0.00311156 [flash_sp_send_recv_attached]: 7.78999e-06 [receive_attached]: 3.91999e-06 [after_resolve]: 0.00011006 [a_after_grad]: 0.00012951 [renormalize]: 0.105487 [add_forward_monad_depend]: 1.846e-05 [auto_monad_grad]: 8.68001e-06 [auto_monad_eliminator]: 8.574e-05 [cse]: 0.00052003 [a_3]: 0.0005827 [Cycle 2]: 0.0592316, [45] [expand_dump_flag]: 3.56999e-06 [switch_simplify]: 7.719e-05 [loop_unroll]: 7.186e-05 [a_1]: 0.00240478 [with_stream_mark]: 3.894e-05 [recompute_prepare]: 2.729e-05 [updatestate_depend_eliminate]: 9.72001e-06 [updatestate_assign_eliminate]: 8.60999e-06 [updatestate_loads_eliminate]: 7.36999e-06 [parameter_eliminate]: 2.71e-06 [a_2]: 0.00024931 [accelerated_algorithm]: 2.207e-05 [shard]: 2.61e-06 [meta_shard_fg_expand]: 6.49999e-06 [shard_inline]: 1.643e-05 [merge_send_recv]: 1.49e-05 [auto_parallel]: 1.498e-05 [parallel]: 1.176e-05 [flash_sp]: 4.95001e-06 [merge_comm]: 7.66001e-06 [allreduce_fusion]: 7.3e-06 [matmul_add_comm_reduction]: 1.556e-05 [allreduce_slice_to_reducescatter]: 1.30001e-06 [virtual_shard_identity]: 1.788e-05 [virtual_dataset]: 1.542e-05 [get_grad_eliminate_]: 1.828e-05 [virtual_output]: 1.633e-05 [merge_forward]: 9.25999e-06 [cell_reuse_recompute_pass]: 2.32999e-06 [offload_activation]: 1.939e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.961e-05 [merge_recompute_call_nodes]: 1.82999e-06 [before_grad]: 2.52e-05 [set_forward_comm_id_for_comm_node_pass]: 8.10999e-06 [meta_fg_expand]: 0.00021015 [flash_sp_send_recv_attached]: 2.61e-06 [receive_attached]: 2.90998e-06 [after_resolve]: 3.199e-05 [a_after_grad]: 2.975e-05 [renormalize]: 0.0549665 [add_forward_monad_depend]: 1.562e-05 [auto_monad_grad]: 2.99001e-06 [auto_monad_eliminator]: 3.917e-05 [cse]: 0.00022757 [a_3]: 0.00014682 [Cycle 3]: 0.00168423, [45] [expand_dump_flag]: 3.71001e-06 [switch_simplify]: 2.199e-05 [loop_unroll]: 1.754e-05 [a_1]: 0.00051613 [with_stream_mark]: 2.741e-05 [recompute_prepare]: 2.081e-05 [updatestate_depend_eliminate]: 9.34e-06 [updatestate_assign_eliminate]: 7.29001e-06 [updatestate_loads_eliminate]: 7.44002e-06 [parameter_eliminate]: 2.96999e-06 [a_2]: 0.00024235 [accelerated_algorithm]: 2.639e-05 [shard]: 2.71e-06 [meta_shard_fg_expand]: 4.90999e-06 [shard_inline]: 1.667e-05 [merge_send_recv]: 1.585e-05 [auto_parallel]: 1.582e-05 [parallel]: 1.104e-05 [flash_sp]: 2.21998e-06 [merge_comm]: 7.94002e-06 [allreduce_fusion]: 7.11001e-06 [matmul_add_comm_reduction]: 1.598e-05 [allreduce_slice_to_reducescatter]: 1.22999e-06 [virtual_shard_identity]: 1.983e-05 [virtual_dataset]: 1.548e-05 [get_grad_eliminate_]: 1.605e-05 [virtual_output]: 1.596e-05 [merge_forward]: 8.55999e-06 [cell_reuse_recompute_pass]: 3.28e-06 [offload_activation]: 1.8e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.242e-05 [merge_recompute_call_nodes]: 2.14999e-06 [before_grad]: 2.483e-05 [set_forward_comm_id_for_comm_node_pass]: 8.95999e-06 [meta_fg_expand]: 5.51998e-06 [flash_sp_send_recv_attached]: 2.74999e-06 [receive_attached]: 3.08998e-06 [after_resolve]: 2.822e-05 [a_after_grad]: 2.587e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.3e-06 [auto_monad_grad]: 2.61999e-06 [auto_monad_eliminator]: 2.127e-05 [cse]: 5.665e-05 [a_3]: 0.00010636 [py_interpret_to_execute_after_opt_a]: 1.251e-05 [slice_cell_reuse_recomputed_activation]: 2.46e-06 [rewriter_after_opt_a]: 5.237e-05 [convert_after_rewriter]: 1.42e-06 [order_py_execute_after_rewriter]: 1.19e-06 [mutable_eliminate]: 0.00086919 [opt_b]: 0.0006301, [1] [Cycle 1]: 0.00061954, [7] [b_1]: 0.0004383 [b_2]: 2.198e-05 [updatestate_depend_eliminate]: 1.633e-05 [updatestate_assign_eliminate]: 7.03998e-06 [updatestate_loads_eliminate]: 6.49999e-06 [renormalize]: 6.69999e-07 [cse]: 7.785e-05 [optimize_parallel_all_gather_comm]: 3.745e-05 [overlap_param_gather]: 3.35998e-06 [cconv]: 4.287e-05 [loop_unroll]: 0.00065243 [opt_after_cconv]: 0.00023766, [1] [Cycle 1]: 0.00022577, [7] [c_1]: 9.161e-05 [parameter_eliminate]: 6.07999e-06 [updatestate_depend_eliminate]: 1.18e-05 [updatestate_assign_eliminate]: 6.88e-06 [updatestate_loads_eliminate]: 6.52001e-06 [cse]: 6.159e-05 [renormalize]: 1.05999e-06 [remove_dup_value]: 9.807e-05 [tuple_transform]: 0.00023017, [1] [Cycle 1]: 0.00022296, [4] [d_1]: 0.00017164 [none_parameter_eliminate]: 3.28998e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 1.806e-05 [partial_unused_args_eliminate]: 2.71999e-06 [add_recomputation]: 0.00016153 [cse_after_recomputation]: 6.415e-05, [1] [Cycle 1]: 5.645e-05, [1] [cse]: 4.767e-05 [environ_conv]: 1.652e-05 [swap_dp_allreduce_reducescatter]: 1.313e-05 [bias_add_comm_swap]: 4.09997e-06 [label_micro_interleaved_index]: 7.57002e-06 [label_fine_grained_interleaved_index]: 2.80002e-06 [merge_cast_opt]: 1.62999e-06 [slice_recompute_activation]: 2.32999e-06 [micro_interleaved_order_control]: 3.06001e-06 [assign_add_opt]: 1.52999e-06 [ForceFp32Comm]: 1.06997e-06 [remove_cast_before_assign_add]: 1.34e-06 [full_micro_interleaved_order_control]: 2.81e-06 [reorder_send_recv_between_fp_bp]: 2.91999e-06 [comm_op_add_attrs]: 1.25001e-06 [add_comm_op_reuse_tag]: 1.23002e-06 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.30999e-06 [overlap_opt_shard_in_pipeline]: 2.617e-05 [overlap_opt_shard_grad_in_pipeline]: 2.34001e-06 [control_data_broadcast_order]: 2.894e-05 [grouped_pairwise_exchange_alltoall]: 1.87001e-06 [offloading_packed_experts]: 8.07003e-06 [overlap_recompute_and_grad_model_parallel]: 7.75998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47001e-06 [overlap_recompute_comm]: 2.63998e-06 [overlap_grad_ring_attention]: 7.29001e-06 [overlap_grad_flash_sp]: 3.602e-05 [begin_end_overlap_inline]: 7.89994e-07 [split_matmul_comm_elemetwise]: 2.54999e-06 [split_layernorm_comm]: 1.87001e-06 [handle_group_info]: 1.87999e-06 [symbol_engine_optimizer]: 0.00016369, [1] [Cycle 1]: 0.00015647, [6] [build]: 1.688e-05 [elim_shapecalc]: 2.94e-05 [elim_not_effective]: 3.071e-05 [opt_reshape]: 1.748e-05 [fold_const_symbol]: 2.292e-05 [renormalize]: 5.10016e-07 [detach_backward]: 3.08e-06 [pipeline_parallel_scheduler]: 1.51998e-06 [auto_monad_reorder]: 3.613e-05 [get_jit_bprop_graph]: 2.96999e-06 [rewriter_after_jit_bprop_graph]: 7.82e-06 [opt_after_jit_grad]: 0.00073674 [validate]: 0.0001377 Sums bootstrap : 0.000997s : 0.05% type_inference : 1.622668s : 85.96% event_method : 0.084418s : 4.47% auto_monad : 0.000288s : 0.02% graph_reusing : 0.000012s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000067s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000015s : 0.00% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000082s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.00% optimize.rewriter_before_opt_a : 0.000523s : 0.03% optimize.opt_a.expand_dump_flag : 0.000013s : 0.00% optimize.opt_a.switch_simplify : 0.000307s : 0.02% optimize.opt_a.loop_unroll : 0.000175s : 0.01% optimize.opt_a.a_1 : 0.005518s : 0.29% optimize.opt_a.with_stream_mark : 0.000172s : 0.01% optimize.opt_a.recompute_prepare : 0.000090s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000035s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000027s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000026s : 0.00% optimize.opt_a.parameter_eliminate : 0.000011s : 0.00% optimize.opt_a.a_2 : 0.000866s : 0.05% optimize.opt_a.accelerated_algorithm : 0.000133s : 0.01% optimize.opt_a.shard : 0.000008s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000021s : 0.00% optimize.opt_a.shard_inline : 0.000057s : 0.00% optimize.opt_a.merge_send_recv : 0.000054s : 0.00% optimize.opt_a.auto_parallel : 0.000051s : 0.00% optimize.opt_a.parallel : 0.000125s : 0.01% optimize.opt_a.flash_sp : 0.000024s : 0.00% optimize.opt_a.merge_comm : 0.000029s : 0.00% optimize.opt_a.allreduce_fusion : 0.000026s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000070s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000067s : 0.00% optimize.opt_a.virtual_dataset : 0.000054s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000057s : 0.00% optimize.opt_a.virtual_output : 0.000056s : 0.00% optimize.opt_a.merge_forward : 0.000032s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% optimize.opt_a.offload_activation : 0.000064s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000103s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000089s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000030s : 0.00% optimize.opt_a.meta_fg_expand : 0.003327s : 0.18% optimize.opt_a.flash_sp_send_recv_attached : 0.000013s : 0.00% optimize.opt_a.receive_attached : 0.000010s : 0.00% optimize.opt_a.after_resolve : 0.000170s : 0.01% optimize.opt_a.a_after_grad : 0.000185s : 0.01% optimize.opt_a.renormalize : 0.160454s : 8.50% optimize.opt_a.add_forward_monad_depend : 0.000037s : 0.00% optimize.opt_a.auto_monad_grad : 0.000014s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000146s : 0.01% optimize.opt_a.cse : 0.000804s : 0.04% optimize.opt_a.a_3 : 0.000836s : 0.04% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000052s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000869s : 0.05% optimize.opt_b.b_1 : 0.000438s : 0.02% optimize.opt_b.b_2 : 0.000022s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000016s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000078s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000037s : 0.00% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000043s : 0.00% optimize.loop_unroll : 0.000652s : 0.03% optimize.opt_after_cconv.c_1 : 0.000092s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.cse : 0.000062s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000098s : 0.01% optimize.tuple_transform.d_1 : 0.000172s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000018s : 0.00% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000162s : 0.01% optimize.cse_after_recomputation.cse : 0.000048s : 0.00% optimize.environ_conv : 0.000017s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000013s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000026s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000029s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000036s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000002s : 0.00% optimize.symbol_engine_optimizer.build : 0.000017s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000029s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000031s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000017s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000023s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000036s : 0.00% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000737s : 0.04% validate : 0.000138s : 0.01% Time group info: ------[substitution.] 0.001595 361 0.25% : 0.000004s : 6: substitution.elim_not_effective 0.73% : 0.000012s : 13: substitution.float_depend_g_call 1.36% : 0.000022s : 13: substitution.float_tuple_getitem_switch 0.18% : 0.000003s : 6: substitution.fold_const_symbol 0.83% : 0.000013s : 12: substitution.graph_param_transform 0.21% : 0.000003s : 2: substitution.incorporate_call 0.15% : 0.000002s : 2: substitution.incorporate_call_switch 54.92% : 0.000876s : 20: substitution.inline 1.53% : 0.000024s : 2: substitution.inline_without_move 1.06% : 0.000017s : 23: substitution.j_node_and_user_rematch 3.83% : 0.000061s : 3: substitution.less_batch_normalization 3.11% : 0.000050s : 21: substitution.minmaximum_grad 1.53% : 0.000024s : 13: substitution.partial_eliminate 1.37% : 0.000022s : 23: substitution.remove_not_recompute_node 1.98% : 0.000032s : 9: substitution.replace_applicator 1.32% : 0.000021s : 27: substitution.replace_old_param 0.29% : 0.000005s : 1: substitution.set_cell_output_no_recompute 1.64% : 0.000026s : 3: substitution.switch_simplify 5.30% : 0.000084s : 27: substitution.tuple_list_convert_item_index_to_positive 2.40% : 0.000038s : 27: substitution.tuple_list_get_item_const_eliminator 3.12% : 0.000050s : 27: substitution.tuple_list_get_item_depend_reorder 9.87% : 0.000157s : 54: substitution.tuple_list_get_item_eliminator 3.03% : 0.000048s : 27: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 1.622469 2 98.89% : 1.604441s : 1: type_inference.infer 1.11% : 0.018028s : 1: type_inference.specialize ------[replace.] 0.000433 40 55.69% : 0.000241s : 20: replace.inline 11.77% : 0.000051s : 3: replace.switch_simplify 32.54% : 0.000141s : 17: replace.tuple_list_get_item_eliminator ------[match.] 0.000929 40 92.88% : 0.000863s : 20: match.inline 2.59% : 0.000024s : 3: match.switch_simplify 4.53% : 0.000042s : 17: match.tuple_list_get_item_eliminator ------[predicate.] 0.001281 8458 0.88% : 0.000011s : 98: predicate.accumulaten_eliminater 0.36% : 0.000005s : 12: predicate.ad_related_special_op_eliminate 0.48% : 0.000006s : 50: predicate.addn_check_dump 0.90% : 0.000012s : 98: predicate.addn_zero_filter 0.81% : 0.000010s : 98: predicate.adjust_all_reduce_mul_add 1.89% : 0.000024s : 148: predicate.arithmetic_simplify 0.94% : 0.000012s : 98: predicate.cast_eliminate 1.04% : 0.000013s : 105: predicate.check_bprop_eliminate 0.52% : 0.000007s : 50: predicate.compare_switch_simplify 0.10% : 0.000001s : 14: predicate.const_output_eliminate 0.53% : 0.000007s : 50: predicate.depend_value_elim 0.96% : 0.000012s : 98: predicate.dict_get_item_const_eliminator 1.14% : 0.000015s : 98: predicate.dict_get_item_eliminator 0.89% : 0.000011s : 98: predicate.dict_set_item_eliminator 0.37% : 0.000005s : 26: predicate.dumpgradient_eliminate 0.09% : 0.000001s : 12: predicate.elim_not_effective 0.14% : 0.000002s : 12: predicate.elim_shapecalc_of_broadcastargs 1.02% : 0.000013s : 112: predicate.environ_add_const_eliminate 0.94% : 0.000012s : 112: predicate.environ_get_add_eliminate 1.02% : 0.000013s : 112: predicate.environ_get_depend_swap 1.45% : 0.000019s : 162: predicate.environ_get_eliminate 0.97% : 0.000012s : 112: predicate.environ_get_set_eliminate 1.32% : 0.000017s : 135: predicate.exchange_switch_depend_value 1.90% : 0.000024s : 135: predicate.float_depend_g_call 0.54% : 0.000007s : 50: predicate.float_environ_get_switch 0.76% : 0.000010s : 64: predicate.float_tuple_getitem_switch 0.07% : 0.000001s : 12: predicate.fold_const_symbol 0.52% : 0.000007s : 50: predicate.get_grad_eliminate 0.10% : 0.000001s : 12: predicate.graph_param_transform 0.48% : 0.000006s : 50: predicate.incorporate_call 0.44% : 0.000006s : 50: predicate.incorporate_call_switch 4.78% : 0.000061s : 361: predicate.inline 1.23% : 0.000016s : 90: predicate.inline_without_move 0.28% : 0.000004s : 50: predicate.j_node_and_user_rematch 0.71% : 0.000009s : 50: predicate.less_batch_normalization 1.44% : 0.000018s : 141: predicate.list_to_tuple_eliminator_ 2.19% : 0.000028s : 241: predicate.load_eliminater 0.41% : 0.000005s : 14: predicate.loop_unroll_after_grad 1.97% : 0.000025s : 187: predicate.loop_unroll_before_grad 1.24% : 0.000016s : 126: predicate.make_slice_get_slice_eliminator 0.49% : 0.000006s : 50: predicate.merge_addn 0.99% : 0.000013s : 105: predicate.micro_step_allgather_replace 1.06% : 0.000014s : 105: predicate.mini_step_allgather_replace 0.91% : 0.000012s : 98: predicate.minmaximum_grad 0.42% : 0.000005s : 14: predicate.mutable_eliminate 0.17% : 0.000002s : 12: predicate.opt_reshape 0.20% : 0.000003s : 14: predicate.parallel_virtual_node 1.86% : 0.000024s : 135: predicate.partial_defer_inline 1.33% : 0.000017s : 129: predicate.partial_eliminate 0.96% : 0.000012s : 98: predicate.print_const_string_wrapper 0.53% : 0.000007s : 50: predicate.reduce_all_const_elim 1.24% : 0.000016s : 98: predicate.reduce_eliminate 2.14% : 0.000027s : 241: predicate.redundant_stop_gradient_eliminater 0.29% : 0.000004s : 50: predicate.remove_not_recompute_node 1.79% : 0.000023s : 220: predicate.replace_applicator 0.64% : 0.000008s : 90: predicate.replace_old_param 0.14% : 0.000002s : 14: predicate.reset_defer_inline 13.10% : 0.000168s : 98: predicate.reshape_eliminate 1.07% : 0.000014s : 105: predicate.row_tensor_add_zeros_like 0.16% : 0.000002s : 14: predicate.row_tensor_eliminate 1.25% : 0.000016s : 105: predicate.same_eliminate 0.39% : 0.000005s : 50: predicate.set_cell_output_no_recompute 0.62% : 0.000008s : 50: predicate.shard_identity_eliminate 0.32% : 0.000004s : 26: predicate.special_op_eliminate 0.55% : 0.000007s : 50: predicate.specialize_transform 1.27% : 0.000016s : 105: predicate.split_environ_get_set_with_tuple_value 1.12% : 0.000014s : 90: predicate.stack_unstack_eliminate 0.15% : 0.000002s : 14: predicate.switch_call_monad_eliminater 1.43% : 0.000018s : 135: predicate.switch_defer_inline 2.40% : 0.000031s : 240: predicate.switch_layer_defer_inline 4.26% : 0.000055s : 390: predicate.switch_simplify 0.91% : 0.000012s : 98: predicate.tile_eliminate 0.87% : 0.000011s : 98: predicate.transpose_eliminate 1.33% : 0.000017s : 124: predicate.tuple_list_convert_item_index_to_positive 1.26% : 0.000016s : 124: predicate.tuple_list_get_item_const_eliminator 1.25% : 0.000016s : 124: predicate.tuple_list_get_item_depend_reorder 2.52% : 0.000032s : 191: predicate.tuple_list_get_item_eliminator 1.28% : 0.000016s : 124: predicate.tuple_list_get_set_item_eliminator 1.88% : 0.000024s : 174: predicate.tuple_list_set_item_eliminator 1.37% : 0.000018s : 141: predicate.tuple_to_list_eliminator_ 2.07% : 0.000027s : 241: predicate.updatestate_pure_node_eliminater 2.60% : 0.000033s : 291: predicate.updatestate_useless_node_eliminater 0.15% : 0.000002s : 14: predicate.value_based_eliminate 0.54% : 0.000007s : 50: predicate.virtual_dataset_eliminate 0.57% : 0.000007s : 50: predicate.virtual_output_eliminate 0.12% : 0.000002s : 12: predicate.virtual_view_grad_eliminate 0.17% : 0.000002s : 14: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.020532 67 32.59% : 0.006692s : 43: func_graph_cloner_run.FuncGraphClonerGraph 67.41% : 0.013840s : 24: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.249817 233 0.00% : 0.000004s : 1: ForceFp32Comm 0.25% : 0.005513s : 1: add_attr 0.24% : 0.005497s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000172s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000305s : 1: auto_monad 0.00% : 0.000042s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.05% : 0.001078s : 1: bootstrap 0.00% : 0.000047s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000033s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000067s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000008s : 1: detach_backward 0.00% : 0.000020s : 1: environ_conv 3.75% : 0.084468s : 1: event_method 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000007s : 1: get_jit_bprop_graph 0.00% : 0.000016s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.03% : 0.000670s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.04% : 0.000886s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.00% : 0.000035s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000040s : 1: opt.transform.mutable_eliminate 0.38% : 0.008550s : 117: opt.transform.opt_a 0.00% : 0.000090s : 1: opt.transform.opt_after_cconv 0.00% : 0.000063s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000415s : 28: opt.transform.opt_b 0.01% : 0.000185s : 2: opt.transform.opt_trans_graph 0.00% : 0.000094s : 4: opt.transform.symbol_engine_opt 7.80% : 0.175458s : 1: opt_a 0.01% : 0.000242s : 1: opt_after_cconv 0.03% : 0.000755s : 1: opt_after_jit_grad 0.03% : 0.000635s : 1: opt_b 7.99% : 0.179732s : 1: optimize 0.00% : 0.000042s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000041s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000031s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000007s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.00% : 0.000086s : 1: pre_auto_parallel 0.00% : 0.000012s : 1: py_interpret_to_execute 0.00% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000105s : 1: remove_dup_value 4.60% : 0.103509s : 2: renormalize.infer 2.53% : 0.056911s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000057s : 1: rewriter_after_opt_a 0.02% : 0.000531s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000017s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000167s : 1: symbol_engine_optimizer 0.01% : 0.000234s : 1: tuple_transform 72.13% : 1.622699s : 1: type_inference group_cases_1 have all been run, results of sub cases are below: case: ('PYBOOST',) {} pass. case: (1,) {} pass. case: (0,) {} pass. case: (1,) {} pass. case: (0,) {} pass. case: (0,) {} pass. case: ('KBK',) {} pass. case: (1,) {} pass. ops group_cases_2 with 8 cases start to running, all cases are below: case: (, 0) case: (, 1) case: (, 0) case: (, 1) case: (,) case: (, 'KBK') case: (, 'pynative') case: (, 0) ops group_cases_2 total running memory: 68M, memory threshold: 51200M TotalTime = 0.665724, [30] [bootstrap]: 0.00141362 [type_inference]: 0.639138 [event_method]: 0.00054552 [auto_monad]: 0.00029671 [graph_reusing]: 1.133e-05 [pre_auto_parallel]: 1.364e-05 [py_interpret_to_execute]: 6.656e-05 [rewriter_before_opt_a]: 0.00021193 [expand_dump_flag]: 4.68999e-06 [jit_opt_a]: 0.0185974, [2] [Cycle 1]: 0.00703293, [27] [switch_simplify]: 0.00087426 [loop_unroll]: 6.692e-05 [a_1]: 0.00133275 [with_stream_mark]: 2.818e-05 [recompute_prepare]: 2.112e-05 [updatestate_depend_eliminate]: 2.006e-05 [updatestate_assign_eliminate]: 1.424e-05 [updatestate_loads_eliminate]: 5.76e-06 [parameter_eliminate]: 2.63e-06 [specialize_transform]: 1.386e-05 [updatestate_useless_node_eliminater]: 1.202e-05 [accelerated_algorithm]: 1.354e-05 [meta_shard_fg_expand]: 1.389e-05 [get_grad_eliminate_]: 1.185e-05 [merge_forward]: 7.65998e-06 [cell_reuse_recompute_pass]: 1.82001e-06 [cell_reuse_handle_not_recompute_node_pass]: 4.036e-05 [j_node_and_user_rematch]: 1.932e-05 [meta_fg_expand]: 6.10002e-06 [replace_old_param]: 2.287e-05 [inline_without_move]: 1.234e-05 [renormalize]: 0.00377623 [add_forward_monad_depend]: 3.369e-05 [auto_monad_grad]: 3.18e-06 [auto_monad_eliminator]: 3.99e-05 [cse]: 0.00022221 [replace_applicator]: 3.836e-05 [Cycle 2]: 0.0007224, [27] [switch_simplify]: 1.322e-05 [loop_unroll]: 1.189e-05 [a_1]: 0.00026309 [with_stream_mark]: 2.793e-05 [recompute_prepare]: 1.431e-05 [updatestate_depend_eliminate]: 7.21999e-06 [updatestate_assign_eliminate]: 5.71003e-06 [updatestate_loads_eliminate]: 5.47001e-06 [parameter_eliminate]: 2.12001e-06 [specialize_transform]: 1.287e-05 [updatestate_useless_node_eliminater]: 1.117e-05 [accelerated_algorithm]: 1.162e-05 [meta_shard_fg_expand]: 3.43e-06 [get_grad_eliminate_]: 1.073e-05 [merge_forward]: 7.13e-06 [cell_reuse_recompute_pass]: 3.23e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.928e-05 [j_node_and_user_rematch]: 1.595e-05 [meta_fg_expand]: 4.42e-06 [replace_old_param]: 2.095e-05 [inline_without_move]: 1.106e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.09001e-06 [auto_monad_grad]: 1.64e-06 [auto_monad_eliminator]: 1.654e-05 [cse]: 4.846e-05 [replace_applicator]: 1.323e-05 [py_interpret_to_execute_after_opt_a]: 2.781e-05 [rewriter_after_opt_a]: 0.0001245 [convert_after_rewriter]: 7.221e-05 [order_py_execute_after_rewriter]: 1.001e-05 [mutable_eliminate]: 0.00095311 [jit_opt_b]: 9.299e-05, [1] [Cycle 1]: 8.281e-05, [2] [frontend_op_eliminate]: 3.277e-05 [inline_after_opt_a]: 3.49e-05 [cconv]: 3.916e-05 [loop_unroll]: 0.00048322 [jit_opt_after_cconv]: 0.00027197, [1] [Cycle 1]: 0.0002647, [11] [c_1]: 5.095e-05 [parameter_eliminate]: 4.4e-06 [updatestate_depend_eliminate]: 1.155e-05 [updatestate_assign_eliminate]: 5.76003e-06 [updatestate_loads_eliminate]: 4.82998e-06 [cse]: 6.684e-05 [call_graph_tuple_transform]: 3.859e-05 [tuple_list_get_item_eliminator]: 1.13e-05 [none_parameter_eliminate]: 2.21e-06 [renormalize]: 8.89995e-07 [switch_simplify]: 1.173e-05 [remove_dup_value]: 7.748e-05 [partial_unused_args_eliminate]: 3.24001e-06 [environ_conv]: 3.839e-05 [add_recomputation]: 9.373e-05 [cse_after_recomputation]: 3.958e-05, [1] [Cycle 1]: 3.265e-05, [1] [cse]: 2.493e-05 [auto_monad_reorder]: 3.372e-05 [get_jit_bprop_graph]: 2.03997e-06 [rewriter_after_jit_bprop_graph]: 5.41998e-06 [opt_after_jit_grad]: 0.00053564 [symbol_engine_optimizer]: 0.00014685, [1] [Cycle 1]: 0.00014026, [6] [build]: 4.461e-05 [elim_shapecalc]: 1.506e-05 [elim_not_effective]: 2.171e-05 [opt_reshape]: 1.124e-05 [fold_const_symbol]: 1.567e-05 [renormalize]: 4.60015e-07 [validate]: 9.21e-05 Sums bootstrap : 0.001414s : 0.22% type_inference : 0.639138s : 98.04% event_method : 0.000546s : 0.08% auto_monad : 0.000297s : 0.05% graph_reusing : 0.000011s : 0.00% pre_auto_parallel : 0.000014s : 0.00% py_interpret_to_execute : 0.000067s : 0.01% rewriter_before_opt_a : 0.000212s : 0.03% expand_dump_flag : 0.000005s : 0.00% jit_opt_a.switch_simplify : 0.000887s : 0.14% jit_opt_a.loop_unroll : 0.000079s : 0.01% jit_opt_a.a_1 : 0.001596s : 0.24% jit_opt_a.with_stream_mark : 0.000056s : 0.01% jit_opt_a.recompute_prepare : 0.000035s : 0.01% jit_opt_a.updatestate_depend_eliminate : 0.000027s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000020s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000011s : 0.00% jit_opt_a.parameter_eliminate : 0.000005s : 0.00% jit_opt_a.specialize_transform : 0.000027s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000023s : 0.00% jit_opt_a.accelerated_algorithm : 0.000025s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000017s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000023s : 0.00% jit_opt_a.merge_forward : 0.000015s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000070s : 0.01% jit_opt_a.j_node_and_user_rematch : 0.000035s : 0.01% jit_opt_a.meta_fg_expand : 0.000011s : 0.00% jit_opt_a.replace_old_param : 0.000044s : 0.01% jit_opt_a.inline_without_move : 0.000023s : 0.00% jit_opt_a.renormalize : 0.003776s : 0.58% jit_opt_a.add_forward_monad_depend : 0.000037s : 0.01% jit_opt_a.auto_monad_grad : 0.000005s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000056s : 0.01% jit_opt_a.cse : 0.000271s : 0.04% jit_opt_a.replace_applicator : 0.000052s : 0.01% py_interpret_to_execute_after_opt_a : 0.000028s : 0.00% rewriter_after_opt_a : 0.000124s : 0.02% convert_after_rewriter : 0.000072s : 0.01% order_py_execute_after_rewriter : 0.000010s : 0.00% mutable_eliminate : 0.000953s : 0.15% jit_opt_b.frontend_op_eliminate : 0.000033s : 0.01% jit_opt_b.inline_after_opt_a : 0.000035s : 0.01% cconv : 0.000039s : 0.01% loop_unroll : 0.000483s : 0.07% jit_opt_after_cconv.c_1 : 0.000051s : 0.01% jit_opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000006s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.cse : 0.000067s : 0.01% jit_opt_after_cconv.call_graph_tuple_transform : 0.000039s : 0.01% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000011s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000012s : 0.00% remove_dup_value : 0.000077s : 0.01% partial_unused_args_eliminate : 0.000003s : 0.00% environ_conv : 0.000038s : 0.01% add_recomputation : 0.000094s : 0.01% cse_after_recomputation.cse : 0.000025s : 0.00% auto_monad_reorder : 0.000034s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000536s : 0.08% symbol_engine_optimizer.build : 0.000045s : 0.01% symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000022s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000011s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000092s : 0.01% Time group info: ------[substitution.] 0.000454 63 0.63% : 0.000003s : 3: substitution.elim_not_effective 0.46% : 0.000002s : 3: substitution.fold_const_symbol 2.12% : 0.000010s : 9: substitution.graph_param_transform 80.45% : 0.000365s : 16: substitution.inline 1.47% : 0.000007s : 6: substitution.j_node_and_user_rematch 3.42% : 0.000016s : 6: substitution.remove_not_recompute_node 2.50% : 0.000011s : 12: substitution.replace_old_param 8.96% : 0.000041s : 8: substitution.switch_simplify ------[type_inference.] 0.638971 2 99.29% : 0.634441s : 1: type_inference.infer 0.71% : 0.004530s : 1: type_inference.specialize ------[replace.] 0.000253 24 45.18% : 0.000114s : 16: replace.inline 54.82% : 0.000139s : 8: replace.switch_simplify ------[match.] 0.000391 24 90.83% : 0.000355s : 16: match.inline 9.17% : 0.000036s : 8: match.switch_simplify ------[predicate.] 0.000372 2107 1.18% : 0.000004s : 33: predicate.accumulaten_eliminater 0.66% : 0.000002s : 9: predicate.ad_related_special_op_eliminate 1.04% : 0.000004s : 33: predicate.addn_check_dump 1.30% : 0.000005s : 33: predicate.addn_zero_filter 2.02% : 0.000007s : 33: predicate.arithmetic_simplify 1.30% : 0.000005s : 33: predicate.cast_eliminate 0.31% : 0.000001s : 9: predicate.check_bprop_eliminate 1.03% : 0.000004s : 33: predicate.compare_switch_simplify 1.09% : 0.000004s : 33: predicate.depend_value_elim 1.10% : 0.000004s : 33: predicate.dict_get_item_const_eliminator 1.19% : 0.000004s : 33: predicate.dict_get_item_eliminator 1.16% : 0.000004s : 33: predicate.dict_set_item_eliminator 0.43% : 0.000002s : 9: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 9: predicate.elim_not_effective 0.42% : 0.000002s : 9: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000004s : 33: predicate.environ_add_const_eliminate 1.07% : 0.000004s : 33: predicate.environ_get_add_eliminate 1.08% : 0.000004s : 33: predicate.environ_get_depend_swap 1.19% : 0.000004s : 33: predicate.environ_get_eliminate 1.04% : 0.000004s : 33: predicate.environ_get_set_eliminate 0.20% : 0.000001s : 9: predicate.fold_const_symbol 0.65% : 0.000002s : 18: predicate.get_grad_eliminate 0.25% : 0.000001s : 9: predicate.graph_param_transform 4.46% : 0.000017s : 67: predicate.inline 0.84% : 0.000003s : 18: predicate.inline_without_move 0.38% : 0.000001s : 18: predicate.j_node_and_user_rematch 0.94% : 0.000004s : 18: predicate.less_batch_normalization 1.28% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 1.97% : 0.000007s : 42: predicate.load_eliminater 0.91% : 0.000003s : 9: predicate.loop_unroll_after_grad 3.43% : 0.000013s : 76: predicate.loop_unroll_before_grad 1.57% : 0.000006s : 42: predicate.make_slice_get_slice_eliminator 1.02% : 0.000004s : 33: predicate.merge_addn 1.04% : 0.000004s : 33: predicate.minmaximum_grad 1.14% : 0.000004s : 9: predicate.mutable_eliminate 0.38% : 0.000001s : 9: predicate.opt_reshape 1.82% : 0.000007s : 42: predicate.partial_eliminate 1.08% : 0.000004s : 33: predicate.print_const_string_wrapper 1.73% : 0.000006s : 33: predicate.reduce_eliminate 1.22% : 0.000005s : 33: predicate.redundant_stop_gradient_eliminater 0.52% : 0.000002s : 18: predicate.remove_not_recompute_node 1.54% : 0.000006s : 51: predicate.replace_applicator 0.53% : 0.000002s : 18: predicate.replace_old_param 0.28% : 0.000001s : 9: predicate.reset_defer_inline 1.23% : 0.000005s : 33: predicate.reshape_eliminate 1.19% : 0.000004s : 33: predicate.row_tensor_add_zeros_like 0.45% : 0.000002s : 9: predicate.row_tensor_eliminate 1.13% : 0.000004s : 33: predicate.same_eliminate 0.63% : 0.000002s : 18: predicate.set_cell_output_no_recompute 0.71% : 0.000003s : 18: predicate.special_op_eliminate 0.80% : 0.000003s : 18: predicate.specialize_transform 1.34% : 0.000005s : 33: predicate.split_environ_get_set_with_tuple_value 1.17% : 0.000004s : 33: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 9: predicate.switch_call_monad_eliminater 2.23% : 0.000008s : 49: predicate.switch_defer_inline 2.12% : 0.000008s : 49: predicate.switch_layer_defer_inline 21.79% : 0.000081s : 150: predicate.switch_simplify 1.18% : 0.000004s : 33: predicate.tile_eliminate 1.11% : 0.000004s : 33: predicate.transpose_eliminate 1.40% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.26% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 2.76% : 0.000010s : 51: predicate.tuple_list_get_item_eliminator 1.49% : 0.000006s : 33: predicate.tuple_list_set_item_eliminator 1.15% : 0.000004s : 33: predicate.tuple_to_list_eliminator_ 1.35% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 2.07% : 0.000008s : 60: predicate.updatestate_useless_node_eliminater 1.42% : 0.000005s : 33: predicate.value_based_eliminate 0.24% : 0.000001s : 9: predicate.virtual_view_grad_eliminate 0.32% : 0.000001s : 9: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003042 29 54.44% : 0.001656s : 11: func_graph_cloner_run.FuncGraphClonerGraph 45.56% : 0.001386s : 18: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.670493 72 0.01% : 0.000097s : 1: add_recomputation 0.05% : 0.000307s : 1: auto_monad 0.01% : 0.000037s : 1: auto_monad_reorder 0.22% : 0.001464s : 1: bootstrap 0.01% : 0.000043s : 1: cconv 0.01% : 0.000077s : 1: convert_after_rewriter 0.01% : 0.000042s : 1: cse_after_recomputation 0.01% : 0.000041s : 1: environ_conv 0.08% : 0.000561s : 1: event_method 0.00% : 0.000008s : 1: expand_dump_flag 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 2.77% : 0.018601s : 1: jit_opt_a 0.04% : 0.000275s : 1: jit_opt_after_cconv 0.01% : 0.000096s : 1: jit_opt_b 0.07% : 0.000492s : 1: loop_unroll 0.14% : 0.000968s : 1: mutable_eliminate 0.43% : 0.002851s : 26: opt.transform.jit_opt_a 0.02% : 0.000109s : 4: opt.transform.jit_opt_after_cconv 0.01% : 0.000059s : 4: opt.transform.jit_opt_b 0.00% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000028s : 1: opt.transform.mutable_eliminate 0.01% : 0.000042s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000060s : 4: opt.transform.symbol_engine_opt 0.08% : 0.000546s : 1: opt_after_jit_grad 0.00% : 0.000013s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000016s : 1: pre_auto_parallel 0.01% : 0.000070s : 1: py_interpret_to_execute 0.00% : 0.000030s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000081s : 1: remove_dup_value 0.34% : 0.002308s : 1: renormalize.infer 0.22% : 0.001451s : 1: renormalize.specialize 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000131s : 1: rewriter_after_opt_a 0.03% : 0.000218s : 1: rewriter_before_opt_a 0.02% : 0.000150s : 1: symbol_engine_optimizer 95.33% : 0.639168s : 1: type_inference TotalTime = 0.949567, [21] [bootstrap]: 0.00957396 [type_inference]: 0.793101 [event_method]: 1.881e-05 [auto_monad]: 0.00045433 [graph_reusing]: 3.305e-05 [inline]: 3.7e-06 [add_attr]: 0.0551892, [1] [add_attr_with_inline]: 0.0551718, [1] [Cycle 1]: 0.00015066, [2] [tag_attr]: 4.019e-05 [meta_addattr_fg_expand]: 1.394e-05 [parallel-infer-symbol]: 4.06001e-06 [pre_auto_parallel]: 6.19e-05 [insert-virtual-dataset]: 2.44999e-06 [parallel-infer-symbol-second]: 9.70002e-07 [dataset_repeat_opt]: 2.19999e-06 [pipeline_split]: 1.71998e-06 [optimize]: 0.0736932, [53] [py_interpret_to_execute]: 9.77999e-06 [rewriter_before_opt_a]: 9.555e-05 [opt_a]: 0.045311, [2] [Cycle 1]: 0.0437479, [45] [expand_dump_flag]: 3.11001e-06 [switch_simplify]: 6.524e-05 [loop_unroll]: 3.019e-05 [a_1]: 0.0009168 [with_stream_mark]: 2.677e-05 [recompute_prepare]: 2.05e-05 [updatestate_depend_eliminate]: 4.263e-05 [updatestate_assign_eliminate]: 1.282e-05 [updatestate_loads_eliminate]: 2.314e-05 [parameter_eliminate]: 2.44999e-06 [a_2]: 0.00023396 [accelerated_algorithm]: 6.043e-05 [shard]: 3.26999e-06 [meta_shard_fg_expand]: 4.22e-06 [shard_inline]: 1.647e-05 [merge_send_recv]: 4.463e-05 [auto_parallel]: 1.492e-05 [parallel]: 0.0001924 [flash_sp]: 6.83e-05 [merge_comm]: 1.26e-05 [allreduce_fusion]: 1.518e-05 [matmul_add_comm_reduction]: 2.463e-05 [allreduce_slice_to_reducescatter]: 8.42e-06 [virtual_shard_identity]: 2.648e-05 [virtual_dataset]: 1.592e-05 [get_grad_eliminate_]: 1.563e-05 [virtual_output]: 1.526e-05 [merge_forward]: 8.35001e-06 [cell_reuse_recompute_pass]: 1.76e-06 [offload_activation]: 2.498e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.06e-05 [merge_recompute_call_nodes]: 1.37e-06 [before_grad]: 2.649e-05 [set_forward_comm_id_for_comm_node_pass]: 1.578e-05 [meta_fg_expand]: 6.66e-06 [flash_sp_send_recv_attached]: 5.56002e-06 [receive_attached]: 1.751e-05 [after_resolve]: 2.428e-05 [a_after_grad]: 2.415e-05 [renormalize]: 0.0408916 [add_forward_monad_depend]: 1.372e-05 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 6.891e-05 [cse]: 0.00018796 [a_3]: 0.00012761 [Cycle 2]: 0.00154876, [45] [expand_dump_flag]: 2.80002e-06 [switch_simplify]: 1.807e-05 [loop_unroll]: 1.468e-05 [a_1]: 0.00043299 [with_stream_mark]: 2.226e-05 [recompute_prepare]: 1.6e-05 [updatestate_depend_eliminate]: 8.74e-06 [updatestate_assign_eliminate]: 9.82001e-06 [updatestate_loads_eliminate]: 1.426e-05 [parameter_eliminate]: 1.63002e-06 [a_2]: 0.0002124 [accelerated_algorithm]: 2.148e-05 [shard]: 2.31998e-06 [meta_shard_fg_expand]: 3.41001e-06 [shard_inline]: 1.432e-05 [merge_send_recv]: 1.313e-05 [auto_parallel]: 1.461e-05 [parallel]: 9.67001e-06 [flash_sp]: 4.12e-06 [merge_comm]: 7.55e-06 [allreduce_fusion]: 7.18e-06 [matmul_add_comm_reduction]: 1.551e-05 [allreduce_slice_to_reducescatter]: 7.30011e-07 [virtual_shard_identity]: 1.59e-05 [virtual_dataset]: 1.428e-05 [get_grad_eliminate_]: 1.467e-05 [virtual_output]: 1.438e-05 [merge_forward]: 8.18999e-06 [cell_reuse_recompute_pass]: 2.88998e-06 [offload_activation]: 1.592e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.65e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 2.249e-05 [set_forward_comm_id_for_comm_node_pass]: 7.35e-06 [meta_fg_expand]: 5.30001e-06 [flash_sp_send_recv_attached]: 1.70001e-06 [receive_attached]: 2.31e-06 [after_resolve]: 2.245e-05 [a_after_grad]: 2.351e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.47001e-06 [auto_monad_grad]: 1.43002e-06 [auto_monad_eliminator]: 3.768e-05 [cse]: 4.826e-05 [a_3]: 0.00010324 [py_interpret_to_execute_after_opt_a]: 1.211e-05 [slice_cell_reuse_recomputed_activation]: 2.95998e-06 [rewriter_after_opt_a]: 5.626e-05 [convert_after_rewriter]: 1.53002e-06 [order_py_execute_after_rewriter]: 1.25001e-06 [mutable_eliminate]: 0.0252594 [opt_b]: 0.00063795, [1] [Cycle 1]: 0.00062631, [7] [b_1]: 0.00041532 [b_2]: 1.98e-05 [updatestate_depend_eliminate]: 1.687e-05 [updatestate_assign_eliminate]: 8.99e-06 [updatestate_loads_eliminate]: 1.683e-05 [renormalize]: 1.11002e-06 [cse]: 0.0001026 [optimize_parallel_all_gather_comm]: 5.384e-05 [overlap_param_gather]: 1.114e-05 [cconv]: 4.119e-05 [loop_unroll]: 0.00080281 [opt_after_cconv]: 0.00024505, [1] [Cycle 1]: 0.00023771, [7] [c_1]: 0.00010728 [parameter_eliminate]: 5.91e-06 [updatestate_depend_eliminate]: 1.191e-05 [updatestate_assign_eliminate]: 7.98999e-06 [updatestate_loads_eliminate]: 1.112e-05 [cse]: 5.604e-05 [renormalize]: 5.40022e-07 [remove_dup_value]: 6.697e-05 [tuple_transform]: 0.00014497, [1] [Cycle 1]: 0.00013977, [4] [d_1]: 0.0001026 [none_parameter_eliminate]: 1.88002e-06 [renormalize]: 1.10012e-07 [switch_simplify]: 1.567e-05 [partial_unused_args_eliminate]: 2.29999e-06 [add_recomputation]: 0.00011588 [cse_after_recomputation]: 4.276e-05, [1] [Cycle 1]: 3.791e-05, [1] [cse]: 3.247e-05 [environ_conv]: 3.928e-05 [swap_dp_allreduce_reducescatter]: 2.979e-05 [bias_add_comm_swap]: 1.15e-05 [label_micro_interleaved_index]: 1.559e-05 [label_fine_grained_interleaved_index]: 2.57001e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 2.24999e-06 [micro_interleaved_order_control]: 2.35002e-06 [assign_add_opt]: 1.20001e-06 [ForceFp32Comm]: 8.10018e-07 [remove_cast_before_assign_add]: 8.80001e-06 [full_micro_interleaved_order_control]: 1.025e-05 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 8.13999e-06 [overlap_opt_shard_in_pipeline]: 2.695e-05 [overlap_opt_shard_grad_in_pipeline]: 1.73002e-06 [control_data_broadcast_order]: 2.502e-05 [grouped_pairwise_exchange_alltoall]: 1.65001e-06 [offloading_packed_experts]: 6.87002e-06 [overlap_recompute_and_grad_model_parallel]: 1.453e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.39998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30001e-06 [overlap_recompute_comm]: 2.34001e-06 [overlap_grad_ring_attention]: 2.102e-05 [overlap_grad_flash_sp]: 6.34e-05 [begin_end_overlap_inline]: 8.10018e-07 [split_matmul_comm_elemetwise]: 1.022e-05 [split_layernorm_comm]: 1.96003e-06 [handle_group_info]: 1.20999e-06 [symbol_engine_optimizer]: 0.00018572, [1] [Cycle 1]: 0.00018041, [6] [build]: 3.696e-05 [elim_shapecalc]: 2.213e-05 [elim_not_effective]: 2.624e-05 [opt_reshape]: 1.54e-05 [fold_const_symbol]: 4.785e-05 [renormalize]: 2.20025e-07 [detach_backward]: 2.56e-06 [pipeline_parallel_scheduler]: 1.40999e-06 [auto_monad_reorder]: 5.952e-05 [get_jit_bprop_graph]: 2.33002e-06 [rewriter_after_jit_bprop_graph]: 6.80002e-06 [opt_after_jit_grad]: 0.00064948 [validate]: 0.0163891 Sums bootstrap : 0.009574s : 1.07% type_inference : 0.793101s : 88.81% event_method : 0.000019s : 0.00% auto_monad : 0.000454s : 0.05% graph_reusing : 0.000033s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000040s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000062s : 0.01% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000010s : 0.00% optimize.rewriter_before_opt_a : 0.000096s : 0.01% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000083s : 0.01% optimize.opt_a.loop_unroll : 0.000045s : 0.01% optimize.opt_a.a_1 : 0.001350s : 0.15% optimize.opt_a.with_stream_mark : 0.000049s : 0.01% optimize.opt_a.recompute_prepare : 0.000036s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000051s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000023s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000037s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000446s : 0.05% optimize.opt_a.accelerated_algorithm : 0.000082s : 0.01% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000008s : 0.00% optimize.opt_a.shard_inline : 0.000031s : 0.00% optimize.opt_a.merge_send_recv : 0.000058s : 0.01% optimize.opt_a.auto_parallel : 0.000030s : 0.00% optimize.opt_a.parallel : 0.000202s : 0.02% optimize.opt_a.flash_sp : 0.000072s : 0.01% optimize.opt_a.merge_comm : 0.000020s : 0.00% optimize.opt_a.allreduce_fusion : 0.000022s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000040s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000042s : 0.00% optimize.opt_a.virtual_dataset : 0.000030s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000030s : 0.00% optimize.opt_a.virtual_output : 0.000030s : 0.00% optimize.opt_a.merge_forward : 0.000017s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000041s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000067s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000049s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000023s : 0.00% optimize.opt_a.meta_fg_expand : 0.000012s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000020s : 0.00% optimize.opt_a.after_resolve : 0.000047s : 0.01% optimize.opt_a.a_after_grad : 0.000048s : 0.01% optimize.opt_a.renormalize : 0.040892s : 4.58% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000107s : 0.01% optimize.opt_a.cse : 0.000236s : 0.03% optimize.opt_a.a_3 : 0.000231s : 0.03% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000056s : 0.01% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.025259s : 2.83% optimize.opt_b.b_1 : 0.000415s : 0.05% optimize.opt_b.b_2 : 0.000020s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000017s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000017s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000103s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000054s : 0.01% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000041s : 0.00% optimize.loop_unroll : 0.000803s : 0.09% optimize.opt_after_cconv.c_1 : 0.000107s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_after_cconv.cse : 0.000056s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000067s : 0.01% optimize.tuple_transform.d_1 : 0.000103s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000016s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000116s : 0.01% optimize.cse_after_recomputation.cse : 0.000032s : 0.00% optimize.environ_conv : 0.000039s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000030s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000016s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000027s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000025s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000015s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000021s : 0.00% optimize.overlap_grad_flash_sp : 0.000063s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000037s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000022s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000026s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000015s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000048s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000060s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000649s : 0.07% validate : 0.016389s : 1.84% Time group info: ------[substitution.] 0.000416 98 2.69% : 0.000011s : 2: substitution.depend_value_elim 0.90% : 0.000004s : 7: substitution.elim_not_effective 0.73% : 0.000003s : 7: substitution.fold_const_symbol 2.71% : 0.000011s : 12: substitution.graph_param_transform 47.20% : 0.000196s : 4: substitution.inline 2.06% : 0.000009s : 14: substitution.j_node_and_user_rematch 6.13% : 0.000025s : 2: substitution.less_batch_normalization 2.13% : 0.000009s : 12: substitution.load_eliminater 4.93% : 0.000021s : 14: substitution.remove_not_recompute_node 2.32% : 0.000010s : 6: substitution.replace_old_param 2.53% : 0.000011s : 8: substitution.updatestate_pure_node_eliminater 25.67% : 0.000107s : 10: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.793000 2 99.76% : 0.791133s : 1: type_inference.infer 0.24% : 0.001867s : 1: type_inference.specialize ------[replace.] 0.000039 4 100.00% : 0.000039s : 4: replace.inline ------[match.] 0.000193 4 100.00% : 0.000193s : 4: match.inline ------[predicate.] 0.000461 3222 0.83% : 0.000004s : 31: predicate.accumulaten_eliminater 0.83% : 0.000004s : 12: predicate.ad_related_special_op_eliminate 0.76% : 0.000003s : 26: predicate.addn_check_dump 1.00% : 0.000005s : 31: predicate.addn_zero_filter 0.78% : 0.000004s : 31: predicate.adjust_all_reduce_mul_add 2.21% : 0.000010s : 57: predicate.arithmetic_simplify 0.83% : 0.000004s : 31: predicate.cast_eliminate 0.78% : 0.000004s : 26: predicate.check_bprop_eliminate 0.69% : 0.000003s : 26: predicate.compare_switch_simplify 0.26% : 0.000001s : 13: predicate.const_output_eliminate 0.80% : 0.000004s : 26: predicate.depend_value_elim 0.85% : 0.000004s : 31: predicate.dict_get_item_const_eliminator 0.99% : 0.000005s : 31: predicate.dict_get_item_eliminator 0.84% : 0.000004s : 31: predicate.dict_set_item_eliminator 1.00% : 0.000005s : 25: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 12: predicate.elim_not_effective 0.45% : 0.000002s : 12: predicate.elim_shapecalc_of_broadcastargs 1.29% : 0.000006s : 44: predicate.environ_add_const_eliminate 1.19% : 0.000006s : 44: predicate.environ_get_add_eliminate 1.21% : 0.000006s : 44: predicate.environ_get_depend_swap 1.99% : 0.000009s : 70: predicate.environ_get_eliminate 1.18% : 0.000005s : 44: predicate.environ_get_set_eliminate 0.90% : 0.000004s : 35: predicate.exchange_switch_depend_value 1.39% : 0.000006s : 35: predicate.float_depend_g_call 0.83% : 0.000004s : 26: predicate.float_environ_get_switch 1.06% : 0.000005s : 39: predicate.float_tuple_getitem_switch 0.20% : 0.000001s : 12: predicate.fold_const_symbol 0.83% : 0.000004s : 26: predicate.get_grad_eliminate 0.26% : 0.000001s : 12: predicate.graph_param_transform 0.71% : 0.000003s : 26: predicate.incorporate_call 0.66% : 0.000003s : 26: predicate.incorporate_call_switch 5.51% : 0.000025s : 144: predicate.inline 0.87% : 0.000004s : 26: predicate.inline_without_move 0.42% : 0.000002s : 26: predicate.j_node_and_user_rematch 1.13% : 0.000005s : 28: predicate.less_batch_normalization 1.74% : 0.000008s : 56: predicate.list_to_tuple_eliminator_ 2.46% : 0.000011s : 88: predicate.load_eliminater 0.75% : 0.000003s : 13: predicate.loop_unroll_after_grad 1.34% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.69% : 0.000008s : 57: predicate.make_slice_get_slice_eliminator 0.84% : 0.000004s : 26: predicate.merge_addn 0.71% : 0.000003s : 26: predicate.micro_step_allgather_replace 0.70% : 0.000003s : 26: predicate.mini_step_allgather_replace 0.76% : 0.000004s : 31: predicate.minmaximum_grad 2.23% : 0.000010s : 13: predicate.mutable_eliminate 0.40% : 0.000002s : 12: predicate.opt_reshape 0.51% : 0.000002s : 13: predicate.parallel_virtual_node 1.22% : 0.000006s : 35: predicate.partial_defer_inline 1.34% : 0.000006s : 44: predicate.partial_eliminate 0.92% : 0.000004s : 31: predicate.print_const_string_wrapper 0.82% : 0.000004s : 26: predicate.reduce_all_const_elim 1.09% : 0.000005s : 31: predicate.reduce_eliminate 2.50% : 0.000012s : 88: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000003s : 26: predicate.remove_not_recompute_node 1.31% : 0.000006s : 57: predicate.replace_applicator 0.56% : 0.000003s : 26: predicate.replace_old_param 0.43% : 0.000002s : 13: predicate.reset_defer_inline 0.87% : 0.000004s : 31: predicate.reshape_eliminate 0.82% : 0.000004s : 26: predicate.row_tensor_add_zeros_like 0.48% : 0.000002s : 13: predicate.row_tensor_eliminate 0.97% : 0.000004s : 26: predicate.same_eliminate 0.55% : 0.000003s : 28: predicate.set_cell_output_no_recompute 0.99% : 0.000005s : 26: predicate.shard_identity_eliminate 0.79% : 0.000004s : 25: predicate.special_op_eliminate 0.83% : 0.000004s : 26: predicate.specialize_transform 1.04% : 0.000005s : 26: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000004s : 26: predicate.stack_unstack_eliminate 0.39% : 0.000002s : 13: predicate.switch_call_monad_eliminater 0.98% : 0.000005s : 35: predicate.switch_defer_inline 1.74% : 0.000008s : 61: predicate.switch_layer_defer_inline 3.76% : 0.000017s : 119: predicate.switch_simplify 0.87% : 0.000004s : 31: predicate.tile_eliminate 0.83% : 0.000004s : 31: predicate.transpose_eliminate 1.66% : 0.000008s : 56: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000008s : 56: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000007s : 56: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000014s : 82: predicate.tuple_list_get_item_eliminator 1.67% : 0.000008s : 56: predicate.tuple_list_get_set_item_eliminator 2.67% : 0.000012s : 82: predicate.tuple_list_set_item_eliminator 1.60% : 0.000007s : 56: predicate.tuple_to_list_eliminator_ 2.50% : 0.000012s : 88: predicate.updatestate_pure_node_eliminater 3.25% : 0.000015s : 114: predicate.updatestate_useless_node_eliminater 0.45% : 0.000002s : 13: predicate.value_based_eliminate 0.80% : 0.000004s : 26: predicate.virtual_dataset_eliminate 0.80% : 0.000004s : 26: predicate.virtual_output_eliminate 0.36% : 0.000002s : 12: predicate.virtual_view_grad_eliminate 0.58% : 0.000003s : 13: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.102673 34 63.42% : 0.065111s : 28: func_graph_cloner_run.FuncGraphClonerGraph 36.58% : 0.037562s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 1.106180 192 0.00% : 0.000004s : 1: ForceFp32Comm 4.99% : 0.055224s : 1: add_attr 4.99% : 0.055177s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000121s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000466s : 1: auto_monad 0.01% : 0.000064s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.87% : 0.009661s : 1: bootstrap 0.00% : 0.000045s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000028s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000046s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000043s : 1: environ_conv 0.00% : 0.000026s : 1: event_method 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000041s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000019s : 1: label_micro_interleaved_index 0.07% : 0.000813s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 2.29% : 0.025287s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000028s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000061s : 1: opt.transform.mutable_eliminate 0.23% : 0.002528s : 78: opt.transform.opt_a 0.01% : 0.000106s : 1: opt.transform.opt_after_cconv 0.00% : 0.000054s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000396s : 28: opt.transform.opt_b 0.01% : 0.000116s : 2: opt.transform.opt_trans_graph 0.01% : 0.000108s : 4: opt.transform.symbol_engine_opt 4.10% : 0.045315s : 1: opt_a 0.02% : 0.000249s : 1: opt_after_cconv 0.06% : 0.000660s : 1: opt_after_jit_grad 0.06% : 0.000643s : 1: opt_b 6.66% : 0.073700s : 1: optimize 0.01% : 0.000058s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000067s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000025s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000031s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000067s : 1: pre_auto_parallel 0.00% : 0.000014s : 1: py_interpret_to_execute 0.00% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.01% : 0.000072s : 1: remove_dup_value 0.26% : 0.002911s : 1: renormalize.infer 3.43% : 0.037963s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000061s : 1: rewriter_after_opt_a 0.01% : 0.000100s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000034s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000189s : 1: symbol_engine_optimizer 0.01% : 0.000148s : 1: tuple_transform 71.70% : 0.793129s : 1: type_inference TotalTime = 0.727079, [21] [bootstrap]: 0.00131331 [type_inference]: 0.58545 [event_method]: 0.00031206 [auto_monad]: 0.00023657 [graph_reusing]: 1.021e-05 [inline]: 3.77002e-06 [add_attr]: 0.0115838, [1] [add_attr_with_inline]: 0.011563, [1] [Cycle 1]: 0.00018831, [2] [tag_attr]: 7.136e-05 [meta_addattr_fg_expand]: 2.026e-05 [parallel-infer-symbol]: 3.79002e-06 [pre_auto_parallel]: 8.629e-05 [insert-virtual-dataset]: 2.94999e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.42001e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.126749, [53] [py_interpret_to_execute]: 1.088e-05 [rewriter_before_opt_a]: 0.00062397 [opt_a]: 0.00754222, [2] [Cycle 1]: 0.00643881, [45] [expand_dump_flag]: 5.72001e-06 [switch_simplify]: 0.00021358 [loop_unroll]: 6.884e-05 [a_1]: 0.00135118 [with_stream_mark]: 2.8e-05 [recompute_prepare]: 1.63e-05 [updatestate_depend_eliminate]: 1.553e-05 [updatestate_assign_eliminate]: 1.217e-05 [updatestate_loads_eliminate]: 4.28999e-06 [parameter_eliminate]: 2.66e-06 [a_2]: 0.00016097 [accelerated_algorithm]: 1.178e-05 [shard]: 1.84e-06 [meta_shard_fg_expand]: 5.15001e-06 [shard_inline]: 1.108e-05 [merge_send_recv]: 4.184e-05 [auto_parallel]: 1.244e-05 [parallel]: 7.507e-05 [flash_sp]: 3.59e-05 [merge_comm]: 5.22999e-06 [allreduce_fusion]: 1.242e-05 [matmul_add_comm_reduction]: 1.961e-05 [allreduce_slice_to_reducescatter]: 8.25e-06 [virtual_shard_identity]: 1.397e-05 [virtual_dataset]: 1.104e-05 [get_grad_eliminate_]: 1.066e-05 [virtual_output]: 1.088e-05 [merge_forward]: 5.63002e-06 [cell_reuse_recompute_pass]: 1.71002e-06 [offload_activation]: 2.014e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.07e-05 [merge_recompute_call_nodes]: 1.41002e-06 [before_grad]: 1.593e-05 [set_forward_comm_id_for_comm_node_pass]: 1.348e-05 [meta_fg_expand]: 5.03002e-06 [flash_sp_send_recv_attached]: 3.29001e-06 [receive_attached]: 1.598e-05 [after_resolve]: 1.658e-05 [a_after_grad]: 1.659e-05 [renormalize]: 0.00344979 [add_forward_monad_depend]: 1.068e-05 [auto_monad_grad]: 3.16001e-06 [auto_monad_eliminator]: 3.728e-05 [cse]: 0.00022889 [a_3]: 9.283e-05 [Cycle 2]: 0.00108518, [45] [expand_dump_flag]: 2.56e-06 [switch_simplify]: 1.39e-05 [loop_unroll]: 1.014e-05 [a_1]: 0.00029696 [with_stream_mark]: 2.083e-05 [recompute_prepare]: 1.155e-05 [updatestate_depend_eliminate]: 5.50001e-06 [updatestate_assign_eliminate]: 4.44002e-06 [updatestate_loads_eliminate]: 4.4e-06 [parameter_eliminate]: 2.05002e-06 [a_2]: 0.00015022 [accelerated_algorithm]: 1.203e-05 [shard]: 2.64001e-06 [meta_shard_fg_expand]: 3.2e-06 [shard_inline]: 1.084e-05 [merge_send_recv]: 9.82001e-06 [auto_parallel]: 1.229e-05 [parallel]: 9.24998e-06 [flash_sp]: 3.73001e-06 [merge_comm]: 5.12999e-06 [allreduce_fusion]: 4.49002e-06 [matmul_add_comm_reduction]: 1.097e-05 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 1.137e-05 [virtual_dataset]: 9.84999e-06 [get_grad_eliminate_]: 9.48002e-06 [virtual_output]: 9.76998e-06 [merge_forward]: 5.77001e-06 [cell_reuse_recompute_pass]: 3.71999e-06 [offload_activation]: 1.235e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.587e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.493e-05 [set_forward_comm_id_for_comm_node_pass]: 5.89999e-06 [meta_fg_expand]: 3.44001e-06 [flash_sp_send_recv_attached]: 1.79e-06 [receive_attached]: 2.31e-06 [after_resolve]: 1.487e-05 [a_after_grad]: 2.613e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.81e-06 [auto_monad_grad]: 1.94e-06 [auto_monad_eliminator]: 1.281e-05 [cse]: 3.563e-05 [a_3]: 6.83e-05 [py_interpret_to_execute_after_opt_a]: 9.99999e-06 [slice_cell_reuse_recomputed_activation]: 2.21998e-06 [rewriter_after_opt_a]: 4.341e-05 [convert_after_rewriter]: 1.64998e-06 [order_py_execute_after_rewriter]: 1.47999e-06 [mutable_eliminate]: 0.00085645 [opt_b]: 0.00036973, [1] [Cycle 1]: 0.00036128, [7] [b_1]: 0.00024693 [b_2]: 1.211e-05 [updatestate_depend_eliminate]: 8.85001e-06 [updatestate_assign_eliminate]: 3.98001e-06 [updatestate_loads_eliminate]: 3.81999e-06 [renormalize]: 9.20001e-07 [cse]: 4.672e-05 [optimize_parallel_all_gather_comm]: 3.3e-05 [overlap_param_gather]: 1.389e-05 [cconv]: 3.035e-05 [loop_unroll]: 0.00058481 [opt_after_cconv]: 0.00016277, [1] [Cycle 1]: 0.00015628, [7] [c_1]: 5.644e-05 [parameter_eliminate]: 4.22e-06 [updatestate_depend_eliminate]: 8.38999e-06 [updatestate_assign_eliminate]: 4.11001e-06 [updatestate_loads_eliminate]: 4e-06 [cse]: 4.119e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 5.736e-05 [tuple_transform]: 0.115486, [1] [Cycle 1]: 0.115477, [4] [d_1]: 0.11536 [none_parameter_eliminate]: 8.69e-06 [renormalize]: 1.22999e-06 [switch_simplify]: 2.474e-05 [partial_unused_args_eliminate]: 4.92e-06 [add_recomputation]: 0.00011658 [cse_after_recomputation]: 8.697e-05, [1] [Cycle 1]: 7.621e-05, [1] [cse]: 6.724e-05 [environ_conv]: 3.511e-05 [swap_dp_allreduce_reducescatter]: 2.912e-05 [bias_add_comm_swap]: 1.357e-05 [label_micro_interleaved_index]: 1.658e-05 [label_fine_grained_interleaved_index]: 2.48e-06 [merge_cast_opt]: 1.55999e-06 [slice_recompute_activation]: 2.41e-06 [micro_interleaved_order_control]: 2.16e-06 [assign_add_opt]: 1.77999e-06 [ForceFp32Comm]: 9.79984e-07 [remove_cast_before_assign_add]: 8.99e-06 [full_micro_interleaved_order_control]: 9.44e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 1.22e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.30001e-06 [interleave_parallel_branches]: 7.65e-06 [overlap_opt_shard_in_pipeline]: 2.739e-05 [overlap_opt_shard_grad_in_pipeline]: 2.02001e-06 [control_data_broadcast_order]: 2.184e-05 [grouped_pairwise_exchange_alltoall]: 1.89e-06 [offloading_packed_experts]: 6.01998e-06 [overlap_recompute_and_grad_model_parallel]: 1.405e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.60001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 1.96998e-06 [overlap_grad_ring_attention]: 2.045e-05 [overlap_grad_flash_sp]: 6.131e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 1.062e-05 [split_layernorm_comm]: 1.76003e-06 [handle_group_info]: 1.02998e-06 [symbol_engine_optimizer]: 0.00012371, [1] [Cycle 1]: 0.00011782, [6] [build]: 1.457e-05 [elim_shapecalc]: 2.169e-05 [elim_not_effective]: 1.965e-05 [opt_reshape]: 1.154e-05 [fold_const_symbol]: 1.51e-05 [renormalize]: 2.59985e-07 [detach_backward]: 2.27999e-06 [pipeline_parallel_scheduler]: 1.67999e-06 [auto_monad_reorder]: 3.374e-05 [get_jit_bprop_graph]: 2.24999e-06 [rewriter_after_jit_bprop_graph]: 8.20999e-06 [opt_after_jit_grad]: 0.00090928 [validate]: 0.0001035 Sums bootstrap : 0.001313s : 0.18% type_inference : 0.585450s : 81.96% event_method : 0.000312s : 0.04% auto_monad : 0.000237s : 0.03% graph_reusing : 0.000010s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000071s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000020s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000086s : 0.01% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000011s : 0.00% optimize.rewriter_before_opt_a : 0.000624s : 0.09% optimize.opt_a.expand_dump_flag : 0.000008s : 0.00% optimize.opt_a.switch_simplify : 0.000227s : 0.03% optimize.opt_a.loop_unroll : 0.000079s : 0.01% optimize.opt_a.a_1 : 0.001648s : 0.23% optimize.opt_a.with_stream_mark : 0.000049s : 0.01% optimize.opt_a.recompute_prepare : 0.000028s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000021s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000017s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000311s : 0.04% optimize.opt_a.accelerated_algorithm : 0.000024s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000008s : 0.00% optimize.opt_a.shard_inline : 0.000022s : 0.00% optimize.opt_a.merge_send_recv : 0.000052s : 0.01% optimize.opt_a.auto_parallel : 0.000025s : 0.00% optimize.opt_a.parallel : 0.000084s : 0.01% optimize.opt_a.flash_sp : 0.000040s : 0.01% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000017s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000031s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000025s : 0.00% optimize.opt_a.virtual_dataset : 0.000021s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000020s : 0.00% optimize.opt_a.virtual_output : 0.000021s : 0.00% optimize.opt_a.merge_forward : 0.000011s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000032s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000031s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000019s : 0.00% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000018s : 0.00% optimize.opt_a.after_resolve : 0.000031s : 0.00% optimize.opt_a.a_after_grad : 0.000043s : 0.01% optimize.opt_a.renormalize : 0.003450s : 0.48% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000050s : 0.01% optimize.opt_a.cse : 0.000265s : 0.04% optimize.opt_a.a_3 : 0.000161s : 0.02% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000043s : 0.01% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000856s : 0.12% optimize.opt_b.b_1 : 0.000247s : 0.03% optimize.opt_b.b_2 : 0.000012s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000047s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000033s : 0.00% optimize.overlap_param_gather : 0.000014s : 0.00% optimize.cconv : 0.000030s : 0.00% optimize.loop_unroll : 0.000585s : 0.08% optimize.opt_after_cconv.c_1 : 0.000056s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000041s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000057s : 0.01% optimize.tuple_transform.d_1 : 0.115360s : 16.15% optimize.tuple_transform.none_parameter_eliminate : 0.000009s : 0.00% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.tuple_transform.switch_simplify : 0.000025s : 0.00% optimize.partial_unused_args_eliminate : 0.000005s : 0.00% optimize.add_recomputation : 0.000117s : 0.02% optimize.cse_after_recomputation.cse : 0.000067s : 0.01% optimize.environ_conv : 0.000035s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000029s : 0.00% optimize.bias_add_comm_swap : 0.000014s : 0.00% optimize.label_micro_interleaved_index : 0.000017s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000009s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000027s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000022s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000014s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000020s : 0.00% optimize.overlap_grad_flash_sp : 0.000061s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000015s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000022s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000034s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000909s : 0.13% validate : 0.000104s : 0.01% Time group info: ------[substitution.] 0.000577 76 0.48% : 0.000003s : 3: substitution.elim_not_effective 1.53% : 0.000009s : 3: substitution.float_tuple_getitem_switch 0.42% : 0.000002s : 3: substitution.fold_const_symbol 2.30% : 0.000013s : 7: substitution.graph_param_transform 64.75% : 0.000373s : 11: substitution.inline 0.98% : 0.000006s : 6: substitution.j_node_and_user_rematch 2.36% : 0.000014s : 3: substitution.minmaximum_grad 1.24% : 0.000007s : 6: substitution.remove_not_recompute_node 1.00% : 0.000006s : 2: substitution.replace_old_param 3.91% : 0.000023s : 2: substitution.switch_simplify 3.85% : 0.000022s : 5: substitution.tuple_list_convert_item_index_to_positive 3.65% : 0.000021s : 5: substitution.tuple_list_get_item_const_eliminator 2.72% : 0.000016s : 5: substitution.tuple_list_get_item_depend_reorder 8.27% : 0.000048s : 10: substitution.tuple_list_get_item_eliminator 2.54% : 0.000015s : 5: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.585269 2 99.24% : 0.580828s : 1: type_inference.infer 0.76% : 0.004441s : 1: type_inference.specialize ------[replace.] 0.000201 16 55.67% : 0.000112s : 11: replace.inline 23.55% : 0.000047s : 2: replace.switch_simplify 20.78% : 0.000042s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000393 16 93.34% : 0.000367s : 11: match.inline 5.14% : 0.000020s : 2: match.switch_simplify 1.52% : 0.000006s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000377 2549 0.99% : 0.000004s : 27: predicate.accumulaten_eliminater 0.78% : 0.000003s : 7: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 16: predicate.addn_check_dump 1.09% : 0.000004s : 27: predicate.addn_zero_filter 0.82% : 0.000003s : 27: predicate.adjust_all_reduce_mul_add 2.17% : 0.000008s : 43: predicate.arithmetic_simplify 0.97% : 0.000004s : 27: predicate.cast_eliminate 0.55% : 0.000002s : 16: predicate.check_bprop_eliminate 0.53% : 0.000002s : 16: predicate.compare_switch_simplify 0.20% : 0.000001s : 8: predicate.const_output_eliminate 0.54% : 0.000002s : 16: predicate.depend_value_elim 0.99% : 0.000004s : 27: predicate.dict_get_item_const_eliminator 1.07% : 0.000004s : 27: predicate.dict_get_item_eliminator 0.92% : 0.000003s : 27: predicate.dict_set_item_eliminator 0.90% : 0.000003s : 15: predicate.dumpgradient_eliminate 0.17% : 0.000001s : 7: predicate.elim_not_effective 0.45% : 0.000002s : 7: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000004s : 35: predicate.environ_add_const_eliminate 1.08% : 0.000004s : 35: predicate.environ_get_add_eliminate 1.07% : 0.000004s : 35: predicate.environ_get_depend_swap 1.67% : 0.000006s : 51: predicate.environ_get_eliminate 1.07% : 0.000004s : 35: predicate.environ_get_set_eliminate 1.43% : 0.000005s : 41: predicate.exchange_switch_depend_value 2.11% : 0.000008s : 41: predicate.float_depend_g_call 0.56% : 0.000002s : 16: predicate.float_environ_get_switch 0.83% : 0.000003s : 24: predicate.float_tuple_getitem_switch 0.16% : 0.000001s : 7: predicate.fold_const_symbol 0.66% : 0.000002s : 16: predicate.get_grad_eliminate 0.39% : 0.000001s : 7: predicate.graph_param_transform 0.51% : 0.000002s : 16: predicate.incorporate_call 0.47% : 0.000002s : 16: predicate.incorporate_call_switch 5.43% : 0.000020s : 116: predicate.inline 0.67% : 0.000003s : 16: predicate.inline_without_move 0.32% : 0.000001s : 16: predicate.j_node_and_user_rematch 0.88% : 0.000003s : 16: predicate.less_batch_normalization 1.71% : 0.000006s : 45: predicate.list_to_tuple_eliminator_ 2.41% : 0.000009s : 73: predicate.load_eliminater 0.64% : 0.000002s : 8: predicate.loop_unroll_after_grad 3.10% : 0.000012s : 78: predicate.loop_unroll_before_grad 1.50% : 0.000006s : 43: predicate.make_slice_get_slice_eliminator 0.67% : 0.000003s : 16: predicate.merge_addn 0.62% : 0.000002s : 16: predicate.micro_step_allgather_replace 0.52% : 0.000002s : 16: predicate.mini_step_allgather_replace 0.83% : 0.000003s : 27: predicate.minmaximum_grad 0.84% : 0.000003s : 8: predicate.mutable_eliminate 0.30% : 0.000001s : 7: predicate.opt_reshape 0.31% : 0.000001s : 8: predicate.parallel_virtual_node 2.06% : 0.000008s : 41: predicate.partial_defer_inline 1.36% : 0.000005s : 38: predicate.partial_eliminate 0.93% : 0.000003s : 27: predicate.print_const_string_wrapper 0.58% : 0.000002s : 16: predicate.reduce_all_const_elim 1.27% : 0.000005s : 27: predicate.reduce_eliminate 2.41% : 0.000009s : 73: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 16: predicate.remove_not_recompute_node 1.40% : 0.000005s : 46: predicate.replace_applicator 0.38% : 0.000001s : 16: predicate.replace_old_param 0.27% : 0.000001s : 8: predicate.reset_defer_inline 0.98% : 0.000004s : 27: predicate.reshape_eliminate 0.66% : 0.000002s : 16: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 8: predicate.row_tensor_eliminate 0.81% : 0.000003s : 16: predicate.same_eliminate 0.42% : 0.000002s : 16: predicate.set_cell_output_no_recompute 0.72% : 0.000003s : 16: predicate.shard_identity_eliminate 0.68% : 0.000003s : 15: predicate.special_op_eliminate 0.59% : 0.000002s : 16: predicate.specialize_transform 0.73% : 0.000003s : 16: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000003s : 16: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 8: predicate.switch_call_monad_eliminater 1.55% : 0.000006s : 41: predicate.switch_defer_inline 2.20% : 0.000008s : 57: predicate.switch_layer_defer_inline 6.87% : 0.000026s : 146: predicate.switch_simplify 0.88% : 0.000003s : 27: predicate.tile_eliminate 0.88% : 0.000003s : 27: predicate.transpose_eliminate 1.61% : 0.000006s : 42: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000006s : 42: predicate.tuple_list_get_item_const_eliminator 1.62% : 0.000006s : 42: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000012s : 61: predicate.tuple_list_get_item_eliminator 1.64% : 0.000006s : 42: predicate.tuple_list_get_set_item_eliminator 3.38% : 0.000013s : 58: predicate.tuple_list_set_item_eliminator 1.68% : 0.000006s : 45: predicate.tuple_to_list_eliminator_ 2.17% : 0.000008s : 73: predicate.updatestate_pure_node_eliminater 2.89% : 0.000011s : 89: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 8: predicate.value_based_eliminate 0.64% : 0.000002s : 16: predicate.virtual_dataset_eliminate 0.69% : 0.000003s : 16: predicate.virtual_output_eliminate 0.23% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 8: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003218 28 58.49% : 0.001882s : 15: func_graph_cloner_run.FuncGraphClonerGraph 41.51% : 0.001336s : 13: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.987040 192 0.00% : 0.000004s : 1: ForceFp32Comm 1.17% : 0.011591s : 1: add_attr 1.17% : 0.011568s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000122s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.03% : 0.000251s : 1: auto_monad 0.00% : 0.000038s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000017s : 1: bias_add_comm_swap 0.14% : 0.001387s : 1: bootstrap 0.00% : 0.000034s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000025s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.01% : 0.000090s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000040s : 1: environ_conv 0.03% : 0.000334s : 1: event_method 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000010s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000020s : 1: label_micro_interleaved_index 0.06% : 0.000595s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.09% : 0.000868s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.00% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000025s : 1: opt.transform.mutable_eliminate 0.26% : 0.002613s : 78: opt.transform.opt_a 0.01% : 0.000055s : 1: opt.transform.opt_after_cconv 0.00% : 0.000042s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000227s : 28: opt.transform.opt_b 11.69% : 0.115374s : 2: opt.transform.opt_trans_graph 0.01% : 0.000063s : 4: opt.transform.symbol_engine_opt 0.76% : 0.007547s : 1: opt_a 0.02% : 0.000166s : 1: opt_after_cconv 0.09% : 0.000921s : 1: opt_after_jit_grad 0.04% : 0.000374s : 1: opt_b 12.84% : 0.126755s : 1: optimize 0.00% : 0.000037s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000065s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000024s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000031s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000017s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000009s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000091s : 1: pre_auto_parallel 0.00% : 0.000014s : 1: py_interpret_to_execute 0.00% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.01% : 0.000062s : 1: remove_dup_value 0.21% : 0.002066s : 1: renormalize.infer 0.14% : 0.001369s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000048s : 1: rewriter_after_opt_a 0.06% : 0.000638s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000032s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000126s : 1: symbol_engine_optimizer 11.70% : 0.115492s : 1: tuple_transform 59.32% : 0.585487s : 1: type_inference group_cases_2 have all been run, results of sub cases are below: case: () {} pass. case: (0,) {} pass. case: ('KBK',) {} pass. case: (0,) {} pass. case: (1,) {} pass. case: (1,) {} pass. case: (0,) {} pass. case: ('pynative',) {} pass. ops group_cases_3 with 8 cases start to running, all cases are below: case: (, 1) case: (, 0) case: (, 1) case: (, 'pynative') case: (, 'KBK') case: (, mindspore.float32, 0, True) case: (, mindspore.float32, 0, False) case: (, mindspore.float32, 1, True) ops group_cases_3 total running memory: 292M, memory threshold: 51200M TotalTime = 0.608988, [30] [bootstrap]: 0.0019039 [type_inference]: 0.388074 [event_method]: 0.00016972 [auto_monad]: 0.00025998 [graph_reusing]: 7.80998e-06 [pre_auto_parallel]: 1.187e-05 [py_interpret_to_execute]: 6.638e-05 [rewriter_before_opt_a]: 0.00019917 [expand_dump_flag]: 3.7e-06 [jit_opt_a]: 0.15803, [3] [Cycle 1]: 0.141691, [27] [switch_simplify]: 0.00018506 [loop_unroll]: 6.62e-05 [a_1]: 0.0458454 [with_stream_mark]: 4.907e-05 [recompute_prepare]: 3.773e-05 [updatestate_depend_eliminate]: 2.318e-05 [updatestate_assign_eliminate]: 1.798e-05 [updatestate_loads_eliminate]: 1.016e-05 [parameter_eliminate]: 3.85e-06 [specialize_transform]: 2.345e-05 [updatestate_useless_node_eliminater]: 2.058e-05 [accelerated_algorithm]: 8.158e-05 [meta_shard_fg_expand]: 8.42e-06 [get_grad_eliminate_]: 2.168e-05 [merge_forward]: 1.232e-05 [cell_reuse_recompute_pass]: 1.12e-06 [cell_reuse_handle_not_recompute_node_pass]: 5.456e-05 [j_node_and_user_rematch]: 4.592e-05 [meta_fg_expand]: 0.00325981 [replace_old_param]: 0.00011415 [inline_without_move]: 8.413e-05 [renormalize]: 0.0906737 [add_forward_monad_depend]: 6.207e-05 [auto_monad_grad]: 9.07999e-06 [auto_monad_eliminator]: 8.923e-05 [cse]: 0.00031939 [replace_applicator]: 0.00012551 [Cycle 2]: 0.00528694, [27] [switch_simplify]: 6.388e-05 [loop_unroll]: 6.258e-05 [a_1]: 0.0023281 [with_stream_mark]: 4.758e-05 [recompute_prepare]: 3.212e-05 [updatestate_depend_eliminate]: 1.112e-05 [updatestate_assign_eliminate]: 8.85001e-06 [updatestate_loads_eliminate]: 7.71999e-06 [parameter_eliminate]: 3.81999e-06 [specialize_transform]: 3.589e-05 [updatestate_useless_node_eliminater]: 1.653e-05 [accelerated_algorithm]: 2.71e-05 [meta_shard_fg_expand]: 4.74e-06 [get_grad_eliminate_]: 1.533e-05 [merge_forward]: 1.154e-05 [cell_reuse_recompute_pass]: 1.89e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.602e-05 [j_node_and_user_rematch]: 3.075e-05 [meta_fg_expand]: 0.00012781 [replace_old_param]: 3.266e-05 [inline_without_move]: 1.833e-05 [renormalize]: 0.00184133 [add_forward_monad_depend]: 1.165e-05 [auto_monad_grad]: 2.93e-06 [auto_monad_eliminator]: 3.64e-05 [cse]: 0.00013706 [replace_applicator]: 3.44e-05 [Cycle 3]: 0.00104703, [27] [switch_simplify]: 1.657e-05 [loop_unroll]: 1.459e-05 [a_1]: 0.00045087 [with_stream_mark]: 2.836e-05 [recompute_prepare]: 2.067e-05 [updatestate_depend_eliminate]: 1.03e-05 [updatestate_assign_eliminate]: 9.15001e-06 [updatestate_loads_eliminate]: 7.68001e-06 [parameter_eliminate]: 1.56002e-06 [specialize_transform]: 1.599e-05 [updatestate_useless_node_eliminater]: 1.547e-05 [accelerated_algorithm]: 2.256e-05 [meta_shard_fg_expand]: 3.86001e-06 [get_grad_eliminate_]: 1.56e-05 [merge_forward]: 1.103e-05 [cell_reuse_recompute_pass]: 2.79001e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.134e-05 [j_node_and_user_rematch]: 2.568e-05 [meta_fg_expand]: 6.12999e-06 [replace_old_param]: 2.326e-05 [inline_without_move]: 1.603e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.93998e-06 [auto_monad_grad]: 2.11e-06 [auto_monad_eliminator]: 2.739e-05 [cse]: 6.6e-05 [replace_applicator]: 2.168e-05 [py_interpret_to_execute_after_opt_a]: 3.374e-05 [rewriter_after_opt_a]: 0.00029852 [convert_after_rewriter]: 0.00010078 [order_py_execute_after_rewriter]: 1.333e-05 [mutable_eliminate]: 0.00093084 [jit_opt_b]: 0.00016483, [1] [Cycle 1]: 0.00015478, [2] [frontend_op_eliminate]: 5.537e-05 [inline_after_opt_a]: 8.444e-05 [cconv]: 4.683e-05 [loop_unroll]: 0.00047499 [jit_opt_after_cconv]: 0.00038536, [1] [Cycle 1]: 0.00037822, [11] [c_1]: 7.45e-05 [parameter_eliminate]: 5.44e-06 [updatestate_depend_eliminate]: 1.73e-05 [updatestate_assign_eliminate]: 1.047e-05 [updatestate_loads_eliminate]: 9.86e-06 [cse]: 0.00010923 [call_graph_tuple_transform]: 4.575e-05 [tuple_list_get_item_eliminator]: 3.038e-05 [none_parameter_eliminate]: 1.84998e-06 [renormalize]: 7.89994e-07 [switch_simplify]: 1.683e-05 [remove_dup_value]: 0.00011919 [partial_unused_args_eliminate]: 2.46e-06 [environ_conv]: 2.647e-05 [add_recomputation]: 0.00014198 [cse_after_recomputation]: 5.669e-05, [1] [Cycle 1]: 4.978e-05, [1] [cse]: 4.113e-05 [auto_monad_reorder]: 4.563e-05 [get_jit_bprop_graph]: 2.41e-06 [rewriter_after_jit_bprop_graph]: 5.17999e-06 [opt_after_jit_grad]: 0.00055519 [symbol_engine_optimizer]: 0.00015966, [1] [Cycle 1]: 0.00015258, [6] [build]: 2.999e-05 [elim_shapecalc]: 1.943e-05 [elim_not_effective]: 3.21e-05 [opt_reshape]: 1.627e-05 [fold_const_symbol]: 2.505e-05 [renormalize]: 6.59988e-07 [validate]: 0.00014691 Sums bootstrap : 0.001904s : 0.35% type_inference : 0.388074s : 71.68% event_method : 0.000170s : 0.03% auto_monad : 0.000260s : 0.05% graph_reusing : 0.000008s : 0.00% pre_auto_parallel : 0.000012s : 0.00% py_interpret_to_execute : 0.000066s : 0.01% rewriter_before_opt_a : 0.000199s : 0.04% expand_dump_flag : 0.000004s : 0.00% jit_opt_a.switch_simplify : 0.000266s : 0.05% jit_opt_a.loop_unroll : 0.000143s : 0.03% jit_opt_a.a_1 : 0.048624s : 8.98% jit_opt_a.with_stream_mark : 0.000125s : 0.02% jit_opt_a.recompute_prepare : 0.000091s : 0.02% jit_opt_a.updatestate_depend_eliminate : 0.000045s : 0.01% jit_opt_a.updatestate_assign_eliminate : 0.000036s : 0.01% jit_opt_a.updatestate_loads_eliminate : 0.000026s : 0.00% jit_opt_a.parameter_eliminate : 0.000009s : 0.00% jit_opt_a.specialize_transform : 0.000075s : 0.01% jit_opt_a.updatestate_useless_node_eliminater : 0.000053s : 0.01% jit_opt_a.accelerated_algorithm : 0.000131s : 0.02% jit_opt_a.meta_shard_fg_expand : 0.000017s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000053s : 0.01% jit_opt_a.merge_forward : 0.000035s : 0.01% jit_opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000122s : 0.02% jit_opt_a.j_node_and_user_rematch : 0.000102s : 0.02% jit_opt_a.meta_fg_expand : 0.003394s : 0.63% jit_opt_a.replace_old_param : 0.000170s : 0.03% jit_opt_a.inline_without_move : 0.000118s : 0.02% jit_opt_a.renormalize : 0.092515s : 17.09% jit_opt_a.add_forward_monad_depend : 0.000077s : 0.01% jit_opt_a.auto_monad_grad : 0.000014s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000153s : 0.03% jit_opt_a.cse : 0.000522s : 0.10% jit_opt_a.replace_applicator : 0.000182s : 0.03% py_interpret_to_execute_after_opt_a : 0.000034s : 0.01% rewriter_after_opt_a : 0.000299s : 0.06% convert_after_rewriter : 0.000101s : 0.02% order_py_execute_after_rewriter : 0.000013s : 0.00% mutable_eliminate : 0.000931s : 0.17% jit_opt_b.frontend_op_eliminate : 0.000055s : 0.01% jit_opt_b.inline_after_opt_a : 0.000084s : 0.02% cconv : 0.000047s : 0.01% loop_unroll : 0.000475s : 0.09% jit_opt_after_cconv.c_1 : 0.000074s : 0.01% jit_opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000017s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000010s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000010s : 0.00% jit_opt_after_cconv.cse : 0.000109s : 0.02% jit_opt_after_cconv.call_graph_tuple_transform : 0.000046s : 0.01% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000030s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000017s : 0.00% remove_dup_value : 0.000119s : 0.02% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000026s : 0.00% add_recomputation : 0.000142s : 0.03% cse_after_recomputation.cse : 0.000041s : 0.01% auto_monad_reorder : 0.000046s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000555s : 0.10% symbol_engine_optimizer.build : 0.000030s : 0.01% symbol_engine_optimizer.elim_shapecalc : 0.000019s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000032s : 0.01% symbol_engine_optimizer.opt_reshape : 0.000016s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000025s : 0.00% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000147s : 0.03% Time group info: ------[substitution.] 0.001433 278 0.39% : 0.000006s : 9: substitution.elim_not_effective 0.27% : 0.000004s : 9: substitution.fold_const_symbol 0.91% : 0.000013s : 13: substitution.graph_param_transform 56.81% : 0.000814s : 13: substitution.inline 1.64% : 0.000024s : 2: substitution.inline_without_move 2.03% : 0.000029s : 29: substitution.j_node_and_user_rematch 4.41% : 0.000063s : 3: substitution.less_batch_normalization 4.55% : 0.000065s : 25: substitution.minmaximum_grad 2.03% : 0.000029s : 5: substitution.partial_eliminate 1.59% : 0.000023s : 29: substitution.remove_not_recompute_node 3.19% : 0.000046s : 10: substitution.replace_applicator 1.48% : 0.000021s : 26: substitution.replace_old_param 0.34% : 0.000005s : 1: substitution.set_cell_output_no_recompute 5.32% : 0.000076s : 25: substitution.tuple_list_convert_item_index_to_positive 3.42% : 0.000049s : 25: substitution.tuple_list_get_item_depend_reorder 11.60% : 0.000166s : 54: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.387877 2 99.29% : 0.385114s : 1: type_inference.infer 0.71% : 0.002763s : 1: type_inference.specialize ------[replace.] 0.000341 32 55.68% : 0.000190s : 13: replace.inline 44.32% : 0.000151s : 19: replace.tuple_list_get_item_eliminator ------[match.] 0.000867 32 92.57% : 0.000802s : 13: match.inline 7.43% : 0.000064s : 19: match.tuple_list_get_item_eliminator ------[predicate.] 0.000759 5185 1.40% : 0.000011s : 83: predicate.accumulaten_eliminater 0.53% : 0.000004s : 13: predicate.ad_related_special_op_eliminate 2.65% : 0.000020s : 83: predicate.addn_check_dump 1.49% : 0.000011s : 83: predicate.addn_zero_filter 2.44% : 0.000019s : 83: predicate.arithmetic_simplify 1.42% : 0.000011s : 83: predicate.cast_eliminate 0.26% : 0.000002s : 13: predicate.check_bprop_eliminate 1.26% : 0.000010s : 83: predicate.compare_switch_simplify 1.31% : 0.000010s : 83: predicate.depend_value_elim 1.33% : 0.000010s : 83: predicate.dict_get_item_const_eliminator 1.38% : 0.000010s : 83: predicate.dict_get_item_eliminator 1.35% : 0.000010s : 83: predicate.dict_set_item_eliminator 0.37% : 0.000003s : 13: predicate.dumpgradient_eliminate 0.15% : 0.000001s : 13: predicate.elim_not_effective 0.29% : 0.000002s : 13: predicate.elim_shapecalc_of_broadcastargs 1.38% : 0.000011s : 83: predicate.environ_add_const_eliminate 1.35% : 0.000010s : 83: predicate.environ_get_add_eliminate 1.31% : 0.000010s : 83: predicate.environ_get_depend_swap 1.31% : 0.000010s : 83: predicate.environ_get_eliminate 1.30% : 0.000010s : 83: predicate.environ_get_set_eliminate 0.13% : 0.000001s : 13: predicate.fold_const_symbol 1.02% : 0.000008s : 47: predicate.get_grad_eliminate 0.12% : 0.000001s : 13: predicate.graph_param_transform 4.88% : 0.000037s : 141: predicate.inline 1.88% : 0.000014s : 80: predicate.inline_without_move 0.46% : 0.000003s : 47: predicate.j_node_and_user_rematch 1.18% : 0.000009s : 47: predicate.less_batch_normalization 1.78% : 0.000013s : 102: predicate.list_to_tuple_eliminator_ 2.17% : 0.000016s : 115: predicate.load_eliminater 0.48% : 0.000004s : 13: predicate.loop_unroll_after_grad 2.79% : 0.000021s : 151: predicate.loop_unroll_before_grad 1.59% : 0.000012s : 96: predicate.make_slice_get_slice_eliminator 1.28% : 0.000010s : 83: predicate.merge_addn 1.42% : 0.000011s : 83: predicate.minmaximum_grad 0.71% : 0.000005s : 13: predicate.mutable_eliminate 0.27% : 0.000002s : 13: predicate.opt_reshape 2.65% : 0.000020s : 115: predicate.partial_eliminate 1.28% : 0.000010s : 83: predicate.print_const_string_wrapper 1.72% : 0.000013s : 83: predicate.reduce_eliminate 1.74% : 0.000013s : 102: predicate.redundant_stop_gradient_eliminater 0.61% : 0.000005s : 47: predicate.remove_not_recompute_node 2.63% : 0.000020s : 194: predicate.replace_applicator 0.99% : 0.000007s : 80: predicate.replace_old_param 0.20% : 0.000002s : 13: predicate.reset_defer_inline 1.49% : 0.000011s : 83: predicate.reshape_eliminate 1.38% : 0.000010s : 83: predicate.row_tensor_add_zeros_like 0.39% : 0.000003s : 13: predicate.row_tensor_eliminate 1.42% : 0.000011s : 83: predicate.same_eliminate 0.63% : 0.000005s : 47: predicate.set_cell_output_no_recompute 0.51% : 0.000004s : 26: predicate.special_op_eliminate 1.08% : 0.000008s : 47: predicate.specialize_transform 1.86% : 0.000014s : 83: predicate.split_environ_get_set_with_tuple_value 1.47% : 0.000011s : 83: predicate.stack_unstack_eliminate 0.27% : 0.000002s : 13: predicate.switch_call_monad_eliminater 3.05% : 0.000023s : 115: predicate.switch_defer_inline 2.22% : 0.000017s : 115: predicate.switch_layer_defer_inline 5.35% : 0.000041s : 279: predicate.switch_simplify 1.41% : 0.000011s : 83: predicate.tile_eliminate 1.38% : 0.000011s : 83: predicate.transpose_eliminate 1.70% : 0.000013s : 83: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000012s : 83: predicate.tuple_list_get_item_depend_reorder 3.72% : 0.000028s : 128: predicate.tuple_list_get_item_eliminator 1.90% : 0.000014s : 83: predicate.tuple_list_set_item_eliminator 1.77% : 0.000013s : 102: predicate.tuple_to_list_eliminator_ 1.96% : 0.000015s : 115: predicate.updatestate_pure_node_eliminater 2.91% : 0.000022s : 162: predicate.updatestate_useless_node_eliminater 1.75% : 0.000013s : 83: predicate.value_based_eliminate 0.22% : 0.000002s : 13: predicate.virtual_view_grad_eliminate 0.28% : 0.000002s : 13: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004055 38 68.91% : 0.002794s : 21: func_graph_cloner_run.FuncGraphClonerGraph 31.09% : 0.001261s : 17: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.751629 87 0.02% : 0.000146s : 1: add_recomputation 0.04% : 0.000269s : 1: auto_monad 0.01% : 0.000049s : 1: auto_monad_reorder 0.26% : 0.001945s : 1: bootstrap 0.01% : 0.000050s : 1: cconv 0.02% : 0.000132s : 1: convert_after_rewriter 0.01% : 0.000059s : 1: cse_after_recomputation 0.00% : 0.000029s : 1: environ_conv 0.02% : 0.000178s : 1: event_method 0.00% : 0.000006s : 1: expand_dump_flag 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 21.03% : 0.158034s : 1: jit_opt_a 0.05% : 0.000388s : 1: jit_opt_after_cconv 0.02% : 0.000168s : 1: jit_opt_b 0.06% : 0.000484s : 1: loop_unroll 0.13% : 0.000943s : 1: mutable_eliminate 6.65% : 0.050014s : 39: opt.transform.jit_opt_a 0.02% : 0.000163s : 4: opt.transform.jit_opt_after_cconv 0.02% : 0.000130s : 4: opt.transform.jit_opt_b 0.00% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000045s : 1: opt.transform.mutable_eliminate 0.01% : 0.000056s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000089s : 4: opt.transform.symbol_engine_opt 0.08% : 0.000565s : 1: opt_after_jit_grad 0.00% : 0.000017s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000014s : 1: pre_auto_parallel 0.01% : 0.000071s : 1: py_interpret_to_execute 0.01% : 0.000038s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000123s : 1: remove_dup_value 11.83% : 0.088914s : 2: renormalize.infer 0.48% : 0.003571s : 2: renormalize.specialize 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 7.51% : 0.056421s : 1: rewriter_after_opt_a 0.03% : 0.000203s : 1: rewriter_before_opt_a 0.02% : 0.000162s : 1: symbol_engine_optimizer 51.63% : 0.388102s : 1: type_inference TotalTime = 0.797449, [30] [bootstrap]: 0.00334359 [type_inference]: 0.546435 [event_method]: 0.00048796 [auto_monad]: 0.00040731 [graph_reusing]: 9.73998e-06 [pre_auto_parallel]: 1.347e-05 [py_interpret_to_execute]: 0.00010488 [rewriter_before_opt_a]: 0.00025954 [expand_dump_flag]: 5.66998e-06 [jit_opt_a]: 0.241842, [3] [Cycle 1]: 0.185566, [27] [switch_simplify]: 0.00020824 [loop_unroll]: 6.507e-05 [a_1]: 0.0017496 [with_stream_mark]: 4.793e-05 [recompute_prepare]: 3.583e-05 [updatestate_depend_eliminate]: 2.496e-05 [updatestate_assign_eliminate]: 1.76e-05 [updatestate_loads_eliminate]: 9.86998e-06 [parameter_eliminate]: 4.25999e-06 [specialize_transform]: 2.477e-05 [updatestate_useless_node_eliminater]: 2.102e-05 [accelerated_algorithm]: 7.475e-05 [meta_shard_fg_expand]: 1.09e-05 [get_grad_eliminate_]: 2.302e-05 [merge_forward]: 1.241e-05 [cell_reuse_recompute_pass]: 1.45999e-06 [cell_reuse_handle_not_recompute_node_pass]: 4.848e-05 [j_node_and_user_rematch]: 4.773e-05 [meta_fg_expand]: 0.0476966 [replace_old_param]: 0.00013158 [inline_without_move]: 9.517e-05 [renormalize]: 0.134138 [add_forward_monad_depend]: 7.303e-05 [auto_monad_grad]: 1.227e-05 [auto_monad_eliminator]: 0.0001127 [cse]: 0.00034796 [replace_applicator]: 0.00013899 [Cycle 2]: 0.00729807, [27] [switch_simplify]: 6.692e-05 [loop_unroll]: 6.337e-05 [a_1]: 0.00249069 [with_stream_mark]: 4.083e-05 [recompute_prepare]: 3.265e-05 [updatestate_depend_eliminate]: 1.349e-05 [updatestate_assign_eliminate]: 9.31e-06 [updatestate_loads_eliminate]: 7.84997e-06 [parameter_eliminate]: 3.18e-06 [specialize_transform]: 1.697e-05 [updatestate_useless_node_eliminater]: 1.61e-05 [accelerated_algorithm]: 2.863e-05 [meta_shard_fg_expand]: 1.332e-05 [get_grad_eliminate_]: 4.095e-05 [merge_forward]: 1.548e-05 [cell_reuse_recompute_pass]: 4.01001e-06 [cell_reuse_handle_not_recompute_node_pass]: 4.688e-05 [j_node_and_user_rematch]: 3.339e-05 [meta_fg_expand]: 0.00034638 [replace_old_param]: 4.223e-05 [inline_without_move]: 1.779e-05 [renormalize]: 0.00326809 [add_forward_monad_depend]: 1.267e-05 [auto_monad_grad]: 2.70002e-06 [auto_monad_eliminator]: 3.592e-05 [cse]: 0.00032157 [replace_applicator]: 4.195e-05 [Cycle 3]: 0.00107691, [27] [switch_simplify]: 1.882e-05 [loop_unroll]: 1.723e-05 [a_1]: 0.00050993 [with_stream_mark]: 2.869e-05 [recompute_prepare]: 1.823e-05 [updatestate_depend_eliminate]: 1.033e-05 [updatestate_assign_eliminate]: 9.12001e-06 [updatestate_loads_eliminate]: 8.03999e-06 [parameter_eliminate]: 2.19001e-06 [specialize_transform]: 1.817e-05 [updatestate_useless_node_eliminater]: 1.525e-05 [accelerated_algorithm]: 2.3e-05 [meta_shard_fg_expand]: 5.70001e-06 [get_grad_eliminate_]: 1.599e-05 [merge_forward]: 9.24998e-06 [cell_reuse_recompute_pass]: 4.95001e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.508e-05 [j_node_and_user_rematch]: 2.676e-05 [meta_fg_expand]: 6.96999e-06 [replace_old_param]: 2.418e-05 [inline_without_move]: 1.576e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.79999e-06 [auto_monad_grad]: 2.22999e-06 [auto_monad_eliminator]: 2.344e-05 [cse]: 6.486e-05 [replace_applicator]: 1.632e-05 [py_interpret_to_execute_after_opt_a]: 3.261e-05 [rewriter_after_opt_a]: 0.00024185 [convert_after_rewriter]: 1.757e-05 [order_py_execute_after_rewriter]: 1.041e-05 [mutable_eliminate]: 0.00100273 [jit_opt_b]: 0.00019189, [1] [Cycle 1]: 0.00017977, [2] [frontend_op_eliminate]: 7.327e-05 [inline_after_opt_a]: 8.992e-05 [cconv]: 4.942e-05 [loop_unroll]: 0.00059656 [jit_opt_after_cconv]: 0.00047617, [1] [Cycle 1]: 0.00046724, [11] [c_1]: 8.734e-05 [parameter_eliminate]: 5.22e-06 [updatestate_depend_eliminate]: 2.495e-05 [updatestate_assign_eliminate]: 1.236e-05 [updatestate_loads_eliminate]: 1.244e-05 [cse]: 0.00011751 [call_graph_tuple_transform]: 8.083e-05 [tuple_list_get_item_eliminator]: 3.739e-05 [none_parameter_eliminate]: 2.19001e-06 [renormalize]: 9.29984e-07 [switch_simplify]: 2.151e-05 [remove_dup_value]: 0.00012027 [partial_unused_args_eliminate]: 3.08e-06 [environ_conv]: 2.817e-05 [add_recomputation]: 0.00014222 [cse_after_recomputation]: 6.521e-05, [1] [Cycle 1]: 5.618e-05, [1] [cse]: 4.601e-05 [auto_monad_reorder]: 4.701e-05 [get_jit_bprop_graph]: 2.93e-06 [rewriter_after_jit_bprop_graph]: 5.45001e-06 [opt_after_jit_grad]: 0.00067833 [symbol_engine_optimizer]: 0.00017216, [1] [Cycle 1]: 0.00016445, [6] [build]: 3.202e-05 [elim_shapecalc]: 1.933e-05 [elim_not_effective]: 3.516e-05 [opt_reshape]: 1.785e-05 [fold_const_symbol]: 2.722e-05 [renormalize]: 1.37999e-06 [validate]: 0.00018265 Sums bootstrap : 0.003344s : 0.45% type_inference : 0.546435s : 73.04% event_method : 0.000488s : 0.07% auto_monad : 0.000407s : 0.05% graph_reusing : 0.000010s : 0.00% pre_auto_parallel : 0.000013s : 0.00% py_interpret_to_execute : 0.000105s : 0.01% rewriter_before_opt_a : 0.000260s : 0.03% expand_dump_flag : 0.000006s : 0.00% jit_opt_a.switch_simplify : 0.000294s : 0.04% jit_opt_a.loop_unroll : 0.000146s : 0.02% jit_opt_a.a_1 : 0.004750s : 0.63% jit_opt_a.with_stream_mark : 0.000117s : 0.02% jit_opt_a.recompute_prepare : 0.000087s : 0.01% jit_opt_a.updatestate_depend_eliminate : 0.000049s : 0.01% jit_opt_a.updatestate_assign_eliminate : 0.000036s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000026s : 0.00% jit_opt_a.parameter_eliminate : 0.000010s : 0.00% jit_opt_a.specialize_transform : 0.000060s : 0.01% jit_opt_a.updatestate_useless_node_eliminater : 0.000052s : 0.01% jit_opt_a.accelerated_algorithm : 0.000126s : 0.02% jit_opt_a.meta_shard_fg_expand : 0.000030s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000080s : 0.01% jit_opt_a.merge_forward : 0.000037s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000010s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000130s : 0.02% jit_opt_a.j_node_and_user_rematch : 0.000108s : 0.01% jit_opt_a.meta_fg_expand : 0.048050s : 6.42% jit_opt_a.replace_old_param : 0.000198s : 0.03% jit_opt_a.inline_without_move : 0.000129s : 0.02% jit_opt_a.renormalize : 0.137406s : 18.37% jit_opt_a.add_forward_monad_depend : 0.000088s : 0.01% jit_opt_a.auto_monad_grad : 0.000017s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000172s : 0.02% jit_opt_a.cse : 0.000734s : 0.10% jit_opt_a.replace_applicator : 0.000197s : 0.03% py_interpret_to_execute_after_opt_a : 0.000033s : 0.00% rewriter_after_opt_a : 0.000242s : 0.03% convert_after_rewriter : 0.000018s : 0.00% order_py_execute_after_rewriter : 0.000010s : 0.00% mutable_eliminate : 0.001003s : 0.13% jit_opt_b.frontend_op_eliminate : 0.000073s : 0.01% jit_opt_b.inline_after_opt_a : 0.000090s : 0.01% cconv : 0.000049s : 0.01% loop_unroll : 0.000597s : 0.08% jit_opt_after_cconv.c_1 : 0.000087s : 0.01% jit_opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000025s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000012s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000012s : 0.00% jit_opt_after_cconv.cse : 0.000118s : 0.02% jit_opt_after_cconv.call_graph_tuple_transform : 0.000081s : 0.01% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000037s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000022s : 0.00% remove_dup_value : 0.000120s : 0.02% partial_unused_args_eliminate : 0.000003s : 0.00% environ_conv : 0.000028s : 0.00% add_recomputation : 0.000142s : 0.02% cse_after_recomputation.cse : 0.000046s : 0.01% auto_monad_reorder : 0.000047s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000678s : 0.09% symbol_engine_optimizer.build : 0.000032s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000019s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000035s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000018s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000027s : 0.00% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000183s : 0.02% Time group info: ------[substitution.] 0.001508 278 0.32% : 0.000005s : 9: substitution.elim_not_effective 0.29% : 0.000004s : 9: substitution.fold_const_symbol 1.20% : 0.000018s : 13: substitution.graph_param_transform 58.27% : 0.000879s : 13: substitution.inline 2.11% : 0.000032s : 2: substitution.inline_without_move 2.11% : 0.000032s : 29: substitution.j_node_and_user_rematch 3.75% : 0.000057s : 3: substitution.less_batch_normalization 3.57% : 0.000054s : 25: substitution.minmaximum_grad 1.84% : 0.000028s : 5: substitution.partial_eliminate 1.91% : 0.000029s : 29: substitution.remove_not_recompute_node 3.34% : 0.000050s : 10: substitution.replace_applicator 1.51% : 0.000023s : 26: substitution.replace_old_param 0.26% : 0.000004s : 1: substitution.set_cell_output_no_recompute 4.78% : 0.000072s : 25: substitution.tuple_list_convert_item_index_to_positive 3.27% : 0.000049s : 25: substitution.tuple_list_get_item_depend_reorder 11.48% : 0.000173s : 54: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.545303 2 99.24% : 0.541158s : 1: type_inference.infer 0.76% : 0.004145s : 1: type_inference.specialize ------[replace.] 0.000386 32 51.58% : 0.000199s : 13: replace.inline 48.42% : 0.000187s : 19: replace.tuple_list_get_item_eliminator ------[match.] 0.000931 32 93.13% : 0.000867s : 13: match.inline 6.87% : 0.000064s : 19: match.tuple_list_get_item_eliminator ------[predicate.] 0.000827 5185 1.53% : 0.000013s : 83: predicate.accumulaten_eliminater 0.56% : 0.000005s : 13: predicate.ad_related_special_op_eliminate 1.34% : 0.000011s : 83: predicate.addn_check_dump 1.35% : 0.000011s : 83: predicate.addn_zero_filter 2.38% : 0.000020s : 83: predicate.arithmetic_simplify 1.31% : 0.000011s : 83: predicate.cast_eliminate 0.26% : 0.000002s : 13: predicate.check_bprop_eliminate 1.26% : 0.000010s : 83: predicate.compare_switch_simplify 1.43% : 0.000012s : 83: predicate.depend_value_elim 1.32% : 0.000011s : 83: predicate.dict_get_item_const_eliminator 1.48% : 0.000012s : 83: predicate.dict_get_item_eliminator 1.45% : 0.000012s : 83: predicate.dict_set_item_eliminator 0.35% : 0.000003s : 13: predicate.dumpgradient_eliminate 0.13% : 0.000001s : 13: predicate.elim_not_effective 0.30% : 0.000002s : 13: predicate.elim_shapecalc_of_broadcastargs 1.44% : 0.000012s : 83: predicate.environ_add_const_eliminate 1.27% : 0.000010s : 83: predicate.environ_get_add_eliminate 1.34% : 0.000011s : 83: predicate.environ_get_depend_swap 1.42% : 0.000012s : 83: predicate.environ_get_eliminate 1.26% : 0.000010s : 83: predicate.environ_get_set_eliminate 0.13% : 0.000001s : 13: predicate.fold_const_symbol 1.03% : 0.000009s : 47: predicate.get_grad_eliminate 2.49% : 0.000021s : 13: predicate.graph_param_transform 3.94% : 0.000033s : 141: predicate.inline 1.67% : 0.000014s : 80: predicate.inline_without_move 0.42% : 0.000003s : 47: predicate.j_node_and_user_rematch 1.23% : 0.000010s : 47: predicate.less_batch_normalization 1.71% : 0.000014s : 102: predicate.list_to_tuple_eliminator_ 2.17% : 0.000018s : 115: predicate.load_eliminater 0.74% : 0.000006s : 13: predicate.loop_unroll_after_grad 2.67% : 0.000022s : 151: predicate.loop_unroll_before_grad 1.75% : 0.000014s : 96: predicate.make_slice_get_slice_eliminator 1.25% : 0.000010s : 83: predicate.merge_addn 1.49% : 0.000012s : 83: predicate.minmaximum_grad 0.99% : 0.000008s : 13: predicate.mutable_eliminate 0.25% : 0.000002s : 13: predicate.opt_reshape 2.33% : 0.000019s : 115: predicate.partial_eliminate 1.38% : 0.000011s : 83: predicate.print_const_string_wrapper 1.96% : 0.000016s : 83: predicate.reduce_eliminate 1.84% : 0.000015s : 102: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000004s : 47: predicate.remove_not_recompute_node 2.46% : 0.000020s : 194: predicate.replace_applicator 0.94% : 0.000008s : 80: predicate.replace_old_param 0.18% : 0.000001s : 13: predicate.reset_defer_inline 1.36% : 0.000011s : 83: predicate.reshape_eliminate 1.40% : 0.000012s : 83: predicate.row_tensor_add_zeros_like 0.36% : 0.000003s : 13: predicate.row_tensor_eliminate 1.30% : 0.000011s : 83: predicate.same_eliminate 0.58% : 0.000005s : 47: predicate.set_cell_output_no_recompute 0.58% : 0.000005s : 26: predicate.special_op_eliminate 0.83% : 0.000007s : 47: predicate.specialize_transform 1.94% : 0.000016s : 83: predicate.split_environ_get_set_with_tuple_value 1.49% : 0.000012s : 83: predicate.stack_unstack_eliminate 0.25% : 0.000002s : 13: predicate.switch_call_monad_eliminater 3.26% : 0.000027s : 115: predicate.switch_defer_inline 2.03% : 0.000017s : 115: predicate.switch_layer_defer_inline 5.35% : 0.000044s : 279: predicate.switch_simplify 1.39% : 0.000011s : 83: predicate.tile_eliminate 1.30% : 0.000011s : 83: predicate.transpose_eliminate 1.67% : 0.000014s : 83: predicate.tuple_list_convert_item_index_to_positive 1.81% : 0.000015s : 83: predicate.tuple_list_get_item_depend_reorder 3.70% : 0.000031s : 128: predicate.tuple_list_get_item_eliminator 1.89% : 0.000016s : 83: predicate.tuple_list_set_item_eliminator 1.65% : 0.000014s : 102: predicate.tuple_to_list_eliminator_ 1.84% : 0.000015s : 115: predicate.updatestate_pure_node_eliminater 2.81% : 0.000023s : 162: predicate.updatestate_useless_node_eliminater 1.94% : 0.000016s : 83: predicate.value_based_eliminate 0.20% : 0.000002s : 13: predicate.virtual_view_grad_eliminate 0.35% : 0.000003s : 13: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005823 38 60.71% : 0.003535s : 21: func_graph_cloner_run.FuncGraphClonerGraph 39.29% : 0.002288s : 17: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.941228 87 0.02% : 0.000147s : 1: add_recomputation 0.04% : 0.000421s : 1: auto_monad 0.01% : 0.000050s : 1: auto_monad_reorder 0.36% : 0.003384s : 1: bootstrap 0.01% : 0.000053s : 1: cconv 0.00% : 0.000021s : 1: convert_after_rewriter 0.01% : 0.000067s : 1: cse_after_recomputation 0.00% : 0.000031s : 1: environ_conv 0.05% : 0.000498s : 1: event_method 0.00% : 0.000008s : 1: expand_dump_flag 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 25.69% : 0.241848s : 1: jit_opt_a 0.05% : 0.000480s : 1: jit_opt_after_cconv 0.02% : 0.000195s : 1: jit_opt_b 0.06% : 0.000608s : 1: loop_unroll 0.11% : 0.001019s : 1: mutable_eliminate 0.66% : 0.006246s : 39: opt.transform.jit_opt_a 0.02% : 0.000222s : 4: opt.transform.jit_opt_after_cconv 0.02% : 0.000152s : 4: opt.transform.jit_opt_b 0.00% : 0.000029s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000046s : 1: opt.transform.mutable_eliminate 0.01% : 0.000064s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000095s : 4: opt.transform.symbol_engine_opt 0.07% : 0.000688s : 1: opt_after_jit_grad 0.00% : 0.000013s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000016s : 1: pre_auto_parallel 0.01% : 0.000112s : 1: py_interpret_to_execute 0.00% : 0.000036s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000125s : 1: remove_dup_value 13.98% : 0.131583s : 2: renormalize.infer 0.61% : 0.005782s : 2: renormalize.specialize 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000250s : 1: rewriter_after_opt_a 0.03% : 0.000266s : 1: rewriter_before_opt_a 0.02% : 0.000175s : 1: symbol_engine_optimizer 58.06% : 0.546467s : 1: type_inference TotalTime = 13.8422, [24] [bootstrap]: 0.0536542 [type_inference]: 2.30359 [event_method]: 0.00167937 [auto_monad]: 0.00096714 [graph_reusing]: 1.04e-05 [inline]: 4.574e-05 [add_attr]: 0.117671, [1] [add_attr_with_inline]: 0.117648, [1] [Cycle 1]: 0.00175068, [2] [tag_attr]: 0.00158298 [meta_addattr_fg_expand]: 3.117e-05 [parallel-infer-symbol]: 5.69e-06 [pre_auto_parallel]: 0.00013121 [insert-virtual-dataset]: 3.01999e-06 [parallel-infer-symbol-second]: 1.20999e-06 [dataset_repeat_opt]: 2.92002e-06 [pipeline_split]: 2.02001e-06 [optimize]: 0.149631, [53] [py_interpret_to_execute]: 9.29998e-06 [rewriter_before_opt_a]: 0.00043111 [opt_a]: 0.115496, [2] [Cycle 1]: 0.10318, [45] [expand_dump_flag]: 5.20001e-06 [switch_simplify]: 0.00023806 [loop_unroll]: 5.952e-05 [a_1]: 0.00183087 [with_stream_mark]: 5.96e-05 [recompute_prepare]: 3.422e-05 [updatestate_depend_eliminate]: 8.592e-05 [updatestate_assign_eliminate]: 9.397e-05 [updatestate_loads_eliminate]: 2.707e-05 [parameter_eliminate]: 4.28001e-06 [a_2]: 0.00029788 [accelerated_algorithm]: 5.582e-05 [shard]: 2.73e-06 [meta_shard_fg_expand]: 5.30001e-06 [shard_inline]: 1.794e-05 [merge_send_recv]: 5.021e-05 [auto_parallel]: 1.509e-05 [parallel]: 0.00018047 [flash_sp]: 3.959e-05 [merge_comm]: 1.181e-05 [allreduce_fusion]: 1.645e-05 [matmul_add_comm_reduction]: 2.377e-05 [allreduce_slice_to_reducescatter]: 7.98001e-06 [virtual_shard_identity]: 3.104e-05 [virtual_dataset]: 2.004e-05 [get_grad_eliminate_]: 1.945e-05 [virtual_output]: 1.846e-05 [merge_forward]: 1.079e-05 [cell_reuse_recompute_pass]: 2.19999e-06 [offload_activation]: 2.446e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.588e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 2.931e-05 [set_forward_comm_id_for_comm_node_pass]: 1.862e-05 [meta_fg_expand]: 7.63001e-06 [flash_sp_send_recv_attached]: 6.22001e-06 [receive_attached]: 1.801e-05 [after_resolve]: 3.081e-05 [a_after_grad]: 2.957e-05 [renormalize]: 0.0951933 [add_forward_monad_depend]: 1.348e-05 [auto_monad_grad]: 3.23e-06 [auto_monad_eliminator]: 8.485e-05 [cse]: 0.00378993 [a_3]: 0.00015904 [Cycle 2]: 0.0122934, [45] [expand_dump_flag]: 4.24002e-06 [switch_simplify]: 2.043e-05 [loop_unroll]: 1.745e-05 [a_1]: 0.00051059 [with_stream_mark]: 3.573e-05 [recompute_prepare]: 2.086e-05 [updatestate_depend_eliminate]: 1.082e-05 [updatestate_assign_eliminate]: 1.114e-05 [updatestate_loads_eliminate]: 1.462e-05 [parameter_eliminate]: 3.55e-06 [a_2]: 0.00024074 [accelerated_algorithm]: 2.482e-05 [shard]: 2.58998e-06 [meta_shard_fg_expand]: 7.11001e-06 [shard_inline]: 0.00831208 [merge_send_recv]: 4.527e-05 [auto_parallel]: 2.61e-05 [parallel]: 1.018e-05 [flash_sp]: 6.21e-06 [merge_comm]: 8.52e-06 [allreduce_fusion]: 8.65001e-06 [matmul_add_comm_reduction]: 2.121e-05 [allreduce_slice_to_reducescatter]: 1.58002e-06 [virtual_shard_identity]: 3.318e-05 [virtual_dataset]: 1.877e-05 [get_grad_eliminate_]: 1.763e-05 [virtual_output]: 1.621e-05 [merge_forward]: 1.598e-05 [cell_reuse_recompute_pass]: 3.2e-06 [offload_activation]: 1.715e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.24e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 2.61e-05 [set_forward_comm_id_for_comm_node_pass]: 9.04e-06 [meta_fg_expand]: 9.53997e-06 [flash_sp_send_recv_attached]: 2.79999e-06 [receive_attached]: 2.89001e-06 [after_resolve]: 2.593e-05 [a_after_grad]: 2.675e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 7.28e-06 [auto_monad_grad]: 3.13998e-06 [auto_monad_eliminator]: 0.00201078 [cse]: 0.00015254 [a_3]: 0.00013348 [py_interpret_to_execute_after_opt_a]: 1.005e-05 [slice_cell_reuse_recomputed_activation]: 3.09001e-06 [rewriter_after_opt_a]: 6.869e-05 [convert_after_rewriter]: 1.53002e-06 [order_py_execute_after_rewriter]: 1.52001e-06 [mutable_eliminate]: 0.00168724 [opt_b]: 0.00723652, [1] [Cycle 1]: 0.00722131, [7] [b_1]: 0.00695313 [b_2]: 2.825e-05 [updatestate_depend_eliminate]: 2.269e-05 [updatestate_assign_eliminate]: 1.321e-05 [updatestate_loads_eliminate]: 1.69e-05 [renormalize]: 8.2e-07 [cse]: 0.00011237 [optimize_parallel_all_gather_comm]: 9.3e-05 [overlap_param_gather]: 1.152e-05 [cconv]: 4.357e-05 [loop_unroll]: 0.0030254 [opt_after_cconv]: 0.00524002, [1] [Cycle 1]: 0.00522826, [7] [c_1]: 0.00498752 [parameter_eliminate]: 9.56e-06 [updatestate_depend_eliminate]: 2.566e-05 [updatestate_assign_eliminate]: 1.363e-05 [updatestate_loads_eliminate]: 1.729e-05 [cse]: 8.863e-05 [renormalize]: 1.34e-06 [remove_dup_value]: 8.996e-05 [tuple_transform]: 0.00029165, [1] [Cycle 1]: 0.00028181, [4] [d_1]: 0.00012293 [none_parameter_eliminate]: 2.41e-06 [renormalize]: 3.29979e-07 [switch_simplify]: 2.161e-05 [partial_unused_args_eliminate]: 3.04999e-06 [add_recomputation]: 0.00017581 [cse_after_recomputation]: 6.313e-05, [1] [Cycle 1]: 5.674e-05, [1] [cse]: 4.831e-05 [environ_conv]: 5.063e-05 [swap_dp_allreduce_reducescatter]: 3.452e-05 [bias_add_comm_swap]: 1.291e-05 [label_micro_interleaved_index]: 1.621e-05 [label_fine_grained_interleaved_index]: 2.52001e-06 [merge_cast_opt]: 1.37999e-06 [slice_recompute_activation]: 2.08002e-06 [micro_interleaved_order_control]: 2.79999e-06 [assign_add_opt]: 1.64998e-06 [ForceFp32Comm]: 1.13001e-06 [remove_cast_before_assign_add]: 9.05001e-06 [full_micro_interleaved_order_control]: 1.038e-05 [reorder_send_recv_between_fp_bp]: 2.63e-06 [comm_op_add_attrs]: 1.40001e-06 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.09e-06 [interleave_parallel_branches]: 7.86001e-06 [overlap_opt_shard_in_pipeline]: 2.806e-05 [overlap_opt_shard_grad_in_pipeline]: 1.89999e-06 [control_data_broadcast_order]: 3.753e-05 [grouped_pairwise_exchange_alltoall]: 1.45001e-06 [offloading_packed_experts]: 2.295e-05 [overlap_recompute_and_grad_model_parallel]: 1.911e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.39998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.54e-06 [overlap_recompute_comm]: 2.49001e-06 [overlap_grad_ring_attention]: 2.474e-05 [overlap_grad_flash_sp]: 0.00028026 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 1.173e-05 [split_layernorm_comm]: 1.93002e-06 [handle_group_info]: 1.32999e-06 [symbol_engine_optimizer]: 0.0147155, [1] [Cycle 1]: 0.0147036, [6] [build]: 1.691e-05 [elim_shapecalc]: 2.893e-05 [elim_not_effective]: 3.245e-05 [opt_reshape]: 8.085e-05 [fold_const_symbol]: 5.076e-05 [renormalize]: 1.44998e-06 [detach_backward]: 4.94e-06 [pipeline_parallel_scheduler]: 2.49999e-06 [auto_monad_reorder]: 0.00013194 [get_jit_bprop_graph]: 2.79001e-06 [rewriter_after_jit_bprop_graph]: 9.32001e-06 [opt_after_jit_grad]: 0.00375496 [validate]: 0.00034487 [backend_pass]: 1.27e-06 [task_emit]: 11.2099 [execute]: 1.033e-05 Sums bootstrap : 0.053654s : 0.39% type_inference : 2.303593s : 16.80% event_method : 0.001679s : 0.01% auto_monad : 0.000967s : 0.01% graph_reusing : 0.000010s : 0.00% inline : 0.000046s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.001583s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000031s : 0.00% parallel-infer-symbol : 0.000006s : 0.00% pre_auto_parallel : 0.000131s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000009s : 0.00% optimize.rewriter_before_opt_a : 0.000431s : 0.00% optimize.opt_a.expand_dump_flag : 0.000009s : 0.00% optimize.opt_a.switch_simplify : 0.000258s : 0.00% optimize.opt_a.loop_unroll : 0.000077s : 0.00% optimize.opt_a.a_1 : 0.002341s : 0.02% optimize.opt_a.with_stream_mark : 0.000095s : 0.00% optimize.opt_a.recompute_prepare : 0.000055s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000097s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000105s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000042s : 0.00% optimize.opt_a.parameter_eliminate : 0.000008s : 0.00% optimize.opt_a.a_2 : 0.000539s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000081s : 0.00% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000012s : 0.00% optimize.opt_a.shard_inline : 0.008330s : 0.06% optimize.opt_a.merge_send_recv : 0.000095s : 0.00% optimize.opt_a.auto_parallel : 0.000041s : 0.00% optimize.opt_a.parallel : 0.000191s : 0.00% optimize.opt_a.flash_sp : 0.000046s : 0.00% optimize.opt_a.merge_comm : 0.000020s : 0.00% optimize.opt_a.allreduce_fusion : 0.000025s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000045s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000010s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000064s : 0.00% optimize.opt_a.virtual_dataset : 0.000039s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000037s : 0.00% optimize.opt_a.virtual_output : 0.000035s : 0.00% optimize.opt_a.merge_forward : 0.000027s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000042s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000088s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000055s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000028s : 0.00% optimize.opt_a.meta_fg_expand : 0.000017s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000009s : 0.00% optimize.opt_a.receive_attached : 0.000021s : 0.00% optimize.opt_a.after_resolve : 0.000057s : 0.00% optimize.opt_a.a_after_grad : 0.000056s : 0.00% optimize.opt_a.renormalize : 0.095193s : 0.69% optimize.opt_a.add_forward_monad_depend : 0.000021s : 0.00% optimize.opt_a.auto_monad_grad : 0.000006s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.002096s : 0.02% optimize.opt_a.cse : 0.003942s : 0.03% optimize.opt_a.a_3 : 0.000293s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000069s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.001687s : 0.01% optimize.opt_b.b_1 : 0.006953s : 0.05% optimize.opt_b.b_2 : 0.000028s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000023s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000013s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000017s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000112s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000093s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000044s : 0.00% optimize.loop_unroll : 0.003025s : 0.02% optimize.opt_after_cconv.c_1 : 0.004988s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000010s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000026s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000014s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000017s : 0.00% optimize.opt_after_cconv.cse : 0.000089s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000090s : 0.00% optimize.tuple_transform.d_1 : 0.000123s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000022s : 0.00% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000176s : 0.00% optimize.cse_after_recomputation.cse : 0.000048s : 0.00% optimize.environ_conv : 0.000051s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000035s : 0.00% optimize.bias_add_comm_swap : 0.000013s : 0.00% optimize.label_micro_interleaved_index : 0.000016s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000028s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000038s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000023s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000019s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000025s : 0.00% optimize.overlap_grad_flash_sp : 0.000280s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000012s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000017s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000029s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000032s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000081s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000051s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000005s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000132s : 0.00% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000009s : 0.00% opt_after_jit_grad : 0.003755s : 0.03% validate : 0.000345s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 11.209884s : 81.77% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000820 119 1.38% : 0.000011s : 2: substitution.depend_value_elim 0.47% : 0.000004s : 7: substitution.elim_not_effective 0.71% : 0.000006s : 7: substitution.fold_const_symbol 1.43% : 0.000012s : 12: substitution.graph_param_transform 55.85% : 0.000458s : 10: substitution.inline 1.31% : 0.000011s : 14: substitution.j_node_and_user_rematch 3.54% : 0.000029s : 2: substitution.less_batch_normalization 2.00% : 0.000016s : 12: substitution.load_eliminater 2.45% : 0.000020s : 14: substitution.remove_not_recompute_node 1.26% : 0.000010s : 6: substitution.replace_old_param 3.16% : 0.000026s : 4: substitution.switch_simplify 1.32% : 0.000011s : 1: substitution.tuple_list_get_item_eliminator 2.21% : 0.000018s : 12: substitution.updatestate_pure_node_eliminater 22.92% : 0.000188s : 16: substitution.updatestate_useless_node_eliminater ------[type_inference.] 2.303380 2 93.80% : 2.160463s : 1: type_inference.infer 6.20% : 0.142917s : 1: type_inference.specialize ------[replace.] 0.000230 17 49.42% : 0.000114s : 10: replace.inline 38.10% : 0.000088s : 4: replace.switch_simplify 7.92% : 0.000018s : 1: replace.tuple_list_get_item_eliminator 4.56% : 0.000011s : 2: replace.updatestate_useless_node_eliminater ------[match.] 0.000495 17 90.72% : 0.000449s : 10: match.inline 4.49% : 0.000022s : 4: match.switch_simplify 2.03% : 0.000010s : 1: match.tuple_list_get_item_eliminator 2.76% : 0.000014s : 2: match.updatestate_useless_node_eliminater ------[predicate.] 0.011867 3900 0.05% : 0.000005s : 40: predicate.accumulaten_eliminater 0.06% : 0.000007s : 12: predicate.ad_related_special_op_eliminate 0.03% : 0.000004s : 28: predicate.addn_check_dump 0.04% : 0.000005s : 40: predicate.addn_zero_filter 0.04% : 0.000005s : 40: predicate.adjust_all_reduce_mul_add 0.10% : 0.000012s : 68: predicate.arithmetic_simplify 0.05% : 0.000006s : 40: predicate.cast_eliminate 0.03% : 0.000004s : 28: predicate.check_bprop_eliminate 0.03% : 0.000004s : 28: predicate.compare_switch_simplify 0.02% : 0.000002s : 14: predicate.const_output_eliminate 0.03% : 0.000004s : 28: predicate.depend_value_elim 0.04% : 0.000005s : 40: predicate.dict_get_item_const_eliminator 0.05% : 0.000006s : 40: predicate.dict_get_item_eliminator 0.05% : 0.000006s : 40: predicate.dict_set_item_eliminator 0.05% : 0.000006s : 26: predicate.dumpgradient_eliminate 0.01% : 0.000001s : 12: predicate.elim_not_effective 0.02% : 0.000002s : 12: predicate.elim_shapecalc_of_broadcastargs 0.06% : 0.000007s : 54: predicate.environ_add_const_eliminate 54.57% : 0.006476s : 54: predicate.environ_get_add_eliminate 0.06% : 0.000007s : 54: predicate.environ_get_depend_swap 0.09% : 0.000010s : 82: predicate.environ_get_eliminate 0.08% : 0.000010s : 54: predicate.environ_get_set_eliminate 0.06% : 0.000007s : 53: predicate.exchange_switch_depend_value 0.10% : 0.000012s : 53: predicate.float_depend_g_call 0.03% : 0.000003s : 28: predicate.float_environ_get_switch 0.05% : 0.000006s : 42: predicate.float_tuple_getitem_switch 0.01% : 0.000002s : 12: predicate.fold_const_symbol 0.04% : 0.000005s : 28: predicate.get_grad_eliminate 0.01% : 0.000001s : 12: predicate.graph_param_transform 0.03% : 0.000003s : 28: predicate.incorporate_call 0.03% : 0.000003s : 28: predicate.incorporate_call_switch 0.30% : 0.000036s : 177: predicate.inline 0.04% : 0.000005s : 28: predicate.inline_without_move 0.02% : 0.000002s : 28: predicate.j_node_and_user_rematch 0.06% : 0.000007s : 30: predicate.less_batch_normalization 0.09% : 0.000010s : 67: predicate.list_to_tuple_eliminator_ 40.21% : 0.004772s : 109: predicate.load_eliminater 0.08% : 0.000010s : 14: predicate.loop_unroll_after_grad 0.10% : 0.000011s : 80: predicate.loop_unroll_before_grad 0.10% : 0.000012s : 68: predicate.make_slice_get_slice_eliminator 0.03% : 0.000004s : 28: predicate.merge_addn 0.03% : 0.000004s : 28: predicate.micro_step_allgather_replace 0.04% : 0.000004s : 28: predicate.mini_step_allgather_replace 0.04% : 0.000005s : 40: predicate.minmaximum_grad 0.06% : 0.000007s : 14: predicate.mutable_eliminate 0.10% : 0.000012s : 12: predicate.opt_reshape 0.02% : 0.000003s : 14: predicate.parallel_virtual_node 0.09% : 0.000010s : 53: predicate.partial_defer_inline 0.06% : 0.000007s : 55: predicate.partial_eliminate 0.04% : 0.000005s : 40: predicate.print_const_string_wrapper 0.04% : 0.000004s : 28: predicate.reduce_all_const_elim 0.06% : 0.000007s : 40: predicate.reduce_eliminate 0.13% : 0.000015s : 109: predicate.redundant_stop_gradient_eliminater 0.02% : 0.000003s : 28: predicate.remove_not_recompute_node 0.07% : 0.000009s : 69: predicate.replace_applicator 0.02% : 0.000003s : 28: predicate.replace_old_param 0.01% : 0.000002s : 14: predicate.reset_defer_inline 0.04% : 0.000005s : 40: predicate.reshape_eliminate 0.03% : 0.000004s : 28: predicate.row_tensor_add_zeros_like 0.02% : 0.000003s : 14: predicate.row_tensor_eliminate 0.05% : 0.000006s : 28: predicate.same_eliminate 0.03% : 0.000003s : 32: predicate.set_cell_output_no_recompute 0.04% : 0.000005s : 28: predicate.shard_identity_eliminate 0.04% : 0.000005s : 26: predicate.special_op_eliminate 0.04% : 0.000004s : 28: predicate.specialize_transform 0.05% : 0.000006s : 28: predicate.split_environ_get_set_with_tuple_value 0.04% : 0.000005s : 28: predicate.stack_unstack_eliminate 0.04% : 0.000004s : 14: predicate.switch_call_monad_eliminater 0.06% : 0.000007s : 53: predicate.switch_defer_inline 0.10% : 0.000012s : 81: predicate.switch_layer_defer_inline 0.24% : 0.000029s : 181: predicate.switch_simplify 0.05% : 0.000006s : 40: predicate.tile_eliminate 0.04% : 0.000005s : 40: predicate.transpose_eliminate 0.08% : 0.000009s : 66: predicate.tuple_list_convert_item_index_to_positive 0.08% : 0.000009s : 66: predicate.tuple_list_get_item_const_eliminator 0.08% : 0.000009s : 66: predicate.tuple_list_get_item_depend_reorder 0.33% : 0.000039s : 95: predicate.tuple_list_get_item_eliminator 0.08% : 0.000010s : 66: predicate.tuple_list_get_set_item_eliminator 0.12% : 0.000014s : 94: predicate.tuple_list_set_item_eliminator 0.09% : 0.000010s : 67: predicate.tuple_to_list_eliminator_ 0.12% : 0.000014s : 109: predicate.updatestate_pure_node_eliminater 0.17% : 0.000020s : 139: predicate.updatestate_useless_node_eliminater 0.03% : 0.000004s : 14: predicate.value_based_eliminate 0.04% : 0.000005s : 28: predicate.virtual_dataset_eliminate 0.04% : 0.000004s : 28: predicate.virtual_output_eliminate 0.01% : 0.000002s : 12: predicate.virtual_view_grad_eliminate 0.02% : 0.000003s : 14: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.138735 125 80.84% : 0.112159s : 112: func_graph_cloner_run.FuncGraphClonerGraph 19.16% : 0.026577s : 13: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 14.229002 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.83% : 0.117679s : 1: add_attr 0.83% : 0.117655s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000183s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.01% : 0.000985s : 1: auto_monad 0.00% : 0.000138s : 1: auto_monad_reorder 0.00% : 0.000008s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000018s : 1: bias_add_comm_swap 0.38% : 0.053729s : 1: bootstrap 0.00% : 0.000047s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000041s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000066s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000009s : 1: detach_backward 0.00% : 0.000056s : 1: environ_conv 0.01% : 0.001704s : 1: event_method 0.00% : 0.000024s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000016s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000050s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000020s : 1: label_micro_interleaved_index 0.02% : 0.003044s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.01% : 0.001704s : 1: mutable_eliminate 0.00% : 0.000028s : 1: offloading_packed_experts 0.00% : 0.000059s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000044s : 1: opt.transform.mutable_eliminate 0.09% : 0.012230s : 78: opt.transform.opt_a 0.03% : 0.004979s : 1: opt.transform.opt_after_cconv 0.00% : 0.000088s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.006919s : 28: opt.transform.opt_b 0.00% : 0.000138s : 2: opt.transform.opt_trans_graph 0.00% : 0.000175s : 4: opt.transform.symbol_engine_opt 0.81% : 0.115502s : 1: opt_a 0.04% : 0.005247s : 1: opt_after_cconv 0.03% : 0.003773s : 1: opt_after_jit_grad 0.05% : 0.007243s : 1: opt_b 1.05% : 0.149640s : 1: optimize 0.00% : 0.000098s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000005s : 1: order_py_execute_after_rewriter 0.00% : 0.000289s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000030s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000033s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000023s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000010s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000007s : 1: partial_unused_args_eliminate 0.00% : 0.000007s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000139s : 1: pre_auto_parallel 0.00% : 0.000013s : 1: py_interpret_to_execute 0.00% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000096s : 1: remove_dup_value 0.48% : 0.068315s : 1: renormalize.infer 0.19% : 0.026854s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000073s : 1: rewriter_after_opt_a 0.00% : 0.000447s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000015s : 1: split_matmul_comm_elemetwise 0.00% : 0.000038s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.014722s : 1: symbol_engine_optimizer 78.78% : 11.209994s : 1: task_emit 0.00% : 0.000295s : 1: tuple_transform 16.19% : 2.303641s : 1: type_inference 0.00% : 0.000413s : 1: validate TotalTime = 14.8979, [24] [bootstrap]: 0.00098121 [type_inference]: 0.731625 [event_method]: 2.651e-05 [auto_monad]: 0.00016004 [graph_reusing]: 6.33e-06 [inline]: 2.86e-06 [add_attr]: 0.0096043, [1] [add_attr_with_inline]: 0.00958509, [1] [Cycle 1]: 0.0001308, [2] [tag_attr]: 3.345e-05 [meta_addattr_fg_expand]: 1.324e-05 [parallel-infer-symbol]: 3.78001e-06 [pre_auto_parallel]: 5.685e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 8.09989e-07 [dataset_repeat_opt]: 1.72999e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.0733833, [53] [py_interpret_to_execute]: 4.70999e-06 [rewriter_before_opt_a]: 0.00023255 [opt_a]: 0.00516784, [2] [Cycle 1]: 0.00449186, [45] [expand_dump_flag]: 3.25002e-06 [switch_simplify]: 7.261e-05 [loop_unroll]: 3.193e-05 [a_1]: 0.0006049 [with_stream_mark]: 1.918e-05 [recompute_prepare]: 7.53e-06 [updatestate_depend_eliminate]: 1.141e-05 [updatestate_assign_eliminate]: 1.098e-05 [updatestate_loads_eliminate]: 2.86e-06 [parameter_eliminate]: 2.12999e-06 [a_2]: 8.094e-05 [accelerated_algorithm]: 7.17002e-06 [shard]: 1.85001e-06 [meta_shard_fg_expand]: 1.79998e-06 [shard_inline]: 6.54001e-06 [merge_send_recv]: 3.933e-05 [auto_parallel]: 7.48e-06 [parallel]: 8.243e-05 [flash_sp]: 3.152e-05 [merge_comm]: 4.18999e-06 [allreduce_fusion]: 1.055e-05 [matmul_add_comm_reduction]: 1.523e-05 [allreduce_slice_to_reducescatter]: 7.78001e-06 [virtual_shard_identity]: 1.006e-05 [virtual_dataset]: 6.83998e-06 [get_grad_eliminate_]: 6.39001e-06 [virtual_output]: 6.69999e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 1.638e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.009e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.03e-05 [set_forward_comm_id_for_comm_node_pass]: 1.037e-05 [meta_fg_expand]: 2.68003e-06 [flash_sp_send_recv_attached]: 2.41998e-06 [receive_attached]: 1.693e-05 [after_resolve]: 1.082e-05 [a_after_grad]: 1.038e-05 [renormalize]: 0.00288683 [add_forward_monad_depend]: 9.04e-06 [auto_monad_grad]: 2.93998e-06 [auto_monad_eliminator]: 2.943e-05 [cse]: 5.026e-05 [a_3]: 6.025e-05 [Cycle 2]: 0.0006625, [45] [expand_dump_flag]: 1.92999e-06 [switch_simplify]: 8.84e-06 [loop_unroll]: 6.41e-06 [a_1]: 0.00013028 [with_stream_mark]: 1.815e-05 [recompute_prepare]: 6.50997e-06 [updatestate_depend_eliminate]: 3.48999e-06 [updatestate_assign_eliminate]: 3.05998e-06 [updatestate_loads_eliminate]: 3.2e-06 [parameter_eliminate]: 1.62999e-06 [a_2]: 7.166e-05 [accelerated_algorithm]: 6.55997e-06 [shard]: 2.43002e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 6.21e-06 [merge_send_recv]: 8.02e-06 [auto_parallel]: 9.69e-06 [parallel]: 8.67e-06 [flash_sp]: 3.3e-06 [merge_comm]: 3.23998e-06 [allreduce_fusion]: 3.01999e-06 [matmul_add_comm_reduction]: 7.86001e-06 [allreduce_slice_to_reducescatter]: 7.60017e-07 [virtual_shard_identity]: 6.58998e-06 [virtual_dataset]: 5.86e-06 [get_grad_eliminate_]: 6.21998e-06 [virtual_output]: 5.59e-06 [merge_forward]: 3.5e-06 [cell_reuse_recompute_pass]: 2.49999e-06 [offload_activation]: 1.012e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.447e-05 [merge_recompute_call_nodes]: 1.97999e-06 [before_grad]: 9.72001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.56001e-06 [meta_fg_expand]: 2.31998e-06 [flash_sp_send_recv_attached]: 1.74e-06 [receive_attached]: 2.17999e-06 [after_resolve]: 1.034e-05 [a_after_grad]: 9.21002e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.39e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 7.51999e-06 [cse]: 1.419e-05 [a_3]: 3.539e-05 [py_interpret_to_execute_after_opt_a]: 6.43e-06 [slice_cell_reuse_recomputed_activation]: 1.82001e-06 [rewriter_after_opt_a]: 3.124e-05 [convert_after_rewriter]: 1.18001e-06 [order_py_execute_after_rewriter]: 1.13001e-06 [mutable_eliminate]: 0.0659028 [opt_b]: 0.00038284, [1] [Cycle 1]: 0.00037278, [7] [b_1]: 0.00025914 [b_2]: 9.78002e-06 [updatestate_depend_eliminate]: 1.223e-05 [updatestate_assign_eliminate]: 3.86001e-06 [updatestate_loads_eliminate]: 3.2e-06 [renormalize]: 1.82001e-06 [cse]: 4.265e-05 [optimize_parallel_all_gather_comm]: 3.751e-05 [overlap_param_gather]: 1.358e-05 [cconv]: 3.676e-05 [loop_unroll]: 0.00066138 [opt_after_cconv]: 0.00010875, [1] [Cycle 1]: 0.00010232, [7] [c_1]: 3.151e-05 [parameter_eliminate]: 7.58001e-06 [updatestate_depend_eliminate]: 6.08002e-06 [updatestate_assign_eliminate]: 2.60997e-06 [updatestate_loads_eliminate]: 2.16e-06 [cse]: 1.922e-05 [renormalize]: 4.80009e-07 [remove_dup_value]: 1.634e-05 [tuple_transform]: 8.209e-05, [1] [Cycle 1]: 7.762e-05, [4] [d_1]: 4.979e-05 [none_parameter_eliminate]: 2.14e-06 [renormalize]: 1.30007e-07 [switch_simplify]: 7.15e-06 [partial_unused_args_eliminate]: 1.80001e-06 [add_recomputation]: 6.715e-05 [cse_after_recomputation]: 2.187e-05, [1] [Cycle 1]: 1.714e-05, [1] [cse]: 1.173e-05 [environ_conv]: 3.226e-05 [swap_dp_allreduce_reducescatter]: 2.379e-05 [bias_add_comm_swap]: 1.075e-05 [label_micro_interleaved_index]: 1.192e-05 [label_fine_grained_interleaved_index]: 2.39999e-06 [merge_cast_opt]: 1.53002e-06 [slice_recompute_activation]: 2.02001e-06 [micro_interleaved_order_control]: 3.44001e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 8.60018e-07 [remove_cast_before_assign_add]: 8.52998e-06 [full_micro_interleaved_order_control]: 1.021e-05 [reorder_send_recv_between_fp_bp]: 2.94001e-06 [comm_op_add_attrs]: 1.24e-06 [add_comm_op_reuse_tag]: 1.16002e-06 [interleave_split_concat_branches]: 1.29003e-06 [interleave_parallel_branches]: 8.13999e-06 [overlap_opt_shard_in_pipeline]: 2.572e-05 [overlap_opt_shard_grad_in_pipeline]: 1.69e-06 [control_data_broadcast_order]: 1.23e-05 [grouped_pairwise_exchange_alltoall]: 1.75001e-06 [offloading_packed_experts]: 3.53999e-06 [overlap_recompute_and_grad_model_parallel]: 1.23e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.58e-06 [overlap_grad_ring_attention]: 1.832e-05 [overlap_grad_flash_sp]: 4.876e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 9.89999e-06 [split_layernorm_comm]: 1.68002e-06 [handle_group_info]: 1.28002e-06 [symbol_engine_optimizer]: 7.737e-05, [1] [Cycle 1]: 7.242e-05, [6] [build]: 3.28e-06 [elim_shapecalc]: 1.109e-05 [elim_not_effective]: 1.303e-05 [opt_reshape]: 6.99001e-06 [fold_const_symbol]: 9.87001e-06 [renormalize]: 1.50001e-07 [detach_backward]: 2.22999e-06 [pipeline_parallel_scheduler]: 1.39e-06 [auto_monad_reorder]: 2.226e-05 [get_jit_bprop_graph]: 1.74998e-06 [rewriter_after_jit_bprop_graph]: 5.29e-06 [opt_after_jit_grad]: 0.00049899 [validate]: 0.00011253 [backend_pass]: 1.27999e-06 [task_emit]: 14.0118 [execute]: 4.799e-05 Sums bootstrap : 0.000981s : 0.01% type_inference : 0.731625s : 4.94% event_method : 0.000027s : 0.00% auto_monad : 0.000160s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000033s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000057s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000233s : 0.00% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000081s : 0.00% optimize.opt_a.loop_unroll : 0.000038s : 0.00% optimize.opt_a.a_1 : 0.000735s : 0.00% optimize.opt_a.with_stream_mark : 0.000037s : 0.00% optimize.opt_a.recompute_prepare : 0.000014s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000153s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000047s : 0.00% optimize.opt_a.auto_parallel : 0.000017s : 0.00% optimize.opt_a.parallel : 0.000091s : 0.00% optimize.opt_a.flash_sp : 0.000035s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000014s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.00% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000014s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000019s : 0.00% optimize.opt_a.after_resolve : 0.000021s : 0.00% optimize.opt_a.a_after_grad : 0.000020s : 0.00% optimize.opt_a.renormalize : 0.002887s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.00% optimize.opt_a.cse : 0.000064s : 0.00% optimize.opt_a.a_3 : 0.000096s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000031s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.065903s : 0.44% optimize.opt_b.b_1 : 0.000259s : 0.00% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000002s : 0.00% optimize.opt_b.cse : 0.000043s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000038s : 0.00% optimize.overlap_param_gather : 0.000014s : 0.00% optimize.cconv : 0.000037s : 0.00% optimize.loop_unroll : 0.000661s : 0.00% optimize.opt_after_cconv.c_1 : 0.000032s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.00% optimize.tuple_transform.d_1 : 0.000050s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000067s : 0.00% optimize.cse_after_recomputation.cse : 0.000012s : 0.00% optimize.environ_conv : 0.000032s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000024s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000012s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000026s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000018s : 0.00% optimize.overlap_grad_flash_sp : 0.000049s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000022s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000499s : 0.00% validate : 0.000113s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 14.011788s : 94.56% execute : 0.000048s : 0.00% Time group info: ------[substitution.] 0.000203 25 1.08% : 0.000002s : 2: substitution.elim_not_effective 0.62% : 0.000001s : 2: substitution.fold_const_symbol 3.18% : 0.000006s : 4: substitution.graph_param_transform 74.76% : 0.000152s : 5: substitution.inline 2.12% : 0.000004s : 4: substitution.j_node_and_user_rematch 5.75% : 0.000012s : 4: substitution.remove_not_recompute_node 2.13% : 0.000004s : 2: substitution.replace_old_param 10.37% : 0.000021s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.731516 2 99.64% : 0.728853s : 1: type_inference.infer 0.36% : 0.002663s : 1: type_inference.specialize ------[replace.] 0.000064 7 75.56% : 0.000049s : 5: replace.inline 24.44% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000169 7 88.18% : 0.000149s : 5: match.inline 11.82% : 0.000020s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000203 1257 0.83% : 0.000002s : 13: predicate.accumulaten_eliminater 0.76% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.76% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.04% : 0.000004s : 21: predicate.arithmetic_simplify 1.02% : 0.000002s : 13: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.53% : 0.000001s : 8: predicate.depend_value_elim 0.83% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.97% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 4: predicate.elim_not_effective 0.43% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.27% : 0.000003s : 17: predicate.environ_get_add_eliminate 0.96% : 0.000002s : 17: predicate.environ_get_depend_swap 1.89% : 0.000004s : 25: predicate.environ_get_eliminate 1.16% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.29% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.22% : 0.000005s : 20: predicate.float_depend_g_call 0.53% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000001s : 8: predicate.get_grad_eliminate 0.29% : 0.000001s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 5.70% : 0.000012s : 57: predicate.inline 0.74% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 8: predicate.less_batch_normalization 1.64% : 0.000003s : 23: predicate.list_to_tuple_eliminator_ 2.32% : 0.000005s : 36: predicate.load_eliminater 1.05% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.47% : 0.000005s : 36: predicate.loop_unroll_before_grad 1.72% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.51% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 13: predicate.minmaximum_grad 2.89% : 0.000006s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.86% : 0.000004s : 20: predicate.partial_defer_inline 1.31% : 0.000003s : 19: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.16% : 0.000002s : 13: predicate.reduce_eliminate 2.30% : 0.000005s : 36: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.48% : 0.000003s : 23: predicate.replace_applicator 0.52% : 0.000001s : 8: predicate.replace_old_param 0.73% : 0.000001s : 4: predicate.reset_defer_inline 0.90% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.77% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000001s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 1.23% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.40% : 0.000003s : 20: predicate.switch_defer_inline 1.94% : 0.000004s : 28: predicate.switch_layer_defer_inline 4.99% : 0.000010s : 68: predicate.switch_simplify 0.86% : 0.000002s : 13: predicate.tile_eliminate 0.85% : 0.000002s : 13: predicate.transpose_eliminate 1.46% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.62% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.42% : 0.000007s : 31: predicate.tuple_list_get_item_eliminator 1.37% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.59% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.59% : 0.000003s : 23: predicate.tuple_to_list_eliminator_ 2.07% : 0.000004s : 36: predicate.updatestate_pure_node_eliminater 2.87% : 0.000006s : 44: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 4: predicate.value_based_eliminate 0.68% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.82% : 0.000002s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002448 21 71.68% : 0.001755s : 14: func_graph_cloner_run.FuncGraphClonerGraph 28.32% : 0.000693s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 14.916536 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.06% : 0.009610s : 1: add_attr 0.06% : 0.009590s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000072s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000167s : 1: auto_monad 0.00% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.01% : 0.001041s : 1: bootstrap 0.00% : 0.000041s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000036s : 1: environ_conv 0.00% : 0.000034s : 1: event_method 0.00% : 0.000207s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000015s : 1: label_micro_interleaved_index 0.00% : 0.000671s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.44% : 0.065925s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000037s : 1: opt.transform.mutable_eliminate 0.01% : 0.001201s : 78: opt.transform.opt_a 0.00% : 0.000030s : 1: opt.transform.opt_after_cconv 0.00% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000235s : 28: opt.transform.opt_b 0.00% : 0.000055s : 2: opt.transform.opt_trans_graph 0.00% : 0.000037s : 4: opt.transform.symbol_engine_opt 0.03% : 0.005172s : 1: opt_a 0.00% : 0.000112s : 1: opt_after_cconv 0.00% : 0.000510s : 1: opt_after_jit_grad 0.00% : 0.000388s : 1: opt_b 0.49% : 0.073390s : 1: optimize 0.00% : 0.000042s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000053s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000021s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000030s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000018s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000061s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000020s : 1: remove_dup_value 0.01% : 0.002066s : 1: renormalize.infer 0.01% : 0.000808s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000035s : 1: rewriter_after_opt_a 0.00% : 0.000239s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000027s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000080s : 1: symbol_engine_optimizer 93.94% : 14.012172s : 1: task_emit 0.00% : 0.000085s : 1: tuple_transform 4.91% : 0.731673s : 1: type_inference 0.00% : 0.000147s : 1: validate TotalTime = 0.983697, [24] [bootstrap]: 0.0808752 [type_inference]: 0.619503 [event_method]: 0.0117159 [auto_monad]: 0.00025388 [graph_reusing]: 1.127e-05 [inline]: 3.76001e-06 [add_attr]: 0.00988112, [1] [add_attr_with_inline]: 0.00986357, [1] [Cycle 1]: 0.00019102, [2] [tag_attr]: 6.044e-05 [meta_addattr_fg_expand]: 1.473e-05 [parallel-infer-symbol]: 3.75e-06 [pre_auto_parallel]: 9.463e-05 [insert-virtual-dataset]: 2.93e-06 [parallel-infer-symbol-second]: 1.14e-06 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.0804434, [53] [py_interpret_to_execute]: 1.1e-05 [rewriter_before_opt_a]: 0.00040661 [opt_a]: 0.0762854, [3] [Cycle 1]: 0.057921, [45] [expand_dump_flag]: 6.29001e-06 [switch_simplify]: 0.00020714 [loop_unroll]: 6.691e-05 [a_1]: 0.00169736 [with_stream_mark]: 4.402e-05 [recompute_prepare]: 3.438e-05 [updatestate_depend_eliminate]: 9.31e-06 [updatestate_assign_eliminate]: 7.67998e-06 [updatestate_loads_eliminate]: 7.50003e-06 [parameter_eliminate]: 3.85e-06 [a_2]: 0.00024062 [accelerated_algorithm]: 1.905e-05 [shard]: 3.06001e-06 [meta_shard_fg_expand]: 6.42001e-06 [shard_inline]: 1.751e-05 [merge_send_recv]: 2.302e-05 [auto_parallel]: 1.706e-05 [parallel]: 0.00013836 [flash_sp]: 1.919e-05 [merge_comm]: 1.247e-05 [allreduce_fusion]: 8.48001e-06 [matmul_add_comm_reduction]: 3.65e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 2.408e-05 [virtual_dataset]: 1.527e-05 [get_grad_eliminate_]: 1.564e-05 [virtual_output]: 1.501e-05 [merge_forward]: 1.127e-05 [cell_reuse_recompute_pass]: 1.60999e-06 [offload_activation]: 2.089e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.645e-05 [merge_recompute_call_nodes]: 1.86e-06 [before_grad]: 2.746e-05 [set_forward_comm_id_for_comm_node_pass]: 1.093e-05 [meta_fg_expand]: 0.00284159 [flash_sp_send_recv_attached]: 6.36e-06 [receive_attached]: 3.08e-06 [after_resolve]: 8.356e-05 [a_after_grad]: 9.836e-05 [renormalize]: 0.0506697 [add_forward_monad_depend]: 2.138e-05 [auto_monad_grad]: 9.40001e-06 [auto_monad_eliminator]: 8.519e-05 [cse]: 0.00038968 [a_3]: 0.00043933 [Cycle 2]: 0.0172065, [45] [expand_dump_flag]: 4.47998e-06 [switch_simplify]: 0.0120444 [loop_unroll]: 7.122e-05 [a_1]: 0.0018331 [with_stream_mark]: 4.301e-05 [recompute_prepare]: 2.044e-05 [updatestate_depend_eliminate]: 8.31002e-06 [updatestate_assign_eliminate]: 6.36998e-06 [updatestate_loads_eliminate]: 5.14e-06 [parameter_eliminate]: 3.63999e-06 [a_2]: 0.00016056 [accelerated_algorithm]: 1.494e-05 [shard]: 2.42001e-06 [meta_shard_fg_expand]: 5.30999e-06 [shard_inline]: 1.088e-05 [merge_send_recv]: 1.375e-05 [auto_parallel]: 1.404e-05 [parallel]: 1.14e-05 [flash_sp]: 4.58999e-06 [merge_comm]: 6.32001e-06 [allreduce_fusion]: 5.45001e-06 [matmul_add_comm_reduction]: 1.598e-05 [allreduce_slice_to_reducescatter]: 1.22e-06 [virtual_shard_identity]: 1.596e-05 [virtual_dataset]: 1.045e-05 [get_grad_eliminate_]: 1.059e-05 [virtual_output]: 1.018e-05 [merge_forward]: 8.14997e-06 [cell_reuse_recompute_pass]: 2.62001e-06 [offload_activation]: 1.67e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.055e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 2.013e-05 [set_forward_comm_id_for_comm_node_pass]: 7.5e-06 [meta_fg_expand]: 0.00012061 [flash_sp_send_recv_attached]: 2.15002e-06 [receive_attached]: 3.03998e-06 [after_resolve]: 2.627e-05 [a_after_grad]: 1.844e-05 [renormalize]: 0.00187128 [add_forward_monad_depend]: 3.767e-05 [auto_monad_grad]: 2.83e-06 [auto_monad_eliminator]: 3.009e-05 [cse]: 0.00011751 [a_3]: 9.361e-05 [Cycle 3]: 0.00113351, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 1.432e-05 [loop_unroll]: 1.189e-05 [a_1]: 0.00027029 [with_stream_mark]: 2.443e-05 [recompute_prepare]: 1.222e-05 [updatestate_depend_eliminate]: 6.86999e-06 [updatestate_assign_eliminate]: 5.79e-06 [updatestate_loads_eliminate]: 4.90999e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 0.00013133 [accelerated_algorithm]: 1.217e-05 [shard]: 2.57001e-06 [meta_shard_fg_expand]: 2.66999e-06 [shard_inline]: 9.94999e-06 [merge_send_recv]: 1.092e-05 [auto_parallel]: 1.24e-05 [parallel]: 9.05999e-06 [flash_sp]: 1.16002e-06 [merge_comm]: 5.94e-06 [allreduce_fusion]: 5.84999e-06 [matmul_add_comm_reduction]: 1.117e-05 [allreduce_slice_to_reducescatter]: 5.49975e-07 [virtual_shard_identity]: 1.264e-05 [virtual_dataset]: 9.25999e-06 [get_grad_eliminate_]: 1.04e-05 [virtual_output]: 9.96e-06 [merge_forward]: 6.46e-06 [cell_reuse_recompute_pass]: 3.36999e-06 [offload_activation]: 1.489e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.343e-05 [merge_recompute_call_nodes]: 1.29e-06 [before_grad]: 1.833e-05 [set_forward_comm_id_for_comm_node_pass]: 7.48e-06 [meta_fg_expand]: 5.86998e-06 [flash_sp_send_recv_attached]: 1.55999e-06 [receive_attached]: 1.93002e-06 [after_resolve]: 1.65e-05 [a_after_grad]: 1.753e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.76999e-06 [auto_monad_grad]: 3.03e-06 [auto_monad_eliminator]: 1.753e-05 [cse]: 6.909e-05 [a_3]: 6.843e-05 [py_interpret_to_execute_after_opt_a]: 1.144e-05 [slice_cell_reuse_recomputed_activation]: 2.76999e-06 [rewriter_after_opt_a]: 3.591e-05 [convert_after_rewriter]: 1.59e-06 [order_py_execute_after_rewriter]: 1.33002e-06 [mutable_eliminate]: 0.00099583 [opt_b]: 0.00040919, [1] [Cycle 1]: 0.00039738, [7] [b_1]: 0.00022708 [b_2]: 1.3e-05 [updatestate_depend_eliminate]: 1.474e-05 [updatestate_assign_eliminate]: 5.02e-06 [updatestate_loads_eliminate]: 5.12999e-06 [renormalize]: 1.77001e-06 [cse]: 8.233e-05 [optimize_parallel_all_gather_comm]: 3.137e-05 [overlap_param_gather]: 2.16998e-06 [cconv]: 4.149e-05 [loop_unroll]: 0.00066022 [opt_after_cconv]: 0.00018797, [1] [Cycle 1]: 0.00017998, [7] [c_1]: 5.907e-05 [parameter_eliminate]: 6.71e-06 [updatestate_depend_eliminate]: 1.074e-05 [updatestate_assign_eliminate]: 4.78001e-06 [updatestate_loads_eliminate]: 5.20001e-06 [cse]: 5.403e-05 [renormalize]: 6.29982e-07 [remove_dup_value]: 0.00029299 [tuple_transform]: 0.00015793, [1] [Cycle 1]: 0.00015018, [4] [d_1]: 9.841e-05 [none_parameter_eliminate]: 3.75998e-06 [renormalize]: 7.40023e-07 [switch_simplify]: 1.656e-05 [partial_unused_args_eliminate]: 2.80002e-06 [add_recomputation]: 0.00014753 [cse_after_recomputation]: 6.112e-05, [1] [Cycle 1]: 5.399e-05, [1] [cse]: 4.541e-05 [environ_conv]: 1.489e-05 [swap_dp_allreduce_reducescatter]: 1.078e-05 [bias_add_comm_swap]: 4.93001e-06 [label_micro_interleaved_index]: 8.18001e-06 [label_fine_grained_interleaved_index]: 3.21999e-06 [merge_cast_opt]: 1.47999e-06 [slice_recompute_activation]: 2.38002e-06 [micro_interleaved_order_control]: 3.01999e-06 [assign_add_opt]: 1.54e-06 [ForceFp32Comm]: 1.12999e-06 [remove_cast_before_assign_add]: 1.19e-06 [full_micro_interleaved_order_control]: 2.47001e-06 [reorder_send_recv_between_fp_bp]: 2.53998e-06 [comm_op_add_attrs]: 1.09e-06 [add_comm_op_reuse_tag]: 1.15001e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.28002e-06 [overlap_opt_shard_in_pipeline]: 4.371e-05 [overlap_opt_shard_grad_in_pipeline]: 2.12001e-06 [control_data_broadcast_order]: 3.132e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 7.57998e-06 [overlap_recompute_and_grad_model_parallel]: 7.51001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.20001e-06 [overlap_recompute_comm]: 2.49001e-06 [overlap_grad_ring_attention]: 7.66999e-06 [overlap_grad_flash_sp]: 3.592e-05 [begin_end_overlap_inline]: 9.89996e-07 [split_matmul_comm_elemetwise]: 2.83998e-06 [split_layernorm_comm]: 2.06e-06 [handle_group_info]: 1.03001e-06 [symbol_engine_optimizer]: 0.00018189, [1] [Cycle 1]: 0.00017458, [6] [build]: 6.38998e-06 [elim_shapecalc]: 2.297e-05 [elim_not_effective]: 2.512e-05 [opt_reshape]: 2.228e-05 [fold_const_symbol]: 2.086e-05 [renormalize]: 3.19997e-07 [detach_backward]: 3.42002e-06 [pipeline_parallel_scheduler]: 1.71e-06 [auto_monad_reorder]: 4.228e-05 [get_jit_bprop_graph]: 2.54999e-06 [rewriter_after_jit_bprop_graph]: 8.20999e-06 [opt_after_jit_grad]: 0.00089348 [validate]: 8.695e-05 [backend_pass]: 1.22999e-06 [task_emit]: 0.179363 [execute]: 1.07e-05 Sums bootstrap : 0.080875s : 8.32% type_inference : 0.619503s : 63.76% event_method : 0.011716s : 1.21% auto_monad : 0.000254s : 0.03% graph_reusing : 0.000011s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000060s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000015s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000095s : 0.01% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000011s : 0.00% optimize.rewriter_before_opt_a : 0.000407s : 0.04% optimize.opt_a.expand_dump_flag : 0.000014s : 0.00% optimize.opt_a.switch_simplify : 0.012266s : 1.26% optimize.opt_a.loop_unroll : 0.000150s : 0.02% optimize.opt_a.a_1 : 0.003801s : 0.39% optimize.opt_a.with_stream_mark : 0.000111s : 0.01% optimize.opt_a.recompute_prepare : 0.000067s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000024s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000020s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000018s : 0.00% optimize.opt_a.parameter_eliminate : 0.000009s : 0.00% optimize.opt_a.a_2 : 0.000533s : 0.05% optimize.opt_a.accelerated_algorithm : 0.000046s : 0.00% optimize.opt_a.shard : 0.000008s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000014s : 0.00% optimize.opt_a.shard_inline : 0.000038s : 0.00% optimize.opt_a.merge_send_recv : 0.000048s : 0.00% optimize.opt_a.auto_parallel : 0.000043s : 0.00% optimize.opt_a.parallel : 0.000159s : 0.02% optimize.opt_a.flash_sp : 0.000025s : 0.00% optimize.opt_a.merge_comm : 0.000025s : 0.00% optimize.opt_a.allreduce_fusion : 0.000020s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000064s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000053s : 0.01% optimize.opt_a.virtual_dataset : 0.000035s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000037s : 0.00% optimize.opt_a.virtual_output : 0.000035s : 0.00% optimize.opt_a.merge_forward : 0.000026s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000008s : 0.00% optimize.opt_a.offload_activation : 0.000052s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000090s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000066s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000026s : 0.00% optimize.opt_a.meta_fg_expand : 0.002968s : 0.31% optimize.opt_a.flash_sp_send_recv_attached : 0.000010s : 0.00% optimize.opt_a.receive_attached : 0.000008s : 0.00% optimize.opt_a.after_resolve : 0.000126s : 0.01% optimize.opt_a.a_after_grad : 0.000134s : 0.01% optimize.opt_a.renormalize : 0.052541s : 5.41% optimize.opt_a.add_forward_monad_depend : 0.000062s : 0.01% optimize.opt_a.auto_monad_grad : 0.000015s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000133s : 0.01% optimize.opt_a.cse : 0.000576s : 0.06% optimize.opt_a.a_3 : 0.000601s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000036s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000996s : 0.10% optimize.opt_b.b_1 : 0.000227s : 0.02% optimize.opt_b.b_2 : 0.000013s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_b.renormalize : 0.000002s : 0.00% optimize.opt_b.cse : 0.000082s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000041s : 0.00% optimize.loop_unroll : 0.000660s : 0.07% optimize.opt_after_cconv.c_1 : 0.000059s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.cse : 0.000054s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000293s : 0.03% optimize.tuple_transform.d_1 : 0.000098s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000004s : 0.00% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.tuple_transform.switch_simplify : 0.000017s : 0.00% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000148s : 0.02% optimize.cse_after_recomputation.cse : 0.000045s : 0.00% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.00% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000044s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000031s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.00% optimize.overlap_grad_flash_sp : 0.000036s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000006s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000023s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000025s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000022s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000021s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000042s : 0.00% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000893s : 0.09% validate : 0.000087s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 0.179363s : 18.46% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.001205 178 0.31% : 0.000004s : 5: substitution.elim_not_effective 1.01% : 0.000012s : 11: substitution.float_depend_g_call 0.35% : 0.000004s : 2: substitution.float_tuple_getitem_switch 0.26% : 0.000003s : 5: substitution.fold_const_symbol 0.99% : 0.000012s : 7: substitution.graph_param_transform 0.37% : 0.000004s : 2: substitution.incorporate_call 0.18% : 0.000002s : 2: substitution.incorporate_call_switch 65.89% : 0.000794s : 19: substitution.inline 2.22% : 0.000027s : 2: substitution.inline_without_move 1.22% : 0.000015s : 18: substitution.j_node_and_user_rematch 1.35% : 0.000016s : 7: substitution.minmaximum_grad 0.17% : 0.000002s : 1: substitution.opt_reshape 2.05% : 0.000025s : 11: substitution.partial_eliminate 1.70% : 0.000021s : 18: substitution.remove_not_recompute_node 3.24% : 0.000039s : 9: substitution.replace_applicator 1.42% : 0.000017s : 9: substitution.replace_old_param 1.91% : 0.000023s : 2: substitution.reshape_eliminate 0.44% : 0.000005s : 1: substitution.set_cell_output_no_recompute 2.85% : 0.000034s : 3: substitution.switch_simplify 2.48% : 0.000030s : 7: substitution.tuple_list_convert_item_index_to_positive 1.02% : 0.000012s : 7: substitution.tuple_list_get_item_const_eliminator 1.33% : 0.000016s : 7: substitution.tuple_list_get_item_depend_reorder 5.73% : 0.000069s : 16: substitution.tuple_list_get_item_eliminator 1.52% : 0.000018s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.619337 2 99.00% : 0.613169s : 1: type_inference.infer 1.00% : 0.006168s : 1: type_inference.specialize ------[replace.] 0.000373 29 59.64% : 0.000222s : 19: replace.inline 15.79% : 0.000059s : 3: replace.switch_simplify 24.57% : 0.000092s : 7: replace.tuple_list_get_item_eliminator ------[match.] 0.000841 29 92.93% : 0.000781s : 19: match.inline 3.66% : 0.000031s : 3: match.switch_simplify 3.42% : 0.000029s : 7: match.tuple_list_get_item_eliminator ------[predicate.] 0.000873 4861 1.01% : 0.000009s : 58: predicate.accumulaten_eliminater 0.45% : 0.000004s : 7: predicate.ad_related_special_op_eliminate 0.48% : 0.000004s : 26: predicate.addn_check_dump 1.14% : 0.000010s : 58: predicate.addn_zero_filter 1.00% : 0.000009s : 58: predicate.adjust_all_reduce_mul_add 2.30% : 0.000020s : 84: predicate.arithmetic_simplify 1.12% : 0.000010s : 58: predicate.cast_eliminate 1.10% : 0.000010s : 60: predicate.check_bprop_eliminate 0.50% : 0.000004s : 26: predicate.compare_switch_simplify 0.07% : 0.000001s : 7: predicate.const_output_eliminate 0.42% : 0.000004s : 26: predicate.depend_value_elim 0.94% : 0.000008s : 58: predicate.dict_get_item_const_eliminator 1.50% : 0.000013s : 58: predicate.dict_get_item_eliminator 0.97% : 0.000008s : 58: predicate.dict_set_item_eliminator 0.37% : 0.000003s : 14: predicate.dumpgradient_eliminate 0.10% : 0.000001s : 7: predicate.elim_not_effective 0.22% : 0.000002s : 7: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000010s : 65: predicate.environ_add_const_eliminate 1.06% : 0.000009s : 65: predicate.environ_get_add_eliminate 1.07% : 0.000009s : 65: predicate.environ_get_depend_swap 1.86% : 0.000016s : 91: predicate.environ_get_eliminate 1.10% : 0.000010s : 65: predicate.environ_get_set_eliminate 1.39% : 0.000012s : 84: predicate.exchange_switch_depend_value 2.21% : 0.000019s : 84: predicate.float_depend_g_call 0.48% : 0.000004s : 26: predicate.float_environ_get_switch 0.66% : 0.000006s : 33: predicate.float_tuple_getitem_switch 0.07% : 0.000001s : 7: predicate.fold_const_symbol 0.70% : 0.000006s : 26: predicate.get_grad_eliminate 0.11% : 0.000001s : 7: predicate.graph_param_transform 0.48% : 0.000004s : 26: predicate.incorporate_call 0.37% : 0.000003s : 26: predicate.incorporate_call_switch 4.95% : 0.000043s : 208: predicate.inline 1.20% : 0.000010s : 50: predicate.inline_without_move 0.21% : 0.000002s : 26: predicate.j_node_and_user_rematch 0.86% : 0.000007s : 26: predicate.less_batch_normalization 1.77% : 0.000015s : 79: predicate.list_to_tuple_eliminator_ 2.46% : 0.000021s : 137: predicate.load_eliminater 0.39% : 0.000003s : 7: predicate.loop_unroll_after_grad 2.96% : 0.000026s : 122: predicate.loop_unroll_before_grad 1.37% : 0.000012s : 72: predicate.make_slice_get_slice_eliminator 0.55% : 0.000005s : 26: predicate.merge_addn 1.04% : 0.000009s : 60: predicate.micro_step_allgather_replace 1.09% : 0.000010s : 60: predicate.mini_step_allgather_replace 1.07% : 0.000009s : 58: predicate.minmaximum_grad 0.56% : 0.000005s : 7: predicate.mutable_eliminate 0.28% : 0.000002s : 7: predicate.opt_reshape 0.19% : 0.000002s : 7: predicate.parallel_virtual_node 2.47% : 0.000022s : 84: predicate.partial_defer_inline 1.23% : 0.000011s : 72: predicate.partial_eliminate 1.14% : 0.000010s : 58: predicate.print_const_string_wrapper 0.55% : 0.000005s : 26: predicate.reduce_all_const_elim 1.35% : 0.000012s : 58: predicate.reduce_eliminate 2.24% : 0.000020s : 137: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000003s : 26: predicate.remove_not_recompute_node 1.51% : 0.000013s : 125: predicate.replace_applicator 0.60% : 0.000005s : 50: predicate.replace_old_param 0.12% : 0.000001s : 7: predicate.reset_defer_inline 1.21% : 0.000011s : 58: predicate.reshape_eliminate 1.12% : 0.000010s : 60: predicate.row_tensor_add_zeros_like 0.18% : 0.000002s : 7: predicate.row_tensor_eliminate 1.50% : 0.000013s : 60: predicate.same_eliminate 0.28% : 0.000002s : 26: predicate.set_cell_output_no_recompute 0.72% : 0.000006s : 26: predicate.shard_identity_eliminate 0.32% : 0.000003s : 14: predicate.special_op_eliminate 0.49% : 0.000004s : 26: predicate.specialize_transform 1.44% : 0.000013s : 60: predicate.split_environ_get_set_with_tuple_value 1.30% : 0.000011s : 50: predicate.stack_unstack_eliminate 0.14% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.70% : 0.000015s : 84: predicate.switch_defer_inline 2.58% : 0.000023s : 144: predicate.switch_layer_defer_inline 7.87% : 0.000069s : 245: predicate.switch_simplify 1.08% : 0.000009s : 58: predicate.tile_eliminate 0.97% : 0.000008s : 58: predicate.transpose_eliminate 1.62% : 0.000014s : 72: predicate.tuple_list_convert_item_index_to_positive 1.39% : 0.000012s : 72: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000012s : 72: predicate.tuple_list_get_item_depend_reorder 2.77% : 0.000024s : 105: predicate.tuple_list_get_item_eliminator 1.24% : 0.000011s : 72: predicate.tuple_list_get_set_item_eliminator 2.00% : 0.000017s : 98: predicate.tuple_list_set_item_eliminator 1.43% : 0.000012s : 79: predicate.tuple_to_list_eliminator_ 2.02% : 0.000018s : 137: predicate.updatestate_pure_node_eliminater 2.69% : 0.000023s : 163: predicate.updatestate_useless_node_eliminater 0.16% : 0.000001s : 7: predicate.value_based_eliminate 0.58% : 0.000005s : 26: predicate.virtual_dataset_eliminate 0.66% : 0.000006s : 26: predicate.virtual_output_eliminate 0.10% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.19% : 0.000002s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.006372 48 73.49% : 0.004683s : 25: func_graph_cloner_run.FuncGraphClonerGraph 26.51% : 0.001689s : 23: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 1.144741 237 0.00% : 0.000005s : 1: ForceFp32Comm 0.86% : 0.009888s : 1: add_attr 0.86% : 0.009870s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000153s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000267s : 1: auto_monad 0.00% : 0.000047s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000006s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 7.07% : 0.080962s : 1: bootstrap 0.00% : 0.000045s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000036s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.01% : 0.000065s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000007s : 1: detach_backward 0.00% : 0.000019s : 1: environ_conv 1.03% : 0.011776s : 1: event_method 0.00% : 0.000019s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000016s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.06% : 0.000673s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.09% : 0.001013s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.00% : 0.000028s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000035s : 1: opt.transform.mutable_eliminate 1.56% : 0.017846s : 117: opt.transform.opt_a 0.00% : 0.000057s : 1: opt.transform.opt_after_cconv 0.00% : 0.000049s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000200s : 28: opt.transform.opt_b 0.01% : 0.000110s : 2: opt.transform.opt_trans_graph 0.01% : 0.000084s : 4: opt.transform.symbol_engine_opt 6.66% : 0.076291s : 1: opt_a 0.02% : 0.000192s : 1: opt_after_cconv 0.08% : 0.000908s : 1: opt_after_jit_grad 0.04% : 0.000414s : 1: opt_b 7.03% : 0.080450s : 1: optimize 0.00% : 0.000036s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000040s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000048s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.01% : 0.000101s : 1: pre_auto_parallel 0.00% : 0.000015s : 1: py_interpret_to_execute 0.00% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000310s : 1: remove_dup_value 4.25% : 0.048687s : 2: renormalize.infer 0.33% : 0.003814s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000013s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000041s : 1: rewriter_after_opt_a 0.04% : 0.000419s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.00% : 0.000015s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000185s : 1: symbol_engine_optimizer 15.67% : 0.179391s : 1: task_emit 0.01% : 0.000162s : 1: tuple_transform 54.12% : 0.619543s : 1: type_inference 0.01% : 0.000143s : 1: validate group_cases_3 have all been run, results of sub cases are below: case: (1,) {} pass. case: ('pynative',) {} pass. case: (1,) {} pass. case: (mindspore.float32, 0, False) {} pass. case: (mindspore.float32, 1, True) {} pass. case: (mindspore.float32, 0, True) {} pass. case: (0,) {} pass. case: ('KBK',) {} pass. ops group_cases_4 with 8 cases start to running, all cases are below: case: (, mindspore.float32, 1, False) case: (, mindspore.float16, 0, True) case: (, mindspore.float16, 0, False) case: (, mindspore.float16, 1, True) case: (, mindspore.float16, 1, False) case: (, mindspore.bfloat16, 0, True) case: (, mindspore.bfloat16, 0, False) case: (, mindspore.bfloat16, 1, True) ops group_cases_4 total running memory: 576M, memory threshold: 51200M TotalTime = 2.71263, [33] [bootstrap]: 0.00192367 [type_inference]: 0.217463 [event_method]: 0.00015454 [auto_monad]: 0.00030057 [graph_reusing]: 7.85e-06 [pre_auto_parallel]: 1.242e-05 [py_interpret_to_execute]: 6.221e-05 [rewriter_before_opt_a]: 0.00019587 [expand_dump_flag]: 3.56999e-06 [jit_opt_a]: 0.109283, [3] [Cycle 1]: 0.0514074, [27] [switch_simplify]: 0.00017171 [loop_unroll]: 6.556e-05 [a_1]: 0.00155518 [with_stream_mark]: 4.766e-05 [recompute_prepare]: 3.332e-05 [updatestate_depend_eliminate]: 2.334e-05 [updatestate_assign_eliminate]: 1.804e-05 [updatestate_loads_eliminate]: 9.90002e-06 [parameter_eliminate]: 3.41001e-06 [specialize_transform]: 2.289e-05 [updatestate_useless_node_eliminater]: 2.083e-05 [accelerated_algorithm]: 7.956e-05 [meta_shard_fg_expand]: 5.08002e-06 [get_grad_eliminate_]: 2.183e-05 [merge_forward]: 1.306e-05 [cell_reuse_recompute_pass]: 1.27e-06 [cell_reuse_handle_not_recompute_node_pass]: 4.438e-05 [j_node_and_user_rematch]: 4.565e-05 [meta_fg_expand]: 0.00291578 [replace_old_param]: 0.00010286 [inline_without_move]: 8.481e-05 [renormalize]: 0.0451135 [add_forward_monad_depend]: 6.021e-05 [auto_monad_grad]: 9.40001e-06 [auto_monad_eliminator]: 9.463e-05 [cse]: 0.00033324 [replace_applicator]: 0.00012556 [Cycle 2]: 0.0255038, [27] [switch_simplify]: 6.437e-05 [loop_unroll]: 6.296e-05 [a_1]: 0.00218931 [with_stream_mark]: 3.758e-05 [recompute_prepare]: 2.622e-05 [updatestate_depend_eliminate]: 1.163e-05 [updatestate_assign_eliminate]: 8.60999e-06 [updatestate_loads_eliminate]: 8.01001e-06 [parameter_eliminate]: 2.86999e-06 [specialize_transform]: 1.781e-05 [updatestate_useless_node_eliminater]: 1.535e-05 [accelerated_algorithm]: 2.525e-05 [meta_shard_fg_expand]: 4.83001e-06 [get_grad_eliminate_]: 1.432e-05 [merge_forward]: 9.74e-06 [cell_reuse_recompute_pass]: 2.07999e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.536e-05 [j_node_and_user_rematch]: 2.79e-05 [meta_fg_expand]: 0.00013926 [replace_old_param]: 3.597e-05 [inline_without_move]: 1.591e-05 [renormalize]: 0.0222179 [add_forward_monad_depend]: 1.24e-05 [auto_monad_grad]: 2.44001e-06 [auto_monad_eliminator]: 3.507e-05 [cse]: 0.00018038 [replace_applicator]: 3.89e-05 [Cycle 3]: 0.00096621, [27] [switch_simplify]: 1.834e-05 [loop_unroll]: 1.672e-05 [a_1]: 0.00047216 [with_stream_mark]: 2.315e-05 [recompute_prepare]: 1.67e-05 [updatestate_depend_eliminate]: 9.25001e-06 [updatestate_assign_eliminate]: 8.18001e-06 [updatestate_loads_eliminate]: 8.05999e-06 [parameter_eliminate]: 2.14e-06 [specialize_transform]: 1.537e-05 [updatestate_useless_node_eliminater]: 1.442e-05 [accelerated_algorithm]: 2.057e-05 [meta_shard_fg_expand]: 3.48e-06 [get_grad_eliminate_]: 1.415e-05 [merge_forward]: 8.90001e-06 [cell_reuse_recompute_pass]: 3.36999e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.051e-05 [j_node_and_user_rematch]: 2.458e-05 [meta_fg_expand]: 5.83997e-06 [replace_old_param]: 2.272e-05 [inline_without_move]: 1.582e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.12001e-06 [auto_monad_grad]: 1.54e-06 [auto_monad_eliminator]: 1.869e-05 [cse]: 4.775e-05 [replace_applicator]: 1.54e-05 [py_interpret_to_execute_after_opt_a]: 2.437e-05 [rewriter_after_opt_a]: 0.00023734 [convert_after_rewriter]: 1.621e-05 [order_py_execute_after_rewriter]: 9.63002e-06 [mutable_eliminate]: 0.00087057 [jit_opt_b]: 0.00013599, [1] [Cycle 1]: 0.00012647, [2] [frontend_op_eliminate]: 4.635e-05 [inline_after_opt_a]: 6.675e-05 [cconv]: 3.54e-05 [loop_unroll]: 0.00052701 [jit_opt_after_cconv]: 0.00034761, [1] [Cycle 1]: 0.00034067, [11] [c_1]: 7.479e-05 [parameter_eliminate]: 4.68001e-06 [updatestate_depend_eliminate]: 1.572e-05 [updatestate_assign_eliminate]: 9.42999e-06 [updatestate_loads_eliminate]: 9.25999e-06 [cse]: 7.388e-05 [call_graph_tuple_transform]: 4.708e-05 [tuple_list_get_item_eliminator]: 3.07e-05 [none_parameter_eliminate]: 2.01998e-06 [renormalize]: 9.70002e-07 [switch_simplify]: 1.668e-05 [remove_dup_value]: 8.852e-05 [partial_unused_args_eliminate]: 2.94001e-06 [environ_conv]: 2.422e-05 [add_recomputation]: 0.00012887 [cse_after_recomputation]: 5.79e-05, [1] [Cycle 1]: 5.061e-05, [1] [cse]: 4.106e-05 [auto_monad_reorder]: 4.718e-05 [get_jit_bprop_graph]: 2.44001e-06 [rewriter_after_jit_bprop_graph]: 5.61998e-06 [opt_after_jit_grad]: 0.00064093 [symbol_engine_optimizer]: 0.00016272, [1] [Cycle 1]: 0.00015556, [6] [build]: 3.298e-05 [elim_shapecalc]: 1.921e-05 [elim_not_effective]: 3.189e-05 [opt_reshape]: 1.628e-05 [fold_const_symbol]: 2.488e-05 [renormalize]: 1.20999e-06 [validate]: 0.00014761 [backend_pass]: 1.34e-06 [task_emit]: 2.37927 [execute]: 9.62001e-06 Sums bootstrap : 0.001924s : 0.07% type_inference : 0.217463s : 8.11% event_method : 0.000155s : 0.01% auto_monad : 0.000301s : 0.01% graph_reusing : 0.000008s : 0.00% pre_auto_parallel : 0.000012s : 0.00% py_interpret_to_execute : 0.000062s : 0.00% rewriter_before_opt_a : 0.000196s : 0.01% expand_dump_flag : 0.000004s : 0.00% jit_opt_a.switch_simplify : 0.000254s : 0.01% jit_opt_a.loop_unroll : 0.000145s : 0.01% jit_opt_a.a_1 : 0.004217s : 0.16% jit_opt_a.with_stream_mark : 0.000108s : 0.00% jit_opt_a.recompute_prepare : 0.000076s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000044s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000035s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000026s : 0.00% jit_opt_a.parameter_eliminate : 0.000008s : 0.00% jit_opt_a.specialize_transform : 0.000056s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000051s : 0.00% jit_opt_a.accelerated_algorithm : 0.000125s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000013s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000050s : 0.00% jit_opt_a.merge_forward : 0.000032s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000110s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000098s : 0.00% jit_opt_a.meta_fg_expand : 0.003061s : 0.11% jit_opt_a.replace_old_param : 0.000162s : 0.01% jit_opt_a.inline_without_move : 0.000117s : 0.00% jit_opt_a.renormalize : 0.067331s : 2.51% jit_opt_a.add_forward_monad_depend : 0.000075s : 0.00% jit_opt_a.auto_monad_grad : 0.000013s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000148s : 0.01% jit_opt_a.cse : 0.000561s : 0.02% jit_opt_a.replace_applicator : 0.000180s : 0.01% py_interpret_to_execute_after_opt_a : 0.000024s : 0.00% rewriter_after_opt_a : 0.000237s : 0.01% convert_after_rewriter : 0.000016s : 0.00% order_py_execute_after_rewriter : 0.000010s : 0.00% mutable_eliminate : 0.000871s : 0.03% jit_opt_b.frontend_op_eliminate : 0.000046s : 0.00% jit_opt_b.inline_after_opt_a : 0.000067s : 0.00% cconv : 0.000035s : 0.00% loop_unroll : 0.000527s : 0.02% jit_opt_after_cconv.c_1 : 0.000075s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000016s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000009s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000009s : 0.00% jit_opt_after_cconv.cse : 0.000074s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000047s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000031s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000017s : 0.00% remove_dup_value : 0.000089s : 0.00% partial_unused_args_eliminate : 0.000003s : 0.00% environ_conv : 0.000024s : 0.00% add_recomputation : 0.000129s : 0.00% cse_after_recomputation.cse : 0.000041s : 0.00% auto_monad_reorder : 0.000047s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000641s : 0.02% symbol_engine_optimizer.build : 0.000033s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000019s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000032s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000016s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000025s : 0.00% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000148s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 2.379271s : 88.78% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.001259 278 0.41% : 0.000005s : 9: substitution.elim_not_effective 0.32% : 0.000004s : 9: substitution.fold_const_symbol 1.05% : 0.000013s : 13: substitution.graph_param_transform 55.11% : 0.000694s : 13: substitution.inline 2.16% : 0.000027s : 2: substitution.inline_without_move 2.17% : 0.000027s : 29: substitution.j_node_and_user_rematch 4.80% : 0.000060s : 3: substitution.less_batch_normalization 4.13% : 0.000052s : 25: substitution.minmaximum_grad 2.25% : 0.000028s : 5: substitution.partial_eliminate 1.72% : 0.000022s : 29: substitution.remove_not_recompute_node 3.28% : 0.000041s : 10: substitution.replace_applicator 1.73% : 0.000022s : 26: substitution.replace_old_param 0.31% : 0.000004s : 1: substitution.set_cell_output_no_recompute 5.52% : 0.000070s : 25: substitution.tuple_list_convert_item_index_to_positive 3.48% : 0.000044s : 25: substitution.tuple_list_get_item_depend_reorder 11.56% : 0.000146s : 54: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.217287 2 98.82% : 0.214731s : 1: type_inference.infer 1.18% : 0.002556s : 1: type_inference.specialize ------[replace.] 0.000301 32 55.26% : 0.000166s : 13: replace.inline 44.74% : 0.000135s : 19: replace.tuple_list_get_item_eliminator ------[match.] 0.000740 32 92.16% : 0.000682s : 13: match.inline 7.84% : 0.000058s : 19: match.tuple_list_get_item_eliminator ------[predicate.] 0.000729 5185 1.43% : 0.000010s : 83: predicate.accumulaten_eliminater 0.46% : 0.000003s : 13: predicate.ad_related_special_op_eliminate 1.33% : 0.000010s : 83: predicate.addn_check_dump 1.56% : 0.000011s : 83: predicate.addn_zero_filter 2.10% : 0.000015s : 83: predicate.arithmetic_simplify 1.56% : 0.000011s : 83: predicate.cast_eliminate 0.24% : 0.000002s : 13: predicate.check_bprop_eliminate 1.32% : 0.000010s : 83: predicate.compare_switch_simplify 1.36% : 0.000010s : 83: predicate.depend_value_elim 1.43% : 0.000010s : 83: predicate.dict_get_item_const_eliminator 1.43% : 0.000010s : 83: predicate.dict_get_item_eliminator 1.38% : 0.000010s : 83: predicate.dict_set_item_eliminator 0.40% : 0.000003s : 13: predicate.dumpgradient_eliminate 0.15% : 0.000001s : 13: predicate.elim_not_effective 0.29% : 0.000002s : 13: predicate.elim_shapecalc_of_broadcastargs 1.46% : 0.000011s : 83: predicate.environ_add_const_eliminate 1.41% : 0.000010s : 83: predicate.environ_get_add_eliminate 1.37% : 0.000010s : 83: predicate.environ_get_depend_swap 1.39% : 0.000010s : 83: predicate.environ_get_eliminate 1.32% : 0.000010s : 83: predicate.environ_get_set_eliminate 0.15% : 0.000001s : 13: predicate.fold_const_symbol 0.93% : 0.000007s : 47: predicate.get_grad_eliminate 0.13% : 0.000001s : 13: predicate.graph_param_transform 4.59% : 0.000033s : 141: predicate.inline 2.01% : 0.000015s : 80: predicate.inline_without_move 0.47% : 0.000003s : 47: predicate.j_node_and_user_rematch 1.20% : 0.000009s : 47: predicate.less_batch_normalization 1.80% : 0.000013s : 102: predicate.list_to_tuple_eliminator_ 2.08% : 0.000015s : 115: predicate.load_eliminater 0.52% : 0.000004s : 13: predicate.loop_unroll_after_grad 2.94% : 0.000021s : 151: predicate.loop_unroll_before_grad 1.74% : 0.000013s : 96: predicate.make_slice_get_slice_eliminator 1.33% : 0.000010s : 83: predicate.merge_addn 1.43% : 0.000010s : 83: predicate.minmaximum_grad 0.53% : 0.000004s : 13: predicate.mutable_eliminate 0.29% : 0.000002s : 13: predicate.opt_reshape 2.56% : 0.000019s : 115: predicate.partial_eliminate 1.41% : 0.000010s : 83: predicate.print_const_string_wrapper 1.93% : 0.000014s : 83: predicate.reduce_eliminate 1.87% : 0.000014s : 102: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000004s : 47: predicate.remove_not_recompute_node 2.85% : 0.000021s : 194: predicate.replace_applicator 1.10% : 0.000008s : 80: predicate.replace_old_param 0.15% : 0.000001s : 13: predicate.reset_defer_inline 1.53% : 0.000011s : 83: predicate.reshape_eliminate 1.41% : 0.000010s : 83: predicate.row_tensor_add_zeros_like 0.33% : 0.000002s : 13: predicate.row_tensor_eliminate 1.54% : 0.000011s : 83: predicate.same_eliminate 0.58% : 0.000004s : 47: predicate.set_cell_output_no_recompute 0.51% : 0.000004s : 26: predicate.special_op_eliminate 1.07% : 0.000008s : 47: predicate.specialize_transform 1.60% : 0.000012s : 83: predicate.split_environ_get_set_with_tuple_value 1.39% : 0.000010s : 83: predicate.stack_unstack_eliminate 0.26% : 0.000002s : 13: predicate.switch_call_monad_eliminater 2.57% : 0.000019s : 115: predicate.switch_defer_inline 2.27% : 0.000017s : 115: predicate.switch_layer_defer_inline 5.79% : 0.000042s : 279: predicate.switch_simplify 1.36% : 0.000010s : 83: predicate.tile_eliminate 1.50% : 0.000011s : 83: predicate.transpose_eliminate 1.87% : 0.000014s : 83: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000012s : 83: predicate.tuple_list_get_item_depend_reorder 3.86% : 0.000028s : 128: predicate.tuple_list_get_item_eliminator 1.79% : 0.000013s : 83: predicate.tuple_list_set_item_eliminator 1.83% : 0.000013s : 102: predicate.tuple_to_list_eliminator_ 1.93% : 0.000014s : 115: predicate.updatestate_pure_node_eliminater 3.00% : 0.000022s : 162: predicate.updatestate_useless_node_eliminater 1.83% : 0.000013s : 83: predicate.value_based_eliminate 0.24% : 0.000002s : 13: predicate.virtual_view_grad_eliminate 0.32% : 0.000002s : 13: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004156 38 64.98% : 0.002701s : 21: func_graph_cloner_run.FuncGraphClonerGraph 35.02% : 0.001456s : 17: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.785780 91 0.00% : 0.000135s : 1: add_recomputation 0.01% : 0.000311s : 1: auto_monad 0.00% : 0.000050s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.07% : 0.001950s : 1: bootstrap 0.00% : 0.000038s : 1: cconv 0.00% : 0.000019s : 1: convert_after_rewriter 0.00% : 0.000060s : 1: cse_after_recomputation 0.00% : 0.000027s : 1: environ_conv 0.01% : 0.000162s : 1: event_method 0.00% : 0.000015s : 1: execute 0.00% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 3.92% : 0.109287s : 1: jit_opt_a 0.01% : 0.000350s : 1: jit_opt_after_cconv 0.00% : 0.000139s : 1: jit_opt_b 0.02% : 0.000537s : 1: loop_unroll 0.03% : 0.000882s : 1: mutable_eliminate 0.20% : 0.005544s : 39: opt.transform.jit_opt_a 0.01% : 0.000165s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000105s : 4: opt.transform.jit_opt_b 0.00% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000029s : 1: opt.transform.mutable_eliminate 0.00% : 0.000059s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000089s : 4: opt.transform.symbol_engine_opt 0.02% : 0.000653s : 1: opt_after_jit_grad 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000015s : 1: pre_auto_parallel 0.00% : 0.000066s : 1: py_interpret_to_execute 0.00% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000092s : 1: remove_dup_value 2.28% : 0.063431s : 2: renormalize.infer 0.14% : 0.003868s : 2: renormalize.specialize 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000243s : 1: rewriter_after_opt_a 0.01% : 0.000200s : 1: rewriter_before_opt_a 0.01% : 0.000165s : 1: symbol_engine_optimizer 85.41% : 2.379297s : 1: task_emit 7.81% : 0.217487s : 1: type_inference 0.01% : 0.000206s : 1: validate TotalTime = 2.7539, [33] [bootstrap]: 0.00144865 [type_inference]: 0.319987 [event_method]: 0.00027067 [auto_monad]: 0.00033394 [graph_reusing]: 7.66999e-06 [pre_auto_parallel]: 1.424e-05 [py_interpret_to_execute]: 6.571e-05 [rewriter_before_opt_a]: 0.00022564 [expand_dump_flag]: 2.59001e-06 [jit_opt_a]: 0.27574, [3] [Cycle 1]: 0.200709, [27] [switch_simplify]: 0.00018935 [loop_unroll]: 6.524e-05 [a_1]: 0.00169448 [with_stream_mark]: 4.926e-05 [recompute_prepare]: 4.153e-05 [updatestate_depend_eliminate]: 2.653e-05 [updatestate_assign_eliminate]: 1.962e-05 [updatestate_loads_eliminate]: 9.66e-06 [parameter_eliminate]: 3.05002e-06 [specialize_transform]: 2.372e-05 [updatestate_useless_node_eliminater]: 2.095e-05 [accelerated_algorithm]: 8.709e-05 [meta_shard_fg_expand]: 5.65001e-06 [get_grad_eliminate_]: 2.312e-05 [merge_forward]: 1.401e-05 [cell_reuse_recompute_pass]: 1.15999e-06 [cell_reuse_handle_not_recompute_node_pass]: 5.067e-05 [j_node_and_user_rematch]: 4.889e-05 [meta_fg_expand]: 0.0671081 [replace_old_param]: 0.00011098 [inline_without_move]: 8.145e-05 [renormalize]: 0.129953 [add_forward_monad_depend]: 0.00015642 [auto_monad_grad]: 1.049e-05 [auto_monad_eliminator]: 0.00010065 [cse]: 0.00030691 [replace_applicator]: 0.00012252 [Cycle 2]: 0.0050299, [27] [switch_simplify]: 6.26e-05 [loop_unroll]: 6.218e-05 [a_1]: 0.00211858 [with_stream_mark]: 3.71e-05 [recompute_prepare]: 2.315e-05 [updatestate_depend_eliminate]: 1.062e-05 [updatestate_assign_eliminate]: 8.42e-06 [updatestate_loads_eliminate]: 8.02003e-06 [parameter_eliminate]: 3.23e-06 [specialize_transform]: 1.651e-05 [updatestate_useless_node_eliminater]: 1.502e-05 [accelerated_algorithm]: 2.249e-05 [meta_shard_fg_expand]: 4.62998e-06 [get_grad_eliminate_]: 1.535e-05 [merge_forward]: 9.94001e-06 [cell_reuse_recompute_pass]: 1.47001e-06 [cell_reuse_handle_not_recompute_node_pass]: 4.613e-05 [j_node_and_user_rematch]: 2.587e-05 [meta_fg_expand]: 0.00013276 [replace_old_param]: 2.627e-05 [inline_without_move]: 1.535e-05 [renormalize]: 0.00192707 [add_forward_monad_depend]: 9.31e-06 [auto_monad_grad]: 3.18e-06 [auto_monad_eliminator]: 3.083e-05 [cse]: 0.00015178 [replace_applicator]: 3.053e-05 [Cycle 3]: 0.00094878, [27] [switch_simplify]: 1.735e-05 [loop_unroll]: 1.487e-05 [a_1]: 0.00044625 [with_stream_mark]: 2.229e-05 [recompute_prepare]: 1.673e-05 [updatestate_depend_eliminate]: 9.56e-06 [updatestate_assign_eliminate]: 7.98999e-06 [updatestate_loads_eliminate]: 7.31001e-06 [parameter_eliminate]: 2.24001e-06 [specialize_transform]: 1.523e-05 [updatestate_useless_node_eliminater]: 1.502e-05 [accelerated_algorithm]: 2.121e-05 [meta_shard_fg_expand]: 4.12e-06 [get_grad_eliminate_]: 1.44e-05 [merge_forward]: 9.82999e-06 [cell_reuse_recompute_pass]: 2.81e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.89e-05 [j_node_and_user_rematch]: 2.451e-05 [meta_fg_expand]: 6.15002e-06 [replace_old_param]: 2.112e-05 [inline_without_move]: 1.46e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.56e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 1.874e-05 [cse]: 5.401e-05 [replace_applicator]: 1.638e-05 [py_interpret_to_execute_after_opt_a]: 2.71e-05 [rewriter_after_opt_a]: 0.00021012 [convert_after_rewriter]: 1.671e-05 [order_py_execute_after_rewriter]: 1.012e-05 [mutable_eliminate]: 0.0009256 [jit_opt_b]: 0.00015399, [1] [Cycle 1]: 0.00014447, [2] [frontend_op_eliminate]: 5.531e-05 [inline_after_opt_a]: 7.256e-05 [cconv]: 3.618e-05 [loop_unroll]: 0.00051102 [jit_opt_after_cconv]: 0.00036207, [1] [Cycle 1]: 0.00035466, [11] [c_1]: 7.628e-05 [parameter_eliminate]: 3.88999e-06 [updatestate_depend_eliminate]: 1.649e-05 [updatestate_assign_eliminate]: 1.031e-05 [updatestate_loads_eliminate]: 9.47001e-06 [cse]: 8.488e-05 [call_graph_tuple_transform]: 4.619e-05 [tuple_list_get_item_eliminator]: 3.074e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 6.59988e-07 [switch_simplify]: 1.639e-05 [remove_dup_value]: 0.00010832 [partial_unused_args_eliminate]: 3.20998e-06 [environ_conv]: 2.373e-05 [add_recomputation]: 0.00013007 [cse_after_recomputation]: 5.862e-05, [1] [Cycle 1]: 5.204e-05, [1] [cse]: 4.394e-05 [auto_monad_reorder]: 4.493e-05 [get_jit_bprop_graph]: 2.12001e-06 [rewriter_after_jit_bprop_graph]: 4.53999e-06 [opt_after_jit_grad]: 0.00059055 [symbol_engine_optimizer]: 0.00015844, [1] [Cycle 1]: 0.00015193, [6] [build]: 2.851e-05 [elim_shapecalc]: 1.948e-05 [elim_not_effective]: 3.138e-05 [opt_reshape]: 1.647e-05 [fold_const_symbol]: 2.654e-05 [renormalize]: 6.69999e-07 [validate]: 0.00012927 [backend_pass]: 1.34998e-06 [task_emit]: 2.15192 [execute]: 8.72e-06 Sums bootstrap : 0.001449s : 0.05% type_inference : 0.319987s : 11.92% event_method : 0.000271s : 0.01% auto_monad : 0.000334s : 0.01% graph_reusing : 0.000008s : 0.00% pre_auto_parallel : 0.000014s : 0.00% py_interpret_to_execute : 0.000066s : 0.00% rewriter_before_opt_a : 0.000226s : 0.01% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000269s : 0.01% jit_opt_a.loop_unroll : 0.000142s : 0.01% jit_opt_a.a_1 : 0.004259s : 0.16% jit_opt_a.with_stream_mark : 0.000109s : 0.00% jit_opt_a.recompute_prepare : 0.000081s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000047s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000036s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000025s : 0.00% jit_opt_a.parameter_eliminate : 0.000009s : 0.00% jit_opt_a.specialize_transform : 0.000055s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000051s : 0.00% jit_opt_a.accelerated_algorithm : 0.000131s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000014s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000053s : 0.00% jit_opt_a.merge_forward : 0.000034s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000126s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000099s : 0.00% jit_opt_a.meta_fg_expand : 0.067247s : 2.51% jit_opt_a.replace_old_param : 0.000158s : 0.01% jit_opt_a.inline_without_move : 0.000111s : 0.00% jit_opt_a.renormalize : 0.131881s : 4.91% jit_opt_a.add_forward_monad_depend : 0.000168s : 0.01% jit_opt_a.auto_monad_grad : 0.000015s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000150s : 0.01% jit_opt_a.cse : 0.000513s : 0.02% jit_opt_a.replace_applicator : 0.000169s : 0.01% py_interpret_to_execute_after_opt_a : 0.000027s : 0.00% rewriter_after_opt_a : 0.000210s : 0.01% convert_after_rewriter : 0.000017s : 0.00% order_py_execute_after_rewriter : 0.000010s : 0.00% mutable_eliminate : 0.000926s : 0.03% jit_opt_b.frontend_op_eliminate : 0.000055s : 0.00% jit_opt_b.inline_after_opt_a : 0.000073s : 0.00% cconv : 0.000036s : 0.00% loop_unroll : 0.000511s : 0.02% jit_opt_after_cconv.c_1 : 0.000076s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000016s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000010s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000009s : 0.00% jit_opt_after_cconv.cse : 0.000085s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000046s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000031s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000016s : 0.00% remove_dup_value : 0.000108s : 0.00% partial_unused_args_eliminate : 0.000003s : 0.00% environ_conv : 0.000024s : 0.00% add_recomputation : 0.000130s : 0.00% cse_after_recomputation.cse : 0.000044s : 0.00% auto_monad_reorder : 0.000045s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000591s : 0.02% symbol_engine_optimizer.build : 0.000029s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000019s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000031s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000016s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000027s : 0.00% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000129s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.151918s : 80.19% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.001294 278 0.35% : 0.000005s : 9: substitution.elim_not_effective 0.31% : 0.000004s : 9: substitution.fold_const_symbol 1.02% : 0.000013s : 13: substitution.graph_param_transform 55.79% : 0.000722s : 13: substitution.inline 1.87% : 0.000024s : 2: substitution.inline_without_move 2.16% : 0.000028s : 29: substitution.j_node_and_user_rematch 4.89% : 0.000063s : 3: substitution.less_batch_normalization 4.19% : 0.000054s : 25: substitution.minmaximum_grad 2.50% : 0.000032s : 5: substitution.partial_eliminate 2.54% : 0.000033s : 29: substitution.remove_not_recompute_node 3.31% : 0.000043s : 10: substitution.replace_applicator 1.43% : 0.000018s : 26: substitution.replace_old_param 0.42% : 0.000005s : 1: substitution.set_cell_output_no_recompute 5.02% : 0.000065s : 25: substitution.tuple_list_convert_item_index_to_positive 3.31% : 0.000043s : 25: substitution.tuple_list_get_item_depend_reorder 10.88% : 0.000141s : 54: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.319783 2 98.49% : 0.314951s : 1: type_inference.infer 1.51% : 0.004832s : 1: type_inference.specialize ------[replace.] 0.000321 32 54.02% : 0.000173s : 13: replace.inline 45.98% : 0.000147s : 19: replace.tuple_list_get_item_eliminator ------[match.] 0.000765 32 92.96% : 0.000711s : 13: match.inline 7.04% : 0.000054s : 19: match.tuple_list_get_item_eliminator ------[predicate.] 0.000723 5185 1.58% : 0.000011s : 83: predicate.accumulaten_eliminater 0.42% : 0.000003s : 13: predicate.ad_related_special_op_eliminate 1.38% : 0.000010s : 83: predicate.addn_check_dump 1.48% : 0.000011s : 83: predicate.addn_zero_filter 2.18% : 0.000016s : 83: predicate.arithmetic_simplify 1.44% : 0.000010s : 83: predicate.cast_eliminate 0.25% : 0.000002s : 13: predicate.check_bprop_eliminate 1.37% : 0.000010s : 83: predicate.compare_switch_simplify 1.38% : 0.000010s : 83: predicate.depend_value_elim 1.33% : 0.000010s : 83: predicate.dict_get_item_const_eliminator 1.40% : 0.000010s : 83: predicate.dict_get_item_eliminator 1.43% : 0.000010s : 83: predicate.dict_set_item_eliminator 0.35% : 0.000003s : 13: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 13: predicate.elim_not_effective 0.28% : 0.000002s : 13: predicate.elim_shapecalc_of_broadcastargs 1.42% : 0.000010s : 83: predicate.environ_add_const_eliminate 1.33% : 0.000010s : 83: predicate.environ_get_add_eliminate 1.33% : 0.000010s : 83: predicate.environ_get_depend_swap 1.37% : 0.000010s : 83: predicate.environ_get_eliminate 1.36% : 0.000010s : 83: predicate.environ_get_set_eliminate 0.14% : 0.000001s : 13: predicate.fold_const_symbol 0.97% : 0.000007s : 47: predicate.get_grad_eliminate 0.16% : 0.000001s : 13: predicate.graph_param_transform 4.55% : 0.000033s : 141: predicate.inline 1.71% : 0.000012s : 80: predicate.inline_without_move 0.48% : 0.000003s : 47: predicate.j_node_and_user_rematch 1.34% : 0.000010s : 47: predicate.less_batch_normalization 1.84% : 0.000013s : 102: predicate.list_to_tuple_eliminator_ 2.25% : 0.000016s : 115: predicate.load_eliminater 0.60% : 0.000004s : 13: predicate.loop_unroll_after_grad 2.96% : 0.000021s : 151: predicate.loop_unroll_before_grad 1.76% : 0.000013s : 96: predicate.make_slice_get_slice_eliminator 1.33% : 0.000010s : 83: predicate.merge_addn 1.42% : 0.000010s : 83: predicate.minmaximum_grad 0.69% : 0.000005s : 13: predicate.mutable_eliminate 0.28% : 0.000002s : 13: predicate.opt_reshape 2.54% : 0.000018s : 115: predicate.partial_eliminate 1.35% : 0.000010s : 83: predicate.print_const_string_wrapper 1.98% : 0.000014s : 83: predicate.reduce_eliminate 1.81% : 0.000013s : 102: predicate.redundant_stop_gradient_eliminater 0.55% : 0.000004s : 47: predicate.remove_not_recompute_node 2.73% : 0.000020s : 194: predicate.replace_applicator 0.98% : 0.000007s : 80: predicate.replace_old_param 0.17% : 0.000001s : 13: predicate.reset_defer_inline 1.47% : 0.000011s : 83: predicate.reshape_eliminate 1.41% : 0.000010s : 83: predicate.row_tensor_add_zeros_like 0.35% : 0.000002s : 13: predicate.row_tensor_eliminate 1.44% : 0.000010s : 83: predicate.same_eliminate 0.66% : 0.000005s : 47: predicate.set_cell_output_no_recompute 0.51% : 0.000004s : 26: predicate.special_op_eliminate 1.04% : 0.000008s : 47: predicate.specialize_transform 1.61% : 0.000012s : 83: predicate.split_environ_get_set_with_tuple_value 1.49% : 0.000011s : 83: predicate.stack_unstack_eliminate 0.29% : 0.000002s : 13: predicate.switch_call_monad_eliminater 2.70% : 0.000020s : 115: predicate.switch_defer_inline 2.25% : 0.000016s : 115: predicate.switch_layer_defer_inline 5.81% : 0.000042s : 279: predicate.switch_simplify 1.39% : 0.000010s : 83: predicate.tile_eliminate 1.40% : 0.000010s : 83: predicate.transpose_eliminate 1.75% : 0.000013s : 83: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000012s : 83: predicate.tuple_list_get_item_depend_reorder 3.79% : 0.000027s : 128: predicate.tuple_list_get_item_eliminator 1.85% : 0.000013s : 83: predicate.tuple_list_set_item_eliminator 1.76% : 0.000013s : 102: predicate.tuple_to_list_eliminator_ 2.00% : 0.000014s : 115: predicate.updatestate_pure_node_eliminater 3.12% : 0.000023s : 162: predicate.updatestate_useless_node_eliminater 1.82% : 0.000013s : 83: predicate.value_based_eliminate 0.25% : 0.000002s : 13: predicate.virtual_view_grad_eliminate 0.33% : 0.000002s : 13: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.047949 38 97.46% : 0.046732s : 21: func_graph_cloner_run.FuncGraphClonerGraph 2.54% : 0.001217s : 17: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.891712 91 0.00% : 0.000134s : 1: add_recomputation 0.01% : 0.000343s : 1: auto_monad 0.00% : 0.000048s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.05% : 0.001472s : 1: bootstrap 0.00% : 0.000039s : 1: cconv 0.00% : 0.000019s : 1: convert_after_rewriter 0.00% : 0.000061s : 1: cse_after_recomputation 0.00% : 0.000026s : 1: environ_conv 0.01% : 0.000277s : 1: event_method 0.00% : 0.000014s : 1: execute 0.00% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 9.54% : 0.275743s : 1: jit_opt_a 0.01% : 0.000365s : 1: jit_opt_after_cconv 0.01% : 0.000158s : 1: jit_opt_b 0.02% : 0.000518s : 1: loop_unroll 0.03% : 0.000938s : 1: mutable_eliminate 0.19% : 0.005606s : 39: opt.transform.jit_opt_a 0.01% : 0.000166s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000118s : 4: opt.transform.jit_opt_b 0.00% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000037s : 1: opt.transform.mutable_eliminate 0.00% : 0.000054s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000091s : 4: opt.transform.symbol_engine_opt 0.02% : 0.000597s : 1: opt_after_jit_grad 0.00% : 0.000013s : 1: order_py_execute_after_rewriter 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000016s : 1: pre_auto_parallel 0.00% : 0.000070s : 1: py_interpret_to_execute 0.00% : 0.000030s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000113s : 1: remove_dup_value 0.56% : 0.016086s : 2: renormalize.infer 4.00% : 0.115765s : 2: renormalize.specialize 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000216s : 1: rewriter_after_opt_a 0.01% : 0.000229s : 1: rewriter_before_opt_a 0.01% : 0.000161s : 1: symbol_engine_optimizer 74.42% : 2.151942s : 1: task_emit 11.07% : 0.320004s : 1: type_inference 0.01% : 0.000181s : 1: validate TotalTime = 2.90314, [33] [bootstrap]: 0.00133558 [type_inference]: 0.657448 [event_method]: 0.00021779 [auto_monad]: 0.00037774 [graph_reusing]: 8.87e-06 [pre_auto_parallel]: 1.246e-05 [py_interpret_to_execute]: 6.714e-05 [rewriter_before_opt_a]: 0.00023848 [expand_dump_flag]: 3.97e-06 [jit_opt_a]: 0.196411, [3] [Cycle 1]: 0.121743, [27] [switch_simplify]: 0.00016314 [loop_unroll]: 6.646e-05 [a_1]: 0.00164291 [with_stream_mark]: 3.983e-05 [recompute_prepare]: 3.538e-05 [updatestate_depend_eliminate]: 2.349e-05 [updatestate_assign_eliminate]: 1.797e-05 [updatestate_loads_eliminate]: 9.56e-06 [parameter_eliminate]: 3.87002e-06 [specialize_transform]: 2.319e-05 [updatestate_useless_node_eliminater]: 1.994e-05 [accelerated_algorithm]: 6.121e-05 [meta_shard_fg_expand]: 4.97999e-06 [get_grad_eliminate_]: 2.185e-05 [merge_forward]: 1.245e-05 [cell_reuse_recompute_pass]: 1.10001e-06 [cell_reuse_handle_not_recompute_node_pass]: 4.357e-05 [j_node_and_user_rematch]: 4.43e-05 [meta_fg_expand]: 0.00261324 [replace_old_param]: 0.0001059 [inline_without_move]: 8.206e-05 [renormalize]: 0.115803 [add_forward_monad_depend]: 3.547e-05 [auto_monad_grad]: 9.47999e-06 [auto_monad_eliminator]: 0.00012405 [cse]: 0.00028618 [replace_applicator]: 0.00011567 [Cycle 2]: 0.0626452, [27] [switch_simplify]: 6.188e-05 [loop_unroll]: 6.335e-05 [a_1]: 0.00226314 [with_stream_mark]: 3.17e-05 [recompute_prepare]: 2.063e-05 [updatestate_depend_eliminate]: 1.032e-05 [updatestate_assign_eliminate]: 9.15001e-06 [updatestate_loads_eliminate]: 7.65998e-06 [parameter_eliminate]: 3.16001e-06 [specialize_transform]: 1.697e-05 [updatestate_useless_node_eliminater]: 1.473e-05 [accelerated_algorithm]: 2.23e-05 [meta_shard_fg_expand]: 3.93001e-06 [get_grad_eliminate_]: 1.474e-05 [merge_forward]: 9.17001e-06 [cell_reuse_recompute_pass]: 1.13001e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.409e-05 [j_node_and_user_rematch]: 2.68e-05 [meta_fg_expand]: 0.00012012 [replace_old_param]: 2.555e-05 [inline_without_move]: 1.501e-05 [renormalize]: 0.0594093 [add_forward_monad_depend]: 1.423e-05 [auto_monad_grad]: 2.61e-06 [auto_monad_eliminator]: 4.126e-05 [cse]: 0.00015111 [replace_applicator]: 4.092e-05 [Cycle 3]: 0.00100458, [27] [switch_simplify]: 1.731e-05 [loop_unroll]: 1.629e-05 [a_1]: 0.00050507 [with_stream_mark]: 2.562e-05 [recompute_prepare]: 1.689e-05 [updatestate_depend_eliminate]: 9.57001e-06 [updatestate_assign_eliminate]: 8.35999e-06 [updatestate_loads_eliminate]: 7.85998e-06 [parameter_eliminate]: 2.43e-06 [specialize_transform]: 1.483e-05 [updatestate_useless_node_eliminater]: 1.428e-05 [accelerated_algorithm]: 2.125e-05 [meta_shard_fg_expand]: 3.94002e-06 [get_grad_eliminate_]: 1.489e-05 [merge_forward]: 1.022e-05 [cell_reuse_recompute_pass]: 3.55e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.16e-05 [j_node_and_user_rematch]: 2.48e-05 [meta_fg_expand]: 6.00002e-06 [replace_old_param]: 2.237e-05 [inline_without_move]: 1.554e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.52001e-06 [auto_monad_grad]: 1.35999e-06 [auto_monad_eliminator]: 1.987e-05 [cse]: 4.969e-05 [replace_applicator]: 1.444e-05 [py_interpret_to_execute_after_opt_a]: 3.335e-05 [rewriter_after_opt_a]: 0.00137906 [convert_after_rewriter]: 3.671e-05 [order_py_execute_after_rewriter]: 1.046e-05 [mutable_eliminate]: 0.00099341 [jit_opt_b]: 0.00016254, [1] [Cycle 1]: 0.00015296, [2] [frontend_op_eliminate]: 5.717e-05 [inline_after_opt_a]: 7.969e-05 [cconv]: 4.294e-05 [loop_unroll]: 0.00061089 [jit_opt_after_cconv]: 0.00044577, [1] [Cycle 1]: 0.00043741, [11] [c_1]: 0.00011347 [parameter_eliminate]: 6.99001e-06 [updatestate_depend_eliminate]: 2.04e-05 [updatestate_assign_eliminate]: 1.119e-05 [updatestate_loads_eliminate]: 1.139e-05 [cse]: 0.00011118 [call_graph_tuple_transform]: 5.194e-05 [tuple_list_get_item_eliminator]: 3.108e-05 [none_parameter_eliminate]: 1.99e-06 [renormalize]: 1.08001e-06 [switch_simplify]: 1.784e-05 [remove_dup_value]: 0.00011946 [partial_unused_args_eliminate]: 3.00002e-06 [environ_conv]: 2.599e-05 [add_recomputation]: 0.00014253 [cse_after_recomputation]: 5.991e-05, [1] [Cycle 1]: 5.252e-05, [1] [cse]: 4.424e-05 [auto_monad_reorder]: 4.611e-05 [get_jit_bprop_graph]: 2.21e-06 [rewriter_after_jit_bprop_graph]: 6.36998e-06 [opt_after_jit_grad]: 0.00064541 [symbol_engine_optimizer]: 0.00016705, [1] [Cycle 1]: 0.00015967, [6] [build]: 3.091e-05 [elim_shapecalc]: 2.045e-05 [elim_not_effective]: 3.212e-05 [opt_reshape]: 1.727e-05 [fold_const_symbol]: 2.586e-05 [renormalize]: 6.69999e-07 [validate]: 0.00011506 [backend_pass]: 1.47999e-06 [task_emit]: 2.04149 [execute]: 8.58001e-06 Sums bootstrap : 0.001336s : 0.05% type_inference : 0.657448s : 22.74% event_method : 0.000218s : 0.01% auto_monad : 0.000378s : 0.01% graph_reusing : 0.000009s : 0.00% pre_auto_parallel : 0.000012s : 0.00% py_interpret_to_execute : 0.000067s : 0.00% rewriter_before_opt_a : 0.000238s : 0.01% expand_dump_flag : 0.000004s : 0.00% jit_opt_a.switch_simplify : 0.000242s : 0.01% jit_opt_a.loop_unroll : 0.000146s : 0.01% jit_opt_a.a_1 : 0.004411s : 0.15% jit_opt_a.with_stream_mark : 0.000097s : 0.00% jit_opt_a.recompute_prepare : 0.000073s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000043s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000035s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000025s : 0.00% jit_opt_a.parameter_eliminate : 0.000009s : 0.00% jit_opt_a.specialize_transform : 0.000055s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000049s : 0.00% jit_opt_a.accelerated_algorithm : 0.000105s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000013s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000051s : 0.00% jit_opt_a.merge_forward : 0.000032s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000109s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000096s : 0.00% jit_opt_a.meta_fg_expand : 0.002739s : 0.09% jit_opt_a.replace_old_param : 0.000154s : 0.01% jit_opt_a.inline_without_move : 0.000113s : 0.00% jit_opt_a.renormalize : 0.175212s : 6.06% jit_opt_a.add_forward_monad_depend : 0.000052s : 0.00% jit_opt_a.auto_monad_grad : 0.000013s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000185s : 0.01% jit_opt_a.cse : 0.000487s : 0.02% jit_opt_a.replace_applicator : 0.000171s : 0.01% py_interpret_to_execute_after_opt_a : 0.000033s : 0.00% rewriter_after_opt_a : 0.001379s : 0.05% convert_after_rewriter : 0.000037s : 0.00% order_py_execute_after_rewriter : 0.000010s : 0.00% mutable_eliminate : 0.000993s : 0.03% jit_opt_b.frontend_op_eliminate : 0.000057s : 0.00% jit_opt_b.inline_after_opt_a : 0.000080s : 0.00% cconv : 0.000043s : 0.00% loop_unroll : 0.000611s : 0.02% jit_opt_after_cconv.c_1 : 0.000113s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000007s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000020s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000011s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000011s : 0.00% jit_opt_after_cconv.cse : 0.000111s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000052s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000031s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000018s : 0.00% remove_dup_value : 0.000119s : 0.00% partial_unused_args_eliminate : 0.000003s : 0.00% environ_conv : 0.000026s : 0.00% add_recomputation : 0.000143s : 0.00% cse_after_recomputation.cse : 0.000044s : 0.00% auto_monad_reorder : 0.000046s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000645s : 0.02% symbol_engine_optimizer.build : 0.000031s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000020s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000032s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000017s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000026s : 0.00% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000115s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.041494s : 70.62% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.001236 278 0.34% : 0.000004s : 9: substitution.elim_not_effective 0.32% : 0.000004s : 9: substitution.fold_const_symbol 1.05% : 0.000013s : 13: substitution.graph_param_transform 49.90% : 0.000617s : 13: substitution.inline 2.06% : 0.000026s : 2: substitution.inline_without_move 2.19% : 0.000027s : 29: substitution.j_node_and_user_rematch 3.42% : 0.000042s : 3: substitution.less_batch_normalization 11.87% : 0.000147s : 25: substitution.minmaximum_grad 2.17% : 0.000027s : 5: substitution.partial_eliminate 1.77% : 0.000022s : 29: substitution.remove_not_recompute_node 2.98% : 0.000037s : 10: substitution.replace_applicator 1.63% : 0.000020s : 26: substitution.replace_old_param 0.36% : 0.000004s : 1: substitution.set_cell_output_no_recompute 5.22% : 0.000064s : 25: substitution.tuple_list_convert_item_index_to_positive 3.45% : 0.000043s : 25: substitution.tuple_list_get_item_depend_reorder 11.26% : 0.000139s : 54: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.657239 2 89.29% : 0.586843s : 1: type_inference.infer 10.71% : 0.070396s : 1: type_inference.specialize ------[replace.] 0.000272 32 51.40% : 0.000140s : 13: replace.inline 48.60% : 0.000132s : 19: replace.tuple_list_get_item_eliminator ------[match.] 0.000657 32 92.42% : 0.000607s : 13: match.inline 7.58% : 0.000050s : 19: match.tuple_list_get_item_eliminator ------[predicate.] 0.000885 5185 1.23% : 0.000011s : 83: predicate.accumulaten_eliminater 0.47% : 0.000004s : 13: predicate.ad_related_special_op_eliminate 1.06% : 0.000009s : 83: predicate.addn_check_dump 2.23% : 0.000020s : 83: predicate.addn_zero_filter 2.64% : 0.000023s : 83: predicate.arithmetic_simplify 2.71% : 0.000024s : 83: predicate.cast_eliminate 0.21% : 0.000002s : 13: predicate.check_bprop_eliminate 1.05% : 0.000009s : 83: predicate.compare_switch_simplify 1.10% : 0.000010s : 83: predicate.depend_value_elim 1.09% : 0.000010s : 83: predicate.dict_get_item_const_eliminator 1.39% : 0.000012s : 83: predicate.dict_get_item_eliminator 1.09% : 0.000010s : 83: predicate.dict_set_item_eliminator 0.28% : 0.000002s : 13: predicate.dumpgradient_eliminate 0.17% : 0.000001s : 13: predicate.elim_not_effective 0.28% : 0.000002s : 13: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000010s : 83: predicate.environ_add_const_eliminate 1.08% : 0.000010s : 83: predicate.environ_get_add_eliminate 1.08% : 0.000010s : 83: predicate.environ_get_depend_swap 1.11% : 0.000010s : 83: predicate.environ_get_eliminate 1.10% : 0.000010s : 83: predicate.environ_get_set_eliminate 0.11% : 0.000001s : 13: predicate.fold_const_symbol 0.82% : 0.000007s : 47: predicate.get_grad_eliminate 0.12% : 0.000001s : 13: predicate.graph_param_transform 3.58% : 0.000032s : 141: predicate.inline 1.41% : 0.000012s : 80: predicate.inline_without_move 0.37% : 0.000003s : 47: predicate.j_node_and_user_rematch 1.13% : 0.000010s : 47: predicate.less_batch_normalization 3.54% : 0.000031s : 102: predicate.list_to_tuple_eliminator_ 1.65% : 0.000015s : 115: predicate.load_eliminater 0.53% : 0.000005s : 13: predicate.loop_unroll_after_grad 2.53% : 0.000022s : 151: predicate.loop_unroll_before_grad 1.36% : 0.000012s : 96: predicate.make_slice_get_slice_eliminator 1.08% : 0.000010s : 83: predicate.merge_addn 1.15% : 0.000010s : 83: predicate.minmaximum_grad 0.55% : 0.000005s : 13: predicate.mutable_eliminate 0.22% : 0.000002s : 13: predicate.opt_reshape 2.04% : 0.000018s : 115: predicate.partial_eliminate 1.12% : 0.000010s : 83: predicate.print_const_string_wrapper 8.37% : 0.000074s : 83: predicate.reduce_eliminate 1.48% : 0.000013s : 102: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000004s : 47: predicate.remove_not_recompute_node 2.30% : 0.000020s : 194: predicate.replace_applicator 0.84% : 0.000007s : 80: predicate.replace_old_param 0.16% : 0.000001s : 13: predicate.reset_defer_inline 2.30% : 0.000020s : 83: predicate.reshape_eliminate 1.26% : 0.000011s : 83: predicate.row_tensor_add_zeros_like 0.32% : 0.000003s : 13: predicate.row_tensor_eliminate 1.21% : 0.000011s : 83: predicate.same_eliminate 0.57% : 0.000005s : 47: predicate.set_cell_output_no_recompute 0.49% : 0.000004s : 26: predicate.special_op_eliminate 0.75% : 0.000007s : 47: predicate.specialize_transform 1.35% : 0.000012s : 83: predicate.split_environ_get_set_with_tuple_value 1.13% : 0.000010s : 83: predicate.stack_unstack_eliminate 0.24% : 0.000002s : 13: predicate.switch_call_monad_eliminater 2.05% : 0.000018s : 115: predicate.switch_defer_inline 1.88% : 0.000017s : 115: predicate.switch_layer_defer_inline 4.48% : 0.000040s : 279: predicate.switch_simplify 1.23% : 0.000011s : 83: predicate.tile_eliminate 2.21% : 0.000020s : 83: predicate.transpose_eliminate 1.49% : 0.000013s : 83: predicate.tuple_list_convert_item_index_to_positive 1.36% : 0.000012s : 83: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000027s : 128: predicate.tuple_list_get_item_eliminator 1.53% : 0.000014s : 83: predicate.tuple_list_set_item_eliminator 3.17% : 0.000028s : 102: predicate.tuple_to_list_eliminator_ 1.66% : 0.000015s : 115: predicate.updatestate_pure_node_eliminater 2.42% : 0.000021s : 162: predicate.updatestate_useless_node_eliminater 3.97% : 0.000035s : 83: predicate.value_based_eliminate 0.19% : 0.000002s : 13: predicate.virtual_view_grad_eliminate 0.25% : 0.000002s : 13: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.061567 38 4.68% : 0.002884s : 21: func_graph_cloner_run.FuncGraphClonerGraph 95.32% : 0.058683s : 17: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.084377 91 0.00% : 0.000146s : 1: add_recomputation 0.01% : 0.000389s : 1: auto_monad 0.00% : 0.000049s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.04% : 0.001371s : 1: bootstrap 0.00% : 0.000046s : 1: cconv 0.00% : 0.000043s : 1: convert_after_rewriter 0.00% : 0.000062s : 1: cse_after_recomputation 0.00% : 0.000029s : 1: environ_conv 0.01% : 0.000227s : 1: event_method 0.00% : 0.000014s : 1: execute 0.00% : 0.000006s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 6.37% : 0.196414s : 1: jit_opt_a 0.01% : 0.000449s : 1: jit_opt_after_cconv 0.01% : 0.000166s : 1: jit_opt_b 0.02% : 0.000622s : 1: loop_unroll 0.03% : 0.001008s : 1: mutable_eliminate 0.18% : 0.005691s : 39: opt.transform.jit_opt_a 0.01% : 0.000210s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000127s : 4: opt.transform.jit_opt_b 0.00% : 0.000027s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000038s : 1: opt.transform.mutable_eliminate 0.00% : 0.000059s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000091s : 4: opt.transform.symbol_engine_opt 0.02% : 0.000656s : 1: opt_after_jit_grad 0.00% : 0.000013s : 1: order_py_execute_after_rewriter 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000015s : 1: pre_auto_parallel 0.00% : 0.000072s : 1: py_interpret_to_execute 0.00% : 0.000036s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000124s : 1: remove_dup_value 3.70% : 0.114091s : 2: renormalize.infer 1.98% : 0.061066s : 2: renormalize.specialize 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.001405s : 1: rewriter_after_opt_a 0.01% : 0.000244s : 1: rewriter_before_opt_a 0.01% : 0.000170s : 1: symbol_engine_optimizer 66.19% : 2.041515s : 1: task_emit 21.32% : 0.657479s : 1: type_inference 0.01% : 0.000172s : 1: validate TotalTime = 2.72168, [33] [bootstrap]: 0.00135001 [type_inference]: 0.521725 [event_method]: 0.00015227 [auto_monad]: 0.00025266 [graph_reusing]: 7.18998e-06 [pre_auto_parallel]: 1.214e-05 [py_interpret_to_execute]: 5.79e-05 [rewriter_before_opt_a]: 0.00018868 [expand_dump_flag]: 3.33e-06 [jit_opt_a]: 0.0782432, [3] [Cycle 1]: 0.0472474, [27] [switch_simplify]: 0.00015912 [loop_unroll]: 6.432e-05 [a_1]: 0.00152262 [with_stream_mark]: 4.61e-05 [recompute_prepare]: 3.126e-05 [updatestate_depend_eliminate]: 2.131e-05 [updatestate_assign_eliminate]: 1.759e-05 [updatestate_loads_eliminate]: 9.61003e-06 [parameter_eliminate]: 2.94001e-06 [specialize_transform]: 2.286e-05 [updatestate_useless_node_eliminater]: 2.129e-05 [accelerated_algorithm]: 6.231e-05 [meta_shard_fg_expand]: 4.74e-06 [get_grad_eliminate_]: 2.059e-05 [merge_forward]: 1.153e-05 [cell_reuse_recompute_pass]: 1.70001e-06 [cell_reuse_handle_not_recompute_node_pass]: 4.297e-05 [j_node_and_user_rematch]: 4.287e-05 [meta_fg_expand]: 0.00251819 [replace_old_param]: 9.805e-05 [inline_without_move]: 8.377e-05 [renormalize]: 0.0415597 [add_forward_monad_depend]: 3.723e-05 [auto_monad_grad]: 7.78001e-06 [auto_monad_eliminator]: 8.083e-05 [cse]: 0.00028643 [replace_applicator]: 0.00011197 [Cycle 2]: 0.00473516, [27] [switch_simplify]: 6.365e-05 [loop_unroll]: 6.153e-05 [a_1]: 0.00210816 [with_stream_mark]: 3.139e-05 [recompute_prepare]: 2.316e-05 [updatestate_depend_eliminate]: 9.46e-06 [updatestate_assign_eliminate]: 8.34998e-06 [updatestate_loads_eliminate]: 7.86001e-06 [parameter_eliminate]: 3.41999e-06 [specialize_transform]: 1.606e-05 [updatestate_useless_node_eliminater]: 1.497e-05 [accelerated_algorithm]: 2.1e-05 [meta_shard_fg_expand]: 4.3e-06 [get_grad_eliminate_]: 1.433e-05 [merge_forward]: 1.01e-05 [cell_reuse_recompute_pass]: 1.66002e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.491e-05 [j_node_and_user_rematch]: 2.588e-05 [meta_fg_expand]: 0.00010274 [replace_old_param]: 2.608e-05 [inline_without_move]: 1.445e-05 [renormalize]: 0.00164363 [add_forward_monad_depend]: 8.28999e-06 [auto_monad_grad]: 2.04e-06 [auto_monad_eliminator]: 2.805e-05 [cse]: 0.00022035 [replace_applicator]: 3.04e-05 [Cycle 3]: 0.00089601, [27] [switch_simplify]: 1.684e-05 [loop_unroll]: 1.498e-05 [a_1]: 0.00043854 [with_stream_mark]: 1.856e-05 [recompute_prepare]: 1.52e-05 [updatestate_depend_eliminate]: 8.21002e-06 [updatestate_assign_eliminate]: 7.63999e-06 [updatestate_loads_eliminate]: 7.35e-06 [parameter_eliminate]: 1.44e-06 [specialize_transform]: 1.463e-05 [updatestate_useless_node_eliminater]: 1.423e-05 [accelerated_algorithm]: 1.892e-05 [meta_shard_fg_expand]: 2.78e-06 [get_grad_eliminate_]: 1.4e-05 [merge_forward]: 8.69e-06 [cell_reuse_recompute_pass]: 1.85001e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.831e-05 [j_node_and_user_rematch]: 2.394e-05 [meta_fg_expand]: 5.25999e-06 [replace_old_param]: 2.102e-05 [inline_without_move]: 1.434e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.86e-06 [auto_monad_grad]: 1.72999e-06 [auto_monad_eliminator]: 1.627e-05 [cse]: 4.618e-05 [replace_applicator]: 1.53e-05 [py_interpret_to_execute_after_opt_a]: 2.17e-05 [rewriter_after_opt_a]: 0.00020952 [convert_after_rewriter]: 1.504e-05 [order_py_execute_after_rewriter]: 9.94001e-06 [mutable_eliminate]: 0.0008317 [jit_opt_b]: 0.00013456, [1] [Cycle 1]: 0.00012625, [2] [frontend_op_eliminate]: 4.783e-05 [inline_after_opt_a]: 6.551e-05 [cconv]: 3.254e-05 [loop_unroll]: 0.00046248 [jit_opt_after_cconv]: 0.00032587, [1] [Cycle 1]: 0.00031882, [11] [c_1]: 7.363e-05 [parameter_eliminate]: 3.43e-06 [updatestate_depend_eliminate]: 1.366e-05 [updatestate_assign_eliminate]: 9.24998e-06 [updatestate_loads_eliminate]: 8.89e-06 [cse]: 6.545e-05 [call_graph_tuple_transform]: 4.267e-05 [tuple_list_get_item_eliminator]: 2.932e-05 [none_parameter_eliminate]: 1.92999e-06 [renormalize]: 9.30013e-07 [switch_simplify]: 1.621e-05 [remove_dup_value]: 7.657e-05 [partial_unused_args_eliminate]: 2.27999e-06 [environ_conv]: 2.027e-05 [add_recomputation]: 0.00012231 [cse_after_recomputation]: 5.071e-05, [1] [Cycle 1]: 4.437e-05, [1] [cse]: 3.614e-05 [auto_monad_reorder]: 4.053e-05 [get_jit_bprop_graph]: 1.96998e-06 [rewriter_after_jit_bprop_graph]: 4.07e-06 [opt_after_jit_grad]: 0.00050553 [symbol_engine_optimizer]: 0.00017362, [1] [Cycle 1]: 0.00016245, [6] [build]: 2.992e-05 [elim_shapecalc]: 2.439e-05 [elim_not_effective]: 3.021e-05 [opt_reshape]: 1.654e-05 [fold_const_symbol]: 2.518e-05 [renormalize]: 4.39992e-07 [validate]: 0.000129 [backend_pass]: 1.15001e-06 [task_emit]: 2.11576 [execute]: 9.77001e-06 Sums bootstrap : 0.001350s : 0.05% type_inference : 0.521725s : 19.36% event_method : 0.000152s : 0.01% auto_monad : 0.000253s : 0.01% graph_reusing : 0.000007s : 0.00% pre_auto_parallel : 0.000012s : 0.00% py_interpret_to_execute : 0.000058s : 0.00% rewriter_before_opt_a : 0.000189s : 0.01% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000240s : 0.01% jit_opt_a.loop_unroll : 0.000141s : 0.01% jit_opt_a.a_1 : 0.004069s : 0.15% jit_opt_a.with_stream_mark : 0.000096s : 0.00% jit_opt_a.recompute_prepare : 0.000070s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000039s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000034s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000025s : 0.00% jit_opt_a.parameter_eliminate : 0.000008s : 0.00% jit_opt_a.specialize_transform : 0.000054s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000050s : 0.00% jit_opt_a.accelerated_algorithm : 0.000102s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000012s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000049s : 0.00% jit_opt_a.merge_forward : 0.000030s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000106s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000093s : 0.00% jit_opt_a.meta_fg_expand : 0.002626s : 0.10% jit_opt_a.replace_old_param : 0.000145s : 0.01% jit_opt_a.inline_without_move : 0.000113s : 0.00% jit_opt_a.renormalize : 0.043203s : 1.60% jit_opt_a.add_forward_monad_depend : 0.000047s : 0.00% jit_opt_a.auto_monad_grad : 0.000012s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000125s : 0.00% jit_opt_a.cse : 0.000553s : 0.02% jit_opt_a.replace_applicator : 0.000158s : 0.01% py_interpret_to_execute_after_opt_a : 0.000022s : 0.00% rewriter_after_opt_a : 0.000210s : 0.01% convert_after_rewriter : 0.000015s : 0.00% order_py_execute_after_rewriter : 0.000010s : 0.00% mutable_eliminate : 0.000832s : 0.03% jit_opt_b.frontend_op_eliminate : 0.000048s : 0.00% jit_opt_b.inline_after_opt_a : 0.000066s : 0.00% cconv : 0.000033s : 0.00% loop_unroll : 0.000462s : 0.02% jit_opt_after_cconv.c_1 : 0.000074s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000009s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000009s : 0.00% jit_opt_after_cconv.cse : 0.000065s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000043s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000029s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000016s : 0.00% remove_dup_value : 0.000077s : 0.00% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000020s : 0.00% add_recomputation : 0.000122s : 0.00% cse_after_recomputation.cse : 0.000036s : 0.00% auto_monad_reorder : 0.000041s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000506s : 0.02% symbol_engine_optimizer.build : 0.000030s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000024s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000030s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000017s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000025s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000129s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.115761s : 78.51% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.001177 278 0.38% : 0.000004s : 9: substitution.elim_not_effective 0.33% : 0.000004s : 9: substitution.fold_const_symbol 1.01% : 0.000012s : 13: substitution.graph_param_transform 52.87% : 0.000622s : 13: substitution.inline 2.08% : 0.000024s : 2: substitution.inline_without_move 2.16% : 0.000025s : 29: substitution.j_node_and_user_rematch 3.74% : 0.000044s : 3: substitution.less_batch_normalization 3.52% : 0.000041s : 25: substitution.minmaximum_grad 2.20% : 0.000026s : 5: substitution.partial_eliminate 1.79% : 0.000021s : 29: substitution.remove_not_recompute_node 3.15% : 0.000037s : 10: substitution.replace_applicator 1.71% : 0.000020s : 26: substitution.replace_old_param 0.39% : 0.000005s : 1: substitution.set_cell_output_no_recompute 5.44% : 0.000064s : 25: substitution.tuple_list_convert_item_index_to_positive 3.59% : 0.000042s : 25: substitution.tuple_list_get_item_depend_reorder 15.64% : 0.000184s : 54: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.521588 2 99.57% : 0.519364s : 1: type_inference.infer 0.43% : 0.002225s : 1: type_inference.specialize ------[replace.] 0.000284 32 52.70% : 0.000150s : 13: replace.inline 47.30% : 0.000135s : 19: replace.tuple_list_get_item_eliminator ------[match.] 0.000716 32 85.70% : 0.000613s : 13: match.inline 14.30% : 0.000102s : 19: match.tuple_list_get_item_eliminator ------[predicate.] 0.000709 5185 1.41% : 0.000010s : 83: predicate.accumulaten_eliminater 0.47% : 0.000003s : 13: predicate.ad_related_special_op_eliminate 1.36% : 0.000010s : 83: predicate.addn_check_dump 1.39% : 0.000010s : 83: predicate.addn_zero_filter 1.95% : 0.000014s : 83: predicate.arithmetic_simplify 1.45% : 0.000010s : 83: predicate.cast_eliminate 0.31% : 0.000002s : 13: predicate.check_bprop_eliminate 1.35% : 0.000010s : 83: predicate.compare_switch_simplify 1.39% : 0.000010s : 83: predicate.depend_value_elim 1.37% : 0.000010s : 83: predicate.dict_get_item_const_eliminator 1.51% : 0.000011s : 83: predicate.dict_get_item_eliminator 1.38% : 0.000010s : 83: predicate.dict_set_item_eliminator 0.40% : 0.000003s : 13: predicate.dumpgradient_eliminate 0.16% : 0.000001s : 13: predicate.elim_not_effective 0.43% : 0.000003s : 13: predicate.elim_shapecalc_of_broadcastargs 1.42% : 0.000010s : 83: predicate.environ_add_const_eliminate 1.36% : 0.000010s : 83: predicate.environ_get_add_eliminate 1.37% : 0.000010s : 83: predicate.environ_get_depend_swap 1.39% : 0.000010s : 83: predicate.environ_get_eliminate 1.36% : 0.000010s : 83: predicate.environ_get_set_eliminate 0.15% : 0.000001s : 13: predicate.fold_const_symbol 0.94% : 0.000007s : 47: predicate.get_grad_eliminate 0.14% : 0.000001s : 13: predicate.graph_param_transform 4.16% : 0.000029s : 141: predicate.inline 1.77% : 0.000013s : 80: predicate.inline_without_move 0.49% : 0.000003s : 47: predicate.j_node_and_user_rematch 1.04% : 0.000007s : 47: predicate.less_batch_normalization 1.84% : 0.000013s : 102: predicate.list_to_tuple_eliminator_ 2.04% : 0.000014s : 115: predicate.load_eliminater 0.52% : 0.000004s : 13: predicate.loop_unroll_after_grad 2.87% : 0.000020s : 151: predicate.loop_unroll_before_grad 1.75% : 0.000012s : 96: predicate.make_slice_get_slice_eliminator 1.36% : 0.000010s : 83: predicate.merge_addn 1.48% : 0.000010s : 83: predicate.minmaximum_grad 0.56% : 0.000004s : 13: predicate.mutable_eliminate 0.27% : 0.000002s : 13: predicate.opt_reshape 2.58% : 0.000018s : 115: predicate.partial_eliminate 1.36% : 0.000010s : 83: predicate.print_const_string_wrapper 1.75% : 0.000012s : 83: predicate.reduce_eliminate 3.89% : 0.000028s : 102: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000004s : 47: predicate.remove_not_recompute_node 2.69% : 0.000019s : 194: predicate.replace_applicator 0.97% : 0.000007s : 80: predicate.replace_old_param 0.19% : 0.000001s : 13: predicate.reset_defer_inline 1.44% : 0.000010s : 83: predicate.reshape_eliminate 1.42% : 0.000010s : 83: predicate.row_tensor_add_zeros_like 0.37% : 0.000003s : 13: predicate.row_tensor_eliminate 1.42% : 0.000010s : 83: predicate.same_eliminate 0.62% : 0.000004s : 47: predicate.set_cell_output_no_recompute 0.51% : 0.000004s : 26: predicate.special_op_eliminate 0.95% : 0.000007s : 47: predicate.specialize_transform 1.56% : 0.000011s : 83: predicate.split_environ_get_set_with_tuple_value 1.47% : 0.000010s : 83: predicate.stack_unstack_eliminate 0.26% : 0.000002s : 13: predicate.switch_call_monad_eliminater 2.67% : 0.000019s : 115: predicate.switch_defer_inline 2.21% : 0.000016s : 115: predicate.switch_layer_defer_inline 5.73% : 0.000041s : 279: predicate.switch_simplify 1.39% : 0.000010s : 83: predicate.tile_eliminate 1.39% : 0.000010s : 83: predicate.transpose_eliminate 1.82% : 0.000013s : 83: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000012s : 83: predicate.tuple_list_get_item_depend_reorder 3.56% : 0.000025s : 128: predicate.tuple_list_get_item_eliminator 1.72% : 0.000012s : 83: predicate.tuple_list_set_item_eliminator 1.79% : 0.000013s : 102: predicate.tuple_to_list_eliminator_ 1.95% : 0.000014s : 115: predicate.updatestate_pure_node_eliminater 3.14% : 0.000022s : 162: predicate.updatestate_useless_node_eliminater 1.73% : 0.000012s : 83: predicate.value_based_eliminate 0.25% : 0.000002s : 13: predicate.virtual_view_grad_eliminate 0.37% : 0.000003s : 13: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003874 38 69.98% : 0.002711s : 21: func_graph_cloner_run.FuncGraphClonerGraph 30.02% : 0.001163s : 17: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.770470 91 0.00% : 0.000126s : 1: add_recomputation 0.01% : 0.000261s : 1: auto_monad 0.00% : 0.000043s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.05% : 0.001377s : 1: bootstrap 0.00% : 0.000035s : 1: cconv 0.00% : 0.000018s : 1: convert_after_rewriter 0.00% : 0.000053s : 1: cse_after_recomputation 0.00% : 0.000023s : 1: environ_conv 0.01% : 0.000160s : 1: event_method 0.00% : 0.000015s : 1: execute 0.00% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 2.82% : 0.078247s : 1: jit_opt_a 0.01% : 0.000329s : 1: jit_opt_after_cconv 0.00% : 0.000137s : 1: jit_opt_b 0.02% : 0.000471s : 1: loop_unroll 0.03% : 0.000843s : 1: mutable_eliminate 0.19% : 0.005314s : 39: opt.transform.jit_opt_a 0.01% : 0.000158s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000105s : 4: opt.transform.jit_opt_b 0.00% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000028s : 1: opt.transform.mutable_eliminate 0.00% : 0.000054s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000092s : 4: opt.transform.symbol_engine_opt 0.03% : 0.000861s : 1: opt_after_jit_grad 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000014s : 1: pre_auto_parallel 0.00% : 0.000061s : 1: py_interpret_to_execute 0.00% : 0.000024s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000080s : 1: remove_dup_value 1.45% : 0.040180s : 2: renormalize.infer 0.11% : 0.002996s : 2: renormalize.specialize 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000214s : 1: rewriter_after_opt_a 0.01% : 0.000192s : 1: rewriter_before_opt_a 0.01% : 0.000177s : 1: symbol_engine_optimizer 76.37% : 2.115783s : 1: task_emit 18.83% : 0.521745s : 1: type_inference 0.01% : 0.000181s : 1: validate group_cases_4 have all been run, results of sub cases are below: case: (mindspore.float32, 1, False) {} pass. case: (mindspore.float16, 1, False) {} pass. case: (mindspore.float16, 1, True) {} pass. case: (mindspore.bfloat16, 1, True) {} pass. case: (mindspore.float16, 0, True) {} pass. case: (mindspore.bfloat16, 0, False) {} pass. case: (mindspore.float16, 0, False) {} pass. case: (mindspore.bfloat16, 0, True) {} pass. ops group_cases_5 with 8 cases start to running, all cases are below: case: (, mindspore.bfloat16, 1, False) case: (, 'pynative') case: (, 'KBK') case: (, 'GE') case: (, 0) case: (, 1) case: (, 1) case: (, 0) ops group_cases_5 total running memory: 302M, memory threshold: 51200M [WARNING] ME(71725:281473890602800,ForkProcess-47):2026-01-29-17:44:30.995.732 [mindspore/context.py:1334] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead. [WARNING] ME(71873:281473890602800,ForkProcess-48):2026-01-29-17:44:31.271.35 [mindspore/context.py:1334] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead. TotalTime = 0.470327, [30] [bootstrap]: 0.00505479 [type_inference]: 0.184897 [event_method]: 0.00031085 [auto_monad]: 0.00033149 [graph_reusing]: 8.23999e-06 [pre_auto_parallel]: 1.26e-05 [py_interpret_to_execute]: 7.436e-05 [rewriter_before_opt_a]: 0.00020665 [expand_dump_flag]: 3.81001e-06 [jit_opt_a]: 0.274873, [4] [Cycle 1]: 0.231365, [27] [switch_simplify]: 0.0002211 [loop_unroll]: 7.247e-05 [a_1]: 0.00219903 [with_stream_mark]: 6.385e-05 [recompute_prepare]: 4.936e-05 [updatestate_depend_eliminate]: 3.318e-05 [updatestate_assign_eliminate]: 2.139e-05 [updatestate_loads_eliminate]: 1.247e-05 [parameter_eliminate]: 4.58999e-06 [specialize_transform]: 2.681e-05 [updatestate_useless_node_eliminater]: 2.561e-05 [accelerated_algorithm]: 0.00010645 [meta_shard_fg_expand]: 1.518e-05 [get_grad_eliminate_]: 2.793e-05 [merge_forward]: 1.555e-05 [cell_reuse_recompute_pass]: 1.27e-06 [cell_reuse_handle_not_recompute_node_pass]: 6.012e-05 [j_node_and_user_rematch]: 5.414e-05 [meta_fg_expand]: 0.0959119 [replace_old_param]: 0.00019974 [inline_without_move]: 0.00017707 [renormalize]: 0.125679 [add_forward_monad_depend]: 0.0006162 [auto_monad_grad]: 8.495e-05 [auto_monad_eliminator]: 0.00042186 [cse]: 0.00357249 [replace_applicator]: 0.00121737 [Cycle 2]: 0.0170142, [27] [switch_simplify]: 0.00025759 [loop_unroll]: 0.00021381 [a_1]: 0.00731719 [with_stream_mark]: 4.434e-05 [recompute_prepare]: 2.742e-05 [updatestate_depend_eliminate]: 1.415e-05 [updatestate_assign_eliminate]: 1.152e-05 [updatestate_loads_eliminate]: 1.074e-05 [parameter_eliminate]: 2.89999e-06 [specialize_transform]: 2.109e-05 [updatestate_useless_node_eliminater]: 2.053e-05 [accelerated_algorithm]: 3.074e-05 [meta_shard_fg_expand]: 1.329e-05 [get_grad_eliminate_]: 2.045e-05 [merge_forward]: 1.349e-05 [cell_reuse_recompute_pass]: 1.40999e-06 [cell_reuse_handle_not_recompute_node_pass]: 4.458e-05 [j_node_and_user_rematch]: 3.722e-05 [meta_fg_expand]: 0.00036774 [replace_old_param]: 3.211e-05 [inline_without_move]: 2.102e-05 [renormalize]: 0.0073411 [add_forward_monad_depend]: 1.16e-05 [auto_monad_grad]: 2.89999e-06 [auto_monad_eliminator]: 4.666e-05 [cse]: 0.00078481 [replace_applicator]: 4.577e-05 [Cycle 3]: 0.0147768, [27] [switch_simplify]: 2.313e-05 [loop_unroll]: 1.961e-05 [a_1]: 0.00105458 [with_stream_mark]: 3.114e-05 [recompute_prepare]: 1.786e-05 [updatestate_depend_eliminate]: 8.64e-06 [updatestate_assign_eliminate]: 7.78001e-06 [updatestate_loads_eliminate]: 7.11999e-06 [parameter_eliminate]: 3.18998e-06 [specialize_transform]: 1.576e-05 [updatestate_useless_node_eliminater]: 1.31e-05 [accelerated_algorithm]: 2.201e-05 [meta_shard_fg_expand]: 5.52001e-06 [get_grad_eliminate_]: 1.278e-05 [merge_forward]: 8.28999e-06 [cell_reuse_recompute_pass]: 3.03998e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.197e-05 [j_node_and_user_rematch]: 2.361e-05 [meta_fg_expand]: 7.01001e-06 [replace_old_param]: 2.097e-05 [inline_without_move]: 1.349e-05 [renormalize]: 0.0129769 [add_forward_monad_depend]: 1.382e-05 [auto_monad_grad]: 2.51998e-06 [auto_monad_eliminator]: 3.844e-05 [cse]: 0.00013417 [replace_applicator]: 4.347e-05 [Cycle 4]: 0.0010845, [27] [switch_simplify]: 1.688e-05 [loop_unroll]: 1.485e-05 [a_1]: 0.00045359 [with_stream_mark]: 2.525e-05 [recompute_prepare]: 1.561e-05 [updatestate_depend_eliminate]: 9.49e-06 [updatestate_assign_eliminate]: 9.27001e-06 [updatestate_loads_eliminate]: 7.29001e-06 [parameter_eliminate]: 2.05002e-06 [specialize_transform]: 1.439e-05 [updatestate_useless_node_eliminater]: 1.383e-05 [accelerated_algorithm]: 2.453e-05 [meta_shard_fg_expand]: 5.33002e-06 [get_grad_eliminate_]: 1.305e-05 [merge_forward]: 9.64999e-06 [cell_reuse_recompute_pass]: 4.48999e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.266e-05 [j_node_and_user_rematch]: 2.604e-05 [meta_fg_expand]: 6.53003e-06 [replace_old_param]: 2.331e-05 [inline_without_move]: 1.343e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.15002e-06 [auto_monad_grad]: 1.62001e-06 [auto_monad_eliminator]: 2.225e-05 [cse]: 0.00012515 [replace_applicator]: 2.541e-05 [py_interpret_to_execute_after_opt_a]: 3.525e-05 [rewriter_after_opt_a]: 0.00024651 [convert_after_rewriter]: 1.577e-05 [order_py_execute_after_rewriter]: 9.08002e-06 [mutable_eliminate]: 0.00123156 [jit_opt_b]: 0.00014398, [1] [Cycle 1]: 0.00013337, [2] [frontend_op_eliminate]: 5.093e-05 [inline_after_opt_a]: 6.787e-05 [cconv]: 4.012e-05 [loop_unroll]: 0.0006669 [jit_opt_after_cconv]: 0.00034966, [1] [Cycle 1]: 0.00034113, [11] [c_1]: 6.633e-05 [parameter_eliminate]: 6.84001e-06 [updatestate_depend_eliminate]: 1.812e-05 [updatestate_assign_eliminate]: 7.08998e-06 [updatestate_loads_eliminate]: 6.66e-06 [cse]: 7.912e-05 [call_graph_tuple_transform]: 4.589e-05 [tuple_list_get_item_eliminator]: 2.812e-05 [none_parameter_eliminate]: 1.99999e-06 [renormalize]: 4.69998e-07 [switch_simplify]: 1.423e-05 [remove_dup_value]: 0.00010007 [partial_unused_args_eliminate]: 2.71e-06 [environ_conv]: 3.338e-05 [add_recomputation]: 0.00014951 [cse_after_recomputation]: 6.719e-05, [1] [Cycle 1]: 5.741e-05, [1] [cse]: 4.602e-05 [auto_monad_reorder]: 4.262e-05 [get_jit_bprop_graph]: 2.51e-06 [rewriter_after_jit_bprop_graph]: 7.33e-06 [opt_after_jit_grad]: 0.00065803 [symbol_engine_optimizer]: 0.00019066, [1] [Cycle 1]: 0.00018267, [6] [build]: 3.41e-05 [elim_shapecalc]: 1.923e-05 [elim_not_effective]: 3.822e-05 [opt_reshape]: 1.936e-05 [fold_const_symbol]: 3.398e-05 [renormalize]: 5.69999e-07 [validate]: 0.00015399 Sums bootstrap : 0.005055s : 1.10% type_inference : 0.184897s : 40.37% event_method : 0.000311s : 0.07% auto_monad : 0.000331s : 0.07% graph_reusing : 0.000008s : 0.00% pre_auto_parallel : 0.000013s : 0.00% py_interpret_to_execute : 0.000074s : 0.02% rewriter_before_opt_a : 0.000207s : 0.05% expand_dump_flag : 0.000004s : 0.00% jit_opt_a.switch_simplify : 0.000519s : 0.11% jit_opt_a.loop_unroll : 0.000321s : 0.07% jit_opt_a.a_1 : 0.011024s : 2.41% jit_opt_a.with_stream_mark : 0.000165s : 0.04% jit_opt_a.recompute_prepare : 0.000110s : 0.02% jit_opt_a.updatestate_depend_eliminate : 0.000065s : 0.01% jit_opt_a.updatestate_assign_eliminate : 0.000050s : 0.01% jit_opt_a.updatestate_loads_eliminate : 0.000038s : 0.01% jit_opt_a.parameter_eliminate : 0.000013s : 0.00% jit_opt_a.specialize_transform : 0.000078s : 0.02% jit_opt_a.updatestate_useless_node_eliminater : 0.000073s : 0.02% jit_opt_a.accelerated_algorithm : 0.000184s : 0.04% jit_opt_a.meta_shard_fg_expand : 0.000039s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000074s : 0.02% jit_opt_a.merge_forward : 0.000047s : 0.01% jit_opt_a.cell_reuse_recompute_pass : 0.000010s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000169s : 0.04% jit_opt_a.j_node_and_user_rematch : 0.000141s : 0.03% jit_opt_a.meta_fg_expand : 0.096293s : 21.02% jit_opt_a.replace_old_param : 0.000276s : 0.06% jit_opt_a.inline_without_move : 0.000225s : 0.05% jit_opt_a.renormalize : 0.145997s : 31.88% jit_opt_a.add_forward_monad_depend : 0.000644s : 0.14% jit_opt_a.auto_monad_grad : 0.000092s : 0.02% jit_opt_a.auto_monad_eliminator : 0.000529s : 0.12% jit_opt_a.cse : 0.004617s : 1.01% jit_opt_a.replace_applicator : 0.001332s : 0.29% py_interpret_to_execute_after_opt_a : 0.000035s : 0.01% rewriter_after_opt_a : 0.000247s : 0.05% convert_after_rewriter : 0.000016s : 0.00% order_py_execute_after_rewriter : 0.000009s : 0.00% mutable_eliminate : 0.001232s : 0.27% jit_opt_b.frontend_op_eliminate : 0.000051s : 0.01% jit_opt_b.inline_after_opt_a : 0.000068s : 0.01% cconv : 0.000040s : 0.01% loop_unroll : 0.000667s : 0.15% jit_opt_after_cconv.c_1 : 0.000066s : 0.01% jit_opt_after_cconv.parameter_eliminate : 0.000007s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000018s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.00% jit_opt_after_cconv.cse : 0.000079s : 0.02% jit_opt_after_cconv.call_graph_tuple_transform : 0.000046s : 0.01% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000028s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000014s : 0.00% remove_dup_value : 0.000100s : 0.02% partial_unused_args_eliminate : 0.000003s : 0.00% environ_conv : 0.000033s : 0.01% add_recomputation : 0.000150s : 0.03% cse_after_recomputation.cse : 0.000046s : 0.01% auto_monad_reorder : 0.000043s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000658s : 0.14% symbol_engine_optimizer.build : 0.000034s : 0.01% symbol_engine_optimizer.elim_shapecalc : 0.000019s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000038s : 0.01% symbol_engine_optimizer.opt_reshape : 0.000019s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000034s : 0.01% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000154s : 0.03% Time group info: ------[substitution.] 0.004831 486 0.30% : 0.000014s : 3: substitution.addn_check_dump 2.84% : 0.000137s : 5: substitution.addn_zero_filter 0.32% : 0.000016s : 2: substitution.cast_eliminate 0.14% : 0.000007s : 8: substitution.elim_not_effective 0.13% : 0.000006s : 8: substitution.fold_const_symbol 21.19% : 0.001024s : 5: substitution.getattr_setattr_resolve 0.23% : 0.000011s : 11: substitution.graph_param_transform 50.69% : 0.002449s : 35: substitution.inline 0.93% : 0.000045s : 5: substitution.inline_without_move 0.73% : 0.000035s : 47: substitution.j_node_and_user_rematch 1.68% : 0.000081s : 7: substitution.less_batch_normalization 0.31% : 0.000015s : 3: substitution.merge_addn 1.44% : 0.000070s : 39: substitution.minmaximum_grad 1.51% : 0.000073s : 6: substitution.partial_eliminate 0.68% : 0.000033s : 47: substitution.remove_not_recompute_node 6.00% : 0.000290s : 57: substitution.replace_applicator 0.62% : 0.000030s : 31: substitution.replace_old_param 0.14% : 0.000007s : 1: substitution.set_cell_output_no_recompute 2.57% : 0.000124s : 42: substitution.tuple_list_convert_item_index_to_positive 1.65% : 0.000080s : 39: substitution.tuple_list_get_item_depend_reorder 4.99% : 0.000241s : 79: substitution.tuple_list_get_item_eliminator 0.91% : 0.000044s : 6: substitution.tuple_list_set_item_eliminator ------[type_inference.] 0.184720 2 97.41% : 0.179938s : 1: type_inference.infer 2.59% : 0.004783s : 1: type_inference.specialize ------[replace.] 0.001618 99 3.16% : 0.000051s : 2: replace.addn_zero_filter 0.89% : 0.000014s : 2: replace.cast_eliminate 4.76% : 0.000077s : 4: replace.getattr_setattr_resolve 27.93% : 0.000452s : 35: replace.inline 27.42% : 0.000444s : 21: replace.replace_applicator 31.17% : 0.000504s : 32: replace.tuple_list_get_item_eliminator 4.67% : 0.000076s : 3: replace.tuple_list_set_item_eliminator ------[match.] 0.003559 99 2.70% : 0.000096s : 2: match.addn_zero_filter 0.40% : 0.000014s : 2: match.cast_eliminate 21.51% : 0.000766s : 4: match.getattr_setattr_resolve 68.02% : 0.002421s : 35: match.inline 3.90% : 0.000139s : 21: match.replace_applicator 2.72% : 0.000097s : 32: match.tuple_list_get_item_eliminator 0.74% : 0.000026s : 3: match.tuple_list_set_item_eliminator ------[predicate.] 0.002432 8637 0.84% : 0.000020s : 134: predicate.accumulaten_eliminater 0.19% : 0.000005s : 11: predicate.ad_related_special_op_eliminate 0.69% : 0.000017s : 134: predicate.addn_check_dump 0.87% : 0.000021s : 136: predicate.addn_zero_filter 1.38% : 0.000034s : 136: predicate.arithmetic_simplify 0.89% : 0.000022s : 138: predicate.cast_eliminate 0.07% : 0.000002s : 11: predicate.check_bprop_eliminate 0.72% : 0.000018s : 134: predicate.compare_switch_simplify 0.79% : 0.000019s : 134: predicate.depend_value_elim 0.79% : 0.000019s : 138: predicate.dict_get_item_const_eliminator 0.91% : 0.000022s : 138: predicate.dict_get_item_eliminator 0.87% : 0.000021s : 138: predicate.dict_set_item_eliminator 0.12% : 0.000003s : 11: predicate.dumpgradient_eliminate 0.05% : 0.000001s : 11: predicate.elim_not_effective 0.10% : 0.000003s : 11: predicate.elim_shapecalc_of_broadcastargs 0.75% : 0.000018s : 138: predicate.environ_add_const_eliminate 3.92% : 0.000095s : 138: predicate.environ_get_add_eliminate 0.73% : 0.000018s : 138: predicate.environ_get_depend_swap 0.79% : 0.000019s : 138: predicate.environ_get_eliminate 0.78% : 0.000019s : 138: predicate.environ_get_set_eliminate 0.06% : 0.000001s : 11: predicate.fold_const_symbol 0.44% : 0.000011s : 64: predicate.get_grad_eliminate 0.38% : 0.000009s : 31: predicate.getattr_setattr_resolve 0.05% : 0.000001s : 11: predicate.graph_param_transform 2.42% : 0.000059s : 230: predicate.inline 1.26% : 0.000031s : 163: predicate.inline_without_move 0.21% : 0.000005s : 64: predicate.j_node_and_user_rematch 0.67% : 0.000016s : 64: predicate.less_batch_normalization 1.05% : 0.000026s : 173: predicate.list_to_tuple_eliminator_ 2.73% : 0.000066s : 184: predicate.load_eliminater 0.23% : 0.000006s : 11: predicate.loop_unroll_after_grad 2.46% : 0.000060s : 316: predicate.loop_unroll_before_grad 0.96% : 0.000023s : 149: predicate.make_slice_get_slice_eliminator 0.71% : 0.000017s : 134: predicate.merge_addn 0.90% : 0.000022s : 136: predicate.minmaximum_grad 0.28% : 0.000007s : 11: predicate.mutable_eliminate 0.09% : 0.000002s : 11: predicate.opt_reshape 1.27% : 0.000031s : 184: predicate.partial_eliminate 0.75% : 0.000018s : 134: predicate.print_const_string_wrapper 1.14% : 0.000028s : 136: predicate.reduce_eliminate 1.10% : 0.000027s : 173: predicate.redundant_stop_gradient_eliminater 0.24% : 0.000006s : 64: predicate.remove_not_recompute_node 1.84% : 0.000045s : 464: predicate.replace_applicator 0.58% : 0.000014s : 163: predicate.replace_old_param 0.05% : 0.000001s : 11: predicate.reset_defer_inline 0.80% : 0.000019s : 136: predicate.reshape_eliminate 0.86% : 0.000021s : 134: predicate.row_tensor_add_zeros_like 0.12% : 0.000003s : 11: predicate.row_tensor_eliminate 36.96% : 0.000899s : 134: predicate.same_eliminate 0.24% : 0.000006s : 64: predicate.set_cell_output_no_recompute 0.17% : 0.000004s : 22: predicate.special_op_eliminate 0.41% : 0.000010s : 64: predicate.specialize_transform 1.03% : 0.000025s : 134: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000018s : 134: predicate.stack_unstack_eliminate 0.07% : 0.000002s : 11: predicate.switch_call_monad_eliminater 2.58% : 0.000063s : 208: predicate.switch_defer_inline 1.34% : 0.000033s : 208: predicate.switch_layer_defer_inline 4.68% : 0.000114s : 535: predicate.switch_simplify 0.83% : 0.000020s : 136: predicate.tile_eliminate 0.79% : 0.000019s : 136: predicate.transpose_eliminate 1.02% : 0.000025s : 138: predicate.tuple_list_convert_item_index_to_positive 0.98% : 0.000024s : 138: predicate.tuple_list_get_item_depend_reorder 2.01% : 0.000049s : 195: predicate.tuple_list_get_item_eliminator 1.24% : 0.000030s : 141: predicate.tuple_list_set_item_eliminator 1.01% : 0.000024s : 173: predicate.tuple_to_list_eliminator_ 1.12% : 0.000027s : 184: predicate.updatestate_pure_node_eliminater 1.57% : 0.000038s : 248: predicate.updatestate_useless_node_eliminater 1.11% : 0.000027s : 134: predicate.value_based_eliminate 0.06% : 0.000001s : 11: predicate.virtual_view_grad_eliminate 0.11% : 0.000003s : 11: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.022287 120 53.38% : 0.011897s : 61: func_graph_cloner_run.FuncGraphClonerGraph 0.62% : 0.000139s : 2: func_graph_cloner_run.FuncGraphClonerNode 45.99% : 0.010250s : 57: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.632084 104 0.02% : 0.000155s : 1: add_recomputation 0.05% : 0.000340s : 1: auto_monad 0.01% : 0.000046s : 1: auto_monad_reorder 0.80% : 0.005081s : 1: bootstrap 0.01% : 0.000043s : 1: cconv 0.00% : 0.000020s : 1: convert_after_rewriter 0.01% : 0.000070s : 1: cse_after_recomputation 0.01% : 0.000038s : 1: environ_conv 0.05% : 0.000319s : 1: event_method 0.00% : 0.000007s : 1: expand_dump_flag 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 43.49% : 0.274879s : 1: jit_opt_a 0.06% : 0.000353s : 1: jit_opt_after_cconv 0.02% : 0.000147s : 1: jit_opt_b 0.11% : 0.000681s : 1: loop_unroll 0.20% : 0.001251s : 1: mutable_eliminate 2.27% : 0.014378s : 52: opt.transform.jit_opt_a 0.02% : 0.000149s : 4: opt.transform.jit_opt_after_cconv 0.02% : 0.000107s : 4: opt.transform.jit_opt_b 0.01% : 0.000054s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000147s : 1: opt.transform.mutable_eliminate 0.01% : 0.000060s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.001186s : 2: opt.transform.opt_resolve 0.02% : 0.000105s : 4: opt.transform.symbol_engine_opt 0.11% : 0.000669s : 1: opt_after_jit_grad 0.00% : 0.000011s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000015s : 1: pre_auto_parallel 0.01% : 0.000078s : 1: py_interpret_to_execute 0.01% : 0.000038s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000105s : 1: remove_dup_value 19.26% : 0.121765s : 3: renormalize.infer 3.83% : 0.024179s : 3: renormalize.specialize 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000254s : 1: rewriter_after_opt_a 0.03% : 0.000210s : 1: rewriter_before_opt_a 0.03% : 0.000194s : 1: symbol_engine_optimizer 29.26% : 0.184919s : 1: type_inference TotalTime = 4.48317, [24] [bootstrap]: 0.0011894 [type_inference]: 0.72736 [event_method]: 2.594e-05 [auto_monad]: 0.00015257 [graph_reusing]: 6.23e-06 [inline]: 4.65001e-06 [add_attr]: 0.0433857, [1] [add_attr_with_inline]: 0.0433669, [1] [Cycle 1]: 0.00013485, [2] [tag_attr]: 3.781e-05 [meta_addattr_fg_expand]: 1.3e-05 [parallel-infer-symbol]: 3.36001e-06 [pre_auto_parallel]: 5.546e-05 [insert-virtual-dataset]: 2.66999e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.41e-06 [pipeline_split]: 1.60001e-06 [optimize]: 0.00970208, [53] [py_interpret_to_execute]: 6.96001e-06 [rewriter_before_opt_a]: 0.00024464 [opt_a]: 0.00659295, [2] [Cycle 1]: 0.00564812, [45] [expand_dump_flag]: 3.4e-06 [switch_simplify]: 5.903e-05 [loop_unroll]: 2.881e-05 [a_1]: 0.00065286 [with_stream_mark]: 3.667e-05 [recompute_prepare]: 1.354e-05 [updatestate_depend_eliminate]: 1.461e-05 [updatestate_assign_eliminate]: 1.122e-05 [updatestate_loads_eliminate]: 4.37e-06 [parameter_eliminate]: 1.71e-06 [a_2]: 0.00018824 [accelerated_algorithm]: 1.163e-05 [shard]: 1.92001e-06 [meta_shard_fg_expand]: 2.91999e-06 [shard_inline]: 9.72001e-06 [merge_send_recv]: 4.195e-05 [auto_parallel]: 8.81997e-06 [parallel]: 8.419e-05 [flash_sp]: 3.259e-05 [merge_comm]: 5.67001e-06 [allreduce_fusion]: 1.159e-05 [matmul_add_comm_reduction]: 1.774e-05 [allreduce_slice_to_reducescatter]: 7.73999e-06 [virtual_shard_identity]: 1.522e-05 [virtual_dataset]: 9.88998e-06 [get_grad_eliminate_]: 9.46998e-06 [virtual_output]: 9.62999e-06 [merge_forward]: 6.38998e-06 [cell_reuse_recompute_pass]: 1.38002e-06 [offload_activation]: 1.809e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.574e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.431e-05 [set_forward_comm_id_for_comm_node_pass]: 1.23e-05 [meta_fg_expand]: 3.73001e-06 [flash_sp_send_recv_attached]: 2.94001e-06 [receive_attached]: 1.744e-05 [after_resolve]: 1.673e-05 [a_after_grad]: 1.481e-05 [renormalize]: 0.00373581 [add_forward_monad_depend]: 8.50001e-06 [auto_monad_grad]: 2.69999e-06 [auto_monad_eliminator]: 3.146e-05 [cse]: 7.917e-05 [a_3]: 7.866e-05 [Cycle 2]: 0.00093215, [45] [expand_dump_flag]: 2.47001e-06 [switch_simplify]: 1.176e-05 [loop_unroll]: 9.89001e-06 [a_1]: 0.00023689 [with_stream_mark]: 1.84e-05 [recompute_prepare]: 9.79e-06 [updatestate_depend_eliminate]: 4.72e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 4.22998e-06 [parameter_eliminate]: 1.47999e-06 [a_2]: 0.00011624 [accelerated_algorithm]: 1.094e-05 [shard]: 2.49999e-06 [meta_shard_fg_expand]: 2.53e-06 [shard_inline]: 9.34e-06 [merge_send_recv]: 9.05001e-06 [auto_parallel]: 9.49e-06 [parallel]: 7.64002e-06 [flash_sp]: 3.66001e-06 [merge_comm]: 4.65999e-06 [allreduce_fusion]: 3.81999e-06 [matmul_add_comm_reduction]: 9.74e-06 [allreduce_slice_to_reducescatter]: 5.90022e-07 [virtual_shard_identity]: 1.173e-05 [virtual_dataset]: 9.15001e-06 [get_grad_eliminate_]: 9.15999e-06 [virtual_output]: 9.51e-06 [merge_forward]: 5.60001e-06 [cell_reuse_recompute_pass]: 2.02999e-06 [offload_activation]: 9.79e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.978e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.26e-05 [set_forward_comm_id_for_comm_node_pass]: 4.23999e-06 [meta_fg_expand]: 2.86e-06 [flash_sp_send_recv_attached]: 1.74e-06 [receive_attached]: 2.09e-06 [after_resolve]: 1.726e-05 [a_after_grad]: 1.487e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.70002e-06 [auto_monad_grad]: 1.42999e-06 [auto_monad_eliminator]: 8.05999e-06 [cse]: 2.801e-05 [a_3]: 5.809e-05 [py_interpret_to_execute_after_opt_a]: 9.24e-06 [slice_cell_reuse_recomputed_activation]: 2.02001e-06 [rewriter_after_opt_a]: 3.574e-05 [convert_after_rewriter]: 1.27999e-06 [order_py_execute_after_rewriter]: 1.10999e-06 [mutable_eliminate]: 0.00081887 [opt_b]: 0.00030738, [1] [Cycle 1]: 0.00029868, [7] [b_1]: 0.00019658 [b_2]: 1.223e-05 [updatestate_depend_eliminate]: 1.073e-05 [updatestate_assign_eliminate]: 3.40003e-06 [updatestate_loads_eliminate]: 3.57002e-06 [renormalize]: 5.39992e-07 [cse]: 3.487e-05 [optimize_parallel_all_gather_comm]: 3.292e-05 [overlap_param_gather]: 1.042e-05 [cconv]: 3.17e-05 [loop_unroll]: 0.00051141 [opt_after_cconv]: 0.0001354, [1] [Cycle 1]: 0.00012862, [7] [c_1]: 4.856e-05 [parameter_eliminate]: 3.98001e-06 [updatestate_depend_eliminate]: 6.81001e-06 [updatestate_assign_eliminate]: 3.31999e-06 [updatestate_loads_eliminate]: 2.97002e-06 [cse]: 2.863e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 4.394e-05 [tuple_transform]: 0.0001002, [1] [Cycle 1]: 9.533e-05, [4] [d_1]: 6.399e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 9.12999e-06 [partial_unused_args_eliminate]: 1.59e-06 [add_recomputation]: 0.00010593 [cse_after_recomputation]: 2.975e-05, [1] [Cycle 1]: 2.492e-05, [1] [cse]: 1.78e-05 [environ_conv]: 3.587e-05 [swap_dp_allreduce_reducescatter]: 2.525e-05 [bias_add_comm_swap]: 1.07e-05 [label_micro_interleaved_index]: 1.427e-05 [label_fine_grained_interleaved_index]: 3.2e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 1.86e-06 [micro_interleaved_order_control]: 2.56998e-06 [assign_add_opt]: 1.19998e-06 [ForceFp32Comm]: 9.29984e-07 [remove_cast_before_assign_add]: 9.56e-06 [full_micro_interleaved_order_control]: 9.84001e-06 [reorder_send_recv_between_fp_bp]: 2.75997e-06 [comm_op_add_attrs]: 1.89999e-06 [add_comm_op_reuse_tag]: 1.30999e-06 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 8.25999e-06 [overlap_opt_shard_in_pipeline]: 2.678e-05 [overlap_opt_shard_grad_in_pipeline]: 2.01e-06 [control_data_broadcast_order]: 1.806e-05 [grouped_pairwise_exchange_alltoall]: 1.64998e-06 [offloading_packed_experts]: 4.72998e-06 [overlap_recompute_and_grad_model_parallel]: 1.306e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.29e-06 [overlap_recompute_allgather_and_fa_grad]: 1.60001e-06 [overlap_recompute_comm]: 2.37999e-06 [overlap_grad_ring_attention]: 1.93e-05 [overlap_grad_flash_sp]: 4.942e-05 [begin_end_overlap_inline]: 6.30011e-07 [split_matmul_comm_elemetwise]: 1.038e-05 [split_layernorm_comm]: 1.69e-06 [handle_group_info]: 1.05001e-06 [symbol_engine_optimizer]: 9.997e-05, [1] [Cycle 1]: 9.391e-05, [6] [build]: 3.26999e-06 [elim_shapecalc]: 1.673e-05 [elim_not_effective]: 1.731e-05 [opt_reshape]: 1.04e-05 [fold_const_symbol]: 1.302e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.23998e-06 [pipeline_parallel_scheduler]: 1.96e-06 [auto_monad_reorder]: 2.7e-05 [get_jit_bprop_graph]: 1.49e-06 [rewriter_after_jit_bprop_graph]: 4.32998e-06 [opt_after_jit_grad]: 0.00057232 [validate]: 8.771e-05 [backend_pass]: 1.15999e-06 [task_emit]: 3.70025 [execute]: 1.172e-05 Sums bootstrap : 0.001189s : 0.03% type_inference : 0.727360s : 16.39% event_method : 0.000026s : 0.00% auto_monad : 0.000153s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000005s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000038s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000055s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.00% optimize.rewriter_before_opt_a : 0.000245s : 0.01% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000071s : 0.00% optimize.opt_a.loop_unroll : 0.000039s : 0.00% optimize.opt_a.a_1 : 0.000890s : 0.02% optimize.opt_a.with_stream_mark : 0.000055s : 0.00% optimize.opt_a.recompute_prepare : 0.000023s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000015s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000304s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000023s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000019s : 0.00% optimize.opt_a.merge_send_recv : 0.000051s : 0.00% optimize.opt_a.auto_parallel : 0.000018s : 0.00% optimize.opt_a.parallel : 0.000092s : 0.00% optimize.opt_a.flash_sp : 0.000036s : 0.00% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000015s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000027s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000008s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000027s : 0.00% optimize.opt_a.virtual_dataset : 0.000019s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.00% optimize.opt_a.virtual_output : 0.000019s : 0.00% optimize.opt_a.merge_forward : 0.000012s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000028s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000046s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000027s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000017s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000020s : 0.00% optimize.opt_a.after_resolve : 0.000034s : 0.00% optimize.opt_a.a_after_grad : 0.000030s : 0.00% optimize.opt_a.renormalize : 0.003736s : 0.08% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000040s : 0.00% optimize.opt_a.cse : 0.000107s : 0.00% optimize.opt_a.a_3 : 0.000137s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000036s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000819s : 0.02% optimize.opt_b.b_1 : 0.000197s : 0.00% optimize.opt_b.b_2 : 0.000012s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000035s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000033s : 0.00% optimize.overlap_param_gather : 0.000010s : 0.00% optimize.cconv : 0.000032s : 0.00% optimize.loop_unroll : 0.000511s : 0.01% optimize.opt_after_cconv.c_1 : 0.000049s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000029s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000044s : 0.00% optimize.tuple_transform.d_1 : 0.000064s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000106s : 0.00% optimize.cse_after_recomputation.cse : 0.000018s : 0.00% optimize.environ_conv : 0.000036s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000025s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000014s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000002s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000027s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.00% optimize.overlap_grad_flash_sp : 0.000049s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000027s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000572s : 0.01% validate : 0.000088s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 3.700252s : 83.37% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 0.000213 31 0.95% : 0.000002s : 2: substitution.elim_not_effective 0.65% : 0.000001s : 2: substitution.fold_const_symbol 3.58% : 0.000008s : 7: substitution.graph_param_transform 76.81% : 0.000163s : 3: substitution.inline 1.90% : 0.000004s : 4: substitution.j_node_and_user_rematch 6.45% : 0.000014s : 4: substitution.remove_not_recompute_node 3.46% : 0.000007s : 6: substitution.replace_old_param 6.19% : 0.000013s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.727207 2 99.52% : 0.723714s : 1: type_inference.infer 0.48% : 0.003493s : 1: type_inference.specialize ------[replace.] 0.000063 6 64.37% : 0.000040s : 3: replace.inline 35.63% : 0.000022s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 6 93.29% : 0.000161s : 3: match.inline 6.71% : 0.000012s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000253 2039 0.79% : 0.000002s : 19: predicate.accumulaten_eliminater 0.71% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.68% : 0.000002s : 16: predicate.addn_check_dump 0.94% : 0.000002s : 19: predicate.addn_zero_filter 0.70% : 0.000002s : 19: predicate.adjust_all_reduce_mul_add 2.10% : 0.000005s : 35: predicate.arithmetic_simplify 0.81% : 0.000002s : 19: predicate.cast_eliminate 0.71% : 0.000002s : 16: predicate.check_bprop_eliminate 0.66% : 0.000002s : 16: predicate.compare_switch_simplify 0.30% : 0.000001s : 8: predicate.const_output_eliminate 0.72% : 0.000002s : 16: predicate.depend_value_elim 0.88% : 0.000002s : 19: predicate.dict_get_item_const_eliminator 1.08% : 0.000003s : 19: predicate.dict_get_item_eliminator 0.76% : 0.000002s : 19: predicate.dict_set_item_eliminator 1.01% : 0.000003s : 15: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 7: predicate.elim_not_effective 0.63% : 0.000002s : 7: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000003s : 27: predicate.environ_add_const_eliminate 1.00% : 0.000003s : 27: predicate.environ_get_add_eliminate 1.02% : 0.000003s : 27: predicate.environ_get_depend_swap 1.78% : 0.000005s : 43: predicate.environ_get_eliminate 1.03% : 0.000003s : 27: predicate.environ_get_set_eliminate 1.09% : 0.000003s : 25: predicate.exchange_switch_depend_value 2.01% : 0.000005s : 25: predicate.float_depend_g_call 0.68% : 0.000002s : 16: predicate.float_environ_get_switch 1.14% : 0.000003s : 24: predicate.float_tuple_getitem_switch 0.24% : 0.000001s : 7: predicate.fold_const_symbol 0.81% : 0.000002s : 16: predicate.get_grad_eliminate 0.25% : 0.000001s : 7: predicate.graph_param_transform 0.68% : 0.000002s : 16: predicate.incorporate_call 0.60% : 0.000002s : 16: predicate.incorporate_call_switch 5.87% : 0.000015s : 92: predicate.inline 0.83% : 0.000002s : 16: predicate.inline_without_move 0.49% : 0.000001s : 16: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 16: predicate.less_batch_normalization 1.63% : 0.000004s : 37: predicate.list_to_tuple_eliminator_ 2.30% : 0.000006s : 57: predicate.load_eliminater 1.16% : 0.000003s : 8: predicate.loop_unroll_after_grad 1.93% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 35: predicate.make_slice_get_slice_eliminator 0.69% : 0.000002s : 16: predicate.merge_addn 0.66% : 0.000002s : 16: predicate.micro_step_allgather_replace 0.72% : 0.000002s : 16: predicate.mini_step_allgather_replace 0.78% : 0.000002s : 19: predicate.minmaximum_grad 1.38% : 0.000003s : 8: predicate.mutable_eliminate 0.50% : 0.000001s : 7: predicate.opt_reshape 0.41% : 0.000001s : 8: predicate.parallel_virtual_node 1.70% : 0.000004s : 25: predicate.partial_defer_inline 1.38% : 0.000003s : 30: predicate.partial_eliminate 0.80% : 0.000002s : 19: predicate.print_const_string_wrapper 0.79% : 0.000002s : 16: predicate.reduce_all_const_elim 1.01% : 0.000003s : 19: predicate.reduce_eliminate 2.37% : 0.000006s : 57: predicate.redundant_stop_gradient_eliminater 0.66% : 0.000002s : 16: predicate.remove_not_recompute_node 1.67% : 0.000004s : 38: predicate.replace_applicator 0.71% : 0.000002s : 16: predicate.replace_old_param 0.33% : 0.000001s : 8: predicate.reset_defer_inline 0.84% : 0.000002s : 19: predicate.reshape_eliminate 0.73% : 0.000002s : 16: predicate.row_tensor_add_zeros_like 0.58% : 0.000001s : 8: predicate.row_tensor_eliminate 0.93% : 0.000002s : 16: predicate.same_eliminate 0.66% : 0.000002s : 16: predicate.set_cell_output_no_recompute 1.26% : 0.000003s : 16: predicate.shard_identity_eliminate 0.83% : 0.000002s : 15: predicate.special_op_eliminate 0.77% : 0.000002s : 16: predicate.specialize_transform 1.24% : 0.000003s : 16: predicate.split_environ_get_set_with_tuple_value 0.99% : 0.000003s : 16: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 8: predicate.switch_call_monad_eliminater 1.18% : 0.000003s : 25: predicate.switch_defer_inline 1.84% : 0.000005s : 41: predicate.switch_layer_defer_inline 4.37% : 0.000011s : 85: predicate.switch_simplify 0.88% : 0.000002s : 19: predicate.tile_eliminate 0.88% : 0.000002s : 19: predicate.transpose_eliminate 1.40% : 0.000004s : 34: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000004s : 34: predicate.tuple_list_get_item_const_eliminator 1.29% : 0.000003s : 34: predicate.tuple_list_get_item_depend_reorder 3.32% : 0.000008s : 53: predicate.tuple_list_get_item_eliminator 1.50% : 0.000004s : 34: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000006s : 50: predicate.tuple_list_set_item_eliminator 1.68% : 0.000004s : 37: predicate.tuple_to_list_eliminator_ 2.21% : 0.000006s : 57: predicate.updatestate_pure_node_eliminater 2.95% : 0.000007s : 73: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 8: predicate.value_based_eliminate 0.82% : 0.000002s : 16: predicate.virtual_dataset_eliminate 0.87% : 0.000002s : 16: predicate.virtual_output_eliminate 0.30% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.65% : 0.000002s : 8: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.006521 25 87.12% : 0.005681s : 20: func_graph_cloner_run.FuncGraphClonerGraph 12.88% : 0.000840s : 5: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 4.541871 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.96% : 0.043391s : 1: add_attr 0.95% : 0.043372s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000111s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.00% : 0.000158s : 1: auto_monad 0.00% : 0.000032s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.03% : 0.001226s : 1: bootstrap 0.00% : 0.000036s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.00% : 0.000022s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000041s : 1: environ_conv 0.00% : 0.000034s : 1: event_method 0.00% : 0.000020s : 1: execute 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000018s : 1: label_micro_interleaved_index 0.01% : 0.000522s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000831s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000023s : 1: opt.transform.mutable_eliminate 0.04% : 0.001624s : 78: opt.transform.opt_a 0.00% : 0.000047s : 1: opt.transform.opt_after_cconv 0.00% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000179s : 28: opt.transform.opt_b 0.00% : 0.000071s : 2: opt.transform.opt_trans_graph 0.00% : 0.000053s : 4: opt.transform.symbol_engine_opt 0.15% : 0.006597s : 1: opt_a 0.00% : 0.000139s : 1: opt_after_cconv 0.01% : 0.000585s : 1: opt_after_jit_grad 0.01% : 0.000311s : 1: opt_b 0.21% : 0.009708s : 1: optimize 0.00% : 0.000037s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000054s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000022s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000032s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000006s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000060s : 1: pre_auto_parallel 0.00% : 0.000011s : 1: py_interpret_to_execute 0.00% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.00% : 0.000049s : 1: remove_dup_value 0.06% : 0.002785s : 1: renormalize.infer 0.02% : 0.000937s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000039s : 1: rewriter_after_opt_a 0.01% : 0.000252s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000029s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000103s : 1: symbol_engine_optimizer 81.47% : 3.700300s : 1: task_emit 0.00% : 0.000103s : 1: tuple_transform 16.02% : 0.727400s : 1: type_inference 0.00% : 0.000124s : 1: validate TotalTime = 0.71222, [33] [bootstrap]: 0.00074997 [type_inference]: 0.452741 [event_method]: 0.00077754 [auto_monad]: 0.00016086 [graph_reusing]: 8.55999e-06 [pre_auto_parallel]: 3.28e-06 [py_interpret_to_execute]: 5.655e-05 [rewriter_before_opt_a]: 0.00018554 [expand_dump_flag]: 4.27e-06 [jit_opt_a]: 0.133891, [3] [Cycle 1]: 0.0729302, [27] [switch_simplify]: 0.00018305 [loop_unroll]: 7.251e-05 [a_1]: 0.00175639 [with_stream_mark]: 4.685e-05 [recompute_prepare]: 3.733e-05 [updatestate_depend_eliminate]: 1.272e-05 [updatestate_assign_eliminate]: 1.018e-05 [updatestate_loads_eliminate]: 1.015e-05 [parameter_eliminate]: 3.31999e-06 [specialize_transform]: 2.43e-05 [updatestate_useless_node_eliminater]: 2.23e-05 [accelerated_algorithm]: 7.27e-05 [meta_shard_fg_expand]: 6.01998e-06 [get_grad_eliminate_]: 2.261e-05 [merge_forward]: 1.292e-05 [cell_reuse_recompute_pass]: 9.70002e-07 [cell_reuse_handle_not_recompute_node_pass]: 4.173e-05 [j_node_and_user_rematch]: 3.746e-05 [meta_fg_expand]: 0.00317194 [replace_old_param]: 0.00010559 [inline_without_move]: 8.479e-05 [renormalize]: 0.0100415 [add_forward_monad_depend]: 2.487e-05 [auto_monad_grad]: 8.50999e-06 [auto_monad_eliminator]: 0.0561926 [cse]: 0.00039462 [replace_applicator]: 0.00015504 [Cycle 2]: 0.00562261, [27] [switch_simplify]: 0.00016042 [loop_unroll]: 7.739e-05 [a_1]: 0.0022573 [with_stream_mark]: 3.035e-05 [recompute_prepare]: 1.769e-05 [updatestate_depend_eliminate]: 9.10999e-06 [updatestate_assign_eliminate]: 7.38e-06 [updatestate_loads_eliminate]: 6.53e-06 [parameter_eliminate]: 2.32001e-06 [specialize_transform]: 1.502e-05 [updatestate_useless_node_eliminater]: 1.466e-05 [accelerated_algorithm]: 2.206e-05 [meta_shard_fg_expand]: 4.62e-06 [get_grad_eliminate_]: 1.447e-05 [merge_forward]: 8.45001e-06 [cell_reuse_recompute_pass]: 1.27e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.877e-05 [j_node_and_user_rematch]: 2.339e-05 [meta_fg_expand]: 0.00024971 [replace_old_param]: 2.616e-05 [inline_without_move]: 1.598e-05 [renormalize]: 0.00208437 [add_forward_monad_depend]: 1.131e-05 [auto_monad_grad]: 3.04999e-06 [auto_monad_eliminator]: 2.988e-05 [cse]: 0.00025484 [replace_applicator]: 3.496e-05 [Cycle 3]: 0.00099311, [27] [switch_simplify]: 1.749e-05 [loop_unroll]: 1.561e-05 [a_1]: 0.00047543 [with_stream_mark]: 2.6e-05 [recompute_prepare]: 1.85e-05 [updatestate_depend_eliminate]: 8.82e-06 [updatestate_assign_eliminate]: 7.13e-06 [updatestate_loads_eliminate]: 6.48e-06 [parameter_eliminate]: 2.39001e-06 [specialize_transform]: 1.581e-05 [updatestate_useless_node_eliminater]: 1.477e-05 [accelerated_algorithm]: 2.218e-05 [meta_shard_fg_expand]: 3.7e-06 [get_grad_eliminate_]: 1.438e-05 [merge_forward]: 9.02e-06 [cell_reuse_recompute_pass]: 2.46e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.941e-05 [j_node_and_user_rematch]: 2.507e-05 [meta_fg_expand]: 4.92999e-06 [replace_old_param]: 2.42e-05 [inline_without_move]: 1.527e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.18998e-06 [auto_monad_grad]: 2.21e-06 [auto_monad_eliminator]: 1.895e-05 [cse]: 5.301e-05 [replace_applicator]: 1.883e-05 [py_interpret_to_execute_after_opt_a]: 2.578e-05 [rewriter_after_opt_a]: 0.00010882 [convert_after_rewriter]: 1.46e-05 [order_py_execute_after_rewriter]: 9.07001e-06 [mutable_eliminate]: 0.00094091 [jit_opt_b]: 0.00013615, [1] [Cycle 1]: 0.00012699, [2] [frontend_op_eliminate]: 4.907e-05 [inline_after_opt_a]: 6.267e-05 [cconv]: 3.471e-05 [loop_unroll]: 0.00053069 [jit_opt_after_cconv]: 0.00033477, [1] [Cycle 1]: 0.00032696, [11] [c_1]: 7.098e-05 [parameter_eliminate]: 4.84003e-06 [updatestate_depend_eliminate]: 1.419e-05 [updatestate_assign_eliminate]: 6.81001e-06 [updatestate_loads_eliminate]: 6.06e-06 [cse]: 7.255e-05 [call_graph_tuple_transform]: 4.692e-05 [tuple_list_get_item_eliminator]: 2.628e-05 [none_parameter_eliminate]: 1.79998e-06 [renormalize]: 1.12999e-06 [switch_simplify]: 1.617e-05 [remove_dup_value]: 8.525e-05 [partial_unused_args_eliminate]: 2.37999e-06 [environ_conv]: 1.754e-05 [add_recomputation]: 8.814e-05 [cse_after_recomputation]: 5.874e-05, [1] [Cycle 1]: 4.897e-05, [1] [cse]: 3.88e-05 [auto_monad_reorder]: 3.404e-05 [get_jit_bprop_graph]: 2.04e-06 [rewriter_after_jit_bprop_graph]: 9.67999e-06 [opt_after_jit_grad]: 0.00061081 [symbol_engine_optimizer]: 0.00015412, [1] [Cycle 1]: 0.00014631, [6] [build]: 1.573e-05 [elim_shapecalc]: 1.953e-05 [elim_not_effective]: 3.314e-05 [opt_reshape]: 1.828e-05 [fold_const_symbol]: 2.494e-05 [renormalize]: 1.27e-06 [validate]: 9.313e-05 [backend_pass]: 1.19e-06 [task_emit]: 0.120035 [execute]: 9.66e-06 Sums bootstrap : 0.000750s : 0.11% type_inference : 0.452741s : 68.94% event_method : 0.000778s : 0.12% auto_monad : 0.000161s : 0.02% graph_reusing : 0.000009s : 0.00% pre_auto_parallel : 0.000003s : 0.00% py_interpret_to_execute : 0.000057s : 0.01% rewriter_before_opt_a : 0.000186s : 0.03% expand_dump_flag : 0.000004s : 0.00% jit_opt_a.switch_simplify : 0.000361s : 0.05% jit_opt_a.loop_unroll : 0.000166s : 0.03% jit_opt_a.a_1 : 0.004489s : 0.68% jit_opt_a.with_stream_mark : 0.000103s : 0.02% jit_opt_a.recompute_prepare : 0.000074s : 0.01% jit_opt_a.updatestate_depend_eliminate : 0.000031s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000025s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000023s : 0.00% jit_opt_a.parameter_eliminate : 0.000008s : 0.00% jit_opt_a.specialize_transform : 0.000055s : 0.01% jit_opt_a.updatestate_useless_node_eliminater : 0.000052s : 0.01% jit_opt_a.accelerated_algorithm : 0.000117s : 0.02% jit_opt_a.meta_shard_fg_expand : 0.000014s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000051s : 0.01% jit_opt_a.merge_forward : 0.000030s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000100s : 0.02% jit_opt_a.j_node_and_user_rematch : 0.000086s : 0.01% jit_opt_a.meta_fg_expand : 0.003427s : 0.52% jit_opt_a.replace_old_param : 0.000156s : 0.02% jit_opt_a.inline_without_move : 0.000116s : 0.02% jit_opt_a.renormalize : 0.012126s : 1.85% jit_opt_a.add_forward_monad_depend : 0.000039s : 0.01% jit_opt_a.auto_monad_grad : 0.000014s : 0.00% jit_opt_a.auto_monad_eliminator : 0.056241s : 8.56% jit_opt_a.cse : 0.000702s : 0.11% jit_opt_a.replace_applicator : 0.000209s : 0.03% py_interpret_to_execute_after_opt_a : 0.000026s : 0.00% rewriter_after_opt_a : 0.000109s : 0.02% convert_after_rewriter : 0.000015s : 0.00% order_py_execute_after_rewriter : 0.000009s : 0.00% mutable_eliminate : 0.000941s : 0.14% jit_opt_b.frontend_op_eliminate : 0.000049s : 0.01% jit_opt_b.inline_after_opt_a : 0.000063s : 0.01% cconv : 0.000035s : 0.01% loop_unroll : 0.000531s : 0.08% jit_opt_after_cconv.c_1 : 0.000071s : 0.01% jit_opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000006s : 0.00% jit_opt_after_cconv.cse : 0.000073s : 0.01% jit_opt_after_cconv.call_graph_tuple_transform : 0.000047s : 0.01% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000026s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000016s : 0.00% remove_dup_value : 0.000085s : 0.01% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000018s : 0.00% add_recomputation : 0.000088s : 0.01% cse_after_recomputation.cse : 0.000039s : 0.01% auto_monad_reorder : 0.000034s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000010s : 0.00% opt_after_jit_grad : 0.000611s : 0.09% symbol_engine_optimizer.build : 0.000016s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000020s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000033s : 0.01% symbol_engine_optimizer.opt_reshape : 0.000018s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000025s : 0.00% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000093s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 0.120035s : 18.28% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.001289 248 0.37% : 0.000005s : 6: substitution.elim_not_effective 0.29% : 0.000004s : 6: substitution.fold_const_symbol 0.95% : 0.000012s : 13: substitution.graph_param_transform 56.89% : 0.000733s : 16: substitution.inline 2.27% : 0.000029s : 2: substitution.inline_without_move 1.22% : 0.000016s : 23: substitution.j_node_and_user_rematch 3.85% : 0.000050s : 3: substitution.less_batch_normalization 2.15% : 0.000028s : 22: substitution.minmaximum_grad 1.89% : 0.000024s : 5: substitution.partial_eliminate 1.35% : 0.000017s : 23: substitution.remove_not_recompute_node 3.67% : 0.000047s : 12: substitution.replace_applicator 1.45% : 0.000019s : 21: substitution.replace_old_param 0.44% : 0.000006s : 1: substitution.set_cell_output_no_recompute 2.34% : 0.000030s : 2: substitution.switch_simplify 7.09% : 0.000091s : 22: substitution.tuple_list_convert_item_index_to_positive 3.52% : 0.000045s : 22: substitution.tuple_list_get_item_depend_reorder 10.25% : 0.000132s : 49: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.452606 2 99.05% : 0.448292s : 1: type_inference.infer 0.95% : 0.004314s : 1: type_inference.specialize ------[replace.] 0.000392 39 47.62% : 0.000187s : 16: replace.inline 10.14% : 0.000040s : 2: replace.switch_simplify 42.24% : 0.000166s : 21: replace.tuple_list_get_item_eliminator ------[match.] 0.000797 39 90.45% : 0.000721s : 16: match.inline 3.16% : 0.000025s : 2: match.switch_simplify 6.38% : 0.000051s : 21: match.tuple_list_get_item_eliminator ------[predicate.] 0.000763 5444 1.42% : 0.000011s : 87: predicate.accumulaten_eliminater 0.47% : 0.000004s : 13: predicate.ad_related_special_op_eliminate 1.31% : 0.000010s : 87: predicate.addn_check_dump 1.46% : 0.000011s : 87: predicate.addn_zero_filter 2.12% : 0.000016s : 87: predicate.arithmetic_simplify 1.40% : 0.000011s : 87: predicate.cast_eliminate 0.23% : 0.000002s : 13: predicate.check_bprop_eliminate 1.31% : 0.000010s : 87: predicate.compare_switch_simplify 1.38% : 0.000011s : 87: predicate.depend_value_elim 1.35% : 0.000010s : 87: predicate.dict_get_item_const_eliminator 1.43% : 0.000011s : 87: predicate.dict_get_item_eliminator 1.43% : 0.000011s : 87: predicate.dict_set_item_eliminator 0.44% : 0.000003s : 13: predicate.dumpgradient_eliminate 0.17% : 0.000001s : 13: predicate.elim_not_effective 0.29% : 0.000002s : 13: predicate.elim_shapecalc_of_broadcastargs 1.39% : 0.000011s : 87: predicate.environ_add_const_eliminate 1.34% : 0.000010s : 87: predicate.environ_get_add_eliminate 1.42% : 0.000011s : 87: predicate.environ_get_depend_swap 1.36% : 0.000010s : 87: predicate.environ_get_eliminate 1.34% : 0.000010s : 87: predicate.environ_get_set_eliminate 0.14% : 0.000001s : 13: predicate.fold_const_symbol 0.87% : 0.000007s : 47: predicate.get_grad_eliminate 0.15% : 0.000001s : 13: predicate.graph_param_transform 4.58% : 0.000035s : 150: predicate.inline 1.70% : 0.000013s : 76: predicate.inline_without_move 0.50% : 0.000004s : 47: predicate.j_node_and_user_rematch 1.11% : 0.000008s : 47: predicate.less_batch_normalization 1.87% : 0.000014s : 108: predicate.list_to_tuple_eliminator_ 2.00% : 0.000015s : 121: predicate.load_eliminater 0.63% : 0.000005s : 13: predicate.loop_unroll_after_grad 3.07% : 0.000023s : 168: predicate.loop_unroll_before_grad 1.73% : 0.000013s : 100: predicate.make_slice_get_slice_eliminator 1.32% : 0.000010s : 87: predicate.merge_addn 1.43% : 0.000011s : 87: predicate.minmaximum_grad 0.59% : 0.000005s : 13: predicate.mutable_eliminate 0.31% : 0.000002s : 13: predicate.opt_reshape 2.64% : 0.000020s : 121: predicate.partial_eliminate 1.48% : 0.000011s : 87: predicate.print_const_string_wrapper 1.87% : 0.000014s : 87: predicate.reduce_eliminate 1.79% : 0.000014s : 108: predicate.redundant_stop_gradient_eliminater 0.58% : 0.000004s : 47: predicate.remove_not_recompute_node 3.12% : 0.000024s : 215: predicate.replace_applicator 0.96% : 0.000007s : 76: predicate.replace_old_param 0.18% : 0.000001s : 13: predicate.reset_defer_inline 1.47% : 0.000011s : 87: predicate.reshape_eliminate 1.41% : 0.000011s : 87: predicate.row_tensor_add_zeros_like 0.30% : 0.000002s : 13: predicate.row_tensor_eliminate 1.41% : 0.000011s : 87: predicate.same_eliminate 0.61% : 0.000005s : 47: predicate.set_cell_output_no_recompute 0.46% : 0.000004s : 26: predicate.special_op_eliminate 0.86% : 0.000007s : 47: predicate.specialize_transform 1.66% : 0.000013s : 87: predicate.split_environ_get_set_with_tuple_value 1.43% : 0.000011s : 87: predicate.stack_unstack_eliminate 0.27% : 0.000002s : 13: predicate.switch_call_monad_eliminater 2.67% : 0.000020s : 124: predicate.switch_defer_inline 2.56% : 0.000020s : 124: predicate.switch_layer_defer_inline 6.11% : 0.000047s : 309: predicate.switch_simplify 1.34% : 0.000010s : 87: predicate.tile_eliminate 1.37% : 0.000010s : 87: predicate.transpose_eliminate 1.79% : 0.000014s : 87: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000013s : 87: predicate.tuple_list_get_item_depend_reorder 3.77% : 0.000029s : 134: predicate.tuple_list_get_item_eliminator 1.85% : 0.000014s : 87: predicate.tuple_list_set_item_eliminator 1.86% : 0.000014s : 108: predicate.tuple_to_list_eliminator_ 2.01% : 0.000015s : 121: predicate.updatestate_pure_node_eliminater 3.05% : 0.000023s : 168: predicate.updatestate_useless_node_eliminater 1.81% : 0.000014s : 87: predicate.value_based_eliminate 0.20% : 0.000002s : 13: predicate.virtual_view_grad_eliminate 0.36% : 0.000003s : 13: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005463 51 75.66% : 0.004133s : 31: func_graph_cloner_run.FuncGraphClonerGraph 24.34% : 0.001330s : 20: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.730692 91 0.01% : 0.000093s : 1: add_recomputation 0.02% : 0.000169s : 1: auto_monad 0.01% : 0.000037s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.11% : 0.000778s : 1: bootstrap 0.01% : 0.000038s : 1: cconv 0.00% : 0.000017s : 1: convert_after_rewriter 0.01% : 0.000061s : 1: cse_after_recomputation 0.00% : 0.000020s : 1: environ_conv 0.11% : 0.000788s : 1: event_method 0.00% : 0.000015s : 1: execute 0.00% : 0.000006s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 18.32% : 0.133896s : 1: jit_opt_a 0.05% : 0.000338s : 1: jit_opt_after_cconv 0.02% : 0.000140s : 1: jit_opt_b 0.07% : 0.000542s : 1: loop_unroll 0.13% : 0.000955s : 1: mutable_eliminate 0.82% : 0.005959s : 39: opt.transform.jit_opt_a 0.02% : 0.000156s : 4: opt.transform.jit_opt_after_cconv 0.01% : 0.000103s : 4: opt.transform.jit_opt_b 0.00% : 0.000029s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000031s : 1: opt.transform.mutable_eliminate 0.01% : 0.000059s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000091s : 4: opt.transform.symbol_engine_opt 0.09% : 0.000622s : 1: opt_after_jit_grad 0.00% : 0.000011s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pre_auto_parallel 0.01% : 0.000059s : 1: py_interpret_to_execute 0.00% : 0.000029s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000091s : 1: remove_dup_value 1.13% : 0.008283s : 2: renormalize.infer 0.52% : 0.003814s : 2: renormalize.specialize 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000114s : 1: rewriter_after_opt_a 0.03% : 0.000189s : 1: rewriter_before_opt_a 0.02% : 0.000158s : 1: symbol_engine_optimizer 16.43% : 0.120052s : 1: task_emit 61.96% : 0.452764s : 1: type_inference 0.02% : 0.000141s : 1: validate TotalTime = 20.1139, [24] [bootstrap]: 0.00089202 [type_inference]: 0.048951 [event_method]: 2.489e-05 [auto_monad]: 0.00014081 [graph_reusing]: 6.14999e-06 [inline]: 3.28998e-06 [add_attr]: 0.0105319, [1] [add_attr_with_inline]: 0.0105141, [1] [Cycle 1]: 0.00014463, [2] [tag_attr]: 3.886e-05 [meta_addattr_fg_expand]: 1.404e-05 [parallel-infer-symbol]: 3.74002e-06 [pre_auto_parallel]: 6.081e-05 [insert-virtual-dataset]: 3.00998e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.89e-06 [pipeline_split]: 1.56998e-06 [optimize]: 0.00956766, [53] [py_interpret_to_execute]: 7.52002e-06 [rewriter_before_opt_a]: 0.0002513 [opt_a]: 0.00558513, [2] [Cycle 1]: 0.00386325, [45] [expand_dump_flag]: 3.83001e-06 [switch_simplify]: 6.734e-05 [loop_unroll]: 3.331e-05 [a_1]: 0.00069127 [with_stream_mark]: 2.234e-05 [recompute_prepare]: 1.022e-05 [updatestate_depend_eliminate]: 1.272e-05 [updatestate_assign_eliminate]: 1.109e-05 [updatestate_loads_eliminate]: 2.94001e-06 [parameter_eliminate]: 2.37999e-06 [a_2]: 8.229e-05 [accelerated_algorithm]: 8.11002e-06 [shard]: 2.04e-06 [meta_shard_fg_expand]: 2.92002e-06 [shard_inline]: 6.96999e-06 [merge_send_recv]: 4.36e-05 [auto_parallel]: 9.19e-06 [parallel]: 8.53e-05 [flash_sp]: 3.458e-05 [merge_comm]: 5.99999e-06 [allreduce_fusion]: 1.198e-05 [matmul_add_comm_reduction]: 1.723e-05 [allreduce_slice_to_reducescatter]: 7.92998e-06 [virtual_shard_identity]: 1.31e-05 [virtual_dataset]: 6.17001e-06 [get_grad_eliminate_]: 6.04001e-06 [virtual_output]: 6.31998e-06 [merge_forward]: 4.3e-06 [cell_reuse_recompute_pass]: 1.42999e-06 [offload_activation]: 1.789e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.379e-05 [merge_recompute_call_nodes]: 1.57999e-06 [before_grad]: 1.098e-05 [set_forward_comm_id_for_comm_node_pass]: 1.294e-05 [meta_fg_expand]: 3.19001e-06 [flash_sp_send_recv_attached]: 3.94002e-06 [receive_attached]: 1.719e-05 [after_resolve]: 1.219e-05 [a_after_grad]: 9.69e-06 [renormalize]: 0.00194141 [add_forward_monad_depend]: 1.005e-05 [auto_monad_grad]: 4.79e-06 [auto_monad_eliminator]: 3.221e-05 [cse]: 4.958e-05 [a_3]: 5.372e-05 [Cycle 2]: 0.00170511, [45] [expand_dump_flag]: 2.51998e-06 [switch_simplify]: 8.82999e-06 [loop_unroll]: 5.87999e-06 [a_1]: 0.00012175 [with_stream_mark]: 1.62e-05 [recompute_prepare]: 6.01e-06 [updatestate_depend_eliminate]: 3.65e-06 [updatestate_assign_eliminate]: 3.10998e-06 [updatestate_loads_eliminate]: 3.51001e-06 [parameter_eliminate]: 1.47001e-06 [a_2]: 6.636e-05 [accelerated_algorithm]: 6.12001e-06 [shard]: 2.05002e-06 [meta_shard_fg_expand]: 2.36998e-06 [shard_inline]: 6.25997e-06 [merge_send_recv]: 7.47002e-06 [auto_parallel]: 8.07e-06 [parallel]: 7.73999e-06 [flash_sp]: 3.6e-06 [merge_comm]: 3.21999e-06 [allreduce_fusion]: 2.89999e-06 [matmul_add_comm_reduction]: 7.06001e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 6.51e-06 [virtual_dataset]: 5.69e-06 [get_grad_eliminate_]: 5.39e-06 [virtual_output]: 5.67999e-06 [merge_forward]: 3.51999e-06 [cell_reuse_recompute_pass]: 2.16e-06 [offload_activation]: 8.33999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.438e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 9.66e-06 [set_forward_comm_id_for_comm_node_pass]: 3.30998e-06 [meta_fg_expand]: 2.24001e-06 [flash_sp_send_recv_attached]: 1.20999e-06 [receive_attached]: 2.06998e-06 [after_resolve]: 1.011e-05 [a_after_grad]: 1.565e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.25999e-06 [auto_monad_grad]: 1.53002e-06 [auto_monad_eliminator]: 7.16001e-06 [cse]: 1.19e-05 [a_3]: 0.00108787 [py_interpret_to_execute_after_opt_a]: 1.013e-05 [slice_cell_reuse_recomputed_activation]: 2.81e-06 [rewriter_after_opt_a]: 4.106e-05 [convert_after_rewriter]: 1.38002e-06 [order_py_execute_after_rewriter]: 1.22999e-06 [mutable_eliminate]: 0.00078565 [opt_b]: 0.00020649, [1] [Cycle 1]: 0.00019786, [7] [b_1]: 0.00011517 [b_2]: 7.2e-06 [updatestate_depend_eliminate]: 8.74e-06 [updatestate_assign_eliminate]: 3.41001e-06 [updatestate_loads_eliminate]: 2.99999e-06 [renormalize]: 8.80013e-07 [cse]: 2.57e-05 [optimize_parallel_all_gather_comm]: 2.946e-05 [overlap_param_gather]: 7.138e-05 [cconv]: 3.423e-05 [loop_unroll]: 0.00047633 [opt_after_cconv]: 0.00120207, [1] [Cycle 1]: 0.00119419, [7] [c_1]: 2.91e-05 [parameter_eliminate]: 5.28002e-06 [updatestate_depend_eliminate]: 0.0004655 [updatestate_assign_eliminate]: 8.32998e-06 [updatestate_loads_eliminate]: 2.98e-06 [cse]: 0.00052947 [renormalize]: 7.2e-07 [remove_dup_value]: 1.639e-05 [tuple_transform]: 9.414e-05, [1] [Cycle 1]: 8.617e-05, [4] [d_1]: 5.516e-05 [none_parameter_eliminate]: 4.08999e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 7.88001e-06 [partial_unused_args_eliminate]: 2.36e-06 [add_recomputation]: 7.086e-05 [cse_after_recomputation]: 2.35e-05, [1] [Cycle 1]: 1.831e-05, [1] [cse]: 1.258e-05 [environ_conv]: 8.046e-05 [swap_dp_allreduce_reducescatter]: 2.478e-05 [bias_add_comm_swap]: 1.15e-05 [label_micro_interleaved_index]: 1.484e-05 [label_fine_grained_interleaved_index]: 2.49001e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 2.06e-06 [micro_interleaved_order_control]: 2.71e-06 [assign_add_opt]: 1.60999e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 9.56e-06 [full_micro_interleaved_order_control]: 1.047e-05 [reorder_send_recv_between_fp_bp]: 2.54001e-06 [comm_op_add_attrs]: 9.79984e-07 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 8.11002e-06 [overlap_opt_shard_in_pipeline]: 2.906e-05 [overlap_opt_shard_grad_in_pipeline]: 1.80001e-06 [control_data_broadcast_order]: 1.518e-05 [grouped_pairwise_exchange_alltoall]: 1.32e-06 [offloading_packed_experts]: 4.01001e-06 [overlap_recompute_and_grad_model_parallel]: 1.307e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.32999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.27e-06 [overlap_recompute_comm]: 2.19999e-06 [overlap_grad_ring_attention]: 1.929e-05 [overlap_grad_flash_sp]: 4.492e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 9.21998e-06 [split_layernorm_comm]: 1.85001e-06 [handle_group_info]: 9.99979e-07 [symbol_engine_optimizer]: 8.139e-05, [1] [Cycle 1]: 7.614e-05, [6] [build]: 3.04999e-06 [elim_shapecalc]: 1.266e-05 [elim_not_effective]: 1.336e-05 [opt_reshape]: 6.93e-06 [fold_const_symbol]: 1.044e-05 [renormalize]: 1.69995e-07 [detach_backward]: 2.41998e-06 [pipeline_parallel_scheduler]: 1.47999e-06 [auto_monad_reorder]: 2.536e-05 [get_jit_bprop_graph]: 2.79999e-06 [rewriter_after_jit_bprop_graph]: 4.97e-06 [opt_after_jit_grad]: 0.00067024 [validate]: 6.829e-05 [backend_pass]: 9.10019e-07 [task_emit]: 20.0424 [execute]: 1.1e-05 Sums bootstrap : 0.000892s : 0.00% type_inference : 0.048951s : 0.24% event_method : 0.000025s : 0.00% auto_monad : 0.000141s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000039s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000061s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.00% optimize.rewriter_before_opt_a : 0.000251s : 0.00% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000076s : 0.00% optimize.opt_a.loop_unroll : 0.000039s : 0.00% optimize.opt_a.a_1 : 0.000813s : 0.00% optimize.opt_a.with_stream_mark : 0.000039s : 0.00% optimize.opt_a.recompute_prepare : 0.000016s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000149s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000051s : 0.00% optimize.opt_a.auto_parallel : 0.000017s : 0.00% optimize.opt_a.parallel : 0.000093s : 0.00% optimize.opt_a.flash_sp : 0.000038s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000015s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000008s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.00% optimize.opt_a.virtual_dataset : 0.000012s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000019s : 0.00% optimize.opt_a.after_resolve : 0.000022s : 0.00% optimize.opt_a.a_after_grad : 0.000025s : 0.00% optimize.opt_a.renormalize : 0.001941s : 0.01% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.00% optimize.opt_a.auto_monad_grad : 0.000006s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.00% optimize.opt_a.cse : 0.000061s : 0.00% optimize.opt_a.a_3 : 0.001142s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000041s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000786s : 0.00% optimize.opt_b.b_1 : 0.000115s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.00% optimize.overlap_param_gather : 0.000071s : 0.00% optimize.cconv : 0.000034s : 0.00% optimize.loop_unroll : 0.000476s : 0.00% optimize.opt_after_cconv.c_1 : 0.000029s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000466s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000529s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000016s : 0.00% optimize.tuple_transform.d_1 : 0.000055s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000004s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000071s : 0.00% optimize.cse_after_recomputation.cse : 0.000013s : 0.00% optimize.environ_conv : 0.000080s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000025s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000015s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000029s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.00% optimize.overlap_grad_flash_sp : 0.000045s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000009s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000025s : 0.00% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000670s : 0.00% validate : 0.000068s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 20.042394s : 99.70% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000232 29 0.74% : 0.000002s : 2: substitution.elim_not_effective 0.78% : 0.000002s : 2: substitution.fold_const_symbol 2.83% : 0.000007s : 4: substitution.graph_param_transform 79.61% : 0.000185s : 5: substitution.inline 1.97% : 0.000005s : 4: substitution.j_node_and_user_rematch 5.66% : 0.000013s : 4: substitution.remove_not_recompute_node 1.85% : 0.000004s : 4: substitution.replace_old_param 6.57% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.048857 2 97.65% : 0.047710s : 1: type_inference.infer 2.35% : 0.001147s : 1: type_inference.specialize ------[replace.] 0.000071 9 67.08% : 0.000048s : 5: replace.inline 32.92% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000194 9 93.16% : 0.000181s : 5: match.inline 6.84% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000217 1345 0.81% : 0.000002s : 14: predicate.accumulaten_eliminater 0.92% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 0.87% : 0.000002s : 14: predicate.addn_zero_filter 0.83% : 0.000002s : 14: predicate.adjust_all_reduce_mul_add 1.91% : 0.000004s : 22: predicate.arithmetic_simplify 0.90% : 0.000002s : 14: predicate.cast_eliminate 0.80% : 0.000002s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.89% : 0.000002s : 14: predicate.dict_get_item_const_eliminator 0.93% : 0.000002s : 14: predicate.dict_get_item_eliminator 0.98% : 0.000002s : 14: predicate.dict_set_item_eliminator 1.14% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 4: predicate.elim_not_effective 0.55% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000002s : 18: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 18: predicate.environ_get_add_eliminate 0.99% : 0.000002s : 18: predicate.environ_get_depend_swap 1.57% : 0.000003s : 26: predicate.environ_get_eliminate 1.15% : 0.000003s : 18: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.25% : 0.000005s : 23: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.71% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.36% : 0.000001s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 5.79% : 0.000013s : 61: predicate.inline 0.76% : 0.000002s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.04% : 0.000002s : 8: predicate.less_batch_normalization 1.80% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.45% : 0.000005s : 40: predicate.load_eliminater 0.93% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.68% : 0.000006s : 41: predicate.loop_unroll_before_grad 1.66% : 0.000004s : 22: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.48% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.13% : 0.000002s : 14: predicate.minmaximum_grad 1.37% : 0.000003s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.54% : 0.000001s : 4: predicate.parallel_virtual_node 1.88% : 0.000004s : 23: predicate.partial_defer_inline 1.37% : 0.000003s : 22: predicate.partial_eliminate 1.03% : 0.000002s : 14: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.26% : 0.000003s : 14: predicate.reduce_eliminate 2.51% : 0.000005s : 40: predicate.redundant_stop_gradient_eliminater 0.60% : 0.000001s : 8: predicate.remove_not_recompute_node 1.48% : 0.000003s : 26: predicate.replace_applicator 0.43% : 0.000001s : 8: predicate.replace_old_param 0.39% : 0.000001s : 4: predicate.reset_defer_inline 0.94% : 0.000002s : 14: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.92% : 0.000002s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.98% : 0.000002s : 8: predicate.shard_identity_eliminate 0.65% : 0.000001s : 8: predicate.special_op_eliminate 0.57% : 0.000001s : 8: predicate.specialize_transform 1.05% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.85% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.61% : 0.000004s : 23: predicate.switch_defer_inline 2.38% : 0.000005s : 31: predicate.switch_layer_defer_inline 5.43% : 0.000012s : 76: predicate.switch_simplify 0.87% : 0.000002s : 14: predicate.tile_eliminate 0.83% : 0.000002s : 14: predicate.transpose_eliminate 1.73% : 0.000004s : 22: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000003s : 22: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000003s : 22: predicate.tuple_list_get_item_depend_reorder 3.41% : 0.000007s : 34: predicate.tuple_list_get_item_eliminator 1.35% : 0.000003s : 22: predicate.tuple_list_get_set_item_eliminator 2.12% : 0.000005s : 30: predicate.tuple_list_set_item_eliminator 1.75% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.23% : 0.000005s : 40: predicate.updatestate_pure_node_eliminater 2.87% : 0.000006s : 48: predicate.updatestate_useless_node_eliminater 0.27% : 0.000001s : 4: predicate.value_based_eliminate 0.70% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.53% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000959 13 53.98% : 0.000517s : 6: func_graph_cloner_run.FuncGraphClonerGraph 46.02% : 0.000441s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 20.137669 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.05% : 0.010538s : 1: add_attr 0.05% : 0.010519s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000075s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000148s : 1: auto_monad 0.00% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.00% : 0.000940s : 1: bootstrap 0.00% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000089s : 1: environ_conv 0.00% : 0.000031s : 1: event_method 0.00% : 0.000019s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000018s : 1: label_micro_interleaved_index 0.00% : 0.000485s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.00% : 0.000795s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000017s : 1: opt.transform.mutable_eliminate 0.01% : 0.001764s : 78: opt.transform.opt_a 0.00% : 0.000028s : 1: opt.transform.opt_after_cconv 0.00% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000095s : 28: opt.transform.opt_b 0.00% : 0.000059s : 2: opt.transform.opt_trans_graph 0.00% : 0.000039s : 4: opt.transform.symbol_engine_opt 0.03% : 0.005589s : 1: opt_a 0.01% : 0.001207s : 1: opt_after_cconv 0.00% : 0.000682s : 1: opt_after_jit_grad 0.00% : 0.000210s : 1: opt_b 0.05% : 0.009573s : 1: optimize 0.00% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000048s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000022s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000033s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000076s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000065s : 1: pre_auto_parallel 0.00% : 0.000012s : 1: py_interpret_to_execute 0.00% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000020s : 1: remove_dup_value 0.01% : 0.001433s : 1: renormalize.infer 0.00% : 0.000496s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000046s : 1: rewriter_after_opt_a 0.00% : 0.000259s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000012s : 1: split_matmul_comm_elemetwise 0.00% : 0.000028s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000084s : 1: symbol_engine_optimizer 99.53% : 20.042492s : 1: task_emit 0.00% : 0.000097s : 1: tuple_transform 0.24% : 0.048974s : 1: type_inference 0.00% : 0.000102s : 1: validate [WARNING] CORE(71532,ffffbf434f30,python3.9):2026-01-29-17:44:53.844.595 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph0 .. TotalTime = 0.088477, [24] [bootstrap]: 0.00080792 [type_inference]: 0.0558157 [event_method]: 0.00014945 [auto_monad]: 0.00016263 [graph_reusing]: 9.94999e-06 [inline]: 2.56e-06 [add_attr]: 0.00437181, [1] [add_attr_with_inline]: 0.00435987, [1] [Cycle 1]: 8.27e-05, [2] [tag_attr]: 4.082e-05 [meta_addattr_fg_expand]: 1.152e-05 [parallel-infer-symbol]: 4.03999e-06 [pre_auto_parallel]: 5.929e-05 [insert-virtual-dataset]: 2.84999e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 1.86e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.017412, [53] [py_interpret_to_execute]: 4.43001e-06 [rewriter_before_opt_a]: 0.00035661 [opt_a]: 0.0145488, [3] [Cycle 1]: 0.00992383, [45] [expand_dump_flag]: 4.63001e-06 [switch_simplify]: 0.00016077 [loop_unroll]: 7.044e-05 [a_1]: 0.00170891 [with_stream_mark]: 2.633e-05 [recompute_prepare]: 2.525e-05 [updatestate_depend_eliminate]: 9.15999e-06 [updatestate_assign_eliminate]: 7.64002e-06 [updatestate_loads_eliminate]: 7.08e-06 [parameter_eliminate]: 2.41e-06 [a_2]: 0.00025667 [accelerated_algorithm]: 6.237e-05 [shard]: 1.67001e-06 [meta_shard_fg_expand]: 4.18999e-06 [shard_inline]: 1.659e-05 [merge_send_recv]: 1.84e-05 [auto_parallel]: 1.31e-05 [parallel]: 3.996e-05 [flash_sp]: 1.103e-05 [merge_comm]: 1.023e-05 [allreduce_fusion]: 9.43002e-06 [matmul_add_comm_reduction]: 2.643e-05 [allreduce_slice_to_reducescatter]: 7.29982e-07 [virtual_shard_identity]: 1.844e-05 [virtual_dataset]: 1.579e-05 [get_grad_eliminate_]: 1.594e-05 [virtual_output]: 1.55e-05 [merge_forward]: 8.33999e-06 [cell_reuse_recompute_pass]: 1.77999e-06 [offload_activation]: 2.096e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.008e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 2.982e-05 [set_forward_comm_id_for_comm_node_pass]: 9.74999e-06 [meta_fg_expand]: 0.00176099 [flash_sp_send_recv_attached]: 7.11001e-06 [receive_attached]: 2.66999e-06 [after_resolve]: 6.722e-05 [a_after_grad]: 8.768e-05 [renormalize]: 0.00430413 [add_forward_monad_depend]: 1.116e-05 [auto_monad_grad]: 7.36001e-06 [auto_monad_eliminator]: 5.93e-05 [cse]: 0.00027516 [a_3]: 0.00036352 [Cycle 2]: 0.00362758, [45] [expand_dump_flag]: 3.62002e-06 [switch_simplify]: 4.981e-05 [loop_unroll]: 4.6e-05 [a_1]: 0.00167377 [with_stream_mark]: 1.812e-05 [recompute_prepare]: 1.353e-05 [updatestate_depend_eliminate]: 6.00002e-06 [updatestate_assign_eliminate]: 4.68999e-06 [updatestate_loads_eliminate]: 4.58999e-06 [parameter_eliminate]: 2.14999e-06 [a_2]: 0.00013089 [accelerated_algorithm]: 1.79e-05 [shard]: 2.12999e-06 [meta_shard_fg_expand]: 3.16999e-06 [shard_inline]: 1.01e-05 [merge_send_recv]: 1.046e-05 [auto_parallel]: 1.176e-05 [parallel]: 8.28001e-06 [flash_sp]: 3.95e-06 [merge_comm]: 5.02e-06 [allreduce_fusion]: 4.67998e-06 [matmul_add_comm_reduction]: 1.141e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.185e-05 [virtual_dataset]: 9.66e-06 [get_grad_eliminate_]: 9.61e-06 [virtual_output]: 1.006e-05 [merge_forward]: 5.77001e-06 [cell_reuse_recompute_pass]: 1.58002e-06 [offload_activation]: 1.365e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.889e-05 [merge_recompute_call_nodes]: 2.04999e-06 [before_grad]: 1.624e-05 [set_forward_comm_id_for_comm_node_pass]: 7.06001e-06 [meta_fg_expand]: 7.747e-05 [flash_sp_send_recv_attached]: 1.98002e-06 [receive_attached]: 2.63e-06 [after_resolve]: 1.73e-05 [a_after_grad]: 1.626e-05 [renormalize]: 0.00091968 [add_forward_monad_depend]: 5.92001e-06 [auto_monad_grad]: 2.74999e-06 [auto_monad_eliminator]: 1.66e-05 [cse]: 8.562e-05 [a_3]: 7.367e-05 [Cycle 3]: 0.00097883, [45] [expand_dump_flag]: 2.04999e-06 [switch_simplify]: 1.168e-05 [loop_unroll]: 9.82999e-06 [a_1]: 0.00023631 [with_stream_mark]: 1.152e-05 [recompute_prepare]: 9.73002e-06 [updatestate_depend_eliminate]: 4.60001e-06 [updatestate_assign_eliminate]: 3.83999e-06 [updatestate_loads_eliminate]: 4.49002e-06 [parameter_eliminate]: 1.67999e-06 [a_2]: 0.00012417 [accelerated_algorithm]: 1.689e-05 [shard]: 1.07e-06 [meta_shard_fg_expand]: 2.44001e-06 [shard_inline]: 9.64999e-06 [merge_send_recv]: 7.53999e-06 [auto_parallel]: 7.36999e-06 [parallel]: 5.24e-06 [flash_sp]: 8.50006e-07 [merge_comm]: 4.37998e-06 [allreduce_fusion]: 4.42e-06 [matmul_add_comm_reduction]: 8.53001e-06 [allreduce_slice_to_reducescatter]: 4.59986e-07 [virtual_shard_identity]: 1.077e-05 [virtual_dataset]: 1.058e-05 [get_grad_eliminate_]: 9.97999e-06 [virtual_output]: 9.22999e-06 [merge_forward]: 4.49998e-06 [cell_reuse_recompute_pass]: 1.83002e-06 [offload_activation]: 9.97001e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.541e-05 [merge_recompute_call_nodes]: 1.12999e-06 [before_grad]: 1.539e-05 [set_forward_comm_id_for_comm_node_pass]: 4.89e-06 [meta_fg_expand]: 3.18e-06 [flash_sp_send_recv_attached]: 8.30012e-07 [receive_attached]: 1.10999e-06 [after_resolve]: 1.405e-05 [a_after_grad]: 1.643e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.95001e-06 [auto_monad_grad]: 1.29e-06 [auto_monad_eliminator]: 1.072e-05 [cse]: 3.132e-05 [a_3]: 6.464e-05 [py_interpret_to_execute_after_opt_a]: 5.27001e-06 [slice_cell_reuse_recomputed_activation]: 2.04e-06 [rewriter_after_opt_a]: 2.881e-05 [convert_after_rewriter]: 1.40999e-06 [order_py_execute_after_rewriter]: 1.24e-06 [mutable_eliminate]: 0.00075357 [opt_b]: 0.00031921, [1] [Cycle 1]: 0.0003113, [7] [b_1]: 0.00020822 [b_2]: 1.212e-05 [updatestate_depend_eliminate]: 8.65001e-06 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 3.88001e-06 [renormalize]: 8.80013e-07 [cse]: 3.979e-05 [optimize_parallel_all_gather_comm]: 2.9e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 2.79e-05 [loop_unroll]: 0.00047857 [opt_after_cconv]: 0.00014457, [1] [Cycle 1]: 0.00013829, [7] [c_1]: 5.081e-05 [parameter_eliminate]: 3.33e-06 [updatestate_depend_eliminate]: 7.75998e-06 [updatestate_assign_eliminate]: 4.38999e-06 [updatestate_loads_eliminate]: 3.61001e-06 [cse]: 3.493e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 2.557e-05 [tuple_transform]: 0.00010559, [1] [Cycle 1]: 0.0001006, [4] [d_1]: 6.963e-05 [none_parameter_eliminate]: 1.50001e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 1.048e-05 [partial_unused_args_eliminate]: 2.19999e-06 [add_recomputation]: 6.724e-05 [cse_after_recomputation]: 3.448e-05, [1] [Cycle 1]: 2.95e-05, [1] [cse]: 2.364e-05 [environ_conv]: 1.075e-05 [swap_dp_allreduce_reducescatter]: 8.85999e-06 [bias_add_comm_swap]: 2.93e-06 [label_micro_interleaved_index]: 5.22e-06 [label_fine_grained_interleaved_index]: 2.74001e-06 [merge_cast_opt]: 1.35999e-06 [slice_recompute_activation]: 2.04999e-06 [micro_interleaved_order_control]: 2.16e-06 [assign_add_opt]: 1.21002e-06 [ForceFp32Comm]: 1.02e-06 [remove_cast_before_assign_add]: 1.15001e-06 [full_micro_interleaved_order_control]: 2.49001e-06 [reorder_send_recv_between_fp_bp]: 2.91e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.08001e-06 [interleave_split_concat_branches]: 1.39e-06 [interleave_parallel_branches]: 1.12999e-06 [overlap_opt_shard_in_pipeline]: 4.4e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94999e-06 [control_data_broadcast_order]: 1.735e-05 [grouped_pairwise_exchange_alltoall]: 1.54e-06 [offloading_packed_experts]: 5.46e-06 [overlap_recompute_and_grad_model_parallel]: 5.82999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.50999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42e-06 [overlap_recompute_comm]: 2.06e-06 [overlap_grad_ring_attention]: 5.41002e-06 [overlap_grad_flash_sp]: 2.443e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.36e-06 [split_layernorm_comm]: 1.89e-06 [handle_group_info]: 1.03001e-06 [symbol_engine_optimizer]: 0.00010701, [1] [Cycle 1]: 0.00010248, [6] [build]: 8.70999e-06 [elim_shapecalc]: 1.765e-05 [elim_not_effective]: 1.952e-05 [opt_reshape]: 1.214e-05 [fold_const_symbol]: 1.518e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.34001e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 2.788e-05 [get_jit_bprop_graph]: 3.03998e-06 [rewriter_after_jit_bprop_graph]: 3.53e-06 [opt_after_jit_grad]: 0.00053729 [validate]: 5.853e-05 [backend_pass]: 9.20001e-07 [task_emit]: 0.00878638 [execute]: 8.71002e-06 Sums bootstrap : 0.000808s : 0.98% type_inference : 0.055816s : 67.50% event_method : 0.000149s : 0.18% auto_monad : 0.000163s : 0.20% graph_reusing : 0.000010s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000041s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000059s : 0.07% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000357s : 0.43% optimize.opt_a.expand_dump_flag : 0.000010s : 0.01% optimize.opt_a.switch_simplify : 0.000222s : 0.27% optimize.opt_a.loop_unroll : 0.000126s : 0.15% optimize.opt_a.a_1 : 0.003619s : 4.38% optimize.opt_a.with_stream_mark : 0.000056s : 0.07% optimize.opt_a.recompute_prepare : 0.000049s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000020s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000016s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000016s : 0.02% optimize.opt_a.parameter_eliminate : 0.000006s : 0.01% optimize.opt_a.a_2 : 0.000512s : 0.62% optimize.opt_a.accelerated_algorithm : 0.000097s : 0.12% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000010s : 0.01% optimize.opt_a.shard_inline : 0.000036s : 0.04% optimize.opt_a.merge_send_recv : 0.000036s : 0.04% optimize.opt_a.auto_parallel : 0.000032s : 0.04% optimize.opt_a.parallel : 0.000053s : 0.06% optimize.opt_a.flash_sp : 0.000016s : 0.02% optimize.opt_a.merge_comm : 0.000020s : 0.02% optimize.opt_a.allreduce_fusion : 0.000019s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000046s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000041s : 0.05% optimize.opt_a.virtual_dataset : 0.000036s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000036s : 0.04% optimize.opt_a.virtual_output : 0.000035s : 0.04% optimize.opt_a.merge_forward : 0.000019s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000045s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000074s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.01% optimize.opt_a.before_grad : 0.000061s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000022s : 0.03% optimize.opt_a.meta_fg_expand : 0.001842s : 2.23% optimize.opt_a.flash_sp_send_recv_attached : 0.000010s : 0.01% optimize.opt_a.receive_attached : 0.000006s : 0.01% optimize.opt_a.after_resolve : 0.000099s : 0.12% optimize.opt_a.a_after_grad : 0.000120s : 0.15% optimize.opt_a.renormalize : 0.005224s : 6.32% optimize.opt_a.add_forward_monad_depend : 0.000019s : 0.02% optimize.opt_a.auto_monad_grad : 0.000011s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000087s : 0.10% optimize.opt_a.cse : 0.000392s : 0.47% optimize.opt_a.a_3 : 0.000502s : 0.61% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000029s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000754s : 0.91% optimize.opt_b.b_1 : 0.000208s : 0.25% optimize.opt_b.b_2 : 0.000012s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000040s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000028s : 0.03% optimize.loop_unroll : 0.000479s : 0.58% optimize.opt_after_cconv.c_1 : 0.000051s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000035s : 0.04% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000026s : 0.03% optimize.tuple_transform.d_1 : 0.000070s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000067s : 0.08% optimize.cse_after_recomputation.cse : 0.000024s : 0.03% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000028s : 0.03% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000537s : 0.65% validate : 0.000059s : 0.07% backend_pass : 0.000001s : 0.00% task_emit : 0.008786s : 10.63% execute : 0.000009s : 0.01% Time group info: ------[substitution.] 0.001020 214 0.25% : 0.000003s : 4: substitution.elim_not_effective 0.85% : 0.000009s : 12: substitution.float_depend_g_call 0.41% : 0.000004s : 3: substitution.float_tuple_getitem_switch 0.21% : 0.000002s : 4: substitution.fold_const_symbol 0.76% : 0.000008s : 8: substitution.graph_param_transform 0.37% : 0.000004s : 2: substitution.incorporate_call 0.25% : 0.000003s : 2: substitution.incorporate_call_switch 55.00% : 0.000561s : 20: substitution.inline 1.84% : 0.000019s : 2: substitution.inline_without_move 1.25% : 0.000013s : 18: substitution.j_node_and_user_rematch 4.56% : 0.000047s : 3: substitution.less_batch_normalization 1.52% : 0.000016s : 11: substitution.minmaximum_grad 2.13% : 0.000022s : 12: substitution.partial_eliminate 2.16% : 0.000022s : 18: substitution.remove_not_recompute_node 2.45% : 0.000025s : 9: substitution.replace_applicator 0.96% : 0.000010s : 12: substitution.replace_old_param 0.25% : 0.000003s : 1: substitution.set_cell_output_no_recompute 2.82% : 0.000029s : 3: substitution.switch_simplify 10.23% : 0.000104s : 11: substitution.tuple_list_convert_item_index_to_positive 1.40% : 0.000014s : 11: substitution.tuple_list_get_item_const_eliminator 2.04% : 0.000021s : 11: substitution.tuple_list_get_item_depend_reorder 6.34% : 0.000065s : 26: substitution.tuple_list_get_item_eliminator 1.95% : 0.000020s : 11: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.055723 2 94.68% : 0.052757s : 1: type_inference.infer 5.32% : 0.002966s : 1: type_inference.specialize ------[replace.] 0.000290 35 58.29% : 0.000169s : 20: replace.inline 12.94% : 0.000038s : 3: replace.switch_simplify 28.78% : 0.000083s : 12: replace.tuple_list_get_item_eliminator ------[match.] 0.000601 35 91.19% : 0.000548s : 20: match.inline 4.47% : 0.000027s : 3: match.switch_simplify 4.34% : 0.000026s : 12: match.tuple_list_get_item_eliminator ------[predicate.] 0.000798 5728 1.07% : 0.000009s : 69: predicate.accumulaten_eliminater 0.30% : 0.000002s : 8: predicate.ad_related_special_op_eliminate 0.51% : 0.000004s : 31: predicate.addn_check_dump 1.08% : 0.000009s : 69: predicate.addn_zero_filter 1.01% : 0.000008s : 69: predicate.adjust_all_reduce_mul_add 2.02% : 0.000016s : 100: predicate.arithmetic_simplify 1.08% : 0.000009s : 69: predicate.cast_eliminate 1.09% : 0.000009s : 65: predicate.check_bprop_eliminate 0.48% : 0.000004s : 31: predicate.compare_switch_simplify 0.09% : 0.000001s : 8: predicate.const_output_eliminate 0.48% : 0.000004s : 31: predicate.depend_value_elim 1.17% : 0.000009s : 69: predicate.dict_get_item_const_eliminator 1.34% : 0.000011s : 69: predicate.dict_get_item_eliminator 1.12% : 0.000009s : 69: predicate.dict_set_item_eliminator 0.36% : 0.000003s : 16: predicate.dumpgradient_eliminate 0.13% : 0.000001s : 8: predicate.elim_not_effective 0.18% : 0.000001s : 8: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000010s : 77: predicate.environ_add_const_eliminate 1.30% : 0.000010s : 77: predicate.environ_get_add_eliminate 1.20% : 0.000010s : 77: predicate.environ_get_depend_swap 1.69% : 0.000013s : 108: predicate.environ_get_eliminate 1.32% : 0.000011s : 77: predicate.environ_get_set_eliminate 1.74% : 0.000014s : 101: predicate.exchange_switch_depend_value 2.29% : 0.000018s : 101: predicate.float_depend_g_call 0.52% : 0.000004s : 31: predicate.float_environ_get_switch 0.65% : 0.000005s : 39: predicate.float_tuple_getitem_switch 0.09% : 0.000001s : 8: predicate.fold_const_symbol 0.57% : 0.000005s : 31: predicate.get_grad_eliminate 0.09% : 0.000001s : 8: predicate.graph_param_transform 0.53% : 0.000004s : 31: predicate.incorporate_call 0.45% : 0.000004s : 31: predicate.incorporate_call_switch 5.42% : 0.000043s : 248: predicate.inline 1.33% : 0.000011s : 54: predicate.inline_without_move 0.29% : 0.000002s : 31: predicate.j_node_and_user_rematch 0.76% : 0.000006s : 31: predicate.less_batch_normalization 1.56% : 0.000012s : 97: predicate.list_to_tuple_eliminator_ 2.70% : 0.000021s : 166: predicate.load_eliminater 0.39% : 0.000003s : 8: predicate.loop_unroll_after_grad 2.42% : 0.000019s : 140: predicate.loop_unroll_before_grad 1.41% : 0.000011s : 85: predicate.make_slice_get_slice_eliminator 0.59% : 0.000005s : 31: predicate.merge_addn 1.12% : 0.000009s : 65: predicate.micro_step_allgather_replace 1.06% : 0.000008s : 65: predicate.mini_step_allgather_replace 1.09% : 0.000009s : 69: predicate.minmaximum_grad 0.36% : 0.000003s : 8: predicate.mutable_eliminate 0.16% : 0.000001s : 8: predicate.opt_reshape 0.18% : 0.000001s : 8: predicate.parallel_virtual_node 2.38% : 0.000019s : 101: predicate.partial_defer_inline 1.61% : 0.000013s : 89: predicate.partial_eliminate 1.06% : 0.000008s : 69: predicate.print_const_string_wrapper 0.54% : 0.000004s : 31: predicate.reduce_all_const_elim 1.33% : 0.000011s : 69: predicate.reduce_eliminate 2.55% : 0.000020s : 166: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000003s : 31: predicate.remove_not_recompute_node 1.89% : 0.000015s : 146: predicate.replace_applicator 0.66% : 0.000005s : 54: predicate.replace_old_param 0.13% : 0.000001s : 8: predicate.reset_defer_inline 1.05% : 0.000008s : 69: predicate.reshape_eliminate 1.08% : 0.000009s : 65: predicate.row_tensor_add_zeros_like 0.20% : 0.000002s : 8: predicate.row_tensor_eliminate 1.32% : 0.000011s : 65: predicate.same_eliminate 0.43% : 0.000003s : 31: predicate.set_cell_output_no_recompute 0.62% : 0.000005s : 31: predicate.shard_identity_eliminate 0.34% : 0.000003s : 16: predicate.special_op_eliminate 0.56% : 0.000004s : 31: predicate.specialize_transform 1.26% : 0.000010s : 65: predicate.split_environ_get_set_with_tuple_value 1.14% : 0.000009s : 54: predicate.stack_unstack_eliminate 0.14% : 0.000001s : 8: predicate.switch_call_monad_eliminater 1.87% : 0.000015s : 101: predicate.switch_defer_inline 2.91% : 0.000023s : 166: predicate.switch_layer_defer_inline 5.19% : 0.000041s : 286: predicate.switch_simplify 1.07% : 0.000009s : 69: predicate.tile_eliminate 1.08% : 0.000009s : 69: predicate.transpose_eliminate 1.46% : 0.000012s : 85: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000012s : 85: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000011s : 85: predicate.tuple_list_get_item_depend_reorder 2.61% : 0.000021s : 128: predicate.tuple_list_get_item_eliminator 1.47% : 0.000012s : 85: predicate.tuple_list_get_set_item_eliminator 2.01% : 0.000016s : 116: predicate.tuple_list_set_item_eliminator 1.53% : 0.000012s : 97: predicate.tuple_to_list_eliminator_ 2.49% : 0.000020s : 166: predicate.updatestate_pure_node_eliminater 3.09% : 0.000025s : 197: predicate.updatestate_useless_node_eliminater 0.16% : 0.000001s : 8: predicate.value_based_eliminate 0.58% : 0.000005s : 31: predicate.virtual_dataset_eliminate 0.56% : 0.000004s : 31: predicate.virtual_output_eliminate 0.13% : 0.000001s : 8: predicate.virtual_view_grad_eliminate 0.19% : 0.000001s : 8: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002819 39 59.97% : 0.001690s : 15: func_graph_cloner_run.FuncGraphClonerGraph 40.03% : 0.001128s : 24: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.121346 237 0.00% : 0.000004s : 1: ForceFp32Comm 3.61% : 0.004377s : 1: add_attr 3.60% : 0.004364s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000072s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.14% : 0.000173s : 1: auto_monad 0.03% : 0.000032s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.69% : 0.000834s : 1: bootstrap 0.03% : 0.000032s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.02% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.03% : 0.000038s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.13% : 0.000161s : 1: event_method 0.01% : 0.000016s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000015s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.40% : 0.000488s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.63% : 0.000764s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000021s : 1: opt.transform.mutable_eliminate 4.55% : 0.005523s : 117: opt.transform.opt_a 0.04% : 0.000049s : 1: opt.transform.opt_after_cconv 0.03% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000191s : 28: opt.transform.opt_b 0.06% : 0.000078s : 2: opt.transform.opt_trans_graph 0.05% : 0.000060s : 4: opt.transform.symbol_engine_opt 11.99% : 0.014553s : 1: opt_a 0.12% : 0.000148s : 1: opt_after_cconv 0.45% : 0.000549s : 1: opt_after_jit_grad 0.27% : 0.000323s : 1: opt_b 14.35% : 0.017418s : 1: optimize 0.03% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000064s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000030s : 1: remove_dup_value 2.57% : 0.003114s : 2: renormalize.infer 1.72% : 0.002091s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000032s : 1: rewriter_after_opt_a 0.30% : 0.000363s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000110s : 1: symbol_engine_optimizer 7.26% : 0.008806s : 1: task_emit 0.09% : 0.000108s : 1: tuple_transform 46.02% : 0.055838s : 1: type_inference 0.08% : 0.000095s : 1: validate .. TotalTime = 52.1856, [33] [bootstrap]: 0.00200129 [type_inference]: 0.13025 [event_method]: 1.569e-05 [auto_monad]: 0.0002451 [graph_reusing]: 5.14998e-06 [pre_auto_parallel]: 1.182e-05 [py_interpret_to_execute]: 2.99e-05 [rewriter_before_opt_a]: 7.489e-05 [expand_dump_flag]: 3.13998e-06 [jit_opt_a]: 0.022766, [2] [Cycle 1]: 0.00192007, [27] [switch_simplify]: 8.014e-05 [loop_unroll]: 1.647e-05 [a_1]: 0.0003762 [with_stream_mark]: 2.259e-05 [recompute_prepare]: 1.053e-05 [updatestate_depend_eliminate]: 1.262e-05 [updatestate_assign_eliminate]: 1.11e-05 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 1.71e-06 [specialize_transform]: 9.65002e-06 [updatestate_useless_node_eliminater]: 7.82998e-06 [accelerated_algorithm]: 8.15e-06 [meta_shard_fg_expand]: 1.065e-05 [get_grad_eliminate_]: 8.20999e-06 [merge_forward]: 4.07e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.731e-05 [j_node_and_user_rematch]: 1.176e-05 [meta_fg_expand]: 2.22999e-06 [replace_old_param]: 1.745e-05 [inline_without_move]: 7.87e-06 [renormalize]: 0.00094986 [add_forward_monad_depend]: 1.566e-05 [auto_monad_grad]: 2.04999e-06 [auto_monad_eliminator]: 2.219e-05 [cse]: 4.607e-05 [replace_applicator]: 1.435e-05 [Cycle 2]: 0.00046814, [27] [switch_simplify]: 8.27e-06 [loop_unroll]: 7.23e-06 [a_1]: 0.00018439 [with_stream_mark]: 1.541e-05 [recompute_prepare]: 7.75e-06 [updatestate_depend_eliminate]: 3.73999e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 1.42e-06 [specialize_transform]: 7.72998e-06 [updatestate_useless_node_eliminater]: 7.5e-06 [accelerated_algorithm]: 7.83001e-06 [meta_shard_fg_expand]: 1.49e-06 [get_grad_eliminate_]: 7.52002e-06 [merge_forward]: 3.8e-06 [cell_reuse_recompute_pass]: 1.79e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.66e-05 [j_node_and_user_rematch]: 1.077e-05 [meta_fg_expand]: 2.19999e-06 [replace_old_param]: 1.476e-05 [inline_without_move]: 7.26001e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.14e-06 [auto_monad_grad]: 1.05001e-06 [auto_monad_eliminator]: 6.57002e-06 [cse]: 1.528e-05 [replace_applicator]: 8.04002e-06 [py_interpret_to_execute_after_opt_a]: 1.647e-05 [rewriter_after_opt_a]: 9.236e-05 [convert_after_rewriter]: 8.13999e-06 [order_py_execute_after_rewriter]: 5.57999e-06 [mutable_eliminate]: 0.00262048 [jit_opt_b]: 7.307e-05, [1] [Cycle 1]: 6.317e-05, [2] [frontend_op_eliminate]: 2.728e-05 [inline_after_opt_a]: 2.364e-05 [cconv]: 3.989e-05 [loop_unroll]: 0.00058227 [jit_opt_after_cconv]: 0.00021246, [1] [Cycle 1]: 0.00020418, [11] [c_1]: 3.649e-05 [parameter_eliminate]: 5.81e-06 [updatestate_depend_eliminate]: 9.24e-06 [updatestate_assign_eliminate]: 3.65e-06 [updatestate_loads_eliminate]: 3.28998e-06 [cse]: 4.292e-05 [call_graph_tuple_transform]: 3.078e-05 [tuple_list_get_item_eliminator]: 8.47e-06 [none_parameter_eliminate]: 1.69998e-06 [renormalize]: 7.39994e-07 [switch_simplify]: 8.67998e-06 [remove_dup_value]: 1.652e-05 [partial_unused_args_eliminate]: 2.16998e-06 [environ_conv]: 3.528e-05 [add_recomputation]: 7.329e-05 [cse_after_recomputation]: 3.251e-05, [1] [Cycle 1]: 2.429e-05, [1] [cse]: 1.718e-05 [auto_monad_reorder]: 2.757e-05 [get_jit_bprop_graph]: 2.64999e-06 [rewriter_after_jit_bprop_graph]: 4.89e-06 [opt_after_jit_grad]: 0.00054758 [symbol_engine_optimizer]: 0.00011649, [1] [Cycle 1]: 0.00010893, [6] [build]: 2.784e-05 [elim_shapecalc]: 1.226e-05 [elim_not_effective]: 1.592e-05 [opt_reshape]: 8.68001e-06 [fold_const_symbol]: 1.2e-05 [renormalize]: 1.12999e-06 [validate]: 8.689e-05 [backend_pass]: 1.05001e-06 [task_emit]: 52.0235 [execute]: 8.24998e-06 Sums bootstrap : 0.002001s : 0.00% type_inference : 0.130250s : 0.25% event_method : 0.000016s : 0.00% auto_monad : 0.000245s : 0.00% graph_reusing : 0.000005s : 0.00% pre_auto_parallel : 0.000012s : 0.00% py_interpret_to_execute : 0.000030s : 0.00% rewriter_before_opt_a : 0.000075s : 0.00% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000088s : 0.00% jit_opt_a.loop_unroll : 0.000024s : 0.00% jit_opt_a.a_1 : 0.000561s : 0.00% jit_opt_a.with_stream_mark : 0.000038s : 0.00% jit_opt_a.recompute_prepare : 0.000018s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000016s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000014s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% jit_opt_a.parameter_eliminate : 0.000003s : 0.00% jit_opt_a.specialize_transform : 0.000017s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000015s : 0.00% jit_opt_a.accelerated_algorithm : 0.000016s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000012s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000016s : 0.00% jit_opt_a.merge_forward : 0.000008s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000044s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000023s : 0.00% jit_opt_a.meta_fg_expand : 0.000004s : 0.00% jit_opt_a.replace_old_param : 0.000032s : 0.00% jit_opt_a.inline_without_move : 0.000015s : 0.00% jit_opt_a.renormalize : 0.000950s : 0.00% jit_opt_a.add_forward_monad_depend : 0.000017s : 0.00% jit_opt_a.auto_monad_grad : 0.000003s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000029s : 0.00% jit_opt_a.cse : 0.000061s : 0.00% jit_opt_a.replace_applicator : 0.000022s : 0.00% py_interpret_to_execute_after_opt_a : 0.000016s : 0.00% rewriter_after_opt_a : 0.000092s : 0.00% convert_after_rewriter : 0.000008s : 0.00% order_py_execute_after_rewriter : 0.000006s : 0.00% mutable_eliminate : 0.002620s : 0.01% jit_opt_b.frontend_op_eliminate : 0.000027s : 0.00% jit_opt_b.inline_after_opt_a : 0.000024s : 0.00% cconv : 0.000040s : 0.00% loop_unroll : 0.000582s : 0.00% jit_opt_after_cconv.c_1 : 0.000036s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000043s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000031s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000008s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000009s : 0.00% remove_dup_value : 0.000017s : 0.00% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000035s : 0.00% add_recomputation : 0.000073s : 0.00% cse_after_recomputation.cse : 0.000017s : 0.00% auto_monad_reorder : 0.000028s : 0.00% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000548s : 0.00% symbol_engine_optimizer.build : 0.000028s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.00% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000087s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 52.023511s : 99.73% execute : 0.000008s : 0.00% Time group info: ------[substitution.] 0.000144 30 1.53% : 0.000002s : 2: substitution.elim_not_effective 0.92% : 0.000001s : 2: substitution.fold_const_symbol 5.30% : 0.000008s : 7: substitution.graph_param_transform 74.46% : 0.000107s : 1: substitution.inline 2.59% : 0.000004s : 4: substitution.j_node_and_user_rematch 8.58% : 0.000012s : 4: substitution.remove_not_recompute_node 6.63% : 0.000010s : 10: substitution.replace_old_param ------[type_inference.] 0.130110 2 97.86% : 0.127321s : 1: type_inference.infer 2.14% : 0.002789s : 1: type_inference.specialize ------[replace.] 0.000019 1 100.00% : 0.000019s : 1: replace.inline ------[match.] 0.000106 1 100.00% : 0.000106s : 1: match.inline ------[predicate.] 0.000128 1037 1.14% : 0.000001s : 15: predicate.accumulaten_eliminater 1.52% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 1.10% : 0.000001s : 15: predicate.addn_check_dump 1.16% : 0.000001s : 15: predicate.addn_zero_filter 1.72% : 0.000002s : 15: predicate.arithmetic_simplify 1.15% : 0.000001s : 15: predicate.cast_eliminate 0.67% : 0.000001s : 7: predicate.check_bprop_eliminate 1.07% : 0.000001s : 15: predicate.compare_switch_simplify 1.20% : 0.000002s : 15: predicate.depend_value_elim 1.10% : 0.000001s : 15: predicate.dict_get_item_const_eliminator 1.32% : 0.000002s : 15: predicate.dict_get_item_eliminator 1.13% : 0.000001s : 15: predicate.dict_set_item_eliminator 0.95% : 0.000001s : 7: predicate.dumpgradient_eliminate 0.71% : 0.000001s : 7: predicate.elim_not_effective 0.98% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.27% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.06% : 0.000001s : 15: predicate.environ_get_add_eliminate 1.07% : 0.000001s : 15: predicate.environ_get_depend_swap 1.24% : 0.000002s : 15: predicate.environ_get_eliminate 1.12% : 0.000001s : 15: predicate.environ_get_set_eliminate 0.50% : 0.000001s : 7: predicate.fold_const_symbol 1.36% : 0.000002s : 14: predicate.get_grad_eliminate 0.55% : 0.000001s : 7: predicate.graph_param_transform 5.30% : 0.000007s : 30: predicate.inline 1.28% : 0.000002s : 14: predicate.inline_without_move 0.80% : 0.000001s : 14: predicate.j_node_and_user_rematch 1.56% : 0.000002s : 14: predicate.less_batch_normalization 1.21% : 0.000002s : 15: predicate.list_to_tuple_eliminator_ 2.25% : 0.000003s : 22: predicate.load_eliminater 1.83% : 0.000002s : 7: predicate.loop_unroll_after_grad 2.03% : 0.000003s : 22: predicate.loop_unroll_before_grad 2.27% : 0.000003s : 22: predicate.make_slice_get_slice_eliminator 1.06% : 0.000001s : 15: predicate.merge_addn 1.07% : 0.000001s : 15: predicate.minmaximum_grad 2.57% : 0.000003s : 7: predicate.mutable_eliminate 1.01% : 0.000001s : 7: predicate.opt_reshape 2.18% : 0.000003s : 22: predicate.partial_eliminate 1.08% : 0.000001s : 15: predicate.print_const_string_wrapper 1.42% : 0.000002s : 15: predicate.reduce_eliminate 1.14% : 0.000001s : 15: predicate.redundant_stop_gradient_eliminater 1.24% : 0.000002s : 14: predicate.remove_not_recompute_node 2.44% : 0.000003s : 29: predicate.replace_applicator 1.14% : 0.000001s : 14: predicate.replace_old_param 0.55% : 0.000001s : 7: predicate.reset_defer_inline 1.23% : 0.000002s : 15: predicate.reshape_eliminate 1.16% : 0.000001s : 15: predicate.row_tensor_add_zeros_like 1.15% : 0.000001s : 7: predicate.row_tensor_eliminate 1.13% : 0.000001s : 15: predicate.same_eliminate 0.94% : 0.000001s : 14: predicate.set_cell_output_no_recompute 1.54% : 0.000002s : 14: predicate.special_op_eliminate 1.58% : 0.000002s : 14: predicate.specialize_transform 1.31% : 0.000002s : 15: predicate.split_environ_get_set_with_tuple_value 1.10% : 0.000001s : 15: predicate.stack_unstack_eliminate 0.68% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.42% : 0.000002s : 16: predicate.switch_defer_inline 1.42% : 0.000002s : 16: predicate.switch_layer_defer_inline 5.39% : 0.000007s : 45: predicate.switch_simplify 1.10% : 0.000001s : 15: predicate.tile_eliminate 1.14% : 0.000001s : 15: predicate.transpose_eliminate 1.27% : 0.000002s : 15: predicate.tuple_list_convert_item_index_to_positive 1.17% : 0.000001s : 15: predicate.tuple_list_get_item_depend_reorder 3.65% : 0.000005s : 29: predicate.tuple_list_get_item_eliminator 1.50% : 0.000002s : 15: predicate.tuple_list_set_item_eliminator 1.50% : 0.000002s : 15: predicate.tuple_to_list_eliminator_ 1.72% : 0.000002s : 22: predicate.updatestate_pure_node_eliminater 3.51% : 0.000004s : 36: predicate.updatestate_useless_node_eliminater 1.49% : 0.000002s : 15: predicate.value_based_eliminate 0.53% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.85% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000467 6 51.45% : 0.000240s : 3: func_graph_cloner_run.FuncGraphClonerGraph 48.55% : 0.000227s : 3: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 52.185772 76 0.00% : 0.000077s : 1: add_recomputation 0.00% : 0.000251s : 1: auto_monad 0.00% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: backend_pass 0.00% : 0.002046s : 1: bootstrap 0.00% : 0.000043s : 1: cconv 0.00% : 0.000011s : 1: convert_after_rewriter 0.00% : 0.000035s : 1: cse_after_recomputation 0.00% : 0.000038s : 1: environ_conv 0.00% : 0.000022s : 1: event_method 0.00% : 0.000013s : 1: execute 0.00% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000008s : 1: graph_reusing 0.04% : 0.022769s : 1: jit_opt_a 0.00% : 0.000215s : 1: jit_opt_after_cconv 0.00% : 0.000077s : 1: jit_opt_b 0.00% : 0.000592s : 1: loop_unroll 0.01% : 0.002637s : 1: mutable_eliminate 0.00% : 0.000849s : 26: opt.transform.jit_opt_a 0.00% : 0.000081s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000043s : 4: opt.transform.jit_opt_b 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000024s : 1: opt.transform.mutable_eliminate 0.00% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000045s : 4: opt.transform.symbol_engine_opt 0.00% : 0.000558s : 1: opt_after_jit_grad 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000014s : 1: pre_auto_parallel 0.00% : 0.000033s : 1: py_interpret_to_execute 0.00% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000019s : 1: remove_dup_value 0.00% : 0.000624s : 1: renormalize.infer 0.00% : 0.000317s : 1: renormalize.specialize 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000096s : 1: rewriter_after_opt_a 0.00% : 0.000078s : 1: rewriter_before_opt_a 0.00% : 0.000120s : 1: symbol_engine_optimizer 99.69% : 52.023525s : 1: task_emit 0.25% : 0.130274s : 1: type_inference 0.00% : 0.000105s : 1: validate ...dx: [[[[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] ... [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]]] [[[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] ... [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]]] [[[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] ... [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]]] ... [[[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] ... [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]]] [[[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] ... [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]]] [[[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] ... [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]]]] dw: [[[[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] ... [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]]] [[[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] ... [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]]] [[[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] ... [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]]] ... [[[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] ... [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]]] [[[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] ... [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]]] [[[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] ... [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]]]]dx: [[[[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] ... [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]]] [[[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] ... [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]]] [[[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] ... [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]]] ... [[[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] ... [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]]] [[[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] ... [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]]] [[[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] ... [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]] [[32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] ... [96.000000 192.000000 288.000000 ... 288.000000 192.000000 96.000000] [64.000000 128.000000 192.000000 ... 192.000000 128.000000 64.000000] [32.000000 64.000000 96.000000 ... 96.000000 64.000000 32.000000]]]] dw: [[[[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] ... [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]]] [[[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] ... [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]]] [[[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] ... [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]]] ... [[[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] ... [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]]] [[[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] ... [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]]] [[[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] ... [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]] [[9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000] [9024.000000 9024.000000 9024.000000]]]] group_cases_5 have all been run, results of sub cases are below: case: ('pynative',) {} pass. case: (1,) {} pass. case: (mindspore.bfloat16, 1, False) {} pass. case: ('KBK',) {} pass. case: (0,) {} pass. case: ('GE',) {} pass. case: (1,) {} pass. case: (0,) {} pass. ops group_cases_6 with 8 cases start to running, all cases are below: case: (, 1) case: (, 0) case: (, 'kbk', True) case: (, 'kbk', False) case: (, 'pynative', True) case: (, 'pynative', False) case: (, 'kbk', True) case: (, 'kbk', False) ops group_cases_6 total running memory: 3136M, memory threshold: 51200M [WARNING] ME(136592:281473890602800,ForkProcess-49):2026-01-29-17:45:45.279.783 [mindspore/context.py:1334] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead. [WARNING] ME(136601:281473890602800,ForkProcess-50):2026-01-29-17:45:45.325.791 [mindspore/context.py:1334] For 'context.set_context', the parameter 'ascend_config' will be deprecated and removed in a future version. Please use the api mindspore.device_context.ascend.op_precision.precision_mode(), mindspore.device_context.ascend.op_precision.op_precision_mode(), mindspore.device_context.ascend.op_precision.matmul_allow_hf32(), mindspore.device_context.ascend.op_precision.conv_allow_hf32(), mindspore.device_context.ascend.op_tuning.op_compile() instead. TotalTime = 3.89889, [24] [bootstrap]: 0.00090177 [type_inference]: 0.268624 [event_method]: 0.00029785 [auto_monad]: 0.00038186 [graph_reusing]: 1.196e-05 [inline]: 2.66e-06 [add_attr]: 0.0081704, [1] [add_attr_with_inline]: 0.00815484, [1] [Cycle 1]: 0.00019841, [2] [tag_attr]: 7.304e-05 [meta_addattr_fg_expand]: 2.47e-05 [parallel-infer-symbol]: 3.33e-06 [pre_auto_parallel]: 0.00010014 [insert-virtual-dataset]: 2.70997e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.05002e-06 [pipeline_split]: 1.94999e-06 [optimize]: 0.0282869, [53] [py_interpret_to_execute]: 6.07999e-06 [rewriter_before_opt_a]: 0.00050598 [opt_a]: 0.0245416, [2] [Cycle 1]: 0.0231213, [45] [expand_dump_flag]: 4.80999e-06 [switch_simplify]: 0.00024563 [loop_unroll]: 8.613e-05 [a_1]: 0.00231559 [with_stream_mark]: 2.753e-05 [recompute_prepare]: 0.0160557 [updatestate_depend_eliminate]: 7.777e-05 [updatestate_assign_eliminate]: 1.13e-05 [updatestate_loads_eliminate]: 9.01998e-06 [parameter_eliminate]: 3.12002e-06 [a_2]: 0.00026678 [accelerated_algorithm]: 5.321e-05 [shard]: 2.19001e-06 [meta_shard_fg_expand]: 7.69002e-06 [shard_inline]: 1.573e-05 [merge_send_recv]: 4.712e-05 [auto_parallel]: 1.85e-05 [parallel]: 9.646e-05 [flash_sp]: 3.654e-05 [merge_comm]: 9.43002e-06 [allreduce_fusion]: 1.626e-05 [matmul_add_comm_reduction]: 2.446e-05 [allreduce_slice_to_reducescatter]: 8.55999e-06 [virtual_shard_identity]: 1.757e-05 [virtual_dataset]: 1.561e-05 [get_grad_eliminate_]: 1.535e-05 [virtual_output]: 1.484e-05 [merge_forward]: 9.31e-06 [cell_reuse_recompute_pass]: 2.19999e-06 [offload_activation]: 2.696e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.845e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 2.662e-05 [set_forward_comm_id_for_comm_node_pass]: 1.733e-05 [meta_fg_expand]: 8.28999e-06 [flash_sp_send_recv_attached]: 4.45e-06 [receive_attached]: 1.838e-05 [after_resolve]: 2.333e-05 [a_after_grad]: 2.39e-05 [renormalize]: 0.00264661 [add_forward_monad_depend]: 5.87001e-06 [auto_monad_grad]: 2.00002e-06 [auto_monad_eliminator]: 4.213e-05 [cse]: 0.00028994 [a_3]: 0.00011507 [Cycle 2]: 0.00140617, [45] [expand_dump_flag]: 2.11998e-06 [switch_simplify]: 1.741e-05 [loop_unroll]: 1.471e-05 [a_1]: 0.00042919 [with_stream_mark]: 2.039e-05 [recompute_prepare]: 1.489e-05 [updatestate_depend_eliminate]: 9.48002e-06 [updatestate_assign_eliminate]: 8.12e-06 [updatestate_loads_eliminate]: 7.78001e-06 [parameter_eliminate]: 1.14e-06 [a_2]: 0.00022078 [accelerated_algorithm]: 1.867e-05 [shard]: 1.35001e-06 [meta_shard_fg_expand]: 3.84002e-06 [shard_inline]: 1.503e-05 [merge_send_recv]: 1.082e-05 [auto_parallel]: 1.246e-05 [parallel]: 6.32001e-06 [flash_sp]: 3.5e-06 [merge_comm]: 8.94e-06 [allreduce_fusion]: 8.19002e-06 [matmul_add_comm_reduction]: 1.299e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 1.526e-05 [virtual_dataset]: 1.41e-05 [get_grad_eliminate_]: 1.429e-05 [virtual_output]: 1.377e-05 [merge_forward]: 7.83001e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 1.624e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.651e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 2.354e-05 [set_forward_comm_id_for_comm_node_pass]: 8.75001e-06 [meta_fg_expand]: 6.30002e-06 [flash_sp_send_recv_attached]: 9.10019e-07 [receive_attached]: 1.20001e-06 [after_resolve]: 1.939e-05 [a_after_grad]: 2.224e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.03002e-06 [auto_monad_grad]: 1.63002e-06 [auto_monad_eliminator]: 1.961e-05 [cse]: 4.551e-05 [a_3]: 9.535e-05 [py_interpret_to_execute_after_opt_a]: 6.06e-06 [slice_cell_reuse_recomputed_activation]: 2.17001e-06 [rewriter_after_opt_a]: 6.017e-05 [convert_after_rewriter]: 1.42e-06 [order_py_execute_after_rewriter]: 1.30001e-06 [mutable_eliminate]: 0.00073398 [opt_b]: 0.00051776, [1] [Cycle 1]: 0.00051084, [7] [b_1]: 0.00037379 [b_2]: 1.604e-05 [updatestate_depend_eliminate]: 1.076e-05 [updatestate_assign_eliminate]: 7.59002e-06 [updatestate_loads_eliminate]: 7.43e-06 [renormalize]: 6.19999e-07 [cse]: 5.756e-05 [optimize_parallel_all_gather_comm]: 4.108e-05 [overlap_param_gather]: 1.078e-05 [cconv]: 2.759e-05 [loop_unroll]: 0.00045131 [opt_after_cconv]: 0.00020599, [1] [Cycle 1]: 0.00020008, [7] [c_1]: 8.609e-05 [parameter_eliminate]: 2.73e-06 [updatestate_depend_eliminate]: 1.087e-05 [updatestate_assign_eliminate]: 7.95e-06 [updatestate_loads_eliminate]: 7.36001e-06 [cse]: 4.943e-05 [renormalize]: 6.40022e-07 [remove_dup_value]: 5.925e-05 [tuple_transform]: 0.00027234, [1] [Cycle 1]: 0.00026729, [4] [d_1]: 0.00022271 [none_parameter_eliminate]: 2.36e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 1.684e-05 [partial_unused_args_eliminate]: 2.16e-06 [add_recomputation]: 0.00011667 [cse_after_recomputation]: 4.932e-05, [1] [Cycle 1]: 4.375e-05, [1] [cse]: 3.778e-05 [environ_conv]: 2.545e-05 [swap_dp_allreduce_reducescatter]: 2.937e-05 [bias_add_comm_swap]: 1.021e-05 [label_micro_interleaved_index]: 1.287e-05 [label_fine_grained_interleaved_index]: 2.53e-06 [merge_cast_opt]: 1.38002e-06 [slice_recompute_activation]: 2.04e-06 [micro_interleaved_order_control]: 2.11e-06 [assign_add_opt]: 1.59e-06 [ForceFp32Comm]: 7.29982e-07 [remove_cast_before_assign_add]: 8.87e-06 [full_micro_interleaved_order_control]: 9.99999e-06 [reorder_send_recv_between_fp_bp]: 2.53e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 8.62e-06 [overlap_opt_shard_in_pipeline]: 2.09e-05 [overlap_opt_shard_grad_in_pipeline]: 1.75001e-06 [control_data_broadcast_order]: 2.579e-05 [grouped_pairwise_exchange_alltoall]: 1.30001e-06 [offloading_packed_experts]: 7.6e-06 [overlap_recompute_and_grad_model_parallel]: 1.579e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.32999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.71e-06 [overlap_grad_ring_attention]: 2.431e-05 [overlap_grad_flash_sp]: 6.029e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 9.37001e-06 [split_layernorm_comm]: 1.50999e-06 [handle_group_info]: 1.09e-06 [symbol_engine_optimizer]: 0.00014167, [1] [Cycle 1]: 0.00013711, [6] [build]: 2.265e-05 [elim_shapecalc]: 1.972e-05 [elim_not_effective]: 2.675e-05 [opt_reshape]: 1.55e-05 [fold_const_symbol]: 2.335e-05 [renormalize]: 2.3999e-07 [detach_backward]: 2.32001e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 4.143e-05 [get_jit_bprop_graph]: 2.43998e-06 [rewriter_after_jit_bprop_graph]: 3.55e-06 [opt_after_jit_grad]: 0.00050067 [validate]: 8.595e-05 [backend_pass]: 1.21002e-06 [task_emit]: 3.59085 [execute]: 1.149e-05 Sums bootstrap : 0.000902s : 0.02% type_inference : 0.268624s : 6.91% event_method : 0.000298s : 0.01% auto_monad : 0.000382s : 0.01% graph_reusing : 0.000012s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000073s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000025s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000100s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000506s : 0.01% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000263s : 0.01% optimize.opt_a.loop_unroll : 0.000101s : 0.00% optimize.opt_a.a_1 : 0.002745s : 0.07% optimize.opt_a.with_stream_mark : 0.000048s : 0.00% optimize.opt_a.recompute_prepare : 0.016071s : 0.41% optimize.opt_a.updatestate_depend_eliminate : 0.000087s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000017s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000488s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000072s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000012s : 0.00% optimize.opt_a.shard_inline : 0.000031s : 0.00% optimize.opt_a.merge_send_recv : 0.000058s : 0.00% optimize.opt_a.auto_parallel : 0.000031s : 0.00% optimize.opt_a.parallel : 0.000103s : 0.00% optimize.opt_a.flash_sp : 0.000040s : 0.00% optimize.opt_a.merge_comm : 0.000018s : 0.00% optimize.opt_a.allreduce_fusion : 0.000024s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000037s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000033s : 0.00% optimize.opt_a.virtual_dataset : 0.000030s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000030s : 0.00% optimize.opt_a.virtual_output : 0.000029s : 0.00% optimize.opt_a.merge_forward : 0.000017s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000043s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000055s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000050s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000026s : 0.00% optimize.opt_a.meta_fg_expand : 0.000015s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000020s : 0.00% optimize.opt_a.after_resolve : 0.000043s : 0.00% optimize.opt_a.a_after_grad : 0.000046s : 0.00% optimize.opt_a.renormalize : 0.002647s : 0.07% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000062s : 0.00% optimize.opt_a.cse : 0.000335s : 0.01% optimize.opt_a.a_3 : 0.000210s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000060s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000734s : 0.02% optimize.opt_b.b_1 : 0.000374s : 0.01% optimize.opt_b.b_2 : 0.000016s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000058s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000041s : 0.00% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000028s : 0.00% optimize.loop_unroll : 0.000451s : 0.01% optimize.opt_after_cconv.c_1 : 0.000086s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.cse : 0.000049s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000059s : 0.00% optimize.tuple_transform.d_1 : 0.000223s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000017s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000117s : 0.00% optimize.cse_after_recomputation.cse : 0.000038s : 0.00% optimize.environ_conv : 0.000025s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000029s : 0.00% optimize.bias_add_comm_swap : 0.000010s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000021s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000026s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000016s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000024s : 0.00% optimize.overlap_grad_flash_sp : 0.000060s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000009s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000023s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000020s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000027s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000016s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000023s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000041s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000501s : 0.01% validate : 0.000086s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 3.590854s : 92.33% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000903 239 1.08% : 0.000010s : 2: substitution.depend_value_elim 0.43% : 0.000004s : 9: substitution.elim_not_effective 1.41% : 0.000013s : 9: substitution.float_tuple_getitem_switch 0.37% : 0.000003s : 9: substitution.fold_const_symbol 1.34% : 0.000012s : 12: substitution.graph_param_transform 55.60% : 0.000502s : 18: substitution.inline 1.00% : 0.000009s : 18: substitution.j_node_and_user_rematch 3.29% : 0.000030s : 3: substitution.less_batch_normalization 1.63% : 0.000015s : 10: substitution.minmaximum_grad 1.37% : 0.000012s : 18: substitution.remove_not_recompute_node 0.77% : 0.000007s : 6: substitution.replace_old_param 2.73% : 0.000025s : 5: substitution.switch_simplify 4.58% : 0.000041s : 16: substitution.tuple_list_convert_item_index_to_positive 3.48% : 0.000031s : 18: substitution.tuple_list_get_item_const_eliminator 4.87% : 0.000044s : 18: substitution.tuple_list_get_item_depend_reorder 7.81% : 0.000071s : 28: substitution.tuple_list_get_item_eliminator 3.42% : 0.000031s : 18: substitution.tuple_list_get_set_item_eliminator 1.96% : 0.000018s : 10: substitution.updatestate_pure_node_eliminater 2.87% : 0.000026s : 12: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.268488 2 98.53% : 0.264541s : 1: type_inference.infer 1.47% : 0.003946s : 1: type_inference.specialize ------[replace.] 0.000252 29 57.31% : 0.000144s : 18: replace.inline 24.39% : 0.000061s : 5: replace.switch_simplify 7.32% : 0.000018s : 2: replace.tuple_list_get_item_depend_reorder 10.98% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000538 29 91.26% : 0.000491s : 18: match.inline 4.01% : 0.000022s : 5: match.switch_simplify 3.32% : 0.000018s : 2: match.tuple_list_get_item_depend_reorder 1.41% : 0.000008s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.016657 4327 0.04% : 0.000007s : 50: predicate.accumulaten_eliminater 0.02% : 0.000003s : 12: predicate.ad_related_special_op_eliminate 0.02% : 0.000003s : 24: predicate.addn_check_dump 0.04% : 0.000007s : 50: predicate.addn_zero_filter 0.04% : 0.000006s : 50: predicate.adjust_all_reduce_mul_add 0.09% : 0.000016s : 74: predicate.arithmetic_simplify 0.04% : 0.000007s : 50: predicate.cast_eliminate 0.02% : 0.000003s : 24: predicate.check_bprop_eliminate 0.02% : 0.000003s : 24: predicate.compare_switch_simplify 0.01% : 0.000001s : 12: predicate.const_output_eliminate 0.02% : 0.000004s : 24: predicate.depend_value_elim 0.04% : 0.000007s : 50: predicate.dict_get_item_const_eliminator 0.05% : 0.000008s : 50: predicate.dict_get_item_eliminator 0.04% : 0.000007s : 50: predicate.dict_set_item_eliminator 0.02% : 0.000004s : 24: predicate.dumpgradient_eliminate 0.01% : 0.000001s : 12: predicate.elim_not_effective 0.01% : 0.000002s : 12: predicate.elim_shapecalc_of_broadcastargs 0.05% : 0.000008s : 62: predicate.environ_add_const_eliminate 0.05% : 0.000008s : 62: predicate.environ_get_add_eliminate 0.05% : 0.000008s : 62: predicate.environ_get_depend_swap 0.07% : 0.000012s : 86: predicate.environ_get_eliminate 0.05% : 0.000009s : 62: predicate.environ_get_set_eliminate 0.06% : 0.000010s : 74: predicate.exchange_switch_depend_value 0.08% : 0.000014s : 74: predicate.float_depend_g_call 0.02% : 0.000003s : 24: predicate.float_environ_get_switch 0.04% : 0.000006s : 36: predicate.float_tuple_getitem_switch 0.01% : 0.000001s : 12: predicate.fold_const_symbol 0.02% : 0.000004s : 24: predicate.get_grad_eliminate 0.01% : 0.000001s : 12: predicate.graph_param_transform 0.02% : 0.000003s : 24: predicate.incorporate_call 0.02% : 0.000003s : 24: predicate.incorporate_call_switch 0.22% : 0.000037s : 196: predicate.inline 0.03% : 0.000004s : 24: predicate.inline_without_move 0.01% : 0.000002s : 24: predicate.j_node_and_user_rematch 0.03% : 0.000006s : 26: predicate.less_batch_normalization 0.07% : 0.000012s : 80: predicate.list_to_tuple_eliminator_ 0.10% : 0.000017s : 130: predicate.load_eliminater 0.02% : 0.000003s : 12: predicate.loop_unroll_after_grad 0.09% : 0.000015s : 110: predicate.loop_unroll_before_grad 0.07% : 0.000011s : 76: predicate.make_slice_get_slice_eliminator 0.02% : 0.000004s : 24: predicate.merge_addn 0.02% : 0.000003s : 24: predicate.micro_step_allgather_replace 0.02% : 0.000003s : 24: predicate.mini_step_allgather_replace 0.04% : 0.000007s : 50: predicate.minmaximum_grad 0.02% : 0.000004s : 12: predicate.mutable_eliminate 0.01% : 0.000002s : 12: predicate.opt_reshape 0.01% : 0.000002s : 12: predicate.parallel_virtual_node 0.08% : 0.000013s : 74: predicate.partial_defer_inline 0.06% : 0.000010s : 68: predicate.partial_eliminate 0.04% : 0.000007s : 50: predicate.print_const_string_wrapper 0.02% : 0.000003s : 24: predicate.reduce_all_const_elim 0.05% : 0.000009s : 50: predicate.reduce_eliminate 0.10% : 0.000017s : 130: predicate.redundant_stop_gradient_eliminater 0.01% : 0.000002s : 24: predicate.remove_not_recompute_node 0.05% : 0.000008s : 80: predicate.replace_applicator 0.01% : 0.000002s : 24: predicate.replace_old_param 0.01% : 0.000001s : 12: predicate.reset_defer_inline 0.04% : 0.000007s : 50: predicate.reshape_eliminate 0.02% : 0.000003s : 24: predicate.row_tensor_add_zeros_like 0.01% : 0.000002s : 12: predicate.row_tensor_eliminate 0.03% : 0.000005s : 24: predicate.same_eliminate 96.15% : 0.016016s : 27: predicate.set_cell_output_no_recompute 0.02% : 0.000004s : 24: predicate.shard_identity_eliminate 0.02% : 0.000004s : 24: predicate.special_op_eliminate 0.02% : 0.000004s : 24: predicate.specialize_transform 0.02% : 0.000004s : 24: predicate.split_environ_get_set_with_tuple_value 0.03% : 0.000004s : 24: predicate.stack_unstack_eliminate 0.01% : 0.000002s : 12: predicate.switch_call_monad_eliminater 0.07% : 0.000011s : 74: predicate.switch_defer_inline 0.08% : 0.000014s : 98: predicate.switch_layer_defer_inline 0.22% : 0.000037s : 230: predicate.switch_simplify 0.04% : 0.000007s : 50: predicate.tile_eliminate 0.04% : 0.000007s : 50: predicate.transpose_eliminate 0.07% : 0.000011s : 74: predicate.tuple_list_convert_item_index_to_positive 0.07% : 0.000012s : 76: predicate.tuple_list_get_item_const_eliminator 0.07% : 0.000011s : 76: predicate.tuple_list_get_item_depend_reorder 0.12% : 0.000020s : 104: predicate.tuple_list_get_item_eliminator 0.07% : 0.000011s : 76: predicate.tuple_list_get_set_item_eliminator 0.09% : 0.000015s : 100: predicate.tuple_list_set_item_eliminator 0.07% : 0.000012s : 80: predicate.tuple_to_list_eliminator_ 0.10% : 0.000017s : 130: predicate.updatestate_pure_node_eliminater 0.13% : 0.000021s : 154: predicate.updatestate_useless_node_eliminater 0.01% : 0.000002s : 12: predicate.value_based_eliminate 0.02% : 0.000004s : 24: predicate.virtual_dataset_eliminate 0.02% : 0.000004s : 24: predicate.virtual_output_eliminate 0.01% : 0.000002s : 12: predicate.virtual_view_grad_eliminate 0.01% : 0.000002s : 12: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003911 41 70.93% : 0.002774s : 21: func_graph_cloner_run.FuncGraphClonerGraph 29.07% : 0.001137s : 20: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.958784 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.21% : 0.008176s : 1: add_attr 0.21% : 0.008159s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000121s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000395s : 1: auto_monad 0.00% : 0.000046s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000013s : 1: bias_add_comm_swap 0.02% : 0.000950s : 1: bootstrap 0.00% : 0.000031s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000029s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000052s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000029s : 1: environ_conv 0.01% : 0.000311s : 1: event_method 0.00% : 0.000035s : 1: execute 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000017s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.01% : 0.000460s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000743s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000024s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000026s : 1: opt.transform.mutable_eliminate 0.51% : 0.020186s : 78: opt.transform.opt_a 0.00% : 0.000085s : 1: opt.transform.opt_after_cconv 0.00% : 0.000050s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000362s : 28: opt.transform.opt_b 0.01% : 0.000236s : 2: opt.transform.opt_trans_graph 0.00% : 0.000082s : 4: opt.transform.symbol_engine_opt 0.62% : 0.024545s : 1: opt_a 0.01% : 0.000209s : 1: opt_after_cconv 0.01% : 0.000509s : 1: opt_after_jit_grad 0.01% : 0.000522s : 1: opt_b 0.71% : 0.028292s : 1: optimize 0.00% : 0.000045s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000064s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000028s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000024s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000019s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000105s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000064s : 1: remove_dup_value 0.04% : 0.001454s : 1: renormalize.infer 0.03% : 0.001183s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000064s : 1: rewriter_after_opt_a 0.01% : 0.000513s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000012s : 1: split_matmul_comm_elemetwise 0.00% : 0.000033s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000145s : 1: symbol_engine_optimizer 90.71% : 3.591031s : 1: task_emit 0.01% : 0.000276s : 1: tuple_transform 6.79% : 0.268646s : 1: type_inference 0.00% : 0.000128s : 1: validate TotalTime = 4.79707, [24] [bootstrap]: 0.00086791 [type_inference]: 0.149318 [event_method]: 0.00031389 [auto_monad]: 0.00036163 [graph_reusing]: 1.197e-05 [inline]: 3.61001e-06 [add_attr]: 0.0100159, [1] [add_attr_with_inline]: 0.00999752, [1] [Cycle 1]: 0.00020267, [2] [tag_attr]: 8.736e-05 [meta_addattr_fg_expand]: 2.537e-05 [parallel-infer-symbol]: 3.88999e-06 [pre_auto_parallel]: 9.892e-05 [insert-virtual-dataset]: 2.81e-06 [parallel-infer-symbol-second]: 8.59989e-07 [dataset_repeat_opt]: 1.95001e-06 [pipeline_split]: 2.01e-06 [optimize]: 0.01344, [53] [py_interpret_to_execute]: 5.61003e-06 [rewriter_before_opt_a]: 0.00050725 [opt_a]: 0.00938376, [2] [Cycle 1]: 0.00793714, [45] [expand_dump_flag]: 4.43001e-06 [switch_simplify]: 0.00025369 [loop_unroll]: 8.598e-05 [a_1]: 0.00232358 [with_stream_mark]: 2.048e-05 [recompute_prepare]: 2.053e-05 [updatestate_depend_eliminate]: 4.584e-05 [updatestate_assign_eliminate]: 8.47998e-06 [updatestate_loads_eliminate]: 7.73001e-06 [parameter_eliminate]: 1.68002e-06 [a_2]: 0.00024314 [accelerated_algorithm]: 4.826e-05 [shard]: 2.31e-06 [meta_shard_fg_expand]: 0.00063487 [shard_inline]: 3.237e-05 [merge_send_recv]: 5.869e-05 [auto_parallel]: 1.831e-05 [parallel]: 0.00010245 [flash_sp]: 3.676e-05 [merge_comm]: 1.015e-05 [allreduce_fusion]: 1.739e-05 [matmul_add_comm_reduction]: 2.496e-05 [allreduce_slice_to_reducescatter]: 8.45001e-06 [virtual_shard_identity]: 2.103e-05 [virtual_dataset]: 1.593e-05 [get_grad_eliminate_]: 1.561e-05 [virtual_output]: 1.565e-05 [merge_forward]: 8.2e-06 [cell_reuse_recompute_pass]: 2.61e-06 [offload_activation]: 2.791e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.347e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 2.618e-05 [set_forward_comm_id_for_comm_node_pass]: 1.668e-05 [meta_fg_expand]: 8.27e-06 [flash_sp_send_recv_attached]: 5.10001e-06 [receive_attached]: 1.856e-05 [after_resolve]: 2.302e-05 [a_after_grad]: 2.331e-05 [renormalize]: 0.00292249 [add_forward_monad_depend]: 6.48e-06 [auto_monad_grad]: 2.11e-06 [auto_monad_eliminator]: 4.192e-05 [cse]: 0.00026258 [a_3]: 0.00011344 [Cycle 2]: 0.0014343, [45] [expand_dump_flag]: 2.59001e-06 [switch_simplify]: 1.762e-05 [loop_unroll]: 1.543e-05 [a_1]: 0.00044151 [with_stream_mark]: 1.692e-05 [recompute_prepare]: 1.546e-05 [updatestate_depend_eliminate]: 9.42999e-06 [updatestate_assign_eliminate]: 7.74997e-06 [updatestate_loads_eliminate]: 7.3e-06 [parameter_eliminate]: 1.05999e-06 [a_2]: 0.00022442 [accelerated_algorithm]: 1.892e-05 [shard]: 2.01998e-06 [meta_shard_fg_expand]: 3.85e-06 [shard_inline]: 1.444e-05 [merge_send_recv]: 1.209e-05 [auto_parallel]: 1.425e-05 [parallel]: 7.56001e-06 [flash_sp]: 3.68e-06 [merge_comm]: 8.51002e-06 [allreduce_fusion]: 8.24998e-06 [matmul_add_comm_reduction]: 1.223e-05 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 1.53e-05 [virtual_dataset]: 1.468e-05 [get_grad_eliminate_]: 1.456e-05 [virtual_output]: 1.413e-05 [merge_forward]: 7.68999e-06 [cell_reuse_recompute_pass]: 2.34001e-06 [offload_activation]: 1.732e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.653e-05 [merge_recompute_call_nodes]: 1.07e-06 [before_grad]: 2.384e-05 [set_forward_comm_id_for_comm_node_pass]: 8.87999e-06 [meta_fg_expand]: 6.62002e-06 [flash_sp_send_recv_attached]: 1.37e-06 [receive_attached]: 1.54e-06 [after_resolve]: 1.984e-05 [a_after_grad]: 2.269e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.77001e-06 [auto_monad_grad]: 1.15999e-06 [auto_monad_eliminator]: 2.076e-05 [cse]: 4.717e-05 [a_3]: 9.731e-05 [py_interpret_to_execute_after_opt_a]: 7.18998e-06 [slice_cell_reuse_recomputed_activation]: 2.00002e-06 [rewriter_after_opt_a]: 6.285e-05 [convert_after_rewriter]: 1.29e-06 [order_py_execute_after_rewriter]: 1.25999e-06 [mutable_eliminate]: 0.00078203 [opt_b]: 0.00055895, [1] [Cycle 1]: 0.00055133, [7] [b_1]: 0.00039946 [b_2]: 1.707e-05 [updatestate_depend_eliminate]: 1.17e-05 [updatestate_assign_eliminate]: 8.17998e-06 [updatestate_loads_eliminate]: 7.82998e-06 [renormalize]: 9.20001e-07 [cse]: 6.757e-05 [optimize_parallel_all_gather_comm]: 4.301e-05 [overlap_param_gather]: 1.022e-05 [cconv]: 3.113e-05 [loop_unroll]: 0.0006441 [opt_after_cconv]: 0.00022928, [1] [Cycle 1]: 0.00022135, [7] [c_1]: 9.364e-05 [parameter_eliminate]: 4.48999e-06 [updatestate_depend_eliminate]: 1.207e-05 [updatestate_assign_eliminate]: 8.07998e-06 [updatestate_loads_eliminate]: 7.48e-06 [cse]: 5.79e-05 [renormalize]: 7.09988e-07 [remove_dup_value]: 6.983e-05 [tuple_transform]: 0.00019761, [1] [Cycle 1]: 0.00019162, [4] [d_1]: 0.00015282 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 1.30007e-07 [switch_simplify]: 1.584e-05 [partial_unused_args_eliminate]: 2.02001e-06 [add_recomputation]: 0.00011793 [cse_after_recomputation]: 5.206e-05, [1] [Cycle 1]: 4.62e-05, [1] [cse]: 4.014e-05 [environ_conv]: 2.976e-05 [swap_dp_allreduce_reducescatter]: 3.289e-05 [bias_add_comm_swap]: 1.139e-05 [label_micro_interleaved_index]: 1.337e-05 [label_fine_grained_interleaved_index]: 2.54999e-06 [merge_cast_opt]: 1.38002e-06 [slice_recompute_activation]: 2.04999e-06 [micro_interleaved_order_control]: 2.54001e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 9.09989e-07 [remove_cast_before_assign_add]: 9.17001e-06 [full_micro_interleaved_order_control]: 1.007e-05 [reorder_send_recv_between_fp_bp]: 2.63e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.33002e-06 [interleave_parallel_branches]: 8.18999e-06 [overlap_opt_shard_in_pipeline]: 2.022e-05 [overlap_opt_shard_grad_in_pipeline]: 1.92001e-06 [control_data_broadcast_order]: 2.956e-05 [grouped_pairwise_exchange_alltoall]: 1.43002e-06 [offloading_packed_experts]: 7.84002e-06 [overlap_recompute_and_grad_model_parallel]: 1.548e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.59e-06 [overlap_recompute_comm]: 2.34001e-06 [overlap_grad_ring_attention]: 2.245e-05 [overlap_grad_flash_sp]: 6.122e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 9.82001e-06 [split_layernorm_comm]: 2.00002e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 0.00015565, [1] [Cycle 1]: 0.00014994, [6] [build]: 2.359e-05 [elim_shapecalc]: 2.448e-05 [elim_not_effective]: 2.789e-05 [opt_reshape]: 1.644e-05 [fold_const_symbol]: 2.499e-05 [renormalize]: 5.00004e-07 [detach_backward]: 2.01e-06 [pipeline_parallel_scheduler]: 1.40001e-06 [auto_monad_reorder]: 4.305e-05 [get_jit_bprop_graph]: 1.99999e-06 [rewriter_after_jit_bprop_graph]: 4.65001e-06 [opt_after_jit_grad]: 0.0006093 [validate]: 9.488e-05 [backend_pass]: 9.5999e-07 [task_emit]: 4.62095 [execute]: 1.088e-05 Sums bootstrap : 0.000868s : 0.02% type_inference : 0.149318s : 3.12% event_method : 0.000314s : 0.01% auto_monad : 0.000362s : 0.01% graph_reusing : 0.000012s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000087s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000025s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000099s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000507s : 0.01% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000271s : 0.01% optimize.opt_a.loop_unroll : 0.000101s : 0.00% optimize.opt_a.a_1 : 0.002765s : 0.06% optimize.opt_a.with_stream_mark : 0.000037s : 0.00% optimize.opt_a.recompute_prepare : 0.000036s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000055s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000015s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000468s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000067s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000639s : 0.01% optimize.opt_a.shard_inline : 0.000047s : 0.00% optimize.opt_a.merge_send_recv : 0.000071s : 0.00% optimize.opt_a.auto_parallel : 0.000033s : 0.00% optimize.opt_a.parallel : 0.000110s : 0.00% optimize.opt_a.flash_sp : 0.000040s : 0.00% optimize.opt_a.merge_comm : 0.000019s : 0.00% optimize.opt_a.allreduce_fusion : 0.000026s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000037s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000036s : 0.00% optimize.opt_a.virtual_dataset : 0.000031s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000030s : 0.00% optimize.opt_a.virtual_output : 0.000030s : 0.00% optimize.opt_a.merge_forward : 0.000016s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000045s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000060s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000050s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000026s : 0.00% optimize.opt_a.meta_fg_expand : 0.000015s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000020s : 0.00% optimize.opt_a.after_resolve : 0.000043s : 0.00% optimize.opt_a.a_after_grad : 0.000046s : 0.00% optimize.opt_a.renormalize : 0.002923s : 0.06% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000063s : 0.00% optimize.opt_a.cse : 0.000310s : 0.01% optimize.opt_a.a_3 : 0.000211s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000063s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000782s : 0.02% optimize.opt_b.b_1 : 0.000399s : 0.01% optimize.opt_b.b_2 : 0.000017s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000068s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000043s : 0.00% optimize.overlap_param_gather : 0.000010s : 0.00% optimize.cconv : 0.000031s : 0.00% optimize.loop_unroll : 0.000644s : 0.01% optimize.opt_after_cconv.c_1 : 0.000094s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.cse : 0.000058s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000070s : 0.00% optimize.tuple_transform.d_1 : 0.000153s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000016s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000118s : 0.00% optimize.cse_after_recomputation.cse : 0.000040s : 0.00% optimize.environ_conv : 0.000030s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000033s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000020s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000030s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000015s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000022s : 0.00% optimize.overlap_grad_flash_sp : 0.000061s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000024s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000024s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000028s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000016s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000025s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000043s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000609s : 0.01% validate : 0.000095s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 4.620947s : 96.57% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000920 239 0.89% : 0.000008s : 2: substitution.depend_value_elim 0.43% : 0.000004s : 9: substitution.elim_not_effective 1.41% : 0.000013s : 9: substitution.float_tuple_getitem_switch 0.40% : 0.000004s : 9: substitution.fold_const_symbol 1.27% : 0.000012s : 12: substitution.graph_param_transform 56.48% : 0.000520s : 18: substitution.inline 0.94% : 0.000009s : 18: substitution.j_node_and_user_rematch 3.01% : 0.000028s : 3: substitution.less_batch_normalization 1.52% : 0.000014s : 10: substitution.minmaximum_grad 1.39% : 0.000013s : 18: substitution.remove_not_recompute_node 0.64% : 0.000006s : 6: substitution.replace_old_param 2.59% : 0.000024s : 5: substitution.switch_simplify 4.84% : 0.000045s : 16: substitution.tuple_list_convert_item_index_to_positive 3.56% : 0.000033s : 18: substitution.tuple_list_get_item_const_eliminator 4.65% : 0.000043s : 18: substitution.tuple_list_get_item_depend_reorder 7.62% : 0.000070s : 28: substitution.tuple_list_get_item_eliminator 3.51% : 0.000032s : 18: substitution.tuple_list_get_set_item_eliminator 2.01% : 0.000018s : 10: substitution.updatestate_pure_node_eliminater 2.84% : 0.000026s : 12: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.149162 2 97.21% : 0.145001s : 1: type_inference.infer 2.79% : 0.004161s : 1: type_inference.specialize ------[replace.] 0.000267 29 55.94% : 0.000149s : 18: replace.inline 25.63% : 0.000068s : 5: replace.switch_simplify 7.56% : 0.000020s : 2: replace.tuple_list_get_item_depend_reorder 10.87% : 0.000029s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000553 29 92.04% : 0.000509s : 18: match.inline 3.74% : 0.000021s : 5: match.switch_simplify 2.86% : 0.000016s : 2: match.tuple_list_get_item_depend_reorder 1.35% : 0.000007s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000665 4327 1.06% : 0.000007s : 50: predicate.accumulaten_eliminater 0.59% : 0.000004s : 12: predicate.ad_related_special_op_eliminate 0.50% : 0.000003s : 24: predicate.addn_check_dump 1.06% : 0.000007s : 50: predicate.addn_zero_filter 1.04% : 0.000007s : 50: predicate.adjust_all_reduce_mul_add 2.23% : 0.000015s : 74: predicate.arithmetic_simplify 1.07% : 0.000007s : 50: predicate.cast_eliminate 0.53% : 0.000004s : 24: predicate.check_bprop_eliminate 0.51% : 0.000003s : 24: predicate.compare_switch_simplify 0.14% : 0.000001s : 12: predicate.const_output_eliminate 0.55% : 0.000004s : 24: predicate.depend_value_elim 1.08% : 0.000007s : 50: predicate.dict_get_item_const_eliminator 1.35% : 0.000009s : 50: predicate.dict_get_item_eliminator 1.02% : 0.000007s : 50: predicate.dict_set_item_eliminator 0.63% : 0.000004s : 24: predicate.dumpgradient_eliminate 0.16% : 0.000001s : 12: predicate.elim_not_effective 0.38% : 0.000003s : 12: predicate.elim_shapecalc_of_broadcastargs 1.33% : 0.000009s : 62: predicate.environ_add_const_eliminate 1.26% : 0.000008s : 62: predicate.environ_get_add_eliminate 1.29% : 0.000009s : 62: predicate.environ_get_depend_swap 1.79% : 0.000012s : 86: predicate.environ_get_eliminate 1.24% : 0.000008s : 62: predicate.environ_get_set_eliminate 1.52% : 0.000010s : 74: predicate.exchange_switch_depend_value 2.18% : 0.000014s : 74: predicate.float_depend_g_call 0.52% : 0.000003s : 24: predicate.float_environ_get_switch 0.83% : 0.000006s : 36: predicate.float_tuple_getitem_switch 0.14% : 0.000001s : 12: predicate.fold_const_symbol 0.63% : 0.000004s : 24: predicate.get_grad_eliminate 0.16% : 0.000001s : 12: predicate.graph_param_transform 0.48% : 0.000003s : 24: predicate.incorporate_call 0.45% : 0.000003s : 24: predicate.incorporate_call_switch 5.78% : 0.000038s : 196: predicate.inline 0.64% : 0.000004s : 24: predicate.inline_without_move 0.26% : 0.000002s : 24: predicate.j_node_and_user_rematch 0.93% : 0.000006s : 26: predicate.less_batch_normalization 1.72% : 0.000011s : 80: predicate.list_to_tuple_eliminator_ 2.61% : 0.000017s : 130: predicate.load_eliminater 0.68% : 0.000005s : 12: predicate.loop_unroll_after_grad 2.41% : 0.000016s : 110: predicate.loop_unroll_before_grad 1.73% : 0.000011s : 76: predicate.make_slice_get_slice_eliminator 0.55% : 0.000004s : 24: predicate.merge_addn 0.55% : 0.000004s : 24: predicate.micro_step_allgather_replace 0.58% : 0.000004s : 24: predicate.mini_step_allgather_replace 1.01% : 0.000007s : 50: predicate.minmaximum_grad 0.68% : 0.000005s : 12: predicate.mutable_eliminate 0.32% : 0.000002s : 12: predicate.opt_reshape 0.30% : 0.000002s : 12: predicate.parallel_virtual_node 2.13% : 0.000014s : 74: predicate.partial_defer_inline 1.51% : 0.000010s : 68: predicate.partial_eliminate 1.06% : 0.000007s : 50: predicate.print_const_string_wrapper 0.56% : 0.000004s : 24: predicate.reduce_all_const_elim 1.39% : 0.000009s : 50: predicate.reduce_eliminate 2.70% : 0.000018s : 130: predicate.redundant_stop_gradient_eliminater 0.30% : 0.000002s : 24: predicate.remove_not_recompute_node 1.16% : 0.000008s : 80: predicate.replace_applicator 0.30% : 0.000002s : 24: predicate.replace_old_param 0.16% : 0.000001s : 12: predicate.reset_defer_inline 1.11% : 0.000007s : 50: predicate.reshape_eliminate 0.55% : 0.000004s : 24: predicate.row_tensor_add_zeros_like 0.33% : 0.000002s : 12: predicate.row_tensor_eliminate 0.65% : 0.000004s : 24: predicate.same_eliminate 0.34% : 0.000002s : 27: predicate.set_cell_output_no_recompute 0.58% : 0.000004s : 24: predicate.shard_identity_eliminate 0.63% : 0.000004s : 24: predicate.special_op_eliminate 0.61% : 0.000004s : 24: predicate.specialize_transform 0.59% : 0.000004s : 24: predicate.split_environ_get_set_with_tuple_value 0.67% : 0.000004s : 24: predicate.stack_unstack_eliminate 0.28% : 0.000002s : 12: predicate.switch_call_monad_eliminater 1.73% : 0.000012s : 74: predicate.switch_defer_inline 2.18% : 0.000014s : 98: predicate.switch_layer_defer_inline 5.31% : 0.000035s : 230: predicate.switch_simplify 1.09% : 0.000007s : 50: predicate.tile_eliminate 1.04% : 0.000007s : 50: predicate.transpose_eliminate 1.76% : 0.000012s : 74: predicate.tuple_list_convert_item_index_to_positive 1.83% : 0.000012s : 76: predicate.tuple_list_get_item_const_eliminator 1.68% : 0.000011s : 76: predicate.tuple_list_get_item_depend_reorder 3.05% : 0.000020s : 104: predicate.tuple_list_get_item_eliminator 1.77% : 0.000012s : 76: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000016s : 100: predicate.tuple_list_set_item_eliminator 1.87% : 0.000012s : 80: predicate.tuple_to_list_eliminator_ 2.67% : 0.000018s : 130: predicate.updatestate_pure_node_eliminater 3.29% : 0.000022s : 154: predicate.updatestate_useless_node_eliminater 0.35% : 0.000002s : 12: predicate.value_based_eliminate 0.66% : 0.000004s : 24: predicate.virtual_dataset_eliminate 0.59% : 0.000004s : 24: predicate.virtual_output_eliminate 0.23% : 0.000002s : 12: predicate.virtual_view_grad_eliminate 0.32% : 0.000002s : 12: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003853 41 68.31% : 0.002632s : 21: func_graph_cloner_run.FuncGraphClonerGraph 31.69% : 0.001221s : 20: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 4.828015 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.21% : 0.010023s : 1: add_attr 0.21% : 0.010002s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000123s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000376s : 1: auto_monad 0.00% : 0.000048s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.02% : 0.000944s : 1: bootstrap 0.00% : 0.000035s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000033s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000055s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000034s : 1: environ_conv 0.01% : 0.000328s : 1: event_method 0.00% : 0.000027s : 1: execute 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000017s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.01% : 0.000657s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000797s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.00% : 0.000035s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000032s : 1: opt.transform.mutable_eliminate 0.09% : 0.004189s : 78: opt.transform.opt_a 0.00% : 0.000092s : 1: opt.transform.opt_after_cconv 0.00% : 0.000054s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000384s : 28: opt.transform.opt_b 0.00% : 0.000166s : 2: opt.transform.opt_trans_graph 0.00% : 0.000089s : 4: opt.transform.symbol_engine_opt 0.19% : 0.009388s : 1: opt_a 0.00% : 0.000233s : 1: opt_after_cconv 0.01% : 0.000620s : 1: opt_after_jit_grad 0.01% : 0.000563s : 1: opt_b 0.28% : 0.013446s : 1: optimize 0.00% : 0.000047s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000065s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000027s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000024s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000019s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000104s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000075s : 1: remove_dup_value 0.03% : 0.001591s : 1: renormalize.infer 0.03% : 0.001321s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000067s : 1: rewriter_after_opt_a 0.01% : 0.000516s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000037s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000159s : 1: symbol_engine_optimizer 95.72% : 4.621201s : 1: task_emit 0.00% : 0.000201s : 1: tuple_transform 3.09% : 0.149344s : 1: type_inference 0.00% : 0.000137s : 1: validate TotalTime = 5.83247, [24] [bootstrap]: 0.00095252 [type_inference]: 0.292126 [event_method]: 0.00030934 [auto_monad]: 0.00038096 [graph_reusing]: 1.208e-05 [inline]: 2.32999e-06 [add_attr]: 0.0206502, [1] [add_attr_with_inline]: 0.020633, [1] [Cycle 1]: 0.00020526, [2] [tag_attr]: 8.198e-05 [meta_addattr_fg_expand]: 2.58e-05 [parallel-infer-symbol]: 3.67002e-06 [pre_auto_parallel]: 0.00010807 [insert-virtual-dataset]: 2.54001e-06 [parallel-infer-symbol-second]: 7.90023e-07 [dataset_repeat_opt]: 2.34999e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.0297538, [53] [py_interpret_to_execute]: 6.09999e-06 [rewriter_before_opt_a]: 0.00053268 [opt_a]: 0.0256302, [2] [Cycle 1]: 0.0240803, [45] [expand_dump_flag]: 8.23001e-06 [switch_simplify]: 0.00026412 [loop_unroll]: 9.664e-05 [a_1]: 0.00255646 [with_stream_mark]: 2.839e-05 [recompute_prepare]: 2.344e-05 [updatestate_depend_eliminate]: 4.671e-05 [updatestate_assign_eliminate]: 1.008e-05 [updatestate_loads_eliminate]: 8.38999e-06 [parameter_eliminate]: 2.07999e-06 [a_2]: 0.00024487 [accelerated_algorithm]: 5.057e-05 [shard]: 2.11e-06 [meta_shard_fg_expand]: 5.28002e-06 [shard_inline]: 1.624e-05 [merge_send_recv]: 5.235e-05 [auto_parallel]: 1.405e-05 [parallel]: 9.461e-05 [flash_sp]: 4.097e-05 [merge_comm]: 1.104e-05 [allreduce_fusion]: 1.731e-05 [matmul_add_comm_reduction]: 2.536e-05 [allreduce_slice_to_reducescatter]: 9.86e-06 [virtual_shard_identity]: 1.93e-05 [virtual_dataset]: 1.52e-05 [get_grad_eliminate_]: 1.452e-05 [virtual_output]: 1.417e-05 [merge_forward]: 8.45001e-06 [cell_reuse_recompute_pass]: 1.99999e-06 [offload_activation]: 2.732e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.05e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 2.684e-05 [set_forward_comm_id_for_comm_node_pass]: 1.88e-05 [meta_fg_expand]: 8.1e-06 [flash_sp_send_recv_attached]: 3.04001e-06 [receive_attached]: 1.998e-05 [after_resolve]: 2.339e-05 [a_after_grad]: 2.344e-05 [renormalize]: 0.0192934 [add_forward_monad_depend]: 1.475e-05 [auto_monad_grad]: 2.96001e-06 [auto_monad_eliminator]: 7.071e-05 [cse]: 0.00032815 [a_3]: 0.00013042 [Cycle 2]: 0.00153374, [45] [expand_dump_flag]: 3.34001e-06 [switch_simplify]: 1.858e-05 [loop_unroll]: 1.537e-05 [a_1]: 0.00045514 [with_stream_mark]: 2.529e-05 [recompute_prepare]: 1.549e-05 [updatestate_depend_eliminate]: 1.08e-05 [updatestate_assign_eliminate]: 8.75999e-06 [updatestate_loads_eliminate]: 7.97e-06 [parameter_eliminate]: 2.58998e-06 [a_2]: 0.00022688 [accelerated_algorithm]: 1.97e-05 [shard]: 2.14e-06 [meta_shard_fg_expand]: 4.35e-06 [shard_inline]: 1.439e-05 [merge_send_recv]: 1.437e-05 [auto_parallel]: 1.54e-05 [parallel]: 1.085e-05 [flash_sp]: 4.2e-06 [merge_comm]: 8.64998e-06 [allreduce_fusion]: 8.82999e-06 [matmul_add_comm_reduction]: 1.58e-05 [allreduce_slice_to_reducescatter]: 9.5999e-07 [virtual_shard_identity]: 1.598e-05 [virtual_dataset]: 1.412e-05 [get_grad_eliminate_]: 1.404e-05 [virtual_output]: 1.427e-05 [merge_forward]: 8.76002e-06 [cell_reuse_recompute_pass]: 3.64002e-06 [offload_activation]: 1.912e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.814e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 2.409e-05 [set_forward_comm_id_for_comm_node_pass]: 8.99e-06 [meta_fg_expand]: 6.91001e-06 [flash_sp_send_recv_attached]: 1.60999e-06 [receive_attached]: 2.22999e-06 [after_resolve]: 2.301e-05 [a_after_grad]: 2.288e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.17001e-06 [auto_monad_grad]: 1.79998e-06 [auto_monad_eliminator]: 2.216e-05 [cse]: 5.774e-05 [a_3]: 0.00010232 [py_interpret_to_execute_after_opt_a]: 1.334e-05 [slice_cell_reuse_recomputed_activation]: 2.09e-06 [rewriter_after_opt_a]: 6.515e-05 [convert_after_rewriter]: 1.44e-06 [order_py_execute_after_rewriter]: 1.52999e-06 [mutable_eliminate]: 0.00088574 [opt_b]: 0.00057331, [1] [Cycle 1]: 0.00056517, [7] [b_1]: 0.00040752 [b_2]: 1.747e-05 [updatestate_depend_eliminate]: 1.475e-05 [updatestate_assign_eliminate]: 8.11002e-06 [updatestate_loads_eliminate]: 7.81001e-06 [renormalize]: 6.80011e-07 [cse]: 6.842e-05 [optimize_parallel_all_gather_comm]: 5.225e-05 [overlap_param_gather]: 1.16e-05 [cconv]: 3.79e-05 [loop_unroll]: 0.00053048 [opt_after_cconv]: 0.00021829, [1] [Cycle 1]: 0.00021154, [7] [c_1]: 8.934e-05 [parameter_eliminate]: 2.91e-06 [updatestate_depend_eliminate]: 1.234e-05 [updatestate_assign_eliminate]: 7.41001e-06 [updatestate_loads_eliminate]: 7.6e-06 [cse]: 5.564e-05 [renormalize]: 5.50004e-07 [remove_dup_value]: 7.305e-05 [tuple_transform]: 0.00019086, [1] [Cycle 1]: 0.00018564, [4] [d_1]: 0.00014583 [none_parameter_eliminate]: 2.24999e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 1.654e-05 [partial_unused_args_eliminate]: 1.86998e-06 [add_recomputation]: 0.00012279 [cse_after_recomputation]: 4.948e-05, [1] [Cycle 1]: 4.447e-05, [1] [cse]: 3.775e-05 [environ_conv]: 3.12e-05 [swap_dp_allreduce_reducescatter]: 3.237e-05 [bias_add_comm_swap]: 1.139e-05 [label_micro_interleaved_index]: 1.475e-05 [label_fine_grained_interleaved_index]: 2.62001e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 1.89e-06 [micro_interleaved_order_control]: 2.13002e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 1.20001e-06 [remove_cast_before_assign_add]: 1.025e-05 [full_micro_interleaved_order_control]: 1.086e-05 [reorder_send_recv_between_fp_bp]: 2.68e-06 [comm_op_add_attrs]: 1.17e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.07e-06 [interleave_parallel_branches]: 9.19e-06 [overlap_opt_shard_in_pipeline]: 3.053e-05 [overlap_opt_shard_grad_in_pipeline]: 1.83997e-06 [control_data_broadcast_order]: 2.739e-05 [grouped_pairwise_exchange_alltoall]: 1.57999e-06 [offloading_packed_experts]: 7.5e-06 [overlap_recompute_and_grad_model_parallel]: 1.675e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.12e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.53998e-06 [overlap_grad_ring_attention]: 2.339e-05 [overlap_grad_flash_sp]: 6.662e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 1.062e-05 [split_layernorm_comm]: 1.65001e-06 [handle_group_info]: 1.03001e-06 [symbol_engine_optimizer]: 0.00016433, [1] [Cycle 1]: 0.0001584, [6] [build]: 2.717e-05 [elim_shapecalc]: 2.391e-05 [elim_not_effective]: 3.098e-05 [opt_reshape]: 1.739e-05 [fold_const_symbol]: 2.526e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.24999e-06 [pipeline_parallel_scheduler]: 1.57999e-06 [auto_monad_reorder]: 4.689e-05 [get_jit_bprop_graph]: 2.16998e-06 [rewriter_after_jit_bprop_graph]: 5.05999e-06 [opt_after_jit_grad]: 0.00057584 [validate]: 9.416e-05 [backend_pass]: 1.06002e-06 [task_emit]: 5.48675 [execute]: 9.66e-06 Sums bootstrap : 0.000953s : 0.02% type_inference : 0.292126s : 5.03% event_method : 0.000309s : 0.01% auto_monad : 0.000381s : 0.01% graph_reusing : 0.000012s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000082s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000026s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000108s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000533s : 0.01% optimize.opt_a.expand_dump_flag : 0.000012s : 0.00% optimize.opt_a.switch_simplify : 0.000283s : 0.00% optimize.opt_a.loop_unroll : 0.000112s : 0.00% optimize.opt_a.a_1 : 0.003012s : 0.05% optimize.opt_a.with_stream_mark : 0.000054s : 0.00% optimize.opt_a.recompute_prepare : 0.000039s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000058s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000016s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000472s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000070s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000010s : 0.00% optimize.opt_a.shard_inline : 0.000031s : 0.00% optimize.opt_a.merge_send_recv : 0.000067s : 0.00% optimize.opt_a.auto_parallel : 0.000029s : 0.00% optimize.opt_a.parallel : 0.000105s : 0.00% optimize.opt_a.flash_sp : 0.000045s : 0.00% optimize.opt_a.merge_comm : 0.000020s : 0.00% optimize.opt_a.allreduce_fusion : 0.000026s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000041s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000011s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000035s : 0.00% optimize.opt_a.virtual_dataset : 0.000029s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000029s : 0.00% optimize.opt_a.virtual_output : 0.000028s : 0.00% optimize.opt_a.merge_forward : 0.000017s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000046s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000059s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000051s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000028s : 0.00% optimize.opt_a.meta_fg_expand : 0.000015s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000022s : 0.00% optimize.opt_a.after_resolve : 0.000046s : 0.00% optimize.opt_a.a_after_grad : 0.000046s : 0.00% optimize.opt_a.renormalize : 0.019293s : 0.33% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.00% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000093s : 0.00% optimize.opt_a.cse : 0.000386s : 0.01% optimize.opt_a.a_3 : 0.000233s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000065s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000886s : 0.02% optimize.opt_b.b_1 : 0.000408s : 0.01% optimize.opt_b.b_2 : 0.000017s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000068s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000052s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000038s : 0.00% optimize.loop_unroll : 0.000530s : 0.01% optimize.opt_after_cconv.c_1 : 0.000089s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.cse : 0.000056s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000073s : 0.00% optimize.tuple_transform.d_1 : 0.000146s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000017s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000123s : 0.00% optimize.cse_after_recomputation.cse : 0.000038s : 0.00% optimize.environ_conv : 0.000031s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000032s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000015s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000031s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000027s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000017s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000023s : 0.00% optimize.overlap_grad_flash_sp : 0.000067s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000027s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000024s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000031s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000017s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000025s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000047s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000576s : 0.01% validate : 0.000094s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 5.486754s : 94.43% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.001009 242 1.03% : 0.000010s : 2: substitution.depend_value_elim 0.43% : 0.000004s : 9: substitution.elim_not_effective 1.27% : 0.000013s : 9: substitution.float_tuple_getitem_switch 0.34% : 0.000003s : 9: substitution.fold_const_symbol 1.10% : 0.000011s : 12: substitution.graph_param_transform 58.31% : 0.000588s : 21: substitution.inline 0.95% : 0.000010s : 18: substitution.j_node_and_user_rematch 2.93% : 0.000030s : 3: substitution.less_batch_normalization 1.57% : 0.000016s : 10: substitution.minmaximum_grad 1.30% : 0.000013s : 18: substitution.remove_not_recompute_node 0.86% : 0.000009s : 6: substitution.replace_old_param 2.57% : 0.000026s : 5: substitution.switch_simplify 4.44% : 0.000045s : 16: substitution.tuple_list_convert_item_index_to_positive 3.31% : 0.000033s : 18: substitution.tuple_list_get_item_const_eliminator 4.32% : 0.000044s : 18: substitution.tuple_list_get_item_depend_reorder 7.51% : 0.000076s : 28: substitution.tuple_list_get_item_eliminator 3.24% : 0.000033s : 18: substitution.tuple_list_get_set_item_eliminator 2.02% : 0.000020s : 10: substitution.updatestate_pure_node_eliminater 2.50% : 0.000025s : 12: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.291985 2 98.61% : 0.287936s : 1: type_inference.infer 1.39% : 0.004049s : 1: type_inference.specialize ------[replace.] 0.000283 32 57.98% : 0.000164s : 21: replace.inline 22.98% : 0.000065s : 5: replace.switch_simplify 7.60% : 0.000021s : 2: replace.tuple_list_get_item_depend_reorder 11.44% : 0.000032s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000623 32 92.52% : 0.000576s : 21: match.inline 3.71% : 0.000023s : 5: match.switch_simplify 2.66% : 0.000017s : 2: match.tuple_list_get_item_depend_reorder 1.12% : 0.000007s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000687 4512 1.19% : 0.000008s : 53: predicate.accumulaten_eliminater 0.50% : 0.000003s : 12: predicate.ad_related_special_op_eliminate 0.50% : 0.000003s : 24: predicate.addn_check_dump 1.13% : 0.000008s : 53: predicate.addn_zero_filter 0.99% : 0.000007s : 53: predicate.adjust_all_reduce_mul_add 2.18% : 0.000015s : 77: predicate.arithmetic_simplify 1.02% : 0.000007s : 53: predicate.cast_eliminate 0.51% : 0.000004s : 24: predicate.check_bprop_eliminate 0.47% : 0.000003s : 24: predicate.compare_switch_simplify 0.14% : 0.000001s : 12: predicate.const_output_eliminate 0.55% : 0.000004s : 24: predicate.depend_value_elim 1.17% : 0.000008s : 53: predicate.dict_get_item_const_eliminator 1.29% : 0.000009s : 53: predicate.dict_get_item_eliminator 1.11% : 0.000008s : 53: predicate.dict_set_item_eliminator 0.59% : 0.000004s : 24: predicate.dumpgradient_eliminate 0.16% : 0.000001s : 12: predicate.elim_not_effective 0.35% : 0.000002s : 12: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000009s : 65: predicate.environ_add_const_eliminate 1.22% : 0.000008s : 65: predicate.environ_get_add_eliminate 1.29% : 0.000009s : 65: predicate.environ_get_depend_swap 1.76% : 0.000012s : 89: predicate.environ_get_eliminate 1.28% : 0.000009s : 65: predicate.environ_get_set_eliminate 1.62% : 0.000011s : 80: predicate.exchange_switch_depend_value 2.35% : 0.000016s : 80: predicate.float_depend_g_call 0.48% : 0.000003s : 24: predicate.float_environ_get_switch 0.76% : 0.000005s : 36: predicate.float_tuple_getitem_switch 0.14% : 0.000001s : 12: predicate.fold_const_symbol 0.53% : 0.000004s : 24: predicate.get_grad_eliminate 0.18% : 0.000001s : 12: predicate.graph_param_transform 0.46% : 0.000003s : 24: predicate.incorporate_call 0.43% : 0.000003s : 24: predicate.incorporate_call_switch 5.72% : 0.000039s : 205: predicate.inline 0.63% : 0.000004s : 24: predicate.inline_without_move 0.26% : 0.000002s : 24: predicate.j_node_and_user_rematch 0.76% : 0.000005s : 26: predicate.less_batch_normalization 1.79% : 0.000012s : 83: predicate.list_to_tuple_eliminator_ 2.66% : 0.000018s : 136: predicate.load_eliminater 0.72% : 0.000005s : 12: predicate.loop_unroll_after_grad 2.60% : 0.000018s : 123: predicate.loop_unroll_before_grad 1.69% : 0.000012s : 79: predicate.make_slice_get_slice_eliminator 0.51% : 0.000003s : 24: predicate.merge_addn 0.47% : 0.000003s : 24: predicate.micro_step_allgather_replace 0.48% : 0.000003s : 24: predicate.mini_step_allgather_replace 1.06% : 0.000007s : 53: predicate.minmaximum_grad 0.69% : 0.000005s : 12: predicate.mutable_eliminate 0.35% : 0.000002s : 12: predicate.opt_reshape 0.31% : 0.000002s : 12: predicate.parallel_virtual_node 2.17% : 0.000015s : 80: predicate.partial_defer_inline 1.54% : 0.000011s : 71: predicate.partial_eliminate 1.07% : 0.000007s : 53: predicate.print_const_string_wrapper 0.52% : 0.000004s : 24: predicate.reduce_all_const_elim 1.35% : 0.000009s : 53: predicate.reduce_eliminate 2.65% : 0.000018s : 136: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000002s : 24: predicate.remove_not_recompute_node 1.26% : 0.000009s : 83: predicate.replace_applicator 0.35% : 0.000002s : 24: predicate.replace_old_param 0.15% : 0.000001s : 12: predicate.reset_defer_inline 1.06% : 0.000007s : 53: predicate.reshape_eliminate 0.54% : 0.000004s : 24: predicate.row_tensor_add_zeros_like 0.32% : 0.000002s : 12: predicate.row_tensor_eliminate 0.70% : 0.000005s : 24: predicate.same_eliminate 0.35% : 0.000002s : 27: predicate.set_cell_output_no_recompute 0.59% : 0.000004s : 24: predicate.shard_identity_eliminate 0.61% : 0.000004s : 24: predicate.special_op_eliminate 0.57% : 0.000004s : 24: predicate.specialize_transform 0.71% : 0.000005s : 24: predicate.split_environ_get_set_with_tuple_value 0.63% : 0.000004s : 24: predicate.stack_unstack_eliminate 0.27% : 0.000002s : 12: predicate.switch_call_monad_eliminater 1.77% : 0.000012s : 80: predicate.switch_defer_inline 2.25% : 0.000015s : 104: predicate.switch_layer_defer_inline 5.63% : 0.000039s : 249: predicate.switch_simplify 1.08% : 0.000007s : 53: predicate.tile_eliminate 1.12% : 0.000008s : 53: predicate.transpose_eliminate 1.78% : 0.000012s : 77: predicate.tuple_list_convert_item_index_to_positive 1.80% : 0.000012s : 79: predicate.tuple_list_get_item_const_eliminator 1.70% : 0.000012s : 79: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000022s : 107: predicate.tuple_list_get_item_eliminator 1.71% : 0.000012s : 79: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000016s : 103: predicate.tuple_list_set_item_eliminator 1.79% : 0.000012s : 83: predicate.tuple_to_list_eliminator_ 2.59% : 0.000018s : 136: predicate.updatestate_pure_node_eliminater 3.30% : 0.000023s : 160: predicate.updatestate_useless_node_eliminater 0.29% : 0.000002s : 12: predicate.value_based_eliminate 0.56% : 0.000004s : 24: predicate.virtual_dataset_eliminate 0.53% : 0.000004s : 24: predicate.virtual_output_eliminate 0.25% : 0.000002s : 12: predicate.virtual_view_grad_eliminate 0.36% : 0.000002s : 12: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003783 43 67.69% : 0.002561s : 20: func_graph_cloner_run.FuncGraphClonerGraph 32.31% : 0.001222s : 23: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 5.907165 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.35% : 0.020657s : 1: add_attr 0.35% : 0.020638s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000128s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000394s : 1: auto_monad 0.00% : 0.000051s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.02% : 0.001000s : 1: bootstrap 0.00% : 0.000041s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000031s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000053s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000035s : 1: environ_conv 0.01% : 0.000322s : 1: event_method 0.00% : 0.000036s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000017s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000012s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000018s : 1: label_micro_interleaved_index 0.01% : 0.000540s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000896s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000027s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000034s : 1: opt.transform.mutable_eliminate 0.08% : 0.004461s : 78: opt.transform.opt_a 0.00% : 0.000088s : 1: opt.transform.opt_after_cconv 0.00% : 0.000053s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000393s : 28: opt.transform.opt_b 0.00% : 0.000160s : 2: opt.transform.opt_trans_graph 0.00% : 0.000093s : 4: opt.transform.symbol_engine_opt 0.43% : 0.025634s : 1: opt_a 0.00% : 0.000222s : 1: opt_after_cconv 0.01% : 0.000586s : 1: opt_after_jit_grad 0.01% : 0.000577s : 1: opt_b 0.50% : 0.029759s : 1: optimize 0.00% : 0.000057s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000071s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000027s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000034s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000020s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000113s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.00% : 0.000079s : 1: remove_dup_value 0.03% : 0.001633s : 1: renormalize.infer 0.30% : 0.017642s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000070s : 1: rewriter_after_opt_a 0.01% : 0.000543s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000036s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000167s : 1: symbol_engine_optimizer 92.89% : 5.486963s : 1: task_emit 0.00% : 0.000194s : 1: tuple_transform 4.95% : 0.292148s : 1: type_inference 0.00% : 0.000142s : 1: validate TotalTime = 5.96286, [24] [bootstrap]: 0.00097732 [type_inference]: 0.153346 [event_method]: 0.00031561 [auto_monad]: 0.00039164 [graph_reusing]: 1.276e-05 [inline]: 3.61001e-06 [add_attr]: 0.00858036, [1] [add_attr_with_inline]: 0.00856082, [1] [Cycle 1]: 0.00019997, [2] [tag_attr]: 7.989e-05 [meta_addattr_fg_expand]: 2.632e-05 [parallel-infer-symbol]: 4.82e-06 [pre_auto_parallel]: 0.00010597 [insert-virtual-dataset]: 3.21001e-06 [parallel-infer-symbol-second]: 9.79984e-07 [dataset_repeat_opt]: 2.32999e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.0141252, [53] [py_interpret_to_execute]: 5.34e-06 [rewriter_before_opt_a]: 0.00051372 [opt_a]: 0.00982618, [2] [Cycle 1]: 0.00800091, [45] [expand_dump_flag]: 8e-06 [switch_simplify]: 0.00045158 [loop_unroll]: 0.0001045 [a_1]: 0.00264581 [with_stream_mark]: 3.528e-05 [recompute_prepare]: 2.744e-05 [updatestate_depend_eliminate]: 5.676e-05 [updatestate_assign_eliminate]: 9.46e-06 [updatestate_loads_eliminate]: 9.62001e-06 [parameter_eliminate]: 2.49999e-06 [a_2]: 0.00025683 [accelerated_algorithm]: 5.291e-05 [shard]: 1.87999e-06 [meta_shard_fg_expand]: 5.78002e-06 [shard_inline]: 1.735e-05 [merge_send_recv]: 4.736e-05 [auto_parallel]: 1.671e-05 [parallel]: 9.566e-05 [flash_sp]: 6.985e-05 [merge_comm]: 1.304e-05 [allreduce_fusion]: 1.784e-05 [matmul_add_comm_reduction]: 2.463e-05 [allreduce_slice_to_reducescatter]: 8.10999e-06 [virtual_shard_identity]: 2.314e-05 [virtual_dataset]: 1.486e-05 [get_grad_eliminate_]: 1.46e-05 [virtual_output]: 1.456e-05 [merge_forward]: 9.14e-06 [cell_reuse_recompute_pass]: 2.38002e-06 [offload_activation]: 2.64e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.302e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 2.56e-05 [set_forward_comm_id_for_comm_node_pass]: 1.665e-05 [meta_fg_expand]: 9.45001e-06 [flash_sp_send_recv_attached]: 3.00002e-06 [receive_attached]: 1.786e-05 [after_resolve]: 2.242e-05 [a_after_grad]: 2.4e-05 [renormalize]: 0.00305853 [add_forward_monad_depend]: 9.76e-06 [auto_monad_grad]: 2.89999e-06 [auto_monad_eliminator]: 5.083e-05 [cse]: 0.00015911 [a_3]: 0.00012257 [Cycle 2]: 0.00181072, [45] [expand_dump_flag]: 2.88e-06 [switch_simplify]: 1.677e-05 [loop_unroll]: 3.88e-05 [a_1]: 0.00064328 [with_stream_mark]: 2.889e-05 [recompute_prepare]: 1.784e-05 [updatestate_depend_eliminate]: 1.034e-05 [updatestate_assign_eliminate]: 8.80001e-06 [updatestate_loads_eliminate]: 8.95999e-06 [parameter_eliminate]: 1.92999e-06 [a_2]: 0.00023586 [accelerated_algorithm]: 2.17e-05 [shard]: 2.41998e-06 [meta_shard_fg_expand]: 4.51002e-06 [shard_inline]: 1.547e-05 [merge_send_recv]: 1.537e-05 [auto_parallel]: 1.593e-05 [parallel]: 1.04e-05 [flash_sp]: 3.95e-06 [merge_comm]: 1.018e-05 [allreduce_fusion]: 8.55999e-06 [matmul_add_comm_reduction]: 1.725e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 1.883e-05 [virtual_dataset]: 1.478e-05 [get_grad_eliminate_]: 1.466e-05 [virtual_output]: 1.492e-05 [merge_forward]: 8.70001e-06 [cell_reuse_recompute_pass]: 3.37002e-06 [offload_activation]: 1.895e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.886e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 2.453e-05 [set_forward_comm_id_for_comm_node_pass]: 8.90999e-06 [meta_fg_expand]: 7.08998e-06 [flash_sp_send_recv_attached]: 1.64e-06 [receive_attached]: 2.41998e-06 [after_resolve]: 2.502e-05 [a_after_grad]: 2.43e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.37999e-06 [auto_monad_grad]: 3.29001e-06 [auto_monad_eliminator]: 2.703e-05 [cse]: 7.085e-05 [a_3]: 0.00010305 [py_interpret_to_execute_after_opt_a]: 1.323e-05 [slice_cell_reuse_recomputed_activation]: 2.11e-06 [rewriter_after_opt_a]: 6.666e-05 [convert_after_rewriter]: 1.44e-06 [order_py_execute_after_rewriter]: 1.20001e-06 [mutable_eliminate]: 0.00091002 [opt_b]: 0.00057939, [1] [Cycle 1]: 0.00057051, [7] [b_1]: 0.00039913 [b_2]: 1.892e-05 [updatestate_depend_eliminate]: 1.608e-05 [updatestate_assign_eliminate]: 8.47998e-06 [updatestate_loads_eliminate]: 8.1e-06 [renormalize]: 8.2e-07 [cse]: 7.439e-05 [optimize_parallel_all_gather_comm]: 4.38e-05 [overlap_param_gather]: 1.108e-05 [cconv]: 3.907e-05 [loop_unroll]: 0.00061435 [opt_after_cconv]: 0.00024179, [1] [Cycle 1]: 0.00023344, [7] [c_1]: 9.568e-05 [parameter_eliminate]: 5.19e-06 [updatestate_depend_eliminate]: 1.315e-05 [updatestate_assign_eliminate]: 7.81001e-06 [updatestate_loads_eliminate]: 7.84997e-06 [cse]: 6.43e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 7.779e-05 [tuple_transform]: 0.00020377, [1] [Cycle 1]: 0.0001979, [4] [d_1]: 0.00015222 [none_parameter_eliminate]: 2.24999e-06 [renormalize]: 2.89991e-07 [switch_simplify]: 1.714e-05 [partial_unused_args_eliminate]: 2.08998e-06 [add_recomputation]: 0.00014122 [cse_after_recomputation]: 5.592e-05, [1] [Cycle 1]: 5.006e-05, [1] [cse]: 4.221e-05 [environ_conv]: 3.15e-05 [swap_dp_allreduce_reducescatter]: 3.229e-05 [bias_add_comm_swap]: 1.905e-05 [label_micro_interleaved_index]: 1.448e-05 [label_fine_grained_interleaved_index]: 2.53e-06 [merge_cast_opt]: 1.52001e-06 [slice_recompute_activation]: 2.32999e-06 [micro_interleaved_order_control]: 2.34999e-06 [assign_add_opt]: 1.17999e-06 [ForceFp32Comm]: 1.30001e-06 [remove_cast_before_assign_add]: 8.85999e-06 [full_micro_interleaved_order_control]: 1.014e-05 [reorder_send_recv_between_fp_bp]: 2.88e-06 [comm_op_add_attrs]: 1.21997e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 8.47e-06 [overlap_opt_shard_in_pipeline]: 2.787e-05 [overlap_opt_shard_grad_in_pipeline]: 2.16e-06 [control_data_broadcast_order]: 3.147e-05 [grouped_pairwise_exchange_alltoall]: 1.45001e-06 [offloading_packed_experts]: 8.51002e-06 [overlap_recompute_and_grad_model_parallel]: 1.573e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.65001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.09e-06 [overlap_grad_ring_attention]: 2.244e-05 [overlap_grad_flash_sp]: 6.487e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 1.013e-05 [split_layernorm_comm]: 1.69e-06 [handle_group_info]: 1.60999e-06 [symbol_engine_optimizer]: 0.00017146, [1] [Cycle 1]: 0.0001647, [6] [build]: 2.656e-05 [elim_shapecalc]: 2.845e-05 [elim_not_effective]: 2.897e-05 [opt_reshape]: 1.719e-05 [fold_const_symbol]: 2.484e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.09e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 4.824e-05 [get_jit_bprop_graph]: 2.20002e-06 [rewriter_after_jit_bprop_graph]: 7.81001e-06 [opt_after_jit_grad]: 0.00069565 [validate]: 0.00010132 [backend_pass]: 1.19e-06 [task_emit]: 5.78333 [execute]: 1.079e-05 Sums bootstrap : 0.000977s : 0.02% type_inference : 0.153346s : 2.58% event_method : 0.000316s : 0.01% auto_monad : 0.000392s : 0.01% graph_reusing : 0.000013s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000080s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000026s : 0.00% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000106s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000514s : 0.01% optimize.opt_a.expand_dump_flag : 0.000011s : 0.00% optimize.opt_a.switch_simplify : 0.000468s : 0.01% optimize.opt_a.loop_unroll : 0.000143s : 0.00% optimize.opt_a.a_1 : 0.003289s : 0.06% optimize.opt_a.with_stream_mark : 0.000064s : 0.00% optimize.opt_a.recompute_prepare : 0.000045s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000067s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000018s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000019s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000493s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000075s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000010s : 0.00% optimize.opt_a.shard_inline : 0.000033s : 0.00% optimize.opt_a.merge_send_recv : 0.000063s : 0.00% optimize.opt_a.auto_parallel : 0.000033s : 0.00% optimize.opt_a.parallel : 0.000106s : 0.00% optimize.opt_a.flash_sp : 0.000074s : 0.00% optimize.opt_a.merge_comm : 0.000023s : 0.00% optimize.opt_a.allreduce_fusion : 0.000026s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000042s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000042s : 0.00% optimize.opt_a.virtual_dataset : 0.000030s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000029s : 0.00% optimize.opt_a.virtual_output : 0.000029s : 0.00% optimize.opt_a.merge_forward : 0.000018s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000045s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000062s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000050s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000026s : 0.00% optimize.opt_a.meta_fg_expand : 0.000017s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000020s : 0.00% optimize.opt_a.after_resolve : 0.000047s : 0.00% optimize.opt_a.a_after_grad : 0.000048s : 0.00% optimize.opt_a.renormalize : 0.003059s : 0.05% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.00% optimize.opt_a.auto_monad_grad : 0.000006s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000078s : 0.00% optimize.opt_a.cse : 0.000230s : 0.00% optimize.opt_a.a_3 : 0.000226s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000013s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000067s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000910s : 0.02% optimize.opt_b.b_1 : 0.000399s : 0.01% optimize.opt_b.b_2 : 0.000019s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000016s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000074s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000044s : 0.00% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000039s : 0.00% optimize.loop_unroll : 0.000614s : 0.01% optimize.opt_after_cconv.c_1 : 0.000096s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000013s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.cse : 0.000064s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000078s : 0.00% optimize.tuple_transform.d_1 : 0.000152s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000017s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000141s : 0.00% optimize.cse_after_recomputation.cse : 0.000042s : 0.00% optimize.environ_conv : 0.000031s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000032s : 0.00% optimize.bias_add_comm_swap : 0.000019s : 0.00% optimize.label_micro_interleaved_index : 0.000014s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000028s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000031s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000016s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000022s : 0.00% optimize.overlap_grad_flash_sp : 0.000065s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000002s : 0.00% optimize.symbol_engine_optimizer.build : 0.000027s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000028s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000029s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000017s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000025s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000048s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.00% opt_after_jit_grad : 0.000696s : 0.01% validate : 0.000101s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 5.783331s : 97.16% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.001088 242 0.95% : 0.000010s : 2: substitution.depend_value_elim 0.38% : 0.000004s : 9: substitution.elim_not_effective 1.41% : 0.000015s : 9: substitution.float_tuple_getitem_switch 0.32% : 0.000003s : 9: substitution.fold_const_symbol 1.14% : 0.000012s : 12: substitution.graph_param_transform 57.26% : 0.000623s : 21: substitution.inline 0.83% : 0.000009s : 18: substitution.j_node_and_user_rematch 2.85% : 0.000031s : 3: substitution.less_batch_normalization 1.52% : 0.000017s : 10: substitution.minmaximum_grad 1.31% : 0.000014s : 18: substitution.remove_not_recompute_node 0.90% : 0.000010s : 6: substitution.replace_old_param 3.79% : 0.000041s : 5: substitution.switch_simplify 4.42% : 0.000048s : 16: substitution.tuple_list_convert_item_index_to_positive 3.00% : 0.000033s : 18: substitution.tuple_list_get_item_const_eliminator 4.21% : 0.000046s : 18: substitution.tuple_list_get_item_depend_reorder 7.70% : 0.000084s : 28: substitution.tuple_list_get_item_eliminator 3.31% : 0.000036s : 18: substitution.tuple_list_get_set_item_eliminator 2.08% : 0.000023s : 10: substitution.updatestate_pure_node_eliminater 2.64% : 0.000029s : 12: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.153194 2 97.27% : 0.149010s : 1: type_inference.infer 2.73% : 0.004184s : 1: type_inference.specialize ------[replace.] 0.000331 32 53.76% : 0.000178s : 21: replace.inline 27.18% : 0.000090s : 5: replace.switch_simplify 7.50% : 0.000025s : 2: replace.tuple_list_get_item_depend_reorder 11.56% : 0.000038s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000674 32 90.71% : 0.000611s : 21: match.inline 5.60% : 0.000038s : 5: match.switch_simplify 2.51% : 0.000017s : 2: match.tuple_list_get_item_depend_reorder 1.18% : 0.000008s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000705 4512 1.05% : 0.000007s : 53: predicate.accumulaten_eliminater 0.66% : 0.000005s : 12: predicate.ad_related_special_op_eliminate 0.47% : 0.000003s : 24: predicate.addn_check_dump 1.07% : 0.000008s : 53: predicate.addn_zero_filter 0.95% : 0.000007s : 53: predicate.adjust_all_reduce_mul_add 2.39% : 0.000017s : 77: predicate.arithmetic_simplify 1.02% : 0.000007s : 53: predicate.cast_eliminate 0.52% : 0.000004s : 24: predicate.check_bprop_eliminate 0.47% : 0.000003s : 24: predicate.compare_switch_simplify 0.14% : 0.000001s : 12: predicate.const_output_eliminate 0.48% : 0.000003s : 24: predicate.depend_value_elim 1.10% : 0.000008s : 53: predicate.dict_get_item_const_eliminator 1.18% : 0.000008s : 53: predicate.dict_get_item_eliminator 1.04% : 0.000007s : 53: predicate.dict_set_item_eliminator 0.67% : 0.000005s : 24: predicate.dumpgradient_eliminate 0.15% : 0.000001s : 12: predicate.elim_not_effective 0.37% : 0.000003s : 12: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000008s : 65: predicate.environ_add_const_eliminate 1.34% : 0.000009s : 65: predicate.environ_get_add_eliminate 1.24% : 0.000009s : 65: predicate.environ_get_depend_swap 1.72% : 0.000012s : 89: predicate.environ_get_eliminate 1.23% : 0.000009s : 65: predicate.environ_get_set_eliminate 1.56% : 0.000011s : 80: predicate.exchange_switch_depend_value 2.30% : 0.000016s : 80: predicate.float_depend_g_call 0.48% : 0.000003s : 24: predicate.float_environ_get_switch 0.81% : 0.000006s : 36: predicate.float_tuple_getitem_switch 0.13% : 0.000001s : 12: predicate.fold_const_symbol 0.53% : 0.000004s : 24: predicate.get_grad_eliminate 0.18% : 0.000001s : 12: predicate.graph_param_transform 0.47% : 0.000003s : 24: predicate.incorporate_call 0.42% : 0.000003s : 24: predicate.incorporate_call_switch 6.05% : 0.000043s : 205: predicate.inline 0.62% : 0.000004s : 24: predicate.inline_without_move 0.24% : 0.000002s : 24: predicate.j_node_and_user_rematch 0.77% : 0.000005s : 26: predicate.less_batch_normalization 1.63% : 0.000011s : 83: predicate.list_to_tuple_eliminator_ 2.65% : 0.000019s : 136: predicate.load_eliminater 0.73% : 0.000005s : 12: predicate.loop_unroll_after_grad 2.67% : 0.000019s : 123: predicate.loop_unroll_before_grad 1.63% : 0.000011s : 79: predicate.make_slice_get_slice_eliminator 0.48% : 0.000003s : 24: predicate.merge_addn 0.44% : 0.000003s : 24: predicate.micro_step_allgather_replace 0.45% : 0.000003s : 24: predicate.mini_step_allgather_replace 1.02% : 0.000007s : 53: predicate.minmaximum_grad 0.81% : 0.000006s : 12: predicate.mutable_eliminate 0.34% : 0.000002s : 12: predicate.opt_reshape 0.27% : 0.000002s : 12: predicate.parallel_virtual_node 2.37% : 0.000017s : 80: predicate.partial_defer_inline 1.50% : 0.000011s : 71: predicate.partial_eliminate 1.03% : 0.000007s : 53: predicate.print_const_string_wrapper 0.49% : 0.000003s : 24: predicate.reduce_all_const_elim 1.34% : 0.000009s : 53: predicate.reduce_eliminate 2.58% : 0.000018s : 136: predicate.redundant_stop_gradient_eliminater 0.29% : 0.000002s : 24: predicate.remove_not_recompute_node 1.17% : 0.000008s : 83: predicate.replace_applicator 0.34% : 0.000002s : 24: predicate.replace_old_param 0.15% : 0.000001s : 12: predicate.reset_defer_inline 1.07% : 0.000008s : 53: predicate.reshape_eliminate 0.56% : 0.000004s : 24: predicate.row_tensor_add_zeros_like 0.35% : 0.000002s : 12: predicate.row_tensor_eliminate 0.67% : 0.000005s : 24: predicate.same_eliminate 0.33% : 0.000002s : 27: predicate.set_cell_output_no_recompute 0.63% : 0.000004s : 24: predicate.shard_identity_eliminate 0.54% : 0.000004s : 24: predicate.special_op_eliminate 0.62% : 0.000004s : 24: predicate.specialize_transform 0.66% : 0.000005s : 24: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000005s : 24: predicate.stack_unstack_eliminate 0.25% : 0.000002s : 12: predicate.switch_call_monad_eliminater 1.90% : 0.000013s : 80: predicate.switch_defer_inline 2.34% : 0.000017s : 104: predicate.switch_layer_defer_inline 6.24% : 0.000044s : 249: predicate.switch_simplify 1.12% : 0.000008s : 53: predicate.tile_eliminate 1.03% : 0.000007s : 53: predicate.transpose_eliminate 1.72% : 0.000012s : 77: predicate.tuple_list_convert_item_index_to_positive 1.76% : 0.000012s : 79: predicate.tuple_list_get_item_const_eliminator 1.61% : 0.000011s : 79: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000021s : 107: predicate.tuple_list_get_item_eliminator 1.72% : 0.000012s : 79: predicate.tuple_list_get_set_item_eliminator 2.33% : 0.000016s : 103: predicate.tuple_list_set_item_eliminator 1.78% : 0.000013s : 83: predicate.tuple_to_list_eliminator_ 2.56% : 0.000018s : 136: predicate.updatestate_pure_node_eliminater 3.17% : 0.000022s : 160: predicate.updatestate_useless_node_eliminater 0.28% : 0.000002s : 12: predicate.value_based_eliminate 0.53% : 0.000004s : 24: predicate.virtual_dataset_eliminate 0.50% : 0.000004s : 24: predicate.virtual_output_eliminate 0.27% : 0.000002s : 12: predicate.virtual_view_grad_eliminate 0.40% : 0.000003s : 12: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003802 43 67.99% : 0.002585s : 20: func_graph_cloner_run.FuncGraphClonerGraph 32.01% : 0.001217s : 23: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 5.994091 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.14% : 0.008586s : 1: add_attr 0.14% : 0.008565s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000148s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000405s : 1: auto_monad 0.00% : 0.000053s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000023s : 1: bias_add_comm_swap 0.02% : 0.001026s : 1: bootstrap 0.00% : 0.000043s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000035s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000060s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000035s : 1: environ_conv 0.01% : 0.000328s : 1: event_method 0.00% : 0.000063s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000018s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000018s : 1: label_micro_interleaved_index 0.01% : 0.000625s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000923s : 1: mutable_eliminate 0.00% : 0.000012s : 1: offloading_packed_experts 0.00% : 0.000032s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000036s : 1: opt.transform.mutable_eliminate 0.08% : 0.004989s : 78: opt.transform.opt_a 0.00% : 0.000094s : 1: opt.transform.opt_after_cconv 0.00% : 0.000057s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000386s : 28: opt.transform.opt_b 0.00% : 0.000167s : 2: opt.transform.opt_trans_graph 0.00% : 0.000095s : 4: opt.transform.symbol_engine_opt 0.16% : 0.009831s : 1: opt_a 0.00% : 0.000245s : 1: opt_after_cconv 0.01% : 0.000707s : 1: opt_after_jit_grad 0.01% : 0.000584s : 1: opt_b 0.24% : 0.014131s : 1: optimize 0.00% : 0.000048s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000070s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000027s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000033s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000019s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000111s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000083s : 1: remove_dup_value 0.03% : 0.001624s : 1: renormalize.infer 0.02% : 0.001420s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000072s : 1: rewriter_after_opt_a 0.01% : 0.000523s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000036s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000175s : 1: symbol_engine_optimizer 96.49% : 5.783538s : 1: task_emit 0.00% : 0.000207s : 1: tuple_transform 2.56% : 0.153369s : 1: type_inference 0.00% : 0.000155s : 1: validate TotalTime = 3.25066, [24] [bootstrap]: 0.00420105 [type_inference]: 0.447794 [event_method]: 2.31e-05 [auto_monad]: 0.00014235 [graph_reusing]: 5.51e-06 [inline]: 3.53e-06 [add_attr]: 0.0213798, [1] [add_attr_with_inline]: 0.0213611, [1] [Cycle 1]: 0.00013556, [2] [tag_attr]: 3.736e-05 [meta_addattr_fg_expand]: 1.47e-05 [parallel-infer-symbol]: 3.86999e-06 [pre_auto_parallel]: 5.622e-05 [insert-virtual-dataset]: 2.98998e-06 [parallel-infer-symbol-second]: 7.49977e-07 [dataset_repeat_opt]: 2.69999e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.00911558, [53] [py_interpret_to_execute]: 6.72002e-06 [rewriter_before_opt_a]: 0.00022381 [opt_a]: 0.00601375, [2] [Cycle 1]: 0.00510335, [45] [expand_dump_flag]: 3.56999e-06 [switch_simplify]: 5.528e-05 [loop_unroll]: 2.957e-05 [a_1]: 0.00067283 [with_stream_mark]: 1.39e-05 [recompute_prepare]: 1.204e-05 [updatestate_depend_eliminate]: 1.473e-05 [updatestate_assign_eliminate]: 1.063e-05 [updatestate_loads_eliminate]: 2.71999e-06 [parameter_eliminate]: 1.11997e-06 [a_2]: 0.0001271 [accelerated_algorithm]: 1.094e-05 [shard]: 1.95001e-06 [meta_shard_fg_expand]: 1.69e-06 [shard_inline]: 9.69e-06 [merge_send_recv]: 3.973e-05 [auto_parallel]: 8.03999e-06 [parallel]: 8.987e-05 [flash_sp]: 3.323e-05 [merge_comm]: 5.24e-06 [allreduce_fusion]: 1.176e-05 [matmul_add_comm_reduction]: 1.733e-05 [allreduce_slice_to_reducescatter]: 8.92999e-06 [virtual_shard_identity]: 1.371e-05 [virtual_dataset]: 9.76998e-06 [get_grad_eliminate_]: 1.011e-05 [virtual_output]: 1.078e-05 [merge_forward]: 3.65998e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 1.782e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.466e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.33e-05 [set_forward_comm_id_for_comm_node_pass]: 1.156e-05 [meta_fg_expand]: 3.06001e-06 [flash_sp_send_recv_attached]: 2.65002e-06 [receive_attached]: 1.639e-05 [after_resolve]: 1.634e-05 [a_after_grad]: 1.53e-05 [renormalize]: 0.00331121 [add_forward_monad_depend]: 7.5e-06 [auto_monad_grad]: 2.16998e-06 [auto_monad_eliminator]: 3.14e-05 [cse]: 6.978e-05 [a_3]: 7.234e-05 [Cycle 2]: 0.00089781, [45] [expand_dump_flag]: 2.17001e-06 [switch_simplify]: 1.008e-05 [loop_unroll]: 9.60001e-06 [a_1]: 0.00023559 [with_stream_mark]: 1.759e-05 [recompute_prepare]: 9.71998e-06 [updatestate_depend_eliminate]: 3.68e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 1.14998e-06 [a_2]: 0.00011528 [accelerated_algorithm]: 9.70002e-06 [shard]: 2.61999e-06 [meta_shard_fg_expand]: 2.12001e-06 [shard_inline]: 8.99998e-06 [merge_send_recv]: 8.1e-06 [auto_parallel]: 9.73998e-06 [parallel]: 8.07e-06 [flash_sp]: 3.81001e-06 [merge_comm]: 3.7e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 7.51999e-06 [allreduce_slice_to_reducescatter]: 1.37e-06 [virtual_shard_identity]: 1.048e-05 [virtual_dataset]: 9.23002e-06 [get_grad_eliminate_]: 9.12999e-06 [virtual_output]: 8.92999e-06 [merge_forward]: 4.79e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 1.152e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.698e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 1.266e-05 [set_forward_comm_id_for_comm_node_pass]: 3.78999e-06 [meta_fg_expand]: 3.51001e-06 [flash_sp_send_recv_attached]: 1.52999e-06 [receive_attached]: 2.57001e-06 [after_resolve]: 1.488e-05 [a_after_grad]: 1.478e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.02001e-06 [auto_monad_grad]: 1.98002e-06 [auto_monad_eliminator]: 9.46998e-06 [cse]: 2.302e-05 [a_3]: 5.759e-05 [py_interpret_to_execute_after_opt_a]: 7.7e-06 [slice_cell_reuse_recomputed_activation]: 2.56998e-06 [rewriter_after_opt_a]: 3.682e-05 [convert_after_rewriter]: 1.89e-06 [order_py_execute_after_rewriter]: 1.15999e-06 [mutable_eliminate]: 0.00078057 [opt_b]: 0.00030583, [1] [Cycle 1]: 0.00029891, [7] [b_1]: 0.00020121 [b_2]: 1.138e-05 [updatestate_depend_eliminate]: 7.02002e-06 [updatestate_assign_eliminate]: 3.63999e-06 [updatestate_loads_eliminate]: 3.28e-06 [renormalize]: 6.89994e-07 [cse]: 3.496e-05 [optimize_parallel_all_gather_comm]: 3.261e-05 [overlap_param_gather]: 1.049e-05 [cconv]: 3.189e-05 [loop_unroll]: 0.00054917 [opt_after_cconv]: 0.00014611, [1] [Cycle 1]: 0.00013924, [7] [c_1]: 5.015e-05 [parameter_eliminate]: 5.81e-06 [updatestate_depend_eliminate]: 7.76001e-06 [updatestate_assign_eliminate]: 3.17002e-06 [updatestate_loads_eliminate]: 2.89999e-06 [cse]: 3.25e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 4.45e-05 [tuple_transform]: 0.00010274, [1] [Cycle 1]: 9.745e-05, [4] [d_1]: 6.496e-05 [none_parameter_eliminate]: 1.54e-06 [renormalize]: 4.49974e-07 [switch_simplify]: 9.79e-06 [partial_unused_args_eliminate]: 2.54001e-06 [add_recomputation]: 0.00011694 [cse_after_recomputation]: 3.4e-05, [1] [Cycle 1]: 2.831e-05, [1] [cse]: 2.014e-05 [environ_conv]: 4.719e-05 [swap_dp_allreduce_reducescatter]: 2.723e-05 [bias_add_comm_swap]: 1.159e-05 [label_micro_interleaved_index]: 1.411e-05 [label_fine_grained_interleaved_index]: 2.87002e-06 [merge_cast_opt]: 1.20001e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.43e-06 [assign_add_opt]: 1.18001e-06 [ForceFp32Comm]: 7.59988e-07 [remove_cast_before_assign_add]: 8.77e-06 [full_micro_interleaved_order_control]: 9.69e-06 [reorder_send_recv_between_fp_bp]: 2.86e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.52999e-06 [interleave_parallel_branches]: 8.99e-06 [overlap_opt_shard_in_pipeline]: 2.714e-05 [overlap_opt_shard_grad_in_pipeline]: 2.63e-06 [control_data_broadcast_order]: 1.744e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 4.11001e-06 [overlap_recompute_and_grad_model_parallel]: 1.34e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.50001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.66e-06 [overlap_grad_ring_attention]: 1.922e-05 [overlap_grad_flash_sp]: 4.537e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 9.61e-06 [split_layernorm_comm]: 1.59998e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 9.949e-05, [1] [Cycle 1]: 9.355e-05, [6] [build]: 2.88e-06 [elim_shapecalc]: 1.742e-05 [elim_not_effective]: 1.693e-05 [opt_reshape]: 9.67999e-06 [fold_const_symbol]: 1.29e-05 [renormalize]: 6.00005e-07 [detach_backward]: 2.49001e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 2.646e-05 [get_jit_bprop_graph]: 2.93e-06 [rewriter_after_jit_bprop_graph]: 5.09e-06 [opt_after_jit_grad]: 0.0006443 [validate]: 9.974e-05 [backend_pass]: 8.00006e-07 [task_emit]: 2.76685 [execute]: 8.75999e-06 Sums bootstrap : 0.004201s : 0.13% type_inference : 0.447794s : 13.87% event_method : 0.000023s : 0.00% auto_monad : 0.000142s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000037s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000015s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000056s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.00% optimize.rewriter_before_opt_a : 0.000224s : 0.01% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000065s : 0.00% optimize.opt_a.loop_unroll : 0.000039s : 0.00% optimize.opt_a.a_1 : 0.000908s : 0.03% optimize.opt_a.with_stream_mark : 0.000031s : 0.00% optimize.opt_a.recompute_prepare : 0.000022s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000018s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000242s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000021s : 0.00% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000019s : 0.00% optimize.opt_a.merge_send_recv : 0.000048s : 0.00% optimize.opt_a.auto_parallel : 0.000018s : 0.00% optimize.opt_a.parallel : 0.000098s : 0.00% optimize.opt_a.flash_sp : 0.000037s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000015s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000010s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.00% optimize.opt_a.virtual_dataset : 0.000019s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.00% optimize.opt_a.virtual_output : 0.000020s : 0.00% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000029s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000026s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000015s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000019s : 0.00% optimize.opt_a.after_resolve : 0.000031s : 0.00% optimize.opt_a.a_after_grad : 0.000030s : 0.00% optimize.opt_a.renormalize : 0.003311s : 0.10% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000041s : 0.00% optimize.opt_a.cse : 0.000093s : 0.00% optimize.opt_a.a_3 : 0.000130s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000037s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000781s : 0.02% optimize.opt_b.b_1 : 0.000201s : 0.01% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000035s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000033s : 0.00% optimize.overlap_param_gather : 0.000010s : 0.00% optimize.cconv : 0.000032s : 0.00% optimize.loop_unroll : 0.000549s : 0.02% optimize.opt_after_cconv.c_1 : 0.000050s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000033s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000045s : 0.00% optimize.tuple_transform.d_1 : 0.000065s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.00% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000117s : 0.00% optimize.cse_after_recomputation.cse : 0.000020s : 0.00% optimize.environ_conv : 0.000047s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000027s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000014s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000027s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.00% optimize.overlap_grad_flash_sp : 0.000045s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000026s : 0.00% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000644s : 0.02% validate : 0.000100s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.766848s : 85.71% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000239 31 0.76% : 0.000002s : 2: substitution.elim_not_effective 0.60% : 0.000001s : 2: substitution.fold_const_symbol 3.06% : 0.000007s : 7: substitution.graph_param_transform 81.86% : 0.000196s : 3: substitution.inline 1.51% : 0.000004s : 4: substitution.j_node_and_user_rematch 5.45% : 0.000013s : 4: substitution.remove_not_recompute_node 2.26% : 0.000005s : 6: substitution.replace_old_param 4.49% : 0.000011s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.447682 2 99.42% : 0.445094s : 1: type_inference.infer 0.58% : 0.002589s : 1: type_inference.specialize ------[replace.] 0.000058 6 62.40% : 0.000036s : 3: replace.inline 37.60% : 0.000022s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000203 6 95.41% : 0.000193s : 3: match.inline 4.59% : 0.000009s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000253 2039 0.93% : 0.000002s : 19: predicate.accumulaten_eliminater 1.00% : 0.000003s : 7: predicate.ad_related_special_op_eliminate 0.66% : 0.000002s : 16: predicate.addn_check_dump 0.95% : 0.000002s : 19: predicate.addn_zero_filter 0.75% : 0.000002s : 19: predicate.adjust_all_reduce_mul_add 1.95% : 0.000005s : 35: predicate.arithmetic_simplify 0.91% : 0.000002s : 19: predicate.cast_eliminate 0.72% : 0.000002s : 16: predicate.check_bprop_eliminate 0.64% : 0.000002s : 16: predicate.compare_switch_simplify 0.29% : 0.000001s : 8: predicate.const_output_eliminate 0.81% : 0.000002s : 16: predicate.depend_value_elim 0.89% : 0.000002s : 19: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 19: predicate.dict_get_item_eliminator 0.83% : 0.000002s : 19: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 15: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 7: predicate.elim_not_effective 0.45% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.38% : 0.000003s : 27: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 27: predicate.environ_get_add_eliminate 1.08% : 0.000003s : 27: predicate.environ_get_depend_swap 1.75% : 0.000004s : 43: predicate.environ_get_eliminate 1.04% : 0.000003s : 27: predicate.environ_get_set_eliminate 1.21% : 0.000003s : 25: predicate.exchange_switch_depend_value 1.68% : 0.000004s : 25: predicate.float_depend_g_call 0.68% : 0.000002s : 16: predicate.float_environ_get_switch 0.96% : 0.000002s : 24: predicate.float_tuple_getitem_switch 0.23% : 0.000001s : 7: predicate.fold_const_symbol 1.04% : 0.000003s : 16: predicate.get_grad_eliminate 0.25% : 0.000001s : 7: predicate.graph_param_transform 0.67% : 0.000002s : 16: predicate.incorporate_call 0.61% : 0.000002s : 16: predicate.incorporate_call_switch 5.62% : 0.000014s : 92: predicate.inline 0.85% : 0.000002s : 16: predicate.inline_without_move 0.47% : 0.000001s : 16: predicate.j_node_and_user_rematch 0.95% : 0.000002s : 16: predicate.less_batch_normalization 2.08% : 0.000005s : 37: predicate.list_to_tuple_eliminator_ 2.30% : 0.000006s : 57: predicate.load_eliminater 1.01% : 0.000003s : 8: predicate.loop_unroll_after_grad 1.89% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 35: predicate.make_slice_get_slice_eliminator 0.75% : 0.000002s : 16: predicate.merge_addn 0.70% : 0.000002s : 16: predicate.micro_step_allgather_replace 0.67% : 0.000002s : 16: predicate.mini_step_allgather_replace 0.75% : 0.000002s : 19: predicate.minmaximum_grad 1.06% : 0.000003s : 8: predicate.mutable_eliminate 0.40% : 0.000001s : 7: predicate.opt_reshape 0.38% : 0.000001s : 8: predicate.parallel_virtual_node 1.40% : 0.000004s : 25: predicate.partial_defer_inline 1.43% : 0.000004s : 30: predicate.partial_eliminate 1.01% : 0.000003s : 19: predicate.print_const_string_wrapper 0.92% : 0.000002s : 16: predicate.reduce_all_const_elim 1.18% : 0.000003s : 19: predicate.reduce_eliminate 2.53% : 0.000006s : 57: predicate.redundant_stop_gradient_eliminater 0.65% : 0.000002s : 16: predicate.remove_not_recompute_node 1.67% : 0.000004s : 38: predicate.replace_applicator 0.58% : 0.000001s : 16: predicate.replace_old_param 0.54% : 0.000001s : 8: predicate.reset_defer_inline 0.95% : 0.000002s : 19: predicate.reshape_eliminate 0.90% : 0.000002s : 16: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 8: predicate.row_tensor_eliminate 0.88% : 0.000002s : 16: predicate.same_eliminate 0.63% : 0.000002s : 16: predicate.set_cell_output_no_recompute 1.01% : 0.000003s : 16: predicate.shard_identity_eliminate 0.82% : 0.000002s : 15: predicate.special_op_eliminate 0.79% : 0.000002s : 16: predicate.specialize_transform 0.87% : 0.000002s : 16: predicate.split_environ_get_set_with_tuple_value 0.99% : 0.000003s : 16: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 8: predicate.switch_call_monad_eliminater 1.19% : 0.000003s : 25: predicate.switch_defer_inline 1.80% : 0.000005s : 41: predicate.switch_layer_defer_inline 4.24% : 0.000011s : 85: predicate.switch_simplify 0.81% : 0.000002s : 19: predicate.tile_eliminate 0.85% : 0.000002s : 19: predicate.transpose_eliminate 1.53% : 0.000004s : 34: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000004s : 34: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 34: predicate.tuple_list_get_item_depend_reorder 3.00% : 0.000008s : 53: predicate.tuple_list_get_item_eliminator 1.48% : 0.000004s : 34: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000006s : 50: predicate.tuple_list_set_item_eliminator 1.94% : 0.000005s : 37: predicate.tuple_to_list_eliminator_ 2.23% : 0.000006s : 57: predicate.updatestate_pure_node_eliminater 3.06% : 0.000008s : 73: predicate.updatestate_useless_node_eliminater 0.45% : 0.000001s : 8: predicate.value_based_eliminate 0.82% : 0.000002s : 16: predicate.virtual_dataset_eliminate 0.92% : 0.000002s : 16: predicate.virtual_output_eliminate 0.32% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 8: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003205 25 74.78% : 0.002396s : 20: func_graph_cloner_run.FuncGraphClonerGraph 25.22% : 0.000808s : 5: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.286330 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.65% : 0.021385s : 1: add_attr 0.65% : 0.021365s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000126s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.00% : 0.000149s : 1: auto_monad 0.00% : 0.000032s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.13% : 0.004248s : 1: bootstrap 0.00% : 0.000035s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000037s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000007s : 1: detach_backward 0.00% : 0.000052s : 1: environ_conv 0.00% : 0.000030s : 1: event_method 0.00% : 0.000017s : 1: execute 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000007s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000018s : 1: label_micro_interleaved_index 0.02% : 0.000559s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000794s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000023s : 1: opt.transform.mutable_eliminate 0.05% : 0.001566s : 78: opt.transform.opt_a 0.00% : 0.000049s : 1: opt.transform.opt_after_cconv 0.00% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000181s : 28: opt.transform.opt_b 0.00% : 0.000072s : 2: opt.transform.opt_trans_graph 0.00% : 0.000053s : 4: opt.transform.symbol_engine_opt 0.18% : 0.006018s : 1: opt_a 0.00% : 0.000150s : 1: opt_after_cconv 0.02% : 0.000657s : 1: opt_after_jit_grad 0.01% : 0.000310s : 1: opt_b 0.28% : 0.009122s : 1: optimize 0.00% : 0.000037s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000050s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000022s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000032s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000007s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000060s : 1: pre_auto_parallel 0.00% : 0.000011s : 1: py_interpret_to_execute 0.00% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000051s : 1: remove_dup_value 0.07% : 0.002350s : 1: renormalize.infer 0.03% : 0.000952s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000041s : 1: rewriter_after_opt_a 0.01% : 0.000230s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000031s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000102s : 1: symbol_engine_optimizer 84.19% : 2.766873s : 1: task_emit 0.00% : 0.000106s : 1: tuple_transform 13.63% : 0.447818s : 1: type_inference 0.00% : 0.000137s : 1: validate TotalTime = 0.602074, [24] [bootstrap]: 0.00127782 [type_inference]: 0.208968 [event_method]: 9.881e-05 [auto_monad]: 0.00054931 [graph_reusing]: 2.146e-05 [inline]: 4.29997e-06 [add_attr]: 0.0125392, [1] [add_attr_with_inline]: 0.0125208, [1] [Cycle 1]: 0.0001804, [2] [tag_attr]: 0.0001142 [meta_addattr_fg_expand]: 2.515e-05 [parallel-infer-symbol]: 3.83999e-06 [pre_auto_parallel]: 0.00015464 [insert-virtual-dataset]: 3.45e-06 [parallel-infer-symbol-second]: 7.99977e-07 [dataset_repeat_opt]: 2.64001e-06 [pipeline_split]: 2.11998e-06 [optimize]: 0.317244, [53] [py_interpret_to_execute]: 7.71999e-06 [rewriter_before_opt_a]: 0.0007217 [opt_a]: 0.312652, [4] [Cycle 1]: 0.291681, [45] [expand_dump_flag]: 1.034e-05 [switch_simplify]: 0.00031927 [loop_unroll]: 0.00013073 [a_1]: 0.00558187 [with_stream_mark]: 5.002e-05 [recompute_prepare]: 4.281e-05 [updatestate_depend_eliminate]: 4.996e-05 [updatestate_assign_eliminate]: 1.594e-05 [updatestate_loads_eliminate]: 1.446e-05 [parameter_eliminate]: 3.80998e-06 [a_2]: 0.00040717 [accelerated_algorithm]: 6.335e-05 [shard]: 2.41e-06 [meta_shard_fg_expand]: 1.354e-05 [shard_inline]: 2.618e-05 [merge_send_recv]: 2.877e-05 [auto_parallel]: 2.088e-05 [parallel]: 4.247e-05 [flash_sp]: 1.64e-05 [merge_comm]: 1.65e-05 [allreduce_fusion]: 1.541e-05 [matmul_add_comm_reduction]: 6.559e-05 [allreduce_slice_to_reducescatter]: 1.49e-06 [virtual_shard_identity]: 2.919e-05 [virtual_dataset]: 2.579e-05 [get_grad_eliminate_]: 2.539e-05 [virtual_output]: 2.54e-05 [merge_forward]: 1.556e-05 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 3.1e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.91e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 4.496e-05 [set_forward_comm_id_for_comm_node_pass]: 1.664e-05 [meta_fg_expand]: 0.0718966 [flash_sp_send_recv_attached]: 6.78e-06 [receive_attached]: 2.66999e-06 [after_resolve]: 0.00025545 [a_after_grad]: 0.00036836 [renormalize]: 0.206553 [add_forward_monad_depend]: 9.004e-05 [auto_monad_grad]: 6.313e-05 [auto_monad_eliminator]: 0.0003829 [cse]: 0.00077072 [a_3]: 0.00364554 [Cycle 2]: 0.0153565, [45] [expand_dump_flag]: 9.38002e-06 [switch_simplify]: 0.00021309 [loop_unroll]: 0.00020443 [a_1]: 0.00557782 [with_stream_mark]: 4.423e-05 [recompute_prepare]: 3.807e-05 [updatestate_depend_eliminate]: 1.702e-05 [updatestate_assign_eliminate]: 1.781e-05 [updatestate_loads_eliminate]: 1.64e-05 [parameter_eliminate]: 4.62e-06 [a_2]: 0.00069887 [accelerated_algorithm]: 4.133e-05 [shard]: 2.27999e-06 [meta_shard_fg_expand]: 8.77999e-06 [shard_inline]: 1.974e-05 [merge_send_recv]: 1.989e-05 [auto_parallel]: 1.838e-05 [parallel]: 9.32999e-06 [flash_sp]: 5.07999e-06 [merge_comm]: 1.141e-05 [allreduce_fusion]: 1.127e-05 [matmul_add_comm_reduction]: 1.836e-05 [allreduce_slice_to_reducescatter]: 1.20999e-06 [virtual_shard_identity]: 2.021e-05 [virtual_dataset]: 1.969e-05 [get_grad_eliminate_]: 1.921e-05 [virtual_output]: 1.874e-05 [merge_forward]: 9.65002e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 2.315e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.581e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 3.395e-05 [set_forward_comm_id_for_comm_node_pass]: 1.181e-05 [meta_fg_expand]: 0.00090657 [flash_sp_send_recv_attached]: 2.70997e-06 [receive_attached]: 3.20998e-06 [after_resolve]: 3.873e-05 [a_after_grad]: 3.249e-05 [renormalize]: 0.00638979 [add_forward_monad_depend]: 9.46e-06 [auto_monad_grad]: 2.46998e-06 [auto_monad_eliminator]: 4.535e-05 [cse]: 0.00022115 [a_3]: 0.00015924 [Cycle 3]: 0.00398705, [45] [expand_dump_flag]: 2.63998e-06 [switch_simplify]: 2.309e-05 [loop_unroll]: 1.929e-05 [a_1]: 0.0015468 [with_stream_mark]: 2.948e-05 [recompute_prepare]: 2.328e-05 [updatestate_depend_eliminate]: 4.818e-05 [updatestate_assign_eliminate]: 1.085e-05 [updatestate_loads_eliminate]: 9.44e-06 [parameter_eliminate]: 2.64001e-06 [a_2]: 0.00025449 [accelerated_algorithm]: 2.236e-05 [shard]: 2.69001e-06 [meta_shard_fg_expand]: 4.99e-06 [shard_inline]: 1.622e-05 [merge_send_recv]: 1.767e-05 [auto_parallel]: 1.73e-05 [parallel]: 1.077e-05 [flash_sp]: 1.96998e-06 [merge_comm]: 9.34998e-06 [allreduce_fusion]: 9.51998e-06 [matmul_add_comm_reduction]: 1.669e-05 [allreduce_slice_to_reducescatter]: 1.14e-06 [virtual_shard_identity]: 1.778e-05 [virtual_dataset]: 1.6e-05 [get_grad_eliminate_]: 1.554e-05 [virtual_output]: 1.619e-05 [merge_forward]: 9.32001e-06 [cell_reuse_recompute_pass]: 3.3e-06 [offload_activation]: 2.144e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.288e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 2.872e-05 [set_forward_comm_id_for_comm_node_pass]: 9.96e-06 [meta_fg_expand]: 6.84999e-06 [flash_sp_send_recv_attached]: 2.07999e-06 [receive_attached]: 2.53998e-06 [after_resolve]: 2.485e-05 [a_after_grad]: 2.602e-05 [renormalize]: 0.00113839 [add_forward_monad_depend]: 6.31e-06 [auto_monad_grad]: 2.22001e-06 [auto_monad_eliminator]: 2.721e-05 [cse]: 0.0001047 [a_3]: 0.00012052 [Cycle 4]: 0.00160146, [45] [expand_dump_flag]: 1.52001e-06 [switch_simplify]: 1.724e-05 [loop_unroll]: 1.731e-05 [a_1]: 0.00047093 [with_stream_mark]: 1.736e-05 [recompute_prepare]: 1.612e-05 [updatestate_depend_eliminate]: 9.71998e-06 [updatestate_assign_eliminate]: 9.17001e-06 [updatestate_loads_eliminate]: 9.09e-06 [parameter_eliminate]: 1.29e-06 [a_2]: 0.00024068 [accelerated_algorithm]: 2.004e-05 [shard]: 1.25001e-06 [meta_shard_fg_expand]: 3.51999e-06 [shard_inline]: 1.587e-05 [merge_send_recv]: 1.236e-05 [auto_parallel]: 1.252e-05 [parallel]: 8.10999e-06 [flash_sp]: 1.30001e-06 [merge_comm]: 9.20999e-06 [allreduce_fusion]: 9.23002e-06 [matmul_add_comm_reduction]: 1.378e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 1.8e-05 [virtual_dataset]: 1.545e-05 [get_grad_eliminate_]: 1.518e-05 [virtual_output]: 1.594e-05 [merge_forward]: 9.62001e-06 [cell_reuse_recompute_pass]: 2.46e-06 [offload_activation]: 1.642e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.883e-05 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 2.644e-05 [set_forward_comm_id_for_comm_node_pass]: 9.25999e-06 [meta_fg_expand]: 6.73e-06 [flash_sp_send_recv_attached]: 1.29998e-06 [receive_attached]: 2.22001e-06 [after_resolve]: 2.117e-05 [a_after_grad]: 2.473e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.66e-06 [auto_monad_grad]: 1.45001e-06 [auto_monad_eliminator]: 2.058e-05 [cse]: 5.243e-05 [a_3]: 0.00018579 [py_interpret_to_execute_after_opt_a]: 8.33001e-06 [slice_cell_reuse_recomputed_activation]: 2.47001e-06 [rewriter_after_opt_a]: 5.933e-05 [convert_after_rewriter]: 1.33002e-06 [order_py_execute_after_rewriter]: 1.20999e-06 [mutable_eliminate]: 0.00116421 [opt_b]: 0.00059069, [1] [Cycle 1]: 0.00058263, [7] [b_1]: 0.0004297 [b_2]: 1.808e-05 [updatestate_depend_eliminate]: 1.42e-05 [updatestate_assign_eliminate]: 8.54e-06 [updatestate_loads_eliminate]: 9.36998e-06 [renormalize]: 5.79981e-07 [cse]: 6.316e-05 [optimize_parallel_all_gather_comm]: 3.127e-05 [overlap_param_gather]: 2.11e-06 [cconv]: 3.356e-05 [loop_unroll]: 0.00051516 [opt_after_cconv]: 0.00022003, [1] [Cycle 1]: 0.00021384, [7] [c_1]: 9.318e-05 [parameter_eliminate]: 3.26999e-06 [updatestate_depend_eliminate]: 1.16e-05 [updatestate_assign_eliminate]: 8.27e-06 [updatestate_loads_eliminate]: 8.30999e-06 [cse]: 5.393e-05 [renormalize]: 2.20025e-07 [remove_dup_value]: 6.985e-05 [tuple_transform]: 0.00019673, [1] [Cycle 1]: 0.00019152, [4] [d_1]: 0.00015193 [none_parameter_eliminate]: 1.99e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 1.802e-05 [partial_unused_args_eliminate]: 2.29001e-06 [add_recomputation]: 0.0002537 [cse_after_recomputation]: 6.275e-05, [1] [Cycle 1]: 5.602e-05, [1] [cse]: 4.862e-05 [environ_conv]: 1.975e-05 [swap_dp_allreduce_reducescatter]: 1.451e-05 [bias_add_comm_swap]: 3.55e-06 [label_micro_interleaved_index]: 5.94e-06 [label_fine_grained_interleaved_index]: 2.98e-06 [merge_cast_opt]: 1.89999e-06 [slice_recompute_activation]: 2.48998e-06 [micro_interleaved_order_control]: 2.47001e-06 [assign_add_opt]: 1.40999e-06 [ForceFp32Comm]: 9.5999e-07 [remove_cast_before_assign_add]: 1.29e-06 [full_micro_interleaved_order_control]: 2.37001e-06 [reorder_send_recv_between_fp_bp]: 3.09001e-06 [comm_op_add_attrs]: 1.34998e-06 [add_comm_op_reuse_tag]: 1.32e-06 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.42e-06 [overlap_opt_shard_in_pipeline]: 1.82999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.03997e-06 [control_data_broadcast_order]: 4.032e-05 [grouped_pairwise_exchange_alltoall]: 1.65001e-06 [offloading_packed_experts]: 1.025e-05 [overlap_recompute_and_grad_model_parallel]: 1.107e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49e-06 [overlap_recompute_comm]: 2.52001e-06 [overlap_grad_ring_attention]: 1.007e-05 [overlap_grad_flash_sp]: 5.005e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.75002e-06 [split_layernorm_comm]: 1.68002e-06 [handle_group_info]: 1.04998e-06 [symbol_engine_optimizer]: 0.00016977, [1] [Cycle 1]: 0.00016461, [6] [build]: 1.837e-05 [elim_shapecalc]: 2.338e-05 [elim_not_effective]: 3.632e-05 [opt_reshape]: 2.054e-05 [fold_const_symbol]: 3.291e-05 [renormalize]: 1.50001e-07 [detach_backward]: 2.29999e-06 [pipeline_parallel_scheduler]: 1.971e-05 [auto_monad_reorder]: 6.482e-05 [get_jit_bprop_graph]: 2.06e-06 [rewriter_after_jit_bprop_graph]: 5.69e-06 [opt_after_jit_grad]: 0.00059997 [validate]: 8.361e-05 [backend_pass]: 1.45999e-06 [task_emit]: 0.0600695 [execute]: 1.124e-05 Sums bootstrap : 0.001278s : 0.22% type_inference : 0.208968s : 35.56% event_method : 0.000099s : 0.02% auto_monad : 0.000549s : 0.09% graph_reusing : 0.000021s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000114s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000025s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000155s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.00% optimize.rewriter_before_opt_a : 0.000722s : 0.12% optimize.opt_a.expand_dump_flag : 0.000024s : 0.00% optimize.opt_a.switch_simplify : 0.000573s : 0.10% optimize.opt_a.loop_unroll : 0.000372s : 0.06% optimize.opt_a.a_1 : 0.013177s : 2.24% optimize.opt_a.with_stream_mark : 0.000141s : 0.02% optimize.opt_a.recompute_prepare : 0.000120s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000125s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000054s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000049s : 0.01% optimize.opt_a.parameter_eliminate : 0.000012s : 0.00% optimize.opt_a.a_2 : 0.001601s : 0.27% optimize.opt_a.accelerated_algorithm : 0.000147s : 0.03% optimize.opt_a.shard : 0.000009s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000031s : 0.01% optimize.opt_a.shard_inline : 0.000078s : 0.01% optimize.opt_a.merge_send_recv : 0.000079s : 0.01% optimize.opt_a.auto_parallel : 0.000069s : 0.01% optimize.opt_a.parallel : 0.000071s : 0.01% optimize.opt_a.flash_sp : 0.000025s : 0.00% optimize.opt_a.merge_comm : 0.000046s : 0.01% optimize.opt_a.allreduce_fusion : 0.000045s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000114s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000005s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000085s : 0.01% optimize.opt_a.virtual_dataset : 0.000077s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000075s : 0.01% optimize.opt_a.virtual_output : 0.000076s : 0.01% optimize.opt_a.merge_forward : 0.000044s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000009s : 0.00% optimize.opt_a.offload_activation : 0.000092s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000147s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000134s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000048s : 0.01% optimize.opt_a.meta_fg_expand : 0.072817s : 12.39% optimize.opt_a.flash_sp_send_recv_attached : 0.000013s : 0.00% optimize.opt_a.receive_attached : 0.000011s : 0.00% optimize.opt_a.after_resolve : 0.000340s : 0.06% optimize.opt_a.a_after_grad : 0.000452s : 0.08% optimize.opt_a.renormalize : 0.214082s : 36.43% optimize.opt_a.add_forward_monad_depend : 0.000107s : 0.02% optimize.opt_a.auto_monad_grad : 0.000069s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000476s : 0.08% optimize.opt_a.cse : 0.001149s : 0.20% optimize.opt_a.a_3 : 0.004111s : 0.70% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000059s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.001164s : 0.20% optimize.opt_b.b_1 : 0.000430s : 0.07% optimize.opt_b.b_2 : 0.000018s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000063s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.01% optimize.loop_unroll : 0.000515s : 0.09% optimize.opt_after_cconv.c_1 : 0.000093s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.cse : 0.000054s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000070s : 0.01% optimize.tuple_transform.d_1 : 0.000152s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000018s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000254s : 0.04% optimize.cse_after_recomputation.cse : 0.000049s : 0.01% optimize.environ_conv : 0.000020s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000015s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000040s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000010s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000011s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000010s : 0.00% optimize.overlap_grad_flash_sp : 0.000050s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000018s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000023s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000036s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000021s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000033s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000020s : 0.00% auto_monad_reorder : 0.000065s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000600s : 0.10% validate : 0.000084s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 0.060069s : 10.22% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.005782 750 0.29% : 0.000017s : 1: substitution.arithmetic_simplify 0.43% : 0.000025s : 8: substitution.depend_value_elim 0.11% : 0.000006s : 10: substitution.elim_not_effective 0.20% : 0.000012s : 14: substitution.float_depend_g_call 0.47% : 0.000027s : 23: substitution.float_tuple_getitem_switch 0.09% : 0.000005s : 10: substitution.fold_const_symbol 34.02% : 0.001967s : 9: substitution.getattr_setattr_resolve 0.20% : 0.000011s : 13: substitution.graph_param_transform 0.05% : 0.000003s : 2: substitution.incorporate_call 0.04% : 0.000002s : 2: substitution.incorporate_call_switch 41.96% : 0.002426s : 45: substitution.inline 1.14% : 0.000066s : 8: substitution.inline_without_move 0.42% : 0.000024s : 50: substitution.j_node_and_user_rematch 0.88% : 0.000051s : 5: substitution.less_batch_normalization 0.99% : 0.000057s : 36: substitution.minmaximum_grad 0.55% : 0.000032s : 14: substitution.partial_eliminate 0.57% : 0.000033s : 50: substitution.remove_not_recompute_node 3.67% : 0.000212s : 86: substitution.replace_applicator 0.52% : 0.000030s : 40: substitution.replace_old_param 0.14% : 0.000008s : 2: substitution.set_cell_output_no_recompute 0.38% : 0.000022s : 8: substitution.switch_simplify 3.35% : 0.000194s : 42: substitution.tuple_list_convert_item_index_to_positive 1.00% : 0.000058s : 44: substitution.tuple_list_get_item_const_eliminator 1.60% : 0.000093s : 44: substitution.tuple_list_get_item_depend_reorder 3.68% : 0.000213s : 83: substitution.tuple_list_get_item_eliminator 1.38% : 0.000080s : 44: substitution.tuple_list_get_set_item_eliminator 0.69% : 0.000040s : 24: substitution.updatestate_pure_node_eliminater 1.19% : 0.000069s : 33: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.207885 2 96.45% : 0.200512s : 1: type_inference.infer 3.55% : 0.007373s : 1: type_inference.specialize ------[replace.] 0.001698 99 1.21% : 0.000020s : 1: replace.arithmetic_simplify 8.31% : 0.000141s : 7: replace.getattr_setattr_resolve 35.00% : 0.000594s : 45: replace.inline 14.13% : 0.000240s : 15: replace.replace_applicator 6.02% : 0.000102s : 8: replace.switch_simplify 1.21% : 0.000021s : 2: replace.tuple_list_get_item_depend_reorder 31.95% : 0.000542s : 20: replace.tuple_list_get_item_eliminator 2.17% : 0.000037s : 1: replace.updatestate_useless_node_eliminater ------[match.] 0.004429 99 0.35% : 0.000016s : 1: match.arithmetic_simplify 41.98% : 0.001859s : 7: match.getattr_setattr_resolve 54.13% : 0.002397s : 45: match.inline 1.53% : 0.000068s : 15: match.replace_applicator 0.38% : 0.000017s : 8: match.switch_simplify 0.34% : 0.000015s : 2: match.tuple_list_get_item_depend_reorder 1.11% : 0.000049s : 20: match.tuple_list_get_item_eliminator 0.18% : 0.000008s : 1: match.updatestate_useless_node_eliminater ------[predicate.] 0.002448 16344 0.92% : 0.000023s : 155: predicate.accumulaten_eliminater 0.17% : 0.000004s : 13: predicate.ad_related_special_op_eliminate 0.46% : 0.000011s : 87: predicate.addn_check_dump 1.00% : 0.000025s : 155: predicate.addn_zero_filter 0.86% : 0.000021s : 155: predicate.adjust_all_reduce_mul_add 1.93% : 0.000047s : 237: predicate.arithmetic_simplify 0.89% : 0.000022s : 156: predicate.cast_eliminate 3.00% : 0.000073s : 554: predicate.check_bprop_eliminate 0.48% : 0.000012s : 87: predicate.compare_switch_simplify 0.04% : 0.000001s : 13: predicate.const_output_eliminate 0.48% : 0.000012s : 81: predicate.depend_value_elim 0.94% : 0.000023s : 156: predicate.dict_get_item_const_eliminator 1.00% : 0.000025s : 156: predicate.dict_get_item_eliminator 0.96% : 0.000024s : 156: predicate.dict_set_item_eliminator 0.21% : 0.000005s : 26: predicate.dumpgradient_eliminate 0.05% : 0.000001s : 13: predicate.elim_not_effective 0.09% : 0.000002s : 13: predicate.elim_shapecalc_of_broadcastargs 0.94% : 0.000023s : 169: predicate.environ_add_const_eliminate 0.92% : 0.000023s : 169: predicate.environ_get_add_eliminate 0.99% : 0.000024s : 169: predicate.environ_get_depend_swap 1.49% : 0.000036s : 250: predicate.environ_get_eliminate 0.93% : 0.000023s : 169: predicate.environ_get_set_eliminate 1.31% : 0.000032s : 223: predicate.exchange_switch_depend_value 1.76% : 0.000043s : 223: predicate.float_depend_g_call 0.48% : 0.000012s : 87: predicate.float_environ_get_switch 0.64% : 0.000016s : 100: predicate.float_tuple_getitem_switch 0.04% : 0.000001s : 13: predicate.fold_const_symbol 0.43% : 0.000011s : 65: predicate.get_grad_eliminate 0.61% : 0.000015s : 51: predicate.getattr_setattr_resolve 0.05% : 0.000001s : 13: predicate.graph_param_transform 0.45% : 0.000011s : 81: predicate.incorporate_call 0.42% : 0.000010s : 81: predicate.incorporate_call_switch 4.19% : 0.000103s : 556: predicate.inline 1.69% : 0.000041s : 222: predicate.inline_without_move 0.19% : 0.000005s : 65: predicate.j_node_and_user_rematch 0.58% : 0.000014s : 68: predicate.less_batch_normalization 1.24% : 0.000030s : 204: predicate.list_to_tuple_eliminator_ 2.00% : 0.000049s : 359: predicate.load_eliminater 0.17% : 0.000004s : 13: predicate.loop_unroll_after_grad 2.36% : 0.000058s : 410: predicate.loop_unroll_before_grad 2.02% : 0.000049s : 184: predicate.make_slice_get_slice_eliminator 0.50% : 0.000012s : 87: predicate.merge_addn 2.73% : 0.000067s : 500: predicate.micro_step_allgather_replace 2.79% : 0.000068s : 500: predicate.mini_step_allgather_replace 0.92% : 0.000022s : 156: predicate.minmaximum_grad 0.19% : 0.000005s : 13: predicate.mutable_eliminate 0.08% : 0.000002s : 13: predicate.opt_reshape 0.10% : 0.000002s : 13: predicate.parallel_virtual_node 2.19% : 0.000054s : 223: predicate.partial_defer_inline 1.20% : 0.000029s : 191: predicate.partial_eliminate 0.95% : 0.000023s : 155: predicate.print_const_string_wrapper 0.50% : 0.000012s : 81: predicate.reduce_all_const_elim 1.32% : 0.000032s : 156: predicate.reduce_eliminate 1.99% : 0.000049s : 359: predicate.redundant_stop_gradient_eliminater 0.19% : 0.000005s : 65: predicate.remove_not_recompute_node 2.47% : 0.000061s : 708: predicate.replace_applicator 0.70% : 0.000017s : 222: predicate.replace_old_param 0.04% : 0.000001s : 13: predicate.reset_defer_inline 0.98% : 0.000024s : 156: predicate.reshape_eliminate 2.94% : 0.000072s : 500: predicate.row_tensor_add_zeros_like 0.10% : 0.000002s : 13: predicate.row_tensor_eliminate 3.28% : 0.000080s : 554: predicate.same_eliminate 0.26% : 0.000006s : 77: predicate.set_cell_output_no_recompute 0.44% : 0.000011s : 65: predicate.shard_identity_eliminate 0.21% : 0.000005s : 26: predicate.special_op_eliminate 0.57% : 0.000014s : 87: predicate.specialize_transform 2.89% : 0.000071s : 500: predicate.split_environ_get_set_with_tuple_value 1.49% : 0.000036s : 222: predicate.stack_unstack_eliminate 0.08% : 0.000002s : 13: predicate.switch_call_monad_eliminater 1.44% : 0.000035s : 223: predicate.switch_defer_inline 4.55% : 0.000111s : 777: predicate.switch_layer_defer_inline 4.57% : 0.000112s : 749: predicate.switch_simplify 1.04% : 0.000025s : 156: predicate.tile_eliminate 0.94% : 0.000023s : 156: predicate.transpose_eliminate 1.20% : 0.000029s : 182: predicate.tuple_list_convert_item_index_to_positive 1.22% : 0.000030s : 184: predicate.tuple_list_get_item_const_eliminator 1.18% : 0.000029s : 184: predicate.tuple_list_get_item_depend_reorder 2.33% : 0.000057s : 285: predicate.tuple_list_get_item_eliminator 1.26% : 0.000031s : 184: predicate.tuple_list_get_set_item_eliminator 1.83% : 0.000045s : 265: predicate.tuple_list_set_item_eliminator 1.26% : 0.000031s : 204: predicate.tuple_to_list_eliminator_ 2.00% : 0.000049s : 359: predicate.updatestate_pure_node_eliminater 2.52% : 0.000062s : 442: predicate.updatestate_useless_node_eliminater 0.10% : 0.000002s : 13: predicate.value_based_eliminate 0.42% : 0.000010s : 65: predicate.virtual_dataset_eliminate 0.43% : 0.000011s : 65: predicate.virtual_output_eliminate 0.07% : 0.000002s : 13: predicate.virtual_view_grad_eliminate 0.09% : 0.000002s : 13: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.016539 177 63.75% : 0.010544s : 88: func_graph_cloner_run.FuncGraphClonerGraph 1.47% : 0.000243s : 3: func_graph_cloner_run.FuncGraphClonerNode 34.78% : 0.005752s : 86: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 1.170206 307 0.00% : 0.000004s : 1: ForceFp32Comm 1.07% : 0.012547s : 1: add_attr 1.07% : 0.012526s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000261s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.05% : 0.000562s : 1: auto_monad 0.01% : 0.000070s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.11% : 0.001331s : 1: bootstrap 0.00% : 0.000037s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.00% : 0.000044s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000066s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000023s : 1: environ_conv 0.01% : 0.000109s : 1: event_method 0.00% : 0.000017s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000027s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000008s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.04% : 0.000525s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.10% : 0.001178s : 1: mutable_eliminate 0.00% : 0.000014s : 1: offloading_packed_experts 0.00% : 0.000027s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000033s : 1: opt.transform.mutable_eliminate 1.82% : 0.021285s : 181: opt.transform.opt_a 0.01% : 0.000091s : 1: opt.transform.opt_after_cconv 0.01% : 0.000060s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000414s : 28: opt.transform.opt_b 0.19% : 0.002255s : 4: opt.transform.opt_resolve 0.01% : 0.000167s : 2: opt.transform.opt_trans_graph 0.01% : 0.000109s : 4: opt.transform.symbol_engine_opt 26.72% : 0.312657s : 1: opt_a 0.02% : 0.000223s : 1: opt_after_cconv 0.05% : 0.000612s : 1: opt_after_jit_grad 0.05% : 0.000594s : 1: opt_b 27.11% : 0.317251s : 1: optimize 0.00% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000054s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000014s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000025s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000161s : 1: pre_auto_parallel 0.00% : 0.000013s : 1: py_interpret_to_execute 0.00% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000074s : 1: remove_dup_value 14.16% : 0.165668s : 3: renormalize.infer 4.13% : 0.048371s : 3: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000064s : 1: rewriter_after_opt_a 0.06% : 0.000733s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.00% : 0.000017s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000173s : 1: symbol_engine_optimizer 5.14% : 0.060093s : 1: task_emit 0.02% : 0.000200s : 1: tuple_transform 17.86% : 0.209010s : 1: type_inference 0.01% : 0.000141s : 1: validate TotalTime = 0.533665, [33] [bootstrap]: 0.00063042 [type_inference]: 0.385677 [event_method]: 0.00241751 [auto_monad]: 0.0001711 [graph_reusing]: 8.07e-06 [pre_auto_parallel]: 3.33e-06 [py_interpret_to_execute]: 5.7e-05 [rewriter_before_opt_a]: 0.00018773 [expand_dump_flag]: 3.49001e-06 [jit_opt_a]: 0.0283009, [3] [Cycle 1]: 0.0171462, [27] [switch_simplify]: 0.00010939 [loop_unroll]: 7.026e-05 [a_1]: 0.00173908 [with_stream_mark]: 4.125e-05 [recompute_prepare]: 3.424e-05 [updatestate_depend_eliminate]: 1.236e-05 [updatestate_assign_eliminate]: 9.89999e-06 [updatestate_loads_eliminate]: 9.45001e-06 [parameter_eliminate]: 4.05e-06 [specialize_transform]: 2.371e-05 [updatestate_useless_node_eliminater]: 2.09e-05 [accelerated_algorithm]: 8.317e-05 [meta_shard_fg_expand]: 5.70001e-06 [get_grad_eliminate_]: 2.343e-05 [merge_forward]: 1.232e-05 [cell_reuse_recompute_pass]: 1.10001e-06 [cell_reuse_handle_not_recompute_node_pass]: 4.113e-05 [j_node_and_user_rematch]: 3.606e-05 [meta_fg_expand]: 0.00264994 [replace_old_param]: 9.748e-05 [inline_without_move]: 8.504e-05 [renormalize]: 0.0110802 [add_forward_monad_depend]: 2.815e-05 [auto_monad_grad]: 9.29998e-06 [auto_monad_eliminator]: 0.00010485 [cse]: 0.00038236 [replace_applicator]: 0.00013861 [Cycle 2]: 0.00558575, [27] [switch_simplify]: 0.00014775 [loop_unroll]: 8.102e-05 [a_1]: 0.00235721 [with_stream_mark]: 2.638e-05 [recompute_prepare]: 2.248e-05 [updatestate_depend_eliminate]: 1.043e-05 [updatestate_assign_eliminate]: 8.77e-06 [updatestate_loads_eliminate]: 8.1e-06 [parameter_eliminate]: 2.67001e-06 [specialize_transform]: 1.927e-05 [updatestate_useless_node_eliminater]: 1.947e-05 [accelerated_algorithm]: 2.687e-05 [meta_shard_fg_expand]: 5.32001e-06 [get_grad_eliminate_]: 1.926e-05 [merge_forward]: 9.66998e-06 [cell_reuse_recompute_pass]: 1.06997e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.378e-05 [j_node_and_user_rematch]: 2.967e-05 [meta_fg_expand]: 0.00021769 [replace_old_param]: 3e-05 [inline_without_move]: 2.014e-05 [renormalize]: 0.00197909 [add_forward_monad_depend]: 7.93001e-06 [auto_monad_grad]: 2.58e-06 [auto_monad_eliminator]: 3.086e-05 [cse]: 0.00023609 [replace_applicator]: 3.32e-05 [Cycle 3]: 0.00111089, [27] [switch_simplify]: 2.144e-05 [loop_unroll]: 1.92e-05 [a_1]: 0.00057012 [with_stream_mark]: 2.14e-05 [recompute_prepare]: 1.899e-05 [updatestate_depend_eliminate]: 1.014e-05 [updatestate_assign_eliminate]: 8.77999e-06 [updatestate_loads_eliminate]: 7.45e-06 [parameter_eliminate]: 1.51998e-06 [specialize_transform]: 1.879e-05 [updatestate_useless_node_eliminater]: 1.796e-05 [accelerated_algorithm]: 2.463e-05 [meta_shard_fg_expand]: 3.55e-06 [get_grad_eliminate_]: 1.812e-05 [merge_forward]: 8.74998e-06 [cell_reuse_recompute_pass]: 2.29999e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.233e-05 [j_node_and_user_rematch]: 2.883e-05 [meta_fg_expand]: 6.53998e-06 [replace_old_param]: 2.389e-05 [inline_without_move]: 1.877e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.20002e-06 [auto_monad_grad]: 1.42e-06 [auto_monad_eliminator]: 1.825e-05 [cse]: 6.159e-05 [replace_applicator]: 1.983e-05 [py_interpret_to_execute_after_opt_a]: 2.396e-05 [rewriter_after_opt_a]: 0.00011394 [convert_after_rewriter]: 1.502e-05 [order_py_execute_after_rewriter]: 1.048e-05 [mutable_eliminate]: 0.00077638 [jit_opt_b]: 0.00016597, [1] [Cycle 1]: 0.00015793, [2] [frontend_op_eliminate]: 5.601e-05 [inline_after_opt_a]: 8.852e-05 [cconv]: 3.302e-05 [loop_unroll]: 0.00051591 [jit_opt_after_cconv]: 0.00038495, [1] [Cycle 1]: 0.00037765, [11] [c_1]: 0.00010044 [parameter_eliminate]: 3.33e-06 [updatestate_depend_eliminate]: 1.419e-05 [updatestate_assign_eliminate]: 8.84e-06 [updatestate_loads_eliminate]: 7.88001e-06 [cse]: 7.091e-05 [call_graph_tuple_transform]: 5.488e-05 [tuple_list_get_item_eliminator]: 3.282e-05 [none_parameter_eliminate]: 2.02001e-06 [renormalize]: 1.11997e-06 [switch_simplify]: 2.104e-05 [remove_dup_value]: 8.911e-05 [partial_unused_args_eliminate]: 2.69001e-06 [environ_conv]: 1.612e-05 [add_recomputation]: 9.793e-05 [cse_after_recomputation]: 5.683e-05, [1] [Cycle 1]: 5.025e-05, [1] [cse]: 4.266e-05 [auto_monad_reorder]: 3.414e-05 [get_jit_bprop_graph]: 2.17001e-06 [rewriter_after_jit_bprop_graph]: 6.33e-06 [opt_after_jit_grad]: 0.00053768 [symbol_engine_optimizer]: 0.00016363, [1] [Cycle 1]: 0.00015702, [6] [build]: 1.413e-05 [elim_shapecalc]: 2.251e-05 [elim_not_effective]: 3.721e-05 [opt_reshape]: 2.062e-05 [fold_const_symbol]: 3.24e-05 [renormalize]: 5.8001e-07 [validate]: 9.685e-05 [backend_pass]: 1.63002e-06 [task_emit]: 0.112745 [execute]: 9.64e-06 Sums bootstrap : 0.000630s : 0.12% type_inference : 0.385677s : 73.03% event_method : 0.002418s : 0.46% auto_monad : 0.000171s : 0.03% graph_reusing : 0.000008s : 0.00% pre_auto_parallel : 0.000003s : 0.00% py_interpret_to_execute : 0.000057s : 0.01% rewriter_before_opt_a : 0.000188s : 0.04% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000279s : 0.05% jit_opt_a.loop_unroll : 0.000170s : 0.03% jit_opt_a.a_1 : 0.004666s : 0.88% jit_opt_a.with_stream_mark : 0.000089s : 0.02% jit_opt_a.recompute_prepare : 0.000076s : 0.01% jit_opt_a.updatestate_depend_eliminate : 0.000033s : 0.01% jit_opt_a.updatestate_assign_eliminate : 0.000027s : 0.01% jit_opt_a.updatestate_loads_eliminate : 0.000025s : 0.00% jit_opt_a.parameter_eliminate : 0.000008s : 0.00% jit_opt_a.specialize_transform : 0.000062s : 0.01% jit_opt_a.updatestate_useless_node_eliminater : 0.000058s : 0.01% jit_opt_a.accelerated_algorithm : 0.000135s : 0.03% jit_opt_a.meta_shard_fg_expand : 0.000015s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000061s : 0.01% jit_opt_a.merge_forward : 0.000031s : 0.01% jit_opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000107s : 0.02% jit_opt_a.j_node_and_user_rematch : 0.000095s : 0.02% jit_opt_a.meta_fg_expand : 0.002874s : 0.54% jit_opt_a.replace_old_param : 0.000151s : 0.03% jit_opt_a.inline_without_move : 0.000124s : 0.02% jit_opt_a.renormalize : 0.013059s : 2.47% jit_opt_a.add_forward_monad_depend : 0.000038s : 0.01% jit_opt_a.auto_monad_grad : 0.000013s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000154s : 0.03% jit_opt_a.cse : 0.000680s : 0.13% jit_opt_a.replace_applicator : 0.000192s : 0.04% py_interpret_to_execute_after_opt_a : 0.000024s : 0.00% rewriter_after_opt_a : 0.000114s : 0.02% convert_after_rewriter : 0.000015s : 0.00% order_py_execute_after_rewriter : 0.000010s : 0.00% mutable_eliminate : 0.000776s : 0.15% jit_opt_b.frontend_op_eliminate : 0.000056s : 0.01% jit_opt_b.inline_after_opt_a : 0.000089s : 0.02% cconv : 0.000033s : 0.01% loop_unroll : 0.000516s : 0.10% jit_opt_after_cconv.c_1 : 0.000100s : 0.02% jit_opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000009s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000008s : 0.00% jit_opt_after_cconv.cse : 0.000071s : 0.01% jit_opt_after_cconv.call_graph_tuple_transform : 0.000055s : 0.01% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000033s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000021s : 0.00% remove_dup_value : 0.000089s : 0.02% partial_unused_args_eliminate : 0.000003s : 0.00% environ_conv : 0.000016s : 0.00% add_recomputation : 0.000098s : 0.02% cse_after_recomputation.cse : 0.000043s : 0.01% auto_monad_reorder : 0.000034s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000538s : 0.10% symbol_engine_optimizer.build : 0.000014s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000023s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000037s : 0.01% symbol_engine_optimizer.opt_reshape : 0.000021s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000032s : 0.01% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000097s : 0.02% backend_pass : 0.000002s : 0.00% task_emit : 0.112745s : 21.35% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.001195 275 0.44% : 0.000005s : 9: substitution.elim_not_effective 0.40% : 0.000005s : 9: substitution.fold_const_symbol 1.16% : 0.000014s : 17: substitution.graph_param_transform 53.49% : 0.000639s : 16: substitution.inline 2.08% : 0.000025s : 2: substitution.inline_without_move 1.34% : 0.000016s : 29: substitution.j_node_and_user_rematch 5.02% : 0.000060s : 3: substitution.less_batch_normalization 2.79% : 0.000033s : 23: substitution.minmaximum_grad 2.40% : 0.000029s : 5: substitution.partial_eliminate 1.62% : 0.000019s : 29: substitution.remove_not_recompute_node 3.03% : 0.000036s : 12: substitution.replace_applicator 1.35% : 0.000016s : 21: substitution.replace_old_param 0.30% : 0.000004s : 1: substitution.set_cell_output_no_recompute 2.00% : 0.000024s : 2: substitution.switch_simplify 7.00% : 0.000084s : 23: substitution.tuple_list_convert_item_index_to_positive 3.59% : 0.000043s : 23: substitution.tuple_list_get_item_depend_reorder 12.00% : 0.000143s : 51: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.385504 2 98.53% : 0.379846s : 1: type_inference.infer 1.47% : 0.005658s : 1: type_inference.specialize ------[replace.] 0.000359 40 48.33% : 0.000174s : 16: replace.inline 8.35% : 0.000030s : 2: replace.switch_simplify 43.32% : 0.000156s : 22: replace.tuple_list_get_item_eliminator ------[match.] 0.000697 40 90.08% : 0.000628s : 16: match.inline 3.11% : 0.000022s : 2: match.switch_simplify 6.81% : 0.000047s : 22: match.tuple_list_get_item_eliminator ------[predicate.] 0.000884 6056 1.31% : 0.000012s : 96: predicate.accumulaten_eliminater 0.41% : 0.000004s : 16: predicate.ad_related_special_op_eliminate 1.22% : 0.000011s : 96: predicate.addn_check_dump 1.40% : 0.000012s : 96: predicate.addn_zero_filter 9.48% : 0.000084s : 96: predicate.arithmetic_simplify 1.30% : 0.000011s : 96: predicate.cast_eliminate 0.25% : 0.000002s : 17: predicate.check_bprop_eliminate 1.30% : 0.000011s : 96: predicate.compare_switch_simplify 1.33% : 0.000012s : 96: predicate.depend_value_elim 1.24% : 0.000011s : 96: predicate.dict_get_item_const_eliminator 1.34% : 0.000012s : 96: predicate.dict_get_item_eliminator 1.27% : 0.000011s : 96: predicate.dict_set_item_eliminator 0.28% : 0.000002s : 16: predicate.dumpgradient_eliminate 0.15% : 0.000001s : 16: predicate.elim_not_effective 0.29% : 0.000003s : 16: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000011s : 96: predicate.environ_add_const_eliminate 1.24% : 0.000011s : 96: predicate.environ_get_add_eliminate 1.25% : 0.000011s : 96: predicate.environ_get_depend_swap 1.28% : 0.000011s : 96: predicate.environ_get_eliminate 1.29% : 0.000011s : 96: predicate.environ_get_set_eliminate 0.14% : 0.000001s : 16: predicate.fold_const_symbol 0.86% : 0.000008s : 55: predicate.get_grad_eliminate 0.15% : 0.000001s : 17: predicate.graph_param_transform 3.98% : 0.000035s : 168: predicate.inline 1.54% : 0.000014s : 84: predicate.inline_without_move 0.48% : 0.000004s : 55: predicate.j_node_and_user_rematch 1.07% : 0.000010s : 55: predicate.less_batch_normalization 1.69% : 0.000015s : 118: predicate.list_to_tuple_eliminator_ 1.98% : 0.000018s : 135: predicate.load_eliminater 0.51% : 0.000005s : 17: predicate.loop_unroll_after_grad 2.83% : 0.000025s : 177: predicate.loop_unroll_before_grad 1.62% : 0.000014s : 113: predicate.make_slice_get_slice_eliminator 1.26% : 0.000011s : 96: predicate.merge_addn 1.34% : 0.000012s : 96: predicate.minmaximum_grad 0.48% : 0.000004s : 17: predicate.mutable_eliminate 0.27% : 0.000002s : 16: predicate.opt_reshape 2.45% : 0.000022s : 135: predicate.partial_eliminate 1.33% : 0.000012s : 96: predicate.print_const_string_wrapper 1.81% : 0.000016s : 96: predicate.reduce_eliminate 1.74% : 0.000015s : 118: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000004s : 55: predicate.remove_not_recompute_node 2.85% : 0.000025s : 238: predicate.replace_applicator 0.87% : 0.000008s : 84: predicate.replace_old_param 0.20% : 0.000002s : 17: predicate.reset_defer_inline 1.33% : 0.000012s : 96: predicate.reshape_eliminate 1.29% : 0.000011s : 96: predicate.row_tensor_add_zeros_like 0.29% : 0.000003s : 17: predicate.row_tensor_eliminate 1.39% : 0.000012s : 96: predicate.same_eliminate 0.57% : 0.000005s : 55: predicate.set_cell_output_no_recompute 0.52% : 0.000005s : 33: predicate.special_op_eliminate 0.88% : 0.000008s : 55: predicate.specialize_transform 1.64% : 0.000015s : 96: predicate.split_environ_get_set_with_tuple_value 1.39% : 0.000012s : 96: predicate.stack_unstack_eliminate 0.29% : 0.000003s : 17: predicate.switch_call_monad_eliminater 2.56% : 0.000023s : 134: predicate.switch_defer_inline 2.05% : 0.000018s : 134: predicate.switch_layer_defer_inline 5.43% : 0.000048s : 332: predicate.switch_simplify 1.31% : 0.000012s : 96: predicate.tile_eliminate 1.28% : 0.000011s : 96: predicate.transpose_eliminate 1.63% : 0.000014s : 96: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000014s : 96: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000030s : 152: predicate.tuple_list_get_item_eliminator 1.70% : 0.000015s : 96: predicate.tuple_list_set_item_eliminator 1.67% : 0.000015s : 118: predicate.tuple_to_list_eliminator_ 1.91% : 0.000017s : 135: predicate.updatestate_pure_node_eliminater 2.91% : 0.000026s : 190: predicate.updatestate_useless_node_eliminater 1.73% : 0.000015s : 96: predicate.value_based_eliminate 0.21% : 0.000002s : 16: predicate.virtual_view_grad_eliminate 0.28% : 0.000002s : 17: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005358 51 75.19% : 0.004028s : 31: func_graph_cloner_run.FuncGraphClonerGraph 24.81% : 0.001329s : 20: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.553332 91 0.02% : 0.000101s : 1: add_recomputation 0.03% : 0.000178s : 1: auto_monad 0.01% : 0.000037s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.12% : 0.000653s : 1: bootstrap 0.01% : 0.000036s : 1: cconv 0.00% : 0.000017s : 1: convert_after_rewriter 0.01% : 0.000059s : 1: cse_after_recomputation 0.00% : 0.000019s : 1: environ_conv 0.44% : 0.002440s : 1: event_method 0.00% : 0.000016s : 1: execute 0.00% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 5.12% : 0.028304s : 1: jit_opt_a 0.07% : 0.000388s : 1: jit_opt_after_cconv 0.03% : 0.000169s : 1: jit_opt_b 0.10% : 0.000526s : 1: loop_unroll 0.14% : 0.000786s : 1: mutable_eliminate 1.11% : 0.006115s : 39: opt.transform.jit_opt_a 0.04% : 0.000205s : 4: opt.transform.jit_opt_after_cconv 0.02% : 0.000137s : 4: opt.transform.jit_opt_b 0.01% : 0.000032s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000031s : 1: opt.transform.mutable_eliminate 0.01% : 0.000064s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000109s : 4: opt.transform.symbol_engine_opt 0.10% : 0.000547s : 1: opt_after_jit_grad 0.00% : 0.000013s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pre_auto_parallel 0.01% : 0.000060s : 1: py_interpret_to_execute 0.00% : 0.000027s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000093s : 1: remove_dup_value 1.57% : 0.008661s : 2: renormalize.infer 0.79% : 0.004376s : 2: renormalize.specialize 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000118s : 1: rewriter_after_opt_a 0.03% : 0.000191s : 1: rewriter_before_opt_a 0.03% : 0.000166s : 1: symbol_engine_optimizer 20.38% : 0.112771s : 1: task_emit 69.70% : 0.385699s : 1: type_inference 0.03% : 0.000144s : 1: validate TotalTime = 0.613466, [24] [bootstrap]: 0.00129204 [type_inference]: 0.330378 [event_method]: 7.808e-05 [auto_monad]: 0.00049823 [graph_reusing]: 1.969e-05 [inline]: 3.71001e-06 [add_attr]: 0.00693118, [1] [add_attr_with_inline]: 0.00688353, [1] [Cycle 1]: 0.00019037, [2] [tag_attr]: 0.00010994 [meta_addattr_fg_expand]: 2.532e-05 [parallel-infer-symbol]: 3.9e-06 [pre_auto_parallel]: 0.0001537 [insert-virtual-dataset]: 2.78e-06 [parallel-infer-symbol-second]: 1.17e-06 [dataset_repeat_opt]: 2.29999e-06 [pipeline_split]: 1.94e-06 [optimize]: 0.233384, [53] [py_interpret_to_execute]: 8.07998e-06 [rewriter_before_opt_a]: 0.00072871 [opt_a]: 0.21277, [4] [Cycle 1]: 0.190082, [45] [expand_dump_flag]: 8.66002e-06 [switch_simplify]: 0.00032587 [loop_unroll]: 0.00013163 [a_1]: 0.00393206 [with_stream_mark]: 4.582e-05 [recompute_prepare]: 4.126e-05 [updatestate_depend_eliminate]: 4.998e-05 [updatestate_assign_eliminate]: 4.709e-05 [updatestate_loads_eliminate]: 1.551e-05 [parameter_eliminate]: 3.69002e-06 [a_2]: 0.00080831 [accelerated_algorithm]: 6.791e-05 [shard]: 2.73e-06 [meta_shard_fg_expand]: 1.66e-05 [shard_inline]: 2.773e-05 [merge_send_recv]: 3.025e-05 [auto_parallel]: 2.234e-05 [parallel]: 4.089e-05 [flash_sp]: 1.679e-05 [merge_comm]: 1.647e-05 [allreduce_fusion]: 4.363e-05 [matmul_add_comm_reduction]: 6.377e-05 [allreduce_slice_to_reducescatter]: 8.39995e-07 [virtual_shard_identity]: 3.09e-05 [virtual_dataset]: 2.548e-05 [get_grad_eliminate_]: 2.603e-05 [virtual_output]: 2.553e-05 [merge_forward]: 1.573e-05 [cell_reuse_recompute_pass]: 1.98002e-06 [offload_activation]: 2.976e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.057e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 4.48e-05 [set_forward_comm_id_for_comm_node_pass]: 1.643e-05 [meta_fg_expand]: 0.0449528 [flash_sp_send_recv_attached]: 6.26e-06 [receive_attached]: 3.01001e-06 [after_resolve]: 0.00026605 [a_after_grad]: 0.00036301 [renormalize]: 0.132797 [add_forward_monad_depend]: 0.0001285 [auto_monad_grad]: 5.723e-05 [auto_monad_eliminator]: 0.00034698 [cse]: 0.00077492 [a_3]: 0.00387257 [Cycle 2]: 0.0177083, [45] [expand_dump_flag]: 1.168e-05 [switch_simplify]: 0.00021251 [loop_unroll]: 0.00023026 [a_1]: 0.00493191 [with_stream_mark]: 5.595e-05 [recompute_prepare]: 4.109e-05 [updatestate_depend_eliminate]: 1.893e-05 [updatestate_assign_eliminate]: 1.731e-05 [updatestate_loads_eliminate]: 1.687e-05 [parameter_eliminate]: 6.37001e-06 [a_2]: 0.00071737 [accelerated_algorithm]: 4.785e-05 [shard]: 2.59999e-06 [meta_shard_fg_expand]: 9.24e-06 [shard_inline]: 2.134e-05 [merge_send_recv]: 1.877e-05 [auto_parallel]: 1.838e-05 [parallel]: 9.49e-06 [flash_sp]: 5.04e-06 [merge_comm]: 1.249e-05 [allreduce_fusion]: 1.105e-05 [matmul_add_comm_reduction]: 1.893e-05 [allreduce_slice_to_reducescatter]: 9.89996e-07 [virtual_shard_identity]: 2.129e-05 [virtual_dataset]: 1.918e-05 [get_grad_eliminate_]: 1.871e-05 [virtual_output]: 1.917e-05 [merge_forward]: 1.114e-05 [cell_reuse_recompute_pass]: 1.71998e-06 [offload_activation]: 5.586e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.091e-05 [merge_recompute_call_nodes]: 1.70001e-06 [before_grad]: 3.673e-05 [set_forward_comm_id_for_comm_node_pass]: 1.294e-05 [meta_fg_expand]: 0.00092394 [flash_sp_send_recv_attached]: 3.75e-06 [receive_attached]: 3.34001e-06 [after_resolve]: 5.301e-05 [a_after_grad]: 3.732e-05 [renormalize]: 0.00915043 [add_forward_monad_depend]: 1.371e-05 [auto_monad_grad]: 2.37001e-06 [auto_monad_eliminator]: 4.968e-05 [cse]: 0.00023511 [a_3]: 0.00016537 [Cycle 3]: 0.00334891, [45] [expand_dump_flag]: 2.83e-06 [switch_simplify]: 2.373e-05 [loop_unroll]: 1.959e-05 [a_1]: 0.00071249 [with_stream_mark]: 3.041e-05 [recompute_prepare]: 2.078e-05 [updatestate_depend_eliminate]: 5.042e-05 [updatestate_assign_eliminate]: 1.047e-05 [updatestate_loads_eliminate]: 9.85002e-06 [parameter_eliminate]: 2.42001e-06 [a_2]: 0.00031091 [accelerated_algorithm]: 2.6e-05 [shard]: 2.13998e-06 [meta_shard_fg_expand]: 5.30999e-06 [shard_inline]: 1.719e-05 [merge_send_recv]: 1.808e-05 [auto_parallel]: 1.571e-05 [parallel]: 1.058e-05 [flash_sp]: 1.79998e-06 [merge_comm]: 9.28002e-06 [allreduce_fusion]: 9.14e-06 [matmul_add_comm_reduction]: 1.651e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 1.761e-05 [virtual_dataset]: 1.644e-05 [get_grad_eliminate_]: 1.645e-05 [virtual_output]: 1.574e-05 [merge_forward]: 9.62001e-06 [cell_reuse_recompute_pass]: 3.49001e-06 [offload_activation]: 2.053e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.212e-05 [merge_recompute_call_nodes]: 2.01e-06 [before_grad]: 2.936e-05 [set_forward_comm_id_for_comm_node_pass]: 1.049e-05 [meta_fg_expand]: 8.1e-06 [flash_sp_send_recv_attached]: 1.79e-06 [receive_attached]: 2.98998e-06 [after_resolve]: 2.551e-05 [a_after_grad]: 2.594e-05 [renormalize]: 0.00124052 [add_forward_monad_depend]: 8.30999e-06 [auto_monad_grad]: 2.71999e-06 [auto_monad_eliminator]: 3.128e-05 [cse]: 0.00011254 [a_3]: 0.00012622 [Cycle 4]: 0.00160112, [45] [expand_dump_flag]: 2.24999e-06 [switch_simplify]: 1.783e-05 [loop_unroll]: 1.72e-05 [a_1]: 0.00048843 [with_stream_mark]: 1.988e-05 [recompute_prepare]: 1.739e-05 [updatestate_depend_eliminate]: 1.095e-05 [updatestate_assign_eliminate]: 8.72e-06 [updatestate_loads_eliminate]: 9.54e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 0.00024656 [accelerated_algorithm]: 2.264e-05 [shard]: 1.49e-06 [meta_shard_fg_expand]: 5.43002e-06 [shard_inline]: 1.668e-05 [merge_send_recv]: 1.358e-05 [auto_parallel]: 1.34e-05 [parallel]: 9.29e-06 [flash_sp]: 1.51002e-06 [merge_comm]: 9.57001e-06 [allreduce_fusion]: 9.05001e-06 [matmul_add_comm_reduction]: 1.724e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 1.868e-05 [virtual_dataset]: 1.554e-05 [get_grad_eliminate_]: 1.532e-05 [virtual_output]: 1.549e-05 [merge_forward]: 1.001e-05 [cell_reuse_recompute_pass]: 2.58e-06 [offload_activation]: 1.92e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.099e-05 [merge_recompute_call_nodes]: 1.15001e-06 [before_grad]: 2.813e-05 [set_forward_comm_id_for_comm_node_pass]: 9.31998e-06 [meta_fg_expand]: 7.83001e-06 [flash_sp_send_recv_attached]: 1.81998e-06 [receive_attached]: 1.84e-06 [after_resolve]: 2.319e-05 [a_after_grad]: 2.512e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.98997e-06 [auto_monad_grad]: 1.96998e-06 [auto_monad_eliminator]: 2.276e-05 [cse]: 5.856e-05 [a_3]: 0.00010682 [py_interpret_to_execute_after_opt_a]: 8.55999e-06 [slice_cell_reuse_recomputed_activation]: 1.93997e-06 [rewriter_after_opt_a]: 5.436e-05 [convert_after_rewriter]: 2.143e-05 [order_py_execute_after_rewriter]: 1.60001e-06 [mutable_eliminate]: 0.00090702 [opt_b]: 0.0006951, [1] [Cycle 1]: 0.00068583, [7] [b_1]: 0.00046035 [b_2]: 1.947e-05 [updatestate_depend_eliminate]: 2.118e-05 [updatestate_assign_eliminate]: 9.64e-06 [updatestate_loads_eliminate]: 1.068e-05 [renormalize]: 1.54e-06 [cse]: 0.0001193 [optimize_parallel_all_gather_comm]: 8.431e-05 [overlap_param_gather]: 2.29999e-06 [cconv]: 4.304e-05 [loop_unroll]: 0.00055059 [opt_after_cconv]: 0.00023498, [1] [Cycle 1]: 0.00022892, [7] [c_1]: 9.873e-05 [parameter_eliminate]: 6.38e-06 [updatestate_depend_eliminate]: 1.281e-05 [updatestate_assign_eliminate]: 8.52e-06 [updatestate_loads_eliminate]: 8.29998e-06 [cse]: 5.783e-05 [renormalize]: 6.69999e-07 [remove_dup_value]: 8.31e-05 [tuple_transform]: 0.00020192, [1] [Cycle 1]: 0.00019698, [4] [d_1]: 0.00015654 [none_parameter_eliminate]: 1.96e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 1.944e-05 [partial_unused_args_eliminate]: 2.01003e-06 [add_recomputation]: 0.00015051 [cse_after_recomputation]: 5.828e-05, [1] [Cycle 1]: 5.157e-05, [1] [cse]: 4.478e-05 [environ_conv]: 2.015e-05 [swap_dp_allreduce_reducescatter]: 1.309e-05 [bias_add_comm_swap]: 3.86999e-06 [label_micro_interleaved_index]: 5.71e-06 [label_fine_grained_interleaved_index]: 2.98e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.14e-06 [micro_interleaved_order_control]: 2.70002e-06 [assign_add_opt]: 1.23002e-06 [ForceFp32Comm]: 9.60019e-07 [remove_cast_before_assign_add]: 1.27e-06 [full_micro_interleaved_order_control]: 2.43e-06 [reorder_send_recv_between_fp_bp]: 2.64001e-06 [comm_op_add_attrs]: 1.18001e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 7.13e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94e-06 [control_data_broadcast_order]: 4.051e-05 [grouped_pairwise_exchange_alltoall]: 1.71e-06 [offloading_packed_experts]: 1.039e-05 [overlap_recompute_and_grad_model_parallel]: 1.049e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.47999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.30002e-06 [overlap_grad_ring_attention]: 1.034e-05 [overlap_grad_flash_sp]: 4.785e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.07999e-06 [split_layernorm_comm]: 1.96998e-06 [handle_group_info]: 1.32e-06 [symbol_engine_optimizer]: 0.00016791, [1] [Cycle 1]: 0.00016249, [6] [build]: 1.697e-05 [elim_shapecalc]: 2.378e-05 [elim_not_effective]: 3.742e-05 [opt_reshape]: 2.014e-05 [fold_const_symbol]: 3.134e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.27001e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 6.302e-05 [get_jit_bprop_graph]: 1.98002e-06 [rewriter_after_jit_bprop_graph]: 6.13002e-06 [opt_after_jit_grad]: 0.00063754 [validate]: 0.00011458 [backend_pass]: 1.09e-06 [task_emit]: 0.0395512 [execute]: 9.74e-06 Sums bootstrap : 0.001292s : 0.22% type_inference : 0.330378s : 56.15% event_method : 0.000078s : 0.01% auto_monad : 0.000498s : 0.08% graph_reusing : 0.000020s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000110s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000025s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000154s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.00% optimize.rewriter_before_opt_a : 0.000729s : 0.12% optimize.opt_a.expand_dump_flag : 0.000025s : 0.00% optimize.opt_a.switch_simplify : 0.000580s : 0.10% optimize.opt_a.loop_unroll : 0.000399s : 0.07% optimize.opt_a.a_1 : 0.010065s : 1.71% optimize.opt_a.with_stream_mark : 0.000152s : 0.03% optimize.opt_a.recompute_prepare : 0.000121s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000130s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000084s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000052s : 0.01% optimize.opt_a.parameter_eliminate : 0.000014s : 0.00% optimize.opt_a.a_2 : 0.002083s : 0.35% optimize.opt_a.accelerated_algorithm : 0.000164s : 0.03% optimize.opt_a.shard : 0.000009s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000037s : 0.01% optimize.opt_a.shard_inline : 0.000083s : 0.01% optimize.opt_a.merge_send_recv : 0.000081s : 0.01% optimize.opt_a.auto_parallel : 0.000070s : 0.01% optimize.opt_a.parallel : 0.000070s : 0.01% optimize.opt_a.flash_sp : 0.000025s : 0.00% optimize.opt_a.merge_comm : 0.000048s : 0.01% optimize.opt_a.allreduce_fusion : 0.000073s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000116s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000088s : 0.02% optimize.opt_a.virtual_dataset : 0.000077s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000077s : 0.01% optimize.opt_a.virtual_output : 0.000076s : 0.01% optimize.opt_a.merge_forward : 0.000047s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000010s : 0.00% optimize.opt_a.offload_activation : 0.000125s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000155s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000007s : 0.00% optimize.opt_a.before_grad : 0.000139s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000049s : 0.01% optimize.opt_a.meta_fg_expand : 0.045893s : 7.80% optimize.opt_a.flash_sp_send_recv_attached : 0.000014s : 0.00% optimize.opt_a.receive_attached : 0.000011s : 0.00% optimize.opt_a.after_resolve : 0.000368s : 0.06% optimize.opt_a.a_after_grad : 0.000451s : 0.08% optimize.opt_a.renormalize : 0.143188s : 24.34% optimize.opt_a.add_forward_monad_depend : 0.000153s : 0.03% optimize.opt_a.auto_monad_grad : 0.000064s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000451s : 0.08% optimize.opt_a.cse : 0.001181s : 0.20% optimize.opt_a.a_3 : 0.004271s : 0.73% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000054s : 0.01% optimize.convert_after_rewriter : 0.000021s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000907s : 0.15% optimize.opt_b.b_1 : 0.000460s : 0.08% optimize.opt_b.b_2 : 0.000019s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000021s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000010s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_b.renormalize : 0.000002s : 0.00% optimize.opt_b.cse : 0.000119s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000084s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000043s : 0.01% optimize.loop_unroll : 0.000551s : 0.09% optimize.opt_after_cconv.c_1 : 0.000099s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000013s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.cse : 0.000058s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000083s : 0.01% optimize.tuple_transform.d_1 : 0.000157s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000019s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000151s : 0.03% optimize.cse_after_recomputation.cse : 0.000045s : 0.01% optimize.environ_conv : 0.000020s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000013s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000007s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000041s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000010s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000010s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000010s : 0.00% optimize.overlap_grad_flash_sp : 0.000048s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000017s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000024s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000037s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000020s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000031s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000063s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000638s : 0.11% validate : 0.000115s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.039551s : 6.72% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.005067 750 0.30% : 0.000015s : 1: substitution.arithmetic_simplify 0.51% : 0.000026s : 8: substitution.depend_value_elim 0.12% : 0.000006s : 10: substitution.elim_not_effective 0.21% : 0.000011s : 14: substitution.float_depend_g_call 0.60% : 0.000030s : 23: substitution.float_tuple_getitem_switch 0.09% : 0.000005s : 10: substitution.fold_const_symbol 30.95% : 0.001568s : 9: substitution.getattr_setattr_resolve 0.23% : 0.000011s : 13: substitution.graph_param_transform 0.06% : 0.000003s : 2: substitution.incorporate_call 0.04% : 0.000002s : 2: substitution.incorporate_call_switch 39.89% : 0.002021s : 45: substitution.inline 1.19% : 0.000060s : 8: substitution.inline_without_move 0.48% : 0.000024s : 50: substitution.j_node_and_user_rematch 1.07% : 0.000054s : 5: substitution.less_batch_normalization 1.56% : 0.000079s : 36: substitution.minmaximum_grad 0.59% : 0.000030s : 14: substitution.partial_eliminate 0.68% : 0.000035s : 50: substitution.remove_not_recompute_node 4.47% : 0.000227s : 86: substitution.replace_applicator 0.65% : 0.000033s : 40: substitution.replace_old_param 0.14% : 0.000007s : 2: substitution.set_cell_output_no_recompute 0.47% : 0.000024s : 8: substitution.switch_simplify 3.87% : 0.000196s : 42: substitution.tuple_list_convert_item_index_to_positive 1.12% : 0.000057s : 44: substitution.tuple_list_get_item_const_eliminator 1.84% : 0.000093s : 44: substitution.tuple_list_get_item_depend_reorder 4.26% : 0.000216s : 83: substitution.tuple_list_get_item_eliminator 1.55% : 0.000079s : 44: substitution.tuple_list_get_set_item_eliminator 1.39% : 0.000071s : 24: substitution.updatestate_pure_node_eliminater 1.65% : 0.000084s : 33: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.329075 2 92.74% : 0.305185s : 1: type_inference.infer 7.26% : 0.023889s : 1: type_inference.specialize ------[replace.] 0.001619 99 0.51% : 0.000008s : 1: replace.arithmetic_simplify 7.74% : 0.000125s : 7: replace.getattr_setattr_resolve 36.30% : 0.000588s : 45: replace.inline 14.51% : 0.000235s : 15: replace.replace_applicator 6.42% : 0.000104s : 8: replace.switch_simplify 1.25% : 0.000020s : 2: replace.tuple_list_get_item_depend_reorder 30.45% : 0.000493s : 20: replace.tuple_list_get_item_eliminator 2.83% : 0.000046s : 1: replace.updatestate_useless_node_eliminater ------[match.] 0.003648 99 0.38% : 0.000014s : 1: match.arithmetic_simplify 40.30% : 0.001470s : 7: match.getattr_setattr_resolve 54.60% : 0.001992s : 45: match.inline 2.09% : 0.000076s : 15: match.replace_applicator 0.49% : 0.000018s : 8: match.switch_simplify 0.43% : 0.000016s : 2: match.tuple_list_get_item_depend_reorder 1.34% : 0.000049s : 20: match.tuple_list_get_item_eliminator 0.36% : 0.000013s : 1: match.updatestate_useless_node_eliminater ------[predicate.] 0.002396 16344 0.90% : 0.000022s : 155: predicate.accumulaten_eliminater 0.17% : 0.000004s : 13: predicate.ad_related_special_op_eliminate 0.50% : 0.000012s : 87: predicate.addn_check_dump 0.89% : 0.000021s : 155: predicate.addn_zero_filter 0.83% : 0.000020s : 155: predicate.adjust_all_reduce_mul_add 1.90% : 0.000046s : 237: predicate.arithmetic_simplify 0.87% : 0.000021s : 156: predicate.cast_eliminate 3.07% : 0.000074s : 554: predicate.check_bprop_eliminate 0.49% : 0.000012s : 87: predicate.compare_switch_simplify 0.04% : 0.000001s : 13: predicate.const_output_eliminate 0.50% : 0.000012s : 81: predicate.depend_value_elim 0.94% : 0.000023s : 156: predicate.dict_get_item_const_eliminator 1.02% : 0.000025s : 156: predicate.dict_get_item_eliminator 0.88% : 0.000021s : 156: predicate.dict_set_item_eliminator 0.18% : 0.000004s : 26: predicate.dumpgradient_eliminate 0.05% : 0.000001s : 13: predicate.elim_not_effective 0.11% : 0.000003s : 13: predicate.elim_shapecalc_of_broadcastargs 0.95% : 0.000023s : 169: predicate.environ_add_const_eliminate 0.92% : 0.000022s : 169: predicate.environ_get_add_eliminate 0.91% : 0.000022s : 169: predicate.environ_get_depend_swap 1.42% : 0.000034s : 250: predicate.environ_get_eliminate 0.97% : 0.000023s : 169: predicate.environ_get_set_eliminate 1.28% : 0.000031s : 223: predicate.exchange_switch_depend_value 1.90% : 0.000046s : 223: predicate.float_depend_g_call 0.58% : 0.000014s : 87: predicate.float_environ_get_switch 0.74% : 0.000018s : 100: predicate.float_tuple_getitem_switch 0.04% : 0.000001s : 13: predicate.fold_const_symbol 0.44% : 0.000011s : 65: predicate.get_grad_eliminate 0.56% : 0.000014s : 51: predicate.getattr_setattr_resolve 0.06% : 0.000001s : 13: predicate.graph_param_transform 0.47% : 0.000011s : 81: predicate.incorporate_call 0.43% : 0.000010s : 81: predicate.incorporate_call_switch 4.46% : 0.000107s : 556: predicate.inline 1.76% : 0.000042s : 222: predicate.inline_without_move 0.20% : 0.000005s : 65: predicate.j_node_and_user_rematch 0.60% : 0.000014s : 68: predicate.less_batch_normalization 1.22% : 0.000029s : 204: predicate.list_to_tuple_eliminator_ 2.01% : 0.000048s : 359: predicate.load_eliminater 0.19% : 0.000004s : 13: predicate.loop_unroll_after_grad 2.48% : 0.000059s : 410: predicate.loop_unroll_before_grad 1.04% : 0.000025s : 184: predicate.make_slice_get_slice_eliminator 0.53% : 0.000013s : 87: predicate.merge_addn 2.75% : 0.000066s : 500: predicate.micro_step_allgather_replace 2.82% : 0.000068s : 500: predicate.mini_step_allgather_replace 0.89% : 0.000021s : 156: predicate.minmaximum_grad 0.40% : 0.000010s : 13: predicate.mutable_eliminate 0.09% : 0.000002s : 13: predicate.opt_reshape 0.10% : 0.000002s : 13: predicate.parallel_virtual_node 1.98% : 0.000047s : 223: predicate.partial_defer_inline 1.22% : 0.000029s : 191: predicate.partial_eliminate 0.95% : 0.000023s : 155: predicate.print_const_string_wrapper 0.49% : 0.000012s : 81: predicate.reduce_all_const_elim 1.22% : 0.000029s : 156: predicate.reduce_eliminate 2.09% : 0.000050s : 359: predicate.redundant_stop_gradient_eliminater 0.22% : 0.000005s : 65: predicate.remove_not_recompute_node 2.65% : 0.000063s : 708: predicate.replace_applicator 0.78% : 0.000019s : 222: predicate.replace_old_param 0.06% : 0.000001s : 13: predicate.reset_defer_inline 0.94% : 0.000022s : 156: predicate.reshape_eliminate 3.12% : 0.000075s : 500: predicate.row_tensor_add_zeros_like 0.10% : 0.000002s : 13: predicate.row_tensor_eliminate 3.30% : 0.000079s : 554: predicate.same_eliminate 0.28% : 0.000007s : 77: predicate.set_cell_output_no_recompute 0.44% : 0.000011s : 65: predicate.shard_identity_eliminate 0.18% : 0.000004s : 26: predicate.special_op_eliminate 0.59% : 0.000014s : 87: predicate.specialize_transform 2.93% : 0.000070s : 500: predicate.split_environ_get_set_with_tuple_value 1.57% : 0.000038s : 222: predicate.stack_unstack_eliminate 0.09% : 0.000002s : 13: predicate.switch_call_monad_eliminater 1.42% : 0.000034s : 223: predicate.switch_defer_inline 4.44% : 0.000106s : 777: predicate.switch_layer_defer_inline 4.75% : 0.000114s : 749: predicate.switch_simplify 0.90% : 0.000022s : 156: predicate.tile_eliminate 0.88% : 0.000021s : 156: predicate.transpose_eliminate 1.23% : 0.000029s : 182: predicate.tuple_list_convert_item_index_to_positive 1.28% : 0.000031s : 184: predicate.tuple_list_get_item_const_eliminator 1.15% : 0.000027s : 184: predicate.tuple_list_get_item_depend_reorder 2.31% : 0.000055s : 285: predicate.tuple_list_get_item_eliminator 1.20% : 0.000029s : 184: predicate.tuple_list_get_set_item_eliminator 1.82% : 0.000044s : 265: predicate.tuple_list_set_item_eliminator 1.21% : 0.000029s : 204: predicate.tuple_to_list_eliminator_ 1.95% : 0.000047s : 359: predicate.updatestate_pure_node_eliminater 2.56% : 0.000061s : 442: predicate.updatestate_useless_node_eliminater 0.09% : 0.000002s : 13: predicate.value_based_eliminate 0.43% : 0.000010s : 65: predicate.virtual_dataset_eliminate 0.45% : 0.000011s : 65: predicate.virtual_output_eliminate 0.08% : 0.000002s : 13: predicate.virtual_view_grad_eliminate 0.10% : 0.000002s : 13: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.019331 177 60.50% : 0.011695s : 88: func_graph_cloner_run.FuncGraphClonerGraph 1.45% : 0.000280s : 3: func_graph_cloner_run.FuncGraphClonerNode 38.05% : 0.007356s : 86: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 1.018400 307 0.00% : 0.000004s : 1: ForceFp32Comm 0.68% : 0.006939s : 1: add_attr 0.68% : 0.006890s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000156s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.05% : 0.000511s : 1: auto_monad 0.01% : 0.000068s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.13% : 0.001354s : 1: bootstrap 0.00% : 0.000047s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000044s : 1: control_data_broadcast_order 0.00% : 0.000030s : 1: convert_after_rewriter 0.01% : 0.000061s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000023s : 1: environ_conv 0.01% : 0.000086s : 1: event_method 0.00% : 0.000017s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000025s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.06% : 0.000560s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.09% : 0.000918s : 1: mutable_eliminate 0.00% : 0.000014s : 1: offloading_packed_experts 0.00% : 0.000030s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000053s : 1: opt.transform.mutable_eliminate 1.85% : 0.018882s : 181: opt.transform.opt_a 0.01% : 0.000097s : 1: opt.transform.opt_after_cconv 0.01% : 0.000060s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000440s : 28: opt.transform.opt_b 0.18% : 0.001825s : 4: opt.transform.opt_resolve 0.02% : 0.000173s : 2: opt.transform.opt_trans_graph 0.01% : 0.000108s : 4: opt.transform.symbol_engine_opt 20.89% : 0.212775s : 1: opt_a 0.02% : 0.000238s : 1: opt_after_cconv 0.06% : 0.000647s : 1: opt_after_jit_grad 0.07% : 0.000699s : 1: opt_b 22.92% : 0.233390s : 1: optimize 0.01% : 0.000089s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000005s : 1: order_py_execute_after_rewriter 0.01% : 0.000052s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000014s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000010s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000005s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000161s : 1: pre_auto_parallel 0.00% : 0.000014s : 1: py_interpret_to_execute 0.00% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000088s : 1: remove_dup_value 11.77% : 0.119856s : 3: renormalize.infer 2.29% : 0.023288s : 3: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 1.59% : 0.016153s : 1: rewriter_after_opt_a 0.07% : 0.000741s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000016s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000171s : 1: symbol_engine_optimizer 3.89% : 0.039575s : 1: task_emit 0.02% : 0.000205s : 1: tuple_transform 32.44% : 0.330402s : 1: type_inference 0.02% : 0.000171s : 1: validate TotalTime = 0.729111, [24] [bootstrap]: 0.00106336 [type_inference]: 0.351061 [event_method]: 8.923e-05 [auto_monad]: 0.00055933 [graph_reusing]: 2.09e-05 [inline]: 4.118e-05 [add_attr]: 0.00628938, [1] [add_attr_with_inline]: 0.0062702, [1] [Cycle 1]: 0.00017719, [2] [tag_attr]: 0.00010693 [meta_addattr_fg_expand]: 2.52e-05 [parallel-infer-symbol]: 3.77002e-06 [pre_auto_parallel]: 0.00017323 [insert-virtual-dataset]: 2.98e-06 [parallel-infer-symbol-second]: 1.15999e-06 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.321888, [53] [py_interpret_to_execute]: 7.03e-06 [rewriter_before_opt_a]: 0.0007128 [opt_a]: 0.317676, [4] [Cycle 1]: 0.282772, [45] [expand_dump_flag]: 1.401e-05 [switch_simplify]: 0.00032646 [loop_unroll]: 0.00014209 [a_1]: 0.00394081 [with_stream_mark]: 4.582e-05 [recompute_prepare]: 4.109e-05 [updatestate_depend_eliminate]: 4.666e-05 [updatestate_assign_eliminate]: 4.432e-05 [updatestate_loads_eliminate]: 1.49e-05 [parameter_eliminate]: 3.39001e-06 [a_2]: 0.00041002 [accelerated_algorithm]: 5.436e-05 [shard]: 2.11998e-06 [meta_shard_fg_expand]: 1.377e-05 [shard_inline]: 2.613e-05 [merge_send_recv]: 2.623e-05 [auto_parallel]: 2.129e-05 [parallel]: 3.424e-05 [flash_sp]: 1.636e-05 [merge_comm]: 1.684e-05 [allreduce_fusion]: 0.0161278 [matmul_add_comm_reduction]: 7.834e-05 [allreduce_slice_to_reducescatter]: 9.5999e-07 [virtual_shard_identity]: 5.721e-05 [virtual_dataset]: 2.753e-05 [get_grad_eliminate_]: 2.764e-05 [virtual_output]: 2.459e-05 [merge_forward]: 1.8e-05 [cell_reuse_recompute_pass]: 3.61001e-06 [offload_activation]: 3.398e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.937e-05 [merge_recompute_call_nodes]: 1.96e-06 [before_grad]: 4.594e-05 [set_forward_comm_id_for_comm_node_pass]: 2.011e-05 [meta_fg_expand]: 0.0710821 [flash_sp_send_recv_attached]: 9.37999e-06 [receive_attached]: 3.01001e-06 [after_resolve]: 0.00026666 [a_after_grad]: 0.00036832 [renormalize]: 0.183715 [add_forward_monad_depend]: 0.00013105 [auto_monad_grad]: 6.28e-05 [auto_monad_eliminator]: 0.00040174 [cse]: 0.00079571 [a_3]: 0.00366441 [Cycle 2]: 0.0301086, [45] [expand_dump_flag]: 1.027e-05 [switch_simplify]: 0.00021721 [loop_unroll]: 0.00020368 [a_1]: 0.00495263 [with_stream_mark]: 4.781e-05 [recompute_prepare]: 3.941e-05 [updatestate_depend_eliminate]: 1.706e-05 [updatestate_assign_eliminate]: 1.741e-05 [updatestate_loads_eliminate]: 1.701e-05 [parameter_eliminate]: 5.57001e-06 [a_2]: 0.00072294 [accelerated_algorithm]: 4.536e-05 [shard]: 2.29001e-06 [meta_shard_fg_expand]: 8.53001e-06 [shard_inline]: 2.139e-05 [merge_send_recv]: 2.039e-05 [auto_parallel]: 1.883e-05 [parallel]: 1.019e-05 [flash_sp]: 4.82998e-06 [merge_comm]: 1.233e-05 [allreduce_fusion]: 1.119e-05 [matmul_add_comm_reduction]: 1.98e-05 [allreduce_slice_to_reducescatter]: 1.34998e-06 [virtual_shard_identity]: 3.907e-05 [virtual_dataset]: 2.098e-05 [get_grad_eliminate_]: 1.923e-05 [virtual_output]: 2.02e-05 [merge_forward]: 1.275e-05 [cell_reuse_recompute_pass]: 1.87001e-06 [offload_activation]: 2.456e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.015e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 3.444e-05 [set_forward_comm_id_for_comm_node_pass]: 1.266e-05 [meta_fg_expand]: 0.00098719 [flash_sp_send_recv_attached]: 2.88e-06 [receive_attached]: 2.98e-06 [after_resolve]: 5.469e-05 [a_after_grad]: 3.354e-05 [renormalize]: 0.0215462 [add_forward_monad_depend]: 1.066e-05 [auto_monad_grad]: 2.96001e-06 [auto_monad_eliminator]: 4.685e-05 [cse]: 0.00022778 [a_3]: 0.00015893 [Cycle 3]: 0.00317048, [45] [expand_dump_flag]: 2.94001e-06 [switch_simplify]: 2.211e-05 [loop_unroll]: 1.976e-05 [a_1]: 0.00074554 [with_stream_mark]: 2.688e-05 [recompute_prepare]: 2.171e-05 [updatestate_depend_eliminate]: 4.715e-05 [updatestate_assign_eliminate]: 1.006e-05 [updatestate_loads_eliminate]: 9.06002e-06 [parameter_eliminate]: 2.49001e-06 [a_2]: 0.00025468 [accelerated_algorithm]: 2.369e-05 [shard]: 2.61e-06 [meta_shard_fg_expand]: 5.76e-06 [shard_inline]: 1.628e-05 [merge_send_recv]: 1.651e-05 [auto_parallel]: 1.586e-05 [parallel]: 1.077e-05 [flash_sp]: 1.96003e-06 [merge_comm]: 1.021e-05 [allreduce_fusion]: 9.42999e-06 [matmul_add_comm_reduction]: 1.738e-05 [allreduce_slice_to_reducescatter]: 7.79983e-07 [virtual_shard_identity]: 2.025e-05 [virtual_dataset]: 1.761e-05 [get_grad_eliminate_]: 1.638e-05 [virtual_output]: 1.752e-05 [merge_forward]: 1.121e-05 [cell_reuse_recompute_pass]: 3.14999e-06 [offload_activation]: 2.15e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.202e-05 [merge_recompute_call_nodes]: 1.35999e-06 [before_grad]: 2.843e-05 [set_forward_comm_id_for_comm_node_pass]: 1.047e-05 [meta_fg_expand]: 7.06001e-06 [flash_sp_send_recv_attached]: 2.28002e-06 [receive_attached]: 2.66e-06 [after_resolve]: 2.602e-05 [a_after_grad]: 2.495e-05 [renormalize]: 0.00111203 [add_forward_monad_depend]: 6.28e-06 [auto_monad_grad]: 2.14e-06 [auto_monad_eliminator]: 2.724e-05 [cse]: 0.0001065 [a_3]: 0.00012333 [Cycle 4]: 0.00159386, [45] [expand_dump_flag]: 2.09999e-06 [switch_simplify]: 1.797e-05 [loop_unroll]: 1.738e-05 [a_1]: 0.00047676 [with_stream_mark]: 1.87e-05 [recompute_prepare]: 1.656e-05 [updatestate_depend_eliminate]: 1.063e-05 [updatestate_assign_eliminate]: 8.55001e-06 [updatestate_loads_eliminate]: 9.57001e-06 [parameter_eliminate]: 1.44e-06 [a_2]: 0.00024128 [accelerated_algorithm]: 2.041e-05 [shard]: 1.64e-06 [meta_shard_fg_expand]: 4.57e-06 [shard_inline]: 1.644e-05 [merge_send_recv]: 1.379e-05 [auto_parallel]: 1.386e-05 [parallel]: 8.67e-06 [flash_sp]: 1.37999e-06 [merge_comm]: 9.59e-06 [allreduce_fusion]: 8.99e-06 [matmul_add_comm_reduction]: 1.554e-05 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 1.706e-05 [virtual_dataset]: 1.573e-05 [get_grad_eliminate_]: 1.557e-05 [virtual_output]: 1.608e-05 [merge_forward]: 9.91e-06 [cell_reuse_recompute_pass]: 2.44001e-06 [offload_activation]: 1.972e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.2e-05 [merge_recompute_call_nodes]: 1.66998e-06 [before_grad]: 2.718e-05 [set_forward_comm_id_for_comm_node_pass]: 9.62999e-06 [meta_fg_expand]: 7.45998e-06 [flash_sp_send_recv_attached]: 1.50999e-06 [receive_attached]: 2.14e-06 [after_resolve]: 2.209e-05 [a_after_grad]: 3.992e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.87002e-06 [auto_monad_grad]: 1.88997e-06 [auto_monad_eliminator]: 2.404e-05 [cse]: 5.846e-05 [a_3]: 0.00010943 [py_interpret_to_execute_after_opt_a]: 8.2e-06 [slice_cell_reuse_recomputed_activation]: 2.32999e-06 [rewriter_after_opt_a]: 5.339e-05 [convert_after_rewriter]: 2.04e-06 [order_py_execute_after_rewriter]: 1.17e-06 [mutable_eliminate]: 0.00085351 [opt_b]: 0.00059453, [1] [Cycle 1]: 0.0005861, [7] [b_1]: 0.00042866 [b_2]: 1.972e-05 [updatestate_depend_eliminate]: 1.487e-05 [updatestate_assign_eliminate]: 8.70999e-06 [updatestate_loads_eliminate]: 8.45001e-06 [renormalize]: 3.50003e-07 [cse]: 6.543e-05 [optimize_parallel_all_gather_comm]: 6.853e-05 [overlap_param_gather]: 2.60002e-06 [cconv]: 3.405e-05 [loop_unroll]: 0.00053393 [opt_after_cconv]: 0.00023085, [1] [Cycle 1]: 0.00022374, [7] [c_1]: 9.553e-05 [parameter_eliminate]: 5.22999e-06 [updatestate_depend_eliminate]: 1.185e-05 [updatestate_assign_eliminate]: 8.28001e-06 [updatestate_loads_eliminate]: 8.34002e-06 [cse]: 5.876e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 7.597e-05 [tuple_transform]: 0.00020782, [1] [Cycle 1]: 0.00020223, [4] [d_1]: 0.00015809 [none_parameter_eliminate]: 1.98997e-06 [renormalize]: 3.39991e-07 [switch_simplify]: 1.97e-05 [partial_unused_args_eliminate]: 1.84998e-06 [add_recomputation]: 0.00015332 [cse_after_recomputation]: 5.926e-05, [1] [Cycle 1]: 5.3e-05, [1] [cse]: 4.648e-05 [environ_conv]: 1.75e-05 [swap_dp_allreduce_reducescatter]: 1.307e-05 [bias_add_comm_swap]: 2.76999e-06 [label_micro_interleaved_index]: 6.36e-06 [label_fine_grained_interleaved_index]: 2.66e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 1.97999e-06 [micro_interleaved_order_control]: 2.95002e-06 [assign_add_opt]: 1.40999e-06 [ForceFp32Comm]: 9.5999e-07 [remove_cast_before_assign_add]: 1.28002e-06 [full_micro_interleaved_order_control]: 2.96001e-06 [reorder_send_recv_between_fp_bp]: 3.09999e-06 [comm_op_add_attrs]: 1.09e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.29e-06 [interleave_parallel_branches]: 1.14e-06 [overlap_opt_shard_in_pipeline]: 1.45999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.89e-06 [control_data_broadcast_order]: 2.842e-05 [grouped_pairwise_exchange_alltoall]: 1.51998e-06 [offloading_packed_experts]: 8.69e-06 [overlap_recompute_and_grad_model_parallel]: 1.132e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.55999e-06 [overlap_recompute_comm]: 2.10002e-06 [overlap_grad_ring_attention]: 1.028e-05 [overlap_grad_flash_sp]: 4.835e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 2.34001e-06 [split_layernorm_comm]: 1.55999e-06 [handle_group_info]: 9.50007e-07 [symbol_engine_optimizer]: 0.00016345, [1] [Cycle 1]: 0.00015854, [6] [build]: 1.569e-05 [elim_shapecalc]: 2.209e-05 [elim_not_effective]: 3.627e-05 [opt_reshape]: 1.951e-05 [fold_const_symbol]: 3.228e-05 [renormalize]: 1.69995e-07 [detach_backward]: 2.59999e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 5.502e-05 [get_jit_bprop_graph]: 2.53e-06 [rewriter_after_jit_bprop_graph]: 6.21e-06 [opt_after_jit_grad]: 0.0169918 [validate]: 0.00013133 [backend_pass]: 9.09989e-07 [task_emit]: 0.0303722 [execute]: 1.03e-05 Sums bootstrap : 0.001063s : 0.15% type_inference : 0.351061s : 48.70% event_method : 0.000089s : 0.01% auto_monad : 0.000559s : 0.08% graph_reusing : 0.000021s : 0.00% inline : 0.000041s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000107s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000025s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000173s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.00% optimize.rewriter_before_opt_a : 0.000713s : 0.10% optimize.opt_a.expand_dump_flag : 0.000029s : 0.00% optimize.opt_a.switch_simplify : 0.000584s : 0.08% optimize.opt_a.loop_unroll : 0.000383s : 0.05% optimize.opt_a.a_1 : 0.010116s : 1.40% optimize.opt_a.with_stream_mark : 0.000139s : 0.02% optimize.opt_a.recompute_prepare : 0.000119s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000122s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000080s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000051s : 0.01% optimize.opt_a.parameter_eliminate : 0.000013s : 0.00% optimize.opt_a.a_2 : 0.001629s : 0.23% optimize.opt_a.accelerated_algorithm : 0.000144s : 0.02% optimize.opt_a.shard : 0.000009s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000033s : 0.00% optimize.opt_a.shard_inline : 0.000080s : 0.01% optimize.opt_a.merge_send_recv : 0.000077s : 0.01% optimize.opt_a.auto_parallel : 0.000070s : 0.01% optimize.opt_a.parallel : 0.000064s : 0.01% optimize.opt_a.flash_sp : 0.000025s : 0.00% optimize.opt_a.merge_comm : 0.000049s : 0.01% optimize.opt_a.allreduce_fusion : 0.016157s : 2.24% optimize.opt_a.matmul_add_comm_reduction : 0.000131s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000134s : 0.02% optimize.opt_a.virtual_dataset : 0.000082s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000079s : 0.01% optimize.opt_a.virtual_output : 0.000078s : 0.01% optimize.opt_a.merge_forward : 0.000052s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000011s : 0.00% optimize.opt_a.offload_activation : 0.000100s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000164s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000006s : 0.00% optimize.opt_a.before_grad : 0.000136s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000053s : 0.01% optimize.opt_a.meta_fg_expand : 0.072084s : 10.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000016s : 0.00% optimize.opt_a.receive_attached : 0.000011s : 0.00% optimize.opt_a.after_resolve : 0.000369s : 0.05% optimize.opt_a.a_after_grad : 0.000467s : 0.06% optimize.opt_a.renormalize : 0.206374s : 28.63% optimize.opt_a.add_forward_monad_depend : 0.000151s : 0.02% optimize.opt_a.auto_monad_grad : 0.000070s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000500s : 0.07% optimize.opt_a.cse : 0.001188s : 0.16% optimize.opt_a.a_3 : 0.004056s : 0.56% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000053s : 0.01% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000854s : 0.12% optimize.opt_b.b_1 : 0.000429s : 0.06% optimize.opt_b.b_2 : 0.000020s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000065s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000069s : 0.01% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000034s : 0.00% optimize.loop_unroll : 0.000534s : 0.07% optimize.opt_after_cconv.c_1 : 0.000096s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.cse : 0.000059s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000076s : 0.01% optimize.tuple_transform.d_1 : 0.000158s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000020s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000153s : 0.02% optimize.cse_after_recomputation.cse : 0.000046s : 0.01% optimize.environ_conv : 0.000017s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000013s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000028s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000011s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000010s : 0.00% optimize.overlap_grad_flash_sp : 0.000048s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000016s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000022s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000036s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000020s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000032s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000055s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.016992s : 2.36% validate : 0.000131s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.030372s : 4.21% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.005169 753 0.27% : 0.000014s : 1: substitution.arithmetic_simplify 0.46% : 0.000024s : 8: substitution.depend_value_elim 0.10% : 0.000005s : 10: substitution.elim_not_effective 0.21% : 0.000011s : 14: substitution.float_depend_g_call 0.53% : 0.000027s : 23: substitution.float_tuple_getitem_switch 0.10% : 0.000005s : 10: substitution.fold_const_symbol 33.36% : 0.001724s : 9: substitution.getattr_setattr_resolve 0.22% : 0.000011s : 13: substitution.graph_param_transform 0.08% : 0.000004s : 2: substitution.incorporate_call 0.04% : 0.000002s : 2: substitution.incorporate_call_switch 38.98% : 0.002015s : 48: substitution.inline 1.18% : 0.000061s : 8: substitution.inline_without_move 0.49% : 0.000025s : 50: substitution.j_node_and_user_rematch 0.89% : 0.000046s : 5: substitution.less_batch_normalization 1.81% : 0.000094s : 36: substitution.minmaximum_grad 0.55% : 0.000028s : 14: substitution.partial_eliminate 0.69% : 0.000036s : 50: substitution.remove_not_recompute_node 4.29% : 0.000222s : 86: substitution.replace_applicator 0.64% : 0.000033s : 40: substitution.replace_old_param 0.13% : 0.000007s : 2: substitution.set_cell_output_no_recompute 0.43% : 0.000022s : 8: substitution.switch_simplify 3.60% : 0.000186s : 42: substitution.tuple_list_convert_item_index_to_positive 1.41% : 0.000073s : 44: substitution.tuple_list_get_item_const_eliminator 1.74% : 0.000090s : 44: substitution.tuple_list_get_item_depend_reorder 4.04% : 0.000209s : 83: substitution.tuple_list_get_item_eliminator 1.51% : 0.000078s : 44: substitution.tuple_list_get_set_item_eliminator 0.86% : 0.000044s : 24: substitution.updatestate_pure_node_eliminater 1.38% : 0.000071s : 33: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.349677 2 97.31% : 0.340255s : 1: type_inference.infer 2.69% : 0.009422s : 1: type_inference.specialize ------[replace.] 0.001597 102 0.41% : 0.000007s : 1: replace.arithmetic_simplify 8.64% : 0.000138s : 7: replace.getattr_setattr_resolve 36.72% : 0.000586s : 48: replace.inline 15.17% : 0.000242s : 15: replace.replace_applicator 5.79% : 0.000092s : 8: replace.switch_simplify 1.31% : 0.000021s : 2: replace.tuple_list_get_item_depend_reorder 29.27% : 0.000467s : 20: replace.tuple_list_get_item_eliminator 2.70% : 0.000043s : 1: replace.updatestate_useless_node_eliminater ------[match.] 0.003779 102 0.35% : 0.000013s : 1: match.arithmetic_simplify 42.82% : 0.001618s : 7: match.getattr_setattr_resolve 52.50% : 0.001984s : 48: match.inline 1.85% : 0.000070s : 15: match.replace_applicator 0.45% : 0.000017s : 8: match.switch_simplify 0.40% : 0.000015s : 2: match.tuple_list_get_item_depend_reorder 1.33% : 0.000050s : 20: match.tuple_list_get_item_eliminator 0.30% : 0.000011s : 1: match.updatestate_useless_node_eliminater ------[predicate.] 0.002512 16437 0.95% : 0.000024s : 156: predicate.accumulaten_eliminater 0.40% : 0.000010s : 13: predicate.ad_related_special_op_eliminate 0.46% : 0.000012s : 87: predicate.addn_check_dump 0.86% : 0.000022s : 156: predicate.addn_zero_filter 0.84% : 0.000021s : 156: predicate.adjust_all_reduce_mul_add 1.92% : 0.000048s : 238: predicate.arithmetic_simplify 2.99% : 0.000075s : 157: predicate.cast_eliminate 3.69% : 0.000093s : 554: predicate.check_bprop_eliminate 0.47% : 0.000012s : 87: predicate.compare_switch_simplify 0.04% : 0.000001s : 13: predicate.const_output_eliminate 0.45% : 0.000011s : 81: predicate.depend_value_elim 0.90% : 0.000023s : 157: predicate.dict_get_item_const_eliminator 0.97% : 0.000024s : 157: predicate.dict_get_item_eliminator 0.86% : 0.000022s : 157: predicate.dict_set_item_eliminator 0.20% : 0.000005s : 26: predicate.dumpgradient_eliminate 0.05% : 0.000001s : 13: predicate.elim_not_effective 0.09% : 0.000002s : 13: predicate.elim_shapecalc_of_broadcastargs 0.90% : 0.000023s : 170: predicate.environ_add_const_eliminate 0.89% : 0.000022s : 170: predicate.environ_get_add_eliminate 0.92% : 0.000023s : 170: predicate.environ_get_depend_swap 1.36% : 0.000034s : 251: predicate.environ_get_eliminate 0.91% : 0.000023s : 170: predicate.environ_get_set_eliminate 1.31% : 0.000033s : 227: predicate.exchange_switch_depend_value 1.76% : 0.000044s : 227: predicate.float_depend_g_call 0.49% : 0.000012s : 87: predicate.float_environ_get_switch 0.60% : 0.000015s : 100: predicate.float_tuple_getitem_switch 0.04% : 0.000001s : 13: predicate.fold_const_symbol 0.42% : 0.000011s : 65: predicate.get_grad_eliminate 0.62% : 0.000016s : 51: predicate.getattr_setattr_resolve 0.04% : 0.000001s : 13: predicate.graph_param_transform 0.44% : 0.000011s : 81: predicate.incorporate_call 0.42% : 0.000011s : 81: predicate.incorporate_call_switch 4.07% : 0.000102s : 561: predicate.inline 1.78% : 0.000045s : 222: predicate.inline_without_move 0.19% : 0.000005s : 65: predicate.j_node_and_user_rematch 0.49% : 0.000012s : 68: predicate.less_batch_normalization 1.27% : 0.000032s : 205: predicate.list_to_tuple_eliminator_ 1.96% : 0.000049s : 361: predicate.load_eliminater 0.18% : 0.000005s : 13: predicate.loop_unroll_after_grad 2.43% : 0.000061s : 423: predicate.loop_unroll_before_grad 1.09% : 0.000027s : 185: predicate.make_slice_get_slice_eliminator 0.51% : 0.000013s : 87: predicate.merge_addn 2.62% : 0.000066s : 500: predicate.micro_step_allgather_replace 2.66% : 0.000067s : 500: predicate.mini_step_allgather_replace 0.88% : 0.000022s : 157: predicate.minmaximum_grad 0.28% : 0.000007s : 13: predicate.mutable_eliminate 0.09% : 0.000002s : 13: predicate.opt_reshape 0.10% : 0.000002s : 13: predicate.parallel_virtual_node 1.97% : 0.000049s : 227: predicate.partial_defer_inline 1.18% : 0.000030s : 192: predicate.partial_eliminate 0.88% : 0.000022s : 156: predicate.print_const_string_wrapper 0.45% : 0.000011s : 81: predicate.reduce_all_const_elim 1.10% : 0.000028s : 157: predicate.reduce_eliminate 1.94% : 0.000049s : 361: predicate.redundant_stop_gradient_eliminater 0.22% : 0.000005s : 65: predicate.remove_not_recompute_node 2.45% : 0.000062s : 709: predicate.replace_applicator 1.51% : 0.000038s : 222: predicate.replace_old_param 0.04% : 0.000001s : 13: predicate.reset_defer_inline 0.88% : 0.000022s : 157: predicate.reshape_eliminate 2.88% : 0.000072s : 500: predicate.row_tensor_add_zeros_like 0.09% : 0.000002s : 13: predicate.row_tensor_eliminate 3.19% : 0.000080s : 554: predicate.same_eliminate 0.25% : 0.000006s : 77: predicate.set_cell_output_no_recompute 0.50% : 0.000012s : 65: predicate.shard_identity_eliminate 0.17% : 0.000004s : 26: predicate.special_op_eliminate 0.55% : 0.000014s : 87: predicate.specialize_transform 2.79% : 0.000070s : 500: predicate.split_environ_get_set_with_tuple_value 1.43% : 0.000036s : 222: predicate.stack_unstack_eliminate 0.08% : 0.000002s : 13: predicate.switch_call_monad_eliminater 1.39% : 0.000035s : 227: predicate.switch_defer_inline 4.33% : 0.000109s : 781: predicate.switch_layer_defer_inline 4.66% : 0.000117s : 766: predicate.switch_simplify 0.88% : 0.000022s : 157: predicate.tile_eliminate 0.88% : 0.000022s : 157: predicate.transpose_eliminate 1.24% : 0.000031s : 183: predicate.tuple_list_convert_item_index_to_positive 1.27% : 0.000032s : 185: predicate.tuple_list_get_item_const_eliminator 1.11% : 0.000028s : 185: predicate.tuple_list_get_item_depend_reorder 2.24% : 0.000056s : 286: predicate.tuple_list_get_item_eliminator 1.15% : 0.000029s : 185: predicate.tuple_list_get_set_item_eliminator 1.73% : 0.000043s : 266: predicate.tuple_list_set_item_eliminator 1.18% : 0.000030s : 205: predicate.tuple_to_list_eliminator_ 1.97% : 0.000050s : 361: predicate.updatestate_pure_node_eliminater 2.48% : 0.000062s : 444: predicate.updatestate_useless_node_eliminater 0.08% : 0.000002s : 13: predicate.value_based_eliminate 0.42% : 0.000011s : 65: predicate.virtual_dataset_eliminate 0.41% : 0.000010s : 65: predicate.virtual_output_eliminate 0.07% : 0.000002s : 13: predicate.virtual_view_grad_eliminate 0.09% : 0.000002s : 13: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.017679 179 60.90% : 0.010767s : 87: func_graph_cloner_run.FuncGraphClonerGraph 1.47% : 0.000260s : 3: func_graph_cloner_run.FuncGraphClonerNode 37.63% : 0.006653s : 89: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 1.284757 307 0.00% : 0.000004s : 1: ForceFp32Comm 0.49% : 0.006296s : 1: add_attr 0.49% : 0.006277s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.01% : 0.000159s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.04% : 0.000574s : 1: auto_monad 0.00% : 0.000059s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.09% : 0.001112s : 1: bootstrap 0.00% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000032s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000062s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000021s : 1: environ_conv 0.01% : 0.000098s : 1: event_method 0.00% : 0.000017s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000027s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000046s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.04% : 0.000544s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.07% : 0.000866s : 1: mutable_eliminate 0.00% : 0.000012s : 1: offloading_packed_experts 0.00% : 0.000029s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000036s : 1: opt.transform.mutable_eliminate 1.43% : 0.018330s : 181: opt.transform.opt_a 0.01% : 0.000094s : 1: opt.transform.opt_after_cconv 0.01% : 0.000083s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000414s : 28: opt.transform.opt_b 0.16% : 0.002001s : 4: opt.transform.opt_resolve 0.01% : 0.000175s : 2: opt.transform.opt_trans_graph 0.01% : 0.000106s : 4: opt.transform.symbol_engine_opt 24.73% : 0.317680s : 1: opt_a 0.02% : 0.000234s : 1: opt_after_cconv 1.32% : 0.017011s : 1: opt_after_jit_grad 0.05% : 0.000598s : 1: opt_b 25.05% : 0.321894s : 1: optimize 0.01% : 0.000074s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000052s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000014s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000182s : 1: pre_auto_parallel 0.00% : 0.000012s : 1: py_interpret_to_execute 0.00% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000081s : 1: remove_dup_value 13.65% : 0.175385s : 3: renormalize.infer 2.41% : 0.030943s : 3: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000057s : 1: rewriter_after_opt_a 0.06% : 0.000723s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000016s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000166s : 1: symbol_engine_optimizer 2.37% : 0.030393s : 1: task_emit 0.02% : 0.000211s : 1: tuple_transform 27.33% : 0.351094s : 1: type_inference 0.02% : 0.000193s : 1: validate TotalTime = 0.653685, [24] [bootstrap]: 0.00079053 [type_inference]: 0.258927 [event_method]: 9.478e-05 [auto_monad]: 0.00054638 [graph_reusing]: 2.185e-05 [inline]: 4.03001e-06 [add_attr]: 0.00632926, [1] [add_attr_with_inline]: 0.00630953, [1] [Cycle 1]: 0.00023107, [2] [tag_attr]: 0.00011949 [meta_addattr_fg_expand]: 2.778e-05 [parallel-infer-symbol]: 4.30999e-06 [pre_auto_parallel]: 0.00019874 [insert-virtual-dataset]: 3.4e-06 [parallel-infer-symbol-second]: 1.30001e-06 [dataset_repeat_opt]: 2.83998e-06 [pipeline_split]: 1.87001e-06 [optimize]: 0.352104, [53] [py_interpret_to_execute]: 8.1e-06 [rewriter_before_opt_a]: 0.00074236 [opt_a]: 0.346565, [4] [Cycle 1]: 0.29486, [45] [expand_dump_flag]: 1.454e-05 [switch_simplify]: 0.00034816 [loop_unroll]: 0.00014635 [a_1]: 0.00495588 [with_stream_mark]: 5.187e-05 [recompute_prepare]: 4.723e-05 [updatestate_depend_eliminate]: 5.35e-05 [updatestate_assign_eliminate]: 1.778e-05 [updatestate_loads_eliminate]: 1.399e-05 [parameter_eliminate]: 4.23999e-06 [a_2]: 0.00042231 [accelerated_algorithm]: 6.146e-05 [shard]: 2.70997e-06 [meta_shard_fg_expand]: 1.321e-05 [shard_inline]: 2.868e-05 [merge_send_recv]: 3.232e-05 [auto_parallel]: 2.246e-05 [parallel]: 0.00011641 [flash_sp]: 1.736e-05 [merge_comm]: 1.902e-05 [allreduce_fusion]: 1.537e-05 [matmul_add_comm_reduction]: 7.042e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 3.589e-05 [virtual_dataset]: 2.557e-05 [get_grad_eliminate_]: 2.627e-05 [virtual_output]: 2.524e-05 [merge_forward]: 1.693e-05 [cell_reuse_recompute_pass]: 3.38999e-06 [offload_activation]: 3.098e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.395e-05 [merge_recompute_call_nodes]: 1.86998e-06 [before_grad]: 4.805e-05 [set_forward_comm_id_for_comm_node_pass]: 3.223e-05 [meta_fg_expand]: 0.117792 [flash_sp_send_recv_attached]: 1.075e-05 [receive_attached]: 2.87002e-06 [after_resolve]: 0.00025521 [a_after_grad]: 0.00036211 [renormalize]: 0.163542 [add_forward_monad_depend]: 9.986e-05 [auto_monad_grad]: 5.229e-05 [auto_monad_eliminator]: 0.00031896 [cse]: 0.0007427 [a_3]: 0.00430381 [Cycle 2]: 0.0469452, [45] [expand_dump_flag]: 3.21e-05 [switch_simplify]: 0.00026955 [loop_unroll]: 0.00022949 [a_1]: 0.0063074 [with_stream_mark]: 5.34e-05 [recompute_prepare]: 4.102e-05 [updatestate_depend_eliminate]: 1.866e-05 [updatestate_assign_eliminate]: 1.722e-05 [updatestate_loads_eliminate]: 1.751e-05 [parameter_eliminate]: 5.53002e-06 [a_2]: 0.00072291 [accelerated_algorithm]: 4.732e-05 [shard]: 2.69001e-06 [meta_shard_fg_expand]: 1.164e-05 [shard_inline]: 2.082e-05 [merge_send_recv]: 1.949e-05 [auto_parallel]: 1.911e-05 [parallel]: 0.0316089 [flash_sp]: 1.279e-05 [merge_comm]: 5.035e-05 [allreduce_fusion]: 1.214e-05 [matmul_add_comm_reduction]: 2.547e-05 [allreduce_slice_to_reducescatter]: 1.20001e-06 [virtual_shard_identity]: 5.628e-05 [virtual_dataset]: 2.408e-05 [get_grad_eliminate_]: 2.087e-05 [virtual_output]: 1.946e-05 [merge_forward]: 1.452e-05 [cell_reuse_recompute_pass]: 3.72998e-06 [offload_activation]: 2.588e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.17e-05 [merge_recompute_call_nodes]: 2.08002e-06 [before_grad]: 3.584e-05 [set_forward_comm_id_for_comm_node_pass]: 1.235e-05 [meta_fg_expand]: 0.00096437 [flash_sp_send_recv_attached]: 4.03001e-06 [receive_attached]: 3.76001e-06 [after_resolve]: 5.24e-05 [a_after_grad]: 3.348e-05 [renormalize]: 0.00514604 [add_forward_monad_depend]: 7.97e-06 [auto_monad_grad]: 3.23998e-06 [auto_monad_eliminator]: 4.628e-05 [cse]: 0.00024266 [a_3]: 0.00016365 [Cycle 3]: 0.00315569, [45] [expand_dump_flag]: 3.33e-06 [switch_simplify]: 2.365e-05 [loop_unroll]: 1.996e-05 [a_1]: 0.00070854 [with_stream_mark]: 2.999e-05 [recompute_prepare]: 2.174e-05 [updatestate_depend_eliminate]: 4.877e-05 [updatestate_assign_eliminate]: 1.084e-05 [updatestate_loads_eliminate]: 9.52999e-06 [parameter_eliminate]: 2.96999e-06 [a_2]: 0.00027355 [accelerated_algorithm]: 2.526e-05 [shard]: 2.33002e-06 [meta_shard_fg_expand]: 5.09e-06 [shard_inline]: 1.597e-05 [merge_send_recv]: 1.749e-05 [auto_parallel]: 1.639e-05 [parallel]: 1.115e-05 [flash_sp]: 2.32999e-06 [merge_comm]: 9.69e-06 [allreduce_fusion]: 9.29998e-06 [matmul_add_comm_reduction]: 1.73e-05 [allreduce_slice_to_reducescatter]: 1.04e-06 [virtual_shard_identity]: 1.935e-05 [virtual_dataset]: 1.623e-05 [get_grad_eliminate_]: 1.585e-05 [virtual_output]: 1.643e-05 [merge_forward]: 1.054e-05 [cell_reuse_recompute_pass]: 3.83999e-06 [offload_activation]: 1.976e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.162e-05 [merge_recompute_call_nodes]: 1.79e-06 [before_grad]: 2.786e-05 [set_forward_comm_id_for_comm_node_pass]: 9.52999e-06 [meta_fg_expand]: 7.36001e-06 [flash_sp_send_recv_attached]: 2.02999e-06 [receive_attached]: 2.50002e-06 [after_resolve]: 2.459e-05 [a_after_grad]: 2.568e-05 [renormalize]: 0.00109539 [add_forward_monad_depend]: 7.71999e-06 [auto_monad_grad]: 2.69001e-06 [auto_monad_eliminator]: 2.893e-05 [cse]: 0.00011302 [a_3]: 0.00012606 [Cycle 4]: 0.00157063, [45] [expand_dump_flag]: 2.41e-06 [switch_simplify]: 1.839e-05 [loop_unroll]: 1.698e-05 [a_1]: 0.00048345 [with_stream_mark]: 1.825e-05 [recompute_prepare]: 1.604e-05 [updatestate_depend_eliminate]: 9.97999e-06 [updatestate_assign_eliminate]: 9.27999e-06 [updatestate_loads_eliminate]: 9.56e-06 [parameter_eliminate]: 1.57001e-06 [a_2]: 0.00024335 [accelerated_algorithm]: 2.212e-05 [shard]: 1.43002e-06 [meta_shard_fg_expand]: 4.13001e-06 [shard_inline]: 1.591e-05 [merge_send_recv]: 1.339e-05 [auto_parallel]: 1.346e-05 [parallel]: 9.62001e-06 [flash_sp]: 1.86e-06 [merge_comm]: 9.34998e-06 [allreduce_fusion]: 9.29998e-06 [matmul_add_comm_reduction]: 1.531e-05 [allreduce_slice_to_reducescatter]: 7.80012e-07 [virtual_shard_identity]: 1.752e-05 [virtual_dataset]: 1.623e-05 [get_grad_eliminate_]: 1.584e-05 [virtual_output]: 1.546e-05 [merge_forward]: 8.67998e-06 [cell_reuse_recompute_pass]: 2.27999e-06 [offload_activation]: 1.911e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.16e-05 [merge_recompute_call_nodes]: 9.99979e-07 [before_grad]: 2.633e-05 [set_forward_comm_id_for_comm_node_pass]: 9.47999e-06 [meta_fg_expand]: 6.88998e-06 [flash_sp_send_recv_attached]: 2.50002e-06 [receive_attached]: 2.04e-06 [after_resolve]: 2.271e-05 [a_after_grad]: 2.451e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.64998e-06 [auto_monad_grad]: 1.50999e-06 [auto_monad_eliminator]: 2.185e-05 [cse]: 5.621e-05 [a_3]: 0.00010842 [py_interpret_to_execute_after_opt_a]: 8.96998e-06 [slice_cell_reuse_recomputed_activation]: 2.21998e-06 [rewriter_after_opt_a]: 5.682e-05 [convert_after_rewriter]: 2.99001e-06 [order_py_execute_after_rewriter]: 1.23002e-06 [mutable_eliminate]: 0.00203549 [opt_b]: 0.00062501, [1] [Cycle 1]: 0.00061584, [7] [b_1]: 0.00044034 [b_2]: 2.023e-05 [updatestate_depend_eliminate]: 1.539e-05 [updatestate_assign_eliminate]: 9.60001e-06 [updatestate_loads_eliminate]: 8.95001e-06 [renormalize]: 7.99977e-07 [cse]: 7.884e-05 [optimize_parallel_all_gather_comm]: 4.123e-05 [overlap_param_gather]: 2.29999e-06 [cconv]: 3.144e-05 [loop_unroll]: 0.00059314 [opt_after_cconv]: 0.00023266, [1] [Cycle 1]: 0.00022638, [7] [c_1]: 9.587e-05 [parameter_eliminate]: 4.75999e-06 [updatestate_depend_eliminate]: 1.256e-05 [updatestate_assign_eliminate]: 8.42998e-06 [updatestate_loads_eliminate]: 8.60999e-06 [cse]: 6.056e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 7.944e-05 [tuple_transform]: 0.00020579, [1] [Cycle 1]: 0.0002004, [4] [d_1]: 0.00015909 [none_parameter_eliminate]: 2.83e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 1.857e-05 [partial_unused_args_eliminate]: 1.77001e-06 [add_recomputation]: 0.00014951 [cse_after_recomputation]: 6.327e-05, [1] [Cycle 1]: 5.647e-05, [1] [cse]: 4.911e-05 [environ_conv]: 1.636e-05 [swap_dp_allreduce_reducescatter]: 1.367e-05 [bias_add_comm_swap]: 3.78999e-06 [label_micro_interleaved_index]: 6.96999e-06 [label_fine_grained_interleaved_index]: 3.46001e-06 [merge_cast_opt]: 1.35001e-06 [slice_recompute_activation]: 2.27999e-06 [micro_interleaved_order_control]: 2.69999e-06 [assign_add_opt]: 1.40001e-06 [ForceFp32Comm]: 1.12999e-06 [remove_cast_before_assign_add]: 1.43002e-06 [full_micro_interleaved_order_control]: 2.39001e-06 [reorder_send_recv_between_fp_bp]: 2.68998e-06 [comm_op_add_attrs]: 1.65001e-06 [add_comm_op_reuse_tag]: 1.27999e-06 [interleave_split_concat_branches]: 1.16002e-06 [interleave_parallel_branches]: 1.25001e-06 [overlap_opt_shard_in_pipeline]: 2.918e-05 [overlap_opt_shard_grad_in_pipeline]: 2.01003e-06 [control_data_broadcast_order]: 3.097e-05 [grouped_pairwise_exchange_alltoall]: 1.73002e-06 [offloading_packed_experts]: 2.519e-05 [overlap_recompute_and_grad_model_parallel]: 9.83002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.60001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.77999e-06 [overlap_recompute_comm]: 2.37999e-06 [overlap_grad_ring_attention]: 7.84002e-06 [overlap_grad_flash_sp]: 4.129e-05 [begin_end_overlap_inline]: 7.79983e-07 [split_matmul_comm_elemetwise]: 2.46998e-06 [split_layernorm_comm]: 1.82999e-06 [handle_group_info]: 1.25001e-06 [symbol_engine_optimizer]: 0.00015613, [1] [Cycle 1]: 0.00015081, [6] [build]: 1.582e-05 [elim_shapecalc]: 2.381e-05 [elim_not_effective]: 3.37e-05 [opt_reshape]: 1.776e-05 [fold_const_symbol]: 2.733e-05 [renormalize]: 1.69995e-07 [detach_backward]: 2.59999e-06 [pipeline_parallel_scheduler]: 1.80001e-06 [auto_monad_reorder]: 4.77e-05 [get_jit_bprop_graph]: 1.96998e-06 [rewriter_after_jit_bprop_graph]: 5.18002e-06 [opt_after_jit_grad]: 0.00061709 [validate]: 0.00014082 [backend_pass]: 1.30999e-06 [task_emit]: 0.0334789 [execute]: 8.83001e-06 Sums bootstrap : 0.000791s : 0.12% type_inference : 0.258927s : 40.14% event_method : 0.000095s : 0.01% auto_monad : 0.000546s : 0.08% graph_reusing : 0.000022s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000119s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000028s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000199s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.00% optimize.rewriter_before_opt_a : 0.000742s : 0.12% optimize.opt_a.expand_dump_flag : 0.000052s : 0.01% optimize.opt_a.switch_simplify : 0.000660s : 0.10% optimize.opt_a.loop_unroll : 0.000413s : 0.06% optimize.opt_a.a_1 : 0.012455s : 1.93% optimize.opt_a.with_stream_mark : 0.000154s : 0.02% optimize.opt_a.recompute_prepare : 0.000126s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000131s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000055s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000051s : 0.01% optimize.opt_a.parameter_eliminate : 0.000014s : 0.00% optimize.opt_a.a_2 : 0.001662s : 0.26% optimize.opt_a.accelerated_algorithm : 0.000156s : 0.02% optimize.opt_a.shard : 0.000009s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000034s : 0.01% optimize.opt_a.shard_inline : 0.000081s : 0.01% optimize.opt_a.merge_send_recv : 0.000083s : 0.01% optimize.opt_a.auto_parallel : 0.000071s : 0.01% optimize.opt_a.parallel : 0.031746s : 4.92% optimize.opt_a.flash_sp : 0.000034s : 0.01% optimize.opt_a.merge_comm : 0.000088s : 0.01% optimize.opt_a.allreduce_fusion : 0.000046s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000128s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000129s : 0.02% optimize.opt_a.virtual_dataset : 0.000082s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000079s : 0.01% optimize.opt_a.virtual_output : 0.000077s : 0.01% optimize.opt_a.merge_forward : 0.000051s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000013s : 0.00% optimize.opt_a.offload_activation : 0.000096s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000169s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000007s : 0.00% optimize.opt_a.before_grad : 0.000138s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000064s : 0.01% optimize.opt_a.meta_fg_expand : 0.118770s : 18.41% optimize.opt_a.flash_sp_send_recv_attached : 0.000019s : 0.00% optimize.opt_a.receive_attached : 0.000011s : 0.00% optimize.opt_a.after_resolve : 0.000355s : 0.06% optimize.opt_a.a_after_grad : 0.000446s : 0.07% optimize.opt_a.renormalize : 0.169783s : 26.32% optimize.opt_a.add_forward_monad_depend : 0.000117s : 0.02% optimize.opt_a.auto_monad_grad : 0.000060s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000416s : 0.06% optimize.opt_a.cse : 0.001155s : 0.18% optimize.opt_a.a_3 : 0.004702s : 0.73% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000057s : 0.01% optimize.convert_after_rewriter : 0.000003s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.002035s : 0.32% optimize.opt_b.b_1 : 0.000440s : 0.07% optimize.opt_b.b_2 : 0.000020s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000010s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000079s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000041s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000031s : 0.00% optimize.loop_unroll : 0.000593s : 0.09% optimize.opt_after_cconv.c_1 : 0.000096s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000013s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.cse : 0.000061s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000079s : 0.01% optimize.tuple_transform.d_1 : 0.000159s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000019s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000150s : 0.02% optimize.cse_after_recomputation.cse : 0.000049s : 0.01% optimize.environ_conv : 0.000016s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000014s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000002s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000029s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000031s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000025s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000010s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.00% optimize.overlap_grad_flash_sp : 0.000041s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000016s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000024s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000034s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000018s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000027s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000048s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000617s : 0.10% validate : 0.000141s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.033479s : 5.19% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.006975 753 0.23% : 0.000016s : 1: substitution.arithmetic_simplify 0.40% : 0.000028s : 8: substitution.depend_value_elim 0.08% : 0.000006s : 10: substitution.elim_not_effective 0.21% : 0.000014s : 14: substitution.float_depend_g_call 0.43% : 0.000030s : 23: substitution.float_tuple_getitem_switch 0.05% : 0.000004s : 10: substitution.fold_const_symbol 26.61% : 0.001856s : 9: substitution.getattr_setattr_resolve 0.17% : 0.000012s : 13: substitution.graph_param_transform 0.06% : 0.000004s : 2: substitution.incorporate_call 0.03% : 0.000002s : 2: substitution.incorporate_call_switch 50.09% : 0.003494s : 48: substitution.inline 0.89% : 0.000062s : 8: substitution.inline_without_move 0.38% : 0.000026s : 50: substitution.j_node_and_user_rematch 0.74% : 0.000051s : 5: substitution.less_batch_normalization 2.71% : 0.000189s : 36: substitution.minmaximum_grad 0.50% : 0.000035s : 14: substitution.partial_eliminate 0.53% : 0.000037s : 50: substitution.remove_not_recompute_node 3.31% : 0.000231s : 86: substitution.replace_applicator 0.51% : 0.000035s : 40: substitution.replace_old_param 0.11% : 0.000008s : 2: substitution.set_cell_output_no_recompute 0.36% : 0.000025s : 8: substitution.switch_simplify 2.84% : 0.000198s : 42: substitution.tuple_list_convert_item_index_to_positive 0.83% : 0.000058s : 44: substitution.tuple_list_get_item_const_eliminator 1.42% : 0.000099s : 44: substitution.tuple_list_get_item_depend_reorder 3.51% : 0.000245s : 83: substitution.tuple_list_get_item_eliminator 1.17% : 0.000081s : 44: substitution.tuple_list_get_set_item_eliminator 0.61% : 0.000043s : 24: substitution.updatestate_pure_node_eliminater 1.26% : 0.000088s : 33: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.244098 2 96.50% : 0.235549s : 1: type_inference.infer 3.50% : 0.008549s : 1: type_inference.specialize ------[replace.] 0.002117 102 0.34% : 0.000007s : 1: replace.arithmetic_simplify 6.48% : 0.000137s : 7: replace.getattr_setattr_resolve 38.86% : 0.000823s : 48: replace.inline 10.59% : 0.000224s : 15: replace.replace_applicator 5.18% : 0.000110s : 8: replace.switch_simplify 0.84% : 0.000018s : 2: replace.tuple_list_get_item_depend_reorder 35.51% : 0.000752s : 20: replace.tuple_list_get_item_eliminator 2.19% : 0.000046s : 1: replace.updatestate_useless_node_eliminater ------[match.] 0.005372 102 0.28% : 0.000015s : 1: match.arithmetic_simplify 31.85% : 0.001711s : 7: match.getattr_setattr_resolve 64.34% : 0.003456s : 48: match.inline 1.47% : 0.000079s : 15: match.replace_applicator 0.36% : 0.000019s : 8: match.switch_simplify 0.37% : 0.000020s : 2: match.tuple_list_get_item_depend_reorder 1.09% : 0.000059s : 20: match.tuple_list_get_item_eliminator 0.23% : 0.000012s : 1: match.updatestate_useless_node_eliminater ------[predicate.] 0.002668 16437 0.84% : 0.000022s : 156: predicate.accumulaten_eliminater 0.16% : 0.000004s : 13: predicate.ad_related_special_op_eliminate 0.46% : 0.000012s : 87: predicate.addn_check_dump 0.82% : 0.000022s : 156: predicate.addn_zero_filter 0.77% : 0.000020s : 156: predicate.adjust_all_reduce_mul_add 1.85% : 0.000049s : 238: predicate.arithmetic_simplify 0.84% : 0.000022s : 157: predicate.cast_eliminate 2.77% : 0.000074s : 554: predicate.check_bprop_eliminate 0.44% : 0.000012s : 87: predicate.compare_switch_simplify 0.04% : 0.000001s : 13: predicate.const_output_eliminate 0.44% : 0.000012s : 81: predicate.depend_value_elim 0.86% : 0.000023s : 157: predicate.dict_get_item_const_eliminator 0.95% : 0.000025s : 157: predicate.dict_get_item_eliminator 0.93% : 0.000025s : 157: predicate.dict_set_item_eliminator 0.19% : 0.000005s : 26: predicate.dumpgradient_eliminate 0.05% : 0.000001s : 13: predicate.elim_not_effective 0.09% : 0.000002s : 13: predicate.elim_shapecalc_of_broadcastargs 0.91% : 0.000024s : 170: predicate.environ_add_const_eliminate 0.87% : 0.000023s : 170: predicate.environ_get_add_eliminate 0.88% : 0.000024s : 170: predicate.environ_get_depend_swap 1.35% : 0.000036s : 251: predicate.environ_get_eliminate 0.94% : 0.000025s : 170: predicate.environ_get_set_eliminate 1.21% : 0.000032s : 227: predicate.exchange_switch_depend_value 1.74% : 0.000046s : 227: predicate.float_depend_g_call 0.45% : 0.000012s : 87: predicate.float_environ_get_switch 0.57% : 0.000015s : 100: predicate.float_tuple_getitem_switch 0.04% : 0.000001s : 13: predicate.fold_const_symbol 0.41% : 0.000011s : 65: predicate.get_grad_eliminate 0.58% : 0.000016s : 51: predicate.getattr_setattr_resolve 0.05% : 0.000001s : 13: predicate.graph_param_transform 0.42% : 0.000011s : 81: predicate.incorporate_call 0.39% : 0.000010s : 81: predicate.incorporate_call_switch 4.09% : 0.000109s : 561: predicate.inline 1.57% : 0.000042s : 222: predicate.inline_without_move 0.17% : 0.000004s : 65: predicate.j_node_and_user_rematch 0.51% : 0.000014s : 68: predicate.less_batch_normalization 1.27% : 0.000034s : 205: predicate.list_to_tuple_eliminator_ 1.95% : 0.000052s : 361: predicate.load_eliminater 0.16% : 0.000004s : 13: predicate.loop_unroll_after_grad 2.56% : 0.000068s : 423: predicate.loop_unroll_before_grad 1.05% : 0.000028s : 185: predicate.make_slice_get_slice_eliminator 0.48% : 0.000013s : 87: predicate.merge_addn 7.24% : 0.000193s : 500: predicate.micro_step_allgather_replace 2.53% : 0.000068s : 500: predicate.mini_step_allgather_replace 0.82% : 0.000022s : 157: predicate.minmaximum_grad 0.28% : 0.000008s : 13: predicate.mutable_eliminate 0.08% : 0.000002s : 13: predicate.opt_reshape 0.09% : 0.000002s : 13: predicate.parallel_virtual_node 2.22% : 0.000059s : 227: predicate.partial_defer_inline 1.10% : 0.000029s : 192: predicate.partial_eliminate 0.86% : 0.000023s : 156: predicate.print_const_string_wrapper 0.44% : 0.000012s : 81: predicate.reduce_all_const_elim 1.07% : 0.000029s : 157: predicate.reduce_eliminate 1.88% : 0.000050s : 361: predicate.redundant_stop_gradient_eliminater 0.22% : 0.000006s : 65: predicate.remove_not_recompute_node 2.32% : 0.000062s : 709: predicate.replace_applicator 0.68% : 0.000018s : 222: predicate.replace_old_param 0.04% : 0.000001s : 13: predicate.reset_defer_inline 0.92% : 0.000025s : 157: predicate.reshape_eliminate 2.74% : 0.000073s : 500: predicate.row_tensor_add_zeros_like 0.08% : 0.000002s : 13: predicate.row_tensor_eliminate 3.03% : 0.000081s : 554: predicate.same_eliminate 0.24% : 0.000006s : 77: predicate.set_cell_output_no_recompute 0.47% : 0.000013s : 65: predicate.shard_identity_eliminate 0.16% : 0.000004s : 26: predicate.special_op_eliminate 0.53% : 0.000014s : 87: predicate.specialize_transform 4.66% : 0.000124s : 500: predicate.split_environ_get_set_with_tuple_value 1.34% : 0.000036s : 222: predicate.stack_unstack_eliminate 0.08% : 0.000002s : 13: predicate.switch_call_monad_eliminater 1.34% : 0.000036s : 227: predicate.switch_defer_inline 4.14% : 0.000110s : 781: predicate.switch_layer_defer_inline 4.81% : 0.000128s : 766: predicate.switch_simplify 0.95% : 0.000025s : 157: predicate.tile_eliminate 0.81% : 0.000022s : 157: predicate.transpose_eliminate 1.11% : 0.000030s : 183: predicate.tuple_list_convert_item_index_to_positive 1.13% : 0.000030s : 185: predicate.tuple_list_get_item_const_eliminator 1.10% : 0.000029s : 185: predicate.tuple_list_get_item_depend_reorder 2.21% : 0.000059s : 286: predicate.tuple_list_get_item_eliminator 1.16% : 0.000031s : 185: predicate.tuple_list_get_set_item_eliminator 1.67% : 0.000044s : 266: predicate.tuple_list_set_item_eliminator 1.20% : 0.000032s : 205: predicate.tuple_to_list_eliminator_ 1.81% : 0.000048s : 361: predicate.updatestate_pure_node_eliminater 2.32% : 0.000062s : 444: predicate.updatestate_useless_node_eliminater 0.07% : 0.000002s : 13: predicate.value_based_eliminate 0.39% : 0.000010s : 65: predicate.virtual_dataset_eliminate 0.38% : 0.000010s : 65: predicate.virtual_output_eliminate 0.06% : 0.000002s : 13: predicate.virtual_view_grad_eliminate 0.09% : 0.000002s : 13: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.043988 179 85.79% : 0.037739s : 87: func_graph_cloner_run.FuncGraphClonerGraph 1.10% : 0.000482s : 3: func_graph_cloner_run.FuncGraphClonerNode 13.11% : 0.005768s : 89: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 1.206188 307 0.00% : 0.000004s : 1: ForceFp32Comm 0.53% : 0.006336s : 1: add_attr 0.52% : 0.006316s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000155s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.05% : 0.000563s : 1: auto_monad 0.00% : 0.000053s : 1: auto_monad_reorder 0.00% : 0.000008s : 1: backend_pass 0.00% : 0.000005s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.07% : 0.000839s : 1: bootstrap 0.00% : 0.000035s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000035s : 1: control_data_broadcast_order 0.00% : 0.000006s : 1: convert_after_rewriter 0.01% : 0.000066s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000019s : 1: environ_conv 0.01% : 0.000105s : 1: event_method 0.00% : 0.000015s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000027s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.05% : 0.000602s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.17% : 0.002052s : 1: mutable_eliminate 0.00% : 0.000029s : 1: offloading_packed_experts 0.00% : 0.000028s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000043s : 1: opt.transform.mutable_eliminate 1.77% : 0.021399s : 181: opt.transform.opt_a 0.01% : 0.000094s : 1: opt.transform.opt_after_cconv 0.00% : 0.000060s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000424s : 28: opt.transform.opt_b 0.18% : 0.002140s : 4: opt.transform.opt_resolve 0.01% : 0.000175s : 2: opt.transform.opt_trans_graph 0.01% : 0.000098s : 4: opt.transform.symbol_engine_opt 28.73% : 0.346570s : 1: opt_a 0.02% : 0.000236s : 1: opt_after_cconv 0.05% : 0.000629s : 1: opt_after_jit_grad 0.05% : 0.000630s : 1: opt_b 29.19% : 0.352112s : 1: optimize 0.00% : 0.000045s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000045s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000033s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000208s : 1: pre_auto_parallel 0.00% : 0.000013s : 1: py_interpret_to_execute 0.00% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000084s : 1: remove_dup_value 12.22% : 0.147369s : 3: renormalize.infer 1.85% : 0.022370s : 3: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000061s : 1: rewriter_after_opt_a 0.06% : 0.000757s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000017s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000159s : 1: symbol_engine_optimizer 2.78% : 0.033500s : 1: task_emit 0.02% : 0.000209s : 1: tuple_transform 21.47% : 0.258972s : 1: type_inference 0.02% : 0.000203s : 1: validate group_cases_6 have all been run, results of sub cases are below: case: (1,) {} pass. case: ('pynative', False) {} pass. case: ('pynative', True) {} pass. case: ('kbk', False) {} pass. case: (0,) {} pass. case: ('kbk', True) {} pass. case: ('kbk', False) {} pass. case: ('kbk', True) {} pass. ops group_cases_7 with 8 cases start to running, all cases are below: case: (, 'pynative', True) case: (, 'pynative', False) case: (, 0) case: (, 1) case: (, 0, mindspore.float16) case: (, 0, mindspore.bfloat16) case: (, 1, mindspore.float16) case: (, 1, mindspore.bfloat16) ops group_cases_7 total running memory: 1176M, memory threshold: 51200M TotalTime = 3.13924, [24] [bootstrap]: 0.00195503 [type_inference]: 0.083128 [event_method]: 5.038e-05 [auto_monad]: 0.00014271 [graph_reusing]: 7.31001e-06 [inline]: 3.68e-06 [add_attr]: 0.00842059, [1] [add_attr_with_inline]: 0.00840205, [1] [Cycle 1]: 0.00018133, [2] [tag_attr]: 6.658e-05 [meta_addattr_fg_expand]: 1.976e-05 [parallel-infer-symbol]: 4.27e-06 [pre_auto_parallel]: 7.728e-05 [insert-virtual-dataset]: 2.54999e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 2.17999e-06 [pipeline_split]: 1.87001e-06 [optimize]: 0.00927943, [53] [py_interpret_to_execute]: 5.34e-06 [rewriter_before_opt_a]: 0.00054258 [opt_a]: 0.00576863, [2] [Cycle 1]: 0.00478084, [45] [expand_dump_flag]: 4.22e-06 [switch_simplify]: 0.00010086 [loop_unroll]: 5.431e-05 [a_1]: 0.00106299 [with_stream_mark]: 2.066e-05 [recompute_prepare]: 1.187e-05 [updatestate_depend_eliminate]: 1.632e-05 [updatestate_assign_eliminate]: 1.317e-05 [updatestate_loads_eliminate]: 5.41002e-06 [parameter_eliminate]: 2.14999e-06 [a_2]: 0.00015373 [accelerated_algorithm]: 1.07e-05 [shard]: 0.0003032 [meta_shard_fg_expand]: 2.25e-05 [shard_inline]: 3.764e-05 [merge_send_recv]: 6.783e-05 [auto_parallel]: 3.085e-05 [parallel]: 0.00015132 [flash_sp]: 5.136e-05 [merge_comm]: 7.23999e-06 [allreduce_fusion]: 1.43e-05 [matmul_add_comm_reduction]: 2.009e-05 [allreduce_slice_to_reducescatter]: 8.52998e-06 [virtual_shard_identity]: 1.512e-05 [virtual_dataset]: 1.085e-05 [get_grad_eliminate_]: 1.006e-05 [virtual_output]: 1.03e-05 [merge_forward]: 6.97002e-06 [cell_reuse_recompute_pass]: 1.65001e-06 [offload_activation]: 2.241e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.033e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.499e-05 [set_forward_comm_id_for_comm_node_pass]: 1.418e-05 [meta_fg_expand]: 6.35002e-06 [flash_sp_send_recv_attached]: 2.73e-06 [receive_attached]: 1.755e-05 [after_resolve]: 1.784e-05 [a_after_grad]: 1.463e-05 [renormalize]: 0.0018657 [add_forward_monad_depend]: 7.58999e-06 [auto_monad_grad]: 2.75002e-06 [auto_monad_eliminator]: 3.568e-05 [cse]: 9.905e-05 [a_3]: 7.526e-05 [Cycle 2]: 0.00097665, [45] [expand_dump_flag]: 1.87999e-06 [switch_simplify]: 1.184e-05 [loop_unroll]: 9.86e-06 [a_1]: 0.00021522 [with_stream_mark]: 1.624e-05 [recompute_prepare]: 9.74e-06 [updatestate_depend_eliminate]: 6.38003e-06 [updatestate_assign_eliminate]: 5.37999e-06 [updatestate_loads_eliminate]: 5.24e-06 [parameter_eliminate]: 1.29e-06 [a_2]: 0.00013589 [accelerated_algorithm]: 9.30001e-06 [shard]: 1.37e-06 [meta_shard_fg_expand]: 2.14999e-06 [shard_inline]: 9.34e-06 [merge_send_recv]: 8.33999e-06 [auto_parallel]: 9.26998e-06 [parallel]: 5.77001e-06 [flash_sp]: 3.88001e-06 [merge_comm]: 6.04001e-06 [allreduce_fusion]: 5.84999e-06 [matmul_add_comm_reduction]: 9.09e-06 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 1.019e-05 [virtual_dataset]: 9.56e-06 [get_grad_eliminate_]: 8.92e-06 [virtual_output]: 9.06998e-06 [merge_forward]: 6.73e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 1.087e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.548e-05 [merge_recompute_call_nodes]: 1.00999e-06 [before_grad]: 1.431e-05 [set_forward_comm_id_for_comm_node_pass]: 5.77001e-06 [meta_fg_expand]: 3.61001e-06 [flash_sp_send_recv_attached]: 1.10001e-06 [receive_attached]: 4.426e-05 [after_resolve]: 1.603e-05 [a_after_grad]: 1.313e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.96998e-06 [auto_monad_grad]: 1.34e-06 [auto_monad_eliminator]: 1.183e-05 [cse]: 3.808e-05 [a_3]: 6.24e-05 [py_interpret_to_execute_after_opt_a]: 6.53e-06 [slice_cell_reuse_recomputed_activation]: 2.06998e-06 [rewriter_after_opt_a]: 5.08e-05 [convert_after_rewriter]: 1.40001e-06 [order_py_execute_after_rewriter]: 1.15001e-06 [mutable_eliminate]: 0.00076324 [opt_b]: 0.00035891, [1] [Cycle 1]: 0.00035146, [7] [b_1]: 0.00023639 [b_2]: 1.21e-05 [updatestate_depend_eliminate]: 9.40001e-06 [updatestate_assign_eliminate]: 5.30999e-06 [updatestate_loads_eliminate]: 5.34e-06 [renormalize]: 3.80009e-07 [cse]: 4.557e-05 [optimize_parallel_all_gather_comm]: 3.217e-05 [overlap_param_gather]: 1.058e-05 [cconv]: 2.834e-05 [loop_unroll]: 0.00046267 [opt_after_cconv]: 0.00014193, [1] [Cycle 1]: 0.00013606, [7] [c_1]: 4.178e-05 [parameter_eliminate]: 3.25e-06 [updatestate_depend_eliminate]: 7.95e-06 [updatestate_assign_eliminate]: 5.15001e-06 [updatestate_loads_eliminate]: 4.97999e-06 [cse]: 3.946e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 0.00021953 [tuple_transform]: 0.00012333, [1] [Cycle 1]: 0.00011744, [4] [d_1]: 8.222e-05 [none_parameter_eliminate]: 2.46e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 1.059e-05 [partial_unused_args_eliminate]: 1.91003e-06 [add_recomputation]: 7.512e-05 [cse_after_recomputation]: 3.955e-05, [1] [Cycle 1]: 3.399e-05, [1] [cse]: 2.803e-05 [environ_conv]: 2.968e-05 [swap_dp_allreduce_reducescatter]: 2.709e-05 [bias_add_comm_swap]: 1.098e-05 [label_micro_interleaved_index]: 1.377e-05 [label_fine_grained_interleaved_index]: 2.82002e-06 [merge_cast_opt]: 1.66002e-06 [slice_recompute_activation]: 2.06e-06 [micro_interleaved_order_control]: 2.27999e-06 [assign_add_opt]: 1.18001e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 8.27e-06 [full_micro_interleaved_order_control]: 9.77999e-06 [reorder_send_recv_between_fp_bp]: 2.56998e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.28002e-06 [interleave_parallel_branches]: 8.08001e-06 [overlap_opt_shard_in_pipeline]: 2.526e-05 [overlap_opt_shard_grad_in_pipeline]: 2.02001e-06 [control_data_broadcast_order]: 1.857e-05 [grouped_pairwise_exchange_alltoall]: 1.61998e-06 [offloading_packed_experts]: 5.45001e-06 [overlap_recompute_and_grad_model_parallel]: 1.347e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.71e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47001e-06 [overlap_recompute_comm]: 2.31e-06 [overlap_grad_ring_attention]: 2.052e-05 [overlap_grad_flash_sp]: 4.663e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 1.004e-05 [split_layernorm_comm]: 2.00002e-06 [handle_group_info]: 1.05999e-06 [symbol_engine_optimizer]: 0.00012032, [1] [Cycle 1]: 0.00011481, [6] [build]: 2.89e-05 [elim_shapecalc]: 1.494e-05 [elim_not_effective]: 1.668e-05 [opt_reshape]: 1.032e-05 [fold_const_symbol]: 1.427e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.75997e-06 [pipeline_parallel_scheduler]: 1.42999e-06 [auto_monad_reorder]: 3.037e-05 [get_jit_bprop_graph]: 1.71002e-06 [rewriter_after_jit_bprop_graph]: 4.02002e-06 [opt_after_jit_grad]: 0.00054079 [validate]: 8.432e-05 [backend_pass]: 8.60018e-07 [task_emit]: 3.03517 [execute]: 1.153e-05 Sums bootstrap : 0.001955s : 0.06% type_inference : 0.083128s : 2.66% event_method : 0.000050s : 0.00% auto_monad : 0.000143s : 0.00% graph_reusing : 0.000007s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000067s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000020s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000077s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000543s : 0.02% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000113s : 0.00% optimize.opt_a.loop_unroll : 0.000064s : 0.00% optimize.opt_a.a_1 : 0.001278s : 0.04% optimize.opt_a.with_stream_mark : 0.000037s : 0.00% optimize.opt_a.recompute_prepare : 0.000022s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000023s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000290s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.00% optimize.opt_a.shard : 0.000305s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000025s : 0.00% optimize.opt_a.shard_inline : 0.000047s : 0.00% optimize.opt_a.merge_send_recv : 0.000076s : 0.00% optimize.opt_a.auto_parallel : 0.000040s : 0.00% optimize.opt_a.parallel : 0.000157s : 0.01% optimize.opt_a.flash_sp : 0.000055s : 0.00% optimize.opt_a.merge_comm : 0.000013s : 0.00% optimize.opt_a.allreduce_fusion : 0.000020s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000029s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000025s : 0.00% optimize.opt_a.virtual_dataset : 0.000020s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.00% optimize.opt_a.virtual_output : 0.000019s : 0.00% optimize.opt_a.merge_forward : 0.000014s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000033s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000029s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000020s : 0.00% optimize.opt_a.meta_fg_expand : 0.000010s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000062s : 0.00% optimize.opt_a.after_resolve : 0.000034s : 0.00% optimize.opt_a.a_after_grad : 0.000028s : 0.00% optimize.opt_a.renormalize : 0.001866s : 0.06% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000048s : 0.00% optimize.opt_a.cse : 0.000137s : 0.00% optimize.opt_a.a_3 : 0.000138s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000051s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000763s : 0.02% optimize.opt_b.b_1 : 0.000236s : 0.01% optimize.opt_b.b_2 : 0.000012s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000046s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000032s : 0.00% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000028s : 0.00% optimize.loop_unroll : 0.000463s : 0.01% optimize.opt_after_cconv.c_1 : 0.000042s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.cse : 0.000039s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000220s : 0.01% optimize.tuple_transform.d_1 : 0.000082s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000075s : 0.00% optimize.cse_after_recomputation.cse : 0.000028s : 0.00% optimize.environ_conv : 0.000030s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000027s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000014s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000008s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000025s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000021s : 0.00% optimize.overlap_grad_flash_sp : 0.000047s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000029s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000030s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000541s : 0.02% validate : 0.000084s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 3.035166s : 96.98% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 0.000431 69 0.47% : 0.000002s : 3: substitution.elim_not_effective 3.73% : 0.000016s : 3: substitution.float_tuple_getitem_switch 0.37% : 0.000002s : 3: substitution.fold_const_symbol 1.66% : 0.000007s : 6: substitution.graph_param_transform 65.02% : 0.000280s : 6: substitution.inline 1.31% : 0.000006s : 6: substitution.j_node_and_user_rematch 1.40% : 0.000006s : 2: substitution.minmaximum_grad 1.52% : 0.000007s : 6: substitution.remove_not_recompute_node 1.44% : 0.000006s : 6: substitution.replace_old_param 3.89% : 0.000017s : 4: substitution.tuple_list_convert_item_index_to_positive 3.78% : 0.000016s : 4: substitution.tuple_list_get_item_const_eliminator 2.94% : 0.000013s : 4: substitution.tuple_list_get_item_depend_reorder 9.69% : 0.000042s : 12: substitution.tuple_list_get_item_eliminator 2.77% : 0.000012s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.083004 2 96.81% : 0.080357s : 1: type_inference.infer 3.19% : 0.002647s : 1: type_inference.specialize ------[replace.] 0.000152 12 61.10% : 0.000093s : 6: replace.inline 38.90% : 0.000059s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000287 12 96.54% : 0.000277s : 6: match.inline 3.46% : 0.000010s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000288 1946 1.01% : 0.000003s : 20: predicate.accumulaten_eliminater 0.74% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.56% : 0.000002s : 12: predicate.addn_check_dump 0.98% : 0.000003s : 20: predicate.addn_zero_filter 0.83% : 0.000002s : 20: predicate.adjust_all_reduce_mul_add 2.25% : 0.000006s : 32: predicate.arithmetic_simplify 0.90% : 0.000003s : 20: predicate.cast_eliminate 0.58% : 0.000002s : 12: predicate.check_bprop_eliminate 0.54% : 0.000002s : 12: predicate.compare_switch_simplify 0.18% : 0.000001s : 6: predicate.const_output_eliminate 0.54% : 0.000002s : 12: predicate.depend_value_elim 0.94% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.10% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.94% : 0.000003s : 20: predicate.dict_set_item_eliminator 0.81% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 6: predicate.elim_not_effective 0.36% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 26: predicate.environ_get_depend_swap 1.67% : 0.000005s : 38: predicate.environ_get_eliminate 1.09% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.44% : 0.000004s : 32: predicate.exchange_switch_depend_value 2.16% : 0.000006s : 32: predicate.float_depend_g_call 0.53% : 0.000002s : 12: predicate.float_environ_get_switch 0.92% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.71% : 0.000002s : 12: predicate.get_grad_eliminate 0.22% : 0.000001s : 6: predicate.graph_param_transform 0.55% : 0.000002s : 12: predicate.incorporate_call 0.50% : 0.000001s : 12: predicate.incorporate_call_switch 5.96% : 0.000017s : 88: predicate.inline 0.84% : 0.000002s : 12: predicate.inline_without_move 0.31% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 12: predicate.less_batch_normalization 1.83% : 0.000005s : 38: predicate.list_to_tuple_eliminator_ 2.54% : 0.000007s : 58: predicate.load_eliminater 0.75% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.61% : 0.000008s : 54: predicate.loop_unroll_before_grad 1.62% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 12: predicate.merge_addn 0.58% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 20: predicate.minmaximum_grad 0.89% : 0.000003s : 6: predicate.mutable_eliminate 0.36% : 0.000001s : 6: predicate.opt_reshape 0.35% : 0.000001s : 6: predicate.parallel_virtual_node 2.00% : 0.000006s : 32: predicate.partial_defer_inline 1.53% : 0.000004s : 32: predicate.partial_eliminate 0.94% : 0.000003s : 20: predicate.print_const_string_wrapper 0.63% : 0.000002s : 12: predicate.reduce_all_const_elim 1.32% : 0.000004s : 20: predicate.reduce_eliminate 2.62% : 0.000008s : 58: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 12: predicate.remove_not_recompute_node 1.36% : 0.000004s : 38: predicate.replace_applicator 0.51% : 0.000001s : 12: predicate.replace_old_param 0.21% : 0.000001s : 6: predicate.reset_defer_inline 1.01% : 0.000003s : 20: predicate.reshape_eliminate 0.59% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.70% : 0.000002s : 12: predicate.same_eliminate 0.42% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 12: predicate.shard_identity_eliminate 0.68% : 0.000002s : 12: predicate.special_op_eliminate 0.65% : 0.000002s : 12: predicate.specialize_transform 0.81% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.53% : 0.000004s : 32: predicate.switch_defer_inline 2.15% : 0.000006s : 44: predicate.switch_layer_defer_inline 5.45% : 0.000016s : 104: predicate.switch_simplify 0.91% : 0.000003s : 20: predicate.tile_eliminate 0.92% : 0.000003s : 20: predicate.transpose_eliminate 1.71% : 0.000005s : 32: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.63% : 0.000010s : 50: predicate.tuple_list_get_item_eliminator 1.47% : 0.000004s : 32: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000007s : 44: predicate.tuple_list_set_item_eliminator 1.82% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.37% : 0.000007s : 58: predicate.updatestate_pure_node_eliminater 3.01% : 0.000009s : 70: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 6: predicate.value_based_eliminate 0.80% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.79% : 0.000002s : 12: predicate.virtual_output_eliminate 0.26% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001944 17 56.46% : 0.001097s : 9: func_graph_cloner_run.FuncGraphClonerGraph 43.54% : 0.000847s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.161223 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.27% : 0.008427s : 1: add_attr 0.27% : 0.008407s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000080s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000150s : 1: auto_monad 0.00% : 0.000034s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.06% : 0.002002s : 1: bootstrap 0.00% : 0.000032s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000022s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000043s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000034s : 1: environ_conv 0.00% : 0.000059s : 1: event_method 0.00% : 0.000028s : 1: execute 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000017s : 1: label_micro_interleaved_index 0.01% : 0.000472s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000774s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000022s : 1: opt.transform.mutable_eliminate 0.07% : 0.002086s : 78: opt.transform.opt_a 0.00% : 0.000040s : 1: opt.transform.opt_after_cconv 0.00% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000220s : 28: opt.transform.opt_b 0.00% : 0.000090s : 2: opt.transform.opt_trans_graph 0.00% : 0.000052s : 4: opt.transform.symbol_engine_opt 0.18% : 0.005772s : 1: opt_a 0.00% : 0.000145s : 1: opt_after_cconv 0.02% : 0.000552s : 1: opt_after_jit_grad 0.01% : 0.000362s : 1: opt_b 0.29% : 0.009285s : 1: optimize 0.00% : 0.000036s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000051s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000024s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000029s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000082s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000010s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000011s : 1: remove_cast_before_assign_add 0.01% : 0.000229s : 1: remove_dup_value 0.03% : 0.000980s : 1: renormalize.infer 0.03% : 0.000875s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000055s : 1: rewriter_after_opt_a 0.02% : 0.000550s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000031s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000123s : 1: symbol_engine_optimizer 96.01% : 3.035207s : 1: task_emit 0.00% : 0.000127s : 1: tuple_transform 2.63% : 0.083157s : 1: type_inference 0.00% : 0.000117s : 1: validate TotalTime = 3.38661, [24] [bootstrap]: 0.00121502 [type_inference]: 0.0812468 [event_method]: 5.62e-05 [auto_monad]: 0.00017004 [graph_reusing]: 7.78999e-06 [inline]: 3.98999e-06 [add_attr]: 0.00910673, [1] [add_attr_with_inline]: 0.00908641, [1] [Cycle 1]: 0.0001955, [2] [tag_attr]: 7.058e-05 [meta_addattr_fg_expand]: 1.926e-05 [parallel-infer-symbol]: 3.73999e-06 [pre_auto_parallel]: 8.135e-05 [insert-virtual-dataset]: 2.79999e-06 [parallel-infer-symbol-second]: 1.10001e-06 [dataset_repeat_opt]: 2.01998e-06 [pipeline_split]: 1.93997e-06 [optimize]: 0.00915375, [53] [py_interpret_to_execute]: 6.36998e-06 [rewriter_before_opt_a]: 0.0005589 [opt_a]: 0.00548661, [2] [Cycle 1]: 0.00451201, [45] [expand_dump_flag]: 3.83999e-06 [switch_simplify]: 0.00010006 [loop_unroll]: 5.326e-05 [a_1]: 0.00105517 [with_stream_mark]: 2.622e-05 [recompute_prepare]: 1.339e-05 [updatestate_depend_eliminate]: 1.648e-05 [updatestate_assign_eliminate]: 1.36e-05 [updatestate_loads_eliminate]: 5.64998e-06 [parameter_eliminate]: 2.51998e-06 [a_2]: 0.00016315 [accelerated_algorithm]: 1.04e-05 [shard]: 2.11e-06 [meta_shard_fg_expand]: 3.97998e-06 [shard_inline]: 9.15001e-06 [merge_send_recv]: 4.585e-05 [auto_parallel]: 1.171e-05 [parallel]: 8.486e-05 [flash_sp]: 3.508e-05 [merge_comm]: 6.74999e-06 [allreduce_fusion]: 1.378e-05 [matmul_add_comm_reduction]: 1.994e-05 [allreduce_slice_to_reducescatter]: 8.57e-06 [virtual_shard_identity]: 1.279e-05 [virtual_dataset]: 9.62001e-06 [get_grad_eliminate_]: 9.17001e-06 [virtual_output]: 9.62999e-06 [merge_forward]: 6.32001e-06 [cell_reuse_recompute_pass]: 1.33002e-06 [offload_activation]: 2.14e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.62e-05 [merge_recompute_call_nodes]: 1.98997e-06 [before_grad]: 1.564e-05 [set_forward_comm_id_for_comm_node_pass]: 1.484e-05 [meta_fg_expand]: 5.75001e-06 [flash_sp_send_recv_attached]: 2.69001e-06 [receive_attached]: 1.757e-05 [after_resolve]: 1.72e-05 [a_after_grad]: 1.424e-05 [renormalize]: 0.0020846 [add_forward_monad_depend]: 1.02e-05 [auto_monad_grad]: 3.01999e-06 [auto_monad_eliminator]: 4.363e-05 [cse]: 0.00010799 [a_3]: 8.41e-05 [Cycle 2]: 0.00096012, [45] [expand_dump_flag]: 2.32001e-06 [switch_simplify]: 1.261e-05 [loop_unroll]: 9.82001e-06 [a_1]: 0.00022241 [with_stream_mark]: 2.23e-05 [recompute_prepare]: 9.83002e-06 [updatestate_depend_eliminate]: 6.87002e-06 [updatestate_assign_eliminate]: 5.89e-06 [updatestate_loads_eliminate]: 5.07e-06 [parameter_eliminate]: 1.81e-06 [a_2]: 0.00013437 [accelerated_algorithm]: 9.10001e-06 [shard]: 2.36998e-06 [meta_shard_fg_expand]: 2.77002e-06 [shard_inline]: 9.31002e-06 [merge_send_recv]: 9.59e-06 [auto_parallel]: 1.161e-05 [parallel]: 8.00999e-06 [flash_sp]: 4.50001e-06 [merge_comm]: 6.07001e-06 [allreduce_fusion]: 6.07999e-06 [matmul_add_comm_reduction]: 1.1e-05 [allreduce_slice_to_reducescatter]: 5.79981e-07 [virtual_shard_identity]: 1.013e-05 [virtual_dataset]: 9.31e-06 [get_grad_eliminate_]: 9.00001e-06 [virtual_output]: 8.76997e-06 [merge_forward]: 7.16001e-06 [cell_reuse_recompute_pass]: 3.08e-06 [offload_activation]: 1.26e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.508e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.406e-05 [set_forward_comm_id_for_comm_node_pass]: 6.22001e-06 [meta_fg_expand]: 3.75e-06 [flash_sp_send_recv_attached]: 1.35999e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 1.58e-05 [a_after_grad]: 1.348e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.37999e-06 [auto_monad_grad]: 1.20001e-06 [auto_monad_eliminator]: 1.176e-05 [cse]: 3.811e-05 [a_3]: 6.088e-05 [py_interpret_to_execute_after_opt_a]: 8.69998e-06 [slice_cell_reuse_recomputed_activation]: 2.09e-06 [rewriter_after_opt_a]: 5.411e-05 [convert_after_rewriter]: 1.67001e-06 [order_py_execute_after_rewriter]: 1.23002e-06 [mutable_eliminate]: 0.00085448 [opt_b]: 0.00036869, [1] [Cycle 1]: 0.00035868, [7] [b_1]: 0.00023753 [b_2]: 1.233e-05 [updatestate_depend_eliminate]: 1.07e-05 [updatestate_assign_eliminate]: 5.59e-06 [updatestate_loads_eliminate]: 4.95001e-06 [renormalize]: 8.09989e-07 [cse]: 4.882e-05 [optimize_parallel_all_gather_comm]: 3.586e-05 [overlap_param_gather]: 1.089e-05 [cconv]: 3.411e-05 [loop_unroll]: 0.0005121 [opt_after_cconv]: 0.00015146, [1] [Cycle 1]: 0.00014478, [7] [c_1]: 4.248e-05 [parameter_eliminate]: 4.33001e-06 [updatestate_depend_eliminate]: 8.98002e-06 [updatestate_assign_eliminate]: 5.63002e-06 [updatestate_loads_eliminate]: 5.46e-06 [cse]: 4.178e-05 [renormalize]: 5.50004e-07 [remove_dup_value]: 9.261e-05 [tuple_transform]: 0.00011516, [1] [Cycle 1]: 0.00011008, [4] [d_1]: 7.723e-05 [none_parameter_eliminate]: 1.81e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 1.065e-05 [partial_unused_args_eliminate]: 2.29001e-06 [add_recomputation]: 8.408e-05 [cse_after_recomputation]: 7.321e-05, [1] [Cycle 1]: 6.785e-05, [1] [cse]: 6.171e-05 [environ_conv]: 3.186e-05 [swap_dp_allreduce_reducescatter]: 2.942e-05 [bias_add_comm_swap]: 1.222e-05 [label_micro_interleaved_index]: 1.503e-05 [label_fine_grained_interleaved_index]: 3.21999e-06 [merge_cast_opt]: 1.42999e-06 [slice_recompute_activation]: 2.36998e-06 [micro_interleaved_order_control]: 2.21998e-06 [assign_add_opt]: 1.30001e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 8.88002e-06 [full_micro_interleaved_order_control]: 1.004e-05 [reorder_send_recv_between_fp_bp]: 2.78e-06 [comm_op_add_attrs]: 1.32999e-06 [add_comm_op_reuse_tag]: 1.05999e-06 [interleave_split_concat_branches]: 1.09e-06 [interleave_parallel_branches]: 8.55999e-06 [overlap_opt_shard_in_pipeline]: 3.053e-05 [overlap_opt_shard_grad_in_pipeline]: 1.82999e-06 [control_data_broadcast_order]: 1.999e-05 [grouped_pairwise_exchange_alltoall]: 1.89e-06 [offloading_packed_experts]: 5.91e-06 [overlap_recompute_and_grad_model_parallel]: 1.567e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.62001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.48e-06 [overlap_grad_ring_attention]: 2.086e-05 [overlap_grad_flash_sp]: 5.056e-05 [begin_end_overlap_inline]: 7.09988e-07 [split_matmul_comm_elemetwise]: 1.024e-05 [split_layernorm_comm]: 1.67999e-06 [handle_group_info]: 1.26002e-06 [symbol_engine_optimizer]: 0.00013901, [1] [Cycle 1]: 0.00013293, [6] [build]: 3.928e-05 [elim_shapecalc]: 1.76e-05 [elim_not_effective]: 1.848e-05 [opt_reshape]: 1.036e-05 [fold_const_symbol]: 1.408e-05 [renormalize]: 1.60013e-07 [detach_backward]: 2.46e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 3.041e-05 [get_jit_bprop_graph]: 1.97999e-06 [rewriter_after_jit_bprop_graph]: 5.65001e-06 [opt_after_jit_grad]: 0.00057905 [validate]: 8.746e-05 [backend_pass]: 1.19e-06 [task_emit]: 3.28425 [execute]: 1.154e-05 Sums bootstrap : 0.001215s : 0.04% type_inference : 0.081247s : 2.41% event_method : 0.000056s : 0.00% auto_monad : 0.000170s : 0.01% graph_reusing : 0.000008s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000071s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000019s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000081s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000559s : 0.02% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000113s : 0.00% optimize.opt_a.loop_unroll : 0.000063s : 0.00% optimize.opt_a.a_1 : 0.001278s : 0.04% optimize.opt_a.with_stream_mark : 0.000049s : 0.00% optimize.opt_a.recompute_prepare : 0.000023s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000023s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000298s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000020s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.00% optimize.opt_a.shard_inline : 0.000018s : 0.00% optimize.opt_a.merge_send_recv : 0.000055s : 0.00% optimize.opt_a.auto_parallel : 0.000023s : 0.00% optimize.opt_a.parallel : 0.000093s : 0.00% optimize.opt_a.flash_sp : 0.000040s : 0.00% optimize.opt_a.merge_comm : 0.000013s : 0.00% optimize.opt_a.allreduce_fusion : 0.000020s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000031s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.00% optimize.opt_a.virtual_dataset : 0.000019s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000018s : 0.00% optimize.opt_a.virtual_output : 0.000018s : 0.00% optimize.opt_a.merge_forward : 0.000013s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000034s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000021s : 0.00% optimize.opt_a.meta_fg_expand : 0.000010s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000020s : 0.00% optimize.opt_a.after_resolve : 0.000033s : 0.00% optimize.opt_a.a_after_grad : 0.000028s : 0.00% optimize.opt_a.renormalize : 0.002085s : 0.06% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000055s : 0.00% optimize.opt_a.cse : 0.000146s : 0.00% optimize.opt_a.a_3 : 0.000145s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000054s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000854s : 0.03% optimize.opt_b.b_1 : 0.000238s : 0.01% optimize.opt_b.b_2 : 0.000012s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000049s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000036s : 0.00% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000034s : 0.00% optimize.loop_unroll : 0.000512s : 0.02% optimize.opt_after_cconv.c_1 : 0.000042s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.cse : 0.000042s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000093s : 0.00% optimize.tuple_transform.d_1 : 0.000077s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000084s : 0.00% optimize.cse_after_recomputation.cse : 0.000062s : 0.00% optimize.environ_conv : 0.000032s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000029s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000015s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000031s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000020s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000016s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000021s : 0.00% optimize.overlap_grad_flash_sp : 0.000051s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000039s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000030s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000579s : 0.02% validate : 0.000087s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 3.284248s : 97.28% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 0.000439 69 0.50% : 0.000002s : 3: substitution.elim_not_effective 4.37% : 0.000019s : 3: substitution.float_tuple_getitem_switch 0.49% : 0.000002s : 3: substitution.fold_const_symbol 1.75% : 0.000008s : 6: substitution.graph_param_transform 63.28% : 0.000278s : 6: substitution.inline 1.32% : 0.000006s : 6: substitution.j_node_and_user_rematch 1.76% : 0.000008s : 2: substitution.minmaximum_grad 1.39% : 0.000006s : 6: substitution.remove_not_recompute_node 1.58% : 0.000007s : 6: substitution.replace_old_param 4.31% : 0.000019s : 4: substitution.tuple_list_convert_item_index_to_positive 3.60% : 0.000016s : 4: substitution.tuple_list_get_item_const_eliminator 2.85% : 0.000013s : 4: substitution.tuple_list_get_item_depend_reorder 10.11% : 0.000044s : 12: substitution.tuple_list_get_item_eliminator 2.68% : 0.000012s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.081082 2 96.37% : 0.078142s : 1: type_inference.infer 3.63% : 0.002940s : 1: type_inference.specialize ------[replace.] 0.000152 12 61.84% : 0.000094s : 6: replace.inline 38.16% : 0.000058s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000283 12 96.93% : 0.000274s : 6: match.inline 3.07% : 0.000009s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000294 1946 0.89% : 0.000003s : 20: predicate.accumulaten_eliminater 1.10% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.72% : 0.000002s : 12: predicate.addn_check_dump 0.89% : 0.000003s : 20: predicate.addn_zero_filter 0.77% : 0.000002s : 20: predicate.adjust_all_reduce_mul_add 2.02% : 0.000006s : 32: predicate.arithmetic_simplify 0.91% : 0.000003s : 20: predicate.cast_eliminate 0.58% : 0.000002s : 12: predicate.check_bprop_eliminate 0.50% : 0.000001s : 12: predicate.compare_switch_simplify 0.17% : 0.000000s : 6: predicate.const_output_eliminate 0.51% : 0.000001s : 12: predicate.depend_value_elim 0.92% : 0.000003s : 20: predicate.dict_get_item_const_eliminator 1.06% : 0.000003s : 20: predicate.dict_get_item_eliminator 0.88% : 0.000003s : 20: predicate.dict_set_item_eliminator 1.01% : 0.000003s : 12: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 6: predicate.elim_not_effective 0.37% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.04% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 26: predicate.environ_get_depend_swap 1.65% : 0.000005s : 38: predicate.environ_get_eliminate 1.07% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.45% : 0.000004s : 32: predicate.exchange_switch_depend_value 2.21% : 0.000007s : 32: predicate.float_depend_g_call 0.54% : 0.000002s : 12: predicate.float_environ_get_switch 1.12% : 0.000003s : 18: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 6: predicate.fold_const_symbol 0.65% : 0.000002s : 12: predicate.get_grad_eliminate 0.20% : 0.000001s : 6: predicate.graph_param_transform 0.55% : 0.000002s : 12: predicate.incorporate_call 0.48% : 0.000001s : 12: predicate.incorporate_call_switch 5.60% : 0.000016s : 88: predicate.inline 0.71% : 0.000002s : 12: predicate.inline_without_move 0.31% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 12: predicate.less_batch_normalization 1.77% : 0.000005s : 38: predicate.list_to_tuple_eliminator_ 2.39% : 0.000007s : 58: predicate.load_eliminater 1.01% : 0.000003s : 6: predicate.loop_unroll_after_grad 2.66% : 0.000008s : 54: predicate.loop_unroll_before_grad 1.64% : 0.000005s : 32: predicate.make_slice_get_slice_eliminator 0.55% : 0.000002s : 12: predicate.merge_addn 0.58% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.59% : 0.000002s : 12: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 20: predicate.minmaximum_grad 1.43% : 0.000004s : 6: predicate.mutable_eliminate 0.44% : 0.000001s : 6: predicate.opt_reshape 0.35% : 0.000001s : 6: predicate.parallel_virtual_node 2.04% : 0.000006s : 32: predicate.partial_defer_inline 1.58% : 0.000005s : 32: predicate.partial_eliminate 0.94% : 0.000003s : 20: predicate.print_const_string_wrapper 0.60% : 0.000002s : 12: predicate.reduce_all_const_elim 1.16% : 0.000003s : 20: predicate.reduce_eliminate 2.51% : 0.000007s : 58: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 12: predicate.remove_not_recompute_node 1.46% : 0.000004s : 38: predicate.replace_applicator 0.43% : 0.000001s : 12: predicate.replace_old_param 0.25% : 0.000001s : 6: predicate.reset_defer_inline 1.03% : 0.000003s : 20: predicate.reshape_eliminate 0.58% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.84% : 0.000002s : 12: predicate.same_eliminate 0.49% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 12: predicate.shard_identity_eliminate 0.68% : 0.000002s : 12: predicate.special_op_eliminate 0.73% : 0.000002s : 12: predicate.specialize_transform 0.84% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.46% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.51% : 0.000004s : 32: predicate.switch_defer_inline 2.20% : 0.000006s : 44: predicate.switch_layer_defer_inline 5.36% : 0.000016s : 104: predicate.switch_simplify 0.84% : 0.000002s : 20: predicate.tile_eliminate 0.87% : 0.000003s : 20: predicate.transpose_eliminate 1.61% : 0.000005s : 32: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000005s : 32: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 3.58% : 0.000011s : 50: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 32: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000007s : 44: predicate.tuple_list_set_item_eliminator 1.81% : 0.000005s : 38: predicate.tuple_to_list_eliminator_ 2.34% : 0.000007s : 58: predicate.updatestate_pure_node_eliminater 3.03% : 0.000009s : 70: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 6: predicate.value_based_eliminate 0.64% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.78% : 0.000002s : 12: predicate.virtual_output_eliminate 0.24% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002108 17 56.63% : 0.001194s : 9: func_graph_cloner_run.FuncGraphClonerGraph 43.37% : 0.000914s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.409170 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.27% : 0.009114s : 1: add_attr 0.27% : 0.009091s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000089s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000179s : 1: auto_monad 0.00% : 0.000035s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000016s : 1: bias_add_comm_swap 0.04% : 0.001268s : 1: bootstrap 0.00% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000024s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000076s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000037s : 1: environ_conv 0.00% : 0.000066s : 1: event_method 0.00% : 0.000024s : 1: execute 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000018s : 1: label_micro_interleaved_index 0.02% : 0.000522s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.03% : 0.000868s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.00% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000026s : 1: opt.transform.mutable_eliminate 0.06% : 0.002057s : 78: opt.transform.opt_a 0.00% : 0.000041s : 1: opt.transform.opt_after_cconv 0.00% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000221s : 28: opt.transform.opt_b 0.00% : 0.000086s : 2: opt.transform.opt_trans_graph 0.00% : 0.000056s : 4: opt.transform.symbol_engine_opt 0.16% : 0.005490s : 1: opt_a 0.00% : 0.000155s : 1: opt_after_cconv 0.02% : 0.000592s : 1: opt_after_jit_grad 0.01% : 0.000373s : 1: opt_b 0.27% : 0.009159s : 1: optimize 0.00% : 0.000040s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000054s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000024s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000035s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000019s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000086s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000098s : 1: remove_dup_value 0.03% : 0.001001s : 1: renormalize.infer 0.03% : 0.001071s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000058s : 1: rewriter_after_opt_a 0.02% : 0.000569s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000034s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000142s : 1: symbol_engine_optimizer 96.34% : 3.284358s : 1: task_emit 0.00% : 0.000118s : 1: tuple_transform 2.38% : 0.081283s : 1: type_inference 0.00% : 0.000128s : 1: validate TotalTime = 2.88107, [24] [bootstrap]: 0.00107631 [type_inference]: 0.042015 [event_method]: 2.088e-05 [auto_monad]: 0.00013704 [graph_reusing]: 5.88002e-06 [inline]: 3.06001e-06 [add_attr]: 0.008623, [1] [add_attr_with_inline]: 0.00842278, [1] [Cycle 1]: 0.00013547, [2] [tag_attr]: 3.279e-05 [meta_addattr_fg_expand]: 1.293e-05 [parallel-infer-symbol]: 4.78001e-06 [pre_auto_parallel]: 5.55e-05 [insert-virtual-dataset]: 2.53e-06 [parallel-infer-symbol-second]: 1.00001e-06 [dataset_repeat_opt]: 2.16998e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.0052529, [53] [py_interpret_to_execute]: 5.76003e-06 [rewriter_before_opt_a]: 0.0002072 [opt_a]: 0.00286525, [2] [Cycle 1]: 0.00231643, [45] [expand_dump_flag]: 3.63999e-06 [switch_simplify]: 7.047e-05 [loop_unroll]: 2.868e-05 [a_1]: 0.00055372 [with_stream_mark]: 1.64e-05 [recompute_prepare]: 7.43e-06 [updatestate_depend_eliminate]: 1.186e-05 [updatestate_assign_eliminate]: 1.005e-05 [updatestate_loads_eliminate]: 2.93003e-06 [parameter_eliminate]: 1.85001e-06 [a_2]: 6.762e-05 [accelerated_algorithm]: 6.14001e-06 [shard]: 1.82999e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 8.42998e-06 [merge_send_recv]: 4.014e-05 [auto_parallel]: 7.14001e-06 [parallel]: 8.221e-05 [flash_sp]: 3.056e-05 [merge_comm]: 3.8e-06 [allreduce_fusion]: 1.011e-05 [matmul_add_comm_reduction]: 1.629e-05 [allreduce_slice_to_reducescatter]: 8.05999e-06 [virtual_shard_identity]: 8.23999e-06 [virtual_dataset]: 5.61998e-06 [get_grad_eliminate_]: 5.18002e-06 [virtual_output]: 5.61998e-06 [merge_forward]: 3.87998e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 1.583e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.026e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 8.47e-06 [set_forward_comm_id_for_comm_node_pass]: 1.022e-05 [meta_fg_expand]: 2.87002e-06 [flash_sp_send_recv_attached]: 2.71e-06 [receive_attached]: 1.699e-05 [after_resolve]: 8.99998e-06 [a_after_grad]: 8.60999e-06 [renormalize]: 0.00083804 [add_forward_monad_depend]: 6.32001e-06 [auto_monad_grad]: 2.45002e-06 [auto_monad_eliminator]: 2.345e-05 [cse]: 4.509e-05 [a_3]: 4.009e-05 [Cycle 2]: 0.00053748, [45] [expand_dump_flag]: 1.64e-06 [switch_simplify]: 6.22001e-06 [loop_unroll]: 4.80001e-06 [a_1]: 9.625e-05 [with_stream_mark]: 1.111e-05 [recompute_prepare]: 5.19998e-06 [updatestate_depend_eliminate]: 2.52001e-06 [updatestate_assign_eliminate]: 2.24001e-06 [updatestate_loads_eliminate]: 2.39999e-06 [parameter_eliminate]: 1.11002e-06 [a_2]: 5.559e-05 [accelerated_algorithm]: 5.15001e-06 [shard]: 1.02998e-06 [meta_shard_fg_expand]: 1.30999e-06 [shard_inline]: 4.95001e-06 [merge_send_recv]: 4.46002e-06 [auto_parallel]: 5.32999e-06 [parallel]: 4.24997e-06 [flash_sp]: 3.07002e-06 [merge_comm]: 3.03e-06 [allreduce_fusion]: 2.59999e-06 [matmul_add_comm_reduction]: 4.71002e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 1.106e-05 [virtual_dataset]: 4.89003e-06 [get_grad_eliminate_]: 5.05001e-06 [virtual_output]: 5.11997e-06 [merge_forward]: 2.81999e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 6.32001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.118e-05 [merge_recompute_call_nodes]: 8.30012e-07 [before_grad]: 7.75e-06 [set_forward_comm_id_for_comm_node_pass]: 2.69001e-06 [meta_fg_expand]: 1.81e-06 [flash_sp_send_recv_attached]: 9.10019e-07 [receive_attached]: 1.35999e-06 [after_resolve]: 7.68001e-06 [a_after_grad]: 6.79001e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 9.10019e-07 [auto_monad_grad]: 8.90024e-07 [auto_monad_eliminator]: 5.44e-06 [cse]: 1.146e-05 [a_3]: 2.777e-05 [py_interpret_to_execute_after_opt_a]: 4.37e-06 [slice_cell_reuse_recomputed_activation]: 1.97999e-06 [rewriter_after_opt_a]: 2.599e-05 [convert_after_rewriter]: 1.29e-06 [order_py_execute_after_rewriter]: 1.15001e-06 [mutable_eliminate]: 0.00064801 [opt_b]: 0.00017003, [1] [Cycle 1]: 0.00016297, [7] [b_1]: 9.662e-05 [b_2]: 6.54999e-06 [updatestate_depend_eliminate]: 4.97999e-06 [updatestate_assign_eliminate]: 2.20002e-06 [updatestate_loads_eliminate]: 2.19999e-06 [renormalize]: 7.79983e-07 [cse]: 1.691e-05 [optimize_parallel_all_gather_comm]: 2.653e-05 [overlap_param_gather]: 1.071e-05 [cconv]: 2.615e-05 [loop_unroll]: 0.00044332 [opt_after_cconv]: 8.853e-05, [1] [Cycle 1]: 8.261e-05, [7] [c_1]: 2.366e-05 [parameter_eliminate]: 2.88e-06 [updatestate_depend_eliminate]: 4.68001e-06 [updatestate_assign_eliminate]: 2.26e-06 [updatestate_loads_eliminate]: 2.12001e-06 [cse]: 1.522e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.308e-05 [tuple_transform]: 6.776e-05, [1] [Cycle 1]: 6.258e-05, [4] [d_1]: 3.719e-05 [none_parameter_eliminate]: 1.52001e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 5.69e-06 [partial_unused_args_eliminate]: 1.73002e-06 [add_recomputation]: 5.908e-05 [cse_after_recomputation]: 2.001e-05, [1] [Cycle 1]: 1.515e-05, [1] [cse]: 9.89999e-06 [environ_conv]: 4.549e-05 [swap_dp_allreduce_reducescatter]: 2.32e-05 [bias_add_comm_swap]: 9.54999e-06 [label_micro_interleaved_index]: 1.305e-05 [label_fine_grained_interleaved_index]: 2.43e-06 [merge_cast_opt]: 1.59998e-06 [slice_recompute_activation]: 2.33002e-06 [micro_interleaved_order_control]: 2.09e-06 [assign_add_opt]: 1.46998e-06 [ForceFp32Comm]: 8.90024e-07 [remove_cast_before_assign_add]: 8.04997e-06 [full_micro_interleaved_order_control]: 1.01e-05 [reorder_send_recv_between_fp_bp]: 2.89001e-06 [comm_op_add_attrs]: 1.27999e-06 [add_comm_op_reuse_tag]: 1.28002e-06 [interleave_split_concat_branches]: 1.32e-06 [interleave_parallel_branches]: 7.68999e-06 [overlap_opt_shard_in_pipeline]: 2.453e-05 [overlap_opt_shard_grad_in_pipeline]: 2.04e-06 [control_data_broadcast_order]: 1.18e-05 [grouped_pairwise_exchange_alltoall]: 1.42999e-06 [offloading_packed_experts]: 3.68999e-06 [overlap_recompute_and_grad_model_parallel]: 1.189e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.43e-06 [overlap_grad_ring_attention]: 1.786e-05 [overlap_grad_flash_sp]: 4.016e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 9.81998e-06 [split_layernorm_comm]: 1.86e-06 [handle_group_info]: 1.19e-06 [symbol_engine_optimizer]: 7.122e-05, [1] [Cycle 1]: 6.664e-05, [6] [build]: 3.4e-06 [elim_shapecalc]: 9.64e-06 [elim_not_effective]: 1.245e-05 [opt_reshape]: 5.55001e-06 [fold_const_symbol]: 8.35999e-06 [renormalize]: 1.70025e-07 [detach_backward]: 2.11e-06 [pipeline_parallel_scheduler]: 1.55001e-06 [auto_monad_reorder]: 2.205e-05 [get_jit_bprop_graph]: 1.76e-06 [rewriter_after_jit_bprop_graph]: 3.26001e-06 [opt_after_jit_grad]: 0.00049178 [validate]: 5.549e-05 [backend_pass]: 9.20001e-07 [task_emit]: 2.82266 [execute]: 1.286e-05 Sums bootstrap : 0.001076s : 0.04% type_inference : 0.042015s : 1.46% event_method : 0.000021s : 0.00% auto_monad : 0.000137s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000033s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.00% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000056s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000207s : 0.01% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000077s : 0.00% optimize.opt_a.loop_unroll : 0.000033s : 0.00% optimize.opt_a.a_1 : 0.000650s : 0.02% optimize.opt_a.with_stream_mark : 0.000028s : 0.00% optimize.opt_a.recompute_prepare : 0.000013s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000014s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000012s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000123s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000045s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000086s : 0.00% optimize.opt_a.flash_sp : 0.000034s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000013s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000008s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.00% optimize.opt_a.virtual_dataset : 0.000011s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000010s : 0.00% optimize.opt_a.virtual_output : 0.000011s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000016s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000018s : 0.00% optimize.opt_a.after_resolve : 0.000017s : 0.00% optimize.opt_a.a_after_grad : 0.000015s : 0.00% optimize.opt_a.renormalize : 0.000838s : 0.03% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.00% optimize.opt_a.cse : 0.000057s : 0.00% optimize.opt_a.a_3 : 0.000068s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000026s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000648s : 0.02% optimize.opt_b.b_1 : 0.000097s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.00% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000026s : 0.00% optimize.loop_unroll : 0.000443s : 0.02% optimize.opt_after_cconv.c_1 : 0.000024s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000015s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.00% optimize.tuple_transform.d_1 : 0.000037s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000059s : 0.00% optimize.cse_after_recomputation.cse : 0.000010s : 0.00% optimize.environ_conv : 0.000045s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000023s : 0.00% optimize.bias_add_comm_swap : 0.000010s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000008s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000025s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000018s : 0.00% optimize.overlap_grad_flash_sp : 0.000040s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000008s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000492s : 0.02% validate : 0.000055s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.822659s : 98.31% execute : 0.000013s : 0.00% Time group info: ------[substitution.] 0.000195 24 0.97% : 0.000002s : 2: substitution.elim_not_effective 0.74% : 0.000001s : 2: substitution.fold_const_symbol 2.67% : 0.000005s : 3: substitution.graph_param_transform 74.80% : 0.000146s : 5: substitution.inline 1.63% : 0.000003s : 4: substitution.j_node_and_user_rematch 6.47% : 0.000013s : 4: substitution.remove_not_recompute_node 1.85% : 0.000004s : 2: substitution.replace_old_param 10.87% : 0.000021s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.041931 2 97.03% : 0.040685s : 1: type_inference.infer 2.97% : 0.001246s : 1: type_inference.specialize ------[replace.] 0.000058 7 76.74% : 0.000045s : 5: replace.inline 23.26% : 0.000014s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000163 7 87.67% : 0.000143s : 5: match.inline 12.33% : 0.000020s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000169 1031 1.08% : 0.000002s : 11: predicate.accumulaten_eliminater 0.96% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 6: predicate.addn_check_dump 0.95% : 0.000002s : 11: predicate.addn_zero_filter 0.81% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.35% : 0.000004s : 17: predicate.arithmetic_simplify 0.97% : 0.000002s : 11: predicate.cast_eliminate 0.60% : 0.000001s : 6: predicate.check_bprop_eliminate 0.54% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.60% : 0.000001s : 6: predicate.depend_value_elim 0.90% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.14% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 11: predicate.dict_set_item_eliminator 1.01% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.39% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.14% : 0.000002s : 14: predicate.environ_get_depend_swap 1.58% : 0.000003s : 20: predicate.environ_get_eliminate 1.09% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.44% : 0.000002s : 18: predicate.exchange_switch_depend_value 2.32% : 0.000004s : 18: predicate.float_depend_g_call 0.50% : 0.000001s : 6: predicate.float_environ_get_switch 0.76% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.63% : 0.000001s : 6: predicate.get_grad_eliminate 0.30% : 0.000001s : 3: predicate.graph_param_transform 0.51% : 0.000001s : 6: predicate.incorporate_call 0.44% : 0.000001s : 6: predicate.incorporate_call_switch 5.95% : 0.000010s : 47: predicate.inline 0.70% : 0.000001s : 6: predicate.inline_without_move 0.30% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.01% : 0.000002s : 6: predicate.less_batch_normalization 1.70% : 0.000003s : 19: predicate.list_to_tuple_eliminator_ 2.36% : 0.000004s : 30: predicate.load_eliminater 1.01% : 0.000002s : 3: predicate.loop_unroll_after_grad 3.01% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.83% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 6: predicate.merge_addn 0.59% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.86% : 0.000001s : 11: predicate.minmaximum_grad 1.11% : 0.000002s : 3: predicate.mutable_eliminate 0.36% : 0.000001s : 3: predicate.opt_reshape 0.34% : 0.000001s : 3: predicate.parallel_virtual_node 1.82% : 0.000003s : 18: predicate.partial_defer_inline 1.38% : 0.000002s : 16: predicate.partial_eliminate 0.86% : 0.000001s : 11: predicate.print_const_string_wrapper 0.69% : 0.000001s : 6: predicate.reduce_all_const_elim 1.28% : 0.000002s : 11: predicate.reduce_eliminate 2.70% : 0.000005s : 30: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 6: predicate.remove_not_recompute_node 1.16% : 0.000002s : 19: predicate.replace_applicator 0.53% : 0.000001s : 6: predicate.replace_old_param 0.23% : 0.000000s : 3: predicate.reset_defer_inline 0.99% : 0.000002s : 11: predicate.reshape_eliminate 0.59% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 3: predicate.row_tensor_eliminate 0.79% : 0.000001s : 6: predicate.same_eliminate 0.38% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.85% : 0.000001s : 6: predicate.shard_identity_eliminate 0.67% : 0.000001s : 6: predicate.special_op_eliminate 0.63% : 0.000001s : 6: predicate.specialize_transform 0.95% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.97% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.56% : 0.000003s : 18: predicate.switch_defer_inline 2.09% : 0.000004s : 24: predicate.switch_layer_defer_inline 5.64% : 0.000010s : 61: predicate.switch_simplify 0.89% : 0.000002s : 11: predicate.tile_eliminate 0.93% : 0.000002s : 11: predicate.transpose_eliminate 1.57% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.41% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.81% : 0.000003s : 19: predicate.tuple_to_list_eliminator_ 2.48% : 0.000004s : 30: predicate.updatestate_pure_node_eliminater 2.89% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 3: predicate.value_based_eliminate 0.60% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.82% : 0.000001s : 6: predicate.virtual_output_eliminate 0.23% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001012 12 34.06% : 0.000345s : 5: func_graph_cloner_run.FuncGraphClonerGraph 65.94% : 0.000667s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.896583 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.30% : 0.008635s : 1: add_attr 0.29% : 0.008427s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000063s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000143s : 1: auto_monad 0.00% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000012s : 1: bias_add_comm_swap 0.04% : 0.001139s : 1: bootstrap 0.00% : 0.000030s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000023s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000050s : 1: environ_conv 0.00% : 0.000027s : 1: event_method 0.00% : 0.000052s : 1: execute 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000010s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.02% : 0.000451s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000658s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000013s : 1: opt.transform.mutable_eliminate 0.04% : 0.001037s : 78: opt.transform.opt_a 0.00% : 0.000022s : 1: opt.transform.opt_after_cconv 0.00% : 0.000020s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000078s : 28: opt.transform.opt_b 0.00% : 0.000041s : 2: opt.transform.opt_trans_graph 0.00% : 0.000033s : 4: opt.transform.symbol_engine_opt 0.10% : 0.002868s : 1: opt_a 0.00% : 0.000092s : 1: opt_after_cconv 0.02% : 0.000502s : 1: opt_after_jit_grad 0.01% : 0.000173s : 1: opt_b 0.18% : 0.005258s : 1: optimize 0.00% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000044s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000021s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000028s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000060s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000011s : 1: remove_cast_before_assign_add 0.00% : 0.000016s : 1: remove_dup_value 0.01% : 0.000368s : 1: renormalize.infer 0.02% : 0.000460s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000029s : 1: rewriter_after_opt_a 0.01% : 0.000213s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000027s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000074s : 1: symbol_engine_optimizer 97.45% : 2.822822s : 1: task_emit 0.00% : 0.000071s : 1: tuple_transform 1.45% : 0.042047s : 1: type_inference 0.00% : 0.000086s : 1: validate TotalTime = 0.193028, [24] [bootstrap]: 0.00068098 [type_inference]: 0.0504549 [event_method]: 5.835e-05 [auto_monad]: 0.00014103 [graph_reusing]: 8.3e-06 [inline]: 3.60998e-06 [add_attr]: 0.00461257, [1] [add_attr_with_inline]: 0.0046027, [1] [Cycle 1]: 0.00011593, [2] [tag_attr]: 6.155e-05 [meta_addattr_fg_expand]: 1.652e-05 [parallel-infer-symbol]: 4.18999e-06 [pre_auto_parallel]: 7.137e-05 [insert-virtual-dataset]: 2.51e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.17999e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.110332, [53] [py_interpret_to_execute]: 5.24998e-06 [rewriter_before_opt_a]: 0.0002604 [opt_a]: 0.10607, [3] [Cycle 1]: 0.0942229, [45] [expand_dump_flag]: 4.44998e-06 [switch_simplify]: 9.239e-05 [loop_unroll]: 7.669e-05 [a_1]: 0.00173608 [with_stream_mark]: 3.787e-05 [recompute_prepare]: 3.619e-05 [updatestate_depend_eliminate]: 2.313e-05 [updatestate_assign_eliminate]: 1.445e-05 [updatestate_loads_eliminate]: 1.401e-05 [parameter_eliminate]: 3.21001e-06 [a_2]: 0.00039718 [accelerated_algorithm]: 8.538e-05 [shard]: 1.63002e-06 [meta_shard_fg_expand]: 1.044e-05 [shard_inline]: 2.545e-05 [merge_send_recv]: 2.476e-05 [auto_parallel]: 1.9e-05 [parallel]: 3.906e-05 [flash_sp]: 1.508e-05 [merge_comm]: 1.617e-05 [allreduce_fusion]: 1.495e-05 [matmul_add_comm_reduction]: 3.807e-05 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 2.613e-05 [virtual_dataset]: 2.542e-05 [get_grad_eliminate_]: 2.355e-05 [virtual_output]: 2.305e-05 [merge_forward]: 1.615e-05 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 2.38e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.023e-05 [merge_recompute_call_nodes]: 1.74998e-06 [before_grad]: 3.714e-05 [set_forward_comm_id_for_comm_node_pass]: 1.566e-05 [meta_fg_expand]: 0.028021 [flash_sp_send_recv_attached]: 1.189e-05 [receive_attached]: 2.84001e-06 [after_resolve]: 0.00019029 [a_after_grad]: 0.00022017 [renormalize]: 0.0594875 [add_forward_monad_depend]: 2.795e-05 [auto_monad_grad]: 2.318e-05 [auto_monad_eliminator]: 0.00020626 [cse]: 0.00051308 [a_3]: 0.00214948 [Cycle 2]: 0.00923023, [45] [expand_dump_flag]: 4.35e-06 [switch_simplify]: 0.00012733 [loop_unroll]: 0.00011746 [a_1]: 0.0032218 [with_stream_mark]: 3.802e-05 [recompute_prepare]: 2.48e-05 [updatestate_depend_eliminate]: 1.483e-05 [updatestate_assign_eliminate]: 1.293e-05 [updatestate_loads_eliminate]: 1.3e-05 [parameter_eliminate]: 2.63e-06 [a_2]: 0.00033791 [accelerated_algorithm]: 2.769e-05 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 1.11e-05 [shard_inline]: 2.075e-05 [merge_send_recv]: 1.917e-05 [auto_parallel]: 1.995e-05 [parallel]: 9.51e-06 [flash_sp]: 4.32e-06 [merge_comm]: 1.383e-05 [allreduce_fusion]: 1.353e-05 [matmul_add_comm_reduction]: 1.997e-05 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 2.219e-05 [virtual_dataset]: 1.998e-05 [get_grad_eliminate_]: 2.061e-05 [virtual_output]: 1.971e-05 [merge_forward]: 1.366e-05 [cell_reuse_recompute_pass]: 2.78e-06 [offload_activation]: 2.324e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.713e-05 [merge_recompute_call_nodes]: 1.35001e-06 [before_grad]: 3.196e-05 [set_forward_comm_id_for_comm_node_pass]: 1.392e-05 [meta_fg_expand]: 0.00021575 [flash_sp_send_recv_attached]: 2.34999e-06 [receive_attached]: 3.08e-06 [after_resolve]: 3.66e-05 [a_after_grad]: 3.468e-05 [renormalize]: 0.00397193 [add_forward_monad_depend]: 9.49999e-06 [auto_monad_grad]: 2.99999e-06 [auto_monad_eliminator]: 4.033e-05 [cse]: 0.0001692 [a_3]: 0.0001684 [Cycle 3]: 0.00259464, [45] [expand_dump_flag]: 2.81e-06 [switch_simplify]: 2.358e-05 [loop_unroll]: 2.076e-05 [a_1]: 0.00096471 [with_stream_mark]: 2.967e-05 [recompute_prepare]: 2.464e-05 [updatestate_depend_eliminate]: 1.479e-05 [updatestate_assign_eliminate]: 1.256e-05 [updatestate_loads_eliminate]: 1.289e-05 [parameter_eliminate]: 2.71e-06 [a_2]: 0.00040234 [accelerated_algorithm]: 3.009e-05 [shard]: 2.04e-06 [meta_shard_fg_expand]: 7.68999e-06 [shard_inline]: 2.339e-05 [merge_send_recv]: 2.07e-05 [auto_parallel]: 2.109e-05 [parallel]: 8.97e-06 [flash_sp]: 1.77999e-06 [merge_comm]: 1.408e-05 [allreduce_fusion]: 1.389e-05 [matmul_add_comm_reduction]: 2.028e-05 [allreduce_slice_to_reducescatter]: 1.05999e-06 [virtual_shard_identity]: 2.476e-05 [virtual_dataset]: 2.267e-05 [get_grad_eliminate_]: 2.168e-05 [virtual_output]: 2.275e-05 [merge_forward]: 1.405e-05 [cell_reuse_recompute_pass]: 3.31999e-06 [offload_activation]: 2.261e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.621e-05 [merge_recompute_call_nodes]: 1.94e-06 [before_grad]: 3.621e-05 [set_forward_comm_id_for_comm_node_pass]: 1.54e-05 [meta_fg_expand]: 1.014e-05 [flash_sp_send_recv_attached]: 1.99e-06 [receive_attached]: 2.49001e-06 [after_resolve]: 2.867e-05 [a_after_grad]: 3.287e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 3.44001e-06 [auto_monad_grad]: 1.72001e-06 [auto_monad_eliminator]: 2.762e-05 [cse]: 0.00019866 [a_3]: 0.00015232 [py_interpret_to_execute_after_opt_a]: 8.87e-06 [slice_cell_reuse_recomputed_activation]: 2.59001e-06 [rewriter_after_opt_a]: 8.314e-05 [convert_after_rewriter]: 1.17e-06 [order_py_execute_after_rewriter]: 1.20001e-06 [mutable_eliminate]: 0.00084606 [opt_b]: 0.00080789, [1] [Cycle 1]: 0.00079927, [7] [b_1]: 0.00059507 [b_2]: 2.381e-05 [updatestate_depend_eliminate]: 1.918e-05 [updatestate_assign_eliminate]: 1.233e-05 [updatestate_loads_eliminate]: 1.163e-05 [renormalize]: 8.79983e-07 [cse]: 9.486e-05 [optimize_parallel_all_gather_comm]: 3.691e-05 [overlap_param_gather]: 2.46998e-06 [cconv]: 3.683e-05 [loop_unroll]: 0.00053646 [opt_after_cconv]: 0.0003409, [1] [Cycle 1]: 0.00033405, [7] [c_1]: 9.832e-05 [parameter_eliminate]: 3.68e-06 [updatestate_depend_eliminate]: 1.843e-05 [updatestate_assign_eliminate]: 1.204e-05 [updatestate_loads_eliminate]: 1.217e-05 [cse]: 8.452e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 0.00018624 [tuple_transform]: 0.00029742, [1] [Cycle 1]: 0.0002916, [4] [d_1]: 0.0002399 [none_parameter_eliminate]: 2.58e-06 [renormalize]: 3.39991e-07 [switch_simplify]: 2.613e-05 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 0.00012271 [cse_after_recomputation]: 6.926e-05, [1] [Cycle 1]: 6.345e-05, [1] [cse]: 5.68e-05 [environ_conv]: 1.743e-05 [swap_dp_allreduce_reducescatter]: 1.613e-05 [bias_add_comm_swap]: 3.37002e-06 [label_micro_interleaved_index]: 6.78998e-06 [label_fine_grained_interleaved_index]: 2.74001e-06 [merge_cast_opt]: 1.36998e-06 [slice_recompute_activation]: 2.24001e-06 [micro_interleaved_order_control]: 2.57001e-06 [assign_add_opt]: 1.49e-06 [ForceFp32Comm]: 9.20001e-07 [remove_cast_before_assign_add]: 1.59e-06 [full_micro_interleaved_order_control]: 2.74999e-06 [reorder_send_recv_between_fp_bp]: 2.89001e-06 [comm_op_add_attrs]: 1.15999e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.34998e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 6.32001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.20002e-06 [control_data_broadcast_order]: 3.819e-05 [grouped_pairwise_exchange_alltoall]: 1.40001e-06 [offloading_packed_experts]: 1.112e-05 [overlap_recompute_and_grad_model_parallel]: 9.94001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.21997e-06 [overlap_recompute_comm]: 2.58e-06 [overlap_grad_ring_attention]: 8.97999e-06 [overlap_grad_flash_sp]: 4.387e-05 [begin_end_overlap_inline]: 5.79981e-07 [split_matmul_comm_elemetwise]: 2.66e-06 [split_layernorm_comm]: 1.84e-06 [handle_group_info]: 1.40001e-06 [symbol_engine_optimizer]: 0.00017527, [1] [Cycle 1]: 0.00016948, [6] [build]: 1.33e-05 [elim_shapecalc]: 2.848e-05 [elim_not_effective]: 3.736e-05 [opt_reshape]: 2.235e-05 [fold_const_symbol]: 3.442e-05 [renormalize]: 4.30009e-07 [detach_backward]: 2.66e-06 [pipeline_parallel_scheduler]: 1.57001e-06 [auto_monad_reorder]: 4.413e-05 [get_jit_bprop_graph]: 2.43e-06 [rewriter_after_jit_bprop_graph]: 4.89e-06 [opt_after_jit_grad]: 0.00063681 [validate]: 0.00012884 [backend_pass]: 1.07e-06 [task_emit]: 0.0255369 [execute]: 8.89e-06 Sums bootstrap : 0.000681s : 0.36% type_inference : 0.050455s : 27.01% event_method : 0.000058s : 0.03% auto_monad : 0.000141s : 0.08% graph_reusing : 0.000008s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000062s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000017s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000071s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000260s : 0.14% optimize.opt_a.expand_dump_flag : 0.000012s : 0.01% optimize.opt_a.switch_simplify : 0.000243s : 0.13% optimize.opt_a.loop_unroll : 0.000215s : 0.12% optimize.opt_a.a_1 : 0.005923s : 3.17% optimize.opt_a.with_stream_mark : 0.000106s : 0.06% optimize.opt_a.recompute_prepare : 0.000086s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000053s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000040s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000040s : 0.02% optimize.opt_a.parameter_eliminate : 0.000009s : 0.00% optimize.opt_a.a_2 : 0.001137s : 0.61% optimize.opt_a.accelerated_algorithm : 0.000143s : 0.08% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000029s : 0.02% optimize.opt_a.shard_inline : 0.000070s : 0.04% optimize.opt_a.merge_send_recv : 0.000065s : 0.03% optimize.opt_a.auto_parallel : 0.000060s : 0.03% optimize.opt_a.parallel : 0.000058s : 0.03% optimize.opt_a.flash_sp : 0.000021s : 0.01% optimize.opt_a.merge_comm : 0.000044s : 0.02% optimize.opt_a.allreduce_fusion : 0.000042s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000078s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000073s : 0.04% optimize.opt_a.virtual_dataset : 0.000068s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000066s : 0.04% optimize.opt_a.virtual_output : 0.000066s : 0.04% optimize.opt_a.merge_forward : 0.000044s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000008s : 0.00% optimize.opt_a.offload_activation : 0.000070s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000114s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000105s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000045s : 0.02% optimize.opt_a.meta_fg_expand : 0.028247s : 15.12% optimize.opt_a.flash_sp_send_recv_attached : 0.000016s : 0.01% optimize.opt_a.receive_attached : 0.000008s : 0.00% optimize.opt_a.after_resolve : 0.000256s : 0.14% optimize.opt_a.a_after_grad : 0.000288s : 0.15% optimize.opt_a.renormalize : 0.063460s : 33.98% optimize.opt_a.add_forward_monad_depend : 0.000041s : 0.02% optimize.opt_a.auto_monad_grad : 0.000028s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000274s : 0.15% optimize.opt_a.cse : 0.000881s : 0.47% optimize.opt_a.a_3 : 0.002470s : 1.32% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000083s : 0.04% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000846s : 0.45% optimize.opt_b.b_1 : 0.000595s : 0.32% optimize.opt_b.b_2 : 0.000024s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000019s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000012s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000012s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000095s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000037s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000037s : 0.02% optimize.loop_unroll : 0.000536s : 0.29% optimize.opt_after_cconv.c_1 : 0.000098s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000018s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000012s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000012s : 0.01% optimize.opt_after_cconv.cse : 0.000085s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000186s : 0.10% optimize.tuple_transform.d_1 : 0.000240s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000026s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000123s : 0.07% optimize.cse_after_recomputation.cse : 0.000057s : 0.03% optimize.environ_conv : 0.000017s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000016s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000006s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000038s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000011s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000010s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.00% optimize.overlap_grad_flash_sp : 0.000044s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000028s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000037s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000022s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000034s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000044s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000637s : 0.34% validate : 0.000129s : 0.07% backend_pass : 0.000001s : 0.00% task_emit : 0.025537s : 13.67% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.003002 586 0.16% : 0.000005s : 11: substitution.elim_not_effective 0.20% : 0.000006s : 6: substitution.float_depend_g_call 0.85% : 0.000025s : 26: substitution.float_tuple_getitem_switch 0.15% : 0.000005s : 11: substitution.fold_const_symbol 26.97% : 0.000810s : 5: substitution.getattr_setattr_resolve 0.48% : 0.000014s : 15: substitution.graph_param_transform 0.09% : 0.000003s : 2: substitution.incorporate_call 0.07% : 0.000002s : 2: substitution.incorporate_call_switch 29.88% : 0.000897s : 17: substitution.inline 1.10% : 0.000033s : 3: substitution.inline_without_move 0.60% : 0.000018s : 34: substitution.j_node_and_user_rematch 2.11% : 0.000063s : 3: substitution.less_batch_normalization 1.34% : 0.000040s : 34: substitution.minmaximum_grad 0.84% : 0.000025s : 6: substitution.partial_eliminate 0.74% : 0.000022s : 34: substitution.remove_not_recompute_node 4.14% : 0.000124s : 38: substitution.replace_applicator 1.03% : 0.000031s : 53: substitution.replace_old_param 0.11% : 0.000003s : 1: substitution.set_cell_output_no_recompute 3.83% : 0.000115s : 48: substitution.tuple_list_convert_item_index_to_positive 1.91% : 0.000057s : 49: substitution.tuple_list_get_item_const_eliminator 2.48% : 0.000074s : 48: substitution.tuple_list_get_item_depend_reorder 6.12% : 0.000184s : 91: substitution.tuple_list_get_item_eliminator 14.81% : 0.000445s : 49: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.050309 2 90.72% : 0.045642s : 1: type_inference.infer 9.28% : 0.004667s : 1: type_inference.specialize ------[replace.] 0.000768 52 9.70% : 0.000075s : 4: replace.getattr_setattr_resolve 31.55% : 0.000242s : 17: replace.inline 19.22% : 0.000148s : 7: replace.replace_applicator 30.11% : 0.000231s : 23: replace.tuple_list_get_item_eliminator 9.41% : 0.000072s : 1: replace.tuple_list_get_set_item_eliminator ------[match.] 0.001730 52 43.31% : 0.000749s : 4: match.getattr_setattr_resolve 51.16% : 0.000885s : 17: match.inline 2.64% : 0.000046s : 7: match.replace_applicator 2.72% : 0.000047s : 23: match.tuple_list_get_item_eliminator 0.16% : 0.000003s : 1: match.tuple_list_get_set_item_eliminator ------[predicate.] 0.001436 10058 0.86% : 0.000012s : 94: predicate.accumulaten_eliminater 0.34% : 0.000005s : 15: predicate.ad_related_special_op_eliminate 0.45% : 0.000007s : 50: predicate.addn_check_dump 0.89% : 0.000013s : 94: predicate.addn_zero_filter 0.80% : 0.000012s : 94: predicate.adjust_all_reduce_mul_add 1.82% : 0.000026s : 144: predicate.arithmetic_simplify 0.89% : 0.000013s : 94: predicate.cast_eliminate 2.86% : 0.000041s : 291: predicate.check_bprop_eliminate 0.46% : 0.000007s : 50: predicate.compare_switch_simplify 0.08% : 0.000001s : 15: predicate.const_output_eliminate 0.48% : 0.000007s : 50: predicate.depend_value_elim 0.91% : 0.000013s : 94: predicate.dict_get_item_const_eliminator 1.06% : 0.000015s : 94: predicate.dict_get_item_eliminator 0.87% : 0.000012s : 94: predicate.dict_set_item_eliminator 0.39% : 0.000006s : 30: predicate.dumpgradient_eliminate 0.09% : 0.000001s : 15: predicate.elim_not_effective 0.20% : 0.000003s : 15: predicate.elim_shapecalc_of_broadcastargs 1.02% : 0.000015s : 109: predicate.environ_add_const_eliminate 0.99% : 0.000014s : 109: predicate.environ_get_add_eliminate 0.95% : 0.000014s : 109: predicate.environ_get_depend_swap 1.54% : 0.000022s : 159: predicate.environ_get_eliminate 0.98% : 0.000014s : 109: predicate.environ_get_set_eliminate 1.21% : 0.000017s : 135: predicate.exchange_switch_depend_value 1.80% : 0.000026s : 135: predicate.float_depend_g_call 0.47% : 0.000007s : 50: predicate.float_environ_get_switch 0.73% : 0.000010s : 65: predicate.float_tuple_getitem_switch 0.09% : 0.000001s : 15: predicate.fold_const_symbol 0.57% : 0.000008s : 50: predicate.get_grad_eliminate 0.54% : 0.000008s : 31: predicate.getattr_setattr_resolve 0.11% : 0.000002s : 15: predicate.graph_param_transform 0.48% : 0.000007s : 50: predicate.incorporate_call 0.44% : 0.000006s : 50: predicate.incorporate_call_switch 4.53% : 0.000065s : 359: predicate.inline 1.71% : 0.000025s : 143: predicate.inline_without_move 0.26% : 0.000004s : 50: predicate.j_node_and_user_rematch 0.68% : 0.000010s : 50: predicate.less_batch_normalization 1.49% : 0.000021s : 148: predicate.list_to_tuple_eliminator_ 2.15% : 0.000031s : 242: predicate.load_eliminater 0.31% : 0.000004s : 15: predicate.loop_unroll_after_grad 1.93% : 0.000028s : 205: predicate.loop_unroll_before_grad 1.33% : 0.000019s : 125: predicate.make_slice_get_slice_eliminator 0.50% : 0.000007s : 50: predicate.merge_addn 2.51% : 0.000036s : 266: predicate.micro_step_allgather_replace 2.51% : 0.000036s : 266: predicate.mini_step_allgather_replace 0.91% : 0.000013s : 94: predicate.minmaximum_grad 0.37% : 0.000005s : 15: predicate.mutable_eliminate 0.16% : 0.000002s : 15: predicate.opt_reshape 0.17% : 0.000002s : 15: predicate.parallel_virtual_node 1.70% : 0.000024s : 135: predicate.partial_defer_inline 1.34% : 0.000019s : 133: predicate.partial_eliminate 0.84% : 0.000012s : 94: predicate.print_const_string_wrapper 0.55% : 0.000008s : 50: predicate.reduce_all_const_elim 1.10% : 0.000016s : 94: predicate.reduce_eliminate 2.16% : 0.000031s : 242: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000004s : 50: predicate.remove_not_recompute_node 2.48% : 0.000036s : 398: predicate.replace_applicator 0.78% : 0.000011s : 143: predicate.replace_old_param 0.09% : 0.000001s : 15: predicate.reset_defer_inline 0.88% : 0.000013s : 94: predicate.reshape_eliminate 2.71% : 0.000039s : 266: predicate.row_tensor_add_zeros_like 0.16% : 0.000002s : 15: predicate.row_tensor_eliminate 3.06% : 0.000044s : 291: predicate.same_eliminate 0.31% : 0.000005s : 50: predicate.set_cell_output_no_recompute 0.57% : 0.000008s : 50: predicate.shard_identity_eliminate 0.35% : 0.000005s : 30: predicate.special_op_eliminate 0.53% : 0.000008s : 50: predicate.specialize_transform 2.65% : 0.000038s : 266: predicate.split_environ_get_set_with_tuple_value 1.62% : 0.000023s : 143: predicate.stack_unstack_eliminate 0.17% : 0.000002s : 15: predicate.switch_call_monad_eliminater 1.36% : 0.000020s : 135: predicate.switch_defer_inline 3.97% : 0.000057s : 426: predicate.switch_layer_defer_inline 4.25% : 0.000061s : 405: predicate.switch_simplify 0.84% : 0.000012s : 94: predicate.tile_eliminate 0.88% : 0.000013s : 94: predicate.transpose_eliminate 1.38% : 0.000020s : 124: predicate.tuple_list_convert_item_index_to_positive 1.40% : 0.000020s : 125: predicate.tuple_list_get_item_const_eliminator 1.26% : 0.000018s : 124: predicate.tuple_list_get_item_depend_reorder 2.58% : 0.000037s : 198: predicate.tuple_list_get_item_eliminator 1.39% : 0.000020s : 125: predicate.tuple_list_get_set_item_eliminator 1.86% : 0.000027s : 175: predicate.tuple_list_set_item_eliminator 1.38% : 0.000020s : 148: predicate.tuple_to_list_eliminator_ 2.03% : 0.000029s : 242: predicate.updatestate_pure_node_eliminater 2.60% : 0.000037s : 292: predicate.updatestate_useless_node_eliminater 0.16% : 0.000002s : 15: predicate.value_based_eliminate 0.58% : 0.000008s : 50: predicate.virtual_dataset_eliminate 0.52% : 0.000008s : 50: predicate.virtual_output_eliminate 0.15% : 0.000002s : 15: predicate.virtual_view_grad_eliminate 0.19% : 0.000003s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.009074 91 61.66% : 0.005595s : 60: func_graph_cloner_run.FuncGraphClonerGraph 38.34% : 0.003479s : 31: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.384591 247 0.00% : 0.000004s : 1: ForceFp32Comm 1.20% : 0.004618s : 1: add_attr 1.20% : 0.004606s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000128s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000148s : 1: auto_monad 0.01% : 0.000048s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.19% : 0.000728s : 1: bootstrap 0.01% : 0.000040s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000042s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000072s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000021s : 1: environ_conv 0.02% : 0.000067s : 1: event_method 0.00% : 0.000016s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.14% : 0.000546s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.22% : 0.000857s : 1: mutable_eliminate 0.00% : 0.000014s : 1: offloading_packed_experts 0.01% : 0.000033s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000037s : 1: opt.transform.mutable_eliminate 2.89% : 0.011124s : 125: opt.transform.opt_a 0.03% : 0.000097s : 1: opt.transform.opt_after_cconv 0.02% : 0.000075s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000584s : 28: opt.transform.opt_b 0.25% : 0.000962s : 2: opt.transform.opt_resolve 0.07% : 0.000262s : 2: opt.transform.opt_trans_graph 0.03% : 0.000118s : 4: opt.transform.symbol_engine_opt 27.58% : 0.106073s : 1: opt_a 0.09% : 0.000344s : 1: opt_after_cconv 0.17% : 0.000648s : 1: opt_after_jit_grad 0.21% : 0.000812s : 1: opt_b 28.69% : 0.110338s : 1: optimize 0.01% : 0.000040s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000047s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000009s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.02% : 0.000076s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000192s : 1: remove_dup_value 14.43% : 0.055511s : 2: renormalize.infer 2.06% : 0.007916s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000087s : 1: rewriter_after_opt_a 0.07% : 0.000265s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000019s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000178s : 1: symbol_engine_optimizer 6.65% : 0.025558s : 1: task_emit 0.08% : 0.000300s : 1: tuple_transform 13.13% : 0.050481s : 1: type_inference 0.05% : 0.000184s : 1: validate TotalTime = 0.188098, [24] [bootstrap]: 0.00064177 [type_inference]: 0.0408585 [event_method]: 5.636e-05 [auto_monad]: 0.00013791 [graph_reusing]: 8.32e-06 [inline]: 3.59002e-06 [add_attr]: 0.00449187, [1] [add_attr_with_inline]: 0.00448061, [1] [Cycle 1]: 0.00014332, [2] [tag_attr]: 5.999e-05 [meta_addattr_fg_expand]: 1.734e-05 [parallel-infer-symbol]: 3.90998e-06 [pre_auto_parallel]: 0.00011485 [insert-virtual-dataset]: 2.95002e-06 [parallel-infer-symbol-second]: 9.79984e-07 [dataset_repeat_opt]: 1.81e-06 [pipeline_split]: 1.78002e-06 [optimize]: 0.115363, [53] [py_interpret_to_execute]: 5.79e-06 [rewriter_before_opt_a]: 0.00025809 [opt_a]: 0.110612, [3] [Cycle 1]: 0.0982007, [45] [expand_dump_flag]: 3.6e-06 [switch_simplify]: 9.328e-05 [loop_unroll]: 7.376e-05 [a_1]: 0.00173058 [with_stream_mark]: 3.458e-05 [recompute_prepare]: 3.225e-05 [updatestate_depend_eliminate]: 2.361e-05 [updatestate_assign_eliminate]: 1.461e-05 [updatestate_loads_eliminate]: 1.496e-05 [parameter_eliminate]: 3.31999e-06 [a_2]: 0.00037899 [accelerated_algorithm]: 8.007e-05 [shard]: 1.75001e-06 [meta_shard_fg_expand]: 9.31e-06 [shard_inline]: 2.455e-05 [merge_send_recv]: 2.496e-05 [auto_parallel]: 2.075e-05 [parallel]: 8.893e-05 [flash_sp]: 1.801e-05 [merge_comm]: 1.734e-05 [allreduce_fusion]: 1.634e-05 [matmul_add_comm_reduction]: 3.835e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 2.953e-05 [virtual_dataset]: 2.56e-05 [get_grad_eliminate_]: 2.368e-05 [virtual_output]: 2.309e-05 [merge_forward]: 1.6e-05 [cell_reuse_recompute_pass]: 1.92001e-06 [offload_activation]: 2.424e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.368e-05 [merge_recompute_call_nodes]: 1.78997e-06 [before_grad]: 3.75e-05 [set_forward_comm_id_for_comm_node_pass]: 1.661e-05 [meta_fg_expand]: 0.0301421 [flash_sp_send_recv_attached]: 1.088e-05 [receive_attached]: 2.88e-06 [after_resolve]: 0.00018443 [a_after_grad]: 0.00021319 [renormalize]: 0.0611898 [add_forward_monad_depend]: 2.752e-05 [auto_monad_grad]: 2.414e-05 [auto_monad_eliminator]: 0.00024079 [cse]: 0.00059453 [a_3]: 0.00214326 [Cycle 2]: 0.00972152, [45] [expand_dump_flag]: 4.44998e-06 [switch_simplify]: 0.00012631 [loop_unroll]: 0.00011887 [a_1]: 0.00352492 [with_stream_mark]: 3.919e-05 [recompute_prepare]: 2.852e-05 [updatestate_depend_eliminate]: 1.571e-05 [updatestate_assign_eliminate]: 1.321e-05 [updatestate_loads_eliminate]: 1.356e-05 [parameter_eliminate]: 2.86999e-06 [a_2]: 0.00033901 [accelerated_algorithm]: 2.727e-05 [shard]: 2.21e-06 [meta_shard_fg_expand]: 1.156e-05 [shard_inline]: 2.116e-05 [merge_send_recv]: 2.11e-05 [auto_parallel]: 2.078e-05 [parallel]: 9.32001e-06 [flash_sp]: 4.96997e-06 [merge_comm]: 1.346e-05 [allreduce_fusion]: 1.302e-05 [matmul_add_comm_reduction]: 2.055e-05 [allreduce_slice_to_reducescatter]: 1.59e-06 [virtual_shard_identity]: 2.259e-05 [virtual_dataset]: 1.957e-05 [get_grad_eliminate_]: 2.085e-05 [virtual_output]: 1.955e-05 [merge_forward]: 1.321e-05 [cell_reuse_recompute_pass]: 1.87001e-06 [offload_activation]: 2.25e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.661e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 3.205e-05 [set_forward_comm_id_for_comm_node_pass]: 1.403e-05 [meta_fg_expand]: 0.00020478 [flash_sp_send_recv_attached]: 2.59001e-06 [receive_attached]: 3.24001e-06 [after_resolve]: 3.416e-05 [a_after_grad]: 3.422e-05 [renormalize]: 0.00407334 [add_forward_monad_depend]: 1.185e-05 [auto_monad_grad]: 2.87002e-06 [auto_monad_eliminator]: 4.213e-05 [cse]: 0.00021114 [a_3]: 0.00017502 [Cycle 3]: 0.00266527, [45] [expand_dump_flag]: 3.45e-06 [switch_simplify]: 2.471e-05 [loop_unroll]: 2.12e-05 [a_1]: 0.00108695 [with_stream_mark]: 3.169e-05 [recompute_prepare]: 2.492e-05 [updatestate_depend_eliminate]: 1.623e-05 [updatestate_assign_eliminate]: 1.274e-05 [updatestate_loads_eliminate]: 1.27e-05 [parameter_eliminate]: 2.02001e-06 [a_2]: 0.0003367 [accelerated_algorithm]: 2.709e-05 [shard]: 2.71999e-06 [meta_shard_fg_expand]: 6.24999e-06 [shard_inline]: 2.11e-05 [merge_send_recv]: 2.243e-05 [auto_parallel]: 2.111e-05 [parallel]: 9.89999e-06 [flash_sp]: 2.07999e-06 [merge_comm]: 1.3e-05 [allreduce_fusion]: 1.338e-05 [matmul_add_comm_reduction]: 2.128e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 2.225e-05 [virtual_dataset]: 2.087e-05 [get_grad_eliminate_]: 1.958e-05 [virtual_output]: 2.152e-05 [merge_forward]: 1.508e-05 [cell_reuse_recompute_pass]: 3.38e-06 [offload_activation]: 2.251e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.839e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 3.221e-05 [set_forward_comm_id_for_comm_node_pass]: 1.449e-05 [meta_fg_expand]: 1.025e-05 [flash_sp_send_recv_attached]: 2.29001e-06 [receive_attached]: 2.26e-06 [after_resolve]: 3.043e-05 [a_after_grad]: 3.166e-05 [renormalize]: 2.30008e-07 [add_forward_monad_depend]: 3.36001e-06 [auto_monad_grad]: 2.40002e-06 [auto_monad_eliminator]: 2.975e-05 [cse]: 0.00019592 [a_3]: 0.00015366 [py_interpret_to_execute_after_opt_a]: 1.048e-05 [slice_cell_reuse_recomputed_activation]: 2.49001e-06 [rewriter_after_opt_a]: 0.00011482 [convert_after_rewriter]: 1.43002e-06 [order_py_execute_after_rewriter]: 1.29e-06 [mutable_eliminate]: 0.00083565 [opt_b]: 0.00108945, [1] [Cycle 1]: 0.00091084, [7] [b_1]: 0.00067437 [b_2]: 2.56e-05 [updatestate_depend_eliminate]: 2.054e-05 [updatestate_assign_eliminate]: 1.338e-05 [updatestate_loads_eliminate]: 1.364e-05 [renormalize]: 9.80013e-07 [cse]: 0.00010624 [optimize_parallel_all_gather_comm]: 4.417e-05 [overlap_param_gather]: 2.37001e-06 [cconv]: 3.443e-05 [loop_unroll]: 0.00064197 [opt_after_cconv]: 0.00028889, [1] [Cycle 1]: 0.00027998, [7] [c_1]: 9.632e-05 [parameter_eliminate]: 4.33999e-06 [updatestate_depend_eliminate]: 1.817e-05 [updatestate_assign_eliminate]: 1.206e-05 [updatestate_loads_eliminate]: 1.187e-05 [cse]: 9.469e-05 [renormalize]: 7.30011e-07 [remove_dup_value]: 0.00019053 [tuple_transform]: 0.00030053, [1] [Cycle 1]: 0.0002938, [4] [d_1]: 0.00024339 [none_parameter_eliminate]: 2.64001e-06 [renormalize]: 8.89995e-07 [switch_simplify]: 2.316e-05 [partial_unused_args_eliminate]: 2.26e-06 [add_recomputation]: 0.00017886 [cse_after_recomputation]: 7.824e-05, [1] [Cycle 1]: 7.203e-05, [1] [cse]: 6.403e-05 [environ_conv]: 1.7e-05 [swap_dp_allreduce_reducescatter]: 1.731e-05 [bias_add_comm_swap]: 4.4e-06 [label_micro_interleaved_index]: 8.15e-06 [label_fine_grained_interleaved_index]: 2.78e-06 [merge_cast_opt]: 1.45999e-06 [slice_recompute_activation]: 2.29999e-06 [micro_interleaved_order_control]: 2.99001e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 1.07e-06 [remove_cast_before_assign_add]: 1.22e-06 [full_micro_interleaved_order_control]: 2.72001e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 1.20999e-06 [add_comm_op_reuse_tag]: 1.10999e-06 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.25999e-06 [overlap_opt_shard_in_pipeline]: 2.752e-05 [overlap_opt_shard_grad_in_pipeline]: 1.81e-06 [control_data_broadcast_order]: 3.995e-05 [grouped_pairwise_exchange_alltoall]: 1.55001e-06 [offloading_packed_experts]: 1.027e-05 [overlap_recompute_and_grad_model_parallel]: 1.063e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 2.29001e-06 [overlap_grad_ring_attention]: 1.011e-05 [overlap_grad_flash_sp]: 4.681e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.36998e-06 [split_layernorm_comm]: 1.66e-06 [handle_group_info]: 1.05999e-06 [symbol_engine_optimizer]: 0.00018219, [1] [Cycle 1]: 0.00017534, [6] [build]: 1.62e-05 [elim_shapecalc]: 3.103e-05 [elim_not_effective]: 3.814e-05 [opt_reshape]: 2.162e-05 [fold_const_symbol]: 3.193e-05 [renormalize]: 3.89991e-07 [detach_backward]: 2.17999e-06 [pipeline_parallel_scheduler]: 1.64e-06 [auto_monad_reorder]: 4.702e-05 [get_jit_bprop_graph]: 2.43e-06 [rewriter_after_jit_bprop_graph]: 5.77001e-06 [opt_after_jit_grad]: 0.00085256 [validate]: 0.00012562 [backend_pass]: 1.21002e-06 [task_emit]: 0.0250624 [execute]: 9.14e-06 Sums bootstrap : 0.000642s : 0.35% type_inference : 0.040859s : 22.49% event_method : 0.000056s : 0.03% auto_monad : 0.000138s : 0.08% graph_reusing : 0.000008s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000060s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000017s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000115s : 0.06% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000258s : 0.14% optimize.opt_a.expand_dump_flag : 0.000011s : 0.01% optimize.opt_a.switch_simplify : 0.000244s : 0.13% optimize.opt_a.loop_unroll : 0.000214s : 0.12% optimize.opt_a.a_1 : 0.006342s : 3.49% optimize.opt_a.with_stream_mark : 0.000105s : 0.06% optimize.opt_a.recompute_prepare : 0.000086s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000056s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000041s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000041s : 0.02% optimize.opt_a.parameter_eliminate : 0.000008s : 0.00% optimize.opt_a.a_2 : 0.001055s : 0.58% optimize.opt_a.accelerated_algorithm : 0.000134s : 0.07% optimize.opt_a.shard : 0.000007s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000027s : 0.01% optimize.opt_a.shard_inline : 0.000067s : 0.04% optimize.opt_a.merge_send_recv : 0.000068s : 0.04% optimize.opt_a.auto_parallel : 0.000063s : 0.03% optimize.opt_a.parallel : 0.000108s : 0.06% optimize.opt_a.flash_sp : 0.000025s : 0.01% optimize.opt_a.merge_comm : 0.000044s : 0.02% optimize.opt_a.allreduce_fusion : 0.000043s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000080s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000074s : 0.04% optimize.opt_a.virtual_dataset : 0.000066s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000064s : 0.04% optimize.opt_a.virtual_output : 0.000064s : 0.04% optimize.opt_a.merge_forward : 0.000044s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% optimize.opt_a.offload_activation : 0.000069s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000119s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000102s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000045s : 0.02% optimize.opt_a.meta_fg_expand : 0.030357s : 16.71% optimize.opt_a.flash_sp_send_recv_attached : 0.000016s : 0.01% optimize.opt_a.receive_attached : 0.000008s : 0.00% optimize.opt_a.after_resolve : 0.000249s : 0.14% optimize.opt_a.a_after_grad : 0.000279s : 0.15% optimize.opt_a.renormalize : 0.065263s : 35.92% optimize.opt_a.add_forward_monad_depend : 0.000043s : 0.02% optimize.opt_a.auto_monad_grad : 0.000029s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000313s : 0.17% optimize.opt_a.cse : 0.001002s : 0.55% optimize.opt_a.a_3 : 0.002472s : 1.36% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000115s : 0.06% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000836s : 0.46% optimize.opt_b.b_1 : 0.000674s : 0.37% optimize.opt_b.b_2 : 0.000026s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000021s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000013s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000014s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000106s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000044s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.02% optimize.loop_unroll : 0.000642s : 0.35% optimize.opt_after_cconv.c_1 : 0.000096s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000018s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000012s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000012s : 0.01% optimize.opt_after_cconv.cse : 0.000095s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000191s : 0.10% optimize.tuple_transform.d_1 : 0.000243s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000001s : 0.00% optimize.tuple_transform.switch_simplify : 0.000023s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000179s : 0.10% optimize.cse_after_recomputation.cse : 0.000064s : 0.04% optimize.environ_conv : 0.000017s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000017s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000028s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000040s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000010s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000011s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000010s : 0.01% optimize.overlap_grad_flash_sp : 0.000047s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000016s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000031s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000038s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000022s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000032s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000047s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000853s : 0.47% validate : 0.000126s : 0.07% backend_pass : 0.000001s : 0.00% task_emit : 0.025062s : 13.79% execute : 0.000009s : 0.01% Time group info: ------[substitution.] 0.003104 586 0.17% : 0.000005s : 11: substitution.elim_not_effective 0.20% : 0.000006s : 6: substitution.float_depend_g_call 0.85% : 0.000026s : 26: substitution.float_tuple_getitem_switch 0.13% : 0.000004s : 11: substitution.fold_const_symbol 23.01% : 0.000714s : 5: substitution.getattr_setattr_resolve 0.47% : 0.000015s : 15: substitution.graph_param_transform 0.09% : 0.000003s : 2: substitution.incorporate_call 0.06% : 0.000002s : 2: substitution.incorporate_call_switch 31.65% : 0.000983s : 17: substitution.inline 0.94% : 0.000029s : 3: substitution.inline_without_move 0.55% : 0.000017s : 34: substitution.j_node_and_user_rematch 1.87% : 0.000058s : 3: substitution.less_batch_normalization 1.31% : 0.000041s : 34: substitution.minmaximum_grad 0.73% : 0.000023s : 6: substitution.partial_eliminate 0.77% : 0.000024s : 34: substitution.remove_not_recompute_node 3.99% : 0.000124s : 38: substitution.replace_applicator 1.01% : 0.000031s : 53: substitution.replace_old_param 0.09% : 0.000003s : 1: substitution.set_cell_output_no_recompute 3.96% : 0.000123s : 48: substitution.tuple_list_convert_item_index_to_positive 1.95% : 0.000061s : 49: substitution.tuple_list_get_item_const_eliminator 2.49% : 0.000077s : 48: substitution.tuple_list_get_item_depend_reorder 6.27% : 0.000195s : 91: substitution.tuple_list_get_item_eliminator 17.43% : 0.000541s : 49: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.040729 2 91.97% : 0.037459s : 1: type_inference.infer 8.03% : 0.003269s : 1: type_inference.specialize ------[replace.] 0.000831 52 8.28% : 0.000069s : 4: replace.getattr_setattr_resolve 37.63% : 0.000313s : 17: replace.inline 13.07% : 0.000109s : 7: replace.replace_applicator 31.14% : 0.000259s : 23: replace.tuple_list_get_item_eliminator 9.87% : 0.000082s : 1: replace.tuple_list_get_set_item_eliminator ------[match.] 0.001726 52 38.08% : 0.000657s : 4: match.getattr_setattr_resolve 56.26% : 0.000971s : 17: match.inline 2.48% : 0.000043s : 7: match.replace_applicator 2.96% : 0.000051s : 23: match.tuple_list_get_item_eliminator 0.20% : 0.000004s : 1: match.tuple_list_get_set_item_eliminator ------[predicate.] 0.001491 10058 0.84% : 0.000012s : 94: predicate.accumulaten_eliminater 0.34% : 0.000005s : 15: predicate.ad_related_special_op_eliminate 0.42% : 0.000006s : 50: predicate.addn_check_dump 0.84% : 0.000013s : 94: predicate.addn_zero_filter 0.76% : 0.000011s : 94: predicate.adjust_all_reduce_mul_add 1.90% : 0.000028s : 144: predicate.arithmetic_simplify 0.94% : 0.000014s : 94: predicate.cast_eliminate 2.61% : 0.000039s : 291: predicate.check_bprop_eliminate 0.42% : 0.000006s : 50: predicate.compare_switch_simplify 0.10% : 0.000002s : 15: predicate.const_output_eliminate 0.44% : 0.000007s : 50: predicate.depend_value_elim 0.88% : 0.000013s : 94: predicate.dict_get_item_const_eliminator 1.10% : 0.000016s : 94: predicate.dict_get_item_eliminator 0.80% : 0.000012s : 94: predicate.dict_set_item_eliminator 0.38% : 0.000006s : 30: predicate.dumpgradient_eliminate 0.09% : 0.000001s : 15: predicate.elim_not_effective 0.18% : 0.000003s : 15: predicate.elim_shapecalc_of_broadcastargs 0.94% : 0.000014s : 109: predicate.environ_add_const_eliminate 0.93% : 0.000014s : 109: predicate.environ_get_add_eliminate 0.92% : 0.000014s : 109: predicate.environ_get_depend_swap 1.43% : 0.000021s : 159: predicate.environ_get_eliminate 4.54% : 0.000068s : 109: predicate.environ_get_set_eliminate 1.16% : 0.000017s : 135: predicate.exchange_switch_depend_value 1.80% : 0.000027s : 135: predicate.float_depend_g_call 0.44% : 0.000007s : 50: predicate.float_environ_get_switch 0.64% : 0.000010s : 65: predicate.float_tuple_getitem_switch 0.08% : 0.000001s : 15: predicate.fold_const_symbol 0.53% : 0.000008s : 50: predicate.get_grad_eliminate 0.50% : 0.000008s : 31: predicate.getattr_setattr_resolve 0.12% : 0.000002s : 15: predicate.graph_param_transform 0.44% : 0.000007s : 50: predicate.incorporate_call 0.42% : 0.000006s : 50: predicate.incorporate_call_switch 4.43% : 0.000066s : 359: predicate.inline 1.67% : 0.000025s : 143: predicate.inline_without_move 0.24% : 0.000004s : 50: predicate.j_node_and_user_rematch 0.57% : 0.000009s : 50: predicate.less_batch_normalization 1.36% : 0.000020s : 148: predicate.list_to_tuple_eliminator_ 2.08% : 0.000031s : 242: predicate.load_eliminater 0.38% : 0.000006s : 15: predicate.loop_unroll_after_grad 1.89% : 0.000028s : 205: predicate.loop_unroll_before_grad 1.18% : 0.000018s : 125: predicate.make_slice_get_slice_eliminator 0.47% : 0.000007s : 50: predicate.merge_addn 2.41% : 0.000036s : 266: predicate.micro_step_allgather_replace 2.44% : 0.000036s : 266: predicate.mini_step_allgather_replace 0.83% : 0.000012s : 94: predicate.minmaximum_grad 0.36% : 0.000005s : 15: predicate.mutable_eliminate 0.15% : 0.000002s : 15: predicate.opt_reshape 0.18% : 0.000003s : 15: predicate.parallel_virtual_node 1.76% : 0.000026s : 135: predicate.partial_defer_inline 1.30% : 0.000019s : 133: predicate.partial_eliminate 0.82% : 0.000012s : 94: predicate.print_const_string_wrapper 0.48% : 0.000007s : 50: predicate.reduce_all_const_elim 1.09% : 0.000016s : 94: predicate.reduce_eliminate 2.05% : 0.000031s : 242: predicate.redundant_stop_gradient_eliminater 0.27% : 0.000004s : 50: predicate.remove_not_recompute_node 2.39% : 0.000036s : 398: predicate.replace_applicator 0.78% : 0.000012s : 143: predicate.replace_old_param 0.09% : 0.000001s : 15: predicate.reset_defer_inline 0.91% : 0.000014s : 94: predicate.reshape_eliminate 2.48% : 0.000037s : 266: predicate.row_tensor_add_zeros_like 0.18% : 0.000003s : 15: predicate.row_tensor_eliminate 2.91% : 0.000043s : 291: predicate.same_eliminate 0.29% : 0.000004s : 50: predicate.set_cell_output_no_recompute 0.55% : 0.000008s : 50: predicate.shard_identity_eliminate 0.35% : 0.000005s : 30: predicate.special_op_eliminate 0.50% : 0.000007s : 50: predicate.specialize_transform 2.64% : 0.000039s : 266: predicate.split_environ_get_set_with_tuple_value 1.54% : 0.000023s : 143: predicate.stack_unstack_eliminate 0.15% : 0.000002s : 15: predicate.switch_call_monad_eliminater 1.32% : 0.000020s : 135: predicate.switch_defer_inline 3.82% : 0.000057s : 426: predicate.switch_layer_defer_inline 3.98% : 0.000059s : 405: predicate.switch_simplify 0.82% : 0.000012s : 94: predicate.tile_eliminate 0.87% : 0.000013s : 94: predicate.transpose_eliminate 1.29% : 0.000019s : 124: predicate.tuple_list_convert_item_index_to_positive 1.31% : 0.000020s : 125: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000019s : 124: predicate.tuple_list_get_item_depend_reorder 2.57% : 0.000038s : 198: predicate.tuple_list_get_item_eliminator 1.28% : 0.000019s : 125: predicate.tuple_list_get_set_item_eliminator 1.87% : 0.000028s : 175: predicate.tuple_list_set_item_eliminator 1.35% : 0.000020s : 148: predicate.tuple_to_list_eliminator_ 1.98% : 0.000030s : 242: predicate.updatestate_pure_node_eliminater 2.49% : 0.000037s : 292: predicate.updatestate_useless_node_eliminater 0.17% : 0.000003s : 15: predicate.value_based_eliminate 0.51% : 0.000008s : 50: predicate.virtual_dataset_eliminate 0.52% : 0.000008s : 50: predicate.virtual_output_eliminate 0.14% : 0.000002s : 15: predicate.virtual_view_grad_eliminate 0.30% : 0.000005s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.008020 91 71.21% : 0.005711s : 60: func_graph_cloner_run.FuncGraphClonerGraph 28.79% : 0.002309s : 31: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.386644 247 0.00% : 0.000004s : 1: ForceFp32Comm 1.16% : 0.004497s : 1: add_attr 1.16% : 0.004485s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000187s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000144s : 1: auto_monad 0.01% : 0.000052s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.17% : 0.000670s : 1: bootstrap 0.01% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000044s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000081s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000021s : 1: environ_conv 0.02% : 0.000064s : 1: event_method 0.00% : 0.000016s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.17% : 0.000652s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.22% : 0.000848s : 1: mutable_eliminate 0.00% : 0.000013s : 1: offloading_packed_experts 0.01% : 0.000036s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000039s : 1: opt.transform.mutable_eliminate 2.96% : 0.011434s : 125: opt.transform.opt_a 0.02% : 0.000095s : 1: opt.transform.opt_after_cconv 0.02% : 0.000074s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000660s : 28: opt.transform.opt_b 0.22% : 0.000846s : 2: opt.transform.opt_resolve 0.07% : 0.000263s : 2: opt.transform.opt_trans_graph 0.03% : 0.000118s : 4: opt.transform.symbol_engine_opt 28.61% : 0.110616s : 1: opt_a 0.08% : 0.000293s : 1: opt_after_cconv 0.22% : 0.000867s : 1: opt_after_jit_grad 0.28% : 0.001094s : 1: opt_b 29.84% : 0.115369s : 1: optimize 0.01% : 0.000049s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000050s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000013s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000032s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000120s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000196s : 1: remove_dup_value 14.84% : 0.057368s : 2: renormalize.infer 2.03% : 0.007864s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000121s : 1: rewriter_after_opt_a 0.07% : 0.000263s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000021s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000185s : 1: symbol_engine_optimizer 6.50% : 0.025113s : 1: task_emit 0.08% : 0.000304s : 1: tuple_transform 10.57% : 0.040879s : 1: type_inference 0.05% : 0.000186s : 1: validate TotalTime = 0.0293444, [24] [bootstrap]: 0.00066272 [type_inference]: 0.010979 [event_method]: 1.831e-05 [auto_monad]: 7.514e-05 [graph_reusing]: 6.67002e-06 [inline]: 3.36999e-06 [add_attr]: 0.0042282, [1] [add_attr_with_inline]: 0.00421709, [1] [Cycle 1]: 0.00010621, [2] [tag_attr]: 2.282e-05 [meta_addattr_fg_expand]: 5.91e-06 [parallel-infer-symbol]: 4.53001e-06 [pre_auto_parallel]: 3.649e-05 [insert-virtual-dataset]: 2.98998e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.96003e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.0051182, [53] [py_interpret_to_execute]: 5.39e-06 [rewriter_before_opt_a]: 0.00018567 [opt_a]: 0.00275249, [2] [Cycle 1]: 0.00215935, [45] [expand_dump_flag]: 4.47e-06 [switch_simplify]: 4.283e-05 [loop_unroll]: 2.887e-05 [a_1]: 0.00055987 [with_stream_mark]: 1.58e-05 [recompute_prepare]: 7.14001e-06 [updatestate_depend_eliminate]: 3.9e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 2.62001e-06 [parameter_eliminate]: 1.81998e-06 [a_2]: 6.687e-05 [accelerated_algorithm]: 6.07001e-06 [shard]: 2.04e-06 [meta_shard_fg_expand]: 2.11998e-06 [shard_inline]: 5.36998e-06 [merge_send_recv]: 9.64e-06 [auto_parallel]: 6.23e-06 [parallel]: 8.379e-05 [flash_sp]: 9.96e-06 [merge_comm]: 3.93001e-06 [allreduce_fusion]: 3.31001e-06 [matmul_add_comm_reduction]: 9.51e-06 [allreduce_slice_to_reducescatter]: 1.07e-06 [virtual_shard_identity]: 8.91002e-06 [virtual_dataset]: 6.93e-06 [get_grad_eliminate_]: 5.62001e-06 [virtual_output]: 5.94999e-06 [merge_forward]: 4.11001e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [offload_activation]: 1.065e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.38e-05 [merge_recompute_call_nodes]: 1.56002e-06 [before_grad]: 9.35001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.9e-06 [meta_fg_expand]: 3.01001e-06 [flash_sp_send_recv_attached]: 2.96001e-06 [receive_attached]: 2.78e-06 [after_resolve]: 9.87999e-06 [a_after_grad]: 9.17001e-06 [renormalize]: 0.00081861 [add_forward_monad_depend]: 5.04998e-06 [auto_monad_grad]: 2.63e-06 [auto_monad_eliminator]: 1.522e-05 [cse]: 3.424e-05 [a_3]: 4.236e-05 [Cycle 2]: 0.00058145, [45] [expand_dump_flag]: 1.66002e-06 [switch_simplify]: 6.63e-06 [loop_unroll]: 5.00999e-06 [a_1]: 0.0001023 [with_stream_mark]: 1.286e-05 [recompute_prepare]: 5.17e-06 [updatestate_depend_eliminate]: 2.41998e-06 [updatestate_assign_eliminate]: 2.01e-06 [updatestate_loads_eliminate]: 2.34001e-06 [parameter_eliminate]: 1.243e-05 [a_2]: 5.872e-05 [accelerated_algorithm]: 5.94999e-06 [shard]: 1.65001e-06 [meta_shard_fg_expand]: 1.63002e-06 [shard_inline]: 5.09998e-06 [merge_send_recv]: 6.17001e-06 [auto_parallel]: 6.32001e-06 [parallel]: 6.42001e-06 [flash_sp]: 4.08001e-06 [merge_comm]: 2.67001e-06 [allreduce_fusion]: 2.73e-06 [matmul_add_comm_reduction]: 5.99e-06 [allreduce_slice_to_reducescatter]: 1.05001e-06 [virtual_shard_identity]: 6.44001e-06 [virtual_dataset]: 5.25001e-06 [get_grad_eliminate_]: 5.20999e-06 [virtual_output]: 4.89e-06 [merge_forward]: 2.42001e-06 [cell_reuse_recompute_pass]: 1.68002e-06 [offload_activation]: 6.38e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.251e-05 [merge_recompute_call_nodes]: 9.5999e-07 [before_grad]: 8.30999e-06 [set_forward_comm_id_for_comm_node_pass]: 2.87002e-06 [meta_fg_expand]: 1.88002e-06 [flash_sp_send_recv_attached]: 1.24e-06 [receive_attached]: 1.94999e-06 [after_resolve]: 9.46e-06 [a_after_grad]: 7.63999e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.44998e-06 [auto_monad_grad]: 6.80011e-07 [auto_monad_eliminator]: 5.74999e-06 [cse]: 1.439e-05 [a_3]: 2.874e-05 [py_interpret_to_execute_after_opt_a]: 5.46e-06 [slice_cell_reuse_recomputed_activation]: 2.39001e-06 [rewriter_after_opt_a]: 1.95e-05 [convert_after_rewriter]: 1.76998e-06 [order_py_execute_after_rewriter]: 1.47001e-06 [mutable_eliminate]: 0.00072355 [opt_b]: 0.00018354, [1] [Cycle 1]: 0.00017594, [7] [b_1]: 0.00010066 [b_2]: 6.64999e-06 [updatestate_depend_eliminate]: 5.61e-06 [updatestate_assign_eliminate]: 2.35002e-06 [updatestate_loads_eliminate]: 2.21998e-06 [renormalize]: 4.69998e-07 [cse]: 1.91e-05 [optimize_parallel_all_gather_comm]: 1.741e-05 [overlap_param_gather]: 2.88e-06 [cconv]: 3.185e-05 [loop_unroll]: 0.00046753 [opt_after_cconv]: 9.631e-05, [1] [Cycle 1]: 9.009e-05, [7] [c_1]: 2.338e-05 [parameter_eliminate]: 3.75998e-06 [updatestate_depend_eliminate]: 4.83001e-06 [updatestate_assign_eliminate]: 2.29001e-06 [updatestate_loads_eliminate]: 2.09999e-06 [cse]: 1.772e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.39e-05 [tuple_transform]: 6.876e-05, [1] [Cycle 1]: 6.309e-05, [4] [d_1]: 3.637e-05 [none_parameter_eliminate]: 1.43002e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 5.87999e-06 [partial_unused_args_eliminate]: 1.83002e-06 [add_recomputation]: 8.732e-05 [cse_after_recomputation]: 2.175e-05, [1] [Cycle 1]: 1.706e-05, [1] [cse]: 1.099e-05 [environ_conv]: 4.53999e-06 [swap_dp_allreduce_reducescatter]: 5.54e-06 [bias_add_comm_swap]: 3.16001e-06 [label_micro_interleaved_index]: 5.09e-06 [label_fine_grained_interleaved_index]: 2.89999e-06 [merge_cast_opt]: 1.40001e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.68e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 1.02998e-06 [remove_cast_before_assign_add]: 1.21997e-06 [full_micro_interleaved_order_control]: 2.44001e-06 [reorder_send_recv_between_fp_bp]: 2.84001e-06 [comm_op_add_attrs]: 1.24998e-06 [add_comm_op_reuse_tag]: 1.32e-06 [interleave_split_concat_branches]: 1.30999e-06 [interleave_parallel_branches]: 1.36002e-06 [overlap_opt_shard_in_pipeline]: 2.262e-05 [overlap_opt_shard_grad_in_pipeline]: 2.01e-06 [control_data_broadcast_order]: 1.333e-05 [grouped_pairwise_exchange_alltoall]: 1.84998e-06 [offloading_packed_experts]: 4.13999e-06 [overlap_recompute_and_grad_model_parallel]: 4.83001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.25001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.53002e-06 [overlap_recompute_comm]: 2.34999e-06 [overlap_grad_ring_attention]: 4.52e-06 [overlap_grad_flash_sp]: 2.178e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.41e-06 [split_layernorm_comm]: 1.55999e-06 [handle_group_info]: 1.12e-06 [symbol_engine_optimizer]: 7.649e-05, [1] [Cycle 1]: 7.239e-05, [6] [build]: 4e-06 [elim_shapecalc]: 9.52999e-06 [elim_not_effective]: 1.208e-05 [opt_reshape]: 6.09001e-06 [fold_const_symbol]: 8.63001e-06 [renormalize]: 1.8999e-07 [detach_backward]: 1.97001e-06 [pipeline_parallel_scheduler]: 1.51002e-06 [auto_monad_reorder]: 1.693e-05 [get_jit_bprop_graph]: 1.74998e-06 [rewriter_after_jit_bprop_graph]: 3.48999e-06 [opt_after_jit_grad]: 0.000494 [validate]: 4.065e-05 [backend_pass]: 8.59989e-07 [task_emit]: 0.00738896 [execute]: 1.079e-05 Sums bootstrap : 0.000663s : 2.76% type_inference : 0.010979s : 45.64% event_method : 0.000018s : 0.08% auto_monad : 0.000075s : 0.31% graph_reusing : 0.000007s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.09% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000005s : 0.02% pre_auto_parallel : 0.000036s : 0.15% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000005s : 0.02% optimize.rewriter_before_opt_a : 0.000186s : 0.77% optimize.opt_a.expand_dump_flag : 0.000006s : 0.03% optimize.opt_a.switch_simplify : 0.000049s : 0.21% optimize.opt_a.loop_unroll : 0.000034s : 0.14% optimize.opt_a.a_1 : 0.000662s : 2.75% optimize.opt_a.with_stream_mark : 0.000029s : 0.12% optimize.opt_a.recompute_prepare : 0.000012s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.02% optimize.opt_a.parameter_eliminate : 0.000014s : 0.06% optimize.opt_a.a_2 : 0.000126s : 0.52% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.05% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.02% optimize.opt_a.shard_inline : 0.000010s : 0.04% optimize.opt_a.merge_send_recv : 0.000016s : 0.07% optimize.opt_a.auto_parallel : 0.000013s : 0.05% optimize.opt_a.parallel : 0.000090s : 0.38% optimize.opt_a.flash_sp : 0.000014s : 0.06% optimize.opt_a.merge_comm : 0.000007s : 0.03% optimize.opt_a.allreduce_fusion : 0.000006s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.06% optimize.opt_a.virtual_dataset : 0.000012s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.05% optimize.opt_a.virtual_output : 0.000011s : 0.05% optimize.opt_a.merge_forward : 0.000007s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.07% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.11% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000018s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.02% optimize.opt_a.after_resolve : 0.000019s : 0.08% optimize.opt_a.a_after_grad : 0.000017s : 0.07% optimize.opt_a.renormalize : 0.000819s : 3.40% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.09% optimize.opt_a.cse : 0.000049s : 0.20% optimize.opt_a.a_3 : 0.000071s : 0.30% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000020s : 0.08% optimize.convert_after_rewriter : 0.000002s : 0.01% optimize.order_py_execute_after_rewriter : 0.000001s : 0.01% optimize.mutable_eliminate : 0.000724s : 3.01% optimize.opt_b.b_1 : 0.000101s : 0.42% optimize.opt_b.b_2 : 0.000007s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.07% optimize.overlap_param_gather : 0.000003s : 0.01% optimize.cconv : 0.000032s : 0.13% optimize.loop_unroll : 0.000468s : 1.94% optimize.opt_after_cconv.c_1 : 0.000023s : 0.10% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.02% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000018s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.06% optimize.tuple_transform.d_1 : 0.000036s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000087s : 0.36% optimize.cse_after_recomputation.cse : 0.000011s : 0.05% optimize.environ_conv : 0.000005s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000023s : 0.09% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.02% optimize.overlap_grad_flash_sp : 0.000022s : 0.09% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.07% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000494s : 2.05% validate : 0.000041s : 0.17% backend_pass : 0.000001s : 0.00% task_emit : 0.007389s : 30.72% execute : 0.000011s : 0.04% Time group info: ------[substitution.] 0.000179 24 1.13% : 0.000002s : 2: substitution.elim_not_effective 0.70% : 0.000001s : 2: substitution.fold_const_symbol 3.16% : 0.000006s : 3: substitution.graph_param_transform 80.82% : 0.000145s : 5: substitution.inline 2.00% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.08% : 0.000006s : 4: substitution.remove_not_recompute_node 2.34% : 0.000004s : 2: substitution.replace_old_param 6.76% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.010906 2 90.85% : 0.009908s : 1: type_inference.infer 9.15% : 0.000998s : 1: type_inference.specialize ------[replace.] 0.000062 7 76.84% : 0.000048s : 5: replace.inline 23.16% : 0.000014s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000152 7 92.81% : 0.000141s : 5: match.inline 7.19% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000174 1031 0.92% : 0.000002s : 11: predicate.accumulaten_eliminater 0.93% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 6: predicate.addn_check_dump 0.98% : 0.000002s : 11: predicate.addn_zero_filter 0.93% : 0.000002s : 11: predicate.adjust_all_reduce_mul_add 2.13% : 0.000004s : 17: predicate.arithmetic_simplify 0.89% : 0.000002s : 11: predicate.cast_eliminate 0.53% : 0.000001s : 6: predicate.check_bprop_eliminate 0.50% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.56% : 0.000001s : 6: predicate.depend_value_elim 0.92% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.83% : 0.000001s : 11: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.41% : 0.000001s : 3: predicate.elim_not_effective 0.56% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.01% : 0.000002s : 14: predicate.environ_get_depend_swap 1.57% : 0.000003s : 20: predicate.environ_get_eliminate 1.01% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.42% : 0.000002s : 18: predicate.exchange_switch_depend_value 2.32% : 0.000004s : 18: predicate.float_depend_g_call 0.46% : 0.000001s : 6: predicate.float_environ_get_switch 0.73% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.71% : 0.000001s : 6: predicate.get_grad_eliminate 0.22% : 0.000000s : 3: predicate.graph_param_transform 0.52% : 0.000001s : 6: predicate.incorporate_call 0.47% : 0.000001s : 6: predicate.incorporate_call_switch 5.69% : 0.000010s : 47: predicate.inline 0.97% : 0.000002s : 6: predicate.inline_without_move 0.31% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.10% : 0.000002s : 6: predicate.less_batch_normalization 2.01% : 0.000004s : 19: predicate.list_to_tuple_eliminator_ 2.26% : 0.000004s : 30: predicate.load_eliminater 1.00% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.73% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.60% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 6: predicate.merge_addn 0.53% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.79% : 0.000001s : 11: predicate.minmaximum_grad 1.39% : 0.000002s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.36% : 0.000001s : 3: predicate.parallel_virtual_node 1.99% : 0.000003s : 18: predicate.partial_defer_inline 1.36% : 0.000002s : 16: predicate.partial_eliminate 0.93% : 0.000002s : 11: predicate.print_const_string_wrapper 0.51% : 0.000001s : 6: predicate.reduce_all_const_elim 1.30% : 0.000002s : 11: predicate.reduce_eliminate 2.33% : 0.000004s : 30: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 6: predicate.remove_not_recompute_node 1.19% : 0.000002s : 19: predicate.replace_applicator 0.80% : 0.000001s : 6: predicate.replace_old_param 0.25% : 0.000000s : 3: predicate.reset_defer_inline 1.36% : 0.000002s : 11: predicate.reshape_eliminate 0.61% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 3: predicate.row_tensor_eliminate 0.88% : 0.000002s : 6: predicate.same_eliminate 0.40% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.06% : 0.000002s : 6: predicate.shard_identity_eliminate 0.73% : 0.000001s : 6: predicate.special_op_eliminate 0.65% : 0.000001s : 6: predicate.specialize_transform 1.15% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.28% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.56% : 0.000003s : 18: predicate.switch_defer_inline 2.01% : 0.000004s : 24: predicate.switch_layer_defer_inline 5.40% : 0.000009s : 61: predicate.switch_simplify 0.90% : 0.000002s : 11: predicate.tile_eliminate 0.85% : 0.000001s : 11: predicate.transpose_eliminate 1.58% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.84% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.34% : 0.000006s : 25: predicate.tuple_list_get_item_eliminator 1.52% : 0.000003s : 17: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.89% : 0.000003s : 19: predicate.tuple_to_list_eliminator_ 2.29% : 0.000004s : 30: predicate.updatestate_pure_node_eliminater 2.79% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 3: predicate.value_based_eliminate 0.66% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.56% : 0.000001s : 6: predicate.virtual_output_eliminate 0.24% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000688 12 44.93% : 0.000309s : 5: func_graph_cloner_run.FuncGraphClonerGraph 55.07% : 0.000379s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.040665 196 0.01% : 0.000004s : 1: ForceFp32Comm 10.41% : 0.004234s : 1: add_attr 10.38% : 0.004221s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.23% : 0.000092s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.20% : 0.000081s : 1: auto_monad 0.05% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.71% : 0.000697s : 1: bootstrap 0.09% : 0.000036s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000017s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.06% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000007s : 1: environ_conv 0.06% : 0.000024s : 1: event_method 0.04% : 0.000018s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000007s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000007s : 1: label_fine_grained_interleaved_index 0.02% : 0.000008s : 1: label_micro_interleaved_index 1.17% : 0.000478s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.81% : 0.000735s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000015s : 1: opt.transform.mutable_eliminate 2.52% : 0.001023s : 78: opt.transform.opt_a 0.05% : 0.000022s : 1: opt.transform.opt_after_cconv 0.05% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000080s : 28: opt.transform.opt_b 0.10% : 0.000040s : 2: opt.transform.opt_trans_graph 0.08% : 0.000033s : 4: opt.transform.symbol_engine_opt 6.78% : 0.002756s : 1: opt_a 0.25% : 0.000100s : 1: opt_after_cconv 1.24% : 0.000504s : 1: opt_after_jit_grad 0.46% : 0.000187s : 1: opt_b 12.60% : 0.005123s : 1: optimize 0.05% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.06% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.02% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.06% : 0.000026s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.02% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.10% : 0.000041s : 1: pre_auto_parallel 0.02% : 0.000009s : 1: py_interpret_to_execute 0.02% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.04% : 0.000017s : 1: remove_dup_value 0.90% : 0.000368s : 1: renormalize.infer 1.09% : 0.000443s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000023s : 1: rewriter_after_opt_a 0.47% : 0.000191s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.19% : 0.000079s : 1: symbol_engine_optimizer 18.23% : 0.007413s : 1: task_emit 0.18% : 0.000071s : 1: tuple_transform 27.05% : 0.011001s : 1: type_inference 0.20% : 0.000080s : 1: validate TotalTime = 0.0963101, [24] [bootstrap]: 0.00079683 [type_inference]: 0.0573104 [event_method]: 9.088e-05 [auto_monad]: 0.00018855 [graph_reusing]: 1.003e-05 [inline]: 3.14999e-06 [add_attr]: 0.00511056, [1] [add_attr_with_inline]: 0.00508777, [1] [Cycle 1]: 0.00010437, [2] [tag_attr]: 5.186e-05 [meta_addattr_fg_expand]: 1.102e-05 [parallel-infer-symbol]: 4.99e-06 [pre_auto_parallel]: 6.514e-05 [insert-virtual-dataset]: 5.91e-06 [parallel-infer-symbol-second]: 6.69999e-07 [dataset_repeat_opt]: 2.21e-06 [pipeline_split]: 2.14999e-06 [optimize]: 0.0171922, [53] [py_interpret_to_execute]: 5.63002e-06 [rewriter_before_opt_a]: 0.0003319 [opt_a]: 0.014524, [3] [Cycle 1]: 0.0107607, [45] [expand_dump_flag]: 7.5e-06 [switch_simplify]: 0.0001871 [loop_unroll]: 7.327e-05 [a_1]: 0.00177997 [with_stream_mark]: 3.252e-05 [recompute_prepare]: 2.578e-05 [updatestate_depend_eliminate]: 8.45001e-06 [updatestate_assign_eliminate]: 6.56e-06 [updatestate_loads_eliminate]: 6.23e-06 [parameter_eliminate]: 2.90002e-06 [a_2]: 0.00021537 [accelerated_algorithm]: 1.552e-05 [shard]: 1.62001e-06 [meta_shard_fg_expand]: 4.63999e-06 [shard_inline]: 1.395e-05 [merge_send_recv]: 1.913e-05 [auto_parallel]: 1.318e-05 [parallel]: 3.63e-05 [flash_sp]: 1.298e-05 [merge_comm]: 9.02e-06 [allreduce_fusion]: 7.55998e-06 [matmul_add_comm_reduction]: 3.039e-05 [allreduce_slice_to_reducescatter]: 1.69998e-06 [virtual_shard_identity]: 1.803e-05 [virtual_dataset]: 1.583e-05 [get_grad_eliminate_]: 1.456e-05 [virtual_output]: 1.533e-05 [merge_forward]: 8.55001e-06 [cell_reuse_recompute_pass]: 1.16002e-06 [offload_activation]: 1.774e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.8e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 3.098e-05 [set_forward_comm_id_for_comm_node_pass]: 8.53001e-06 [meta_fg_expand]: 0.00241912 [flash_sp_send_recv_attached]: 4.57e-06 [receive_attached]: 2.74001e-06 [after_resolve]: 6.141e-05 [a_after_grad]: 9.314e-05 [renormalize]: 0.00456959 [add_forward_monad_depend]: 1.377e-05 [auto_monad_grad]: 7.46001e-06 [auto_monad_eliminator]: 5.927e-05 [cse]: 0.00021753 [a_3]: 0.00031066 [Cycle 2]: 0.00309601, [45] [expand_dump_flag]: 3.50998e-06 [switch_simplify]: 4.003e-05 [loop_unroll]: 4.225e-05 [a_1]: 0.00133914 [with_stream_mark]: 2.48e-05 [recompute_prepare]: 9.87001e-06 [updatestate_depend_eliminate]: 5.64998e-06 [updatestate_assign_eliminate]: 3.98001e-06 [updatestate_loads_eliminate]: 3.49001e-06 [parameter_eliminate]: 2.09e-06 [a_2]: 0.00012513 [accelerated_algorithm]: 7.72998e-06 [shard]: 2.32999e-06 [meta_shard_fg_expand]: 3.06999e-06 [shard_inline]: 6.92002e-06 [merge_send_recv]: 9.19998e-06 [auto_parallel]: 1.055e-05 [parallel]: 1.988e-05 [flash_sp]: 4.1e-06 [merge_comm]: 3.99002e-06 [allreduce_fusion]: 3.85998e-06 [matmul_add_comm_reduction]: 1.098e-05 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 9.40001e-06 [virtual_dataset]: 6.77002e-06 [get_grad_eliminate_]: 6.22001e-06 [virtual_output]: 1.142e-05 [merge_forward]: 4.46002e-06 [cell_reuse_recompute_pass]: 1.68002e-06 [offload_activation]: 1.057e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.066e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 1.23e-05 [set_forward_comm_id_for_comm_node_pass]: 4.03001e-06 [meta_fg_expand]: 8.024e-05 [flash_sp_send_recv_attached]: 2.31e-06 [receive_attached]: 3.26001e-06 [after_resolve]: 1.344e-05 [a_after_grad]: 1.086e-05 [renormalize]: 0.00084954 [add_forward_monad_depend]: 6.19999e-06 [auto_monad_grad]: 1.89999e-06 [auto_monad_eliminator]: 1.475e-05 [cse]: 3.08e-05 [a_3]: 4.818e-05 [Cycle 3]: 0.00064792, [45] [expand_dump_flag]: 1.87001e-06 [switch_simplify]: 8.54e-06 [loop_unroll]: 6.34999e-06 [a_1]: 0.00012663 [with_stream_mark]: 1.059e-05 [recompute_prepare]: 6.18002e-06 [updatestate_depend_eliminate]: 3.38e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 2.66999e-06 [parameter_eliminate]: 9.80013e-07 [a_2]: 7.544e-05 [accelerated_algorithm]: 6.66e-06 [shard]: 1.07998e-06 [meta_shard_fg_expand]: 1.97001e-06 [shard_inline]: 6.47001e-06 [merge_send_recv]: 5.46e-06 [auto_parallel]: 6.62002e-06 [parallel]: 5.20001e-06 [flash_sp]: 8.29983e-07 [merge_comm]: 4.45999e-06 [allreduce_fusion]: 3.48e-06 [matmul_add_comm_reduction]: 5.85002e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 8.08001e-06 [virtual_dataset]: 6.74001e-06 [get_grad_eliminate_]: 6.01e-06 [virtual_output]: 6.52001e-06 [merge_forward]: 4.18001e-06 [cell_reuse_recompute_pass]: 1.56002e-06 [offload_activation]: 7.66001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.541e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 1.081e-05 [set_forward_comm_id_for_comm_node_pass]: 3.88001e-06 [meta_fg_expand]: 2.56e-06 [flash_sp_send_recv_attached]: 1.04e-06 [receive_attached]: 1.59e-06 [after_resolve]: 9.20999e-06 [a_after_grad]: 9.87999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.40999e-06 [auto_monad_grad]: 1.10001e-06 [auto_monad_eliminator]: 7.64002e-06 [cse]: 1.904e-05 [a_3]: 3.727e-05 [py_interpret_to_execute_after_opt_a]: 5.81e-06 [slice_cell_reuse_recomputed_activation]: 2.36e-06 [rewriter_after_opt_a]: 2.553e-05 [convert_after_rewriter]: 1.79e-06 [order_py_execute_after_rewriter]: 1.10999e-06 [mutable_eliminate]: 0.0007729 [opt_b]: 0.00022162, [1] [Cycle 1]: 0.00021327, [7] [b_1]: 0.00012986 [b_2]: 8.22e-06 [updatestate_depend_eliminate]: 7.70998e-06 [updatestate_assign_eliminate]: 3.27002e-06 [updatestate_loads_eliminate]: 2.76999e-06 [renormalize]: 6.19999e-07 [cse]: 2.539e-05 [optimize_parallel_all_gather_comm]: 1.925e-05 [overlap_param_gather]: 2.41e-06 [cconv]: 3.017e-05 [loop_unroll]: 0.00046672 [opt_after_cconv]: 0.00010674, [1] [Cycle 1]: 0.00010084, [7] [c_1]: 3.118e-05 [parameter_eliminate]: 3.81999e-06 [updatestate_depend_eliminate]: 5.47999e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 2.73e-06 [cse]: 2.211e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.742e-05 [tuple_transform]: 7.816e-05, [1] [Cycle 1]: 7.329e-05, [4] [d_1]: 4.641e-05 [none_parameter_eliminate]: 1.72999e-06 [renormalize]: 1.20024e-07 [switch_simplify]: 7.33999e-06 [partial_unused_args_eliminate]: 2.01e-06 [add_recomputation]: 0.00010882 [cse_after_recomputation]: 3.73e-05, [1] [Cycle 1]: 3.237e-05, [1] [cse]: 2.651e-05 [environ_conv]: 6.73998e-06 [swap_dp_allreduce_reducescatter]: 6.37001e-06 [bias_add_comm_swap]: 3.5e-06 [label_micro_interleaved_index]: 5.56e-06 [label_fine_grained_interleaved_index]: 3.12002e-06 [merge_cast_opt]: 1.56002e-06 [slice_recompute_activation]: 2.07001e-06 [micro_interleaved_order_control]: 2.89999e-06 [assign_add_opt]: 1.37e-06 [ForceFp32Comm]: 8.50006e-07 [remove_cast_before_assign_add]: 1.36002e-06 [full_micro_interleaved_order_control]: 2.73e-06 [reorder_send_recv_between_fp_bp]: 2.68e-06 [comm_op_add_attrs]: 1.39998e-06 [add_comm_op_reuse_tag]: 1.05001e-06 [interleave_split_concat_branches]: 1.16002e-06 [interleave_parallel_branches]: 1.38002e-06 [overlap_opt_shard_in_pipeline]: 1.40999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.71e-06 [control_data_broadcast_order]: 1.471e-05 [grouped_pairwise_exchange_alltoall]: 1.94e-06 [offloading_packed_experts]: 5.31002e-06 [overlap_recompute_and_grad_model_parallel]: 5.41998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.49e-06 [overlap_recompute_allgather_and_fa_grad]: 1.57001e-06 [overlap_recompute_comm]: 2.48e-06 [overlap_grad_ring_attention]: 5.12999e-06 [overlap_grad_flash_sp]: 2.474e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.46e-06 [split_layernorm_comm]: 1.89999e-06 [handle_group_info]: 1.05001e-06 [symbol_engine_optimizer]: 8.279e-05, [1] [Cycle 1]: 7.85e-05, [6] [build]: 4.11001e-06 [elim_shapecalc]: 1.182e-05 [elim_not_effective]: 1.476e-05 [opt_reshape]: 8.1e-06 [fold_const_symbol]: 1.183e-05 [renormalize]: 2.69996e-07 [detach_backward]: 2.06003e-06 [pipeline_parallel_scheduler]: 1.39e-06 [auto_monad_reorder]: 2.135e-05 [get_jit_bprop_graph]: 2.04e-06 [rewriter_after_jit_bprop_graph]: 3.83001e-06 [opt_after_jit_grad]: 0.00048396 [validate]: 4.43e-05 [backend_pass]: 9.39996e-07 [task_emit]: 0.0146642 [execute]: 1.127e-05 Sums bootstrap : 0.000797s : 0.89% type_inference : 0.057310s : 63.83% event_method : 0.000091s : 0.10% auto_monad : 0.000189s : 0.21% graph_reusing : 0.000010s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000052s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000011s : 0.01% parallel-infer-symbol : 0.000005s : 0.01% pre_auto_parallel : 0.000065s : 0.07% insert-virtual-dataset : 0.000006s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.01% optimize.rewriter_before_opt_a : 0.000332s : 0.37% optimize.opt_a.expand_dump_flag : 0.000013s : 0.01% optimize.opt_a.switch_simplify : 0.000236s : 0.26% optimize.opt_a.loop_unroll : 0.000122s : 0.14% optimize.opt_a.a_1 : 0.003246s : 3.61% optimize.opt_a.with_stream_mark : 0.000068s : 0.08% optimize.opt_a.recompute_prepare : 0.000042s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000017s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000012s : 0.01% optimize.opt_a.parameter_eliminate : 0.000006s : 0.01% optimize.opt_a.a_2 : 0.000416s : 0.46% optimize.opt_a.accelerated_algorithm : 0.000030s : 0.03% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000010s : 0.01% optimize.opt_a.shard_inline : 0.000027s : 0.03% optimize.opt_a.merge_send_recv : 0.000034s : 0.04% optimize.opt_a.auto_parallel : 0.000030s : 0.03% optimize.opt_a.parallel : 0.000061s : 0.07% optimize.opt_a.flash_sp : 0.000018s : 0.02% optimize.opt_a.merge_comm : 0.000017s : 0.02% optimize.opt_a.allreduce_fusion : 0.000015s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000047s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000036s : 0.04% optimize.opt_a.virtual_dataset : 0.000029s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000027s : 0.03% optimize.opt_a.virtual_output : 0.000033s : 0.04% optimize.opt_a.merge_forward : 0.000017s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000036s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000064s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000054s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.02% optimize.opt_a.meta_fg_expand : 0.002502s : 2.79% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.01% optimize.opt_a.receive_attached : 0.000008s : 0.01% optimize.opt_a.after_resolve : 0.000084s : 0.09% optimize.opt_a.a_after_grad : 0.000114s : 0.13% optimize.opt_a.renormalize : 0.005419s : 6.04% optimize.opt_a.add_forward_monad_depend : 0.000021s : 0.02% optimize.opt_a.auto_monad_grad : 0.000010s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000082s : 0.09% optimize.opt_a.cse : 0.000267s : 0.30% optimize.opt_a.a_3 : 0.000396s : 0.44% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000026s : 0.03% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000773s : 0.86% optimize.opt_b.b_1 : 0.000130s : 0.14% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000030s : 0.03% optimize.loop_unroll : 0.000467s : 0.52% optimize.opt_after_cconv.c_1 : 0.000031s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.02% optimize.tuple_transform.d_1 : 0.000046s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000109s : 0.12% optimize.cse_after_recomputation.cse : 0.000027s : 0.03% optimize.environ_conv : 0.000007s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000021s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000484s : 0.54% validate : 0.000044s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.014664s : 16.33% execute : 0.000011s : 0.01% Time group info: ------[substitution.] 0.001005 160 4.10% : 0.000041s : 1: substitution.arithmetic_simplify 0.31% : 0.000003s : 3: substitution.elim_not_effective 0.90% : 0.000009s : 11: substitution.float_depend_g_call 0.44% : 0.000004s : 2: substitution.float_tuple_getitem_switch 0.17% : 0.000002s : 3: substitution.fold_const_symbol 0.72% : 0.000007s : 4: substitution.graph_param_transform 0.26% : 0.000003s : 2: substitution.incorporate_call 0.19% : 0.000002s : 2: substitution.incorporate_call_switch 62.99% : 0.000633s : 20: substitution.inline 2.04% : 0.000020s : 2: substitution.inline_without_move 1.53% : 0.000015s : 14: substitution.j_node_and_user_rematch 1.24% : 0.000012s : 7: substitution.minmaximum_grad 2.30% : 0.000023s : 11: substitution.partial_eliminate 1.81% : 0.000018s : 14: substitution.remove_not_recompute_node 3.91% : 0.000039s : 9: substitution.replace_applicator 0.91% : 0.000009s : 7: substitution.replace_old_param 0.37% : 0.000004s : 1: substitution.set_cell_output_no_recompute 3.04% : 0.000031s : 3: substitution.switch_simplify 2.49% : 0.000025s : 7: substitution.tuple_list_convert_item_index_to_positive 1.33% : 0.000013s : 7: substitution.tuple_list_get_item_const_eliminator 1.61% : 0.000016s : 7: substitution.tuple_list_get_item_depend_reorder 5.66% : 0.000057s : 16: substitution.tuple_list_get_item_eliminator 1.66% : 0.000017s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.057170 2 93.53% : 0.053471s : 1: type_inference.infer 6.47% : 0.003699s : 1: type_inference.specialize ------[replace.] 0.000271 31 2.59% : 0.000007s : 1: replace.arithmetic_simplify 57.54% : 0.000156s : 20: replace.inline 17.39% : 0.000047s : 3: replace.switch_simplify 22.47% : 0.000061s : 7: replace.tuple_list_get_item_eliminator ------[match.] 0.000715 31 5.63% : 0.000040s : 1: match.arithmetic_simplify 86.98% : 0.000622s : 20: match.inline 4.05% : 0.000029s : 3: match.switch_simplify 3.34% : 0.000024s : 7: match.tuple_list_get_item_eliminator ------[predicate.] 0.000656 4039 1.07% : 0.000007s : 50: predicate.accumulaten_eliminater 0.28% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.40% : 0.000003s : 19: predicate.addn_check_dump 1.06% : 0.000007s : 50: predicate.addn_zero_filter 0.99% : 0.000006s : 50: predicate.adjust_all_reduce_mul_add 2.15% : 0.000014s : 70: predicate.arithmetic_simplify 1.16% : 0.000008s : 51: predicate.cast_eliminate 1.01% : 0.000007s : 47: predicate.check_bprop_eliminate 0.41% : 0.000003s : 19: predicate.compare_switch_simplify 0.05% : 0.000000s : 4: predicate.const_output_eliminate 0.42% : 0.000003s : 19: predicate.depend_value_elim 1.14% : 0.000007s : 51: predicate.dict_get_item_const_eliminator 1.22% : 0.000008s : 51: predicate.dict_get_item_eliminator 1.08% : 0.000007s : 51: predicate.dict_set_item_eliminator 0.29% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.07% : 0.000000s : 4: predicate.elim_not_effective 0.14% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.12% : 0.000007s : 55: predicate.environ_add_const_eliminate 1.09% : 0.000007s : 55: predicate.environ_get_add_eliminate 1.07% : 0.000007s : 55: predicate.environ_get_depend_swap 8.15% : 0.000053s : 74: predicate.environ_get_eliminate 1.07% : 0.000007s : 55: predicate.environ_get_set_eliminate 1.67% : 0.000011s : 78: predicate.exchange_switch_depend_value 2.45% : 0.000016s : 78: predicate.float_depend_g_call 0.42% : 0.000003s : 19: predicate.float_environ_get_switch 0.52% : 0.000003s : 23: predicate.float_tuple_getitem_switch 0.06% : 0.000000s : 4: predicate.fold_const_symbol 0.46% : 0.000003s : 19: predicate.get_grad_eliminate 0.09% : 0.000001s : 4: predicate.graph_param_transform 0.41% : 0.000003s : 19: predicate.incorporate_call 0.36% : 0.000002s : 19: predicate.incorporate_call_switch 5.03% : 0.000033s : 174: predicate.inline 1.24% : 0.000008s : 40: predicate.inline_without_move 0.22% : 0.000001s : 19: predicate.j_node_and_user_rematch 0.63% : 0.000004s : 19: predicate.less_batch_normalization 1.52% : 0.000010s : 66: predicate.list_to_tuple_eliminator_ 2.35% : 0.000015s : 116: predicate.load_eliminater 0.39% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.51% : 0.000016s : 115: predicate.loop_unroll_before_grad 1.33% : 0.000009s : 59: predicate.make_slice_get_slice_eliminator 0.45% : 0.000003s : 19: predicate.merge_addn 0.99% : 0.000006s : 47: predicate.micro_step_allgather_replace 0.97% : 0.000006s : 47: predicate.mini_step_allgather_replace 1.02% : 0.000007s : 51: predicate.minmaximum_grad 0.46% : 0.000003s : 4: predicate.mutable_eliminate 0.20% : 0.000001s : 4: predicate.opt_reshape 0.12% : 0.000001s : 4: predicate.parallel_virtual_node 2.35% : 0.000015s : 78: predicate.partial_defer_inline 1.47% : 0.000010s : 62: predicate.partial_eliminate 1.07% : 0.000007s : 50: predicate.print_const_string_wrapper 0.44% : 0.000003s : 19: predicate.reduce_all_const_elim 1.31% : 0.000009s : 51: predicate.reduce_eliminate 2.39% : 0.000016s : 116: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000002s : 19: predicate.remove_not_recompute_node 1.60% : 0.000010s : 105: predicate.replace_applicator 0.63% : 0.000004s : 40: predicate.replace_old_param 0.11% : 0.000001s : 4: predicate.reset_defer_inline 1.07% : 0.000007s : 51: predicate.reshape_eliminate 1.01% : 0.000007s : 47: predicate.row_tensor_add_zeros_like 0.13% : 0.000001s : 4: predicate.row_tensor_eliminate 1.17% : 0.000008s : 47: predicate.same_eliminate 0.28% : 0.000002s : 19: predicate.set_cell_output_no_recompute 0.56% : 0.000004s : 19: predicate.shard_identity_eliminate 0.25% : 0.000002s : 8: predicate.special_op_eliminate 0.48% : 0.000003s : 19: predicate.specialize_transform 1.15% : 0.000008s : 47: predicate.split_environ_get_set_with_tuple_value 1.17% : 0.000008s : 40: predicate.stack_unstack_eliminate 0.14% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.82% : 0.000012s : 78: predicate.switch_defer_inline 2.77% : 0.000018s : 125: predicate.switch_layer_defer_inline 5.22% : 0.000034s : 222: predicate.switch_simplify 1.10% : 0.000007s : 51: predicate.tile_eliminate 1.03% : 0.000007s : 51: predicate.transpose_eliminate 1.32% : 0.000009s : 59: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000009s : 59: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000009s : 59: predicate.tuple_list_get_item_depend_reorder 2.51% : 0.000016s : 85: predicate.tuple_list_get_item_eliminator 1.37% : 0.000009s : 59: predicate.tuple_list_get_set_item_eliminator 1.91% : 0.000013s : 78: predicate.tuple_list_set_item_eliminator 1.43% : 0.000009s : 66: predicate.tuple_to_list_eliminator_ 2.29% : 0.000015s : 116: predicate.updatestate_pure_node_eliminater 2.74% : 0.000018s : 135: predicate.updatestate_useless_node_eliminater 0.12% : 0.000001s : 4: predicate.value_based_eliminate 0.48% : 0.000003s : 19: predicate.virtual_dataset_eliminate 0.50% : 0.000003s : 19: predicate.virtual_output_eliminate 0.08% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.13% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003790 38 59.11% : 0.002240s : 14: func_graph_cloner_run.FuncGraphClonerGraph 40.89% : 0.001550s : 24: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.129008 237 0.00% : 0.000003s : 1: ForceFp32Comm 3.97% : 0.005116s : 1: add_attr 3.95% : 0.005092s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000113s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.15% : 0.000199s : 1: auto_monad 0.02% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.65% : 0.000837s : 1: bootstrap 0.03% : 0.000034s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.03% : 0.000040s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.08% : 0.000102s : 1: event_method 0.02% : 0.000021s : 1: execute 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.01% : 0.000009s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.37% : 0.000476s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.61% : 0.000783s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000019s : 1: opt.transform.mutable_eliminate 3.73% : 0.004809s : 117: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.02% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.09% : 0.000110s : 28: opt.transform.opt_b 0.04% : 0.000052s : 2: opt.transform.opt_trans_graph 0.03% : 0.000043s : 4: opt.transform.symbol_engine_opt 11.26% : 0.014528s : 1: opt_a 0.09% : 0.000110s : 1: opt_after_cconv 0.38% : 0.000495s : 1: opt_after_jit_grad 0.17% : 0.000225s : 1: opt_b 13.33% : 0.017198s : 1: optimize 0.02% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.05% : 0.000070s : 1: pre_auto_parallel 0.01% : 0.000009s : 1: py_interpret_to_execute 0.01% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000021s : 1: remove_dup_value 2.22% : 0.002867s : 2: renormalize.infer 1.96% : 0.002532s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000029s : 1: rewriter_after_opt_a 0.26% : 0.000339s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000085s : 1: symbol_engine_optimizer 11.39% : 0.014691s : 1: task_emit 0.06% : 0.000081s : 1: tuple_transform 44.45% : 0.057349s : 1: type_inference 0.07% : 0.000084s : 1: validate random_generator: generate a numpy.ndarray(shape=(2, 3, 4, 5), dtype=, seed=1967515154) by numpy.random.randn, will be used as x random_generator: generate a numpy.ndarray(shape=(2, 3, 4, 5), dtype=, seed=1967515154) by numpy.random.randn, will be used as x group_cases_7 have all been run, results of sub cases are below: case: (1, mindspore.float16) {} pass. case: (1,) {} pass. case: (1, mindspore.bfloat16) {} pass. case: ('pynative', False) {} pass. case: ('pynative', True) {} pass. case: (0, mindspore.bfloat16) {} pass. case: (0, mindspore.float16) {} pass. case: (0,) {} pass. ops group_cases_8 with 8 cases start to running, all cases are below: case: (, 1) case: (, 0) case: (, 1) case: (, 'pynative') case: (, 'KBK') case: (, 'GRAPH') case: (, 'KBK') case: (, 'PYBOOST') ops group_cases_8 total running memory: 248M, memory threshold: 51200M [WARNING] ME(163781:281473890602800,ForkProcess-71):2026-01-29-17:46:03.336.940 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(163781:281473890602800,ForkProcess-71):2026-01-29-17:46:03.338.642 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. [WARNING] ME(163853:281473890602800,ForkProcess-72):2026-01-29-17:46:03.364.668 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(163853:281473890602800,ForkProcess-72):2026-01-29-17:46:03.366.746 [mindspore/context.py:1334] For 'context.set_context', the parameter 'save_graphs' will be deprecated and removed in a future version. Please use the env MS_DEV_SAVE_GRAPHS instead. TotalTime = 2.87851, [24] [bootstrap]: 0.00111976 [type_inference]: 0.0668211 [event_method]: 6.17e-05 [auto_monad]: 0.00012388 [graph_reusing]: 6.14001e-06 [inline]: 2.66999e-06 [add_attr]: 0.00784401, [1] [add_attr_with_inline]: 0.00783034, [1] [Cycle 1]: 0.00012526, [2] [tag_attr]: 3.904e-05 [meta_addattr_fg_expand]: 1.437e-05 [parallel-infer-symbol]: 3.03e-06 [pre_auto_parallel]: 5.098e-05 [insert-virtual-dataset]: 2.81e-06 [parallel-infer-symbol-second]: 7.29982e-07 [dataset_repeat_opt]: 1.99999e-06 [pipeline_split]: 1.88002e-06 [optimize]: 0.00639357, [53] [py_interpret_to_execute]: 4.20999e-06 [rewriter_before_opt_a]: 0.00037098 [opt_a]: 0.00371925, [2] [Cycle 1]: 0.00288992, [45] [expand_dump_flag]: 3.38e-06 [switch_simplify]: 7.807e-05 [loop_unroll]: 3.835e-05 [a_1]: 0.00069799 [with_stream_mark]: 1.525e-05 [recompute_prepare]: 1.023e-05 [updatestate_depend_eliminate]: 1.325e-05 [updatestate_assign_eliminate]: 1.049e-05 [updatestate_loads_eliminate]: 3.63999e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 0.00011997 [accelerated_algorithm]: 9.39e-06 [shard]: 1.60999e-06 [meta_shard_fg_expand]: 1.96998e-06 [shard_inline]: 9.09e-06 [merge_send_recv]: 4.039e-05 [auto_parallel]: 6.14999e-06 [parallel]: 8.612e-05 [flash_sp]: 2.984e-05 [merge_comm]: 4.52998e-06 [allreduce_fusion]: 1.043e-05 [matmul_add_comm_reduction]: 1.608e-05 [allreduce_slice_to_reducescatter]: 7.56001e-06 [virtual_shard_identity]: 1.101e-05 [virtual_dataset]: 9.03002e-06 [get_grad_eliminate_]: 8.60999e-06 [virtual_output]: 8.67e-06 [merge_forward]: 3.97e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.639e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.188e-05 [merge_recompute_call_nodes]: 1.59998e-06 [before_grad]: 1.226e-05 [set_forward_comm_id_for_comm_node_pass]: 1.096e-05 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 2.88998e-06 [receive_attached]: 1.711e-05 [after_resolve]: 1.221e-05 [a_after_grad]: 1.447e-05 [renormalize]: 0.001113 [add_forward_monad_depend]: 5.87001e-06 [auto_monad_grad]: 2.26e-06 [auto_monad_eliminator]: 2.55e-05 [cse]: 6.88e-05 [a_3]: 6.486e-05 [Cycle 2]: 0.00081865, [45] [expand_dump_flag]: 1.05001e-06 [switch_simplify]: 1.009e-05 [loop_unroll]: 8.57e-06 [a_1]: 0.00022508 [with_stream_mark]: 1.127e-05 [recompute_prepare]: 8.82e-06 [updatestate_depend_eliminate]: 3.34001e-06 [updatestate_assign_eliminate]: 2.79999e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 1.12e-06 [a_2]: 0.00010776 [accelerated_algorithm]: 8.49998e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 1.42e-06 [shard_inline]: 8.59e-06 [merge_send_recv]: 4.80999e-06 [auto_parallel]: 5.51998e-06 [parallel]: 4.70999e-06 [flash_sp]: 3.41999e-06 [merge_comm]: 3.33e-06 [allreduce_fusion]: 3.26999e-06 [matmul_add_comm_reduction]: 5.10001e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 9.36e-06 [virtual_dataset]: 8.32e-06 [get_grad_eliminate_]: 8.21002e-06 [virtual_output]: 8.33999e-06 [merge_forward]: 3.38e-06 [cell_reuse_recompute_pass]: 1.50001e-06 [offload_activation]: 6.49999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.436e-05 [merge_recompute_call_nodes]: 8.79983e-07 [before_grad]: 1.161e-05 [set_forward_comm_id_for_comm_node_pass]: 3.43e-06 [meta_fg_expand]: 2.01998e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 1.12e-06 [after_resolve]: 1.128e-05 [a_after_grad]: 1.332e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.23002e-06 [auto_monad_grad]: 1.15999e-06 [auto_monad_eliminator]: 6.71999e-06 [cse]: 1.863e-05 [a_3]: 6.849e-05 [py_interpret_to_execute_after_opt_a]: 5.219e-05 [slice_cell_reuse_recomputed_activation]: 2.12999e-06 [rewriter_after_opt_a]: 3.004e-05 [convert_after_rewriter]: 1.40001e-06 [order_py_execute_after_rewriter]: 1.46002e-06 [mutable_eliminate]: 0.00056056 [opt_b]: 0.00026668, [1] [Cycle 1]: 0.00026067, [7] [b_1]: 0.00018326 [b_2]: 1.001e-05 [updatestate_depend_eliminate]: 5.45001e-06 [updatestate_assign_eliminate]: 2.83e-06 [updatestate_loads_eliminate]: 2.58003e-06 [renormalize]: 5.50004e-07 [cse]: 2.378e-05 [optimize_parallel_all_gather_comm]: 2.595e-05 [overlap_param_gather]: 1.082e-05 [cconv]: 2.289e-05 [loop_unroll]: 0.00043659 [opt_after_cconv]: 0.00012169, [1] [Cycle 1]: 0.00011583, [7] [c_1]: 4.619e-05 [parameter_eliminate]: 2.59999e-06 [updatestate_depend_eliminate]: 5.42001e-06 [updatestate_assign_eliminate]: 2.91999e-06 [updatestate_loads_eliminate]: 2.59999e-06 [cse]: 2.391e-05 [renormalize]: 2.59985e-07 [remove_dup_value]: 3.498e-05 [tuple_transform]: 9.266e-05, [1] [Cycle 1]: 8.813e-05, [4] [d_1]: 5.935e-05 [none_parameter_eliminate]: 1.58002e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 8.89e-06 [partial_unused_args_eliminate]: 1.79e-06 [add_recomputation]: 6.101e-05 [cse_after_recomputation]: 2.568e-05, [1] [Cycle 1]: 2.133e-05, [1] [cse]: 1.565e-05 [environ_conv]: 2.116e-05 [swap_dp_allreduce_reducescatter]: 2.237e-05 [bias_add_comm_swap]: 9.77999e-06 [label_micro_interleaved_index]: 1.298e-05 [label_fine_grained_interleaved_index]: 2.75002e-06 [merge_cast_opt]: 1.65001e-06 [slice_recompute_activation]: 1.84e-06 [micro_interleaved_order_control]: 2.53e-06 [assign_add_opt]: 1.14998e-06 [ForceFp32Comm]: 8.80013e-07 [remove_cast_before_assign_add]: 8.75999e-06 [full_micro_interleaved_order_control]: 9.49e-06 [reorder_send_recv_between_fp_bp]: 2.83003e-06 [comm_op_add_attrs]: 1.09e-06 [add_comm_op_reuse_tag]: 1.14003e-06 [interleave_split_concat_branches]: 1.16997e-06 [interleave_parallel_branches]: 7.9e-06 [overlap_opt_shard_in_pipeline]: 2.176e-05 [overlap_opt_shard_grad_in_pipeline]: 1.94999e-06 [control_data_broadcast_order]: 1.344e-05 [grouped_pairwise_exchange_alltoall]: 1.47001e-06 [offloading_packed_experts]: 3.97002e-06 [overlap_recompute_and_grad_model_parallel]: 1.188e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.14998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 2.37999e-06 [overlap_grad_ring_attention]: 1.825e-05 [overlap_grad_flash_sp]: 3.899e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 9.27999e-06 [split_layernorm_comm]: 1.69998e-06 [handle_group_info]: 1.09e-06 [symbol_engine_optimizer]: 8.271e-05, [1] [Cycle 1]: 7.834e-05, [6] [build]: 2.22001e-06 [elim_shapecalc]: 1.297e-05 [elim_not_effective]: 1.453e-05 [opt_reshape]: 9.15999e-06 [fold_const_symbol]: 1.159e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.82001e-06 [pipeline_parallel_scheduler]: 1.47999e-06 [auto_monad_reorder]: 2.303e-05 [get_jit_bprop_graph]: 1.02998e-06 [rewriter_after_jit_bprop_graph]: 3.56999e-06 [opt_after_jit_grad]: 0.00049104 [validate]: 6.92e-05 [backend_pass]: 9.49978e-07 [task_emit]: 2.79519 [execute]: 1.037e-05 Sums bootstrap : 0.001120s : 0.04% type_inference : 0.066821s : 2.33% event_method : 0.000062s : 0.00% auto_monad : 0.000124s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000039s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000051s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000371s : 0.01% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000088s : 0.00% optimize.opt_a.loop_unroll : 0.000047s : 0.00% optimize.opt_a.a_1 : 0.000923s : 0.03% optimize.opt_a.with_stream_mark : 0.000027s : 0.00% optimize.opt_a.recompute_prepare : 0.000019s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000017s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000013s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000228s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000018s : 0.00% optimize.opt_a.merge_send_recv : 0.000045s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000091s : 0.00% optimize.opt_a.flash_sp : 0.000033s : 0.00% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000014s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000008s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.00% optimize.opt_a.virtual_dataset : 0.000017s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.00% optimize.opt_a.virtual_output : 0.000017s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000014s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000018s : 0.00% optimize.opt_a.after_resolve : 0.000023s : 0.00% optimize.opt_a.a_after_grad : 0.000028s : 0.00% optimize.opt_a.renormalize : 0.001113s : 0.04% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.00% optimize.opt_a.cse : 0.000087s : 0.00% optimize.opt_a.a_3 : 0.000133s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000052s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000030s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000561s : 0.02% optimize.opt_b.b_1 : 0.000183s : 0.01% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.00% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000023s : 0.00% optimize.loop_unroll : 0.000437s : 0.02% optimize.opt_after_cconv.c_1 : 0.000046s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000035s : 0.00% optimize.tuple_transform.d_1 : 0.000059s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000061s : 0.00% optimize.cse_after_recomputation.cse : 0.000016s : 0.00% optimize.environ_conv : 0.000021s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000022s : 0.00% optimize.bias_add_comm_swap : 0.000010s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000009s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000022s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000018s : 0.00% optimize.overlap_grad_flash_sp : 0.000039s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000009s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000023s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000491s : 0.02% validate : 0.000069s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.795189s : 97.41% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000202 28 0.86% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000001s : 2: substitution.fold_const_symbol 3.52% : 0.000007s : 7: substitution.graph_param_transform 73.29% : 0.000148s : 5: substitution.inline 1.59% : 0.000003s : 4: substitution.j_node_and_user_rematch 5.96% : 0.000012s : 4: substitution.remove_not_recompute_node 1.64% : 0.000003s : 2: substitution.replace_old_param 12.50% : 0.000025s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.066730 2 97.71% : 0.065201s : 1: type_inference.infer 2.29% : 0.001529s : 1: type_inference.specialize ------[replace.] 0.000069 7 72.62% : 0.000050s : 5: replace.inline 27.38% : 0.000019s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000169 7 85.76% : 0.000145s : 5: match.inline 14.24% : 0.000024s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000251 2153 0.93% : 0.000002s : 21: predicate.accumulaten_eliminater 0.83% : 0.000002s : 7: predicate.ad_related_special_op_eliminate 0.61% : 0.000002s : 16: predicate.addn_check_dump 0.90% : 0.000002s : 21: predicate.addn_zero_filter 0.81% : 0.000002s : 21: predicate.adjust_all_reduce_mul_add 2.11% : 0.000005s : 37: predicate.arithmetic_simplify 0.89% : 0.000002s : 21: predicate.cast_eliminate 0.79% : 0.000002s : 16: predicate.check_bprop_eliminate 0.62% : 0.000002s : 16: predicate.compare_switch_simplify 0.32% : 0.000001s : 8: predicate.const_output_eliminate 0.65% : 0.000002s : 16: predicate.depend_value_elim 0.94% : 0.000002s : 21: predicate.dict_get_item_const_eliminator 1.07% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 21: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 15: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 7: predicate.elim_not_effective 0.54% : 0.000001s : 7: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 29: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 29: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 29: predicate.environ_get_depend_swap 1.87% : 0.000005s : 45: predicate.environ_get_eliminate 1.13% : 0.000003s : 29: predicate.environ_get_set_eliminate 1.24% : 0.000003s : 28: predicate.exchange_switch_depend_value 1.96% : 0.000005s : 28: predicate.float_depend_g_call 0.62% : 0.000002s : 16: predicate.float_environ_get_switch 0.94% : 0.000002s : 24: predicate.float_tuple_getitem_switch 0.23% : 0.000001s : 7: predicate.fold_const_symbol 0.73% : 0.000002s : 16: predicate.get_grad_eliminate 0.26% : 0.000001s : 7: predicate.graph_param_transform 0.67% : 0.000002s : 16: predicate.incorporate_call 0.59% : 0.000001s : 16: predicate.incorporate_call_switch 5.62% : 0.000014s : 97: predicate.inline 0.79% : 0.000002s : 16: predicate.inline_without_move 0.47% : 0.000001s : 16: predicate.j_node_and_user_rematch 0.81% : 0.000002s : 16: predicate.less_batch_normalization 1.66% : 0.000004s : 38: predicate.list_to_tuple_eliminator_ 2.44% : 0.000006s : 60: predicate.load_eliminater 0.77% : 0.000002s : 8: predicate.loop_unroll_after_grad 2.41% : 0.000006s : 49: predicate.loop_unroll_before_grad 1.70% : 0.000004s : 37: predicate.make_slice_get_slice_eliminator 0.69% : 0.000002s : 16: predicate.merge_addn 0.67% : 0.000002s : 16: predicate.micro_step_allgather_replace 0.67% : 0.000002s : 16: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 21: predicate.minmaximum_grad 0.95% : 0.000002s : 8: predicate.mutable_eliminate 0.35% : 0.000001s : 7: predicate.opt_reshape 0.47% : 0.000001s : 8: predicate.parallel_virtual_node 1.55% : 0.000004s : 28: predicate.partial_defer_inline 1.47% : 0.000004s : 31: predicate.partial_eliminate 0.87% : 0.000002s : 21: predicate.print_const_string_wrapper 0.69% : 0.000002s : 16: predicate.reduce_all_const_elim 1.25% : 0.000003s : 21: predicate.reduce_eliminate 2.40% : 0.000006s : 60: predicate.redundant_stop_gradient_eliminater 0.54% : 0.000001s : 16: predicate.remove_not_recompute_node 1.64% : 0.000004s : 39: predicate.replace_applicator 0.53% : 0.000001s : 16: predicate.replace_old_param 0.37% : 0.000001s : 8: predicate.reset_defer_inline 0.87% : 0.000002s : 21: predicate.reshape_eliminate 0.71% : 0.000002s : 16: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 8: predicate.row_tensor_eliminate 0.84% : 0.000002s : 16: predicate.same_eliminate 0.63% : 0.000002s : 16: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 16: predicate.shard_identity_eliminate 0.79% : 0.000002s : 15: predicate.special_op_eliminate 0.75% : 0.000002s : 16: predicate.specialize_transform 0.79% : 0.000002s : 16: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000002s : 16: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 8: predicate.switch_call_monad_eliminater 1.33% : 0.000003s : 28: predicate.switch_defer_inline 2.02% : 0.000005s : 44: predicate.switch_layer_defer_inline 5.13% : 0.000013s : 100: predicate.switch_simplify 0.89% : 0.000002s : 21: predicate.tile_eliminate 0.91% : 0.000002s : 21: predicate.transpose_eliminate 1.54% : 0.000004s : 36: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000004s : 36: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 36: predicate.tuple_list_get_item_depend_reorder 2.97% : 0.000007s : 54: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 36: predicate.tuple_list_get_set_item_eliminator 2.44% : 0.000006s : 52: predicate.tuple_list_set_item_eliminator 1.66% : 0.000004s : 38: predicate.tuple_to_list_eliminator_ 2.35% : 0.000006s : 60: predicate.updatestate_pure_node_eliminater 3.16% : 0.000008s : 76: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 8: predicate.value_based_eliminate 0.73% : 0.000002s : 16: predicate.virtual_dataset_eliminate 0.71% : 0.000002s : 16: predicate.virtual_output_eliminate 0.32% : 0.000001s : 7: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 8: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000966 14 48.56% : 0.000469s : 7: func_graph_cloner_run.FuncGraphClonerGraph 51.44% : 0.000497s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.895681 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.27% : 0.007849s : 1: add_attr 0.27% : 0.007834s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000065s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000131s : 1: auto_monad 0.00% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000013s : 1: bias_add_comm_swap 0.04% : 0.001167s : 1: bootstrap 0.00% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000029s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000025s : 1: environ_conv 0.00% : 0.000069s : 1: event_method 0.00% : 0.000020s : 1: execute 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.02% : 0.000445s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000570s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000018s : 1: opt.transform.mutable_eliminate 0.05% : 0.001560s : 78: opt.transform.opt_a 0.00% : 0.000045s : 1: opt.transform.opt_after_cconv 0.00% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000167s : 28: opt.transform.opt_b 0.00% : 0.000066s : 2: opt.transform.opt_trans_graph 0.00% : 0.000044s : 4: opt.transform.symbol_engine_opt 0.13% : 0.003722s : 1: opt_a 0.00% : 0.000125s : 1: opt_after_cconv 0.02% : 0.000501s : 1: opt_after_jit_grad 0.01% : 0.000270s : 1: opt_b 0.22% : 0.006398s : 1: optimize 0.00% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000043s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000021s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000025s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000055s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000056s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000011s : 1: remove_cast_before_assign_add 0.00% : 0.000040s : 1: remove_dup_value 0.02% : 0.000619s : 1: renormalize.infer 0.02% : 0.000486s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000034s : 1: rewriter_after_opt_a 0.01% : 0.000378s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000012s : 1: split_matmul_comm_elemetwise 0.00% : 0.000025s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000085s : 1: symbol_engine_optimizer 96.53% : 2.795231s : 1: task_emit 0.00% : 0.000096s : 1: tuple_transform 2.31% : 0.066838s : 1: type_inference 0.00% : 0.000098s : 1: validate TotalTime = 2.92695, [24] [bootstrap]: 0.00092869 [type_inference]: 0.0237483 [event_method]: 1.97e-05 [auto_monad]: 0.00013318 [graph_reusing]: 4.35999e-06 [inline]: 2.39999e-06 [add_attr]: 0.00777716, [1] [add_attr_with_inline]: 0.00776137, [1] [Cycle 1]: 0.00013672, [2] [tag_attr]: 3.488e-05 [meta_addattr_fg_expand]: 1.793e-05 [parallel-infer-symbol]: 3.39001e-06 [pre_auto_parallel]: 5.534e-05 [insert-virtual-dataset]: 2.37999e-06 [parallel-infer-symbol-second]: 6.60017e-07 [dataset_repeat_opt]: 1.60001e-06 [pipeline_split]: 1.52001e-06 [optimize]: 0.00536192, [53] [py_interpret_to_execute]: 4.06001e-06 [rewriter_before_opt_a]: 0.00022019 [opt_a]: 0.00291797, [2] [Cycle 1]: 0.00230798, [45] [expand_dump_flag]: 3.41999e-06 [switch_simplify]: 7.752e-05 [loop_unroll]: 3.339e-05 [a_1]: 0.00061899 [with_stream_mark]: 1.114e-05 [recompute_prepare]: 7.1e-06 [updatestate_depend_eliminate]: 1.544e-05 [updatestate_assign_eliminate]: 1.467e-05 [updatestate_loads_eliminate]: 2.12001e-06 [parameter_eliminate]: 1.14003e-06 [a_2]: 7.348e-05 [accelerated_algorithm]: 6.19999e-06 [shard]: 1.87001e-06 [meta_shard_fg_expand]: 1.46998e-06 [shard_inline]: 5.86e-06 [merge_send_recv]: 5.551e-05 [auto_parallel]: 6.51e-06 [parallel]: 9.99e-05 [flash_sp]: 4.281e-05 [merge_comm]: 4.08999e-06 [allreduce_fusion]: 1.37e-05 [matmul_add_comm_reduction]: 1.865e-05 [allreduce_slice_to_reducescatter]: 1.109e-05 [virtual_shard_identity]: 1.305e-05 [virtual_dataset]: 6.36998e-06 [get_grad_eliminate_]: 5.81e-06 [virtual_output]: 5.94999e-06 [merge_forward]: 3.35e-06 [cell_reuse_recompute_pass]: 9.39996e-07 [offload_activation]: 2.044e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.275e-05 [merge_recompute_call_nodes]: 1.60001e-06 [before_grad]: 9.34998e-06 [set_forward_comm_id_for_comm_node_pass]: 1.488e-05 [meta_fg_expand]: 2.48e-06 [flash_sp_send_recv_attached]: 2.43e-06 [receive_attached]: 2.584e-05 [after_resolve]: 1.064e-05 [a_after_grad]: 8.60001e-06 [renormalize]: 0.00067135 [add_forward_monad_depend]: 4.56002e-06 [auto_monad_grad]: 2.44999e-06 [auto_monad_eliminator]: 2.584e-05 [cse]: 4.673e-05 [a_3]: 4.085e-05 [Cycle 2]: 0.00059987, [45] [expand_dump_flag]: 1.62999e-06 [switch_simplify]: 7.67998e-06 [loop_unroll]: 5.94e-06 [a_1]: 0.00012344 [with_stream_mark]: 1.06e-05 [recompute_prepare]: 5.81e-06 [updatestate_depend_eliminate]: 2.95998e-06 [updatestate_assign_eliminate]: 2.29999e-06 [updatestate_loads_eliminate]: 2.19001e-06 [parameter_eliminate]: 1.05999e-06 [a_2]: 6.432e-05 [accelerated_algorithm]: 5.65001e-06 [shard]: 1.10001e-06 [meta_shard_fg_expand]: 1.20001e-06 [shard_inline]: 5.55001e-06 [merge_send_recv]: 4.72e-06 [auto_parallel]: 5.81003e-06 [parallel]: 3.9e-06 [flash_sp]: 3.25e-06 [merge_comm]: 3.09999e-06 [allreduce_fusion]: 2.86e-06 [matmul_add_comm_reduction]: 5.59e-06 [allreduce_slice_to_reducescatter]: 4.40021e-07 [virtual_shard_identity]: 6.51999e-06 [virtual_dataset]: 5.69999e-06 [get_grad_eliminate_]: 5.40999e-06 [virtual_output]: 5.54e-06 [merge_forward]: 3.09001e-06 [cell_reuse_recompute_pass]: 1.54e-06 [offload_activation]: 6.26e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.164e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 8.32e-06 [set_forward_comm_id_for_comm_node_pass]: 3.31001e-06 [meta_fg_expand]: 1.81e-06 [flash_sp_send_recv_attached]: 1.00999e-06 [receive_attached]: 1.14e-06 [after_resolve]: 1.017e-05 [a_after_grad]: 8.30999e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.06002e-06 [auto_monad_grad]: 1.01002e-06 [auto_monad_eliminator]: 6.58e-06 [cse]: 1.203e-05 [a_3]: 3.251e-05 [py_interpret_to_execute_after_opt_a]: 3.75998e-06 [slice_cell_reuse_recomputed_activation]: 1.91e-06 [rewriter_after_opt_a]: 3.001e-05 [convert_after_rewriter]: 1.35001e-06 [order_py_execute_after_rewriter]: 1.41998e-06 [mutable_eliminate]: 0.00062996 [opt_b]: 0.00018414, [1] [Cycle 1]: 0.00017817, [7] [b_1]: 0.00011086 [b_2]: 7.23e-06 [updatestate_depend_eliminate]: 4.15999e-06 [updatestate_assign_eliminate]: 2.26998e-06 [updatestate_loads_eliminate]: 2.26e-06 [renormalize]: 8.2e-07 [cse]: 1.595e-05 [optimize_parallel_all_gather_comm]: 3.101e-05 [overlap_param_gather]: 1.6e-05 [cconv]: 2.402e-05 [loop_unroll]: 0.00043275 [opt_after_cconv]: 9.262e-05, [1] [Cycle 1]: 8.711e-05, [7] [c_1]: 2.762e-05 [parameter_eliminate]: 2.19999e-06 [updatestate_depend_eliminate]: 4.81002e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.14999e-06 [cse]: 1.581e-05 [renormalize]: 4.40021e-07 [remove_dup_value]: 1.315e-05 [tuple_transform]: 7.015e-05, [1] [Cycle 1]: 6.575e-05, [4] [d_1]: 4.07e-05 [none_parameter_eliminate]: 1.55001e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 6.36998e-06 [partial_unused_args_eliminate]: 1.97999e-06 [add_recomputation]: 6.099e-05 [cse_after_recomputation]: 1.997e-05, [1] [Cycle 1]: 1.56e-05, [1] [cse]: 1.051e-05 [environ_conv]: 2.195e-05 [swap_dp_allreduce_reducescatter]: 3.05e-05 [bias_add_comm_swap]: 1.408e-05 [label_micro_interleaved_index]: 1.608e-05 [label_fine_grained_interleaved_index]: 2.59001e-06 [merge_cast_opt]: 1.23002e-06 [slice_recompute_activation]: 2.27999e-06 [micro_interleaved_order_control]: 2.61e-06 [assign_add_opt]: 1.14e-06 [ForceFp32Comm]: 1.12e-06 [remove_cast_before_assign_add]: 1.428e-05 [full_micro_interleaved_order_control]: 1.416e-05 [reorder_send_recv_between_fp_bp]: 2.43e-06 [comm_op_add_attrs]: 1.06997e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.06002e-06 [interleave_parallel_branches]: 1.224e-05 [overlap_opt_shard_in_pipeline]: 1.964e-05 [overlap_opt_shard_grad_in_pipeline]: 1.99e-06 [control_data_broadcast_order]: 1.293e-05 [grouped_pairwise_exchange_alltoall]: 1.48002e-06 [offloading_packed_experts]: 4.08001e-06 [overlap_recompute_and_grad_model_parallel]: 1.64e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.28998e-06 [overlap_grad_ring_attention]: 2.782e-05 [overlap_grad_flash_sp]: 5.594e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 1.351e-05 [split_layernorm_comm]: 1.59998e-06 [handle_group_info]: 1.17e-06 [symbol_engine_optimizer]: 7.15e-05, [1] [Cycle 1]: 6.752e-05, [6] [build]: 2.63003e-06 [elim_shapecalc]: 9.72999e-06 [elim_not_effective]: 1.192e-05 [opt_reshape]: 6.26e-06 [fold_const_symbol]: 9.07001e-06 [renormalize]: 1.8999e-07 [detach_backward]: 2.02001e-06 [pipeline_parallel_scheduler]: 1.77999e-06 [auto_monad_reorder]: 2.164e-05 [get_jit_bprop_graph]: 1.77999e-06 [rewriter_after_jit_bprop_graph]: 3.9e-06 [opt_after_jit_grad]: 0.0004772 [validate]: 6.092e-05 [backend_pass]: 1.00999e-06 [task_emit]: 2.88763 [execute]: 1.23e-05 Sums bootstrap : 0.000929s : 0.03% type_inference : 0.023748s : 0.81% event_method : 0.000020s : 0.00% auto_monad : 0.000133s : 0.00% graph_reusing : 0.000004s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000035s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000018s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000055s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000220s : 0.01% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000085s : 0.00% optimize.opt_a.loop_unroll : 0.000039s : 0.00% optimize.opt_a.a_1 : 0.000742s : 0.03% optimize.opt_a.with_stream_mark : 0.000022s : 0.00% optimize.opt_a.recompute_prepare : 0.000013s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000018s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000017s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000138s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.00% optimize.opt_a.merge_send_recv : 0.000060s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000104s : 0.00% optimize.opt_a.flash_sp : 0.000046s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000017s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000012s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.00% optimize.opt_a.virtual_dataset : 0.000012s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.00% optimize.opt_a.virtual_output : 0.000011s : 0.00% optimize.opt_a.merge_forward : 0.000006s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000027s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000018s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000018s : 0.00% optimize.opt_a.meta_fg_expand : 0.000004s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000027s : 0.00% optimize.opt_a.after_resolve : 0.000021s : 0.00% optimize.opt_a.a_after_grad : 0.000017s : 0.00% optimize.opt_a.renormalize : 0.000671s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.00% optimize.opt_a.cse : 0.000059s : 0.00% optimize.opt_a.a_3 : 0.000073s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000030s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000630s : 0.02% optimize.opt_b.b_1 : 0.000111s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000016s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.00% optimize.overlap_param_gather : 0.000016s : 0.00% optimize.cconv : 0.000024s : 0.00% optimize.loop_unroll : 0.000433s : 0.01% optimize.opt_after_cconv.c_1 : 0.000028s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000016s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.00% optimize.tuple_transform.d_1 : 0.000041s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000061s : 0.00% optimize.cse_after_recomputation.cse : 0.000011s : 0.00% optimize.environ_conv : 0.000022s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000030s : 0.00% optimize.bias_add_comm_swap : 0.000014s : 0.00% optimize.label_micro_interleaved_index : 0.000016s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000014s : 0.00% optimize.full_micro_interleaved_order_control : 0.000014s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000012s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000020s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000016s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000028s : 0.00% optimize.overlap_grad_flash_sp : 0.000056s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000014s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000477s : 0.02% validate : 0.000061s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.887630s : 98.97% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 0.000185 29 1.05% : 0.000002s : 2: substitution.elim_not_effective 0.76% : 0.000001s : 2: substitution.fold_const_symbol 3.30% : 0.000006s : 4: substitution.graph_param_transform 73.44% : 0.000136s : 5: substitution.inline 1.69% : 0.000003s : 4: substitution.j_node_and_user_rematch 8.44% : 0.000016s : 4: substitution.remove_not_recompute_node 2.56% : 0.000005s : 4: substitution.replace_old_param 8.76% : 0.000016s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.023670 2 95.84% : 0.022686s : 1: type_inference.infer 4.16% : 0.000984s : 1: type_inference.specialize ------[replace.] 0.000061 9 63.65% : 0.000039s : 5: replace.inline 36.35% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000147 9 90.25% : 0.000133s : 5: match.inline 9.75% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000190 1345 0.92% : 0.000002s : 14: predicate.accumulaten_eliminater 0.78% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 0.98% : 0.000002s : 14: predicate.addn_zero_filter 0.87% : 0.000002s : 14: predicate.adjust_all_reduce_mul_add 2.07% : 0.000004s : 22: predicate.arithmetic_simplify 0.96% : 0.000002s : 14: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.53% : 0.000001s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 14: predicate.dict_get_item_const_eliminator 1.26% : 0.000002s : 14: predicate.dict_get_item_eliminator 1.24% : 0.000002s : 14: predicate.dict_set_item_eliminator 0.94% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.42% : 0.000003s : 18: predicate.environ_add_const_eliminate 1.13% : 0.000002s : 18: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 18: predicate.environ_get_depend_swap 1.68% : 0.000003s : 26: predicate.environ_get_eliminate 1.14% : 0.000002s : 18: predicate.environ_get_set_eliminate 1.58% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.23% : 0.000004s : 23: predicate.float_depend_g_call 0.49% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.30% : 0.000001s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 5.63% : 0.000011s : 61: predicate.inline 0.66% : 0.000001s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 8: predicate.less_batch_normalization 1.85% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.58% : 0.000005s : 40: predicate.load_eliminater 0.80% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.78% : 0.000005s : 41: predicate.loop_unroll_before_grad 1.64% : 0.000003s : 22: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 14: predicate.minmaximum_grad 0.85% : 0.000002s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 2.00% : 0.000004s : 23: predicate.partial_defer_inline 1.66% : 0.000003s : 22: predicate.partial_eliminate 0.94% : 0.000002s : 14: predicate.print_const_string_wrapper 0.56% : 0.000001s : 8: predicate.reduce_all_const_elim 1.23% : 0.000002s : 14: predicate.reduce_eliminate 2.67% : 0.000005s : 40: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.43% : 0.000003s : 26: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 1.02% : 0.000002s : 14: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.68% : 0.000001s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000001s : 8: predicate.special_op_eliminate 0.67% : 0.000001s : 8: predicate.specialize_transform 0.71% : 0.000001s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.67% : 0.000003s : 23: predicate.switch_defer_inline 2.20% : 0.000004s : 31: predicate.switch_layer_defer_inline 5.62% : 0.000011s : 76: predicate.switch_simplify 0.97% : 0.000002s : 14: predicate.tile_eliminate 0.97% : 0.000002s : 14: predicate.transpose_eliminate 1.59% : 0.000003s : 22: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000003s : 22: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000003s : 22: predicate.tuple_list_get_item_depend_reorder 2.96% : 0.000006s : 34: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 22: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000004s : 30: predicate.tuple_list_set_item_eliminator 1.82% : 0.000003s : 26: predicate.tuple_to_list_eliminator_ 2.53% : 0.000005s : 40: predicate.updatestate_pure_node_eliminater 3.23% : 0.000006s : 48: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.69% : 0.000001s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.33% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000595 12 49.84% : 0.000297s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.16% : 0.000298s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.941794 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.26% : 0.007782s : 1: add_attr 0.26% : 0.007766s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000065s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000140s : 1: auto_monad 0.00% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000017s : 1: bias_add_comm_swap 0.03% : 0.000995s : 1: bootstrap 0.00% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000023s : 1: cse_after_recomputation 0.00% : 0.000004s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000026s : 1: environ_conv 0.00% : 0.000026s : 1: event_method 0.00% : 0.000028s : 1: execute 0.00% : 0.000017s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000008s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000015s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000019s : 1: label_micro_interleaved_index 0.01% : 0.000440s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000639s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000013s : 1: opt.transform.mutable_eliminate 0.04% : 0.001181s : 78: opt.transform.opt_a 0.00% : 0.000026s : 1: opt.transform.opt_after_cconv 0.00% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000092s : 28: opt.transform.opt_b 0.00% : 0.000045s : 2: opt.transform.opt_trans_graph 0.00% : 0.000034s : 4: opt.transform.symbol_engine_opt 0.10% : 0.002921s : 1: opt_a 0.00% : 0.000096s : 1: opt_after_cconv 0.02% : 0.000489s : 1: opt_after_jit_grad 0.01% : 0.000188s : 1: opt_b 0.18% : 0.005367s : 1: optimize 0.00% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000060s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000031s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000023s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000019s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000020s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000060s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000017s : 1: remove_cast_before_assign_add 0.00% : 0.000016s : 1: remove_dup_value 0.01% : 0.000320s : 1: renormalize.infer 0.01% : 0.000344s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000034s : 1: rewriter_after_opt_a 0.01% : 0.000227s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000016s : 1: split_matmul_comm_elemetwise 0.00% : 0.000034s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000074s : 1: symbol_engine_optimizer 98.16% : 2.887792s : 1: task_emit 0.00% : 0.000073s : 1: tuple_transform 0.81% : 0.023773s : 1: type_inference 0.00% : 0.000090s : 1: validate Start Forward Testing, error_mode 'cycle', flip_mode 'bitflip', flip_probability '0.0', ele_pos '0' Start Forward Testing, error_mode 'cycle', flip_mode 'bitflip', flip_probability '0.0', ele_pos '0' TotalTime = 0.117439, [24] [bootstrap]: 0.00079124 [type_inference]: 0.06754 [event_method]: 0.00025091 [auto_monad]: 0.00015162 [graph_reusing]: 8.90001e-06 [inline]: 2.04e-06 [add_attr]: 0.00401444, [1] [add_attr_with_inline]: 0.00400608, [1] [Cycle 1]: 0.00011203, [2] [tag_attr]: 4.363e-05 [meta_addattr_fg_expand]: 1.284e-05 [parallel-infer-symbol]: 3.46001e-06 [pre_auto_parallel]: 6.025e-05 [insert-virtual-dataset]: 3.16001e-06 [parallel-infer-symbol-second]: 1.05001e-06 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.0320766, [53] [py_interpret_to_execute]: 5.12e-06 [rewriter_before_opt_a]: 0.00044575 [opt_a]: 0.0293299, [3] [Cycle 1]: 0.0246433, [45] [expand_dump_flag]: 4.34002e-06 [switch_simplify]: 0.00016475 [loop_unroll]: 7.682e-05 [a_1]: 0.00160876 [with_stream_mark]: 2.254e-05 [recompute_prepare]: 2.506e-05 [updatestate_depend_eliminate]: 9.04e-06 [updatestate_assign_eliminate]: 7.71999e-06 [updatestate_loads_eliminate]: 7.21999e-06 [parameter_eliminate]: 2.31e-06 [a_2]: 0.00031085 [accelerated_algorithm]: 1.917e-05 [shard]: 1.65001e-06 [meta_shard_fg_expand]: 4.99e-06 [shard_inline]: 1.809e-05 [merge_send_recv]: 1.832e-05 [auto_parallel]: 1.225e-05 [parallel]: 8.587e-05 [flash_sp]: 1.186e-05 [merge_comm]: 1.053e-05 [allreduce_fusion]: 9.32999e-06 [matmul_add_comm_reduction]: 2.69e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 2.135e-05 [virtual_dataset]: 1.875e-05 [get_grad_eliminate_]: 1.833e-05 [virtual_output]: 1.8e-05 [merge_forward]: 8.74e-06 [cell_reuse_recompute_pass]: 1.26002e-06 [offload_activation]: 1.888e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.986e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 3.011e-05 [set_forward_comm_id_for_comm_node_pass]: 9.52001e-06 [meta_fg_expand]: 0.00212488 [flash_sp_send_recv_attached]: 3.99002e-06 [receive_attached]: 2.45002e-06 [after_resolve]: 8.587e-05 [a_after_grad]: 0.00011041 [renormalize]: 0.0182761 [add_forward_monad_depend]: 1.315e-05 [auto_monad_grad]: 7.73999e-06 [auto_monad_eliminator]: 6.714e-05 [cse]: 0.00052163 [a_3]: 0.00046363 [Cycle 2]: 0.00392234, [45] [expand_dump_flag]: 2.50002e-06 [switch_simplify]: 5.951e-05 [loop_unroll]: 5.668e-05 [a_1]: 0.00151712 [with_stream_mark]: 1.558e-05 [recompute_prepare]: 1.078e-05 [updatestate_depend_eliminate]: 3.44001e-06 [updatestate_assign_eliminate]: 2.73e-06 [updatestate_loads_eliminate]: 2.56e-06 [parameter_eliminate]: 1.37e-06 [a_2]: 0.00010147 [accelerated_algorithm]: 9.74999e-06 [shard]: 2.09e-06 [meta_shard_fg_expand]: 2.69999e-06 [shard_inline]: 8.17e-06 [merge_send_recv]: 9.75002e-06 [auto_parallel]: 9.82001e-06 [parallel]: 9.59e-06 [flash_sp]: 4.2e-06 [merge_comm]: 3.42002e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 9.05999e-06 [allreduce_slice_to_reducescatter]: 8.59989e-07 [virtual_shard_identity]: 1.088e-05 [virtual_dataset]: 8.62e-06 [get_grad_eliminate_]: 8.14002e-06 [virtual_output]: 8.34998e-06 [merge_forward]: 3.54002e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 9.96e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.413e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 1.221e-05 [set_forward_comm_id_for_comm_node_pass]: 4.06001e-06 [meta_fg_expand]: 0.0001506 [flash_sp_send_recv_attached]: 1.95001e-06 [receive_attached]: 2.71999e-06 [after_resolve]: 1.085e-05 [a_after_grad]: 1.252e-05 [renormalize]: 0.00146067 [add_forward_monad_depend]: 4.28001e-06 [auto_monad_grad]: 1.40999e-06 [auto_monad_eliminator]: 1.222e-05 [cse]: 2.756e-05 [a_3]: 5.913e-05 [Cycle 3]: 0.00074746, [45] [expand_dump_flag]: 1.42999e-06 [switch_simplify]: 9.65002e-06 [loop_unroll]: 8.2e-06 [a_1]: 0.00018341 [with_stream_mark]: 8.42e-06 [recompute_prepare]: 8e-06 [updatestate_depend_eliminate]: 3.51001e-06 [updatestate_assign_eliminate]: 2.84001e-06 [updatestate_loads_eliminate]: 2.54001e-06 [parameter_eliminate]: 9.50007e-07 [a_2]: 9.746e-05 [accelerated_algorithm]: 8.43001e-06 [shard]: 1.04e-06 [meta_shard_fg_expand]: 1.52001e-06 [shard_inline]: 8e-06 [merge_send_recv]: 5.44e-06 [auto_parallel]: 6.78998e-06 [parallel]: 4.37e-06 [flash_sp]: 1.16002e-06 [merge_comm]: 3.39001e-06 [allreduce_fusion]: 3.14999e-06 [matmul_add_comm_reduction]: 5.77001e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 8.81002e-06 [virtual_dataset]: 7.66001e-06 [get_grad_eliminate_]: 7.50998e-06 [virtual_output]: 7.69997e-06 [merge_forward]: 3.26999e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 7.00998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.416e-05 [merge_recompute_call_nodes]: 8.39995e-07 [before_grad]: 1.098e-05 [set_forward_comm_id_for_comm_node_pass]: 3.34001e-06 [meta_fg_expand]: 2.39999e-06 [flash_sp_send_recv_attached]: 1.27e-06 [receive_attached]: 1.25999e-06 [after_resolve]: 7.69002e-06 [a_after_grad]: 1.227e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.05001e-06 [auto_monad_grad]: 7.29982e-07 [auto_monad_eliminator]: 6.49001e-06 [cse]: 2.052e-05 [a_3]: 4.908e-05 [py_interpret_to_execute_after_opt_a]: 4.96002e-06 [slice_cell_reuse_recomputed_activation]: 2.43002e-06 [rewriter_after_opt_a]: 1.917e-05 [convert_after_rewriter]: 1.23002e-06 [order_py_execute_after_rewriter]: 9.80013e-07 [mutable_eliminate]: 0.00063771 [opt_b]: 0.00027308, [1] [Cycle 1]: 0.00026563, [7] [b_1]: 0.00018031 [b_2]: 1.033e-05 [updatestate_depend_eliminate]: 5.94e-06 [updatestate_assign_eliminate]: 2.70997e-06 [updatestate_loads_eliminate]: 2.54001e-06 [renormalize]: 5.50004e-07 [cse]: 2.695e-05 [optimize_parallel_all_gather_comm]: 1.575e-05 [overlap_param_gather]: 2.61e-06 [cconv]: 1.997e-05 [loop_unroll]: 0.00045811 [opt_after_cconv]: 0.00014129, [1] [Cycle 1]: 0.00011621, [7] [c_1]: 4.165e-05 [parameter_eliminate]: 2.59001e-06 [updatestate_depend_eliminate]: 5.51002e-06 [updatestate_assign_eliminate]: 2.66e-06 [updatestate_loads_eliminate]: 2.62001e-06 [cse]: 2.743e-05 [renormalize]: 4.2998e-07 [remove_dup_value]: 4.396e-05 [tuple_transform]: 9.934e-05, [1] [Cycle 1]: 9.391e-05, [4] [d_1]: 5.604e-05 [none_parameter_eliminate]: 1.30001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 1.269e-05 [partial_unused_args_eliminate]: 1.27e-06 [add_recomputation]: 7.774e-05 [cse_after_recomputation]: 3.268e-05, [1] [Cycle 1]: 2.775e-05, [1] [cse]: 2.065e-05 [environ_conv]: 8.25999e-06 [swap_dp_allreduce_reducescatter]: 7.24001e-06 [bias_add_comm_swap]: 2.63003e-06 [label_micro_interleaved_index]: 4.74998e-06 [label_fine_grained_interleaved_index]: 2.63003e-06 [merge_cast_opt]: 1.44e-06 [slice_recompute_activation]: 1.99e-06 [micro_interleaved_order_control]: 2.26e-06 [assign_add_opt]: 1.38002e-06 [ForceFp32Comm]: 1.12e-06 [remove_cast_before_assign_add]: 1.34998e-06 [full_micro_interleaved_order_control]: 2.74999e-06 [reorder_send_recv_between_fp_bp]: 2.79001e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 1.19e-06 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.72999e-06 [overlap_opt_shard_in_pipeline]: 2.21e-05 [overlap_opt_shard_grad_in_pipeline]: 1.86e-06 [control_data_broadcast_order]: 1.452e-05 [grouped_pairwise_exchange_alltoall]: 1.49e-06 [offloading_packed_experts]: 3.77002e-06 [overlap_recompute_and_grad_model_parallel]: 5.07999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47001e-06 [overlap_recompute_comm]: 2.22999e-06 [overlap_grad_ring_attention]: 4.85999e-06 [overlap_grad_flash_sp]: 2.048e-05 [begin_end_overlap_inline]: 4.60015e-07 [split_matmul_comm_elemetwise]: 2.12001e-06 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 8.734e-05, [1] [Cycle 1]: 8.048e-05, [6] [build]: 2.52001e-06 [elim_shapecalc]: 1.322e-05 [elim_not_effective]: 1.548e-05 [opt_reshape]: 8.69998e-06 [fold_const_symbol]: 1.189e-05 [renormalize]: 1.99972e-07 [detach_backward]: 2.37001e-06 [pipeline_parallel_scheduler]: 1.82001e-06 [auto_monad_reorder]: 1.591e-05 [get_jit_bprop_graph]: 1.49e-06 [rewriter_after_jit_bprop_graph]: 3.58999e-06 [opt_after_jit_grad]: 0.00051274 [validate]: 4.585e-05 [backend_pass]: 6.80011e-07 [task_emit]: 0.0116651 [execute]: 8.50001e-06 Sums bootstrap : 0.000791s : 0.71% type_inference : 0.067540s : 60.31% event_method : 0.000251s : 0.22% auto_monad : 0.000152s : 0.14% graph_reusing : 0.000009s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000044s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000060s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000446s : 0.40% optimize.opt_a.expand_dump_flag : 0.000008s : 0.01% optimize.opt_a.switch_simplify : 0.000234s : 0.21% optimize.opt_a.loop_unroll : 0.000142s : 0.13% optimize.opt_a.a_1 : 0.003309s : 2.95% optimize.opt_a.with_stream_mark : 0.000047s : 0.04% optimize.opt_a.recompute_prepare : 0.000044s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000016s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000013s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000012s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000510s : 0.46% optimize.opt_a.accelerated_algorithm : 0.000037s : 0.03% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000009s : 0.01% optimize.opt_a.shard_inline : 0.000034s : 0.03% optimize.opt_a.merge_send_recv : 0.000034s : 0.03% optimize.opt_a.auto_parallel : 0.000029s : 0.03% optimize.opt_a.parallel : 0.000100s : 0.09% optimize.opt_a.flash_sp : 0.000017s : 0.02% optimize.opt_a.merge_comm : 0.000017s : 0.02% optimize.opt_a.allreduce_fusion : 0.000016s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000042s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000041s : 0.04% optimize.opt_a.virtual_dataset : 0.000035s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000034s : 0.03% optimize.opt_a.virtual_output : 0.000034s : 0.03% optimize.opt_a.merge_forward : 0.000016s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000036s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000058s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000053s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000017s : 0.02% optimize.opt_a.meta_fg_expand : 0.002278s : 2.03% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.01% optimize.opt_a.receive_attached : 0.000006s : 0.01% optimize.opt_a.after_resolve : 0.000104s : 0.09% optimize.opt_a.a_after_grad : 0.000135s : 0.12% optimize.opt_a.renormalize : 0.019737s : 17.62% optimize.opt_a.add_forward_monad_depend : 0.000018s : 0.02% optimize.opt_a.auto_monad_grad : 0.000010s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000086s : 0.08% optimize.opt_a.cse : 0.000570s : 0.51% optimize.opt_a.a_3 : 0.000572s : 0.51% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000019s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000638s : 0.57% optimize.opt_b.b_1 : 0.000180s : 0.16% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.01% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000020s : 0.02% optimize.loop_unroll : 0.000458s : 0.41% optimize.opt_after_cconv.c_1 : 0.000042s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000044s : 0.04% optimize.tuple_transform.d_1 : 0.000056s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000013s : 0.01% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000078s : 0.07% optimize.cse_after_recomputation.cse : 0.000021s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000002s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000022s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000020s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000016s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000513s : 0.46% validate : 0.000046s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.011665s : 10.42% execute : 0.000009s : 0.01% Time group info: ------[substitution.] 0.000794 162 0.23% : 0.000002s : 2: substitution.elim_not_effective 1.12% : 0.000009s : 11: substitution.float_depend_g_call 0.52% : 0.000004s : 2: substitution.float_tuple_getitem_switch 0.20% : 0.000002s : 2: substitution.fold_const_symbol 0.81% : 0.000006s : 5: substitution.graph_param_transform 0.42% : 0.000003s : 2: substitution.incorporate_call 0.24% : 0.000002s : 2: substitution.incorporate_call_switch 66.11% : 0.000525s : 20: substitution.inline 2.57% : 0.000020s : 2: substitution.inline_without_move 1.19% : 0.000009s : 12: substitution.j_node_and_user_rematch 1.44% : 0.000011s : 7: substitution.minmaximum_grad 2.54% : 0.000020s : 11: substitution.partial_eliminate 1.46% : 0.000012s : 12: substitution.remove_not_recompute_node 3.21% : 0.000025s : 9: substitution.replace_applicator 1.07% : 0.000008s : 15: substitution.replace_old_param 0.35% : 0.000003s : 1: substitution.set_cell_output_no_recompute 3.10% : 0.000025s : 3: substitution.switch_simplify 2.77% : 0.000022s : 7: substitution.tuple_list_convert_item_index_to_positive 1.28% : 0.000010s : 7: substitution.tuple_list_get_item_const_eliminator 1.80% : 0.000014s : 7: substitution.tuple_list_get_item_depend_reorder 5.75% : 0.000046s : 16: substitution.tuple_list_get_item_eliminator 1.81% : 0.000014s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.067440 2 93.92% : 0.063339s : 1: type_inference.infer 6.08% : 0.004101s : 1: type_inference.specialize ------[replace.] 0.000298 30 61.69% : 0.000184s : 20: replace.inline 12.36% : 0.000037s : 3: replace.switch_simplify 25.95% : 0.000077s : 7: replace.tuple_list_get_item_eliminator ------[match.] 0.000558 30 92.17% : 0.000514s : 20: match.inline 4.07% : 0.000023s : 3: match.switch_simplify 3.77% : 0.000021s : 7: match.tuple_list_get_item_eliminator ------[predicate.] 0.000744 5505 1.14% : 0.000008s : 67: predicate.accumulaten_eliminater 0.21% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.48% : 0.000004s : 29: predicate.addn_check_dump 1.06% : 0.000008s : 67: predicate.addn_zero_filter 1.05% : 0.000008s : 67: predicate.adjust_all_reduce_mul_add 2.23% : 0.000017s : 96: predicate.arithmetic_simplify 1.06% : 0.000008s : 67: predicate.cast_eliminate 1.29% : 0.000010s : 74: predicate.check_bprop_eliminate 0.52% : 0.000004s : 29: predicate.compare_switch_simplify 0.08% : 0.000001s : 6: predicate.const_output_eliminate 0.47% : 0.000004s : 29: predicate.depend_value_elim 1.17% : 0.000009s : 67: predicate.dict_get_item_const_eliminator 1.39% : 0.000010s : 67: predicate.dict_get_item_eliminator 1.03% : 0.000008s : 67: predicate.dict_set_item_eliminator 0.32% : 0.000002s : 11: predicate.dumpgradient_eliminate 0.08% : 0.000001s : 5: predicate.elim_not_effective 0.10% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000009s : 73: predicate.environ_add_const_eliminate 1.09% : 0.000008s : 73: predicate.environ_get_add_eliminate 1.06% : 0.000008s : 73: predicate.environ_get_depend_swap 1.59% : 0.000012s : 102: predicate.environ_get_eliminate 1.10% : 0.000008s : 73: predicate.environ_get_set_eliminate 1.55% : 0.000012s : 94: predicate.exchange_switch_depend_value 2.08% : 0.000015s : 94: predicate.float_depend_g_call 0.48% : 0.000004s : 29: predicate.float_environ_get_switch 0.57% : 0.000004s : 35: predicate.float_tuple_getitem_switch 0.06% : 0.000000s : 5: predicate.fold_const_symbol 0.53% : 0.000004s : 29: predicate.get_grad_eliminate 0.09% : 0.000001s : 5: predicate.graph_param_transform 4.40% : 0.000033s : 29: predicate.incorporate_call 0.43% : 0.000003s : 29: predicate.incorporate_call_switch 4.84% : 0.000036s : 231: predicate.inline 1.39% : 0.000010s : 65: predicate.inline_without_move 0.29% : 0.000002s : 29: predicate.j_node_and_user_rematch 0.68% : 0.000005s : 29: predicate.less_batch_normalization 1.43% : 0.000011s : 85: predicate.list_to_tuple_eliminator_ 2.33% : 0.000017s : 153: predicate.load_eliminater 0.25% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.68% : 0.000020s : 153: predicate.loop_unroll_before_grad 1.27% : 0.000009s : 79: predicate.make_slice_get_slice_eliminator 0.52% : 0.000004s : 29: predicate.merge_addn 1.17% : 0.000009s : 74: predicate.micro_step_allgather_replace 1.18% : 0.000009s : 74: predicate.mini_step_allgather_replace 1.00% : 0.000007s : 67: predicate.minmaximum_grad 0.25% : 0.000002s : 6: predicate.mutable_eliminate 0.10% : 0.000001s : 5: predicate.opt_reshape 0.16% : 0.000001s : 6: predicate.parallel_virtual_node 2.02% : 0.000015s : 94: predicate.partial_defer_inline 1.43% : 0.000011s : 80: predicate.partial_eliminate 1.09% : 0.000008s : 67: predicate.print_const_string_wrapper 0.51% : 0.000004s : 29: predicate.reduce_all_const_elim 1.41% : 0.000010s : 67: predicate.reduce_eliminate 2.31% : 0.000017s : 153: predicate.redundant_stop_gradient_eliminater 0.30% : 0.000002s : 29: predicate.remove_not_recompute_node 2.08% : 0.000015s : 148: predicate.replace_applicator 0.72% : 0.000005s : 65: predicate.replace_old_param 0.10% : 0.000001s : 6: predicate.reset_defer_inline 1.07% : 0.000008s : 67: predicate.reshape_eliminate 1.27% : 0.000009s : 74: predicate.row_tensor_add_zeros_like 0.14% : 0.000001s : 6: predicate.row_tensor_eliminate 1.53% : 0.000011s : 74: predicate.same_eliminate 0.39% : 0.000003s : 29: predicate.set_cell_output_no_recompute 0.62% : 0.000005s : 29: predicate.shard_identity_eliminate 0.27% : 0.000002s : 11: predicate.special_op_eliminate 0.50% : 0.000004s : 29: predicate.specialize_transform 1.30% : 0.000010s : 74: predicate.split_environ_get_set_with_tuple_value 1.36% : 0.000010s : 65: predicate.stack_unstack_eliminate 0.10% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.72% : 0.000013s : 94: predicate.switch_defer_inline 2.87% : 0.000021s : 168: predicate.switch_layer_defer_inline 5.24% : 0.000039s : 287: predicate.switch_simplify 1.08% : 0.000008s : 67: predicate.tile_eliminate 1.08% : 0.000008s : 67: predicate.transpose_eliminate 1.36% : 0.000010s : 78: predicate.tuple_list_convert_item_index_to_positive 1.38% : 0.000010s : 78: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000011s : 78: predicate.tuple_list_get_item_depend_reorder 2.37% : 0.000018s : 114: predicate.tuple_list_get_item_eliminator 1.55% : 0.000012s : 78: predicate.tuple_list_get_set_item_eliminator 1.99% : 0.000015s : 107: predicate.tuple_list_set_item_eliminator 1.44% : 0.000011s : 85: predicate.tuple_to_list_eliminator_ 2.31% : 0.000017s : 153: predicate.updatestate_pure_node_eliminater 2.81% : 0.000021s : 182: predicate.updatestate_useless_node_eliminater 0.10% : 0.000001s : 6: predicate.value_based_eliminate 0.53% : 0.000004s : 29: predicate.virtual_dataset_eliminate 0.51% : 0.000004s : 29: predicate.virtual_output_eliminate 0.08% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.17% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004610 52 63.80% : 0.002941s : 28: func_graph_cloner_run.FuncGraphClonerGraph 36.20% : 0.001668s : 24: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.178751 237 0.00% : 0.000004s : 1: ForceFp32Comm 2.25% : 0.004019s : 1: add_attr 2.24% : 0.004010s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000083s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.09% : 0.000163s : 1: auto_monad 0.01% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.47% : 0.000846s : 1: bootstrap 0.01% : 0.000024s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000036s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.15% : 0.000263s : 1: event_method 0.01% : 0.000016s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000008s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.26% : 0.000467s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000008s : 1: micro_interleaved_order_control 0.36% : 0.000647s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000017s : 1: opt.transform.mutable_eliminate 2.93% : 0.005233s : 117: opt.transform.opt_a 0.02% : 0.000041s : 1: opt.transform.opt_after_cconv 0.02% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.09% : 0.000161s : 28: opt.transform.opt_b 0.04% : 0.000065s : 2: opt.transform.opt_trans_graph 0.02% : 0.000044s : 4: opt.transform.symbol_engine_opt 16.41% : 0.029334s : 1: opt_a 0.08% : 0.000146s : 1: opt_after_cconv 0.29% : 0.000523s : 1: opt_after_jit_grad 0.15% : 0.000277s : 1: opt_b 17.95% : 0.032084s : 1: optimize 0.01% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000024s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000026s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.04% : 0.000065s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000049s : 1: remove_dup_value 8.86% : 0.015843s : 2: renormalize.infer 2.17% : 0.003871s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000023s : 1: rewriter_after_opt_a 0.25% : 0.000452s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000090s : 1: symbol_engine_optimizer 6.54% : 0.011687s : 1: task_emit 0.06% : 0.000103s : 1: tuple_transform 37.80% : 0.067560s : 1: type_inference 0.04% : 0.000077s : 1: validate TotalTime = 7.12784, [24] [bootstrap]: 0.00083125 [type_inference]: 0.0230998 [event_method]: 2.249e-05 [auto_monad]: 0.00013896 [graph_reusing]: 6.19999e-06 [inline]: 2.17999e-06 [add_attr]: 0.00780754, [1] [add_attr_with_inline]: 0.00779194, [1] [Cycle 1]: 0.00012978, [2] [tag_attr]: 3.263e-05 [meta_addattr_fg_expand]: 1.378e-05 [parallel-infer-symbol]: 3.28e-06 [pre_auto_parallel]: 5.221e-05 [insert-virtual-dataset]: 2.48e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.93e-06 [pipeline_split]: 1.77001e-06 [optimize]: 0.0053918, [53] [py_interpret_to_execute]: 4.18001e-06 [rewriter_before_opt_a]: 0.00023348 [opt_a]: 0.00297493, [2] [Cycle 1]: 0.00236433, [45] [expand_dump_flag]: 3.4e-06 [switch_simplify]: 7.386e-05 [loop_unroll]: 3.313e-05 [a_1]: 0.00063759 [with_stream_mark]: 1.65e-05 [recompute_prepare]: 7.6e-06 [updatestate_depend_eliminate]: 1.273e-05 [updatestate_assign_eliminate]: 1.008e-05 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 1.92999e-06 [a_2]: 7.533e-05 [accelerated_algorithm]: 6.23e-06 [shard]: 1.67001e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 5.79999e-06 [merge_send_recv]: 4.206e-05 [auto_parallel]: 6.39999e-06 [parallel]: 8.251e-05 [flash_sp]: 3.124e-05 [merge_comm]: 3.66999e-06 [allreduce_fusion]: 1.093e-05 [matmul_add_comm_reduction]: 1.726e-05 [allreduce_slice_to_reducescatter]: 7.70998e-06 [virtual_shard_identity]: 7.97e-06 [virtual_dataset]: 6.11e-06 [get_grad_eliminate_]: 5.95002e-06 [virtual_output]: 5.87001e-06 [merge_forward]: 3.88999e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 1.769e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.998e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 9.67999e-06 [set_forward_comm_id_for_comm_node_pass]: 1.091e-05 [meta_fg_expand]: 2.76999e-06 [flash_sp_send_recv_attached]: 2.58998e-06 [receive_attached]: 1.712e-05 [after_resolve]: 1.047e-05 [a_after_grad]: 1.08e-05 [renormalize]: 0.00077974 [add_forward_monad_depend]: 5.55001e-06 [auto_monad_grad]: 2.74999e-06 [auto_monad_eliminator]: 2.432e-05 [cse]: 4.66e-05 [a_3]: 4.266e-05 [Cycle 2]: 0.00059957, [45] [expand_dump_flag]: 1.08001e-06 [switch_simplify]: 6.84999e-06 [loop_unroll]: 5.41002e-06 [a_1]: 0.0001214 [with_stream_mark]: 1.158e-05 [recompute_prepare]: 5.52001e-06 [updatestate_depend_eliminate]: 2.79001e-06 [updatestate_assign_eliminate]: 2.35002e-06 [updatestate_loads_eliminate]: 2.53e-06 [parameter_eliminate]: 1.35001e-06 [a_2]: 6.422e-05 [accelerated_algorithm]: 5.57999e-06 [shard]: 1.15999e-06 [meta_shard_fg_expand]: 1.49e-06 [shard_inline]: 5.69e-06 [merge_send_recv]: 4.23999e-06 [auto_parallel]: 5.86e-06 [parallel]: 5.20001e-06 [flash_sp]: 2.93e-06 [merge_comm]: 2.76e-06 [allreduce_fusion]: 3.28998e-06 [matmul_add_comm_reduction]: 5.49e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 6.38e-06 [virtual_dataset]: 5.40001e-06 [get_grad_eliminate_]: 5.35999e-06 [virtual_output]: 5.31002e-06 [merge_forward]: 2.63e-06 [cell_reuse_recompute_pass]: 2.09e-06 [offload_activation]: 6.96999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.156e-05 [merge_recompute_call_nodes]: 1.08001e-06 [before_grad]: 8.25e-06 [set_forward_comm_id_for_comm_node_pass]: 2.73e-06 [meta_fg_expand]: 1.70001e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 1.60999e-06 [after_resolve]: 1.071e-05 [a_after_grad]: 8.54e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.48002e-06 [auto_monad_grad]: 9.29984e-07 [auto_monad_eliminator]: 6.12999e-06 [cse]: 1.269e-05 [a_3]: 3.149e-05 [py_interpret_to_execute_after_opt_a]: 4.90999e-06 [slice_cell_reuse_recomputed_activation]: 2.09999e-06 [rewriter_after_opt_a]: 2.791e-05 [convert_after_rewriter]: 1.34e-06 [order_py_execute_after_rewriter]: 1.27999e-06 [mutable_eliminate]: 0.00062043 [opt_b]: 0.00018588, [1] [Cycle 1]: 0.00017943, [7] [b_1]: 0.00011136 [b_2]: 7.02002e-06 [updatestate_depend_eliminate]: 5.24e-06 [updatestate_assign_eliminate]: 2.40002e-06 [updatestate_loads_eliminate]: 2.16e-06 [renormalize]: 9.30013e-07 [cse]: 1.693e-05 [optimize_parallel_all_gather_comm]: 2.645e-05 [overlap_param_gather]: 1.112e-05 [cconv]: 2.722e-05 [loop_unroll]: 0.0004409 [opt_after_cconv]: 9.62e-05, [1] [Cycle 1]: 9.055e-05, [7] [c_1]: 2.857e-05 [parameter_eliminate]: 2.87002e-06 [updatestate_depend_eliminate]: 4.81002e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.12999e-06 [cse]: 1.62e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.413e-05 [tuple_transform]: 7.286e-05, [1] [Cycle 1]: 6.809e-05, [4] [d_1]: 4.178e-05 [none_parameter_eliminate]: 1.50001e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 6.93e-06 [partial_unused_args_eliminate]: 1.72001e-06 [add_recomputation]: 6.257e-05 [cse_after_recomputation]: 2.059e-05, [1] [Cycle 1]: 1.62e-05, [1] [cse]: 1.115e-05 [environ_conv]: 2.015e-05 [swap_dp_allreduce_reducescatter]: 2.421e-05 [bias_add_comm_swap]: 1.002e-05 [label_micro_interleaved_index]: 1.254e-05 [label_fine_grained_interleaved_index]: 2.83e-06 [merge_cast_opt]: 1.55001e-06 [slice_recompute_activation]: 2.66e-06 [micro_interleaved_order_control]: 2.27001e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 8.80013e-07 [remove_cast_before_assign_add]: 8.90999e-06 [full_micro_interleaved_order_control]: 1.02e-05 [reorder_send_recv_between_fp_bp]: 2.74999e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.37999e-06 [interleave_parallel_branches]: 8.94e-06 [overlap_opt_shard_in_pipeline]: 2.281e-05 [overlap_opt_shard_grad_in_pipeline]: 2.02999e-06 [control_data_broadcast_order]: 1.352e-05 [grouped_pairwise_exchange_alltoall]: 1.60999e-06 [offloading_packed_experts]: 4e-06 [overlap_recompute_and_grad_model_parallel]: 1.293e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.27e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49998e-06 [overlap_recompute_comm]: 2.21e-06 [overlap_grad_ring_attention]: 1.901e-05 [overlap_grad_flash_sp]: 4.295e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 9.90002e-06 [split_layernorm_comm]: 1.62001e-06 [handle_group_info]: 1.07e-06 [symbol_engine_optimizer]: 7.48e-05, [1] [Cycle 1]: 6.987e-05, [6] [build]: 2.83998e-06 [elim_shapecalc]: 9.67001e-06 [elim_not_effective]: 1.289e-05 [opt_reshape]: 6.48003e-06 [fold_const_symbol]: 9.34998e-06 [renormalize]: 1.60013e-07 [detach_backward]: 2.27001e-06 [pipeline_parallel_scheduler]: 1.66e-06 [auto_monad_reorder]: 2.256e-05 [get_jit_bprop_graph]: 2.31998e-06 [rewriter_after_jit_bprop_graph]: 3.58999e-06 [opt_after_jit_grad]: 0.00049481 [validate]: 6.172e-05 [backend_pass]: 9.79984e-07 [task_emit]: 7.08937 [execute]: 8.95999e-06 Sums bootstrap : 0.000831s : 0.01% type_inference : 0.023100s : 0.32% event_method : 0.000022s : 0.00% auto_monad : 0.000139s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000033s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000052s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000233s : 0.00% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000081s : 0.00% optimize.opt_a.loop_unroll : 0.000039s : 0.00% optimize.opt_a.a_1 : 0.000759s : 0.01% optimize.opt_a.with_stream_mark : 0.000028s : 0.00% optimize.opt_a.recompute_prepare : 0.000013s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000012s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000140s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.00% optimize.opt_a.merge_send_recv : 0.000046s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000088s : 0.00% optimize.opt_a.flash_sp : 0.000034s : 0.00% optimize.opt_a.merge_comm : 0.000006s : 0.00% optimize.opt_a.allreduce_fusion : 0.000014s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000008s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.00% optimize.opt_a.virtual_dataset : 0.000012s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.00% optimize.opt_a.virtual_output : 0.000011s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000025s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000018s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000014s : 0.00% optimize.opt_a.meta_fg_expand : 0.000004s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000019s : 0.00% optimize.opt_a.after_resolve : 0.000021s : 0.00% optimize.opt_a.a_after_grad : 0.000019s : 0.00% optimize.opt_a.renormalize : 0.000780s : 0.01% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.00% optimize.opt_a.cse : 0.000059s : 0.00% optimize.opt_a.a_3 : 0.000074s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000620s : 0.01% optimize.opt_b.b_1 : 0.000111s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.00% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000027s : 0.00% optimize.loop_unroll : 0.000441s : 0.01% optimize.opt_after_cconv.c_1 : 0.000029s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000016s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.00% optimize.tuple_transform.d_1 : 0.000042s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000063s : 0.00% optimize.cse_after_recomputation.cse : 0.000011s : 0.00% optimize.environ_conv : 0.000020s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000024s : 0.00% optimize.bias_add_comm_swap : 0.000010s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000023s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.00% optimize.overlap_grad_flash_sp : 0.000043s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000495s : 0.01% validate : 0.000062s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 7.089369s : 99.59% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000196 29 0.90% : 0.000002s : 2: substitution.elim_not_effective 0.71% : 0.000001s : 2: substitution.fold_const_symbol 3.01% : 0.000006s : 4: substitution.graph_param_transform 77.55% : 0.000152s : 5: substitution.inline 1.74% : 0.000003s : 4: substitution.j_node_and_user_rematch 6.38% : 0.000013s : 4: substitution.remove_not_recompute_node 2.37% : 0.000005s : 4: substitution.replace_old_param 7.34% : 0.000014s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.023012 2 95.82% : 0.022050s : 1: type_inference.infer 4.18% : 0.000962s : 1: type_inference.specialize ------[replace.] 0.000064 9 65.75% : 0.000042s : 5: replace.inline 34.25% : 0.000022s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000162 9 92.20% : 0.000150s : 5: match.inline 7.80% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000195 1345 0.91% : 0.000002s : 14: predicate.accumulaten_eliminater 0.83% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 0.98% : 0.000002s : 14: predicate.addn_zero_filter 0.87% : 0.000002s : 14: predicate.adjust_all_reduce_mul_add 1.99% : 0.000004s : 22: predicate.arithmetic_simplify 0.92% : 0.000002s : 14: predicate.cast_eliminate 0.64% : 0.000001s : 8: predicate.check_bprop_eliminate 0.46% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.52% : 0.000001s : 8: predicate.depend_value_elim 0.96% : 0.000002s : 14: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 14: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 14: predicate.dict_set_item_eliminator 1.11% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.34% : 0.000001s : 4: predicate.elim_not_effective 0.62% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000002s : 18: predicate.environ_add_const_eliminate 1.06% : 0.000002s : 18: predicate.environ_get_add_eliminate 1.18% : 0.000002s : 18: predicate.environ_get_depend_swap 1.72% : 0.000003s : 26: predicate.environ_get_eliminate 1.10% : 0.000002s : 18: predicate.environ_get_set_eliminate 1.54% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.39% : 0.000005s : 23: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.72% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.71% : 0.000001s : 8: predicate.get_grad_eliminate 0.25% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 5.77% : 0.000011s : 61: predicate.inline 0.75% : 0.000001s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.70% : 0.000001s : 8: predicate.less_batch_normalization 1.88% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.52% : 0.000005s : 40: predicate.load_eliminater 0.85% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.86% : 0.000006s : 41: predicate.loop_unroll_before_grad 1.61% : 0.000003s : 22: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 14: predicate.minmaximum_grad 1.05% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 1.99% : 0.000004s : 23: predicate.partial_defer_inline 1.59% : 0.000003s : 22: predicate.partial_eliminate 0.98% : 0.000002s : 14: predicate.print_const_string_wrapper 0.54% : 0.000001s : 8: predicate.reduce_all_const_elim 1.19% : 0.000002s : 14: predicate.reduce_eliminate 2.56% : 0.000005s : 40: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 8: predicate.remove_not_recompute_node 1.40% : 0.000003s : 26: predicate.replace_applicator 0.53% : 0.000001s : 8: predicate.replace_old_param 0.21% : 0.000000s : 4: predicate.reset_defer_inline 1.00% : 0.000002s : 14: predicate.reshape_eliminate 0.79% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.72% : 0.000001s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.72% : 0.000001s : 8: predicate.shard_identity_eliminate 0.72% : 0.000001s : 8: predicate.special_op_eliminate 0.67% : 0.000001s : 8: predicate.specialize_transform 0.78% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.68% : 0.000003s : 23: predicate.switch_defer_inline 2.20% : 0.000004s : 31: predicate.switch_layer_defer_inline 5.67% : 0.000011s : 76: predicate.switch_simplify 0.89% : 0.000002s : 14: predicate.tile_eliminate 0.92% : 0.000002s : 14: predicate.transpose_eliminate 1.54% : 0.000003s : 22: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 22: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 22: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000006s : 34: predicate.tuple_list_get_item_eliminator 1.42% : 0.000003s : 22: predicate.tuple_list_get_set_item_eliminator 2.09% : 0.000004s : 30: predicate.tuple_list_set_item_eliminator 2.02% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.49% : 0.000005s : 40: predicate.updatestate_pure_node_eliminater 3.15% : 0.000006s : 48: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000639 12 47.99% : 0.000306s : 5: func_graph_cloner_run.FuncGraphClonerGraph 52.01% : 0.000332s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 7.142983 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.11% : 0.007813s : 1: add_attr 0.11% : 0.007796s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000067s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000146s : 1: auto_monad 0.00% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000013s : 1: bias_add_comm_swap 0.01% : 0.000883s : 1: bootstrap 0.00% : 0.000032s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000023s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000024s : 1: environ_conv 0.00% : 0.000029s : 1: event_method 0.00% : 0.000017s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000012s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.01% : 0.000450s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.01% : 0.000630s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000013s : 1: opt.transform.mutable_eliminate 0.02% : 0.001183s : 78: opt.transform.opt_a 0.00% : 0.000027s : 1: opt.transform.opt_after_cconv 0.00% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000092s : 28: opt.transform.opt_b 0.00% : 0.000047s : 2: opt.transform.opt_trans_graph 0.00% : 0.000035s : 4: opt.transform.symbol_engine_opt 0.04% : 0.002978s : 1: opt_a 0.00% : 0.000100s : 1: opt_after_cconv 0.01% : 0.000505s : 1: opt_after_jit_grad 0.00% : 0.000190s : 1: opt_b 0.08% : 0.005397s : 1: optimize 0.00% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000047s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000023s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000026s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000057s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000018s : 1: remove_dup_value 0.01% : 0.000397s : 1: renormalize.infer 0.01% : 0.000374s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000031s : 1: rewriter_after_opt_a 0.00% : 0.000239s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000028s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000077s : 1: symbol_engine_optimizer 99.25% : 7.089486s : 1: task_emit 0.00% : 0.000076s : 1: tuple_transform 0.32% : 0.023131s : 1: type_inference 0.00% : 0.000087s : 1: validate TotalTime = 8.38405, [24] [bootstrap]: 0.00109536 [type_inference]: 0.263534 [event_method]: 2.466e-05 [auto_monad]: 0.00045816 [graph_reusing]: 6.61e-06 [inline]: 2.26998e-06 [add_attr]: 0.00968396, [1] [add_attr_with_inline]: 0.00966522, [1] [Cycle 1]: 0.00016527, [2] [tag_attr]: 5.11e-05 [meta_addattr_fg_expand]: 1.662e-05 [parallel-infer-symbol]: 4.11001e-06 [pre_auto_parallel]: 6.588e-05 [insert-virtual-dataset]: 2.40002e-06 [parallel-infer-symbol-second]: 1.44e-06 [dataset_repeat_opt]: 2.63e-06 [pipeline_split]: 1.95001e-06 [optimize]: 0.0141841, [53] [py_interpret_to_execute]: 9.54e-06 [rewriter_before_opt_a]: 0.00012087 [opt_a]: 0.0101097, [2] [Cycle 1]: 0.00811938, [45] [expand_dump_flag]: 3.6e-06 [switch_simplify]: 8.676e-05 [loop_unroll]: 8.113e-05 [a_1]: 0.00127118 [with_stream_mark]: 3.056e-05 [recompute_prepare]: 2.801e-05 [updatestate_depend_eliminate]: 9.42e-05 [updatestate_assign_eliminate]: 1.517e-05 [updatestate_loads_eliminate]: 4.257e-05 [parameter_eliminate]: 2.48998e-06 [a_2]: 0.00031763 [accelerated_algorithm]: 5.506e-05 [shard]: 3.51001e-06 [meta_shard_fg_expand]: 5.32001e-06 [shard_inline]: 4.644e-05 [merge_send_recv]: 5.432e-05 [auto_parallel]: 1.987e-05 [parallel]: 9.499e-05 [flash_sp]: 3.971e-05 [merge_comm]: 1.287e-05 [allreduce_fusion]: 1.967e-05 [matmul_add_comm_reduction]: 2.89e-05 [allreduce_slice_to_reducescatter]: 1.014e-05 [virtual_shard_identity]: 2.698e-05 [virtual_dataset]: 2.001e-05 [get_grad_eliminate_]: 1.865e-05 [virtual_output]: 1.881e-05 [merge_forward]: 1.065e-05 [cell_reuse_recompute_pass]: 2.04e-06 [offload_activation]: 2.876e-05 [cell_reuse_handle_not_recompute_node_pass]: 6.68e-05 [merge_recompute_call_nodes]: 1.69998e-06 [before_grad]: 3.164e-05 [set_forward_comm_id_for_comm_node_pass]: 2.01e-05 [meta_fg_expand]: 8.01001e-06 [flash_sp_send_recv_attached]: 5.39e-06 [receive_attached]: 1.812e-05 [after_resolve]: 3.055e-05 [a_after_grad]: 3.138e-05 [renormalize]: 0.00461682 [add_forward_monad_depend]: 6.36e-06 [auto_monad_grad]: 1.91e-06 [auto_monad_eliminator]: 8.997e-05 [cse]: 0.00019674 [a_3]: 0.00014211 [Cycle 2]: 0.00197687, [45] [expand_dump_flag]: 2.74001e-06 [switch_simplify]: 2.183e-05 [loop_unroll]: 1.911e-05 [a_1]: 0.00058829 [with_stream_mark]: 2.503e-05 [recompute_prepare]: 1.86e-05 [updatestate_depend_eliminate]: 1.144e-05 [updatestate_assign_eliminate]: 1.373e-05 [updatestate_loads_eliminate]: 1.331e-05 [parameter_eliminate]: 1.57999e-06 [a_2]: 0.00028607 [accelerated_algorithm]: 2.332e-05 [shard]: 2.74001e-06 [meta_shard_fg_expand]: 4.21001e-06 [shard_inline]: 1.964e-05 [merge_send_recv]: 1.65e-05 [auto_parallel]: 1.649e-05 [parallel]: 1.014e-05 [flash_sp]: 1.881e-05 [merge_comm]: 1.16e-05 [allreduce_fusion]: 1.017e-05 [matmul_add_comm_reduction]: 1.731e-05 [allreduce_slice_to_reducescatter]: 1.13001e-06 [virtual_shard_identity]: 2.079e-05 [virtual_dataset]: 1.843e-05 [get_grad_eliminate_]: 1.815e-05 [virtual_output]: 1.848e-05 [merge_forward]: 9.44998e-06 [cell_reuse_recompute_pass]: 2.25002e-06 [offload_activation]: 1.913e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.467e-05 [merge_recompute_call_nodes]: 1.24e-06 [before_grad]: 5.79e-05 [set_forward_comm_id_for_comm_node_pass]: 1.113e-05 [meta_fg_expand]: 7.31001e-06 [flash_sp_send_recv_attached]: 2.01003e-06 [receive_attached]: 2.66e-06 [after_resolve]: 2.695e-05 [a_after_grad]: 3.034e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 3.01999e-06 [auto_monad_grad]: 2.14e-06 [auto_monad_eliminator]: 5.908e-05 [cse]: 6.775e-05 [a_3]: 0.00012574 [py_interpret_to_execute_after_opt_a]: 8.96002e-06 [slice_cell_reuse_recomputed_activation]: 2.29999e-06 [rewriter_after_opt_a]: 7.197e-05 [convert_after_rewriter]: 1.37e-06 [order_py_execute_after_rewriter]: 1.23002e-06 [mutable_eliminate]: 0.00082115 [opt_b]: 0.00067256, [1] [Cycle 1]: 0.00066332, [7] [b_1]: 0.00046615 [b_2]: 2.182e-05 [updatestate_depend_eliminate]: 1.57e-05 [updatestate_assign_eliminate]: 1.272e-05 [updatestate_loads_eliminate]: 1.377e-05 [renormalize]: 9.89996e-07 [cse]: 8.413e-05 [optimize_parallel_all_gather_comm]: 5.107e-05 [overlap_param_gather]: 1.141e-05 [cconv]: 3.694e-05 [loop_unroll]: 0.00063254 [opt_after_cconv]: 0.00028969, [1] [Cycle 1]: 0.00028019, [7] [c_1]: 0.000124 [parameter_eliminate]: 5.07e-06 [updatestate_depend_eliminate]: 1.459e-05 [updatestate_assign_eliminate]: 1.158e-05 [updatestate_loads_eliminate]: 1.285e-05 [cse]: 7.207e-05 [renormalize]: 6.09987e-07 [remove_dup_value]: 8.309e-05 [tuple_transform]: 0.00019117, [1] [Cycle 1]: 0.00018498, [4] [d_1]: 0.00013662 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 2.219e-05 [partial_unused_args_eliminate]: 1.72001e-06 [add_recomputation]: 0.00014538 [cse_after_recomputation]: 9.908e-05, [1] [Cycle 1]: 9.274e-05, [1] [cse]: 8.291e-05 [environ_conv]: 5.762e-05 [swap_dp_allreduce_reducescatter]: 3.851e-05 [bias_add_comm_swap]: 1.293e-05 [label_micro_interleaved_index]: 1.428e-05 [label_fine_grained_interleaved_index]: 2.83998e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 2.03997e-06 [micro_interleaved_order_control]: 2.56998e-06 [assign_add_opt]: 1.71e-06 [ForceFp32Comm]: 8.80013e-07 [remove_cast_before_assign_add]: 9.67999e-06 [full_micro_interleaved_order_control]: 1.081e-05 [reorder_send_recv_between_fp_bp]: 3.26999e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.17999e-06 [interleave_parallel_branches]: 8.87e-06 [overlap_opt_shard_in_pipeline]: 2.217e-05 [overlap_opt_shard_grad_in_pipeline]: 1.77001e-06 [control_data_broadcast_order]: 3.508e-05 [grouped_pairwise_exchange_alltoall]: 2.04e-06 [offloading_packed_experts]: 8.48999e-06 [overlap_recompute_and_grad_model_parallel]: 1.711e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.67001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.46998e-06 [overlap_recompute_comm]: 2.28998e-06 [overlap_grad_ring_attention]: 2.305e-05 [overlap_grad_flash_sp]: 6.826e-05 [begin_end_overlap_inline]: 5.29981e-07 [split_matmul_comm_elemetwise]: 9.80002e-06 [split_layernorm_comm]: 1.64e-06 [handle_group_info]: 1.28002e-06 [symbol_engine_optimizer]: 0.00017648, [1] [Cycle 1]: 0.00016882, [6] [build]: 7.31999e-06 [elim_shapecalc]: 3.138e-05 [elim_not_effective]: 3.597e-05 [opt_reshape]: 2.289e-05 [fold_const_symbol]: 2.981e-05 [renormalize]: 2.69996e-07 [detach_backward]: 1.96998e-06 [pipeline_parallel_scheduler]: 1.50999e-06 [auto_monad_reorder]: 0.00015156 [get_jit_bprop_graph]: 2.01e-06 [rewriter_after_jit_bprop_graph]: 7.16001e-06 [opt_after_jit_grad]: 0.00079691 [validate]: 0.00013006 [backend_pass]: 1.05999e-06 [task_emit]: 8.09295 [execute]: 1.407e-05 Sums bootstrap : 0.001095s : 0.01% type_inference : 0.263534s : 3.15% event_method : 0.000025s : 0.00% auto_monad : 0.000458s : 0.01% graph_reusing : 0.000007s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000051s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000017s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000066s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000010s : 0.00% optimize.rewriter_before_opt_a : 0.000121s : 0.00% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000109s : 0.00% optimize.opt_a.loop_unroll : 0.000100s : 0.00% optimize.opt_a.a_1 : 0.001859s : 0.02% optimize.opt_a.with_stream_mark : 0.000056s : 0.00% optimize.opt_a.recompute_prepare : 0.000047s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000106s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000029s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000056s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000604s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000078s : 0.00% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000010s : 0.00% optimize.opt_a.shard_inline : 0.000066s : 0.00% optimize.opt_a.merge_send_recv : 0.000071s : 0.00% optimize.opt_a.auto_parallel : 0.000036s : 0.00% optimize.opt_a.parallel : 0.000105s : 0.00% optimize.opt_a.flash_sp : 0.000059s : 0.00% optimize.opt_a.merge_comm : 0.000024s : 0.00% optimize.opt_a.allreduce_fusion : 0.000030s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000046s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000011s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000048s : 0.00% optimize.opt_a.virtual_dataset : 0.000038s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000037s : 0.00% optimize.opt_a.virtual_output : 0.000037s : 0.00% optimize.opt_a.merge_forward : 0.000020s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000048s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000101s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000090s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000031s : 0.00% optimize.opt_a.meta_fg_expand : 0.000015s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000021s : 0.00% optimize.opt_a.after_resolve : 0.000058s : 0.00% optimize.opt_a.a_after_grad : 0.000062s : 0.00% optimize.opt_a.renormalize : 0.004617s : 0.06% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000149s : 0.00% optimize.opt_a.cse : 0.000264s : 0.00% optimize.opt_a.a_3 : 0.000268s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000072s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000821s : 0.01% optimize.opt_b.b_1 : 0.000466s : 0.01% optimize.opt_b.b_2 : 0.000022s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000016s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000013s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000014s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000084s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000051s : 0.00% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000037s : 0.00% optimize.loop_unroll : 0.000633s : 0.01% optimize.opt_after_cconv.c_1 : 0.000124s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000013s : 0.00% optimize.opt_after_cconv.cse : 0.000072s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000083s : 0.00% optimize.tuple_transform.d_1 : 0.000137s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000022s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000145s : 0.00% optimize.cse_after_recomputation.cse : 0.000083s : 0.00% optimize.environ_conv : 0.000058s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000039s : 0.00% optimize.bias_add_comm_swap : 0.000013s : 0.00% optimize.label_micro_interleaved_index : 0.000014s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000022s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000035s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000017s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000023s : 0.00% optimize.overlap_grad_flash_sp : 0.000068s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000007s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000031s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000036s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000023s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000030s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000152s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000797s : 0.01% validate : 0.000130s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 8.092948s : 96.66% execute : 0.000014s : 0.00% Time group info: ------[substitution.] 0.000584 168 19.97% : 0.000117s : 4: substitution.arithmetic_simplify 1.77% : 0.000010s : 2: substitution.depend_value_elim 0.77% : 0.000005s : 10: substitution.elim_not_effective 0.69% : 0.000004s : 10: substitution.fold_const_symbol 2.26% : 0.000013s : 16: substitution.graph_param_transform 40.11% : 0.000234s : 4: substitution.inline 6.29% : 0.000037s : 20: substitution.j_node_and_user_rematch 5.44% : 0.000032s : 2: substitution.less_batch_normalization 1.49% : 0.000009s : 12: substitution.load_eliminater 4.16% : 0.000024s : 20: substitution.remove_not_recompute_node 1.73% : 0.000010s : 10: substitution.replace_old_param 7.01% : 0.000041s : 26: substitution.updatestate_pure_node_eliminater 8.29% : 0.000048s : 32: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.263359 2 98.70% : 0.259938s : 1: type_inference.infer 1.30% : 0.003421s : 1: type_inference.specialize ------[replace.] 0.000046 4 100.00% : 0.000046s : 4: replace.inline ------[match.] 0.000231 4 100.00% : 0.000231s : 4: match.inline ------[predicate.] 0.000670 4085 0.74% : 0.000005s : 40: predicate.accumulaten_eliminater 0.83% : 0.000006s : 18: predicate.ad_related_special_op_eliminate 0.68% : 0.000005s : 32: predicate.addn_check_dump 0.81% : 0.000005s : 40: predicate.addn_zero_filter 0.68% : 0.000005s : 40: predicate.adjust_all_reduce_mul_add 1.89% : 0.000013s : 72: predicate.arithmetic_simplify 0.81% : 0.000005s : 40: predicate.cast_eliminate 0.62% : 0.000004s : 32: predicate.check_bprop_eliminate 0.63% : 0.000004s : 32: predicate.compare_switch_simplify 0.20% : 0.000001s : 16: predicate.const_output_eliminate 0.64% : 0.000004s : 32: predicate.depend_value_elim 0.80% : 0.000005s : 40: predicate.dict_get_item_const_eliminator 0.86% : 0.000006s : 40: predicate.dict_get_item_eliminator 0.77% : 0.000005s : 40: predicate.dict_set_item_eliminator 0.89% : 0.000006s : 34: predicate.dumpgradient_eliminate 0.26% : 0.000002s : 16: predicate.elim_not_effective 0.38% : 0.000003s : 16: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000008s : 56: predicate.environ_add_const_eliminate 0.99% : 0.000007s : 56: predicate.environ_get_add_eliminate 0.99% : 0.000007s : 56: predicate.environ_get_depend_swap 1.73% : 0.000012s : 88: predicate.environ_get_eliminate 1.01% : 0.000007s : 56: predicate.environ_get_set_eliminate 0.82% : 0.000005s : 44: predicate.exchange_switch_depend_value 1.19% : 0.000008s : 44: predicate.float_depend_g_call 0.64% : 0.000004s : 32: predicate.float_environ_get_switch 0.97% : 0.000006s : 48: predicate.float_tuple_getitem_switch 0.18% : 0.000001s : 16: predicate.fold_const_symbol 0.68% : 0.000005s : 32: predicate.get_grad_eliminate 0.23% : 0.000002s : 16: predicate.graph_param_transform 0.65% : 0.000004s : 32: predicate.incorporate_call 0.57% : 0.000004s : 32: predicate.incorporate_call_switch 8.21% : 0.000055s : 180: predicate.inline 0.80% : 0.000005s : 32: predicate.inline_without_move 0.35% : 0.000002s : 32: predicate.j_node_and_user_rematch 0.84% : 0.000006s : 35: predicate.less_batch_normalization 1.42% : 0.000010s : 72: predicate.list_to_tuple_eliminator_ 2.07% : 0.000014s : 112: predicate.load_eliminater 0.68% : 0.000005s : 16: predicate.loop_unroll_after_grad 6.79% : 0.000046s : 61: predicate.loop_unroll_before_grad 1.49% : 0.000010s : 72: predicate.make_slice_get_slice_eliminator 0.64% : 0.000004s : 32: predicate.merge_addn 0.63% : 0.000004s : 32: predicate.micro_step_allgather_replace 0.69% : 0.000005s : 32: predicate.mini_step_allgather_replace 0.67% : 0.000004s : 40: predicate.minmaximum_grad 0.68% : 0.000005s : 16: predicate.mutable_eliminate 0.39% : 0.000003s : 16: predicate.opt_reshape 0.34% : 0.000002s : 16: predicate.parallel_virtual_node 1.19% : 0.000008s : 44: predicate.partial_defer_inline 1.15% : 0.000008s : 56: predicate.partial_eliminate 0.75% : 0.000005s : 40: predicate.print_const_string_wrapper 0.64% : 0.000004s : 32: predicate.reduce_all_const_elim 1.01% : 0.000007s : 40: predicate.reduce_eliminate 2.07% : 0.000014s : 112: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000003s : 32: predicate.remove_not_recompute_node 1.08% : 0.000007s : 72: predicate.replace_applicator 0.38% : 0.000003s : 32: predicate.replace_old_param 0.24% : 0.000002s : 16: predicate.reset_defer_inline 0.80% : 0.000005s : 40: predicate.reshape_eliminate 0.66% : 0.000004s : 32: predicate.row_tensor_add_zeros_like 0.37% : 0.000003s : 16: predicate.row_tensor_eliminate 0.88% : 0.000006s : 32: predicate.same_eliminate 0.48% : 0.000003s : 36: predicate.set_cell_output_no_recompute 0.82% : 0.000006s : 32: predicate.shard_identity_eliminate 0.77% : 0.000005s : 34: predicate.special_op_eliminate 0.72% : 0.000005s : 32: predicate.specialize_transform 0.70% : 0.000005s : 32: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000006s : 32: predicate.stack_unstack_eliminate 0.37% : 0.000002s : 16: predicate.switch_call_monad_eliminater 0.91% : 0.000006s : 44: predicate.switch_defer_inline 1.50% : 0.000010s : 76: predicate.switch_layer_defer_inline 3.43% : 0.000023s : 153: predicate.switch_simplify 0.74% : 0.000005s : 40: predicate.tile_eliminate 0.72% : 0.000005s : 40: predicate.transpose_eliminate 1.49% : 0.000010s : 72: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000010s : 72: predicate.tuple_list_get_item_const_eliminator 1.31% : 0.000009s : 72: predicate.tuple_list_get_item_depend_reorder 8.41% : 0.000056s : 104: predicate.tuple_list_get_item_eliminator 1.44% : 0.000010s : 72: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000015s : 104: predicate.tuple_list_set_item_eliminator 1.36% : 0.000009s : 72: predicate.tuple_to_list_eliminator_ 2.14% : 0.000014s : 112: predicate.updatestate_pure_node_eliminater 2.97% : 0.000020s : 144: predicate.updatestate_useless_node_eliminater 0.34% : 0.000002s : 16: predicate.value_based_eliminate 0.73% : 0.000005s : 32: predicate.virtual_dataset_eliminate 0.74% : 0.000005s : 32: predicate.virtual_output_eliminate 0.36% : 0.000002s : 18: predicate.virtual_view_grad_eliminate 0.41% : 0.000003s : 16: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003783 26 68.17% : 0.002579s : 20: func_graph_cloner_run.FuncGraphClonerGraph 31.83% : 0.001204s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 8.416573 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.12% : 0.009690s : 1: add_attr 0.11% : 0.009669s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000153s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000475s : 1: auto_monad 0.00% : 0.000162s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000017s : 1: bias_add_comm_swap 0.01% : 0.001150s : 1: bootstrap 0.00% : 0.000041s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000040s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000102s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000063s : 1: environ_conv 0.00% : 0.000031s : 1: event_method 0.00% : 0.000074s : 1: execute 0.00% : 0.000015s : 1: full_micro_interleaved_order_control 0.00% : 0.000007s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000012s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000018s : 1: label_micro_interleaved_index 0.01% : 0.000645s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.01% : 0.000834s : 1: mutable_eliminate 0.00% : 0.000012s : 1: offloading_packed_experts 0.00% : 0.000033s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000054s : 1: opt.transform.mutable_eliminate 0.04% : 0.003483s : 78: opt.transform.opt_a 0.00% : 0.000122s : 1: opt.transform.opt_after_cconv 0.00% : 0.000095s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000453s : 28: opt.transform.opt_b 0.00% : 0.000155s : 2: opt.transform.opt_trans_graph 0.00% : 0.000115s : 4: opt.transform.symbol_engine_opt 0.12% : 0.010114s : 1: opt_a 0.00% : 0.000293s : 1: opt_after_cconv 0.01% : 0.000812s : 1: opt_after_jit_grad 0.01% : 0.000677s : 1: opt_b 0.17% : 0.014191s : 1: optimize 0.00% : 0.000056s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000074s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000028s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000026s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000021s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000071s : 1: pre_auto_parallel 0.00% : 0.000013s : 1: py_interpret_to_execute 0.00% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000090s : 1: remove_dup_value 0.04% : 0.003132s : 1: renormalize.infer 0.02% : 0.001471s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000077s : 1: rewriter_after_opt_a 0.00% : 0.000126s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000043s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000180s : 1: symbol_engine_optimizer 96.16% : 8.093183s : 1: task_emit 0.00% : 0.000195s : 1: tuple_transform 3.13% : 0.263563s : 1: type_inference 0.00% : 0.000186s : 1: validate TotalTime = 0.351366, [24] [bootstrap]: 0.00090063 [type_inference]: 0.235098 [event_method]: 0.00043264 [auto_monad]: 0.00051261 [graph_reusing]: 1.07e-05 [inline]: 3.81001e-06 [add_attr]: 0.00558507, [1] [add_attr_with_inline]: 0.00557196, [1] [Cycle 1]: 0.00011938, [2] [tag_attr]: 6.498e-05 [meta_addattr_fg_expand]: 1.494e-05 [parallel-infer-symbol]: 4.97e-06 [pre_auto_parallel]: 7.736e-05 [insert-virtual-dataset]: 3.7e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 3.08e-06 [optimize]: 0.0923982, [53] [py_interpret_to_execute]: 5.84e-06 [rewriter_before_opt_a]: 0.00049855 [opt_a]: 0.0881642, [4] [Cycle 1]: 0.0728086, [45] [expand_dump_flag]: 4.97999e-06 [switch_simplify]: 0.00019192 [loop_unroll]: 9.59e-05 [a_1]: 0.00294104 [with_stream_mark]: 4.693e-05 [recompute_prepare]: 5.352e-05 [updatestate_depend_eliminate]: 0.00012472 [updatestate_assign_eliminate]: 2.453e-05 [updatestate_loads_eliminate]: 1.958e-05 [parameter_eliminate]: 3.33998e-06 [a_2]: 0.00056552 [accelerated_algorithm]: 9.202e-05 [shard]: 2.36998e-06 [meta_shard_fg_expand]: 1.245e-05 [shard_inline]: 3.435e-05 [merge_send_recv]: 4.82e-05 [auto_parallel]: 2.776e-05 [parallel]: 5.358e-05 [flash_sp]: 2.022e-05 [merge_comm]: 2.374e-05 [allreduce_fusion]: 2.083e-05 [matmul_add_comm_reduction]: 5.158e-05 [allreduce_slice_to_reducescatter]: 1.61998e-06 [virtual_shard_identity]: 3.941e-05 [virtual_dataset]: 3.231e-05 [get_grad_eliminate_]: 3.367e-05 [virtual_output]: 3.193e-05 [merge_forward]: 2.364e-05 [cell_reuse_recompute_pass]: 2.48e-06 [offload_activation]: 3.216e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.99e-05 [merge_recompute_call_nodes]: 1.97999e-06 [before_grad]: 6.492e-05 [set_forward_comm_id_for_comm_node_pass]: 2.631e-05 [meta_fg_expand]: 0.0282554 [flash_sp_send_recv_attached]: 9.96998e-06 [receive_attached]: 2.67001e-06 [after_resolve]: 0.00021809 [a_after_grad]: 0.0002821 [renormalize]: 0.0361354 [add_forward_monad_depend]: 2.335e-05 [auto_monad_grad]: 2.087e-05 [auto_monad_eliminator]: 0.00019532 [cse]: 0.00048319 [a_3]: 0.00194416 [Cycle 2]: 0.010336, [45] [expand_dump_flag]: 5.11997e-06 [switch_simplify]: 0.0001287 [loop_unroll]: 0.00012239 [a_1]: 0.00414626 [with_stream_mark]: 4.985e-05 [recompute_prepare]: 4.418e-05 [updatestate_depend_eliminate]: 3.405e-05 [updatestate_assign_eliminate]: 2.415e-05 [updatestate_loads_eliminate]: 2.155e-05 [parameter_eliminate]: 5.20001e-06 [a_2]: 0.00089171 [accelerated_algorithm]: 3.38e-05 [shard]: 2.49001e-06 [meta_shard_fg_expand]: 9.20001e-06 [shard_inline]: 2.46e-05 [merge_send_recv]: 2.266e-05 [auto_parallel]: 1.971e-05 [parallel]: 9.67999e-06 [flash_sp]: 5.75001e-06 [merge_comm]: 1.32e-05 [allreduce_fusion]: 1.239e-05 [matmul_add_comm_reduction]: 2.271e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 2.646e-05 [virtual_dataset]: 2.363e-05 [get_grad_eliminate_]: 2.187e-05 [virtual_output]: 2.473e-05 [merge_forward]: 1.946e-05 [cell_reuse_recompute_pass]: 2.61e-06 [offload_activation]: 3.059e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.207e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 4.245e-05 [set_forward_comm_id_for_comm_node_pass]: 1.784e-05 [meta_fg_expand]: 0.00017795 [flash_sp_send_recv_attached]: 2.20002e-06 [receive_attached]: 2.97002e-06 [after_resolve]: 3.825e-05 [a_after_grad]: 3.847e-05 [renormalize]: 0.00341268 [add_forward_monad_depend]: 1.11e-05 [auto_monad_grad]: 3.25e-06 [auto_monad_eliminator]: 7.337e-05 [cse]: 0.0001401 [a_3]: 0.00018036 [Cycle 3]: 0.003236, [45] [expand_dump_flag]: 3.47002e-06 [switch_simplify]: 2.522e-05 [loop_unroll]: 2.38e-05 [a_1]: 0.00067192 [with_stream_mark]: 2.689e-05 [recompute_prepare]: 2.635e-05 [updatestate_depend_eliminate]: 5.404e-05 [updatestate_assign_eliminate]: 1.533e-05 [updatestate_loads_eliminate]: 1.497e-05 [parameter_eliminate]: 2.36998e-06 [a_2]: 0.0002858 [accelerated_algorithm]: 2.41e-05 [shard]: 2.33998e-06 [meta_shard_fg_expand]: 5.10001e-06 [shard_inline]: 1.979e-05 [merge_send_recv]: 1.822e-05 [auto_parallel]: 1.652e-05 [parallel]: 9.13002e-06 [flash_sp]: 2.02001e-06 [merge_comm]: 1.124e-05 [allreduce_fusion]: 1.058e-05 [matmul_add_comm_reduction]: 1.979e-05 [allreduce_slice_to_reducescatter]: 9.40025e-07 [virtual_shard_identity]: 2.148e-05 [virtual_dataset]: 1.851e-05 [get_grad_eliminate_]: 1.782e-05 [virtual_output]: 1.813e-05 [merge_forward]: 1.119e-05 [cell_reuse_recompute_pass]: 4.23001e-06 [offload_activation]: 1.959e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.582e-05 [merge_recompute_call_nodes]: 1.62001e-06 [before_grad]: 2.968e-05 [set_forward_comm_id_for_comm_node_pass]: 1.247e-05 [meta_fg_expand]: 7.73999e-06 [flash_sp_send_recv_attached]: 2.04e-06 [receive_attached]: 2.60997e-06 [after_resolve]: 2.896e-05 [a_after_grad]: 2.81e-05 [renormalize]: 0.00113113 [add_forward_monad_depend]: 7.46999e-06 [auto_monad_grad]: 1.95001e-06 [auto_monad_eliminator]: 4.944e-05 [cse]: 8.857e-05 [a_3]: 0.00013764 [Cycle 4]: 0.0017584, [45] [expand_dump_flag]: 2.37001e-06 [switch_simplify]: 2.109e-05 [loop_unroll]: 1.784e-05 [a_1]: 0.00052507 [with_stream_mark]: 2.078e-05 [recompute_prepare]: 2.02e-05 [updatestate_depend_eliminate]: 1.122e-05 [updatestate_assign_eliminate]: 1.287e-05 [updatestate_loads_eliminate]: 1.3e-05 [parameter_eliminate]: 2.59001e-06 [a_2]: 0.00027748 [accelerated_algorithm]: 2.449e-05 [shard]: 1.55001e-06 [meta_shard_fg_expand]: 4.77998e-06 [shard_inline]: 1.756e-05 [merge_send_recv]: 1.651e-05 [auto_parallel]: 1.539e-05 [parallel]: 6.98998e-06 [flash_sp]: 1.57999e-06 [merge_comm]: 1.042e-05 [allreduce_fusion]: 9.64e-06 [matmul_add_comm_reduction]: 1.757e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 2.042e-05 [virtual_dataset]: 1.731e-05 [get_grad_eliminate_]: 1.808e-05 [virtual_output]: 1.747e-05 [merge_forward]: 9.90002e-06 [cell_reuse_recompute_pass]: 2.78998e-06 [offload_activation]: 1.919e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.403e-05 [merge_recompute_call_nodes]: 8.50006e-07 [before_grad]: 2.905e-05 [set_forward_comm_id_for_comm_node_pass]: 1.004e-05 [meta_fg_expand]: 7.33999e-06 [flash_sp_send_recv_attached]: 1.69e-06 [receive_attached]: 2.07999e-06 [after_resolve]: 2.529e-05 [a_after_grad]: 2.796e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.72001e-06 [auto_monad_grad]: 2.09999e-06 [auto_monad_eliminator]: 4.21e-05 [cse]: 6.1e-05 [a_3]: 0.00012081 [py_interpret_to_execute_after_opt_a]: 7.31999e-06 [slice_cell_reuse_recomputed_activation]: 2.24999e-06 [rewriter_after_opt_a]: 5.267e-05 [convert_after_rewriter]: 1.24998e-06 [order_py_execute_after_rewriter]: 1.08001e-06 [mutable_eliminate]: 0.00082905 [opt_b]: 0.00068461, [1] [Cycle 1]: 0.00067532, [7] [b_1]: 0.00042569 [b_2]: 2.501e-05 [updatestate_depend_eliminate]: 1.601e-05 [updatestate_assign_eliminate]: 1.265e-05 [updatestate_loads_eliminate]: 6.792e-05 [renormalize]: 7.30011e-07 [cse]: 7.938e-05 [optimize_parallel_all_gather_comm]: 3.584e-05 [overlap_param_gather]: 2.24999e-06 [cconv]: 3.371e-05 [loop_unroll]: 0.00062981 [opt_after_cconv]: 0.00027392, [1] [Cycle 1]: 0.00026554, [7] [c_1]: 0.00011475 [parameter_eliminate]: 3.44001e-06 [updatestate_depend_eliminate]: 1.446e-05 [updatestate_assign_eliminate]: 1.157e-05 [updatestate_loads_eliminate]: 1.187e-05 [cse]: 6.838e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 6.971e-05 [tuple_transform]: 0.00019287, [1] [Cycle 1]: 0.00018591, [4] [d_1]: 0.00013713 [none_parameter_eliminate]: 2.08998e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 2.405e-05 [partial_unused_args_eliminate]: 2.26998e-06 [add_recomputation]: 0.00017893 [cse_after_recomputation]: 7.804e-05, [1] [Cycle 1]: 7.09e-05, [1] [cse]: 6.062e-05 [environ_conv]: 1.489e-05 [swap_dp_allreduce_reducescatter]: 1.732e-05 [bias_add_comm_swap]: 3.66999e-06 [label_micro_interleaved_index]: 8.04997e-06 [label_fine_grained_interleaved_index]: 2.74999e-06 [merge_cast_opt]: 1.46998e-06 [slice_recompute_activation]: 2.47001e-06 [micro_interleaved_order_control]: 2.59001e-06 [assign_add_opt]: 1.18001e-06 [ForceFp32Comm]: 1.24e-06 [remove_cast_before_assign_add]: 1.37999e-06 [full_micro_interleaved_order_control]: 2.22999e-06 [reorder_send_recv_between_fp_bp]: 2.67001e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.50001e-06 [overlap_opt_shard_in_pipeline]: 1.39e-06 [overlap_opt_shard_grad_in_pipeline]: 1.62001e-06 [control_data_broadcast_order]: 4.389e-05 [grouped_pairwise_exchange_alltoall]: 1.48002e-06 [offloading_packed_experts]: 1.225e-05 [overlap_recompute_and_grad_model_parallel]: 1.157e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.39001e-06 [overlap_grad_ring_attention]: 1.109e-05 [overlap_grad_flash_sp]: 5.335e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.07999e-06 [split_layernorm_comm]: 2.02001e-06 [handle_group_info]: 1.00999e-06 [symbol_engine_optimizer]: 0.00016505, [1] [Cycle 1]: 0.00015917, [6] [build]: 7.33e-06 [elim_shapecalc]: 2.866e-05 [elim_not_effective]: 3.699e-05 [opt_reshape]: 2.066e-05 [fold_const_symbol]: 3.123e-05 [renormalize]: 1.90019e-07 [detach_backward]: 2.04e-06 [pipeline_parallel_scheduler]: 1.56002e-06 [auto_monad_reorder]: 0.00014208 [get_jit_bprop_graph]: 2.87002e-06 [rewriter_after_jit_bprop_graph]: 5.57001e-06 [opt_after_jit_grad]: 0.00066229 [validate]: 0.00015389 [backend_pass]: 1.40999e-06 [task_emit]: 0.0149768 [execute]: 9.07001e-06 Sums bootstrap : 0.000901s : 0.26% type_inference : 0.235098s : 68.40% event_method : 0.000433s : 0.13% auto_monad : 0.000513s : 0.15% graph_reusing : 0.000011s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000065s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000015s : 0.00% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000077s : 0.02% insert-virtual-dataset : 0.000004s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000003s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000499s : 0.15% optimize.opt_a.expand_dump_flag : 0.000016s : 0.00% optimize.opt_a.switch_simplify : 0.000367s : 0.11% optimize.opt_a.loop_unroll : 0.000260s : 0.08% optimize.opt_a.a_1 : 0.008284s : 2.41% optimize.opt_a.with_stream_mark : 0.000144s : 0.04% optimize.opt_a.recompute_prepare : 0.000144s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000224s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000077s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000069s : 0.02% optimize.opt_a.parameter_eliminate : 0.000013s : 0.00% optimize.opt_a.a_2 : 0.002021s : 0.59% optimize.opt_a.accelerated_algorithm : 0.000174s : 0.05% optimize.opt_a.shard : 0.000009s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000032s : 0.01% optimize.opt_a.shard_inline : 0.000096s : 0.03% optimize.opt_a.merge_send_recv : 0.000106s : 0.03% optimize.opt_a.auto_parallel : 0.000079s : 0.02% optimize.opt_a.parallel : 0.000079s : 0.02% optimize.opt_a.flash_sp : 0.000030s : 0.01% optimize.opt_a.merge_comm : 0.000059s : 0.02% optimize.opt_a.allreduce_fusion : 0.000053s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000112s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000108s : 0.03% optimize.opt_a.virtual_dataset : 0.000092s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000091s : 0.03% optimize.opt_a.virtual_output : 0.000092s : 0.03% optimize.opt_a.merge_forward : 0.000064s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000012s : 0.00% optimize.opt_a.offload_activation : 0.000102s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000182s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000006s : 0.00% optimize.opt_a.before_grad : 0.000166s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000067s : 0.02% optimize.opt_a.meta_fg_expand : 0.028448s : 8.28% optimize.opt_a.flash_sp_send_recv_attached : 0.000016s : 0.00% optimize.opt_a.receive_attached : 0.000010s : 0.00% optimize.opt_a.after_resolve : 0.000311s : 0.09% optimize.opt_a.a_after_grad : 0.000377s : 0.11% optimize.opt_a.renormalize : 0.040679s : 11.84% optimize.opt_a.add_forward_monad_depend : 0.000045s : 0.01% optimize.opt_a.auto_monad_grad : 0.000028s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000360s : 0.10% optimize.opt_a.cse : 0.000773s : 0.22% optimize.opt_a.a_3 : 0.002383s : 0.69% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000053s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000829s : 0.24% optimize.opt_b.b_1 : 0.000426s : 0.12% optimize.opt_b.b_2 : 0.000025s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000016s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000013s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000068s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000079s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000036s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.01% optimize.loop_unroll : 0.000630s : 0.18% optimize.opt_after_cconv.c_1 : 0.000115s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.cse : 0.000068s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000070s : 0.02% optimize.tuple_transform.d_1 : 0.000137s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000024s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000179s : 0.05% optimize.cse_after_recomputation.cse : 0.000061s : 0.02% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000017s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000002s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000044s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000012s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000011s : 0.00% optimize.overlap_grad_flash_sp : 0.000053s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000007s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000029s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000037s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000021s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000031s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000142s : 0.04% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000662s : 0.19% validate : 0.000154s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.014977s : 4.36% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.004067 564 5.01% : 0.000204s : 9: substitution.arithmetic_simplify 0.66% : 0.000027s : 8: substitution.depend_value_elim 0.13% : 0.000005s : 10: substitution.elim_not_effective 0.30% : 0.000012s : 13: substitution.float_depend_g_call 0.10% : 0.000004s : 2: substitution.float_tuple_getitem_switch 0.12% : 0.000005s : 10: substitution.fold_const_symbol 41.69% : 0.001696s : 8: substitution.getattr_setattr_resolve 0.42% : 0.000017s : 15: substitution.graph_param_transform 0.09% : 0.000003s : 2: substitution.incorporate_call 0.05% : 0.000002s : 2: substitution.incorporate_call_switch 32.25% : 0.001312s : 26: substitution.inline 1.06% : 0.000043s : 5: substitution.inline_without_move 0.70% : 0.000028s : 51: substitution.j_node_and_user_rematch 1.34% : 0.000055s : 4: substitution.less_batch_normalization 0.39% : 0.000016s : 20: substitution.load_eliminater 0.46% : 0.000019s : 11: substitution.minmaximum_grad 0.73% : 0.000030s : 13: substitution.partial_eliminate 1.05% : 0.000043s : 51: substitution.remove_not_recompute_node 2.38% : 0.000097s : 35: substitution.replace_applicator 0.90% : 0.000037s : 53: substitution.replace_old_param 0.19% : 0.000008s : 2: substitution.set_cell_output_no_recompute 0.31% : 0.000013s : 3: substitution.switch_simplify 0.91% : 0.000037s : 11: substitution.tuple_list_convert_item_index_to_positive 0.40% : 0.000016s : 11: substitution.tuple_list_get_item_const_eliminator 0.59% : 0.000024s : 11: substitution.tuple_list_get_item_depend_reorder 1.98% : 0.000080s : 24: substitution.tuple_list_get_item_eliminator 0.58% : 0.000023s : 11: substitution.tuple_list_get_set_item_eliminator 1.90% : 0.000077s : 62: substitution.updatestate_pure_node_eliminater 3.33% : 0.000135s : 81: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.234873 2 96.67% : 0.227053s : 1: type_inference.infer 3.33% : 0.007820s : 1: type_inference.specialize ------[replace.] 0.000981 49 12.99% : 0.000127s : 6: replace.getattr_setattr_resolve 43.40% : 0.000426s : 26: replace.inline 5.74% : 0.000056s : 2: replace.replace_applicator 5.10% : 0.000050s : 3: replace.switch_simplify 25.97% : 0.000255s : 11: replace.tuple_list_get_item_eliminator 6.80% : 0.000067s : 1: replace.updatestate_useless_node_eliminater ------[match.] 0.002950 49 53.88% : 0.001589s : 6: match.getattr_setattr_resolve 43.90% : 0.001295s : 26: match.inline 0.60% : 0.000018s : 2: match.replace_applicator 0.33% : 0.000010s : 3: match.switch_simplify 0.87% : 0.000026s : 11: match.tuple_list_get_item_eliminator 0.41% : 0.000012s : 1: match.updatestate_useless_node_eliminater ------[predicate.] 0.002053 14794 0.96% : 0.000020s : 161: predicate.accumulaten_eliminater 0.27% : 0.000005s : 17: predicate.ad_related_special_op_eliminate 1.95% : 0.000040s : 107: predicate.addn_check_dump 1.09% : 0.000022s : 161: predicate.addn_zero_filter 0.92% : 0.000019s : 161: predicate.adjust_all_reduce_mul_add 2.42% : 0.000050s : 263: predicate.arithmetic_simplify 1.00% : 0.000021s : 161: predicate.cast_eliminate 1.99% : 0.000041s : 317: predicate.check_bprop_eliminate 0.65% : 0.000013s : 107: predicate.compare_switch_simplify 0.07% : 0.000002s : 15: predicate.const_output_eliminate 0.69% : 0.000014s : 102: predicate.depend_value_elim 1.08% : 0.000022s : 161: predicate.dict_get_item_const_eliminator 1.13% : 0.000023s : 161: predicate.dict_get_item_eliminator 0.96% : 0.000020s : 161: predicate.dict_set_item_eliminator 0.26% : 0.000005s : 32: predicate.dumpgradient_eliminate 0.08% : 0.000002s : 15: predicate.elim_not_effective 0.12% : 0.000002s : 15: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000021s : 176: predicate.environ_add_const_eliminate 1.02% : 0.000021s : 176: predicate.environ_get_add_eliminate 1.04% : 0.000021s : 176: predicate.environ_get_depend_swap 1.81% : 0.000037s : 278: predicate.environ_get_eliminate 1.01% : 0.000021s : 176: predicate.environ_get_set_eliminate 1.24% : 0.000025s : 198: predicate.exchange_switch_depend_value 1.76% : 0.000036s : 198: predicate.float_depend_g_call 0.69% : 0.000014s : 107: predicate.float_environ_get_switch 0.75% : 0.000015s : 122: predicate.float_tuple_getitem_switch 0.06% : 0.000001s : 15: predicate.fold_const_symbol 0.55% : 0.000011s : 82: predicate.get_grad_eliminate 0.66% : 0.000014s : 40: predicate.getattr_setattr_resolve 0.08% : 0.000002s : 15: predicate.graph_param_transform 0.67% : 0.000014s : 102: predicate.incorporate_call 0.60% : 0.000012s : 102: predicate.incorporate_call_switch 4.84% : 0.000099s : 578: predicate.inline 1.59% : 0.000033s : 186: predicate.inline_without_move 0.29% : 0.000006s : 82: predicate.j_node_and_user_rematch 0.80% : 0.000016s : 85: predicate.less_batch_normalization 1.32% : 0.000027s : 202: predicate.list_to_tuple_eliminator_ 2.25% : 0.000046s : 363: predicate.load_eliminater 0.22% : 0.000004s : 15: predicate.loop_unroll_after_grad 1.79% : 0.000037s : 273: predicate.loop_unroll_before_grad 1.22% : 0.000025s : 191: predicate.make_slice_get_slice_eliminator 0.70% : 0.000014s : 107: predicate.merge_addn 1.95% : 0.000040s : 309: predicate.micro_step_allgather_replace 1.92% : 0.000039s : 309: predicate.mini_step_allgather_replace 0.96% : 0.000020s : 161: predicate.minmaximum_grad 0.28% : 0.000006s : 15: predicate.mutable_eliminate 0.11% : 0.000002s : 15: predicate.opt_reshape 0.11% : 0.000002s : 15: predicate.parallel_virtual_node 1.79% : 0.000037s : 198: predicate.partial_defer_inline 1.29% : 0.000027s : 187: predicate.partial_eliminate 0.98% : 0.000020s : 161: predicate.print_const_string_wrapper 0.68% : 0.000014s : 102: predicate.reduce_all_const_elim 1.28% : 0.000026s : 161: predicate.reduce_eliminate 2.14% : 0.000044s : 363: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000007s : 82: predicate.remove_not_recompute_node 2.22% : 0.000045s : 485: predicate.replace_applicator 0.75% : 0.000015s : 186: predicate.replace_old_param 0.07% : 0.000002s : 15: predicate.reset_defer_inline 1.01% : 0.000021s : 161: predicate.reshape_eliminate 2.01% : 0.000041s : 309: predicate.row_tensor_add_zeros_like 0.13% : 0.000003s : 15: predicate.row_tensor_eliminate 2.12% : 0.000043s : 317: predicate.same_eliminate 0.43% : 0.000009s : 97: predicate.set_cell_output_no_recompute 0.67% : 0.000014s : 82: predicate.shard_identity_eliminate 0.23% : 0.000005s : 32: predicate.special_op_eliminate 0.78% : 0.000016s : 107: predicate.specialize_transform 2.14% : 0.000044s : 309: predicate.split_environ_get_set_with_tuple_value 1.46% : 0.000030s : 186: predicate.stack_unstack_eliminate 0.11% : 0.000002s : 15: predicate.switch_call_monad_eliminater 1.38% : 0.000028s : 198: predicate.switch_defer_inline 3.33% : 0.000068s : 515: predicate.switch_layer_defer_inline 4.07% : 0.000084s : 599: predicate.switch_simplify 0.98% : 0.000020s : 161: predicate.tile_eliminate 0.97% : 0.000020s : 161: predicate.transpose_eliminate 1.32% : 0.000027s : 191: predicate.tuple_list_convert_item_index_to_positive 1.36% : 0.000028s : 191: predicate.tuple_list_get_item_const_eliminator 1.26% : 0.000026s : 191: predicate.tuple_list_get_item_depend_reorder 2.33% : 0.000048s : 304: predicate.tuple_list_get_item_eliminator 1.32% : 0.000027s : 191: predicate.tuple_list_get_set_item_eliminator 2.05% : 0.000042s : 293: predicate.tuple_list_set_item_eliminator 1.24% : 0.000025s : 202: predicate.tuple_to_list_eliminator_ 2.26% : 0.000046s : 363: predicate.updatestate_pure_node_eliminater 3.02% : 0.000062s : 467: predicate.updatestate_useless_node_eliminater 0.11% : 0.000002s : 15: predicate.value_based_eliminate 0.55% : 0.000011s : 82: predicate.virtual_dataset_eliminate 0.60% : 0.000012s : 82: predicate.virtual_output_eliminate 0.12% : 0.000002s : 17: predicate.virtual_view_grad_eliminate 0.12% : 0.000002s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.010342 98 69.95% : 0.007234s : 58: func_graph_cloner_run.FuncGraphClonerGraph 2.34% : 0.000242s : 4: func_graph_cloner_run.FuncGraphClonerNode 27.71% : 0.002866s : 36: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.507563 307 0.00% : 0.000004s : 1: ForceFp32Comm 1.10% : 0.005591s : 1: add_attr 1.10% : 0.005575s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000187s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.10% : 0.000530s : 1: auto_monad 0.03% : 0.000151s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.18% : 0.000930s : 1: bootstrap 0.01% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000048s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000082s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000019s : 1: environ_conv 0.09% : 0.000454s : 1: event_method 0.00% : 0.000016s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000016s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.13% : 0.000641s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.17% : 0.000840s : 1: mutable_eliminate 0.00% : 0.000016s : 1: offloading_packed_experts 0.01% : 0.000035s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000034s : 1: opt.transform.mutable_eliminate 2.93% : 0.014863s : 181: opt.transform.opt_a 0.02% : 0.000113s : 1: opt.transform.opt_after_cconv 0.01% : 0.000074s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000417s : 28: opt.transform.opt_b 0.38% : 0.001949s : 4: opt.transform.opt_resolve 0.03% : 0.000157s : 2: opt.transform.opt_trans_graph 0.02% : 0.000112s : 4: opt.transform.symbol_engine_opt 17.37% : 0.088169s : 1: opt_a 0.05% : 0.000278s : 1: opt_after_cconv 0.13% : 0.000675s : 1: opt_after_jit_grad 0.14% : 0.000689s : 1: opt_b 18.21% : 0.092406s : 1: optimize 0.01% : 0.000041s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000058s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000015s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.00% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000007s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000082s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000076s : 1: remove_dup_value 6.36% : 0.032264s : 3: renormalize.infer 1.65% : 0.008376s : 3: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000056s : 1: rewriter_after_opt_a 0.10% : 0.000508s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000021s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000168s : 1: symbol_engine_optimizer 2.95% : 0.014997s : 1: task_emit 0.04% : 0.000196s : 1: tuple_transform 46.32% : 0.235127s : 1: type_inference 0.04% : 0.000214s : 1: validate TotalTime = 0.200062, [24] [bootstrap]: 0.00159294 [type_inference]: 0.16415 [event_method]: 2.261e-05 [auto_monad]: 0.00032476 [graph_reusing]: 6.74999e-06 [inline]: 2.72001e-06 [add_attr]: 0.00509, [1] [add_attr_with_inline]: 0.00508056, [1] [Cycle 1]: 0.00012821, [2] [tag_attr]: 8.404e-05 [meta_addattr_fg_expand]: 7.8e-06 [parallel-infer-symbol]: 3.21999e-06 [pre_auto_parallel]: 4.751e-05 [insert-virtual-dataset]: 3.31001e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 1.69e-06 [pipeline_split]: 2.58e-06 [optimize]: 0.0135965, [53] [py_interpret_to_execute]: 6.76e-06 [rewriter_before_opt_a]: 0.0001114 [opt_a]: 0.0101679, [2] [Cycle 1]: 0.0082923, [45] [expand_dump_flag]: 3.19001e-06 [switch_simplify]: 6.304e-05 [loop_unroll]: 4.246e-05 [a_1]: 0.00118001 [with_stream_mark]: 2.358e-05 [recompute_prepare]: 2.501e-05 [updatestate_depend_eliminate]: 5.338e-05 [updatestate_assign_eliminate]: 1.456e-05 [updatestate_loads_eliminate]: 1.249e-05 [parameter_eliminate]: 2.78998e-06 [a_2]: 0.00029707 [accelerated_algorithm]: 4.107e-05 [shard]: 1.85001e-06 [meta_shard_fg_expand]: 4.52e-06 [shard_inline]: 1.896e-05 [merge_send_recv]: 1.61e-05 [auto_parallel]: 1.326e-05 [parallel]: 2.234e-05 [flash_sp]: 1.166e-05 [merge_comm]: 1.064e-05 [allreduce_fusion]: 9.82001e-06 [matmul_add_comm_reduction]: 1.79e-05 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 2.093e-05 [virtual_dataset]: 1.898e-05 [get_grad_eliminate_]: 1.884e-05 [virtual_output]: 1.829e-05 [merge_forward]: 9.45001e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 1.847e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.25e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 2.941e-05 [set_forward_comm_id_for_comm_node_pass]: 1.062e-05 [meta_fg_expand]: 7.60998e-06 [flash_sp_send_recv_attached]: 4.68999e-06 [receive_attached]: 2.44999e-06 [after_resolve]: 2.671e-05 [a_after_grad]: 2.957e-05 [renormalize]: 0.00548245 [add_forward_monad_depend]: 8.33001e-06 [auto_monad_grad]: 2.49001e-06 [auto_monad_eliminator]: 5.943e-05 [cse]: 0.00016288 [a_3]: 0.00015793 [Cycle 2]: 0.0018623, [45] [expand_dump_flag]: 2.31e-06 [switch_simplify]: 2.198e-05 [loop_unroll]: 2.123e-05 [a_1]: 0.00060424 [with_stream_mark]: 2.062e-05 [recompute_prepare]: 1.923e-05 [updatestate_depend_eliminate]: 1.043e-05 [updatestate_assign_eliminate]: 1.254e-05 [updatestate_loads_eliminate]: 1.394e-05 [parameter_eliminate]: 1.57001e-06 [a_2]: 0.00029581 [accelerated_algorithm]: 2.535e-05 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 4.85001e-06 [shard_inline]: 1.855e-05 [merge_send_recv]: 1.618e-05 [auto_parallel]: 1.436e-05 [parallel]: 8.05e-06 [flash_sp]: 4.29997e-06 [merge_comm]: 9.81e-06 [allreduce_fusion]: 9.69e-06 [matmul_add_comm_reduction]: 1.869e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 2.042e-05 [virtual_dataset]: 1.977e-05 [get_grad_eliminate_]: 2.023e-05 [virtual_output]: 2.034e-05 [merge_forward]: 1.004e-05 [cell_reuse_recompute_pass]: 4.37003e-06 [offload_activation]: 1.789e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.539e-05 [merge_recompute_call_nodes]: 1.19e-06 [before_grad]: 2.97e-05 [set_forward_comm_id_for_comm_node_pass]: 1.081e-05 [meta_fg_expand]: 7.4e-06 [flash_sp_send_recv_attached]: 1.45001e-06 [receive_attached]: 2.39999e-06 [after_resolve]: 2.83e-05 [a_after_grad]: 2.902e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.82001e-06 [auto_monad_grad]: 1.62001e-06 [auto_monad_eliminator]: 3.814e-05 [cse]: 5.602e-05 [a_3]: 0.00012722 [py_interpret_to_execute_after_opt_a]: 7.17002e-06 [slice_cell_reuse_recomputed_activation]: 2.04e-06 [rewriter_after_opt_a]: 5.201e-05 [convert_after_rewriter]: 1.39e-06 [order_py_execute_after_rewriter]: 1.21002e-06 [mutable_eliminate]: 0.00077025 [opt_b]: 0.00063019, [1] [Cycle 1]: 0.00062155, [7] [b_1]: 0.00045647 [b_2]: 2.129e-05 [updatestate_depend_eliminate]: 1.304e-05 [updatestate_assign_eliminate]: 1.075e-05 [updatestate_loads_eliminate]: 1.14e-05 [renormalize]: 4.39992e-07 [cse]: 5.683e-05 [optimize_parallel_all_gather_comm]: 3.174e-05 [overlap_param_gather]: 1.89999e-06 [cconv]: 3.044e-05 [loop_unroll]: 0.00052163 [opt_after_cconv]: 0.00025699, [1] [Cycle 1]: 0.00025034, [7] [c_1]: 0.00011911 [parameter_eliminate]: 2.86e-06 [updatestate_depend_eliminate]: 1.244e-05 [updatestate_assign_eliminate]: 1.085e-05 [updatestate_loads_eliminate]: 1.13e-05 [cse]: 5.673e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 7.132e-05 [tuple_transform]: 0.0001777, [1] [Cycle 1]: 0.00017248, [4] [d_1]: 0.0001316 [none_parameter_eliminate]: 1.87001e-06 [renormalize]: 2.19996e-07 [switch_simplify]: 1.971e-05 [partial_unused_args_eliminate]: 1.93002e-06 [add_recomputation]: 0.00010796 [cse_after_recomputation]: 5.634e-05, [1] [Cycle 1]: 5.151e-05, [1] [cse]: 4.538e-05 [environ_conv]: 1.286e-05 [swap_dp_allreduce_reducescatter]: 1.347e-05 [bias_add_comm_swap]: 2.98e-06 [label_micro_interleaved_index]: 5.82001e-06 [label_fine_grained_interleaved_index]: 2.52001e-06 [merge_cast_opt]: 1.34998e-06 [slice_recompute_activation]: 2.17999e-06 [micro_interleaved_order_control]: 2.37999e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 1.05999e-06 [remove_cast_before_assign_add]: 1.21002e-06 [full_micro_interleaved_order_control]: 2.29999e-06 [reorder_send_recv_between_fp_bp]: 2.78e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 1.01997e-06 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.28002e-06 [overlap_opt_shard_in_pipeline]: 1.67001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.68002e-06 [control_data_broadcast_order]: 4.17e-05 [grouped_pairwise_exchange_alltoall]: 1.49998e-06 [offloading_packed_experts]: 1.065e-05 [overlap_recompute_and_grad_model_parallel]: 1.154e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.19003e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.41e-06 [overlap_grad_ring_attention]: 1.092e-05 [overlap_grad_flash_sp]: 4.711e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.22001e-06 [split_layernorm_comm]: 1.54e-06 [handle_group_info]: 1.17999e-06 [symbol_engine_optimizer]: 0.00015688, [1] [Cycle 1]: 0.00015219, [6] [build]: 5.89e-06 [elim_shapecalc]: 2.429e-05 [elim_not_effective]: 3.862e-05 [opt_reshape]: 2.122e-05 [fold_const_symbol]: 3.182e-05 [renormalize]: 1.69995e-07 [detach_backward]: 2.31e-06 [pipeline_parallel_scheduler]: 1.65001e-06 [auto_monad_reorder]: 0.00012954 [get_jit_bprop_graph]: 2.21e-06 [rewriter_after_jit_bprop_graph]: 4.93001e-06 [opt_after_jit_grad]: 0.00059861 [validate]: 9.219e-05 [backend_pass]: 9.79984e-07 [task_emit]: 0.0141108 [execute]: 7.65e-06 Sums bootstrap : 0.001593s : 0.82% type_inference : 0.164150s : 84.67% event_method : 0.000023s : 0.01% auto_monad : 0.000325s : 0.17% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000084s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000048s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000003s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.00% optimize.rewriter_before_opt_a : 0.000111s : 0.06% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000085s : 0.04% optimize.opt_a.loop_unroll : 0.000064s : 0.03% optimize.opt_a.a_1 : 0.001784s : 0.92% optimize.opt_a.with_stream_mark : 0.000044s : 0.02% optimize.opt_a.recompute_prepare : 0.000044s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000064s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000027s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000026s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000593s : 0.31% optimize.opt_a.accelerated_algorithm : 0.000066s : 0.03% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000009s : 0.00% optimize.opt_a.shard_inline : 0.000038s : 0.02% optimize.opt_a.merge_send_recv : 0.000032s : 0.02% optimize.opt_a.auto_parallel : 0.000028s : 0.01% optimize.opt_a.parallel : 0.000030s : 0.02% optimize.opt_a.flash_sp : 0.000016s : 0.01% optimize.opt_a.merge_comm : 0.000020s : 0.01% optimize.opt_a.allreduce_fusion : 0.000020s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000037s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000041s : 0.02% optimize.opt_a.virtual_dataset : 0.000039s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000039s : 0.02% optimize.opt_a.virtual_output : 0.000039s : 0.02% optimize.opt_a.merge_forward : 0.000019s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000036s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000068s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000059s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000021s : 0.01% optimize.opt_a.meta_fg_expand : 0.000015s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000055s : 0.03% optimize.opt_a.a_after_grad : 0.000059s : 0.03% optimize.opt_a.renormalize : 0.005483s : 2.83% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000098s : 0.05% optimize.opt_a.cse : 0.000219s : 0.11% optimize.opt_a.a_3 : 0.000285s : 0.15% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000052s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000770s : 0.40% optimize.opt_b.b_1 : 0.000456s : 0.24% optimize.opt_b.b_2 : 0.000021s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000011s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000057s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000032s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000030s : 0.02% optimize.loop_unroll : 0.000522s : 0.27% optimize.opt_after_cconv.c_1 : 0.000119s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000011s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000011s : 0.01% optimize.opt_after_cconv.cse : 0.000057s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000071s : 0.04% optimize.tuple_transform.d_1 : 0.000132s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000020s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000108s : 0.06% optimize.cse_after_recomputation.cse : 0.000045s : 0.02% optimize.environ_conv : 0.000013s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000013s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000042s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000011s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000011s : 0.01% optimize.overlap_grad_flash_sp : 0.000047s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000006s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000024s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000039s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000021s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000032s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000130s : 0.07% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000599s : 0.31% validate : 0.000092s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.014111s : 7.28% execute : 0.000008s : 0.00% Time group info: ------[substitution.] 0.000460 168 16.87% : 0.000078s : 4: substitution.arithmetic_simplify 2.05% : 0.000009s : 2: substitution.depend_value_elim 1.22% : 0.000006s : 10: substitution.elim_not_effective 1.06% : 0.000005s : 10: substitution.fold_const_symbol 2.78% : 0.000013s : 16: substitution.graph_param_transform 48.14% : 0.000222s : 4: substitution.inline 2.09% : 0.000010s : 20: substitution.j_node_and_user_rematch 4.06% : 0.000019s : 2: substitution.less_batch_normalization 1.86% : 0.000009s : 12: substitution.load_eliminater 3.37% : 0.000016s : 20: substitution.remove_not_recompute_node 1.89% : 0.000009s : 10: substitution.replace_old_param 7.04% : 0.000032s : 26: substitution.updatestate_pure_node_eliminater 7.56% : 0.000035s : 32: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.164022 2 97.56% : 0.160018s : 1: type_inference.infer 2.44% : 0.004004s : 1: type_inference.specialize ------[replace.] 0.000050 4 100.00% : 0.000050s : 4: replace.inline ------[match.] 0.000219 4 100.00% : 0.000219s : 4: match.inline ------[predicate.] 0.000575 4085 0.98% : 0.000006s : 40: predicate.accumulaten_eliminater 0.89% : 0.000005s : 18: predicate.ad_related_special_op_eliminate 0.72% : 0.000004s : 32: predicate.addn_check_dump 0.97% : 0.000006s : 40: predicate.addn_zero_filter 0.84% : 0.000005s : 40: predicate.adjust_all_reduce_mul_add 2.31% : 0.000013s : 72: predicate.arithmetic_simplify 1.04% : 0.000006s : 40: predicate.cast_eliminate 0.79% : 0.000005s : 32: predicate.check_bprop_eliminate 0.68% : 0.000004s : 32: predicate.compare_switch_simplify 0.23% : 0.000001s : 16: predicate.const_output_eliminate 0.81% : 0.000005s : 32: predicate.depend_value_elim 0.95% : 0.000005s : 40: predicate.dict_get_item_const_eliminator 1.19% : 0.000007s : 40: predicate.dict_get_item_eliminator 0.89% : 0.000005s : 40: predicate.dict_set_item_eliminator 1.07% : 0.000006s : 34: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 16: predicate.elim_not_effective 0.43% : 0.000002s : 16: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000007s : 56: predicate.environ_add_const_eliminate 1.21% : 0.000007s : 56: predicate.environ_get_add_eliminate 1.19% : 0.000007s : 56: predicate.environ_get_depend_swap 2.02% : 0.000012s : 88: predicate.environ_get_eliminate 1.30% : 0.000007s : 56: predicate.environ_get_set_eliminate 1.00% : 0.000006s : 44: predicate.exchange_switch_depend_value 1.57% : 0.000009s : 44: predicate.float_depend_g_call 0.70% : 0.000004s : 32: predicate.float_environ_get_switch 1.05% : 0.000006s : 48: predicate.float_tuple_getitem_switch 0.22% : 0.000001s : 16: predicate.fold_const_symbol 0.87% : 0.000005s : 32: predicate.get_grad_eliminate 0.30% : 0.000002s : 16: predicate.graph_param_transform 0.72% : 0.000004s : 32: predicate.incorporate_call 0.68% : 0.000004s : 32: predicate.incorporate_call_switch 5.22% : 0.000030s : 180: predicate.inline 0.88% : 0.000005s : 32: predicate.inline_without_move 0.41% : 0.000002s : 32: predicate.j_node_and_user_rematch 1.11% : 0.000006s : 35: predicate.less_batch_normalization 1.67% : 0.000010s : 72: predicate.list_to_tuple_eliminator_ 2.51% : 0.000014s : 112: predicate.load_eliminater 0.76% : 0.000004s : 16: predicate.loop_unroll_after_grad 1.45% : 0.000008s : 61: predicate.loop_unroll_before_grad 1.68% : 0.000010s : 72: predicate.make_slice_get_slice_eliminator 0.81% : 0.000005s : 32: predicate.merge_addn 0.70% : 0.000004s : 32: predicate.micro_step_allgather_replace 0.73% : 0.000004s : 32: predicate.mini_step_allgather_replace 0.83% : 0.000005s : 40: predicate.minmaximum_grad 0.90% : 0.000005s : 16: predicate.mutable_eliminate 0.42% : 0.000002s : 16: predicate.opt_reshape 0.44% : 0.000003s : 16: predicate.parallel_virtual_node 1.22% : 0.000007s : 44: predicate.partial_defer_inline 1.37% : 0.000008s : 56: predicate.partial_eliminate 0.97% : 0.000006s : 40: predicate.print_const_string_wrapper 0.79% : 0.000005s : 32: predicate.reduce_all_const_elim 1.12% : 0.000006s : 40: predicate.reduce_eliminate 2.48% : 0.000014s : 112: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000003s : 32: predicate.remove_not_recompute_node 1.24% : 0.000007s : 72: predicate.replace_applicator 0.54% : 0.000003s : 32: predicate.replace_old_param 0.25% : 0.000001s : 16: predicate.reset_defer_inline 0.89% : 0.000005s : 40: predicate.reshape_eliminate 0.76% : 0.000004s : 32: predicate.row_tensor_add_zeros_like 0.44% : 0.000003s : 16: predicate.row_tensor_eliminate 0.99% : 0.000006s : 32: predicate.same_eliminate 0.53% : 0.000003s : 36: predicate.set_cell_output_no_recompute 0.91% : 0.000005s : 32: predicate.shard_identity_eliminate 0.92% : 0.000005s : 34: predicate.special_op_eliminate 0.87% : 0.000005s : 32: predicate.specialize_transform 0.96% : 0.000006s : 32: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000005s : 32: predicate.stack_unstack_eliminate 0.40% : 0.000002s : 16: predicate.switch_call_monad_eliminater 1.12% : 0.000006s : 44: predicate.switch_defer_inline 1.96% : 0.000011s : 76: predicate.switch_layer_defer_inline 3.82% : 0.000022s : 153: predicate.switch_simplify 0.88% : 0.000005s : 40: predicate.tile_eliminate 0.85% : 0.000005s : 40: predicate.transpose_eliminate 1.68% : 0.000010s : 72: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000010s : 72: predicate.tuple_list_get_item_const_eliminator 1.62% : 0.000009s : 72: predicate.tuple_list_get_item_depend_reorder 2.74% : 0.000016s : 104: predicate.tuple_list_get_item_eliminator 1.64% : 0.000009s : 72: predicate.tuple_list_get_set_item_eliminator 2.68% : 0.000015s : 104: predicate.tuple_list_set_item_eliminator 1.73% : 0.000010s : 72: predicate.tuple_to_list_eliminator_ 2.54% : 0.000015s : 112: predicate.updatestate_pure_node_eliminater 3.43% : 0.000020s : 144: predicate.updatestate_useless_node_eliminater 0.39% : 0.000002s : 16: predicate.value_based_eliminate 0.81% : 0.000005s : 32: predicate.virtual_dataset_eliminate 0.86% : 0.000005s : 32: predicate.virtual_output_eliminate 0.43% : 0.000002s : 18: predicate.virtual_view_grad_eliminate 0.45% : 0.000003s : 16: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004177 26 63.94% : 0.002671s : 20: func_graph_cloner_run.FuncGraphClonerGraph 36.06% : 0.001506s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.228330 196 0.00% : 0.000004s : 1: ForceFp32Comm 2.23% : 0.005096s : 1: add_attr 2.23% : 0.005084s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000112s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.15% : 0.000337s : 1: auto_monad 0.06% : 0.000137s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.71% : 0.001619s : 1: bootstrap 0.01% : 0.000034s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000045s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000059s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000017s : 1: environ_conv 0.01% : 0.000028s : 1: event_method 0.01% : 0.000014s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.23% : 0.000532s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.34% : 0.000781s : 1: mutable_eliminate 0.01% : 0.000014s : 1: offloading_packed_experts 0.01% : 0.000030s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000033s : 1: opt.transform.mutable_eliminate 1.42% : 0.003250s : 78: opt.transform.opt_a 0.05% : 0.000117s : 1: opt.transform.opt_after_cconv 0.03% : 0.000075s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000443s : 28: opt.transform.opt_b 0.07% : 0.000149s : 2: opt.transform.opt_trans_graph 0.05% : 0.000112s : 4: opt.transform.symbol_engine_opt 4.46% : 0.010172s : 1: opt_a 0.11% : 0.000261s : 1: opt_after_cconv 0.27% : 0.000609s : 1: opt_after_jit_grad 0.28% : 0.000635s : 1: opt_b 5.96% : 0.013603s : 1: optimize 0.02% : 0.000036s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000051s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000014s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000052s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000076s : 1: remove_dup_value 1.57% : 0.003590s : 1: renormalize.infer 0.82% : 0.001879s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000055s : 1: rewriter_after_opt_a 0.05% : 0.000116s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000017s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000160s : 1: symbol_engine_optimizer 6.19% : 0.014126s : 1: task_emit 0.08% : 0.000181s : 1: tuple_transform 71.90% : 0.164171s : 1: type_inference 0.07% : 0.000151s : 1: validate TotalTime = 0.273521, [24] [bootstrap]: 0.00089471 [type_inference]: 0.191008 [event_method]: 0.00040203 [auto_monad]: 0.00037868 [graph_reusing]: 8.62e-06 [inline]: 2.99001e-06 [add_attr]: 0.00355903, [1] [add_attr_with_inline]: 0.00355064, [1] [Cycle 1]: 0.00010167, [2] [tag_attr]: 6.142e-05 [meta_addattr_fg_expand]: 1.418e-05 [parallel-infer-symbol]: 3.93001e-06 [pre_auto_parallel]: 7.198e-05 [insert-virtual-dataset]: 2.99999e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 1.83997e-06 [pipeline_split]: 2.51e-06 [optimize]: 0.0629638, [53] [py_interpret_to_execute]: 4.92e-06 [rewriter_before_opt_a]: 0.00045858 [opt_a]: 0.0593603, [4] [Cycle 1]: 0.0456911, [45] [expand_dump_flag]: 4.15e-06 [switch_simplify]: 0.00017511 [loop_unroll]: 9.422e-05 [a_1]: 0.00279658 [with_stream_mark]: 3.382e-05 [recompute_prepare]: 4.405e-05 [updatestate_depend_eliminate]: 0.00011133 [updatestate_assign_eliminate]: 2.242e-05 [updatestate_loads_eliminate]: 1.805e-05 [parameter_eliminate]: 2.74001e-06 [a_2]: 0.00053573 [accelerated_algorithm]: 5.602e-05 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 8.43001e-06 [shard_inline]: 3.187e-05 [merge_send_recv]: 3.92e-05 [auto_parallel]: 2.345e-05 [parallel]: 2.281e-05 [flash_sp]: 1.524e-05 [merge_comm]: 2.03e-05 [allreduce_fusion]: 1.98e-05 [matmul_add_comm_reduction]: 3.588e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 3.507e-05 [virtual_dataset]: 3.138e-05 [get_grad_eliminate_]: 3.348e-05 [virtual_output]: 3.114e-05 [merge_forward]: 2.144e-05 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 2.994e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.604e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 5.153e-05 [set_forward_comm_id_for_comm_node_pass]: 2.394e-05 [meta_fg_expand]: 0.00955179 [flash_sp_send_recv_attached]: 9.04e-06 [receive_attached]: 2.47001e-06 [after_resolve]: 0.00020783 [a_after_grad]: 0.00026584 [renormalize]: 0.0282004 [add_forward_monad_depend]: 2.454e-05 [auto_monad_grad]: 1.912e-05 [auto_monad_eliminator]: 0.00019712 [cse]: 0.00049138 [a_3]: 0.0019354 [Cycle 2]: 0.00927584, [45] [expand_dump_flag]: 3.66999e-06 [switch_simplify]: 0.00012402 [loop_unroll]: 0.00011995 [a_1]: 0.00373929 [with_stream_mark]: 3.575e-05 [recompute_prepare]: 3.699e-05 [updatestate_depend_eliminate]: 2.892e-05 [updatestate_assign_eliminate]: 2.158e-05 [updatestate_loads_eliminate]: 2.085e-05 [parameter_eliminate]: 4.21001e-06 [a_2]: 0.00083508 [accelerated_algorithm]: 3.166e-05 [shard]: 2.52001e-06 [meta_shard_fg_expand]: 6.34999e-06 [shard_inline]: 2.422e-05 [merge_send_recv]: 2.104e-05 [auto_parallel]: 1.946e-05 [parallel]: 8.87e-06 [flash_sp]: 4.33999e-06 [merge_comm]: 1.35e-05 [allreduce_fusion]: 1.23e-05 [matmul_add_comm_reduction]: 2.115e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 2.463e-05 [virtual_dataset]: 2.324e-05 [get_grad_eliminate_]: 2.233e-05 [virtual_output]: 2.34e-05 [merge_forward]: 1.363e-05 [cell_reuse_recompute_pass]: 1.79e-06 [offload_activation]: 2.512e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.36e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 3.835e-05 [set_forward_comm_id_for_comm_node_pass]: 1.615e-05 [meta_fg_expand]: 0.00014978 [flash_sp_send_recv_attached]: 2.32001e-06 [receive_attached]: 3.00002e-06 [after_resolve]: 3.419e-05 [a_after_grad]: 3.674e-05 [renormalize]: 0.00304172 [add_forward_monad_depend]: 8.32998e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 6.237e-05 [cse]: 0.00011341 [a_3]: 0.00016653 [Cycle 3]: 0.00269474, [45] [expand_dump_flag]: 1.29e-06 [switch_simplify]: 2.361e-05 [loop_unroll]: 2.256e-05 [a_1]: 0.00064026 [with_stream_mark]: 1.861e-05 [recompute_prepare]: 2.209e-05 [updatestate_depend_eliminate]: 4.307e-05 [updatestate_assign_eliminate]: 1.191e-05 [updatestate_loads_eliminate]: 1.311e-05 [parameter_eliminate]: 1.44e-06 [a_2]: 0.00026867 [accelerated_algorithm]: 2.198e-05 [shard]: 1.35999e-06 [meta_shard_fg_expand]: 4.04002e-06 [shard_inline]: 1.746e-05 [merge_send_recv]: 1.431e-05 [auto_parallel]: 1.271e-05 [parallel]: 6.09001e-06 [flash_sp]: 1.60001e-06 [merge_comm]: 9.88002e-06 [allreduce_fusion]: 9.71e-06 [matmul_add_comm_reduction]: 1.591e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 1.797e-05 [virtual_dataset]: 1.712e-05 [get_grad_eliminate_]: 1.696e-05 [virtual_output]: 1.705e-05 [merge_forward]: 8.23999e-06 [cell_reuse_recompute_pass]: 2.07001e-06 [offload_activation]: 1.612e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.146e-05 [merge_recompute_call_nodes]: 1.17e-06 [before_grad]: 2.77e-05 [set_forward_comm_id_for_comm_node_pass]: 1.005e-05 [meta_fg_expand]: 6.58e-06 [flash_sp_send_recv_attached]: 1.43002e-06 [receive_attached]: 1.81e-06 [after_resolve]: 2.418e-05 [a_after_grad]: 2.835e-05 [renormalize]: 0.00080265 [add_forward_monad_depend]: 5.74999e-06 [auto_monad_grad]: 1.19e-06 [auto_monad_eliminator]: 3.887e-05 [cse]: 7.878e-05 [a_3]: 0.00012562 [Cycle 4]: 0.00167856, [45] [expand_dump_flag]: 1.11997e-06 [switch_simplify]: 1.92e-05 [loop_unroll]: 1.774e-05 [a_1]: 0.00056433 [with_stream_mark]: 1.507e-05 [recompute_prepare]: 1.76e-05 [updatestate_depend_eliminate]: 9.76998e-06 [updatestate_assign_eliminate]: 1.048e-05 [updatestate_loads_eliminate]: 1.084e-05 [parameter_eliminate]: 1.15001e-06 [a_2]: 0.00026999 [accelerated_algorithm]: 2.045e-05 [shard]: 1.05999e-06 [meta_shard_fg_expand]: 3.47002e-06 [shard_inline]: 1.692e-05 [merge_send_recv]: 1.17e-05 [auto_parallel]: 1.174e-05 [parallel]: 4.63999e-06 [flash_sp]: 8.70001e-07 [merge_comm]: 9.56e-06 [allreduce_fusion]: 9.25999e-06 [matmul_add_comm_reduction]: 1.413e-05 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 1.997e-05 [virtual_dataset]: 1.718e-05 [get_grad_eliminate_]: 1.684e-05 [virtual_output]: 1.682e-05 [merge_forward]: 8.10999e-06 [cell_reuse_recompute_pass]: 1.62999e-06 [offload_activation]: 1.408e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.1e-05 [merge_recompute_call_nodes]: 8.59989e-07 [before_grad]: 2.765e-05 [set_forward_comm_id_for_comm_node_pass]: 9.86003e-06 [meta_fg_expand]: 6.14001e-06 [flash_sp_send_recv_attached]: 9.29984e-07 [receive_attached]: 1.10001e-06 [after_resolve]: 2.291e-05 [a_after_grad]: 2.702e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.44e-06 [auto_monad_grad]: 1.14e-06 [auto_monad_eliminator]: 3.265e-05 [cse]: 5.156e-05 [a_3]: 0.00011806 [py_interpret_to_execute_after_opt_a]: 5.05001e-06 [slice_cell_reuse_recomputed_activation]: 1.96e-06 [rewriter_after_opt_a]: 4.647e-05 [convert_after_rewriter]: 1.00999e-06 [order_py_execute_after_rewriter]: 1.15001e-06 [mutable_eliminate]: 0.00068304 [opt_b]: 0.00057249, [1] [Cycle 1]: 0.00056538, [7] [b_1]: 0.00041694 [b_2]: 1.988e-05 [updatestate_depend_eliminate]: 1.219e-05 [updatestate_assign_eliminate]: 1.069e-05 [updatestate_loads_eliminate]: 1.079e-05 [renormalize]: 4.59986e-07 [cse]: 5.718e-05 [optimize_parallel_all_gather_comm]: 2.82e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 1.877e-05 [loop_unroll]: 0.00047921 [opt_after_cconv]: 0.00025139, [1] [Cycle 1]: 0.00024459, [7] [c_1]: 0.0001145 [parameter_eliminate]: 3.72002e-06 [updatestate_depend_eliminate]: 1.428e-05 [updatestate_assign_eliminate]: 1.071e-05 [updatestate_loads_eliminate]: 1.096e-05 [cse]: 5.488e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 5.814e-05 [tuple_transform]: 0.00017936, [1] [Cycle 1]: 0.00017372, [4] [d_1]: 0.00012995 [none_parameter_eliminate]: 1.25999e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 2.199e-05 [partial_unused_args_eliminate]: 1.54998e-06 [add_recomputation]: 0.00010917 [cse_after_recomputation]: 6.372e-05, [1] [Cycle 1]: 5.883e-05, [1] [cse]: 5.221e-05 [environ_conv]: 1.049e-05 [swap_dp_allreduce_reducescatter]: 1.59e-05 [bias_add_comm_swap]: 3.81001e-06 [label_micro_interleaved_index]: 5.02e-06 [label_fine_grained_interleaved_index]: 2.53e-06 [merge_cast_opt]: 1.66002e-06 [slice_recompute_activation]: 2.21e-06 [micro_interleaved_order_control]: 2.25002e-06 [assign_add_opt]: 1.25999e-06 [ForceFp32Comm]: 8.30012e-07 [remove_cast_before_assign_add]: 1.20999e-06 [full_micro_interleaved_order_control]: 2.33002e-06 [reorder_send_recv_between_fp_bp]: 2.86999e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.02998e-06 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.37e-06 [overlap_opt_shard_in_pipeline]: 1.79e-06 [overlap_opt_shard_grad_in_pipeline]: 1.68002e-06 [control_data_broadcast_order]: 8.13e-05 [grouped_pairwise_exchange_alltoall]: 1.91e-06 [offloading_packed_experts]: 1.047e-05 [overlap_recompute_and_grad_model_parallel]: 1.127e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.64998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.38002e-06 [overlap_recompute_comm]: 2.55002e-06 [overlap_grad_ring_attention]: 1.103e-05 [overlap_grad_flash_sp]: 4.717e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.00002e-06 [split_layernorm_comm]: 1.53002e-06 [handle_group_info]: 1.10001e-06 [symbol_engine_optimizer]: 0.00016168, [1] [Cycle 1]: 0.00015663, [6] [build]: 4.33999e-06 [elim_shapecalc]: 2.551e-05 [elim_not_effective]: 3.862e-05 [opt_reshape]: 2.214e-05 [fold_const_symbol]: 3.381e-05 [renormalize]: 1.50001e-07 [detach_backward]: 1.87999e-06 [pipeline_parallel_scheduler]: 1.57001e-06 [auto_monad_reorder]: 0.0001288 [get_jit_bprop_graph]: 1.52999e-06 [rewriter_after_jit_bprop_graph]: 4.08001e-06 [opt_after_jit_grad]: 0.00056816 [validate]: 8.762e-05 [backend_pass]: 9.00007e-07 [task_emit]: 0.0131615 [execute]: 7.17002e-06 Sums bootstrap : 0.000895s : 0.33% type_inference : 0.191008s : 71.20% event_method : 0.000402s : 0.15% auto_monad : 0.000379s : 0.14% graph_reusing : 0.000009s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000061s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000072s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000003s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000459s : 0.17% optimize.opt_a.expand_dump_flag : 0.000010s : 0.00% optimize.opt_a.switch_simplify : 0.000342s : 0.13% optimize.opt_a.loop_unroll : 0.000254s : 0.09% optimize.opt_a.a_1 : 0.007740s : 2.89% optimize.opt_a.with_stream_mark : 0.000103s : 0.04% optimize.opt_a.recompute_prepare : 0.000121s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000193s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000066s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000063s : 0.02% optimize.opt_a.parameter_eliminate : 0.000010s : 0.00% optimize.opt_a.a_2 : 0.001909s : 0.71% optimize.opt_a.accelerated_algorithm : 0.000130s : 0.05% optimize.opt_a.shard : 0.000007s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000022s : 0.01% optimize.opt_a.shard_inline : 0.000090s : 0.03% optimize.opt_a.merge_send_recv : 0.000086s : 0.03% optimize.opt_a.auto_parallel : 0.000067s : 0.03% optimize.opt_a.parallel : 0.000042s : 0.02% optimize.opt_a.flash_sp : 0.000022s : 0.01% optimize.opt_a.merge_comm : 0.000053s : 0.02% optimize.opt_a.allreduce_fusion : 0.000051s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000087s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000098s : 0.04% optimize.opt_a.virtual_dataset : 0.000089s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000090s : 0.03% optimize.opt_a.virtual_output : 0.000088s : 0.03% optimize.opt_a.merge_forward : 0.000051s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% optimize.opt_a.offload_activation : 0.000085s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000162s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000145s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000060s : 0.02% optimize.opt_a.meta_fg_expand : 0.009714s : 3.62% optimize.opt_a.flash_sp_send_recv_attached : 0.000014s : 0.01% optimize.opt_a.receive_attached : 0.000008s : 0.00% optimize.opt_a.after_resolve : 0.000289s : 0.11% optimize.opt_a.a_after_grad : 0.000358s : 0.13% optimize.opt_a.renormalize : 0.032045s : 11.94% optimize.opt_a.add_forward_monad_depend : 0.000040s : 0.01% optimize.opt_a.auto_monad_grad : 0.000023s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000331s : 0.12% optimize.opt_a.cse : 0.000735s : 0.27% optimize.opt_a.a_3 : 0.002346s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000046s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000683s : 0.25% optimize.opt_b.b_1 : 0.000417s : 0.16% optimize.opt_b.b_2 : 0.000020s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000011s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000057s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000019s : 0.01% optimize.loop_unroll : 0.000479s : 0.18% optimize.opt_after_cconv.c_1 : 0.000114s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000011s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_after_cconv.cse : 0.000055s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000058s : 0.02% optimize.tuple_transform.d_1 : 0.000130s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000022s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000109s : 0.04% optimize.cse_after_recomputation.cse : 0.000052s : 0.02% optimize.environ_conv : 0.000010s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000016s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000081s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000010s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000011s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000011s : 0.00% optimize.overlap_grad_flash_sp : 0.000047s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000026s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000039s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000022s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000034s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000129s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000568s : 0.21% validate : 0.000088s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.013162s : 4.91% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.003449 564 4.40% : 0.000152s : 9: substitution.arithmetic_simplify 0.57% : 0.000020s : 8: substitution.depend_value_elim 0.16% : 0.000006s : 10: substitution.elim_not_effective 0.30% : 0.000010s : 13: substitution.float_depend_g_call 0.11% : 0.000004s : 2: substitution.float_tuple_getitem_switch 0.13% : 0.000005s : 10: substitution.fold_const_symbol 42.98% : 0.001482s : 8: substitution.getattr_setattr_resolve 0.41% : 0.000014s : 15: substitution.graph_param_transform 0.08% : 0.000003s : 2: substitution.incorporate_call 0.08% : 0.000003s : 2: substitution.incorporate_call_switch 32.44% : 0.001119s : 26: substitution.inline 1.20% : 0.000041s : 5: substitution.inline_without_move 0.71% : 0.000025s : 51: substitution.j_node_and_user_rematch 0.62% : 0.000022s : 4: substitution.less_batch_normalization 0.36% : 0.000013s : 20: substitution.load_eliminater 0.44% : 0.000015s : 11: substitution.minmaximum_grad 0.32% : 0.000011s : 13: substitution.partial_eliminate 1.03% : 0.000036s : 51: substitution.remove_not_recompute_node 2.56% : 0.000088s : 35: substitution.replace_applicator 0.91% : 0.000031s : 53: substitution.replace_old_param 0.14% : 0.000005s : 2: substitution.set_cell_output_no_recompute 0.35% : 0.000012s : 3: substitution.switch_simplify 1.21% : 0.000042s : 11: substitution.tuple_list_convert_item_index_to_positive 0.46% : 0.000016s : 11: substitution.tuple_list_get_item_const_eliminator 0.60% : 0.000021s : 11: substitution.tuple_list_get_item_depend_reorder 1.83% : 0.000063s : 24: substitution.tuple_list_get_item_eliminator 0.63% : 0.000022s : 11: substitution.tuple_list_get_set_item_eliminator 1.93% : 0.000067s : 62: substitution.updatestate_pure_node_eliminater 3.02% : 0.000104s : 81: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.190845 2 96.10% : 0.183398s : 1: type_inference.infer 3.90% : 0.007448s : 1: type_inference.specialize ------[replace.] 0.000821 49 12.33% : 0.000101s : 6: replace.getattr_setattr_resolve 44.38% : 0.000364s : 26: replace.inline 6.47% : 0.000053s : 2: replace.replace_applicator 5.07% : 0.000042s : 3: replace.switch_simplify 25.82% : 0.000212s : 11: replace.tuple_list_get_item_eliminator 5.94% : 0.000049s : 1: replace.updatestate_useless_node_eliminater ------[match.] 0.002547 49 54.45% : 0.001387s : 6: match.getattr_setattr_resolve 43.27% : 0.001102s : 26: match.inline 0.73% : 0.000019s : 2: match.replace_applicator 0.37% : 0.000010s : 3: match.switch_simplify 0.86% : 0.000022s : 11: match.tuple_list_get_item_eliminator 0.32% : 0.000008s : 1: match.updatestate_useless_node_eliminater ------[predicate.] 0.001965 14794 1.01% : 0.000020s : 161: predicate.accumulaten_eliminater 0.27% : 0.000005s : 17: predicate.ad_related_special_op_eliminate 0.69% : 0.000014s : 107: predicate.addn_check_dump 1.03% : 0.000020s : 161: predicate.addn_zero_filter 0.95% : 0.000019s : 161: predicate.adjust_all_reduce_mul_add 2.20% : 0.000043s : 263: predicate.arithmetic_simplify 1.01% : 0.000020s : 161: predicate.cast_eliminate 2.03% : 0.000040s : 317: predicate.check_bprop_eliminate 0.69% : 0.000014s : 107: predicate.compare_switch_simplify 0.06% : 0.000001s : 15: predicate.const_output_eliminate 0.68% : 0.000013s : 102: predicate.depend_value_elim 1.09% : 0.000021s : 161: predicate.dict_get_item_const_eliminator 1.21% : 0.000024s : 161: predicate.dict_get_item_eliminator 0.99% : 0.000019s : 161: predicate.dict_set_item_eliminator 0.27% : 0.000005s : 32: predicate.dumpgradient_eliminate 0.08% : 0.000001s : 15: predicate.elim_not_effective 0.12% : 0.000002s : 15: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000021s : 176: predicate.environ_add_const_eliminate 1.07% : 0.000021s : 176: predicate.environ_get_add_eliminate 1.05% : 0.000021s : 176: predicate.environ_get_depend_swap 1.77% : 0.000035s : 278: predicate.environ_get_eliminate 1.05% : 0.000021s : 176: predicate.environ_get_set_eliminate 1.28% : 0.000025s : 198: predicate.exchange_switch_depend_value 1.73% : 0.000034s : 198: predicate.float_depend_g_call 0.69% : 0.000014s : 107: predicate.float_environ_get_switch 0.79% : 0.000015s : 122: predicate.float_tuple_getitem_switch 0.05% : 0.000001s : 15: predicate.fold_const_symbol 0.57% : 0.000011s : 82: predicate.get_grad_eliminate 0.58% : 0.000011s : 40: predicate.getattr_setattr_resolve 0.07% : 0.000001s : 15: predicate.graph_param_transform 0.68% : 0.000013s : 102: predicate.incorporate_call 0.63% : 0.000012s : 102: predicate.incorporate_call_switch 4.91% : 0.000096s : 578: predicate.inline 1.62% : 0.000032s : 186: predicate.inline_without_move 0.30% : 0.000006s : 82: predicate.j_node_and_user_rematch 0.68% : 0.000013s : 85: predicate.less_batch_normalization 1.34% : 0.000026s : 202: predicate.list_to_tuple_eliminator_ 2.32% : 0.000046s : 363: predicate.load_eliminater 0.21% : 0.000004s : 15: predicate.loop_unroll_after_grad 1.86% : 0.000037s : 273: predicate.loop_unroll_before_grad 1.21% : 0.000024s : 191: predicate.make_slice_get_slice_eliminator 0.73% : 0.000014s : 107: predicate.merge_addn 2.01% : 0.000039s : 309: predicate.micro_step_allgather_replace 2.00% : 0.000039s : 309: predicate.mini_step_allgather_replace 0.98% : 0.000019s : 161: predicate.minmaximum_grad 0.22% : 0.000004s : 15: predicate.mutable_eliminate 0.12% : 0.000002s : 15: predicate.opt_reshape 0.12% : 0.000002s : 15: predicate.parallel_virtual_node 1.71% : 0.000034s : 198: predicate.partial_defer_inline 1.37% : 0.000027s : 187: predicate.partial_eliminate 0.99% : 0.000019s : 161: predicate.print_const_string_wrapper 0.65% : 0.000013s : 102: predicate.reduce_all_const_elim 1.22% : 0.000024s : 161: predicate.reduce_eliminate 2.31% : 0.000045s : 363: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000006s : 82: predicate.remove_not_recompute_node 2.26% : 0.000044s : 485: predicate.replace_applicator 0.73% : 0.000014s : 186: predicate.replace_old_param 0.07% : 0.000001s : 15: predicate.reset_defer_inline 1.01% : 0.000020s : 161: predicate.reshape_eliminate 2.04% : 0.000040s : 309: predicate.row_tensor_add_zeros_like 0.13% : 0.000002s : 15: predicate.row_tensor_eliminate 2.18% : 0.000043s : 317: predicate.same_eliminate 0.41% : 0.000008s : 97: predicate.set_cell_output_no_recompute 0.62% : 0.000012s : 82: predicate.shard_identity_eliminate 0.27% : 0.000005s : 32: predicate.special_op_eliminate 0.78% : 0.000015s : 107: predicate.specialize_transform 2.05% : 0.000040s : 309: predicate.split_environ_get_set_with_tuple_value 1.46% : 0.000029s : 186: predicate.stack_unstack_eliminate 0.12% : 0.000002s : 15: predicate.switch_call_monad_eliminater 1.39% : 0.000027s : 198: predicate.switch_defer_inline 3.39% : 0.000067s : 515: predicate.switch_layer_defer_inline 4.23% : 0.000083s : 599: predicate.switch_simplify 1.08% : 0.000021s : 161: predicate.tile_eliminate 0.98% : 0.000019s : 161: predicate.transpose_eliminate 1.28% : 0.000025s : 191: predicate.tuple_list_convert_item_index_to_positive 1.38% : 0.000027s : 191: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000026s : 191: predicate.tuple_list_get_item_depend_reorder 2.34% : 0.000046s : 304: predicate.tuple_list_get_item_eliminator 1.39% : 0.000027s : 191: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000043s : 293: predicate.tuple_list_set_item_eliminator 1.29% : 0.000025s : 202: predicate.tuple_to_list_eliminator_ 2.29% : 0.000045s : 363: predicate.updatestate_pure_node_eliminater 3.20% : 0.000063s : 467: predicate.updatestate_useless_node_eliminater 0.11% : 0.000002s : 15: predicate.value_based_eliminate 0.56% : 0.000011s : 82: predicate.virtual_dataset_eliminate 0.56% : 0.000011s : 82: predicate.virtual_output_eliminate 0.12% : 0.000002s : 17: predicate.virtual_view_grad_eliminate 0.12% : 0.000002s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.009203 98 68.34% : 0.006290s : 58: func_graph_cloner_run.FuncGraphClonerGraph 2.34% : 0.000216s : 4: func_graph_cloner_run.FuncGraphClonerNode 29.31% : 0.002698s : 36: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.388582 307 0.00% : 0.000004s : 1: ForceFp32Comm 0.92% : 0.003564s : 1: add_attr 0.91% : 0.003554s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000114s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.10% : 0.000391s : 1: auto_monad 0.03% : 0.000136s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.24% : 0.000919s : 1: bootstrap 0.01% : 0.000022s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000086s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000067s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000014s : 1: environ_conv 0.11% : 0.000415s : 1: event_method 0.00% : 0.000013s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.13% : 0.000489s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.18% : 0.000693s : 1: mutable_eliminate 0.00% : 0.000014s : 1: offloading_packed_experts 0.01% : 0.000030s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000029s : 1: opt.transform.mutable_eliminate 3.61% : 0.014019s : 181: opt.transform.opt_a 0.03% : 0.000113s : 1: opt.transform.opt_after_cconv 0.02% : 0.000072s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000406s : 28: opt.transform.opt_b 0.43% : 0.001688s : 4: opt.transform.opt_resolve 0.04% : 0.000149s : 2: opt.transform.opt_trans_graph 0.03% : 0.000116s : 4: opt.transform.symbol_engine_opt 15.28% : 0.059364s : 1: opt_a 0.07% : 0.000255s : 1: opt_after_cconv 0.15% : 0.000579s : 1: opt_after_jit_grad 0.15% : 0.000576s : 1: opt_b 16.20% : 0.062970s : 1: optimize 0.01% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000051s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000014s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.02% : 0.000077s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000063s : 1: remove_dup_value 6.20% : 0.024095s : 3: renormalize.infer 2.04% : 0.007919s : 3: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000050s : 1: rewriter_after_opt_a 0.12% : 0.000466s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000019s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000165s : 1: symbol_engine_optimizer 3.39% : 0.013180s : 1: task_emit 0.05% : 0.000182s : 1: tuple_transform 49.16% : 0.191026s : 1: type_inference 0.03% : 0.000135s : 1: validate TotalTime = 0.201002, [24] [bootstrap]: 0.00061831 [type_inference]: 0.165818 [event_method]: 2.672e-05 [auto_monad]: 0.00043064 [graph_reusing]: 6.44001e-06 [inline]: 3.56999e-06 [add_attr]: 0.00579112, [1] [add_attr_with_inline]: 0.00578141, [1] [Cycle 1]: 7.081e-05, [2] [tag_attr]: 3.334e-05 [meta_addattr_fg_expand]: 7.21999e-06 [parallel-infer-symbol]: 3.3e-06 [pre_auto_parallel]: 4.915e-05 [insert-virtual-dataset]: 2.89001e-06 [parallel-infer-symbol-second]: 8.30012e-07 [dataset_repeat_opt]: 2.15002e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.0134618, [53] [py_interpret_to_execute]: 5.14e-06 [rewriter_before_opt_a]: 0.00010987 [opt_a]: 0.00989564, [2] [Cycle 1]: 0.00815475, [45] [expand_dump_flag]: 2.44001e-06 [switch_simplify]: 6.011e-05 [loop_unroll]: 4.215e-05 [a_1]: 0.00112057 [with_stream_mark]: 1.872e-05 [recompute_prepare]: 2.426e-05 [updatestate_depend_eliminate]: 5.124e-05 [updatestate_assign_eliminate]: 1.278e-05 [updatestate_loads_eliminate]: 1.178e-05 [parameter_eliminate]: 1.55999e-06 [a_2]: 0.0002846 [accelerated_algorithm]: 4.053e-05 [shard]: 1.85001e-06 [meta_shard_fg_expand]: 4.92999e-06 [shard_inline]: 1.861e-05 [merge_send_recv]: 1.48e-05 [auto_parallel]: 1.255e-05 [parallel]: 2.107e-05 [flash_sp]: 1.125e-05 [merge_comm]: 1.054e-05 [allreduce_fusion]: 9.76e-06 [matmul_add_comm_reduction]: 1.907e-05 [allreduce_slice_to_reducescatter]: 8.60018e-07 [virtual_shard_identity]: 1.98e-05 [virtual_dataset]: 1.797e-05 [get_grad_eliminate_]: 1.789e-05 [virtual_output]: 1.836e-05 [merge_forward]: 8.66002e-06 [cell_reuse_recompute_pass]: 1.04998e-06 [offload_activation]: 1.8e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.084e-05 [merge_recompute_call_nodes]: 1.83002e-06 [before_grad]: 2.858e-05 [set_forward_comm_id_for_comm_node_pass]: 1.034e-05 [meta_fg_expand]: 6.89001e-06 [flash_sp_send_recv_attached]: 4.37e-06 [receive_attached]: 2.69001e-06 [after_resolve]: 2.574e-05 [a_after_grad]: 2.93e-05 [renormalize]: 0.00527932 [add_forward_monad_depend]: 7.66999e-06 [auto_monad_grad]: 2.39999e-06 [auto_monad_eliminator]: 5.677e-05 [cse]: 0.00035955 [a_3]: 0.0001436 [Cycle 2]: 0.00172846, [45] [expand_dump_flag]: 2.31e-06 [switch_simplify]: 2.09e-05 [loop_unroll]: 1.831e-05 [a_1]: 0.00054887 [with_stream_mark]: 1.801e-05 [recompute_prepare]: 1.84e-05 [updatestate_depend_eliminate]: 1.046e-05 [updatestate_assign_eliminate]: 1.158e-05 [updatestate_loads_eliminate]: 1.152e-05 [parameter_eliminate]: 1.67001e-06 [a_2]: 0.00027384 [accelerated_algorithm]: 2.223e-05 [shard]: 1.74e-06 [meta_shard_fg_expand]: 4.23001e-06 [shard_inline]: 1.812e-05 [merge_send_recv]: 1.434e-05 [auto_parallel]: 1.35e-05 [parallel]: 7.70998e-06 [flash_sp]: 4.2e-06 [merge_comm]: 9.67001e-06 [allreduce_fusion]: 9.29998e-06 [matmul_add_comm_reduction]: 1.581e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 1.962e-05 [virtual_dataset]: 1.804e-05 [get_grad_eliminate_]: 1.792e-05 [virtual_output]: 1.782e-05 [merge_forward]: 9.10001e-06 [cell_reuse_recompute_pass]: 3.18e-06 [offload_activation]: 1.632e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.225e-05 [merge_recompute_call_nodes]: 1.24998e-06 [before_grad]: 2.823e-05 [set_forward_comm_id_for_comm_node_pass]: 9.91e-06 [meta_fg_expand]: 6.94001e-06 [flash_sp_send_recv_attached]: 1.92999e-06 [receive_attached]: 2.58998e-06 [after_resolve]: 2.604e-05 [a_after_grad]: 2.946e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.76998e-06 [auto_monad_grad]: 1.37e-06 [auto_monad_eliminator]: 3.911e-05 [cse]: 5.448e-05 [a_3]: 0.00012104 [py_interpret_to_execute_after_opt_a]: 6.33998e-06 [slice_cell_reuse_recomputed_activation]: 2.36998e-06 [rewriter_after_opt_a]: 4.911e-05 [convert_after_rewriter]: 1.16997e-06 [order_py_execute_after_rewriter]: 1.09e-06 [mutable_eliminate]: 0.00083069 [opt_b]: 0.00073113, [1] [Cycle 1]: 0.000724, [7] [b_1]: 0.00049804 [b_2]: 2.108e-05 [updatestate_depend_eliminate]: 1.272e-05 [updatestate_assign_eliminate]: 1.039e-05 [updatestate_loads_eliminate]: 1.163e-05 [renormalize]: 5.79981e-07 [cse]: 6.005e-05 [optimize_parallel_all_gather_comm]: 3.032e-05 [overlap_param_gather]: 1.97001e-06 [cconv]: 2.435e-05 [loop_unroll]: 0.00048433 [opt_after_cconv]: 0.00025228, [1] [Cycle 1]: 0.00024627, [7] [c_1]: 0.00011723 [parameter_eliminate]: 2.81e-06 [updatestate_depend_eliminate]: 1.26e-05 [updatestate_assign_eliminate]: 1.039e-05 [updatestate_loads_eliminate]: 1.122e-05 [cse]: 5.63e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 5.905e-05 [tuple_transform]: 0.00017526, [1] [Cycle 1]: 0.00016944, [4] [d_1]: 0.00013 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 1.918e-05 [partial_unused_args_eliminate]: 1.81e-06 [add_recomputation]: 0.00010228 [cse_after_recomputation]: 6.544e-05, [1] [Cycle 1]: 6.059e-05, [1] [cse]: 5.406e-05 [environ_conv]: 1.264e-05 [swap_dp_allreduce_reducescatter]: 1.448e-05 [bias_add_comm_swap]: 2.83003e-06 [label_micro_interleaved_index]: 4.79002e-06 [label_fine_grained_interleaved_index]: 2.84001e-06 [merge_cast_opt]: 1.35001e-06 [slice_recompute_activation]: 2.05002e-06 [micro_interleaved_order_control]: 2.23998e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 9.90025e-07 [remove_cast_before_assign_add]: 1.27e-06 [full_micro_interleaved_order_control]: 2.46e-06 [reorder_send_recv_between_fp_bp]: 2.61e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.07e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.61998e-06 [overlap_opt_shard_in_pipeline]: 1.40999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.27999e-06 [control_data_broadcast_order]: 3.705e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 9.63002e-06 [overlap_recompute_and_grad_model_parallel]: 1.072e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39998e-06 [overlap_recompute_comm]: 2.17999e-06 [overlap_grad_ring_attention]: 9.57999e-06 [overlap_grad_flash_sp]: 4.487e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 1.91e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 1.10999e-06 [symbol_engine_optimizer]: 0.00015768, [1] [Cycle 1]: 0.00014903, [6] [build]: 4.99e-06 [elim_shapecalc]: 2.437e-05 [elim_not_effective]: 3.587e-05 [opt_reshape]: 2.128e-05 [fold_const_symbol]: 3.307e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.07001e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 0.00012947 [get_jit_bprop_graph]: 1.88002e-06 [rewriter_after_jit_bprop_graph]: 4.60001e-06 [opt_after_jit_grad]: 0.00057026 [validate]: 8.647e-05 [backend_pass]: 8.50006e-07 [task_emit]: 0.0137282 [execute]: 6.89999e-06 Sums bootstrap : 0.000618s : 0.32% type_inference : 0.165818s : 85.48% event_method : 0.000027s : 0.01% auto_monad : 0.000431s : 0.22% graph_reusing : 0.000006s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000033s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000049s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000110s : 0.06% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000081s : 0.04% optimize.opt_a.loop_unroll : 0.000060s : 0.03% optimize.opt_a.a_1 : 0.001669s : 0.86% optimize.opt_a.with_stream_mark : 0.000037s : 0.02% optimize.opt_a.recompute_prepare : 0.000043s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000062s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000024s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000023s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000558s : 0.29% optimize.opt_a.accelerated_algorithm : 0.000063s : 0.03% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000009s : 0.00% optimize.opt_a.shard_inline : 0.000037s : 0.02% optimize.opt_a.merge_send_recv : 0.000029s : 0.02% optimize.opt_a.auto_parallel : 0.000026s : 0.01% optimize.opt_a.parallel : 0.000029s : 0.01% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000020s : 0.01% optimize.opt_a.allreduce_fusion : 0.000019s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000035s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000039s : 0.02% optimize.opt_a.virtual_dataset : 0.000036s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000036s : 0.02% optimize.opt_a.virtual_output : 0.000036s : 0.02% optimize.opt_a.merge_forward : 0.000018s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000034s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000063s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000057s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000020s : 0.01% optimize.opt_a.meta_fg_expand : 0.000014s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000052s : 0.03% optimize.opt_a.a_after_grad : 0.000059s : 0.03% optimize.opt_a.renormalize : 0.005279s : 2.72% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000096s : 0.05% optimize.opt_a.cse : 0.000414s : 0.21% optimize.opt_a.a_3 : 0.000265s : 0.14% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000049s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000831s : 0.43% optimize.opt_b.b_1 : 0.000498s : 0.26% optimize.opt_b.b_2 : 0.000021s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000012s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000060s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000030s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000024s : 0.01% optimize.loop_unroll : 0.000484s : 0.25% optimize.opt_after_cconv.c_1 : 0.000117s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000010s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000011s : 0.01% optimize.opt_after_cconv.cse : 0.000056s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000059s : 0.03% optimize.tuple_transform.d_1 : 0.000130s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000019s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000102s : 0.05% optimize.cse_after_recomputation.cse : 0.000054s : 0.03% optimize.environ_conv : 0.000013s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000014s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000002s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000037s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000010s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000011s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000010s : 0.00% optimize.overlap_grad_flash_sp : 0.000045s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000024s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000036s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000021s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000033s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000129s : 0.07% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000570s : 0.29% validate : 0.000086s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.013728s : 7.08% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.000428 168 17.09% : 0.000073s : 4: substitution.arithmetic_simplify 1.91% : 0.000008s : 2: substitution.depend_value_elim 1.26% : 0.000005s : 10: substitution.elim_not_effective 1.09% : 0.000005s : 10: substitution.fold_const_symbol 2.77% : 0.000012s : 16: substitution.graph_param_transform 47.48% : 0.000203s : 4: substitution.inline 2.04% : 0.000009s : 20: substitution.j_node_and_user_rematch 4.37% : 0.000019s : 2: substitution.less_batch_normalization 1.60% : 0.000007s : 12: substitution.load_eliminater 3.38% : 0.000014s : 20: substitution.remove_not_recompute_node 1.82% : 0.000008s : 10: substitution.replace_old_param 7.32% : 0.000031s : 26: substitution.updatestate_pure_node_eliminater 7.87% : 0.000034s : 32: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.165672 2 93.90% : 0.155568s : 1: type_inference.infer 6.10% : 0.010104s : 1: type_inference.specialize ------[replace.] 0.000045 4 100.00% : 0.000045s : 4: replace.inline ------[match.] 0.000200 4 100.00% : 0.000200s : 4: match.inline ------[predicate.] 0.000554 4085 0.91% : 0.000005s : 40: predicate.accumulaten_eliminater 0.89% : 0.000005s : 18: predicate.ad_related_special_op_eliminate 0.72% : 0.000004s : 32: predicate.addn_check_dump 0.92% : 0.000005s : 40: predicate.addn_zero_filter 0.83% : 0.000005s : 40: predicate.adjust_all_reduce_mul_add 2.17% : 0.000012s : 72: predicate.arithmetic_simplify 0.95% : 0.000005s : 40: predicate.cast_eliminate 0.77% : 0.000004s : 32: predicate.check_bprop_eliminate 0.74% : 0.000004s : 32: predicate.compare_switch_simplify 0.23% : 0.000001s : 16: predicate.const_output_eliminate 0.84% : 0.000005s : 32: predicate.depend_value_elim 0.96% : 0.000005s : 40: predicate.dict_get_item_const_eliminator 1.04% : 0.000006s : 40: predicate.dict_get_item_eliminator 0.90% : 0.000005s : 40: predicate.dict_set_item_eliminator 1.06% : 0.000006s : 34: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 16: predicate.elim_not_effective 0.46% : 0.000003s : 16: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000007s : 56: predicate.environ_add_const_eliminate 1.20% : 0.000007s : 56: predicate.environ_get_add_eliminate 1.21% : 0.000007s : 56: predicate.environ_get_depend_swap 2.02% : 0.000011s : 88: predicate.environ_get_eliminate 1.22% : 0.000007s : 56: predicate.environ_get_set_eliminate 0.99% : 0.000006s : 44: predicate.exchange_switch_depend_value 1.40% : 0.000008s : 44: predicate.float_depend_g_call 0.74% : 0.000004s : 32: predicate.float_environ_get_switch 1.12% : 0.000006s : 48: predicate.float_tuple_getitem_switch 0.22% : 0.000001s : 16: predicate.fold_const_symbol 0.80% : 0.000004s : 32: predicate.get_grad_eliminate 0.31% : 0.000002s : 16: predicate.graph_param_transform 0.74% : 0.000004s : 32: predicate.incorporate_call 0.70% : 0.000004s : 32: predicate.incorporate_call_switch 5.22% : 0.000029s : 180: predicate.inline 0.91% : 0.000005s : 32: predicate.inline_without_move 0.42% : 0.000002s : 32: predicate.j_node_and_user_rematch 1.03% : 0.000006s : 35: predicate.less_batch_normalization 1.91% : 0.000011s : 72: predicate.list_to_tuple_eliminator_ 2.54% : 0.000014s : 112: predicate.load_eliminater 0.74% : 0.000004s : 16: predicate.loop_unroll_after_grad 1.49% : 0.000008s : 61: predicate.loop_unroll_before_grad 1.71% : 0.000009s : 72: predicate.make_slice_get_slice_eliminator 0.77% : 0.000004s : 32: predicate.merge_addn 0.82% : 0.000005s : 32: predicate.micro_step_allgather_replace 0.81% : 0.000004s : 32: predicate.mini_step_allgather_replace 0.86% : 0.000005s : 40: predicate.minmaximum_grad 0.82% : 0.000005s : 16: predicate.mutable_eliminate 0.49% : 0.000003s : 16: predicate.opt_reshape 0.45% : 0.000002s : 16: predicate.parallel_virtual_node 1.20% : 0.000007s : 44: predicate.partial_defer_inline 1.36% : 0.000008s : 56: predicate.partial_eliminate 0.91% : 0.000005s : 40: predicate.print_const_string_wrapper 0.84% : 0.000005s : 32: predicate.reduce_all_const_elim 1.12% : 0.000006s : 40: predicate.reduce_eliminate 2.45% : 0.000014s : 112: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000003s : 32: predicate.remove_not_recompute_node 1.27% : 0.000007s : 72: predicate.replace_applicator 0.47% : 0.000003s : 32: predicate.replace_old_param 0.26% : 0.000001s : 16: predicate.reset_defer_inline 1.00% : 0.000006s : 40: predicate.reshape_eliminate 0.87% : 0.000005s : 32: predicate.row_tensor_add_zeros_like 0.41% : 0.000002s : 16: predicate.row_tensor_eliminate 0.94% : 0.000005s : 32: predicate.same_eliminate 0.54% : 0.000003s : 36: predicate.set_cell_output_no_recompute 0.84% : 0.000005s : 32: predicate.shard_identity_eliminate 0.91% : 0.000005s : 34: predicate.special_op_eliminate 0.81% : 0.000005s : 32: predicate.specialize_transform 0.89% : 0.000005s : 32: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000005s : 32: predicate.stack_unstack_eliminate 0.42% : 0.000002s : 16: predicate.switch_call_monad_eliminater 1.06% : 0.000006s : 44: predicate.switch_defer_inline 1.83% : 0.000010s : 76: predicate.switch_layer_defer_inline 3.83% : 0.000021s : 153: predicate.switch_simplify 0.91% : 0.000005s : 40: predicate.tile_eliminate 0.90% : 0.000005s : 40: predicate.transpose_eliminate 1.71% : 0.000009s : 72: predicate.tuple_list_convert_item_index_to_positive 1.84% : 0.000010s : 72: predicate.tuple_list_get_item_const_eliminator 1.74% : 0.000010s : 72: predicate.tuple_list_get_item_depend_reorder 2.69% : 0.000015s : 104: predicate.tuple_list_get_item_eliminator 1.67% : 0.000009s : 72: predicate.tuple_list_get_set_item_eliminator 2.60% : 0.000014s : 104: predicate.tuple_list_set_item_eliminator 1.68% : 0.000009s : 72: predicate.tuple_to_list_eliminator_ 2.55% : 0.000014s : 112: predicate.updatestate_pure_node_eliminater 3.52% : 0.000020s : 144: predicate.updatestate_useless_node_eliminater 0.45% : 0.000002s : 16: predicate.value_based_eliminate 0.80% : 0.000004s : 32: predicate.virtual_dataset_eliminate 0.81% : 0.000004s : 32: predicate.virtual_output_eliminate 0.42% : 0.000002s : 18: predicate.virtual_view_grad_eliminate 0.51% : 0.000003s : 16: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003780 26 63.61% : 0.002404s : 20: func_graph_cloner_run.FuncGraphClonerGraph 36.39% : 0.001376s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.229506 196 0.00% : 0.000004s : 1: ForceFp32Comm 2.53% : 0.005796s : 1: add_attr 2.52% : 0.005785s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000106s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.19% : 0.000444s : 1: auto_monad 0.06% : 0.000136s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.28% : 0.000642s : 1: bootstrap 0.01% : 0.000028s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000040s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000068s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000016s : 1: environ_conv 0.01% : 0.000033s : 1: event_method 0.01% : 0.000012s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000010s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.22% : 0.000494s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.39% : 0.000906s : 1: mutable_eliminate 0.01% : 0.000013s : 1: offloading_packed_experts 0.01% : 0.000029s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000050s : 1: opt.transform.mutable_eliminate 1.33% : 0.003060s : 78: opt.transform.opt_a 0.05% : 0.000116s : 1: opt.transform.opt_after_cconv 0.03% : 0.000073s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.000488s : 28: opt.transform.opt_b 0.06% : 0.000147s : 2: opt.transform.opt_trans_graph 0.05% : 0.000111s : 4: opt.transform.symbol_engine_opt 4.31% : 0.009900s : 1: opt_a 0.11% : 0.000256s : 1: opt_after_cconv 0.25% : 0.000581s : 1: opt_after_jit_grad 0.32% : 0.000735s : 1: opt_b 5.87% : 0.013468s : 1: optimize 0.01% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000048s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000013s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.02% : 0.000054s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000064s : 1: remove_dup_value 1.53% : 0.003513s : 1: renormalize.infer 0.77% : 0.001757s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000052s : 1: rewriter_after_opt_a 0.05% : 0.000115s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000018s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000161s : 1: symbol_engine_optimizer 5.99% : 0.013742s : 1: task_emit 0.08% : 0.000178s : 1: tuple_transform 72.26% : 0.165843s : 1: type_inference 0.06% : 0.000136s : 1: validate TotalTime = 0.276661, [24] [bootstrap]: 0.00069155 [type_inference]: 0.1923 [event_method]: 0.0003632 [auto_monad]: 0.0003841 [graph_reusing]: 8.07e-06 [inline]: 3.3e-06 [add_attr]: 0.00414329, [1] [add_attr_with_inline]: 0.00413385, [1] [Cycle 1]: 0.00011449, [2] [tag_attr]: 7.165e-05 [meta_addattr_fg_expand]: 1.386e-05 [parallel-infer-symbol]: 4.05e-06 [pre_auto_parallel]: 7.624e-05 [insert-virtual-dataset]: 4.57998e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 3.02002e-06 [optimize]: 0.0644631, [53] [py_interpret_to_execute]: 4.85001e-06 [rewriter_before_opt_a]: 0.00049639 [opt_a]: 0.060619, [4] [Cycle 1]: 0.0461101, [45] [expand_dump_flag]: 5.93998e-06 [switch_simplify]: 0.0001793 [loop_unroll]: 9.26e-05 [a_1]: 0.00285318 [with_stream_mark]: 3.374e-05 [recompute_prepare]: 4.305e-05 [updatestate_depend_eliminate]: 0.00010754 [updatestate_assign_eliminate]: 2.122e-05 [updatestate_loads_eliminate]: 1.888e-05 [parameter_eliminate]: 2.34999e-06 [a_2]: 0.00052745 [accelerated_algorithm]: 5.634e-05 [shard]: 1.92999e-06 [meta_shard_fg_expand]: 8.43001e-06 [shard_inline]: 3.302e-05 [merge_send_recv]: 3.566e-05 [auto_parallel]: 2.402e-05 [parallel]: 2.379e-05 [flash_sp]: 1.597e-05 [merge_comm]: 2.055e-05 [allreduce_fusion]: 2.007e-05 [matmul_add_comm_reduction]: 3.783e-05 [allreduce_slice_to_reducescatter]: 8.30012e-07 [virtual_shard_identity]: 3.609e-05 [virtual_dataset]: 3.208e-05 [get_grad_eliminate_]: 3.285e-05 [virtual_output]: 3.116e-05 [merge_forward]: 2.067e-05 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 2.945e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.598e-05 [merge_recompute_call_nodes]: 1.56998e-06 [before_grad]: 5.26e-05 [set_forward_comm_id_for_comm_node_pass]: 2.308e-05 [meta_fg_expand]: 0.00957984 [flash_sp_send_recv_attached]: 8.30999e-06 [receive_attached]: 2.97002e-06 [after_resolve]: 0.00020255 [a_after_grad]: 0.00026103 [renormalize]: 0.028602 [add_forward_monad_depend]: 1.895e-05 [auto_monad_grad]: 1.836e-05 [auto_monad_eliminator]: 0.00017544 [cse]: 0.0004697 [a_3]: 0.00196309 [Cycle 2]: 0.0100229, [45] [expand_dump_flag]: 4.27e-06 [switch_simplify]: 0.00012693 [loop_unroll]: 0.00012088 [a_1]: 0.00380633 [with_stream_mark]: 4.954e-05 [recompute_prepare]: 4.583e-05 [updatestate_depend_eliminate]: 3.066e-05 [updatestate_assign_eliminate]: 2.327e-05 [updatestate_loads_eliminate]: 2.141e-05 [parameter_eliminate]: 5.67001e-06 [a_2]: 0.00085677 [accelerated_algorithm]: 3.109e-05 [shard]: 2.19001e-06 [meta_shard_fg_expand]: 8.10999e-06 [shard_inline]: 2.369e-05 [merge_send_recv]: 2.07e-05 [auto_parallel]: 1.92e-05 [parallel]: 9.56998e-06 [flash_sp]: 5.05999e-06 [merge_comm]: 1.283e-05 [allreduce_fusion]: 1.242e-05 [matmul_add_comm_reduction]: 2.389e-05 [allreduce_slice_to_reducescatter]: 8.49977e-07 [virtual_shard_identity]: 2.485e-05 [virtual_dataset]: 2.396e-05 [get_grad_eliminate_]: 2.873e-05 [virtual_output]: 2.378e-05 [merge_forward]: 1.635e-05 [cell_reuse_recompute_pass]: 2.59999e-06 [offload_activation]: 2.824e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.72e-05 [merge_recompute_call_nodes]: 1.46002e-06 [before_grad]: 4.178e-05 [set_forward_comm_id_for_comm_node_pass]: 1.758e-05 [meta_fg_expand]: 0.00023172 [flash_sp_send_recv_attached]: 2.66999e-06 [receive_attached]: 2.91999e-06 [after_resolve]: 3.876e-05 [a_after_grad]: 3.808e-05 [renormalize]: 0.00350237 [add_forward_monad_depend]: 9.09e-06 [auto_monad_grad]: 2.32999e-06 [auto_monad_eliminator]: 6.491e-05 [cse]: 0.00013664 [a_3]: 0.00017176 [Cycle 3]: 0.0028433, [45] [expand_dump_flag]: 2.76999e-06 [switch_simplify]: 2.464e-05 [loop_unroll]: 2.398e-05 [a_1]: 0.00065869 [with_stream_mark]: 2.086e-05 [recompute_prepare]: 7.986e-05 [updatestate_depend_eliminate]: 4.537e-05 [updatestate_assign_eliminate]: 1.367e-05 [updatestate_loads_eliminate]: 1.486e-05 [parameter_eliminate]: 1.94999e-06 [a_2]: 0.00027385 [accelerated_algorithm]: 2.32e-05 [shard]: 2.31e-06 [meta_shard_fg_expand]: 4.87e-06 [shard_inline]: 1.81e-05 [merge_send_recv]: 1.423e-05 [auto_parallel]: 1.484e-05 [parallel]: 7.63999e-06 [flash_sp]: 1.35001e-06 [merge_comm]: 1.078e-05 [allreduce_fusion]: 9.69e-06 [matmul_add_comm_reduction]: 1.534e-05 [allreduce_slice_to_reducescatter]: 5.10016e-07 [virtual_shard_identity]: 1.871e-05 [virtual_dataset]: 1.876e-05 [get_grad_eliminate_]: 1.782e-05 [virtual_output]: 1.731e-05 [merge_forward]: 9.16998e-06 [cell_reuse_recompute_pass]: 3.38e-06 [offload_activation]: 1.733e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.173e-05 [merge_recompute_call_nodes]: 1.37999e-06 [before_grad]: 2.894e-05 [set_forward_comm_id_for_comm_node_pass]: 1.022e-05 [meta_fg_expand]: 6.94999e-06 [flash_sp_send_recv_attached]: 2.06998e-06 [receive_attached]: 2.56e-06 [after_resolve]: 2.643e-05 [a_after_grad]: 2.928e-05 [renormalize]: 0.00082417 [add_forward_monad_depend]: 4.84998e-06 [auto_monad_grad]: 1.81e-06 [auto_monad_eliminator]: 4.106e-05 [cse]: 8.283e-05 [a_3]: 0.00012727 [Cycle 4]: 0.00161908, [45] [expand_dump_flag]: 1.47001e-06 [switch_simplify]: 1.911e-05 [loop_unroll]: 1.744e-05 [a_1]: 0.00049898 [with_stream_mark]: 1.533e-05 [recompute_prepare]: 1.722e-05 [updatestate_depend_eliminate]: 9.96e-06 [updatestate_assign_eliminate]: 1.04e-05 [updatestate_loads_eliminate]: 1.051e-05 [parameter_eliminate]: 1.05001e-06 [a_2]: 0.00026423 [accelerated_algorithm]: 2.102e-05 [shard]: 1.19e-06 [meta_shard_fg_expand]: 3.43999e-06 [shard_inline]: 1.733e-05 [merge_send_recv]: 1.116e-05 [auto_parallel]: 1.2e-05 [parallel]: 4.33001e-06 [flash_sp]: 1.15001e-06 [merge_comm]: 9.82999e-06 [allreduce_fusion]: 9.86e-06 [matmul_add_comm_reduction]: 1.549e-05 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 1.982e-05 [virtual_dataset]: 1.742e-05 [get_grad_eliminate_]: 1.702e-05 [virtual_output]: 1.652e-05 [merge_forward]: 8.43999e-06 [cell_reuse_recompute_pass]: 1.55999e-06 [offload_activation]: 1.478e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.987e-05 [merge_recompute_call_nodes]: 9.89996e-07 [before_grad]: 2.86e-05 [set_forward_comm_id_for_comm_node_pass]: 9.74e-06 [meta_fg_expand]: 6.41998e-06 [flash_sp_send_recv_attached]: 9.10019e-07 [receive_attached]: 1.20999e-06 [after_resolve]: 2.343e-05 [a_after_grad]: 2.827e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.56002e-06 [auto_monad_grad]: 1.55001e-06 [auto_monad_eliminator]: 3.122e-05 [cse]: 5.554e-05 [a_3]: 0.00011787 [py_interpret_to_execute_after_opt_a]: 4.40999e-06 [slice_cell_reuse_recomputed_activation]: 2.58998e-06 [rewriter_after_opt_a]: 4.855e-05 [convert_after_rewriter]: 1.20001e-06 [order_py_execute_after_rewriter]: 1.13001e-06 [mutable_eliminate]: 0.00083246 [opt_b]: 0.00058714, [1] [Cycle 1]: 0.00058013, [7] [b_1]: 0.00041894 [b_2]: 2.101e-05 [updatestate_depend_eliminate]: 1.233e-05 [updatestate_assign_eliminate]: 1.152e-05 [updatestate_loads_eliminate]: 1.114e-05 [renormalize]: 5.39992e-07 [cse]: 6.73e-05 [optimize_parallel_all_gather_comm]: 3.069e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 2.786e-05 [loop_unroll]: 0.00049218 [opt_after_cconv]: 0.00025824, [1] [Cycle 1]: 0.00025134, [7] [c_1]: 0.00011178 [parameter_eliminate]: 2.32001e-06 [updatestate_depend_eliminate]: 1.224e-05 [updatestate_assign_eliminate]: 1.074e-05 [updatestate_loads_eliminate]: 1.085e-05 [cse]: 6.448e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 6.451e-05 [tuple_transform]: 0.00018254, [1] [Cycle 1]: 0.00017721, [4] [d_1]: 0.00013096 [none_parameter_eliminate]: 2.11e-06 [renormalize]: 1.30007e-07 [switch_simplify]: 2.298e-05 [partial_unused_args_eliminate]: 2.22999e-06 [add_recomputation]: 0.00012389 [cse_after_recomputation]: 7.207e-05, [1] [Cycle 1]: 6.664e-05, [1] [cse]: 6.008e-05 [environ_conv]: 1.306e-05 [swap_dp_allreduce_reducescatter]: 1.696e-05 [bias_add_comm_swap]: 2.84001e-06 [label_micro_interleaved_index]: 5.05999e-06 [label_fine_grained_interleaved_index]: 2.74001e-06 [merge_cast_opt]: 1.44998e-06 [slice_recompute_activation]: 2.08002e-06 [micro_interleaved_order_control]: 2.49999e-06 [assign_add_opt]: 1.21997e-06 [ForceFp32Comm]: 9.70002e-07 [remove_cast_before_assign_add]: 1.27e-06 [full_micro_interleaved_order_control]: 2.64001e-06 [reorder_send_recv_between_fp_bp]: 2.83e-06 [comm_op_add_attrs]: 1.05999e-06 [add_comm_op_reuse_tag]: 1.07e-06 [interleave_split_concat_branches]: 1.22e-06 [interleave_parallel_branches]: 1.45001e-06 [overlap_opt_shard_in_pipeline]: 1.96e-06 [overlap_opt_shard_grad_in_pipeline]: 1.77999e-06 [control_data_broadcast_order]: 4.136e-05 [grouped_pairwise_exchange_alltoall]: 1.57999e-06 [offloading_packed_experts]: 1.095e-05 [overlap_recompute_and_grad_model_parallel]: 1.175e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.45999e-06 [overlap_recompute_comm]: 2.31998e-06 [overlap_grad_ring_attention]: 1.164e-05 [overlap_grad_flash_sp]: 5.239e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.31998e-06 [split_layernorm_comm]: 1.87999e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 0.0001612, [1] [Cycle 1]: 0.00015672, [6] [build]: 6.44999e-06 [elim_shapecalc]: 2.486e-05 [elim_not_effective]: 3.952e-05 [opt_reshape]: 2.21e-05 [fold_const_symbol]: 3.119e-05 [renormalize]: 1.60013e-07 [detach_backward]: 1.60001e-06 [pipeline_parallel_scheduler]: 1.82001e-06 [auto_monad_reorder]: 0.00013245 [get_jit_bprop_graph]: 1.64e-06 [rewriter_after_jit_bprop_graph]: 5.11002e-06 [opt_after_jit_grad]: 0.0005526 [validate]: 8.731e-05 [backend_pass]: 1.15001e-06 [task_emit]: 0.0131599 [execute]: 5.84999e-06 Sums bootstrap : 0.000692s : 0.26% type_inference : 0.192300s : 71.00% event_method : 0.000363s : 0.13% auto_monad : 0.000384s : 0.14% graph_reusing : 0.000008s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000072s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000076s : 0.03% insert-virtual-dataset : 0.000005s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000003s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000496s : 0.18% optimize.opt_a.expand_dump_flag : 0.000014s : 0.01% optimize.opt_a.switch_simplify : 0.000350s : 0.13% optimize.opt_a.loop_unroll : 0.000255s : 0.09% optimize.opt_a.a_1 : 0.007817s : 2.89% optimize.opt_a.with_stream_mark : 0.000119s : 0.04% optimize.opt_a.recompute_prepare : 0.000186s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000194s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000069s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000066s : 0.02% optimize.opt_a.parameter_eliminate : 0.000011s : 0.00% optimize.opt_a.a_2 : 0.001922s : 0.71% optimize.opt_a.accelerated_algorithm : 0.000132s : 0.05% optimize.opt_a.shard : 0.000008s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000025s : 0.01% optimize.opt_a.shard_inline : 0.000092s : 0.03% optimize.opt_a.merge_send_recv : 0.000082s : 0.03% optimize.opt_a.auto_parallel : 0.000070s : 0.03% optimize.opt_a.parallel : 0.000045s : 0.02% optimize.opt_a.flash_sp : 0.000024s : 0.01% optimize.opt_a.merge_comm : 0.000054s : 0.02% optimize.opt_a.allreduce_fusion : 0.000052s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000093s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000099s : 0.04% optimize.opt_a.virtual_dataset : 0.000092s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000096s : 0.04% optimize.opt_a.virtual_output : 0.000089s : 0.03% optimize.opt_a.merge_forward : 0.000055s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000009s : 0.00% optimize.opt_a.offload_activation : 0.000090s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000165s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000152s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000061s : 0.02% optimize.opt_a.meta_fg_expand : 0.009825s : 3.63% optimize.opt_a.flash_sp_send_recv_attached : 0.000014s : 0.01% optimize.opt_a.receive_attached : 0.000010s : 0.00% optimize.opt_a.after_resolve : 0.000291s : 0.11% optimize.opt_a.a_after_grad : 0.000357s : 0.13% optimize.opt_a.renormalize : 0.032929s : 12.16% optimize.opt_a.add_forward_monad_depend : 0.000034s : 0.01% optimize.opt_a.auto_monad_grad : 0.000024s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000313s : 0.12% optimize.opt_a.cse : 0.000745s : 0.27% optimize.opt_a.a_3 : 0.002380s : 0.88% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000049s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000832s : 0.31% optimize.opt_b.b_1 : 0.000419s : 0.15% optimize.opt_b.b_2 : 0.000021s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000012s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000067s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000028s : 0.01% optimize.loop_unroll : 0.000492s : 0.18% optimize.opt_after_cconv.c_1 : 0.000112s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000011s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_after_cconv.cse : 0.000064s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000065s : 0.02% optimize.tuple_transform.d_1 : 0.000131s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000023s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000124s : 0.05% optimize.cse_after_recomputation.cse : 0.000060s : 0.02% optimize.environ_conv : 0.000013s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000017s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000041s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000011s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000012s : 0.00% optimize.overlap_grad_flash_sp : 0.000052s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000006s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000025s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000040s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000022s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000031s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000132s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000553s : 0.20% validate : 0.000087s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.013160s : 4.86% execute : 0.000006s : 0.00% Time group info: ------[substitution.] 0.003537 564 4.57% : 0.000162s : 9: substitution.arithmetic_simplify 0.62% : 0.000022s : 8: substitution.depend_value_elim 0.17% : 0.000006s : 10: substitution.elim_not_effective 0.29% : 0.000010s : 13: substitution.float_depend_g_call 0.11% : 0.000004s : 2: substitution.float_tuple_getitem_switch 0.13% : 0.000005s : 10: substitution.fold_const_symbol 42.39% : 0.001499s : 8: substitution.getattr_setattr_resolve 0.43% : 0.000015s : 15: substitution.graph_param_transform 0.08% : 0.000003s : 2: substitution.incorporate_call 0.08% : 0.000003s : 2: substitution.incorporate_call_switch 33.19% : 0.001174s : 26: substitution.inline 1.00% : 0.000035s : 5: substitution.inline_without_move 0.78% : 0.000028s : 51: substitution.j_node_and_user_rematch 0.70% : 0.000025s : 4: substitution.less_batch_normalization 0.36% : 0.000013s : 20: substitution.load_eliminater 0.47% : 0.000017s : 11: substitution.minmaximum_grad 0.34% : 0.000012s : 13: substitution.partial_eliminate 1.10% : 0.000039s : 51: substitution.remove_not_recompute_node 2.34% : 0.000083s : 35: substitution.replace_applicator 0.94% : 0.000033s : 53: substitution.replace_old_param 0.20% : 0.000007s : 2: substitution.set_cell_output_no_recompute 0.33% : 0.000012s : 3: substitution.switch_simplify 0.99% : 0.000035s : 11: substitution.tuple_list_convert_item_index_to_positive 0.45% : 0.000016s : 11: substitution.tuple_list_get_item_const_eliminator 0.61% : 0.000022s : 11: substitution.tuple_list_get_item_depend_reorder 1.87% : 0.000066s : 24: substitution.tuple_list_get_item_eliminator 0.62% : 0.000022s : 11: substitution.tuple_list_get_set_item_eliminator 1.83% : 0.000065s : 62: substitution.updatestate_pure_node_eliminater 3.01% : 0.000106s : 81: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.192132 2 96.19% : 0.184803s : 1: type_inference.infer 3.81% : 0.007329s : 1: type_inference.specialize ------[replace.] 0.000855 49 10.10% : 0.000086s : 6: replace.getattr_setattr_resolve 44.97% : 0.000385s : 26: replace.inline 5.13% : 0.000044s : 2: replace.replace_applicator 5.37% : 0.000046s : 3: replace.switch_simplify 26.85% : 0.000230s : 11: replace.tuple_list_get_item_eliminator 7.58% : 0.000065s : 1: replace.updatestate_useless_node_eliminater ------[match.] 0.002625 49 53.80% : 0.001412s : 6: match.getattr_setattr_resolve 44.12% : 0.001158s : 26: match.inline 0.54% : 0.000014s : 2: match.replace_applicator 0.34% : 0.000009s : 3: match.switch_simplify 0.85% : 0.000022s : 11: match.tuple_list_get_item_eliminator 0.34% : 0.000009s : 1: match.updatestate_useless_node_eliminater ------[predicate.] 0.001959 14794 1.08% : 0.000021s : 161: predicate.accumulaten_eliminater 0.23% : 0.000005s : 17: predicate.ad_related_special_op_eliminate 0.69% : 0.000013s : 107: predicate.addn_check_dump 1.04% : 0.000020s : 161: predicate.addn_zero_filter 0.95% : 0.000019s : 161: predicate.adjust_all_reduce_mul_add 2.32% : 0.000045s : 263: predicate.arithmetic_simplify 1.10% : 0.000022s : 161: predicate.cast_eliminate 2.07% : 0.000041s : 317: predicate.check_bprop_eliminate 0.68% : 0.000013s : 107: predicate.compare_switch_simplify 0.06% : 0.000001s : 15: predicate.const_output_eliminate 0.70% : 0.000014s : 102: predicate.depend_value_elim 1.09% : 0.000021s : 161: predicate.dict_get_item_const_eliminator 1.15% : 0.000022s : 161: predicate.dict_get_item_eliminator 1.02% : 0.000020s : 161: predicate.dict_set_item_eliminator 0.28% : 0.000005s : 32: predicate.dumpgradient_eliminate 0.07% : 0.000001s : 15: predicate.elim_not_effective 0.13% : 0.000003s : 15: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000021s : 176: predicate.environ_add_const_eliminate 1.09% : 0.000021s : 176: predicate.environ_get_add_eliminate 1.11% : 0.000022s : 176: predicate.environ_get_depend_swap 1.77% : 0.000035s : 278: predicate.environ_get_eliminate 1.05% : 0.000021s : 176: predicate.environ_get_set_eliminate 1.26% : 0.000025s : 198: predicate.exchange_switch_depend_value 1.70% : 0.000033s : 198: predicate.float_depend_g_call 0.71% : 0.000014s : 107: predicate.float_environ_get_switch 0.79% : 0.000016s : 122: predicate.float_tuple_getitem_switch 0.06% : 0.000001s : 15: predicate.fold_const_symbol 0.57% : 0.000011s : 82: predicate.get_grad_eliminate 0.56% : 0.000011s : 40: predicate.getattr_setattr_resolve 0.06% : 0.000001s : 15: predicate.graph_param_transform 0.67% : 0.000013s : 102: predicate.incorporate_call 0.61% : 0.000012s : 102: predicate.incorporate_call_switch 4.74% : 0.000093s : 578: predicate.inline 1.59% : 0.000031s : 186: predicate.inline_without_move 0.30% : 0.000006s : 82: predicate.j_node_and_user_rematch 0.70% : 0.000014s : 85: predicate.less_batch_normalization 1.32% : 0.000026s : 202: predicate.list_to_tuple_eliminator_ 2.31% : 0.000045s : 363: predicate.load_eliminater 0.18% : 0.000003s : 15: predicate.loop_unroll_after_grad 1.85% : 0.000036s : 273: predicate.loop_unroll_before_grad 1.23% : 0.000024s : 191: predicate.make_slice_get_slice_eliminator 0.75% : 0.000015s : 107: predicate.merge_addn 2.01% : 0.000039s : 309: predicate.micro_step_allgather_replace 1.99% : 0.000039s : 309: predicate.mini_step_allgather_replace 1.00% : 0.000020s : 161: predicate.minmaximum_grad 0.21% : 0.000004s : 15: predicate.mutable_eliminate 0.13% : 0.000003s : 15: predicate.opt_reshape 0.13% : 0.000003s : 15: predicate.parallel_virtual_node 1.72% : 0.000034s : 198: predicate.partial_defer_inline 1.37% : 0.000027s : 187: predicate.partial_eliminate 1.02% : 0.000020s : 161: predicate.print_const_string_wrapper 0.68% : 0.000013s : 102: predicate.reduce_all_const_elim 1.24% : 0.000024s : 161: predicate.reduce_eliminate 2.28% : 0.000045s : 363: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000007s : 82: predicate.remove_not_recompute_node 2.16% : 0.000042s : 485: predicate.replace_applicator 0.75% : 0.000015s : 186: predicate.replace_old_param 0.08% : 0.000001s : 15: predicate.reset_defer_inline 1.02% : 0.000020s : 161: predicate.reshape_eliminate 2.04% : 0.000040s : 309: predicate.row_tensor_add_zeros_like 0.13% : 0.000002s : 15: predicate.row_tensor_eliminate 2.16% : 0.000042s : 317: predicate.same_eliminate 0.43% : 0.000008s : 97: predicate.set_cell_output_no_recompute 0.61% : 0.000012s : 82: predicate.shard_identity_eliminate 0.25% : 0.000005s : 32: predicate.special_op_eliminate 0.78% : 0.000015s : 107: predicate.specialize_transform 2.06% : 0.000040s : 309: predicate.split_environ_get_set_with_tuple_value 1.44% : 0.000028s : 186: predicate.stack_unstack_eliminate 0.11% : 0.000002s : 15: predicate.switch_call_monad_eliminater 1.39% : 0.000027s : 198: predicate.switch_defer_inline 3.45% : 0.000068s : 515: predicate.switch_layer_defer_inline 4.20% : 0.000082s : 599: predicate.switch_simplify 1.04% : 0.000020s : 161: predicate.tile_eliminate 1.02% : 0.000020s : 161: predicate.transpose_eliminate 1.34% : 0.000026s : 191: predicate.tuple_list_convert_item_index_to_positive 1.40% : 0.000027s : 191: predicate.tuple_list_get_item_const_eliminator 1.27% : 0.000025s : 191: predicate.tuple_list_get_item_depend_reorder 2.31% : 0.000045s : 304: predicate.tuple_list_get_item_eliminator 1.33% : 0.000026s : 191: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000042s : 293: predicate.tuple_list_set_item_eliminator 1.28% : 0.000025s : 202: predicate.tuple_to_list_eliminator_ 2.33% : 0.000046s : 363: predicate.updatestate_pure_node_eliminater 3.10% : 0.000061s : 467: predicate.updatestate_useless_node_eliminater 0.14% : 0.000003s : 15: predicate.value_based_eliminate 0.61% : 0.000012s : 82: predicate.virtual_dataset_eliminate 0.58% : 0.000011s : 82: predicate.virtual_output_eliminate 0.12% : 0.000002s : 17: predicate.virtual_view_grad_eliminate 0.12% : 0.000002s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.009169 98 68.02% : 0.006236s : 58: func_graph_cloner_run.FuncGraphClonerGraph 2.36% : 0.000216s : 4: func_graph_cloner_run.FuncGraphClonerNode 29.62% : 0.002716s : 36: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.394883 307 0.00% : 0.000004s : 1: ForceFp32Comm 1.05% : 0.004148s : 1: add_attr 1.05% : 0.004138s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000129s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.10% : 0.000397s : 1: auto_monad 0.04% : 0.000139s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.18% : 0.000716s : 1: bootstrap 0.01% : 0.000031s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000045s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000076s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000017s : 1: environ_conv 0.10% : 0.000376s : 1: event_method 0.00% : 0.000012s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.13% : 0.000501s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.21% : 0.000840s : 1: mutable_eliminate 0.00% : 0.000014s : 1: offloading_packed_experts 0.01% : 0.000028s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000028s : 1: opt.transform.mutable_eliminate 3.61% : 0.014237s : 181: opt.transform.opt_a 0.03% : 0.000110s : 1: opt.transform.opt_after_cconv 0.02% : 0.000070s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000408s : 28: opt.transform.opt_b 0.43% : 0.001680s : 4: opt.transform.opt_resolve 0.04% : 0.000150s : 2: opt.transform.opt_trans_graph 0.03% : 0.000113s : 4: opt.transform.symbol_engine_opt 15.35% : 0.060623s : 1: opt_a 0.07% : 0.000262s : 1: opt_after_cconv 0.14% : 0.000561s : 1: opt_after_jit_grad 0.15% : 0.000590s : 1: opt_b 16.33% : 0.064470s : 1: optimize 0.01% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000056s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000015s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000007s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000081s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000070s : 1: remove_dup_value 6.25% : 0.024692s : 3: renormalize.infer 2.08% : 0.008204s : 3: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000052s : 1: rewriter_after_opt_a 0.13% : 0.000505s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000020s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000164s : 1: symbol_engine_optimizer 3.34% : 0.013177s : 1: task_emit 0.05% : 0.000186s : 1: tuple_transform 48.70% : 0.192317s : 1: type_inference 0.04% : 0.000142s : 1: validate TotalTime = 0.193075, [24] [bootstrap]: 0.00074822 [type_inference]: 0.15982 [event_method]: 1.93e-05 [auto_monad]: 0.00028022 [graph_reusing]: 3.99002e-06 [inline]: 2.32999e-06 [add_attr]: 0.0046133, [1] [add_attr_with_inline]: 0.00460106, [1] [Cycle 1]: 7.216e-05, [2] [tag_attr]: 3.58e-05 [meta_addattr_fg_expand]: 6.66999e-06 [parallel-infer-symbol]: 3.15998e-06 [pre_auto_parallel]: 4.049e-05 [insert-virtual-dataset]: 2.51998e-06 [parallel-infer-symbol-second]: 9.09989e-07 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.0132626, [53] [py_interpret_to_execute]: 5.56e-06 [rewriter_before_opt_a]: 0.00012173 [opt_a]: 0.0100615, [2] [Cycle 1]: 0.00831608, [45] [expand_dump_flag]: 1.87001e-06 [switch_simplify]: 5.758e-05 [loop_unroll]: 4.255e-05 [a_1]: 0.00115686 [with_stream_mark]: 2.158e-05 [recompute_prepare]: 2.446e-05 [updatestate_depend_eliminate]: 5.377e-05 [updatestate_assign_eliminate]: 1.287e-05 [updatestate_loads_eliminate]: 1.231e-05 [parameter_eliminate]: 1.23002e-06 [a_2]: 0.00028803 [accelerated_algorithm]: 4.098e-05 [shard]: 1.84e-06 [meta_shard_fg_expand]: 4.28001e-06 [shard_inline]: 1.853e-05 [merge_send_recv]: 1.658e-05 [auto_parallel]: 1.334e-05 [parallel]: 2.752e-05 [flash_sp]: 1.202e-05 [merge_comm]: 1.081e-05 [allreduce_fusion]: 9.94999e-06 [matmul_add_comm_reduction]: 2.983e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 2.246e-05 [virtual_dataset]: 1.895e-05 [get_grad_eliminate_]: 1.835e-05 [virtual_output]: 1.865e-05 [merge_forward]: 9.36e-06 [cell_reuse_recompute_pass]: 1.39998e-06 [offload_activation]: 1.848e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.223e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 2.842e-05 [set_forward_comm_id_for_comm_node_pass]: 1.051e-05 [meta_fg_expand]: 7.75e-06 [flash_sp_send_recv_attached]: 5.24e-06 [receive_attached]: 2.49001e-06 [after_resolve]: 2.59e-05 [a_after_grad]: 2.959e-05 [renormalize]: 0.0055913 [add_forward_monad_depend]: 7.01001e-06 [auto_monad_grad]: 1.81003e-06 [auto_monad_eliminator]: 5.39e-05 [cse]: 0.00014397 [a_3]: 0.00013953 [Cycle 2]: 0.00173428, [45] [expand_dump_flag]: 2.46998e-06 [switch_simplify]: 1.981e-05 [loop_unroll]: 1.869e-05 [a_1]: 0.00055107 [with_stream_mark]: 1.793e-05 [recompute_prepare]: 1.834e-05 [updatestate_depend_eliminate]: 1.022e-05 [updatestate_assign_eliminate]: 1.276e-05 [updatestate_loads_eliminate]: 1.224e-05 [parameter_eliminate]: 1.49e-06 [a_2]: 0.00027189 [accelerated_algorithm]: 2.294e-05 [shard]: 2.16e-06 [meta_shard_fg_expand]: 4.22998e-06 [shard_inline]: 1.767e-05 [merge_send_recv]: 1.57e-05 [auto_parallel]: 1.453e-05 [parallel]: 9.37999e-06 [flash_sp]: 4.70999e-06 [merge_comm]: 9.75002e-06 [allreduce_fusion]: 9.49999e-06 [matmul_add_comm_reduction]: 1.706e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 1.93e-05 [virtual_dataset]: 1.769e-05 [get_grad_eliminate_]: 1.841e-05 [virtual_output]: 1.76e-05 [merge_forward]: 8.82e-06 [cell_reuse_recompute_pass]: 2.94999e-06 [offload_activation]: 1.811e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.156e-05 [merge_recompute_call_nodes]: 1.32e-06 [before_grad]: 2.791e-05 [set_forward_comm_id_for_comm_node_pass]: 9.99999e-06 [meta_fg_expand]: 7.28999e-06 [flash_sp_send_recv_attached]: 2.03997e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 2.593e-05 [a_after_grad]: 2.894e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.74e-06 [auto_monad_grad]: 1.10999e-06 [auto_monad_eliminator]: 3.557e-05 [cse]: 4.927e-05 [a_3]: 0.00013159 [py_interpret_to_execute_after_opt_a]: 6.20002e-06 [slice_cell_reuse_recomputed_activation]: 2.37999e-06 [rewriter_after_opt_a]: 4.698e-05 [convert_after_rewriter]: 1.19e-06 [order_py_execute_after_rewriter]: 1.25001e-06 [mutable_eliminate]: 0.00068825 [opt_b]: 0.00057589, [1] [Cycle 1]: 0.00056927, [7] [b_1]: 0.00042123 [b_2]: 2.125e-05 [updatestate_depend_eliminate]: 1.133e-05 [updatestate_assign_eliminate]: 1.063e-05 [updatestate_loads_eliminate]: 1.152e-05 [renormalize]: 5.09986e-07 [cse]: 5.722e-05 [optimize_parallel_all_gather_comm]: 2.963e-05 [overlap_param_gather]: 2.11e-06 [cconv]: 2.499e-05 [loop_unroll]: 0.00047992 [opt_after_cconv]: 0.00024448, [1] [Cycle 1]: 0.00023857, [7] [c_1]: 0.00011517 [parameter_eliminate]: 2.27999e-06 [updatestate_depend_eliminate]: 1.153e-05 [updatestate_assign_eliminate]: 1.033e-05 [updatestate_loads_eliminate]: 1.089e-05 [cse]: 5.238e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 5.624e-05 [tuple_transform]: 0.00017341, [1] [Cycle 1]: 0.00016802, [4] [d_1]: 0.00012854 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 3.10014e-07 [switch_simplify]: 1.875e-05 [partial_unused_args_eliminate]: 1.82001e-06 [add_recomputation]: 0.00010073 [cse_after_recomputation]: 5.593e-05, [1] [Cycle 1]: 5.145e-05, [1] [cse]: 4.542e-05 [environ_conv]: 1.268e-05 [swap_dp_allreduce_reducescatter]: 1.437e-05 [bias_add_comm_swap]: 2.96001e-06 [label_micro_interleaved_index]: 4.68001e-06 [label_fine_grained_interleaved_index]: 2.54999e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.27999e-06 [micro_interleaved_order_control]: 2.51998e-06 [assign_add_opt]: 1.21002e-06 [ForceFp32Comm]: 1.05999e-06 [remove_cast_before_assign_add]: 1.32e-06 [full_micro_interleaved_order_control]: 2.97002e-06 [reorder_send_recv_between_fp_bp]: 3.07002e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.14003e-06 [interleave_parallel_branches]: 1.25999e-06 [overlap_opt_shard_in_pipeline]: 1.72001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.00002e-06 [control_data_broadcast_order]: 3.848e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 1.022e-05 [overlap_recompute_and_grad_model_parallel]: 1.139e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.47001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.44998e-06 [overlap_recompute_comm]: 2.34001e-06 [overlap_grad_ring_attention]: 1.143e-05 [overlap_grad_flash_sp]: 4.737e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.38998e-06 [split_layernorm_comm]: 1.89e-06 [handle_group_info]: 1.28002e-06 [symbol_engine_optimizer]: 0.0001593, [1] [Cycle 1]: 0.00015378, [6] [build]: 5.07e-06 [elim_shapecalc]: 2.434e-05 [elim_not_effective]: 3.8e-05 [opt_reshape]: 2.174e-05 [fold_const_symbol]: 3.525e-05 [renormalize]: 3.69997e-07 [detach_backward]: 2.27001e-06 [pipeline_parallel_scheduler]: 1.59e-06 [auto_monad_reorder]: 0.00013585 [get_jit_bprop_graph]: 1.79e-06 [rewriter_after_jit_bprop_graph]: 4.38001e-06 [opt_after_jit_grad]: 0.00056721 [validate]: 0.00013639 [backend_pass]: 1.12999e-06 [task_emit]: 0.0131682 [execute]: 5.82999e-06 Sums bootstrap : 0.000748s : 0.40% type_inference : 0.159820s : 85.29% event_method : 0.000019s : 0.01% auto_monad : 0.000280s : 0.15% graph_reusing : 0.000004s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000036s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000040s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000122s : 0.06% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000077s : 0.04% optimize.opt_a.loop_unroll : 0.000061s : 0.03% optimize.opt_a.a_1 : 0.001708s : 0.91% optimize.opt_a.with_stream_mark : 0.000040s : 0.02% optimize.opt_a.recompute_prepare : 0.000043s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000064s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000026s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000025s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000560s : 0.30% optimize.opt_a.accelerated_algorithm : 0.000064s : 0.03% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000009s : 0.00% optimize.opt_a.shard_inline : 0.000036s : 0.02% optimize.opt_a.merge_send_recv : 0.000032s : 0.02% optimize.opt_a.auto_parallel : 0.000028s : 0.01% optimize.opt_a.parallel : 0.000037s : 0.02% optimize.opt_a.flash_sp : 0.000017s : 0.01% optimize.opt_a.merge_comm : 0.000021s : 0.01% optimize.opt_a.allreduce_fusion : 0.000019s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000047s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000042s : 0.02% optimize.opt_a.virtual_dataset : 0.000037s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000037s : 0.02% optimize.opt_a.virtual_output : 0.000036s : 0.02% optimize.opt_a.merge_forward : 0.000018s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000037s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000064s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000056s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000021s : 0.01% optimize.opt_a.meta_fg_expand : 0.000015s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000052s : 0.03% optimize.opt_a.a_after_grad : 0.000059s : 0.03% optimize.opt_a.renormalize : 0.005591s : 2.98% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000089s : 0.05% optimize.opt_a.cse : 0.000193s : 0.10% optimize.opt_a.a_3 : 0.000271s : 0.14% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000047s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000688s : 0.37% optimize.opt_b.b_1 : 0.000421s : 0.22% optimize.opt_b.b_2 : 0.000021s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000012s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000057s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000030s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000025s : 0.01% optimize.loop_unroll : 0.000480s : 0.26% optimize.opt_after_cconv.c_1 : 0.000115s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000010s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000011s : 0.01% optimize.opt_after_cconv.cse : 0.000052s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000056s : 0.03% optimize.tuple_transform.d_1 : 0.000129s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000019s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000101s : 0.05% optimize.cse_after_recomputation.cse : 0.000045s : 0.02% optimize.environ_conv : 0.000013s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000014s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000038s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000010s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000011s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000011s : 0.01% optimize.overlap_grad_flash_sp : 0.000047s : 0.03% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000024s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000038s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000022s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000035s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000136s : 0.07% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000567s : 0.30% validate : 0.000136s : 0.07% backend_pass : 0.000001s : 0.00% task_emit : 0.013168s : 7.03% execute : 0.000006s : 0.00% Time group info: ------[substitution.] 0.000455 168 16.60% : 0.000075s : 4: substitution.arithmetic_simplify 2.08% : 0.000009s : 2: substitution.depend_value_elim 1.42% : 0.000006s : 10: substitution.elim_not_effective 1.02% : 0.000005s : 10: substitution.fold_const_symbol 2.70% : 0.000012s : 16: substitution.graph_param_transform 48.89% : 0.000222s : 4: substitution.inline 1.91% : 0.000009s : 20: substitution.j_node_and_user_rematch 4.32% : 0.000020s : 2: substitution.less_batch_normalization 1.53% : 0.000007s : 12: substitution.load_eliminater 3.22% : 0.000015s : 20: substitution.remove_not_recompute_node 1.80% : 0.000008s : 10: substitution.replace_old_param 6.72% : 0.000031s : 26: substitution.updatestate_pure_node_eliminater 7.78% : 0.000035s : 32: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.159691 2 97.55% : 0.155785s : 1: type_inference.infer 2.45% : 0.003905s : 1: type_inference.specialize ------[replace.] 0.000046 4 100.00% : 0.000046s : 4: replace.inline ------[match.] 0.000219 4 100.00% : 0.000219s : 4: match.inline ------[predicate.] 0.000549 4085 0.96% : 0.000005s : 40: predicate.accumulaten_eliminater 0.84% : 0.000005s : 18: predicate.ad_related_special_op_eliminate 0.74% : 0.000004s : 32: predicate.addn_check_dump 0.91% : 0.000005s : 40: predicate.addn_zero_filter 0.85% : 0.000005s : 40: predicate.adjust_all_reduce_mul_add 2.25% : 0.000012s : 72: predicate.arithmetic_simplify 0.94% : 0.000005s : 40: predicate.cast_eliminate 0.76% : 0.000004s : 32: predicate.check_bprop_eliminate 0.72% : 0.000004s : 32: predicate.compare_switch_simplify 0.24% : 0.000001s : 16: predicate.const_output_eliminate 0.79% : 0.000004s : 32: predicate.depend_value_elim 0.95% : 0.000005s : 40: predicate.dict_get_item_const_eliminator 1.08% : 0.000006s : 40: predicate.dict_get_item_eliminator 0.92% : 0.000005s : 40: predicate.dict_set_item_eliminator 0.97% : 0.000005s : 34: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 16: predicate.elim_not_effective 0.44% : 0.000002s : 16: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000007s : 56: predicate.environ_add_const_eliminate 1.25% : 0.000007s : 56: predicate.environ_get_add_eliminate 1.19% : 0.000007s : 56: predicate.environ_get_depend_swap 2.07% : 0.000011s : 88: predicate.environ_get_eliminate 1.22% : 0.000007s : 56: predicate.environ_get_set_eliminate 1.02% : 0.000006s : 44: predicate.exchange_switch_depend_value 1.38% : 0.000008s : 44: predicate.float_depend_g_call 0.74% : 0.000004s : 32: predicate.float_environ_get_switch 1.11% : 0.000006s : 48: predicate.float_tuple_getitem_switch 0.23% : 0.000001s : 16: predicate.fold_const_symbol 0.90% : 0.000005s : 32: predicate.get_grad_eliminate 0.28% : 0.000002s : 16: predicate.graph_param_transform 0.74% : 0.000004s : 32: predicate.incorporate_call 0.70% : 0.000004s : 32: predicate.incorporate_call_switch 5.21% : 0.000029s : 180: predicate.inline 0.95% : 0.000005s : 32: predicate.inline_without_move 0.43% : 0.000002s : 32: predicate.j_node_and_user_rematch 1.06% : 0.000006s : 35: predicate.less_batch_normalization 1.79% : 0.000010s : 72: predicate.list_to_tuple_eliminator_ 2.55% : 0.000014s : 112: predicate.load_eliminater 0.70% : 0.000004s : 16: predicate.loop_unroll_after_grad 1.47% : 0.000008s : 61: predicate.loop_unroll_before_grad 1.73% : 0.000009s : 72: predicate.make_slice_get_slice_eliminator 0.77% : 0.000004s : 32: predicate.merge_addn 0.74% : 0.000004s : 32: predicate.micro_step_allgather_replace 0.75% : 0.000004s : 32: predicate.mini_step_allgather_replace 0.85% : 0.000005s : 40: predicate.minmaximum_grad 0.78% : 0.000004s : 16: predicate.mutable_eliminate 0.45% : 0.000002s : 16: predicate.opt_reshape 0.44% : 0.000002s : 16: predicate.parallel_virtual_node 1.20% : 0.000007s : 44: predicate.partial_defer_inline 1.37% : 0.000008s : 56: predicate.partial_eliminate 0.92% : 0.000005s : 40: predicate.print_const_string_wrapper 0.77% : 0.000004s : 32: predicate.reduce_all_const_elim 1.14% : 0.000006s : 40: predicate.reduce_eliminate 2.46% : 0.000013s : 112: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000003s : 32: predicate.remove_not_recompute_node 1.24% : 0.000007s : 72: predicate.replace_applicator 0.46% : 0.000003s : 32: predicate.replace_old_param 0.25% : 0.000001s : 16: predicate.reset_defer_inline 0.93% : 0.000005s : 40: predicate.reshape_eliminate 0.76% : 0.000004s : 32: predicate.row_tensor_add_zeros_like 0.44% : 0.000002s : 16: predicate.row_tensor_eliminate 0.91% : 0.000005s : 32: predicate.same_eliminate 0.56% : 0.000003s : 36: predicate.set_cell_output_no_recompute 0.98% : 0.000005s : 32: predicate.shard_identity_eliminate 0.95% : 0.000005s : 34: predicate.special_op_eliminate 0.83% : 0.000005s : 32: predicate.specialize_transform 0.86% : 0.000005s : 32: predicate.split_environ_get_set_with_tuple_value 0.94% : 0.000005s : 32: predicate.stack_unstack_eliminate 0.42% : 0.000002s : 16: predicate.switch_call_monad_eliminater 1.07% : 0.000006s : 44: predicate.switch_defer_inline 1.88% : 0.000010s : 76: predicate.switch_layer_defer_inline 3.75% : 0.000021s : 153: predicate.switch_simplify 0.92% : 0.000005s : 40: predicate.tile_eliminate 0.93% : 0.000005s : 40: predicate.transpose_eliminate 1.75% : 0.000010s : 72: predicate.tuple_list_convert_item_index_to_positive 1.80% : 0.000010s : 72: predicate.tuple_list_get_item_const_eliminator 1.69% : 0.000009s : 72: predicate.tuple_list_get_item_depend_reorder 2.74% : 0.000015s : 104: predicate.tuple_list_get_item_eliminator 1.70% : 0.000009s : 72: predicate.tuple_list_get_set_item_eliminator 2.67% : 0.000015s : 104: predicate.tuple_list_set_item_eliminator 1.72% : 0.000009s : 72: predicate.tuple_to_list_eliminator_ 2.59% : 0.000014s : 112: predicate.updatestate_pure_node_eliminater 3.51% : 0.000019s : 144: predicate.updatestate_useless_node_eliminater 0.42% : 0.000002s : 16: predicate.value_based_eliminate 0.82% : 0.000004s : 32: predicate.virtual_dataset_eliminate 0.86% : 0.000005s : 32: predicate.virtual_output_eliminate 0.44% : 0.000002s : 18: predicate.virtual_view_grad_eliminate 0.46% : 0.000003s : 16: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003834 26 62.45% : 0.002394s : 20: func_graph_cloner_run.FuncGraphClonerGraph 37.55% : 0.001440s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.220457 196 0.00% : 0.000004s : 1: ForceFp32Comm 2.10% : 0.004619s : 1: add_attr 2.09% : 0.004605s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000105s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.13% : 0.000290s : 1: auto_monad 0.06% : 0.000143s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.35% : 0.000775s : 1: bootstrap 0.01% : 0.000028s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000042s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000059s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000016s : 1: environ_conv 0.01% : 0.000024s : 1: event_method 0.01% : 0.000011s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000008s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.22% : 0.000489s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.32% : 0.000697s : 1: mutable_eliminate 0.01% : 0.000013s : 1: offloading_packed_experts 0.01% : 0.000028s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000029s : 1: opt.transform.mutable_eliminate 1.41% : 0.003108s : 78: opt.transform.opt_a 0.05% : 0.000114s : 1: opt.transform.opt_after_cconv 0.03% : 0.000073s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000413s : 28: opt.transform.opt_b 0.07% : 0.000145s : 2: opt.transform.opt_trans_graph 0.05% : 0.000115s : 4: opt.transform.symbol_engine_opt 4.57% : 0.010066s : 1: opt_a 0.11% : 0.000248s : 1: opt_after_cconv 0.26% : 0.000577s : 1: opt_after_jit_grad 0.26% : 0.000580s : 1: opt_b 6.02% : 0.013270s : 1: optimize 0.02% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000051s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000014s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000045s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000061s : 1: remove_dup_value 1.72% : 0.003794s : 1: renormalize.infer 0.81% : 0.001786s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000050s : 1: rewriter_after_opt_a 0.06% : 0.000126s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000017s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000162s : 1: symbol_engine_optimizer 5.98% : 0.013184s : 1: task_emit 0.08% : 0.000176s : 1: tuple_transform 72.50% : 0.159838s : 1: type_inference 0.09% : 0.000188s : 1: validate TotalTime = 0.265519, [24] [bootstrap]: 0.00071228 [type_inference]: 0.181428 [event_method]: 0.00040746 [auto_monad]: 0.00041034 [graph_reusing]: 8.70001e-06 [inline]: 4e-06 [add_attr]: 0.00380003, [1] [add_attr_with_inline]: 0.0037907, [1] [Cycle 1]: 0.00011716, [2] [tag_attr]: 7.088e-05 [meta_addattr_fg_expand]: 1.38e-05 [parallel-infer-symbol]: 3.56999e-06 [pre_auto_parallel]: 8.084e-05 [insert-virtual-dataset]: 4.20999e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.32001e-06 [pipeline_split]: 3.36001e-06 [optimize]: 0.0642351, [53] [py_interpret_to_execute]: 6.45997e-06 [rewriter_before_opt_a]: 0.00049994 [opt_a]: 0.0604234, [4] [Cycle 1]: 0.0464336, [45] [expand_dump_flag]: 4.80999e-06 [switch_simplify]: 0.00017926 [loop_unroll]: 9.38e-05 [a_1]: 0.00290944 [with_stream_mark]: 3.176e-05 [recompute_prepare]: 4.495e-05 [updatestate_depend_eliminate]: 0.00010988 [updatestate_assign_eliminate]: 2.309e-05 [updatestate_loads_eliminate]: 1.88e-05 [parameter_eliminate]: 2.54999e-06 [a_2]: 0.00053429 [accelerated_algorithm]: 5.898e-05 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 9.57001e-06 [shard_inline]: 3.253e-05 [merge_send_recv]: 3.422e-05 [auto_parallel]: 2.541e-05 [parallel]: 2.58e-05 [flash_sp]: 1.752e-05 [merge_comm]: 2.022e-05 [allreduce_fusion]: 2.063e-05 [matmul_add_comm_reduction]: 3.894e-05 [allreduce_slice_to_reducescatter]: 9.29984e-07 [virtual_shard_identity]: 3.647e-05 [virtual_dataset]: 3.255e-05 [get_grad_eliminate_]: 3.375e-05 [virtual_output]: 3.159e-05 [merge_forward]: 2.101e-05 [cell_reuse_recompute_pass]: 2.31998e-06 [offload_activation]: 3.039e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.767e-05 [merge_recompute_call_nodes]: 1.70001e-06 [before_grad]: 5.294e-05 [set_forward_comm_id_for_comm_node_pass]: 2.382e-05 [meta_fg_expand]: 0.00951402 [flash_sp_send_recv_attached]: 8.45999e-06 [receive_attached]: 2.43002e-06 [after_resolve]: 0.00020621 [a_after_grad]: 0.00025705 [renormalize]: 0.0289228 [add_forward_monad_depend]: 1.893e-05 [auto_monad_grad]: 1.787e-05 [auto_monad_eliminator]: 0.0001747 [cse]: 0.00046439 [a_3]: 0.00194079 [Cycle 2]: 0.00949784, [45] [expand_dump_flag]: 4.48999e-06 [switch_simplify]: 0.00012206 [loop_unroll]: 0.00011793 [a_1]: 0.0037447 [with_stream_mark]: 3.982e-05 [recompute_prepare]: 3.963e-05 [updatestate_depend_eliminate]: 2.828e-05 [updatestate_assign_eliminate]: 2.216e-05 [updatestate_loads_eliminate]: 2.15e-05 [parameter_eliminate]: 5.14003e-06 [a_2]: 0.00081731 [accelerated_algorithm]: 3.112e-05 [shard]: 2.60002e-06 [meta_shard_fg_expand]: 7.26001e-06 [shard_inline]: 2.34e-05 [merge_send_recv]: 2.054e-05 [auto_parallel]: 1.964e-05 [parallel]: 9.82999e-06 [flash_sp]: 4.64002e-06 [merge_comm]: 1.27e-05 [allreduce_fusion]: 1.28e-05 [matmul_add_comm_reduction]: 2.274e-05 [allreduce_slice_to_reducescatter]: 9.10019e-07 [virtual_shard_identity]: 2.425e-05 [virtual_dataset]: 2.325e-05 [get_grad_eliminate_]: 2.213e-05 [virtual_output]: 2.348e-05 [merge_forward]: 1.059e-05 [cell_reuse_recompute_pass]: 1.60999e-06 [offload_activation]: 2.234e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.067e-05 [merge_recompute_call_nodes]: 1.66002e-06 [before_grad]: 3.591e-05 [set_forward_comm_id_for_comm_node_pass]: 1.315e-05 [meta_fg_expand]: 0.00016595 [flash_sp_send_recv_attached]: 2.42001e-06 [receive_attached]: 3.49001e-06 [after_resolve]: 3.419e-05 [a_after_grad]: 3.847e-05 [renormalize]: 0.00326104 [add_forward_monad_depend]: 8.21002e-06 [auto_monad_grad]: 1.77999e-06 [auto_monad_eliminator]: 6.294e-05 [cse]: 0.00011931 [a_3]: 0.00016551 [Cycle 3]: 0.00285816, [45] [expand_dump_flag]: 2.44999e-06 [switch_simplify]: 2.391e-05 [loop_unroll]: 2.261e-05 [a_1]: 0.0006398 [with_stream_mark]: 2.06e-05 [recompute_prepare]: 2.133e-05 [updatestate_depend_eliminate]: 4.293e-05 [updatestate_assign_eliminate]: 1.228e-05 [updatestate_loads_eliminate]: 1.326e-05 [parameter_eliminate]: 1.12e-06 [a_2]: 0.00036404 [accelerated_algorithm]: 2.472e-05 [shard]: 2.05002e-06 [meta_shard_fg_expand]: 4.46002e-06 [shard_inline]: 1.867e-05 [merge_send_recv]: 1.634e-05 [auto_parallel]: 1.528e-05 [parallel]: 8e-06 [flash_sp]: 1.20999e-06 [merge_comm]: 1.044e-05 [allreduce_fusion]: 9.59e-06 [matmul_add_comm_reduction]: 1.668e-05 [allreduce_slice_to_reducescatter]: 7.40023e-07 [virtual_shard_identity]: 1.873e-05 [virtual_dataset]: 1.778e-05 [get_grad_eliminate_]: 1.746e-05 [virtual_output]: 1.719e-05 [merge_forward]: 9.15001e-06 [cell_reuse_recompute_pass]: 3.08e-06 [offload_activation]: 1.836e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.197e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 2.863e-05 [set_forward_comm_id_for_comm_node_pass]: 1.04e-05 [meta_fg_expand]: 7.1e-06 [flash_sp_send_recv_attached]: 1.84e-06 [receive_attached]: 2.57001e-06 [after_resolve]: 2.666e-05 [a_after_grad]: 2.806e-05 [renormalize]: 0.0008366 [add_forward_monad_depend]: 4.72e-06 [auto_monad_grad]: 1.35999e-06 [auto_monad_eliminator]: 4.217e-05 [cse]: 7.943e-05 [a_3]: 0.00012717 [Cycle 4]: 0.00161157, [45] [expand_dump_flag]: 1.64998e-06 [switch_simplify]: 1.954e-05 [loop_unroll]: 1.714e-05 [a_1]: 0.00050249 [with_stream_mark]: 1.516e-05 [recompute_prepare]: 1.729e-05 [updatestate_depend_eliminate]: 9.65002e-06 [updatestate_assign_eliminate]: 1.056e-05 [updatestate_loads_eliminate]: 1.041e-05 [parameter_eliminate]: 9.89996e-07 [a_2]: 0.00026199 [accelerated_algorithm]: 2.135e-05 [shard]: 1.34e-06 [meta_shard_fg_expand]: 4.22e-06 [shard_inline]: 1.657e-05 [merge_send_recv]: 1.267e-05 [auto_parallel]: 1.223e-05 [parallel]: 5.59e-06 [flash_sp]: 9.70002e-07 [merge_comm]: 9.32999e-06 [allreduce_fusion]: 9.89001e-06 [matmul_add_comm_reduction]: 1.485e-05 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 1.94e-05 [virtual_dataset]: 1.692e-05 [get_grad_eliminate_]: 1.642e-05 [virtual_output]: 1.676e-05 [merge_forward]: 8.43999e-06 [cell_reuse_recompute_pass]: 1.56998e-06 [offload_activation]: 1.472e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.089e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 2.74e-05 [set_forward_comm_id_for_comm_node_pass]: 9.70002e-06 [meta_fg_expand]: 6.21998e-06 [flash_sp_send_recv_attached]: 1.09998e-06 [receive_attached]: 1.35001e-06 [after_resolve]: 2.382e-05 [a_after_grad]: 2.721e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.82001e-06 [auto_monad_grad]: 1.35001e-06 [auto_monad_eliminator]: 3.261e-05 [cse]: 5.297e-05 [a_3]: 0.00011586 [py_interpret_to_execute_after_opt_a]: 5.22999e-06 [slice_cell_reuse_recomputed_activation]: 2.03002e-06 [rewriter_after_opt_a]: 4.706e-05 [convert_after_rewriter]: 1.25001e-06 [order_py_execute_after_rewriter]: 1.14e-06 [mutable_eliminate]: 0.0008337 [opt_b]: 0.00057858, [1] [Cycle 1]: 0.00057035, [7] [b_1]: 0.00041648 [b_2]: 2.057e-05 [updatestate_depend_eliminate]: 1.278e-05 [updatestate_assign_eliminate]: 1.106e-05 [updatestate_loads_eliminate]: 1.151e-05 [renormalize]: 8.00006e-07 [cse]: 6.002e-05 [optimize_parallel_all_gather_comm]: 2.98e-05 [overlap_param_gather]: 2.46e-06 [cconv]: 2.697e-05 [loop_unroll]: 0.00048384 [opt_after_cconv]: 0.00024594, [1] [Cycle 1]: 0.00023964, [7] [c_1]: 0.00011178 [parameter_eliminate]: 2.81e-06 [updatestate_depend_eliminate]: 1.237e-05 [updatestate_assign_eliminate]: 1.032e-05 [updatestate_loads_eliminate]: 1.105e-05 [cse]: 5.484e-05 [renormalize]: 3.60014e-07 [remove_dup_value]: 6.342e-05 [tuple_transform]: 0.00017282, [1] [Cycle 1]: 0.00016729, [4] [d_1]: 0.00012327 [none_parameter_eliminate]: 2.08002e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 2.253e-05 [partial_unused_args_eliminate]: 2.04e-06 [add_recomputation]: 0.00012108 [cse_after_recomputation]: 6.642e-05, [1] [Cycle 1]: 6.1e-05, [1] [cse]: 5.388e-05 [environ_conv]: 1.227e-05 [swap_dp_allreduce_reducescatter]: 1.784e-05 [bias_add_comm_swap]: 2.84001e-06 [label_micro_interleaved_index]: 4.49002e-06 [label_fine_grained_interleaved_index]: 2.71999e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.10002e-06 [micro_interleaved_order_control]: 2.96001e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 1.34e-06 [remove_cast_before_assign_add]: 1.73997e-06 [full_micro_interleaved_order_control]: 2.84001e-06 [reorder_send_recv_between_fp_bp]: 2.78998e-06 [comm_op_add_attrs]: 1.25001e-06 [add_comm_op_reuse_tag]: 1.09e-06 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.29998e-06 [overlap_opt_shard_in_pipeline]: 1.76e-06 [overlap_opt_shard_grad_in_pipeline]: 1.96e-06 [control_data_broadcast_order]: 4.42e-05 [grouped_pairwise_exchange_alltoall]: 1.86e-06 [offloading_packed_experts]: 1.12e-05 [overlap_recompute_and_grad_model_parallel]: 1.271e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.79998e-06 [overlap_recompute_comm]: 2.39001e-06 [overlap_grad_ring_attention]: 1.16e-05 [overlap_grad_flash_sp]: 5.095e-05 [begin_end_overlap_inline]: 5.79981e-07 [split_matmul_comm_elemetwise]: 2.47001e-06 [split_layernorm_comm]: 2.30002e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 0.00016334, [1] [Cycle 1]: 0.0001575, [6] [build]: 5.38002e-06 [elim_shapecalc]: 2.322e-05 [elim_not_effective]: 3.855e-05 [opt_reshape]: 2.264e-05 [fold_const_symbol]: 3.565e-05 [renormalize]: 1.80007e-07 [detach_backward]: 2.11e-06 [pipeline_parallel_scheduler]: 1.84998e-06 [auto_monad_reorder]: 0.00013529 [get_jit_bprop_graph]: 2.27001e-06 [rewriter_after_jit_bprop_graph]: 4.01001e-06 [opt_after_jit_grad]: 0.00060485 [validate]: 9.011e-05 [backend_pass]: 1.07998e-06 [task_emit]: 0.0132891 [execute]: 6.85002e-06 Sums bootstrap : 0.000712s : 0.27% type_inference : 0.181428s : 69.77% event_method : 0.000407s : 0.16% auto_monad : 0.000410s : 0.16% graph_reusing : 0.000009s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000071s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000081s : 0.03% insert-virtual-dataset : 0.000004s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000003s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000500s : 0.19% optimize.opt_a.expand_dump_flag : 0.000013s : 0.01% optimize.opt_a.switch_simplify : 0.000345s : 0.13% optimize.opt_a.loop_unroll : 0.000251s : 0.10% optimize.opt_a.a_1 : 0.007796s : 3.00% optimize.opt_a.with_stream_mark : 0.000107s : 0.04% optimize.opt_a.recompute_prepare : 0.000123s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000191s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000068s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000064s : 0.02% optimize.opt_a.parameter_eliminate : 0.000010s : 0.00% optimize.opt_a.a_2 : 0.001978s : 0.76% optimize.opt_a.accelerated_algorithm : 0.000136s : 0.05% optimize.opt_a.shard : 0.000008s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000026s : 0.01% optimize.opt_a.shard_inline : 0.000091s : 0.04% optimize.opt_a.merge_send_recv : 0.000084s : 0.03% optimize.opt_a.auto_parallel : 0.000073s : 0.03% optimize.opt_a.parallel : 0.000049s : 0.02% optimize.opt_a.flash_sp : 0.000024s : 0.01% optimize.opt_a.merge_comm : 0.000053s : 0.02% optimize.opt_a.allreduce_fusion : 0.000053s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000093s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000099s : 0.04% optimize.opt_a.virtual_dataset : 0.000091s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000090s : 0.03% optimize.opt_a.virtual_output : 0.000089s : 0.03% optimize.opt_a.merge_forward : 0.000049s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000009s : 0.00% optimize.opt_a.offload_activation : 0.000086s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000161s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000006s : 0.00% optimize.opt_a.before_grad : 0.000145s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000057s : 0.02% optimize.opt_a.meta_fg_expand : 0.009693s : 3.73% optimize.opt_a.flash_sp_send_recv_attached : 0.000014s : 0.01% optimize.opt_a.receive_attached : 0.000010s : 0.00% optimize.opt_a.after_resolve : 0.000291s : 0.11% optimize.opt_a.a_after_grad : 0.000351s : 0.13% optimize.opt_a.renormalize : 0.033020s : 12.70% optimize.opt_a.add_forward_monad_depend : 0.000034s : 0.01% optimize.opt_a.auto_monad_grad : 0.000022s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000312s : 0.12% optimize.opt_a.cse : 0.000716s : 0.28% optimize.opt_a.a_3 : 0.002349s : 0.90% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000047s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000834s : 0.32% optimize.opt_b.b_1 : 0.000416s : 0.16% optimize.opt_b.b_2 : 0.000021s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000011s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000012s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000060s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000030s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000027s : 0.01% optimize.loop_unroll : 0.000484s : 0.19% optimize.opt_after_cconv.c_1 : 0.000112s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000010s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_after_cconv.cse : 0.000055s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000063s : 0.02% optimize.tuple_transform.d_1 : 0.000123s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000023s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000121s : 0.05% optimize.cse_after_recomputation.cse : 0.000054s : 0.02% optimize.environ_conv : 0.000012s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000018s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000044s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000011s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000012s : 0.00% optimize.overlap_grad_flash_sp : 0.000051s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000023s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000039s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000023s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000036s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000135s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000605s : 0.23% validate : 0.000090s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.013289s : 5.11% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.003612 564 4.44% : 0.000160s : 9: substitution.arithmetic_simplify 3.12% : 0.000113s : 8: substitution.depend_value_elim 0.18% : 0.000006s : 10: substitution.elim_not_effective 0.28% : 0.000010s : 13: substitution.float_depend_g_call 0.11% : 0.000004s : 2: substitution.float_tuple_getitem_switch 0.16% : 0.000006s : 10: substitution.fold_const_symbol 40.99% : 0.001481s : 8: substitution.getattr_setattr_resolve 0.35% : 0.000013s : 15: substitution.graph_param_transform 0.09% : 0.000003s : 2: substitution.incorporate_call 0.08% : 0.000003s : 2: substitution.incorporate_call_switch 33.01% : 0.001192s : 26: substitution.inline 1.01% : 0.000036s : 5: substitution.inline_without_move 0.64% : 0.000023s : 51: substitution.j_node_and_user_rematch 0.72% : 0.000026s : 4: substitution.less_batch_normalization 0.35% : 0.000012s : 20: substitution.load_eliminater 0.49% : 0.000018s : 11: substitution.minmaximum_grad 0.31% : 0.000011s : 13: substitution.partial_eliminate 0.94% : 0.000034s : 51: substitution.remove_not_recompute_node 2.33% : 0.000084s : 35: substitution.replace_applicator 0.90% : 0.000032s : 53: substitution.replace_old_param 0.18% : 0.000007s : 2: substitution.set_cell_output_no_recompute 0.32% : 0.000012s : 3: substitution.switch_simplify 0.93% : 0.000034s : 11: substitution.tuple_list_convert_item_index_to_positive 0.42% : 0.000015s : 11: substitution.tuple_list_get_item_const_eliminator 0.59% : 0.000021s : 11: substitution.tuple_list_get_item_depend_reorder 1.84% : 0.000066s : 24: substitution.tuple_list_get_item_eliminator 0.60% : 0.000022s : 11: substitution.tuple_list_get_set_item_eliminator 1.79% : 0.000065s : 62: substitution.updatestate_pure_node_eliminater 2.84% : 0.000103s : 81: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.181241 2 95.82% : 0.173665s : 1: type_inference.infer 4.18% : 0.007576s : 1: type_inference.specialize ------[replace.] 0.000884 49 9.90% : 0.000088s : 6: replace.getattr_setattr_resolve 42.28% : 0.000374s : 26: replace.inline 12.25% : 0.000108s : 2: replace.replace_applicator 5.46% : 0.000048s : 3: replace.switch_simplify 24.69% : 0.000218s : 11: replace.tuple_list_get_item_eliminator 5.42% : 0.000048s : 1: replace.updatestate_useless_node_eliminater ------[match.] 0.002622 49 53.11% : 0.001393s : 6: match.getattr_setattr_resolve 44.89% : 0.001177s : 26: match.inline 0.55% : 0.000014s : 2: match.replace_applicator 0.35% : 0.000009s : 3: match.switch_simplify 0.82% : 0.000022s : 11: match.tuple_list_get_item_eliminator 0.29% : 0.000008s : 1: match.updatestate_useless_node_eliminater ------[predicate.] 0.001958 14794 1.01% : 0.000020s : 161: predicate.accumulaten_eliminater 0.22% : 0.000004s : 17: predicate.ad_related_special_op_eliminate 0.69% : 0.000013s : 107: predicate.addn_check_dump 1.02% : 0.000020s : 161: predicate.addn_zero_filter 0.96% : 0.000019s : 161: predicate.adjust_all_reduce_mul_add 2.23% : 0.000044s : 263: predicate.arithmetic_simplify 1.06% : 0.000021s : 161: predicate.cast_eliminate 2.01% : 0.000039s : 317: predicate.check_bprop_eliminate 0.68% : 0.000013s : 107: predicate.compare_switch_simplify 0.06% : 0.000001s : 15: predicate.const_output_eliminate 0.70% : 0.000014s : 102: predicate.depend_value_elim 1.10% : 0.000021s : 161: predicate.dict_get_item_const_eliminator 1.21% : 0.000024s : 161: predicate.dict_get_item_eliminator 1.02% : 0.000020s : 161: predicate.dict_set_item_eliminator 0.26% : 0.000005s : 32: predicate.dumpgradient_eliminate 0.06% : 0.000001s : 15: predicate.elim_not_effective 0.11% : 0.000002s : 15: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000022s : 176: predicate.environ_add_const_eliminate 1.08% : 0.000021s : 176: predicate.environ_get_add_eliminate 1.07% : 0.000021s : 176: predicate.environ_get_depend_swap 1.72% : 0.000034s : 278: predicate.environ_get_eliminate 1.07% : 0.000021s : 176: predicate.environ_get_set_eliminate 1.29% : 0.000025s : 198: predicate.exchange_switch_depend_value 1.71% : 0.000033s : 198: predicate.float_depend_g_call 0.68% : 0.000013s : 107: predicate.float_environ_get_switch 0.78% : 0.000015s : 122: predicate.float_tuple_getitem_switch 0.06% : 0.000001s : 15: predicate.fold_const_symbol 0.60% : 0.000012s : 82: predicate.get_grad_eliminate 0.50% : 0.000010s : 40: predicate.getattr_setattr_resolve 0.07% : 0.000001s : 15: predicate.graph_param_transform 0.65% : 0.000013s : 102: predicate.incorporate_call 0.62% : 0.000012s : 102: predicate.incorporate_call_switch 4.82% : 0.000094s : 578: predicate.inline 1.56% : 0.000031s : 186: predicate.inline_without_move 0.30% : 0.000006s : 82: predicate.j_node_and_user_rematch 0.80% : 0.000016s : 85: predicate.less_batch_normalization 1.47% : 0.000029s : 202: predicate.list_to_tuple_eliminator_ 2.29% : 0.000045s : 363: predicate.load_eliminater 0.21% : 0.000004s : 15: predicate.loop_unroll_after_grad 1.85% : 0.000036s : 273: predicate.loop_unroll_before_grad 1.37% : 0.000027s : 191: predicate.make_slice_get_slice_eliminator 0.73% : 0.000014s : 107: predicate.merge_addn 1.95% : 0.000038s : 309: predicate.micro_step_allgather_replace 1.98% : 0.000039s : 309: predicate.mini_step_allgather_replace 0.98% : 0.000019s : 161: predicate.minmaximum_grad 0.18% : 0.000004s : 15: predicate.mutable_eliminate 0.12% : 0.000002s : 15: predicate.opt_reshape 0.15% : 0.000003s : 15: predicate.parallel_virtual_node 1.66% : 0.000033s : 198: predicate.partial_defer_inline 1.37% : 0.000027s : 187: predicate.partial_eliminate 1.05% : 0.000021s : 161: predicate.print_const_string_wrapper 0.71% : 0.000014s : 102: predicate.reduce_all_const_elim 1.27% : 0.000025s : 161: predicate.reduce_eliminate 2.25% : 0.000044s : 363: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000007s : 82: predicate.remove_not_recompute_node 2.20% : 0.000043s : 485: predicate.replace_applicator 0.74% : 0.000015s : 186: predicate.replace_old_param 0.08% : 0.000001s : 15: predicate.reset_defer_inline 1.06% : 0.000021s : 161: predicate.reshape_eliminate 2.03% : 0.000040s : 309: predicate.row_tensor_add_zeros_like 0.11% : 0.000002s : 15: predicate.row_tensor_eliminate 2.15% : 0.000042s : 317: predicate.same_eliminate 0.41% : 0.000008s : 97: predicate.set_cell_output_no_recompute 0.62% : 0.000012s : 82: predicate.shard_identity_eliminate 0.24% : 0.000005s : 32: predicate.special_op_eliminate 0.77% : 0.000015s : 107: predicate.specialize_transform 2.00% : 0.000039s : 309: predicate.split_environ_get_set_with_tuple_value 1.46% : 0.000029s : 186: predicate.stack_unstack_eliminate 0.11% : 0.000002s : 15: predicate.switch_call_monad_eliminater 1.43% : 0.000028s : 198: predicate.switch_defer_inline 3.41% : 0.000067s : 515: predicate.switch_layer_defer_inline 4.13% : 0.000081s : 599: predicate.switch_simplify 1.09% : 0.000021s : 161: predicate.tile_eliminate 1.01% : 0.000020s : 161: predicate.transpose_eliminate 1.30% : 0.000025s : 191: predicate.tuple_list_convert_item_index_to_positive 1.41% : 0.000028s : 191: predicate.tuple_list_get_item_const_eliminator 1.28% : 0.000025s : 191: predicate.tuple_list_get_item_depend_reorder 2.38% : 0.000047s : 304: predicate.tuple_list_get_item_eliminator 1.43% : 0.000028s : 191: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000042s : 293: predicate.tuple_list_set_item_eliminator 1.29% : 0.000025s : 202: predicate.tuple_to_list_eliminator_ 2.31% : 0.000045s : 363: predicate.updatestate_pure_node_eliminater 3.07% : 0.000060s : 467: predicate.updatestate_useless_node_eliminater 0.13% : 0.000002s : 15: predicate.value_based_eliminate 0.59% : 0.000012s : 82: predicate.virtual_dataset_eliminate 0.60% : 0.000012s : 82: predicate.virtual_output_eliminate 0.12% : 0.000002s : 17: predicate.virtual_view_grad_eliminate 0.14% : 0.000003s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.009163 98 68.61% : 0.006287s : 58: func_graph_cloner_run.FuncGraphClonerGraph 2.36% : 0.000216s : 4: func_graph_cloner_run.FuncGraphClonerNode 29.03% : 0.002660s : 36: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.383150 307 0.00% : 0.000005s : 1: ForceFp32Comm 0.99% : 0.003805s : 1: add_attr 0.99% : 0.003795s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000126s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.11% : 0.000423s : 1: auto_monad 0.04% : 0.000142s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.19% : 0.000739s : 1: bootstrap 0.01% : 0.000030s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000048s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000070s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000016s : 1: environ_conv 0.11% : 0.000422s : 1: event_method 0.00% : 0.000013s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.13% : 0.000493s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.22% : 0.000844s : 1: mutable_eliminate 0.00% : 0.000015s : 1: offloading_packed_experts 0.01% : 0.000027s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000029s : 1: opt.transform.mutable_eliminate 3.69% : 0.014156s : 181: opt.transform.opt_a 0.03% : 0.000110s : 1: opt.transform.opt_after_cconv 0.02% : 0.000071s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000406s : 28: opt.transform.opt_b 0.43% : 0.001657s : 4: opt.transform.opt_resolve 0.04% : 0.000143s : 2: opt.transform.opt_trans_graph 0.03% : 0.000115s : 4: opt.transform.symbol_engine_opt 15.77% : 0.060427s : 1: opt_a 0.07% : 0.000250s : 1: opt_after_cconv 0.16% : 0.000615s : 1: opt_after_jit_grad 0.15% : 0.000583s : 1: opt_b 16.77% : 0.064242s : 1: optimize 0.01% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000055s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000015s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000007s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000085s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000068s : 1: remove_dup_value 6.57% : 0.025190s : 3: renormalize.infer 2.04% : 0.007801s : 3: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000050s : 1: rewriter_after_opt_a 0.13% : 0.000509s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000021s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000166s : 1: symbol_engine_optimizer 3.47% : 0.013307s : 1: task_emit 0.05% : 0.000176s : 1: tuple_transform 47.36% : 0.181451s : 1: type_inference 0.04% : 0.000143s : 1: validate TotalTime = 0.196044, [24] [bootstrap]: 0.00065888 [type_inference]: 0.164462 [event_method]: 2.274e-05 [auto_monad]: 0.00048557 [graph_reusing]: 5.99e-06 [inline]: 2.04e-06 [add_attr]: 0.00361215, [1] [add_attr_with_inline]: 0.00360258, [1] [Cycle 1]: 7.511e-05, [2] [tag_attr]: 3.655e-05 [meta_addattr_fg_expand]: 7.16001e-06 [parallel-infer-symbol]: 3.68999e-06 [pre_auto_parallel]: 4.786e-05 [insert-virtual-dataset]: 2.68998e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.0131784, [53] [py_interpret_to_execute]: 4.89e-06 [rewriter_before_opt_a]: 0.00010881 [opt_a]: 0.00992724, [2] [Cycle 1]: 0.00821097, [45] [expand_dump_flag]: 3.09999e-06 [switch_simplify]: 5.7e-05 [loop_unroll]: 4.211e-05 [a_1]: 0.0011236 [with_stream_mark]: 2.242e-05 [recompute_prepare]: 2.451e-05 [updatestate_depend_eliminate]: 5.305e-05 [updatestate_assign_eliminate]: 1.337e-05 [updatestate_loads_eliminate]: 1.236e-05 [parameter_eliminate]: 1.76998e-06 [a_2]: 0.00028633 [accelerated_algorithm]: 3.898e-05 [shard]: 1.81e-06 [meta_shard_fg_expand]: 4.17e-06 [shard_inline]: 1.864e-05 [merge_send_recv]: 1.559e-05 [auto_parallel]: 1.302e-05 [parallel]: 2.165e-05 [flash_sp]: 1.005e-05 [merge_comm]: 1.031e-05 [allreduce_fusion]: 9.62001e-06 [matmul_add_comm_reduction]: 1.758e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 2.043e-05 [virtual_dataset]: 1.825e-05 [get_grad_eliminate_]: 1.824e-05 [virtual_output]: 1.876e-05 [merge_forward]: 8.62998e-06 [cell_reuse_recompute_pass]: 1.24998e-06 [offload_activation]: 1.724e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.177e-05 [merge_recompute_call_nodes]: 1.57999e-06 [before_grad]: 2.957e-05 [set_forward_comm_id_for_comm_node_pass]: 1.016e-05 [meta_fg_expand]: 7.26001e-06 [flash_sp_send_recv_attached]: 4.67998e-06 [receive_attached]: 2.21e-06 [after_resolve]: 2.586e-05 [a_after_grad]: 2.98e-05 [renormalize]: 0.00550157 [add_forward_monad_depend]: 5.72999e-06 [auto_monad_grad]: 1.32e-06 [auto_monad_eliminator]: 5.154e-05 [cse]: 0.00019475 [a_3]: 0.00013994 [Cycle 2]: 0.00170511, [45] [expand_dump_flag]: 2.08998e-06 [switch_simplify]: 1.949e-05 [loop_unroll]: 1.879e-05 [a_1]: 0.00053921 [with_stream_mark]: 1.694e-05 [recompute_prepare]: 1.864e-05 [updatestate_depend_eliminate]: 1.024e-05 [updatestate_assign_eliminate]: 1.145e-05 [updatestate_loads_eliminate]: 1.16e-05 [parameter_eliminate]: 1.28002e-06 [a_2]: 0.00027408 [accelerated_algorithm]: 2.248e-05 [shard]: 1.52999e-06 [meta_shard_fg_expand]: 3.66001e-06 [shard_inline]: 1.786e-05 [merge_send_recv]: 1.36e-05 [auto_parallel]: 1.252e-05 [parallel]: 6.71999e-06 [flash_sp]: 3.83999e-06 [merge_comm]: 9.64e-06 [allreduce_fusion]: 9.39e-06 [matmul_add_comm_reduction]: 1.712e-05 [allreduce_slice_to_reducescatter]: 7.49977e-07 [virtual_shard_identity]: 1.926e-05 [virtual_dataset]: 1.776e-05 [get_grad_eliminate_]: 1.778e-05 [virtual_output]: 1.788e-05 [merge_forward]: 9.00001e-06 [cell_reuse_recompute_pass]: 2.39001e-06 [offload_activation]: 1.611e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.129e-05 [merge_recompute_call_nodes]: 1.19998e-06 [before_grad]: 2.85e-05 [set_forward_comm_id_for_comm_node_pass]: 9.96e-06 [meta_fg_expand]: 6.26e-06 [flash_sp_send_recv_attached]: 1.62999e-06 [receive_attached]: 2.06998e-06 [after_resolve]: 2.448e-05 [a_after_grad]: 2.84e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.82001e-06 [auto_monad_grad]: 1.21002e-06 [auto_monad_eliminator]: 3.663e-05 [cse]: 5.46e-05 [a_3]: 0.00012216 [py_interpret_to_execute_after_opt_a]: 6.09999e-06 [slice_cell_reuse_recomputed_activation]: 2.24999e-06 [rewriter_after_opt_a]: 4.704e-05 [convert_after_rewriter]: 1.24e-06 [order_py_execute_after_rewriter]: 1.01997e-06 [mutable_eliminate]: 0.00066073 [opt_b]: 0.00058977, [1] [Cycle 1]: 0.00058264, [7] [b_1]: 0.00043086 [b_2]: 2.075e-05 [updatestate_depend_eliminate]: 1.236e-05 [updatestate_assign_eliminate]: 1.049e-05 [updatestate_loads_eliminate]: 1.196e-05 [renormalize]: 6.89994e-07 [cse]: 5.841e-05 [optimize_parallel_all_gather_comm]: 3.091e-05 [overlap_param_gather]: 2.27999e-06 [cconv]: 2.036e-05 [loop_unroll]: 0.00057326 [opt_after_cconv]: 0.00025486, [1] [Cycle 1]: 0.00024869, [7] [c_1]: 0.00011965 [parameter_eliminate]: 3.04999e-06 [updatestate_depend_eliminate]: 1.321e-05 [updatestate_assign_eliminate]: 1.057e-05 [updatestate_loads_eliminate]: 1.146e-05 [cse]: 5.471e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 5.772e-05 [tuple_transform]: 0.00017344, [1] [Cycle 1]: 0.0001683, [4] [d_1]: 0.00012772 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 2.59985e-07 [switch_simplify]: 1.994e-05 [partial_unused_args_eliminate]: 1.13001e-06 [add_recomputation]: 9.3e-05 [cse_after_recomputation]: 5.61e-05, [1] [Cycle 1]: 5.079e-05, [1] [cse]: 4.408e-05 [environ_conv]: 1.074e-05 [swap_dp_allreduce_reducescatter]: 1.52e-05 [bias_add_comm_swap]: 3.00002e-06 [label_micro_interleaved_index]: 5.09e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 1.96998e-06 [micro_interleaved_order_control]: 2.81e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 1.12999e-06 [full_micro_interleaved_order_control]: 2.51998e-06 [reorder_send_recv_between_fp_bp]: 2.86e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.50001e-06 [interleave_parallel_branches]: 1.19e-06 [overlap_opt_shard_in_pipeline]: 1.57999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94e-06 [control_data_broadcast_order]: 3.651e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 9.47001e-06 [overlap_recompute_and_grad_model_parallel]: 1.104e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.34e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.14999e-06 [overlap_grad_ring_attention]: 1.036e-05 [overlap_grad_flash_sp]: 4.539e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.27999e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 1.07e-06 [symbol_engine_optimizer]: 0.00015156, [1] [Cycle 1]: 0.00014667, [6] [build]: 4.17e-06 [elim_shapecalc]: 2.354e-05 [elim_not_effective]: 3.483e-05 [opt_reshape]: 2.107e-05 [fold_const_symbol]: 3.066e-05 [renormalize]: 1.79978e-07 [detach_backward]: 1.96e-06 [pipeline_parallel_scheduler]: 1.66998e-06 [auto_monad_reorder]: 0.00012222 [get_jit_bprop_graph]: 1.35999e-06 [rewriter_after_jit_bprop_graph]: 4.22998e-06 [opt_after_jit_grad]: 0.00056874 [validate]: 7.859e-05 [backend_pass]: 1.21002e-06 [task_emit]: 0.0125254 [execute]: 5.20999e-06 Sums bootstrap : 0.000659s : 0.34% type_inference : 0.164462s : 85.95% event_method : 0.000023s : 0.01% auto_monad : 0.000486s : 0.25% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000037s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000048s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000109s : 0.06% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000076s : 0.04% optimize.opt_a.loop_unroll : 0.000061s : 0.03% optimize.opt_a.a_1 : 0.001663s : 0.87% optimize.opt_a.with_stream_mark : 0.000039s : 0.02% optimize.opt_a.recompute_prepare : 0.000043s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000063s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000025s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000024s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000560s : 0.29% optimize.opt_a.accelerated_algorithm : 0.000061s : 0.03% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000008s : 0.00% optimize.opt_a.shard_inline : 0.000037s : 0.02% optimize.opt_a.merge_send_recv : 0.000029s : 0.02% optimize.opt_a.auto_parallel : 0.000026s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.01% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000020s : 0.01% optimize.opt_a.allreduce_fusion : 0.000019s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000035s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000040s : 0.02% optimize.opt_a.virtual_dataset : 0.000036s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000036s : 0.02% optimize.opt_a.virtual_output : 0.000037s : 0.02% optimize.opt_a.merge_forward : 0.000018s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000033s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000063s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000058s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000020s : 0.01% optimize.opt_a.meta_fg_expand : 0.000014s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000050s : 0.03% optimize.opt_a.a_after_grad : 0.000058s : 0.03% optimize.opt_a.renormalize : 0.005502s : 2.88% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000088s : 0.05% optimize.opt_a.cse : 0.000249s : 0.13% optimize.opt_a.a_3 : 0.000262s : 0.14% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000047s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000661s : 0.35% optimize.opt_b.b_1 : 0.000431s : 0.23% optimize.opt_b.b_2 : 0.000021s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000012s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000058s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000020s : 0.01% optimize.loop_unroll : 0.000573s : 0.30% optimize.opt_after_cconv.c_1 : 0.000120s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000011s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000011s : 0.01% optimize.opt_after_cconv.cse : 0.000055s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000058s : 0.03% optimize.tuple_transform.d_1 : 0.000128s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000020s : 0.01% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000093s : 0.05% optimize.cse_after_recomputation.cse : 0.000044s : 0.02% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000015s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000037s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000011s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000010s : 0.01% optimize.overlap_grad_flash_sp : 0.000045s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000024s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000035s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000021s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000031s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000122s : 0.06% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000569s : 0.30% validate : 0.000079s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.012525s : 6.55% execute : 0.000005s : 0.00% Time group info: ------[substitution.] 0.000419 168 18.44% : 0.000077s : 4: substitution.arithmetic_simplify 1.97% : 0.000008s : 2: substitution.depend_value_elim 1.23% : 0.000005s : 10: substitution.elim_not_effective 0.94% : 0.000004s : 10: substitution.fold_const_symbol 2.63% : 0.000011s : 16: substitution.graph_param_transform 45.91% : 0.000192s : 4: substitution.inline 2.09% : 0.000009s : 20: substitution.j_node_and_user_rematch 4.05% : 0.000017s : 2: substitution.less_batch_normalization 1.81% : 0.000008s : 12: substitution.load_eliminater 3.40% : 0.000014s : 20: substitution.remove_not_recompute_node 1.58% : 0.000007s : 10: substitution.replace_old_param 7.94% : 0.000033s : 26: substitution.updatestate_pure_node_eliminater 8.00% : 0.000033s : 32: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.163624 2 97.77% : 0.159980s : 1: type_inference.infer 2.23% : 0.003643s : 1: type_inference.specialize ------[replace.] 0.000042 4 100.00% : 0.000042s : 4: replace.inline ------[match.] 0.000190 4 100.00% : 0.000190s : 4: match.inline ------[predicate.] 0.000548 4085 0.91% : 0.000005s : 40: predicate.accumulaten_eliminater 0.94% : 0.000005s : 18: predicate.ad_related_special_op_eliminate 0.75% : 0.000004s : 32: predicate.addn_check_dump 0.90% : 0.000005s : 40: predicate.addn_zero_filter 0.87% : 0.000005s : 40: predicate.adjust_all_reduce_mul_add 2.23% : 0.000012s : 72: predicate.arithmetic_simplify 0.94% : 0.000005s : 40: predicate.cast_eliminate 0.78% : 0.000004s : 32: predicate.check_bprop_eliminate 0.74% : 0.000004s : 32: predicate.compare_switch_simplify 0.24% : 0.000001s : 16: predicate.const_output_eliminate 0.76% : 0.000004s : 32: predicate.depend_value_elim 0.96% : 0.000005s : 40: predicate.dict_get_item_const_eliminator 1.08% : 0.000006s : 40: predicate.dict_get_item_eliminator 0.91% : 0.000005s : 40: predicate.dict_set_item_eliminator 0.98% : 0.000005s : 34: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 16: predicate.elim_not_effective 0.46% : 0.000003s : 16: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000007s : 56: predicate.environ_add_const_eliminate 1.23% : 0.000007s : 56: predicate.environ_get_add_eliminate 1.21% : 0.000007s : 56: predicate.environ_get_depend_swap 1.98% : 0.000011s : 88: predicate.environ_get_eliminate 1.26% : 0.000007s : 56: predicate.environ_get_set_eliminate 0.99% : 0.000005s : 44: predicate.exchange_switch_depend_value 1.42% : 0.000008s : 44: predicate.float_depend_g_call 0.76% : 0.000004s : 32: predicate.float_environ_get_switch 1.13% : 0.000006s : 48: predicate.float_tuple_getitem_switch 0.22% : 0.000001s : 16: predicate.fold_const_symbol 0.89% : 0.000005s : 32: predicate.get_grad_eliminate 0.24% : 0.000001s : 16: predicate.graph_param_transform 0.75% : 0.000004s : 32: predicate.incorporate_call 0.70% : 0.000004s : 32: predicate.incorporate_call_switch 5.35% : 0.000029s : 180: predicate.inline 0.93% : 0.000005s : 32: predicate.inline_without_move 0.45% : 0.000002s : 32: predicate.j_node_and_user_rematch 1.04% : 0.000006s : 35: predicate.less_batch_normalization 1.73% : 0.000009s : 72: predicate.list_to_tuple_eliminator_ 2.58% : 0.000014s : 112: predicate.load_eliminater 0.78% : 0.000004s : 16: predicate.loop_unroll_after_grad 1.46% : 0.000008s : 61: predicate.loop_unroll_before_grad 1.72% : 0.000009s : 72: predicate.make_slice_get_slice_eliminator 0.77% : 0.000004s : 32: predicate.merge_addn 0.75% : 0.000004s : 32: predicate.micro_step_allgather_replace 0.78% : 0.000004s : 32: predicate.mini_step_allgather_replace 0.86% : 0.000005s : 40: predicate.minmaximum_grad 0.78% : 0.000004s : 16: predicate.mutable_eliminate 0.47% : 0.000003s : 16: predicate.opt_reshape 0.41% : 0.000002s : 16: predicate.parallel_virtual_node 1.19% : 0.000007s : 44: predicate.partial_defer_inline 1.37% : 0.000007s : 56: predicate.partial_eliminate 0.93% : 0.000005s : 40: predicate.print_const_string_wrapper 0.75% : 0.000004s : 32: predicate.reduce_all_const_elim 1.11% : 0.000006s : 40: predicate.reduce_eliminate 2.44% : 0.000013s : 112: predicate.redundant_stop_gradient_eliminater 0.51% : 0.000003s : 32: predicate.remove_not_recompute_node 1.30% : 0.000007s : 72: predicate.replace_applicator 0.50% : 0.000003s : 32: predicate.replace_old_param 0.27% : 0.000002s : 16: predicate.reset_defer_inline 0.87% : 0.000005s : 40: predicate.reshape_eliminate 0.80% : 0.000004s : 32: predicate.row_tensor_add_zeros_like 0.48% : 0.000003s : 16: predicate.row_tensor_eliminate 0.95% : 0.000005s : 32: predicate.same_eliminate 0.56% : 0.000003s : 36: predicate.set_cell_output_no_recompute 0.85% : 0.000005s : 32: predicate.shard_identity_eliminate 0.94% : 0.000005s : 34: predicate.special_op_eliminate 0.83% : 0.000005s : 32: predicate.specialize_transform 0.84% : 0.000005s : 32: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000005s : 32: predicate.stack_unstack_eliminate 0.41% : 0.000002s : 16: predicate.switch_call_monad_eliminater 1.07% : 0.000006s : 44: predicate.switch_defer_inline 1.82% : 0.000010s : 76: predicate.switch_layer_defer_inline 3.89% : 0.000021s : 153: predicate.switch_simplify 0.93% : 0.000005s : 40: predicate.tile_eliminate 0.90% : 0.000005s : 40: predicate.transpose_eliminate 1.67% : 0.000009s : 72: predicate.tuple_list_convert_item_index_to_positive 1.76% : 0.000010s : 72: predicate.tuple_list_get_item_const_eliminator 1.63% : 0.000009s : 72: predicate.tuple_list_get_item_depend_reorder 2.73% : 0.000015s : 104: predicate.tuple_list_get_item_eliminator 1.68% : 0.000009s : 72: predicate.tuple_list_get_set_item_eliminator 2.68% : 0.000015s : 104: predicate.tuple_list_set_item_eliminator 1.70% : 0.000009s : 72: predicate.tuple_to_list_eliminator_ 2.58% : 0.000014s : 112: predicate.updatestate_pure_node_eliminater 3.59% : 0.000020s : 144: predicate.updatestate_useless_node_eliminater 0.42% : 0.000002s : 16: predicate.value_based_eliminate 0.85% : 0.000005s : 32: predicate.virtual_dataset_eliminate 0.84% : 0.000005s : 32: predicate.virtual_output_eliminate 0.42% : 0.000002s : 18: predicate.virtual_view_grad_eliminate 0.51% : 0.000003s : 16: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003729 34 79.01% : 0.002946s : 28: func_graph_cloner_run.FuncGraphClonerGraph 20.99% : 0.000783s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.222206 196 0.00% : 0.000004s : 1: ForceFp32Comm 1.63% : 0.003618s : 1: add_attr 1.62% : 0.003607s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000097s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.22% : 0.000500s : 1: auto_monad 0.06% : 0.000130s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.31% : 0.000684s : 1: bootstrap 0.01% : 0.000024s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000040s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000059s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.01% : 0.000029s : 1: event_method 0.00% : 0.000011s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.26% : 0.000583s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.30% : 0.000672s : 1: mutable_eliminate 0.01% : 0.000013s : 1: offloading_packed_experts 0.01% : 0.000030s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000031s : 1: opt.transform.mutable_eliminate 1.37% : 0.003050s : 78: opt.transform.opt_a 0.05% : 0.000118s : 1: opt.transform.opt_after_cconv 0.03% : 0.000072s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000422s : 28: opt.transform.opt_b 0.07% : 0.000145s : 2: opt.transform.opt_trans_graph 0.05% : 0.000106s : 4: opt.transform.symbol_engine_opt 4.47% : 0.009931s : 1: opt_a 0.12% : 0.000258s : 1: opt_after_cconv 0.26% : 0.000579s : 1: opt_after_jit_grad 0.27% : 0.000594s : 1: opt_b 5.93% : 0.013184s : 1: optimize 0.02% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000049s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000014s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000052s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000062s : 1: remove_dup_value 2.00% : 0.004441s : 1: renormalize.infer 0.47% : 0.001049s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000051s : 1: rewriter_after_opt_a 0.05% : 0.000113s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000018s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000155s : 1: symbol_engine_optimizer 5.64% : 0.012539s : 1: task_emit 0.08% : 0.000176s : 1: tuple_transform 74.02% : 0.164482s : 1: type_inference 0.06% : 0.000127s : 1: validate TotalTime = 0.273426, [24] [bootstrap]: 0.00093563 [type_inference]: 0.187854 [event_method]: 0.00036153 [auto_monad]: 0.00036047 [graph_reusing]: 8.77999e-06 [inline]: 2.56998e-06 [add_attr]: 0.0036099, [1] [add_attr_with_inline]: 0.00360006, [1] [Cycle 1]: 0.00011191, [2] [tag_attr]: 6.333e-05 [meta_addattr_fg_expand]: 1.501e-05 [parallel-infer-symbol]: 3.36001e-06 [pre_auto_parallel]: 7.49e-05 [insert-virtual-dataset]: 3.9e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.87001e-06 [pipeline_split]: 2.50997e-06 [optimize]: 0.0658757, [53] [py_interpret_to_execute]: 6.03998e-06 [rewriter_before_opt_a]: 0.00048373 [opt_a]: 0.0620866, [4] [Cycle 1]: 0.0472436, [45] [expand_dump_flag]: 4.85999e-06 [switch_simplify]: 0.00018508 [loop_unroll]: 9.775e-05 [a_1]: 0.00289544 [with_stream_mark]: 3.563e-05 [recompute_prepare]: 4.66e-05 [updatestate_depend_eliminate]: 0.00010584 [updatestate_assign_eliminate]: 2.238e-05 [updatestate_loads_eliminate]: 1.872e-05 [parameter_eliminate]: 3.4e-06 [a_2]: 0.00053825 [accelerated_algorithm]: 5.813e-05 [shard]: 2.32999e-06 [meta_shard_fg_expand]: 9.12001e-06 [shard_inline]: 3.247e-05 [merge_send_recv]: 3.886e-05 [auto_parallel]: 2.327e-05 [parallel]: 2.487e-05 [flash_sp]: 1.664e-05 [merge_comm]: 2.125e-05 [allreduce_fusion]: 2.031e-05 [matmul_add_comm_reduction]: 4.468e-05 [allreduce_slice_to_reducescatter]: 7.10017e-07 [virtual_shard_identity]: 3.549e-05 [virtual_dataset]: 3.238e-05 [get_grad_eliminate_]: 3.418e-05 [virtual_output]: 3.15e-05 [merge_forward]: 2.233e-05 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 2.982e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.652e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 5.184e-05 [set_forward_comm_id_for_comm_node_pass]: 2.503e-05 [meta_fg_expand]: 0.00932041 [flash_sp_send_recv_attached]: 7.88999e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 0.00019524 [a_after_grad]: 0.00025716 [renormalize]: 0.0299859 [add_forward_monad_depend]: 1.729e-05 [auto_monad_grad]: 1.411e-05 [auto_monad_eliminator]: 0.0001715 [cse]: 0.00046331 [a_3]: 0.00189342 [Cycle 2]: 0.0101087, [45] [expand_dump_flag]: 3.36999e-06 [switch_simplify]: 0.00012293 [loop_unroll]: 0.00011805 [a_1]: 0.00396477 [with_stream_mark]: 3.339e-05 [recompute_prepare]: 3.614e-05 [updatestate_depend_eliminate]: 2.92e-05 [updatestate_assign_eliminate]: 2.115e-05 [updatestate_loads_eliminate]: 2.022e-05 [parameter_eliminate]: 4.13001e-06 [a_2]: 0.00082817 [accelerated_algorithm]: 2.935e-05 [shard]: 2.16e-06 [meta_shard_fg_expand]: 6.59999e-06 [shard_inline]: 2.341e-05 [merge_send_recv]: 1.924e-05 [auto_parallel]: 1.852e-05 [parallel]: 9.54e-06 [flash_sp]: 4.70001e-06 [merge_comm]: 1.267e-05 [allreduce_fusion]: 1.225e-05 [matmul_add_comm_reduction]: 2.134e-05 [allreduce_slice_to_reducescatter]: 1.26002e-06 [virtual_shard_identity]: 2.407e-05 [virtual_dataset]: 2.284e-05 [get_grad_eliminate_]: 2.216e-05 [virtual_output]: 2.358e-05 [merge_forward]: 1.472e-05 [cell_reuse_recompute_pass]: 2.45002e-06 [offload_activation]: 2.681e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.487e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 3.947e-05 [set_forward_comm_id_for_comm_node_pass]: 1.66e-05 [meta_fg_expand]: 0.00016339 [flash_sp_send_recv_attached]: 2.01e-06 [receive_attached]: 2.71e-06 [after_resolve]: 3.493e-05 [a_after_grad]: 3.705e-05 [renormalize]: 0.00358517 [add_forward_monad_depend]: 1.084e-05 [auto_monad_grad]: 2.37999e-06 [auto_monad_eliminator]: 7.191e-05 [cse]: 0.00014662 [a_3]: 0.00017825 [Cycle 3]: 0.00293923, [45] [expand_dump_flag]: 2.29999e-06 [switch_simplify]: 2.569e-05 [loop_unroll]: 2.249e-05 [a_1]: 0.00065826 [with_stream_mark]: 2.429e-05 [recompute_prepare]: 2.472e-05 [updatestate_depend_eliminate]: 4.839e-05 [updatestate_assign_eliminate]: 1.291e-05 [updatestate_loads_eliminate]: 1.414e-05 [parameter_eliminate]: 1.92999e-06 [a_2]: 0.00027468 [accelerated_algorithm]: 2.372e-05 [shard]: 1.74e-06 [meta_shard_fg_expand]: 4.67e-06 [shard_inline]: 1.76e-05 [merge_send_recv]: 1.503e-05 [auto_parallel]: 1.495e-05 [parallel]: 9.54e-06 [flash_sp]: 1.81e-06 [merge_comm]: 1.072e-05 [allreduce_fusion]: 9.44e-06 [matmul_add_comm_reduction]: 1.619e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 1.986e-05 [virtual_dataset]: 1.777e-05 [get_grad_eliminate_]: 1.795e-05 [virtual_output]: 1.741e-05 [merge_forward]: 8.63001e-06 [cell_reuse_recompute_pass]: 3.6e-06 [offload_activation]: 1.664e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.221e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 2.949e-05 [set_forward_comm_id_for_comm_node_pass]: 9.92999e-06 [meta_fg_expand]: 7.34002e-06 [flash_sp_send_recv_attached]: 1.37e-06 [receive_attached]: 1.90001e-06 [after_resolve]: 2.541e-05 [a_after_grad]: 2.786e-05 [renormalize]: 0.00094639 [add_forward_monad_depend]: 6.11998e-06 [auto_monad_grad]: 1.99e-06 [auto_monad_eliminator]: 4.472e-05 [cse]: 8.939e-05 [a_3]: 0.0001314 [Cycle 4]: 0.00177194, [45] [expand_dump_flag]: 1.66e-06 [switch_simplify]: 2.003e-05 [loop_unroll]: 1.937e-05 [a_1]: 0.00052117 [with_stream_mark]: 1.833e-05 [recompute_prepare]: 1.822e-05 [updatestate_depend_eliminate]: 1.122e-05 [updatestate_assign_eliminate]: 1.139e-05 [updatestate_loads_eliminate]: 1.156e-05 [parameter_eliminate]: 1.45999e-06 [a_2]: 0.00031129 [accelerated_algorithm]: 2.235e-05 [shard]: 1.65001e-06 [meta_shard_fg_expand]: 5.00999e-06 [shard_inline]: 1.907e-05 [merge_send_recv]: 1.408e-05 [auto_parallel]: 1.402e-05 [parallel]: 6.61999e-06 [flash_sp]: 1.24998e-06 [merge_comm]: 1.032e-05 [allreduce_fusion]: 9.12001e-06 [matmul_add_comm_reduction]: 1.64e-05 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 2.001e-05 [virtual_dataset]: 1.756e-05 [get_grad_eliminate_]: 1.856e-05 [virtual_output]: 1.995e-05 [merge_forward]: 9.20999e-06 [cell_reuse_recompute_pass]: 3.19001e-06 [offload_activation]: 1.558e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.61e-05 [merge_recompute_call_nodes]: 1.03001e-06 [before_grad]: 3.185e-05 [set_forward_comm_id_for_comm_node_pass]: 1.134e-05 [meta_fg_expand]: 7.18998e-06 [flash_sp_send_recv_attached]: 1.98002e-06 [receive_attached]: 2.67001e-06 [after_resolve]: 2.489e-05 [a_after_grad]: 2.894e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.84001e-06 [auto_monad_grad]: 1.77999e-06 [auto_monad_eliminator]: 4.01e-05 [cse]: 6.236e-05 [a_3]: 0.00012157 [py_interpret_to_execute_after_opt_a]: 7.33e-06 [slice_cell_reuse_recomputed_activation]: 1.96e-06 [rewriter_after_opt_a]: 4.951e-05 [convert_after_rewriter]: 1.29e-06 [order_py_execute_after_rewriter]: 1.14998e-06 [mutable_eliminate]: 0.00078549 [opt_b]: 0.00058388, [1] [Cycle 1]: 0.00057634, [7] [b_1]: 0.00042182 [b_2]: 1.992e-05 [updatestate_depend_eliminate]: 1.309e-05 [updatestate_assign_eliminate]: 1.094e-05 [updatestate_loads_eliminate]: 1.163e-05 [renormalize]: 6.00005e-07 [cse]: 6.131e-05 [optimize_parallel_all_gather_comm]: 2.912e-05 [overlap_param_gather]: 2.37001e-06 [cconv]: 3.136e-05 [loop_unroll]: 0.00050698 [opt_after_cconv]: 0.00025482, [1] [Cycle 1]: 0.0002478, [7] [c_1]: 0.00011426 [parameter_eliminate]: 2.83e-06 [updatestate_depend_eliminate]: 1.405e-05 [updatestate_assign_eliminate]: 1.031e-05 [updatestate_loads_eliminate]: 1.153e-05 [cse]: 5.858e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 6.309e-05 [tuple_transform]: 0.00016926, [1] [Cycle 1]: 0.00016378, [4] [d_1]: 0.00012471 [none_parameter_eliminate]: 1.81e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 1.832e-05 [partial_unused_args_eliminate]: 1.84e-06 [add_recomputation]: 0.00011406 [cse_after_recomputation]: 6.528e-05, [1] [Cycle 1]: 5.938e-05, [1] [cse]: 5.271e-05 [environ_conv]: 1.328e-05 [swap_dp_allreduce_reducescatter]: 1.694e-05 [bias_add_comm_swap]: 3.34001e-06 [label_micro_interleaved_index]: 5.74e-06 [label_fine_grained_interleaved_index]: 2.96999e-06 [merge_cast_opt]: 1.54e-06 [slice_recompute_activation]: 1.99e-06 [micro_interleaved_order_control]: 2.31998e-06 [assign_add_opt]: 1.35999e-06 [ForceFp32Comm]: 1.14003e-06 [remove_cast_before_assign_add]: 1.25001e-06 [full_micro_interleaved_order_control]: 2.17001e-06 [reorder_send_recv_between_fp_bp]: 2.51998e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.44998e-06 [interleave_parallel_branches]: 1.20999e-06 [overlap_opt_shard_in_pipeline]: 1.69998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.29999e-06 [control_data_broadcast_order]: 4.053e-05 [grouped_pairwise_exchange_alltoall]: 2.12001e-06 [offloading_packed_experts]: 9.83998e-06 [overlap_recompute_and_grad_model_parallel]: 1.165e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.25999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.44999e-06 [overlap_grad_ring_attention]: 9.56998e-06 [overlap_grad_flash_sp]: 5.932e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.14999e-06 [split_layernorm_comm]: 1.89e-06 [handle_group_info]: 1.20999e-06 [symbol_engine_optimizer]: 0.00017353, [1] [Cycle 1]: 0.00016876, [6] [build]: 5.52001e-06 [elim_shapecalc]: 2.538e-05 [elim_not_effective]: 4.235e-05 [opt_reshape]: 2.356e-05 [fold_const_symbol]: 3.988e-05 [renormalize]: 2.09984e-07 [detach_backward]: 2.49999e-06 [pipeline_parallel_scheduler]: 1.67001e-06 [auto_monad_reorder]: 0.00013839 [get_jit_bprop_graph]: 2.59001e-06 [rewriter_after_jit_bprop_graph]: 5.09e-06 [opt_after_jit_grad]: 0.00058366 [validate]: 9.243e-05 [backend_pass]: 1.15001e-06 [task_emit]: 0.013223 [execute]: 8.39002e-06 Sums bootstrap : 0.000936s : 0.35% type_inference : 0.187854s : 70.07% event_method : 0.000362s : 0.13% auto_monad : 0.000360s : 0.13% graph_reusing : 0.000009s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000063s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000015s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000075s : 0.03% insert-virtual-dataset : 0.000004s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000003s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000484s : 0.18% optimize.opt_a.expand_dump_flag : 0.000012s : 0.00% optimize.opt_a.switch_simplify : 0.000354s : 0.13% optimize.opt_a.loop_unroll : 0.000258s : 0.10% optimize.opt_a.a_1 : 0.008040s : 3.00% optimize.opt_a.with_stream_mark : 0.000112s : 0.04% optimize.opt_a.recompute_prepare : 0.000126s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000195s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000068s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000065s : 0.02% optimize.opt_a.parameter_eliminate : 0.000011s : 0.00% optimize.opt_a.a_2 : 0.001952s : 0.73% optimize.opt_a.accelerated_algorithm : 0.000134s : 0.05% optimize.opt_a.shard : 0.000008s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000025s : 0.01% optimize.opt_a.shard_inline : 0.000093s : 0.03% optimize.opt_a.merge_send_recv : 0.000087s : 0.03% optimize.opt_a.auto_parallel : 0.000071s : 0.03% optimize.opt_a.parallel : 0.000051s : 0.02% optimize.opt_a.flash_sp : 0.000024s : 0.01% optimize.opt_a.merge_comm : 0.000055s : 0.02% optimize.opt_a.allreduce_fusion : 0.000051s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000099s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000099s : 0.04% optimize.opt_a.virtual_dataset : 0.000091s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000093s : 0.03% optimize.opt_a.virtual_output : 0.000092s : 0.03% optimize.opt_a.merge_forward : 0.000055s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000011s : 0.00% optimize.opt_a.offload_activation : 0.000089s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000170s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000153s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000063s : 0.02% optimize.opt_a.meta_fg_expand : 0.009498s : 3.54% optimize.opt_a.flash_sp_send_recv_attached : 0.000013s : 0.00% optimize.opt_a.receive_attached : 0.000010s : 0.00% optimize.opt_a.after_resolve : 0.000280s : 0.10% optimize.opt_a.a_after_grad : 0.000351s : 0.13% optimize.opt_a.renormalize : 0.034518s : 12.88% optimize.opt_a.add_forward_monad_depend : 0.000037s : 0.01% optimize.opt_a.auto_monad_grad : 0.000020s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000328s : 0.12% optimize.opt_a.cse : 0.000762s : 0.28% optimize.opt_a.a_3 : 0.002325s : 0.87% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000050s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000785s : 0.29% optimize.opt_b.b_1 : 0.000422s : 0.16% optimize.opt_b.b_2 : 0.000020s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000011s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000012s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000061s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000031s : 0.01% optimize.loop_unroll : 0.000507s : 0.19% optimize.opt_after_cconv.c_1 : 0.000114s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000010s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.cse : 0.000059s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000063s : 0.02% optimize.tuple_transform.d_1 : 0.000125s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000018s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000114s : 0.04% optimize.cse_after_recomputation.cse : 0.000053s : 0.02% optimize.environ_conv : 0.000013s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000017s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000041s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000010s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000010s : 0.00% optimize.overlap_grad_flash_sp : 0.000059s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000006s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000025s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000042s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000024s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000040s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000138s : 0.05% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000584s : 0.22% validate : 0.000092s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.013223s : 4.93% execute : 0.000008s : 0.00% Time group info: ------[substitution.] 0.003580 564 4.85% : 0.000174s : 9: substitution.arithmetic_simplify 0.61% : 0.000022s : 8: substitution.depend_value_elim 0.18% : 0.000006s : 10: substitution.elim_not_effective 0.29% : 0.000011s : 13: substitution.float_depend_g_call 0.09% : 0.000003s : 2: substitution.float_tuple_getitem_switch 0.15% : 0.000005s : 10: substitution.fold_const_symbol 41.27% : 0.001477s : 8: substitution.getattr_setattr_resolve 0.34% : 0.000012s : 15: substitution.graph_param_transform 0.09% : 0.000003s : 2: substitution.incorporate_call 0.06% : 0.000002s : 2: substitution.incorporate_call_switch 33.98% : 0.001216s : 26: substitution.inline 0.99% : 0.000036s : 5: substitution.inline_without_move 0.75% : 0.000027s : 51: substitution.j_node_and_user_rematch 0.72% : 0.000026s : 4: substitution.less_batch_normalization 0.38% : 0.000014s : 20: substitution.load_eliminater 0.45% : 0.000016s : 11: substitution.minmaximum_grad 0.35% : 0.000013s : 13: substitution.partial_eliminate 1.06% : 0.000038s : 51: substitution.remove_not_recompute_node 2.37% : 0.000085s : 35: substitution.replace_applicator 0.87% : 0.000031s : 53: substitution.replace_old_param 0.14% : 0.000005s : 2: substitution.set_cell_output_no_recompute 0.33% : 0.000012s : 3: substitution.switch_simplify 1.00% : 0.000036s : 11: substitution.tuple_list_convert_item_index_to_positive 0.45% : 0.000016s : 11: substitution.tuple_list_get_item_const_eliminator 0.64% : 0.000023s : 11: substitution.tuple_list_get_item_depend_reorder 1.91% : 0.000068s : 24: substitution.tuple_list_get_item_eliminator 0.64% : 0.000023s : 11: substitution.tuple_list_get_set_item_eliminator 2.00% : 0.000071s : 62: substitution.updatestate_pure_node_eliminater 3.02% : 0.000108s : 81: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.187700 2 96.34% : 0.180828s : 1: type_inference.infer 3.66% : 0.006872s : 1: type_inference.specialize ------[replace.] 0.000875 49 11.41% : 0.000100s : 6: replace.getattr_setattr_resolve 44.33% : 0.000388s : 26: replace.inline 5.67% : 0.000050s : 2: replace.replace_applicator 5.31% : 0.000046s : 3: replace.switch_simplify 27.09% : 0.000237s : 11: replace.tuple_list_get_item_eliminator 6.19% : 0.000054s : 1: replace.updatestate_useless_node_eliminater ------[match.] 0.002645 49 52.52% : 0.001389s : 6: match.getattr_setattr_resolve 45.34% : 0.001199s : 26: match.inline 0.55% : 0.000015s : 2: match.replace_applicator 0.39% : 0.000010s : 3: match.switch_simplify 0.85% : 0.000023s : 11: match.tuple_list_get_item_eliminator 0.34% : 0.000009s : 1: match.updatestate_useless_node_eliminater ------[predicate.] 0.001983 14794 1.06% : 0.000021s : 161: predicate.accumulaten_eliminater 0.23% : 0.000005s : 17: predicate.ad_related_special_op_eliminate 0.68% : 0.000013s : 107: predicate.addn_check_dump 1.10% : 0.000022s : 161: predicate.addn_zero_filter 0.97% : 0.000019s : 161: predicate.adjust_all_reduce_mul_add 2.17% : 0.000043s : 263: predicate.arithmetic_simplify 1.07% : 0.000021s : 161: predicate.cast_eliminate 2.03% : 0.000040s : 317: predicate.check_bprop_eliminate 0.68% : 0.000013s : 107: predicate.compare_switch_simplify 0.06% : 0.000001s : 15: predicate.const_output_eliminate 0.67% : 0.000013s : 102: predicate.depend_value_elim 1.08% : 0.000021s : 161: predicate.dict_get_item_const_eliminator 1.23% : 0.000024s : 161: predicate.dict_get_item_eliminator 1.04% : 0.000021s : 161: predicate.dict_set_item_eliminator 0.26% : 0.000005s : 32: predicate.dumpgradient_eliminate 0.07% : 0.000001s : 15: predicate.elim_not_effective 0.15% : 0.000003s : 15: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000022s : 176: predicate.environ_add_const_eliminate 1.06% : 0.000021s : 176: predicate.environ_get_add_eliminate 1.13% : 0.000022s : 176: predicate.environ_get_depend_swap 1.72% : 0.000034s : 278: predicate.environ_get_eliminate 1.06% : 0.000021s : 176: predicate.environ_get_set_eliminate 1.28% : 0.000025s : 198: predicate.exchange_switch_depend_value 1.75% : 0.000035s : 198: predicate.float_depend_g_call 0.70% : 0.000014s : 107: predicate.float_environ_get_switch 0.77% : 0.000015s : 122: predicate.float_tuple_getitem_switch 0.06% : 0.000001s : 15: predicate.fold_const_symbol 0.59% : 0.000012s : 82: predicate.get_grad_eliminate 0.51% : 0.000010s : 40: predicate.getattr_setattr_resolve 0.06% : 0.000001s : 15: predicate.graph_param_transform 0.67% : 0.000013s : 102: predicate.incorporate_call 0.62% : 0.000012s : 102: predicate.incorporate_call_switch 4.82% : 0.000096s : 578: predicate.inline 1.55% : 0.000031s : 186: predicate.inline_without_move 0.30% : 0.000006s : 82: predicate.j_node_and_user_rematch 0.72% : 0.000014s : 85: predicate.less_batch_normalization 1.36% : 0.000027s : 202: predicate.list_to_tuple_eliminator_ 2.32% : 0.000046s : 363: predicate.load_eliminater 0.21% : 0.000004s : 15: predicate.loop_unroll_after_grad 1.89% : 0.000037s : 273: predicate.loop_unroll_before_grad 1.22% : 0.000024s : 191: predicate.make_slice_get_slice_eliminator 0.74% : 0.000015s : 107: predicate.merge_addn 1.94% : 0.000038s : 309: predicate.micro_step_allgather_replace 1.96% : 0.000039s : 309: predicate.mini_step_allgather_replace 1.01% : 0.000020s : 161: predicate.minmaximum_grad 0.25% : 0.000005s : 15: predicate.mutable_eliminate 0.13% : 0.000003s : 15: predicate.opt_reshape 0.13% : 0.000003s : 15: predicate.parallel_virtual_node 1.80% : 0.000036s : 198: predicate.partial_defer_inline 1.37% : 0.000027s : 187: predicate.partial_eliminate 1.04% : 0.000021s : 161: predicate.print_const_string_wrapper 0.67% : 0.000013s : 102: predicate.reduce_all_const_elim 1.35% : 0.000027s : 161: predicate.reduce_eliminate 2.24% : 0.000044s : 363: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000007s : 82: predicate.remove_not_recompute_node 2.22% : 0.000044s : 485: predicate.replace_applicator 0.71% : 0.000014s : 186: predicate.replace_old_param 0.07% : 0.000001s : 15: predicate.reset_defer_inline 1.02% : 0.000020s : 161: predicate.reshape_eliminate 2.04% : 0.000040s : 309: predicate.row_tensor_add_zeros_like 0.11% : 0.000002s : 15: predicate.row_tensor_eliminate 2.18% : 0.000043s : 317: predicate.same_eliminate 0.42% : 0.000008s : 97: predicate.set_cell_output_no_recompute 0.62% : 0.000012s : 82: predicate.shard_identity_eliminate 0.23% : 0.000005s : 32: predicate.special_op_eliminate 0.78% : 0.000015s : 107: predicate.specialize_transform 2.01% : 0.000040s : 309: predicate.split_environ_get_set_with_tuple_value 1.45% : 0.000029s : 186: predicate.stack_unstack_eliminate 0.11% : 0.000002s : 15: predicate.switch_call_monad_eliminater 1.43% : 0.000028s : 198: predicate.switch_defer_inline 3.39% : 0.000067s : 515: predicate.switch_layer_defer_inline 4.15% : 0.000082s : 599: predicate.switch_simplify 1.01% : 0.000020s : 161: predicate.tile_eliminate 1.00% : 0.000020s : 161: predicate.transpose_eliminate 1.28% : 0.000025s : 191: predicate.tuple_list_convert_item_index_to_positive 1.39% : 0.000028s : 191: predicate.tuple_list_get_item_const_eliminator 1.31% : 0.000026s : 191: predicate.tuple_list_get_item_depend_reorder 2.33% : 0.000046s : 304: predicate.tuple_list_get_item_eliminator 1.33% : 0.000026s : 191: predicate.tuple_list_get_set_item_eliminator 2.10% : 0.000042s : 293: predicate.tuple_list_set_item_eliminator 1.30% : 0.000026s : 202: predicate.tuple_to_list_eliminator_ 2.36% : 0.000047s : 363: predicate.updatestate_pure_node_eliminater 3.14% : 0.000062s : 467: predicate.updatestate_useless_node_eliminater 0.11% : 0.000002s : 15: predicate.value_based_eliminate 0.57% : 0.000011s : 82: predicate.virtual_dataset_eliminate 0.57% : 0.000011s : 82: predicate.virtual_output_eliminate 0.12% : 0.000002s : 17: predicate.virtual_view_grad_eliminate 0.15% : 0.000003s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.010311 110 70.84% : 0.007304s : 70: func_graph_cloner_run.FuncGraphClonerGraph 2.16% : 0.000222s : 4: func_graph_cloner_run.FuncGraphClonerNode 27.00% : 0.002784s : 36: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.394268 307 0.00% : 0.000004s : 1: ForceFp32Comm 0.92% : 0.003615s : 1: add_attr 0.91% : 0.003604s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000119s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.09% : 0.000374s : 1: auto_monad 0.04% : 0.000146s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.24% : 0.000960s : 1: bootstrap 0.01% : 0.000035s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000044s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000069s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000017s : 1: environ_conv 0.10% : 0.000375s : 1: event_method 0.00% : 0.000014s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000007s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.13% : 0.000517s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.20% : 0.000796s : 1: mutable_eliminate 0.00% : 0.000013s : 1: offloading_packed_experts 0.01% : 0.000031s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000035s : 1: opt.transform.mutable_eliminate 3.64% : 0.014370s : 181: opt.transform.opt_a 0.03% : 0.000112s : 1: opt.transform.opt_after_cconv 0.02% : 0.000072s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000410s : 28: opt.transform.opt_b 0.43% : 0.001680s : 4: opt.transform.opt_resolve 0.04% : 0.000140s : 2: opt.transform.opt_trans_graph 0.03% : 0.000125s : 4: opt.transform.symbol_engine_opt 15.75% : 0.062091s : 1: opt_a 0.07% : 0.000258s : 1: opt_after_cconv 0.15% : 0.000594s : 1: opt_after_jit_grad 0.15% : 0.000587s : 1: opt_b 16.71% : 0.065883s : 1: optimize 0.01% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000064s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000013s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000080s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000068s : 1: remove_dup_value 6.74% : 0.026577s : 3: renormalize.infer 2.01% : 0.007911s : 3: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000053s : 1: rewriter_after_opt_a 0.12% : 0.000492s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000021s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000176s : 1: symbol_engine_optimizer 3.36% : 0.013238s : 1: task_emit 0.04% : 0.000172s : 1: tuple_transform 47.65% : 0.187873s : 1: type_inference 0.04% : 0.000145s : 1: validate TotalTime = 0.197041, [24] [bootstrap]: 0.00110322 [type_inference]: 0.163107 [event_method]: 2.254e-05 [auto_monad]: 0.00034666 [graph_reusing]: 7.15e-06 [inline]: 3.40998e-06 [add_attr]: 0.00495414, [1] [add_attr_with_inline]: 0.00494387, [1] [Cycle 1]: 8.063e-05, [2] [tag_attr]: 3.508e-05 [meta_addattr_fg_expand]: 7.29001e-06 [parallel-infer-symbol]: 3.18e-06 [pre_auto_parallel]: 4.422e-05 [insert-virtual-dataset]: 2.78003e-06 [parallel-infer-symbol-second]: 9.49978e-07 [dataset_repeat_opt]: 1.99999e-06 [pipeline_split]: 1.67999e-06 [optimize]: 0.0130178, [53] [py_interpret_to_execute]: 6.69999e-06 [rewriter_before_opt_a]: 0.00011704 [opt_a]: 0.00978102, [2] [Cycle 1]: 0.00795333, [45] [expand_dump_flag]: 3.53999e-06 [switch_simplify]: 6.05e-05 [loop_unroll]: 4.25e-05 [a_1]: 0.00123414 [with_stream_mark]: 2.386e-05 [recompute_prepare]: 2.547e-05 [updatestate_depend_eliminate]: 5.402e-05 [updatestate_assign_eliminate]: 1.377e-05 [updatestate_loads_eliminate]: 1.338e-05 [parameter_eliminate]: 1.88997e-06 [a_2]: 0.00029008 [accelerated_algorithm]: 4.26e-05 [shard]: 1.97001e-06 [meta_shard_fg_expand]: 4.59998e-06 [shard_inline]: 1.973e-05 [merge_send_recv]: 1.695e-05 [auto_parallel]: 1.232e-05 [parallel]: 2.276e-05 [flash_sp]: 1.124e-05 [merge_comm]: 1.053e-05 [allreduce_fusion]: 9.67999e-06 [matmul_add_comm_reduction]: 1.842e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 2.024e-05 [virtual_dataset]: 1.977e-05 [get_grad_eliminate_]: 1.953e-05 [virtual_output]: 1.9e-05 [merge_forward]: 9.59e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.831e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.209e-05 [merge_recompute_call_nodes]: 1.92001e-06 [before_grad]: 2.945e-05 [set_forward_comm_id_for_comm_node_pass]: 1.029e-05 [meta_fg_expand]: 7.23e-06 [flash_sp_send_recv_attached]: 5.12999e-06 [receive_attached]: 2.63e-06 [after_resolve]: 2.616e-05 [a_after_grad]: 3.105e-05 [renormalize]: 0.00513632 [add_forward_monad_depend]: 5.94e-06 [auto_monad_grad]: 2.20002e-06 [auto_monad_eliminator]: 5.594e-05 [cse]: 0.00015545 [a_3]: 0.00013845 [Cycle 2]: 0.00181683, [45] [expand_dump_flag]: 1.80001e-06 [switch_simplify]: 1.974e-05 [loop_unroll]: 1.912e-05 [a_1]: 0.00054702 [with_stream_mark]: 1.782e-05 [recompute_prepare]: 1.875e-05 [updatestate_depend_eliminate]: 9.45001e-06 [updatestate_assign_eliminate]: 1.153e-05 [updatestate_loads_eliminate]: 1.116e-05 [parameter_eliminate]: 1.05001e-06 [a_2]: 0.00036718 [accelerated_algorithm]: 2.257e-05 [shard]: 1.64e-06 [meta_shard_fg_expand]: 4.23999e-06 [shard_inline]: 1.829e-05 [merge_send_recv]: 1.602e-05 [auto_parallel]: 1.412e-05 [parallel]: 7.72998e-06 [flash_sp]: 4.2e-06 [merge_comm]: 9.96e-06 [allreduce_fusion]: 9.49999e-06 [matmul_add_comm_reduction]: 1.737e-05 [allreduce_slice_to_reducescatter]: 6.29982e-07 [virtual_shard_identity]: 1.917e-05 [virtual_dataset]: 1.833e-05 [get_grad_eliminate_]: 1.781e-05 [virtual_output]: 1.779e-05 [merge_forward]: 8.43001e-06 [cell_reuse_recompute_pass]: 2.24999e-06 [offload_activation]: 1.619e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.2e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 2.832e-05 [set_forward_comm_id_for_comm_node_pass]: 1.046e-05 [meta_fg_expand]: 6.54999e-06 [flash_sp_send_recv_attached]: 1.49e-06 [receive_attached]: 1.86998e-06 [after_resolve]: 2.45e-05 [a_after_grad]: 2.975e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.62001e-06 [auto_monad_grad]: 1.42e-06 [auto_monad_eliminator]: 3.716e-05 [cse]: 5.378e-05 [a_3]: 0.00012272 [py_interpret_to_execute_after_opt_a]: 5.53002e-06 [slice_cell_reuse_recomputed_activation]: 2.46998e-06 [rewriter_after_opt_a]: 4.498e-05 [convert_after_rewriter]: 1.04e-06 [order_py_execute_after_rewriter]: 1.01002e-06 [mutable_eliminate]: 0.0006778 [opt_b]: 0.00058965, [1] [Cycle 1]: 0.00058317, [7] [b_1]: 0.00043387 [b_2]: 2.056e-05 [updatestate_depend_eliminate]: 1.251e-05 [updatestate_assign_eliminate]: 1.085e-05 [updatestate_loads_eliminate]: 1.137e-05 [renormalize]: 4.00003e-07 [cse]: 5.819e-05 [optimize_parallel_all_gather_comm]: 3.02e-05 [overlap_param_gather]: 2.49001e-06 [cconv]: 1.831e-05 [loop_unroll]: 0.00048124 [opt_after_cconv]: 0.00025417, [1] [Cycle 1]: 0.00024779, [7] [c_1]: 0.00011956 [parameter_eliminate]: 2.74999e-06 [updatestate_depend_eliminate]: 1.24e-05 [updatestate_assign_eliminate]: 1.048e-05 [updatestate_loads_eliminate]: 1.106e-05 [cse]: 5.416e-05 [renormalize]: 5.10016e-07 [remove_dup_value]: 5.61e-05 [tuple_transform]: 0.00017485, [1] [Cycle 1]: 0.00016927, [4] [d_1]: 0.00013114 [none_parameter_eliminate]: 1.04e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 1.848e-05 [partial_unused_args_eliminate]: 1.35999e-06 [add_recomputation]: 9.243e-05 [cse_after_recomputation]: 5.661e-05, [1] [Cycle 1]: 5.203e-05, [1] [cse]: 4.58e-05 [environ_conv]: 1.181e-05 [swap_dp_allreduce_reducescatter]: 1.392e-05 [bias_add_comm_swap]: 3.00998e-06 [label_micro_interleaved_index]: 5.86e-06 [label_fine_grained_interleaved_index]: 2.53e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 2.35002e-06 [micro_interleaved_order_control]: 2.80997e-06 [assign_add_opt]: 1.46002e-06 [ForceFp32Comm]: 1.25001e-06 [remove_cast_before_assign_add]: 1.29e-06 [full_micro_interleaved_order_control]: 2.29001e-06 [reorder_send_recv_between_fp_bp]: 2.70002e-06 [comm_op_add_attrs]: 1.14e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.20001e-06 [interleave_parallel_branches]: 1.29e-06 [overlap_opt_shard_in_pipeline]: 2.31e-06 [overlap_opt_shard_grad_in_pipeline]: 2.19001e-06 [control_data_broadcast_order]: 3.938e-05 [grouped_pairwise_exchange_alltoall]: 1.51002e-06 [offloading_packed_experts]: 1.069e-05 [overlap_recompute_and_grad_model_parallel]: 1.109e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.32e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34998e-06 [overlap_recompute_comm]: 2.27001e-06 [overlap_grad_ring_attention]: 1.016e-05 [overlap_grad_flash_sp]: 4.379e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.52001e-06 [split_layernorm_comm]: 1.81998e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 0.00020357, [1] [Cycle 1]: 0.00019787, [6] [build]: 4.71002e-06 [elim_shapecalc]: 2.427e-05 [elim_not_effective]: 8.002e-05 [opt_reshape]: 2.24e-05 [fold_const_symbol]: 3.407e-05 [renormalize]: 2.20025e-07 [detach_backward]: 2.32999e-06 [pipeline_parallel_scheduler]: 1.51998e-06 [auto_monad_reorder]: 0.00014043 [get_jit_bprop_graph]: 1.76e-06 [rewriter_after_jit_bprop_graph]: 5.17e-06 [opt_after_jit_grad]: 0.00062839 [validate]: 8.685e-05 [backend_pass]: 1.21002e-06 [task_emit]: 0.01329 [execute]: 6.94999e-06 Sums bootstrap : 0.001103s : 0.58% type_inference : 0.163107s : 85.40% event_method : 0.000023s : 0.01% auto_monad : 0.000347s : 0.18% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000035s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000044s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.00% optimize.rewriter_before_opt_a : 0.000117s : 0.06% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000080s : 0.04% optimize.opt_a.loop_unroll : 0.000062s : 0.03% optimize.opt_a.a_1 : 0.001781s : 0.93% optimize.opt_a.with_stream_mark : 0.000042s : 0.02% optimize.opt_a.recompute_prepare : 0.000044s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000063s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000025s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000025s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000657s : 0.34% optimize.opt_a.accelerated_algorithm : 0.000065s : 0.03% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000009s : 0.00% optimize.opt_a.shard_inline : 0.000038s : 0.02% optimize.opt_a.merge_send_recv : 0.000033s : 0.02% optimize.opt_a.auto_parallel : 0.000026s : 0.01% optimize.opt_a.parallel : 0.000030s : 0.02% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000020s : 0.01% optimize.opt_a.allreduce_fusion : 0.000019s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000036s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000039s : 0.02% optimize.opt_a.virtual_dataset : 0.000038s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000037s : 0.02% optimize.opt_a.virtual_output : 0.000037s : 0.02% optimize.opt_a.merge_forward : 0.000018s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000035s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000064s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000058s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000021s : 0.01% optimize.opt_a.meta_fg_expand : 0.000014s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000051s : 0.03% optimize.opt_a.a_after_grad : 0.000061s : 0.03% optimize.opt_a.renormalize : 0.005136s : 2.69% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000093s : 0.05% optimize.opt_a.cse : 0.000209s : 0.11% optimize.opt_a.a_3 : 0.000261s : 0.14% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000045s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000678s : 0.35% optimize.opt_b.b_1 : 0.000434s : 0.23% optimize.opt_b.b_2 : 0.000021s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000011s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000058s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000030s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000018s : 0.01% optimize.loop_unroll : 0.000481s : 0.25% optimize.opt_after_cconv.c_1 : 0.000120s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000010s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000011s : 0.01% optimize.opt_after_cconv.cse : 0.000054s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000056s : 0.03% optimize.tuple_transform.d_1 : 0.000131s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000018s : 0.01% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000092s : 0.05% optimize.cse_after_recomputation.cse : 0.000046s : 0.02% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000014s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000039s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000011s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000011s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000010s : 0.01% optimize.overlap_grad_flash_sp : 0.000044s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000024s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000080s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000022s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000034s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000140s : 0.07% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000628s : 0.33% validate : 0.000087s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.013290s : 6.96% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.000458 168 17.07% : 0.000078s : 4: substitution.arithmetic_simplify 1.90% : 0.000009s : 2: substitution.depend_value_elim 1.12% : 0.000005s : 10: substitution.elim_not_effective 1.17% : 0.000005s : 10: substitution.fold_const_symbol 2.42% : 0.000011s : 16: substitution.graph_param_transform 49.32% : 0.000226s : 4: substitution.inline 1.97% : 0.000009s : 20: substitution.j_node_and_user_rematch 4.33% : 0.000020s : 2: substitution.less_batch_normalization 1.70% : 0.000008s : 12: substitution.load_eliminater 3.37% : 0.000015s : 20: substitution.remove_not_recompute_node 1.61% : 0.000007s : 10: substitution.replace_old_param 6.81% : 0.000031s : 26: substitution.updatestate_pure_node_eliminater 7.19% : 0.000033s : 32: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.162971 2 97.44% : 0.158793s : 1: type_inference.infer 2.56% : 0.004178s : 1: type_inference.specialize ------[replace.] 0.000047 4 100.00% : 0.000047s : 4: replace.inline ------[match.] 0.000223 4 100.00% : 0.000223s : 4: match.inline ------[predicate.] 0.000647 4085 0.77% : 0.000005s : 40: predicate.accumulaten_eliminater 0.90% : 0.000006s : 18: predicate.ad_related_special_op_eliminate 13.91% : 0.000090s : 32: predicate.addn_check_dump 0.83% : 0.000005s : 40: predicate.addn_zero_filter 0.74% : 0.000005s : 40: predicate.adjust_all_reduce_mul_add 2.17% : 0.000014s : 72: predicate.arithmetic_simplify 0.84% : 0.000005s : 40: predicate.cast_eliminate 0.65% : 0.000004s : 32: predicate.check_bprop_eliminate 0.68% : 0.000004s : 32: predicate.compare_switch_simplify 0.21% : 0.000001s : 16: predicate.const_output_eliminate 0.67% : 0.000004s : 32: predicate.depend_value_elim 0.83% : 0.000005s : 40: predicate.dict_get_item_const_eliminator 0.86% : 0.000006s : 40: predicate.dict_get_item_eliminator 0.75% : 0.000005s : 40: predicate.dict_set_item_eliminator 0.90% : 0.000006s : 34: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 16: predicate.elim_not_effective 0.41% : 0.000003s : 16: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000007s : 56: predicate.environ_add_const_eliminate 1.07% : 0.000007s : 56: predicate.environ_get_add_eliminate 1.03% : 0.000007s : 56: predicate.environ_get_depend_swap 1.78% : 0.000011s : 88: predicate.environ_get_eliminate 1.07% : 0.000007s : 56: predicate.environ_get_set_eliminate 0.84% : 0.000005s : 44: predicate.exchange_switch_depend_value 1.25% : 0.000008s : 44: predicate.float_depend_g_call 0.63% : 0.000004s : 32: predicate.float_environ_get_switch 0.93% : 0.000006s : 48: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 16: predicate.fold_const_symbol 0.71% : 0.000005s : 32: predicate.get_grad_eliminate 0.21% : 0.000001s : 16: predicate.graph_param_transform 0.65% : 0.000004s : 32: predicate.incorporate_call 0.59% : 0.000004s : 32: predicate.incorporate_call_switch 4.50% : 0.000029s : 180: predicate.inline 0.81% : 0.000005s : 32: predicate.inline_without_move 0.37% : 0.000002s : 32: predicate.j_node_and_user_rematch 0.94% : 0.000006s : 35: predicate.less_batch_normalization 1.58% : 0.000010s : 72: predicate.list_to_tuple_eliminator_ 2.17% : 0.000014s : 112: predicate.load_eliminater 0.67% : 0.000004s : 16: predicate.loop_unroll_after_grad 1.32% : 0.000009s : 61: predicate.loop_unroll_before_grad 1.51% : 0.000010s : 72: predicate.make_slice_get_slice_eliminator 0.73% : 0.000005s : 32: predicate.merge_addn 0.66% : 0.000004s : 32: predicate.micro_step_allgather_replace 0.65% : 0.000004s : 32: predicate.mini_step_allgather_replace 0.75% : 0.000005s : 40: predicate.minmaximum_grad 0.66% : 0.000004s : 16: predicate.mutable_eliminate 0.37% : 0.000002s : 16: predicate.opt_reshape 0.44% : 0.000003s : 16: predicate.parallel_virtual_node 1.07% : 0.000007s : 44: predicate.partial_defer_inline 1.19% : 0.000008s : 56: predicate.partial_eliminate 0.75% : 0.000005s : 40: predicate.print_const_string_wrapper 0.65% : 0.000004s : 32: predicate.reduce_all_const_elim 0.97% : 0.000006s : 40: predicate.reduce_eliminate 2.13% : 0.000014s : 112: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000003s : 32: predicate.remove_not_recompute_node 1.11% : 0.000007s : 72: predicate.replace_applicator 0.43% : 0.000003s : 32: predicate.replace_old_param 0.22% : 0.000001s : 16: predicate.reset_defer_inline 0.79% : 0.000005s : 40: predicate.reshape_eliminate 0.69% : 0.000004s : 32: predicate.row_tensor_add_zeros_like 0.37% : 0.000002s : 16: predicate.row_tensor_eliminate 0.75% : 0.000005s : 32: predicate.same_eliminate 0.47% : 0.000003s : 36: predicate.set_cell_output_no_recompute 0.71% : 0.000005s : 32: predicate.shard_identity_eliminate 0.87% : 0.000006s : 34: predicate.special_op_eliminate 0.74% : 0.000005s : 32: predicate.specialize_transform 0.75% : 0.000005s : 32: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000006s : 32: predicate.stack_unstack_eliminate 0.35% : 0.000002s : 16: predicate.switch_call_monad_eliminater 0.94% : 0.000006s : 44: predicate.switch_defer_inline 1.55% : 0.000010s : 76: predicate.switch_layer_defer_inline 3.33% : 0.000022s : 153: predicate.switch_simplify 0.77% : 0.000005s : 40: predicate.tile_eliminate 0.75% : 0.000005s : 40: predicate.transpose_eliminate 1.43% : 0.000009s : 72: predicate.tuple_list_convert_item_index_to_positive 1.45% : 0.000009s : 72: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000009s : 72: predicate.tuple_list_get_item_depend_reorder 2.42% : 0.000016s : 104: predicate.tuple_list_get_item_eliminator 1.45% : 0.000009s : 72: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000015s : 104: predicate.tuple_list_set_item_eliminator 1.43% : 0.000009s : 72: predicate.tuple_to_list_eliminator_ 2.25% : 0.000015s : 112: predicate.updatestate_pure_node_eliminater 3.04% : 0.000020s : 144: predicate.updatestate_useless_node_eliminater 0.36% : 0.000002s : 16: predicate.value_based_eliminate 0.75% : 0.000005s : 32: predicate.virtual_dataset_eliminate 0.68% : 0.000004s : 32: predicate.virtual_output_eliminate 0.40% : 0.000003s : 18: predicate.virtual_view_grad_eliminate 0.38% : 0.000002s : 16: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003378 26 75.03% : 0.002534s : 20: func_graph_cloner_run.FuncGraphClonerGraph 24.97% : 0.000844s : 6: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.224304 196 0.00% : 0.000004s : 1: ForceFp32Comm 2.21% : 0.004960s : 1: add_attr 2.21% : 0.004947s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000097s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.16% : 0.000358s : 1: auto_monad 0.07% : 0.000148s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.50% : 0.001132s : 1: bootstrap 0.01% : 0.000022s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000043s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000060s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.01% : 0.000029s : 1: event_method 0.01% : 0.000012s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.22% : 0.000491s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.31% : 0.000686s : 1: mutable_eliminate 0.01% : 0.000014s : 1: offloading_packed_experts 0.01% : 0.000030s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000030s : 1: opt.transform.mutable_eliminate 1.46% : 0.003279s : 78: opt.transform.opt_a 0.05% : 0.000118s : 1: opt.transform.opt_after_cconv 0.03% : 0.000078s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000424s : 28: opt.transform.opt_b 0.07% : 0.000147s : 2: opt.transform.opt_trans_graph 0.07% : 0.000156s : 4: opt.transform.symbol_engine_opt 4.36% : 0.009786s : 1: opt_a 0.11% : 0.000258s : 1: opt_after_cconv 0.29% : 0.000642s : 1: opt_after_jit_grad 0.26% : 0.000594s : 1: opt_b 5.81% : 0.013024s : 1: optimize 0.02% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000047s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000013s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000049s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000061s : 1: remove_dup_value 1.78% : 0.004000s : 1: renormalize.infer 0.50% : 0.001125s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000048s : 1: rewriter_after_opt_a 0.05% : 0.000122s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000017s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000207s : 1: symbol_engine_optimizer 5.93% : 0.013304s : 1: task_emit 0.08% : 0.000178s : 1: tuple_transform 72.73% : 0.163130s : 1: type_inference 0.06% : 0.000136s : 1: validate TotalTime = 0.259768, [24] [bootstrap]: 0.00073015 [type_inference]: 0.178286 [event_method]: 0.00037295 [auto_monad]: 0.00039669 [graph_reusing]: 1.044e-05 [inline]: 3.71001e-06 [add_attr]: 0.00359635, [1] [add_attr_with_inline]: 0.00358804, [1] [Cycle 1]: 0.0001057, [2] [tag_attr]: 5.979e-05 [meta_addattr_fg_expand]: 1.423e-05 [parallel-infer-symbol]: 3.41001e-06 [pre_auto_parallel]: 7.203e-05 [insert-virtual-dataset]: 3.43e-06 [parallel-infer-symbol-second]: 6.19999e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 2.06e-06 [optimize]: 0.0618585, [53] [py_interpret_to_execute]: 4.03999e-06 [rewriter_before_opt_a]: 0.00047149 [opt_a]: 0.0582205, [4] [Cycle 1]: 0.0440152, [45] [expand_dump_flag]: 3.58e-06 [switch_simplify]: 0.00018247 [loop_unroll]: 9.466e-05 [a_1]: 0.00286221 [with_stream_mark]: 3.172e-05 [recompute_prepare]: 4.41e-05 [updatestate_depend_eliminate]: 0.00011364 [updatestate_assign_eliminate]: 2.183e-05 [updatestate_loads_eliminate]: 1.841e-05 [parameter_eliminate]: 2.41e-06 [a_2]: 0.00056222 [accelerated_algorithm]: 5.755e-05 [shard]: 2.25002e-06 [meta_shard_fg_expand]: 8.86002e-06 [shard_inline]: 3.334e-05 [merge_send_recv]: 4.098e-05 [auto_parallel]: 2.346e-05 [parallel]: 2.309e-05 [flash_sp]: 1.621e-05 [merge_comm]: 2.091e-05 [allreduce_fusion]: 1.967e-05 [matmul_add_comm_reduction]: 4.284e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 3.569e-05 [virtual_dataset]: 3.206e-05 [get_grad_eliminate_]: 3.47e-05 [virtual_output]: 3.27e-05 [merge_forward]: 2.091e-05 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 3.023e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.658e-05 [merge_recompute_call_nodes]: 1.55001e-06 [before_grad]: 5.153e-05 [set_forward_comm_id_for_comm_node_pass]: 2.34e-05 [meta_fg_expand]: 0.00896391 [flash_sp_send_recv_attached]: 8.54e-06 [receive_attached]: 3.25e-06 [after_resolve]: 0.00020446 [a_after_grad]: 0.00025408 [renormalize]: 0.027112 [add_forward_monad_depend]: 1.98e-05 [auto_monad_grad]: 1.698e-05 [auto_monad_eliminator]: 0.00017285 [cse]: 0.00045981 [a_3]: 0.00191817 [Cycle 2]: 0.00977002, [45] [expand_dump_flag]: 3.76999e-06 [switch_simplify]: 0.00012598 [loop_unroll]: 0.00013486 [a_1]: 0.00387915 [with_stream_mark]: 3.861e-05 [recompute_prepare]: 3.931e-05 [updatestate_depend_eliminate]: 2.938e-05 [updatestate_assign_eliminate]: 2.132e-05 [updatestate_loads_eliminate]: 2.095e-05 [parameter_eliminate]: 5.22999e-06 [a_2]: 0.00082889 [accelerated_algorithm]: 3.071e-05 [shard]: 2.56e-06 [meta_shard_fg_expand]: 7.02002e-06 [shard_inline]: 2.312e-05 [merge_send_recv]: 2.065e-05 [auto_parallel]: 1.923e-05 [parallel]: 1.143e-05 [flash_sp]: 4.83001e-06 [merge_comm]: 1.3e-05 [allreduce_fusion]: 1.219e-05 [matmul_add_comm_reduction]: 2.102e-05 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 2.416e-05 [virtual_dataset]: 2.355e-05 [get_grad_eliminate_]: 2.811e-05 [virtual_output]: 2.36e-05 [merge_forward]: 1.571e-05 [cell_reuse_recompute_pass]: 1.64998e-06 [offload_activation]: 2.756e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.587e-05 [merge_recompute_call_nodes]: 1.79e-06 [before_grad]: 3.994e-05 [set_forward_comm_id_for_comm_node_pass]: 1.748e-05 [meta_fg_expand]: 0.00017089 [flash_sp_send_recv_attached]: 1.97999e-06 [receive_attached]: 2.81999e-06 [after_resolve]: 3.276e-05 [a_after_grad]: 3.636e-05 [renormalize]: 0.00329653 [add_forward_monad_depend]: 7.4e-06 [auto_monad_grad]: 1.44e-06 [auto_monad_eliminator]: 6.293e-05 [cse]: 0.00015249 [a_3]: 0.00017581 [Cycle 3]: 0.00282384, [45] [expand_dump_flag]: 1.50001e-06 [switch_simplify]: 2.392e-05 [loop_unroll]: 2.269e-05 [a_1]: 0.00069421 [with_stream_mark]: 1.759e-05 [recompute_prepare]: 2.163e-05 [updatestate_depend_eliminate]: 4.025e-05 [updatestate_assign_eliminate]: 1.165e-05 [updatestate_loads_eliminate]: 1.245e-05 [parameter_eliminate]: 1.55001e-06 [a_2]: 0.00027089 [accelerated_algorithm]: 2.262e-05 [shard]: 1.33002e-06 [meta_shard_fg_expand]: 4.03999e-06 [shard_inline]: 1.721e-05 [merge_send_recv]: 1.506e-05 [auto_parallel]: 1.422e-05 [parallel]: 7.51001e-06 [flash_sp]: 1.10001e-06 [merge_comm]: 9.91e-06 [allreduce_fusion]: 9.47999e-06 [matmul_add_comm_reduction]: 1.462e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 1.822e-05 [virtual_dataset]: 1.69e-05 [get_grad_eliminate_]: 1.794e-05 [virtual_output]: 1.736e-05 [merge_forward]: 8.23999e-06 [cell_reuse_recompute_pass]: 2.22001e-06 [offload_activation]: 1.612e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.172e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 2.801e-05 [set_forward_comm_id_for_comm_node_pass]: 9.39998e-06 [meta_fg_expand]: 6.68e-06 [flash_sp_send_recv_attached]: 1.45001e-06 [receive_attached]: 2.31e-06 [after_resolve]: 2.59e-05 [a_after_grad]: 2.772e-05 [renormalize]: 0.00087087 [add_forward_monad_depend]: 6.02001e-06 [auto_monad_grad]: 1.29e-06 [auto_monad_eliminator]: 3.849e-05 [cse]: 7.773e-05 [a_3]: 0.00012895 [Cycle 4]: 0.00159102, [45] [expand_dump_flag]: 9.60019e-07 [switch_simplify]: 1.875e-05 [loop_unroll]: 1.722e-05 [a_1]: 0.00049791 [with_stream_mark]: 1.37e-05 [recompute_prepare]: 1.746e-05 [updatestate_depend_eliminate]: 1.01e-05 [updatestate_assign_eliminate]: 1.025e-05 [updatestate_loads_eliminate]: 1.086e-05 [parameter_eliminate]: 8.99978e-07 [a_2]: 0.00026074 [accelerated_algorithm]: 2.147e-05 [shard]: 1.02e-06 [meta_shard_fg_expand]: 3.54002e-06 [shard_inline]: 1.735e-05 [merge_send_recv]: 1.172e-05 [auto_parallel]: 1.188e-05 [parallel]: 4.83001e-06 [flash_sp]: 1.03001e-06 [merge_comm]: 9.79e-06 [allreduce_fusion]: 9.02e-06 [matmul_add_comm_reduction]: 1.472e-05 [allreduce_slice_to_reducescatter]: 4.59986e-07 [virtual_shard_identity]: 1.89e-05 [virtual_dataset]: 1.66e-05 [get_grad_eliminate_]: 1.677e-05 [virtual_output]: 1.7e-05 [merge_forward]: 8.1e-06 [cell_reuse_recompute_pass]: 1.65001e-06 [offload_activation]: 1.362e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.982e-05 [merge_recompute_call_nodes]: 9.09989e-07 [before_grad]: 2.755e-05 [set_forward_comm_id_for_comm_node_pass]: 9.46e-06 [meta_fg_expand]: 6.19001e-06 [flash_sp_send_recv_attached]: 1.40999e-06 [receive_attached]: 1.56002e-06 [after_resolve]: 2.275e-05 [a_after_grad]: 2.741e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.45999e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 2.925e-05 [cse]: 4.944e-05 [a_3]: 0.0001176 [py_interpret_to_execute_after_opt_a]: 6.20002e-06 [slice_cell_reuse_recomputed_activation]: 2.17001e-06 [rewriter_after_opt_a]: 4.668e-05 [convert_after_rewriter]: 1.10001e-06 [order_py_execute_after_rewriter]: 1.19e-06 [mutable_eliminate]: 0.00070709 [opt_b]: 0.00057164, [1] [Cycle 1]: 0.00056475, [7] [b_1]: 0.00041353 [b_2]: 2.01e-05 [updatestate_depend_eliminate]: 1.193e-05 [updatestate_assign_eliminate]: 1.039e-05 [updatestate_loads_eliminate]: 1.182e-05 [renormalize]: 3.59985e-07 [cse]: 5.927e-05 [optimize_parallel_all_gather_comm]: 2.937e-05 [overlap_param_gather]: 2.49999e-06 [cconv]: 2.116e-05 [loop_unroll]: 0.0005306 [opt_after_cconv]: 0.00024648, [1] [Cycle 1]: 0.00023913, [7] [c_1]: 0.00011204 [parameter_eliminate]: 2.44999e-06 [updatestate_depend_eliminate]: 1.359e-05 [updatestate_assign_eliminate]: 1.048e-05 [updatestate_loads_eliminate]: 1.098e-05 [cse]: 5.339e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 5.856e-05 [tuple_transform]: 0.00016845, [1] [Cycle 1]: 0.00016326, [4] [d_1]: 0.00012258 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 1.85e-05 [partial_unused_args_eliminate]: 1.82001e-06 [add_recomputation]: 9.796e-05 [cse_after_recomputation]: 5.691e-05, [1] [Cycle 1]: 5.177e-05, [1] [cse]: 4.537e-05 [environ_conv]: 1.206e-05 [swap_dp_allreduce_reducescatter]: 1.355e-05 [bias_add_comm_swap]: 3.43e-06 [label_micro_interleaved_index]: 5.54e-06 [label_fine_grained_interleaved_index]: 2.43002e-06 [merge_cast_opt]: 1.76998e-06 [slice_recompute_activation]: 2.03002e-06 [micro_interleaved_order_control]: 2.59999e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 1.09e-06 [remove_cast_before_assign_add]: 1.42e-06 [full_micro_interleaved_order_control]: 2.50997e-06 [reorder_send_recv_between_fp_bp]: 2.78998e-06 [comm_op_add_attrs]: 1.11002e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.11997e-06 [interleave_parallel_branches]: 1.24e-06 [overlap_opt_shard_in_pipeline]: 1.69e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72001e-06 [control_data_broadcast_order]: 4.584e-05 [grouped_pairwise_exchange_alltoall]: 1.63002e-06 [offloading_packed_experts]: 1.245e-05 [overlap_recompute_and_grad_model_parallel]: 1.287e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.44e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.28002e-06 [overlap_grad_ring_attention]: 1.146e-05 [overlap_grad_flash_sp]: 5.192e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.59999e-06 [split_layernorm_comm]: 2.04e-06 [handle_group_info]: 1.07e-06 [symbol_engine_optimizer]: 0.00016237, [1] [Cycle 1]: 0.00015695, [6] [build]: 4.97999e-06 [elim_shapecalc]: 2.528e-05 [elim_not_effective]: 3.733e-05 [opt_reshape]: 2.241e-05 [fold_const_symbol]: 3.464e-05 [renormalize]: 1.69995e-07 [detach_backward]: 2.29999e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 0.0001309 [get_jit_bprop_graph]: 1.59998e-06 [rewriter_after_jit_bprop_graph]: 4.36002e-06 [opt_after_jit_grad]: 0.00054377 [validate]: 8.864e-05 [backend_pass]: 1.03001e-06 [task_emit]: 0.0133894 [execute]: 7.01999e-06 Sums bootstrap : 0.000730s : 0.29% type_inference : 0.178286s : 70.05% event_method : 0.000373s : 0.15% auto_monad : 0.000397s : 0.16% graph_reusing : 0.000010s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000060s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000072s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000471s : 0.19% optimize.opt_a.expand_dump_flag : 0.000010s : 0.00% optimize.opt_a.switch_simplify : 0.000351s : 0.14% optimize.opt_a.loop_unroll : 0.000269s : 0.11% optimize.opt_a.a_1 : 0.007933s : 3.12% optimize.opt_a.with_stream_mark : 0.000102s : 0.04% optimize.opt_a.recompute_prepare : 0.000123s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000193s : 0.08% optimize.opt_a.updatestate_assign_eliminate : 0.000065s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000063s : 0.02% optimize.opt_a.parameter_eliminate : 0.000010s : 0.00% optimize.opt_a.a_2 : 0.001923s : 0.76% optimize.opt_a.accelerated_algorithm : 0.000132s : 0.05% optimize.opt_a.shard : 0.000007s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000023s : 0.01% optimize.opt_a.shard_inline : 0.000091s : 0.04% optimize.opt_a.merge_send_recv : 0.000088s : 0.03% optimize.opt_a.auto_parallel : 0.000069s : 0.03% optimize.opt_a.parallel : 0.000047s : 0.02% optimize.opt_a.flash_sp : 0.000023s : 0.01% optimize.opt_a.merge_comm : 0.000054s : 0.02% optimize.opt_a.allreduce_fusion : 0.000050s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000093s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000097s : 0.04% optimize.opt_a.virtual_dataset : 0.000089s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000098s : 0.04% optimize.opt_a.virtual_output : 0.000091s : 0.04% optimize.opt_a.merge_forward : 0.000053s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% optimize.opt_a.offload_activation : 0.000088s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000164s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000006s : 0.00% optimize.opt_a.before_grad : 0.000147s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000060s : 0.02% optimize.opt_a.meta_fg_expand : 0.009148s : 3.59% optimize.opt_a.flash_sp_send_recv_attached : 0.000013s : 0.01% optimize.opt_a.receive_attached : 0.000010s : 0.00% optimize.opt_a.after_resolve : 0.000286s : 0.11% optimize.opt_a.a_after_grad : 0.000346s : 0.14% optimize.opt_a.renormalize : 0.031279s : 12.29% optimize.opt_a.add_forward_monad_depend : 0.000035s : 0.01% optimize.opt_a.auto_monad_grad : 0.000021s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000304s : 0.12% optimize.opt_a.cse : 0.000739s : 0.29% optimize.opt_a.a_3 : 0.002341s : 0.92% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000047s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000707s : 0.28% optimize.opt_b.b_1 : 0.000414s : 0.16% optimize.opt_b.b_2 : 0.000020s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000010s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000012s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000059s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000021s : 0.01% optimize.loop_unroll : 0.000531s : 0.21% optimize.opt_after_cconv.c_1 : 0.000112s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000010s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_after_cconv.cse : 0.000053s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000059s : 0.02% optimize.tuple_transform.d_1 : 0.000123s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000019s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000098s : 0.04% optimize.cse_after_recomputation.cse : 0.000045s : 0.02% optimize.environ_conv : 0.000012s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000014s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000046s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000012s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000011s : 0.00% optimize.overlap_grad_flash_sp : 0.000052s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000005s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000025s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000037s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000022s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000035s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000131s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000544s : 0.21% validate : 0.000089s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.013389s : 5.26% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.003406 564 4.74% : 0.000161s : 9: substitution.arithmetic_simplify 0.61% : 0.000021s : 8: substitution.depend_value_elim 0.17% : 0.000006s : 10: substitution.elim_not_effective 0.30% : 0.000010s : 13: substitution.float_depend_g_call 0.09% : 0.000003s : 2: substitution.float_tuple_getitem_switch 0.15% : 0.000005s : 10: substitution.fold_const_symbol 39.45% : 0.001344s : 8: substitution.getattr_setattr_resolve 0.35% : 0.000012s : 15: substitution.graph_param_transform 0.09% : 0.000003s : 2: substitution.incorporate_call 0.08% : 0.000003s : 2: substitution.incorporate_call_switch 35.62% : 0.001213s : 26: substitution.inline 1.03% : 0.000035s : 5: substitution.inline_without_move 0.75% : 0.000025s : 51: substitution.j_node_and_user_rematch 0.75% : 0.000026s : 4: substitution.less_batch_normalization 0.40% : 0.000014s : 20: substitution.load_eliminater 0.49% : 0.000017s : 11: substitution.minmaximum_grad 0.38% : 0.000013s : 13: substitution.partial_eliminate 1.13% : 0.000038s : 51: substitution.remove_not_recompute_node 2.35% : 0.000080s : 35: substitution.replace_applicator 0.88% : 0.000030s : 53: substitution.replace_old_param 0.17% : 0.000006s : 2: substitution.set_cell_output_no_recompute 0.33% : 0.000011s : 3: substitution.switch_simplify 1.01% : 0.000034s : 11: substitution.tuple_list_convert_item_index_to_positive 0.47% : 0.000016s : 11: substitution.tuple_list_get_item_const_eliminator 0.66% : 0.000022s : 11: substitution.tuple_list_get_item_depend_reorder 1.92% : 0.000065s : 24: substitution.tuple_list_get_item_eliminator 0.65% : 0.000022s : 11: substitution.tuple_list_get_set_item_eliminator 1.98% : 0.000067s : 62: substitution.updatestate_pure_node_eliminater 3.01% : 0.000102s : 81: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.178114 2 95.91% : 0.170831s : 1: type_inference.infer 4.09% : 0.007283s : 1: type_inference.specialize ------[replace.] 0.000865 49 10.08% : 0.000087s : 6: replace.getattr_setattr_resolve 45.53% : 0.000394s : 26: replace.inline 5.01% : 0.000043s : 2: replace.replace_applicator 5.11% : 0.000044s : 3: replace.switch_simplify 27.72% : 0.000240s : 11: replace.tuple_list_get_item_eliminator 6.55% : 0.000057s : 1: replace.updatestate_useless_node_eliminater ------[match.] 0.002506 49 50.21% : 0.001258s : 6: match.getattr_setattr_resolve 47.79% : 0.001197s : 26: match.inline 0.49% : 0.000012s : 2: match.replace_applicator 0.38% : 0.000010s : 3: match.switch_simplify 0.85% : 0.000021s : 11: match.tuple_list_get_item_eliminator 0.29% : 0.000007s : 1: match.updatestate_useless_node_eliminater ------[predicate.] 0.001968 14794 1.03% : 0.000020s : 161: predicate.accumulaten_eliminater 0.22% : 0.000004s : 17: predicate.ad_related_special_op_eliminate 0.69% : 0.000014s : 107: predicate.addn_check_dump 1.04% : 0.000021s : 161: predicate.addn_zero_filter 0.95% : 0.000019s : 161: predicate.adjust_all_reduce_mul_add 2.23% : 0.000044s : 263: predicate.arithmetic_simplify 1.07% : 0.000021s : 161: predicate.cast_eliminate 2.07% : 0.000041s : 317: predicate.check_bprop_eliminate 0.68% : 0.000013s : 107: predicate.compare_switch_simplify 0.06% : 0.000001s : 15: predicate.const_output_eliminate 0.69% : 0.000014s : 102: predicate.depend_value_elim 1.09% : 0.000021s : 161: predicate.dict_get_item_const_eliminator 1.19% : 0.000023s : 161: predicate.dict_get_item_eliminator 0.99% : 0.000020s : 161: predicate.dict_set_item_eliminator 0.27% : 0.000005s : 32: predicate.dumpgradient_eliminate 0.06% : 0.000001s : 15: predicate.elim_not_effective 0.11% : 0.000002s : 15: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000022s : 176: predicate.environ_add_const_eliminate 1.08% : 0.000021s : 176: predicate.environ_get_add_eliminate 1.06% : 0.000021s : 176: predicate.environ_get_depend_swap 1.77% : 0.000035s : 278: predicate.environ_get_eliminate 1.09% : 0.000021s : 176: predicate.environ_get_set_eliminate 1.29% : 0.000025s : 198: predicate.exchange_switch_depend_value 1.73% : 0.000034s : 198: predicate.float_depend_g_call 0.67% : 0.000013s : 107: predicate.float_environ_get_switch 0.76% : 0.000015s : 122: predicate.float_tuple_getitem_switch 0.06% : 0.000001s : 15: predicate.fold_const_symbol 0.59% : 0.000012s : 82: predicate.get_grad_eliminate 0.48% : 0.000010s : 40: predicate.getattr_setattr_resolve 0.06% : 0.000001s : 15: predicate.graph_param_transform 0.66% : 0.000013s : 102: predicate.incorporate_call 0.62% : 0.000012s : 102: predicate.incorporate_call_switch 4.77% : 0.000094s : 578: predicate.inline 1.55% : 0.000030s : 186: predicate.inline_without_move 0.30% : 0.000006s : 82: predicate.j_node_and_user_rematch 0.68% : 0.000013s : 85: predicate.less_batch_normalization 1.35% : 0.000027s : 202: predicate.list_to_tuple_eliminator_ 2.28% : 0.000045s : 363: predicate.load_eliminater 0.20% : 0.000004s : 15: predicate.loop_unroll_after_grad 1.99% : 0.000039s : 273: predicate.loop_unroll_before_grad 1.24% : 0.000024s : 191: predicate.make_slice_get_slice_eliminator 0.71% : 0.000014s : 107: predicate.merge_addn 1.99% : 0.000039s : 309: predicate.micro_step_allgather_replace 1.97% : 0.000039s : 309: predicate.mini_step_allgather_replace 1.01% : 0.000020s : 161: predicate.minmaximum_grad 0.17% : 0.000003s : 15: predicate.mutable_eliminate 0.12% : 0.000002s : 15: predicate.opt_reshape 0.11% : 0.000002s : 15: predicate.parallel_virtual_node 1.65% : 0.000032s : 198: predicate.partial_defer_inline 1.37% : 0.000027s : 187: predicate.partial_eliminate 1.02% : 0.000020s : 161: predicate.print_const_string_wrapper 0.66% : 0.000013s : 102: predicate.reduce_all_const_elim 1.32% : 0.000026s : 161: predicate.reduce_eliminate 2.27% : 0.000045s : 363: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000007s : 82: predicate.remove_not_recompute_node 2.28% : 0.000045s : 485: predicate.replace_applicator 0.69% : 0.000014s : 186: predicate.replace_old_param 0.07% : 0.000001s : 15: predicate.reset_defer_inline 1.55% : 0.000030s : 161: predicate.reshape_eliminate 2.03% : 0.000040s : 309: predicate.row_tensor_add_zeros_like 0.12% : 0.000002s : 15: predicate.row_tensor_eliminate 2.17% : 0.000043s : 317: predicate.same_eliminate 0.40% : 0.000008s : 97: predicate.set_cell_output_no_recompute 0.59% : 0.000012s : 82: predicate.shard_identity_eliminate 0.24% : 0.000005s : 32: predicate.special_op_eliminate 0.76% : 0.000015s : 107: predicate.specialize_transform 2.11% : 0.000041s : 309: predicate.split_environ_get_set_with_tuple_value 1.44% : 0.000028s : 186: predicate.stack_unstack_eliminate 0.11% : 0.000002s : 15: predicate.switch_call_monad_eliminater 1.40% : 0.000028s : 198: predicate.switch_defer_inline 3.37% : 0.000066s : 515: predicate.switch_layer_defer_inline 4.15% : 0.000082s : 599: predicate.switch_simplify 1.01% : 0.000020s : 161: predicate.tile_eliminate 1.02% : 0.000020s : 161: predicate.transpose_eliminate 1.31% : 0.000026s : 191: predicate.tuple_list_convert_item_index_to_positive 1.38% : 0.000027s : 191: predicate.tuple_list_get_item_const_eliminator 1.28% : 0.000025s : 191: predicate.tuple_list_get_item_depend_reorder 2.33% : 0.000046s : 304: predicate.tuple_list_get_item_eliminator 1.39% : 0.000027s : 191: predicate.tuple_list_get_set_item_eliminator 2.02% : 0.000040s : 293: predicate.tuple_list_set_item_eliminator 1.29% : 0.000025s : 202: predicate.tuple_to_list_eliminator_ 2.32% : 0.000046s : 363: predicate.updatestate_pure_node_eliminater 3.09% : 0.000061s : 467: predicate.updatestate_useless_node_eliminater 0.11% : 0.000002s : 15: predicate.value_based_eliminate 0.59% : 0.000012s : 82: predicate.virtual_dataset_eliminate 0.57% : 0.000011s : 82: predicate.virtual_output_eliminate 0.11% : 0.000002s : 17: predicate.virtual_view_grad_eliminate 0.12% : 0.000002s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.009368 98 68.76% : 0.006442s : 58: func_graph_cloner_run.FuncGraphClonerGraph 2.55% : 0.000239s : 4: func_graph_cloner_run.FuncGraphClonerNode 28.69% : 0.002688s : 36: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.373034 307 0.00% : 0.000004s : 1: ForceFp32Comm 0.97% : 0.003601s : 1: add_attr 0.96% : 0.003591s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000102s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.11% : 0.000409s : 1: auto_monad 0.04% : 0.000138s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.20% : 0.000753s : 1: bootstrap 0.01% : 0.000025s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000049s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000060s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000015s : 1: environ_conv 0.10% : 0.000387s : 1: event_method 0.00% : 0.000012s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000016s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000006s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.14% : 0.000539s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.19% : 0.000716s : 1: mutable_eliminate 0.00% : 0.000016s : 1: offloading_packed_experts 0.01% : 0.000028s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000028s : 1: opt.transform.mutable_eliminate 3.82% : 0.014248s : 181: opt.transform.opt_a 0.03% : 0.000110s : 1: opt.transform.opt_after_cconv 0.02% : 0.000071s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000403s : 28: opt.transform.opt_b 0.41% : 0.001519s : 4: opt.transform.opt_resolve 0.04% : 0.000139s : 2: opt.transform.opt_trans_graph 0.03% : 0.000115s : 4: opt.transform.symbol_engine_opt 15.61% : 0.058224s : 1: opt_a 0.07% : 0.000250s : 1: opt_after_cconv 0.15% : 0.000553s : 1: opt_after_jit_grad 0.15% : 0.000575s : 1: opt_b 16.58% : 0.061865s : 1: optimize 0.01% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000056s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000015s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000007s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000003s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000007s : 1: pipeline_split 0.02% : 0.000077s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.02% : 0.000063s : 1: remove_dup_value 6.32% : 0.023578s : 3: renormalize.infer 2.06% : 0.007674s : 3: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000050s : 1: rewriter_after_opt_a 0.13% : 0.000479s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000017s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000165s : 1: symbol_engine_optimizer 3.59% : 0.013406s : 1: task_emit 0.05% : 0.000171s : 1: tuple_transform 47.80% : 0.178308s : 1: type_inference 0.04% : 0.000137s : 1: validate random_generator: generate a numpy.ndarray(shape=(1, 2, 4, 4), dtype=, seed=1967515154) by numpy.random.randn, will be used as query random_generator: generate a numpy.ndarray(shape=(1, 2, 4, 4), dtype=, seed=1967515154) by numpy.random.randn, will be used as key random_generator: generate a numpy.ndarray(shape=(1, 2, 4, 4), dtype=, seed=1967515154) by numpy.random.randn, will be used as value random_generator: generate a numpy.ndarray(shape=(1, 2, 4, 4), dtype=, seed=1967515154) by numpy.random.randn, will be used as pse random_generator: generate a numpy.ndarray(shape=(1, 2, 4, 4), dtype=, seed=1967515154) by numpy.random.randn, will be used as query random_generator: generate a numpy.ndarray(shape=(1, 2, 4, 4), dtype=, seed=1967515154) by numpy.random.randn, will be used as key random_generator: generate a numpy.ndarray(shape=(1, 2, 4, 4), dtype=, seed=1967515154) by numpy.random.randn, will be used as value random_generator: generate a numpy.ndarray(shape=(1, 2, 4, 4), dtype=, seed=1967515154) by numpy.random.randn, will be used as pse random_generator: generate a numpy.ndarray(shape=(1, 2, 4, 4), dtype=, seed=1967515154) by numpy.random.randn, will be used as query random_generator: generate a numpy.ndarray(shape=(1, 2, 4, 4), dtype=, seed=1967515154) by numpy.random.randn, will be used as key random_generator: generate a numpy.ndarray(shape=(1, 2, 4, 4), dtype=, seed=1967515154) by numpy.random.randn, will be used as value random_generator: generate a numpy.ndarray(shape=(1, 2, 4, 4), dtype=, seed=1967515154) by numpy.random.randn, will be used as pse loop 0 the output is: [[[ 0.1]] [[-0.2]]] loop 1 the output is: [[[-0.1]] [[-0.2]]] loop 2 the output is: [[[-0.1]] [[-0.2]]] loop 3 the output is: [[[-0.1]] [[-0.2]]] loop 4 the output is: [[[-0.1]] [[-0.2]]] loop 5 the output is: [[[-0.1]] [[-0.2]]] loop 6 the output is: [[[-0.1]] [[-0.2]]] loop 7 the output is: [[[-0.1]] [[-0.2]]] loop 8 the output is: [[[-0.1]] [[-0.2]]] loop 9 the output is: [[[-0.1]] [[-0.2]]] Start Backward Testing, error_mode 'cycle', flip_mode 'bitflip', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[1.]] [[1.]]] loop 1 the output is: [[[-1.]] [[ 1.]]] loop 2 the output is: [[[-1.]] [[ 1.]]] loop 3 the output is: [[[-1.]] [[ 1.]]] loop 4 the output is: [[[-1.]] [[ 1.]]] loop 5 the output is: [[[-1.]] [[ 1.]]] loop 6 the output is: [[[-1.]] [[ 1.]]] loop 7 the output is: [[[-1.]] [[ 1.]]] loop 8 the output is: [[[-1.]] [[ 1.]]] loop 9 the output is: [[[-1.]] [[ 1.]]] Start Forward Testing, error_mode 'cycle', flip_mode 'bitflip_designed', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[ 0.1]] [[-0.2]]] loop 1 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 2 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 3 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 4 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 5 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 6 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 7 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 8 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 9 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] Start Backward Testing, error_mode 'cycle', flip_mode 'bitflip_designed', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[1.]] [[1.]]] loop 1 the output is: [[[inf]] [[ 1.]]] loop 2 the output is: [[[inf]] [[ 1.]]] loop 3 the output is: [[[inf]] [[ 1.]]] loop 4 the output is: [[[inf]] [[ 1.]]] loop 5 the output is: [[[inf]] [[ 1.]]] loop 6 the output is: [[[inf]] [[ 1.]]] loop 7 the output is: [[[inf]] [[ 1.]]] loop 8 the output is: [[[inf]] [[ 1.]]] loop 9 the output is: [[[inf]] [[ 1.]]] Start Forward Testing, error_mode 'cycle', flip_mode 'multiply', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[ 0.1]] [[-0.2]]] loop 1 the output is: [[[ 0.2]] [[-0.2]]] loop 2 the output is: [[[ 0.2]] [[-0.2]]] loop 3 the output is: [[[ 0.2]] [[-0.2]]] loop 4 the output is: [[[ 0.2]] [[-0.2]]] loop 5 the output is: [[[ 0.2]] [[-0.2]]] loop 6 the output is: [[[ 0.2]] [[-0.2]]] loop 7 the output is: [[[ 0.2]] [[-0.2]]] loop 8 the output is: [[[ 0.2]] [[-0.2]]] loop 9 the output is: [[[ 0.2]] [[-0.2]]] Start Backward Testing, error_mode 'cycle', flip_mode 'multiply', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[1.]] [[1.]]] loop 1 the output is: [[[2.]] [[1.]]] loop 2 the output is: [[[2.]] [[1.]]] loop 3 the output is: [[[2.]] [[1.]]] loop 4 the output is: [[[2.]] [[1.]]] loop 5 the output is: [[[2.]] [[1.]]] loop 6 the output is: [[[2.]] [[1.]]] loop 7 the output is: [[[2.]] [[1.]]] loop 8 the output is: [[[2.]] [[1.]]] loop 9 the output is: [[[2.]] [[1.]]] Start Forward Testing, error_mode 'cycle', flip_mode 'multiply_max', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[ 0.1]] [[-0.2]]] loop 1 the output is: [[[ 0.1]] [[-0.4]]] loop 2 the output is: [[[ 0.1]] [[-0.4]]] loop 3 the output is: [[[ 0.1]] [[-0.4]]] loop 4 the output is: [[[ 0.1]] [[-0.4]]] loop 5 the output is: [[[ 0.1]] [[-0.4]]] loop 6 the output is: [[[ 0.1]] [[-0.4]]] loop 7 the output is: [[[ 0.1]] [[-0.4]]] loop 8 the output is: [[[ 0.1]] [[-0.4]]] loop 9 the output is: [[[ 0.1]] [[-0.4]]] Start Backward Testing, error_mode 'cycle', flip_mode 'multiply_max', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[1.]] [[1.]]] loop 1 the output is: [[[4.]] [[2.]]] loop 2 the output is: [[[4.]] [[2.]]] loop 3 the output is: [[[4.]] [[2.]]] loop 4 the output is: [[[4.]] [[2.]]] loop 5 the output is: [[[4.]] [[2.]]] loop 6 the output is: [[[4.]] [[2.]]] loop 7 the output is: [[[4.]] [[2.]]] loop 8 the output is: [[[4.]] [[2.]]] loop 9 the output is: [[[4.]] [[2.]]] Start Forward Testing, error_mode 'specific', flip_mode 'bitflip', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[ 0.1]] [[-0.2]]] loop 1 the output is: [[[ 0.1]] [[-0.2]]] loop 2 the output is: [[[ 0.1]] [[-0.2]]] loop 3 the output is: [[[ 0.1]] [[-0.2]]] loop 4 the output is: [[[-0.1]] [[-0.2]]] loop 5 the output is: [[[-0.1]] [[-0.2]]] loop 6 the output is: [[[ 0.1]] [[-0.2]]] loop 7 the output is: [[[ 0.1]] [[-0.2]]] loop 8 the output is: [[[-0.1]] [[-0.2]]] loop 9 the output is: [[[ 0.1]] [[-0.2]]] Start Backward Testing, error_mode 'specific', flip_mode 'bitflip', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[1.]] [[1.]]] loop 1 the output is: [[[1.]] [[1.]]] loop 2 the output is: [[[1.]] [[1.]]] loop 3 the output is: [[[1.]] [[1.]]] loop 4 the output is: [[[-1.]] [[ 1.]]] loop 5 the output is: [[[-1.]] [[ 1.]]] loop 6 the output is: [[[1.]] [[1.]]] loop 7 the output is: [[[1.]] [[1.]]] loop 8 the output is: [[[-1.]] [[ 1.]]] loop 9 the output is: [[[1.]] [[1.]]] Start Forward Testing, error_mode 'cycle', flip_mode 'bitflip', flip_probability '1.0', ele_pos '0' loop 0 the output is: [[[ 0.1]] [[-0.2]]] loop 1 the output is: [[[-0.1]] [[ 0.2]]] loop 2 the output is: [[[-0.1]] [[ 0.2]]] loop 3 the output is: [[[-0.1]] [[ 0.2]]] loop 4 the output is: [[[-0.1]] [[ 0.2]]] loop 5 the output is: [[[-0.1]] [[ 0.2]]] loop 6 the output is: [[[-0.1]] [[ 0.2]]] loop 7 the output is: [[[-0.1]] [[ 0.2]]] loop 8 the output is: [[[-0.1]] [[ 0.2]]] loop 9 the output is: [[[-0.1]] [[ 0.2]]] Start Backward Testing, error_mode 'cycle', flip_mode 'bitflip', flip_probability '1.0', ele_pos '0' loop 0 the output is: [[[1.]] [[1.]]] loop 1 the output is: [[[-1.]] [[-1.]]] loop 2 the output is: [[[-1.]] [[-1.]]] loop 3 the output is: [[[-1.]] [[-1.]]] loop 4 the output is: [[[-1.]] [[-1.]]] loop 5 the output is: [[[-1.]] [[-1.]]] loop 6 the output is: [[[-1.]] [[-1.]]] loop 7 the output is: [[[-1.]] [[-1.]]] loop 8 the output is: [[[-1.]] [[-1.]]] loop 9 the output is: [[[-1.]] [[-1.]]] loop 0 the output is: [[[ 0.1]] [[-0.2]]] loop 1 the output is: [[[-0.1]] [[-0.2]]] loop 2 the output is: [[[-0.1]] [[-0.2]]] loop 3 the output is: [[[-0.1]] [[-0.2]]] loop 4 the output is: [[[-0.1]] [[-0.2]]] loop 5 the output is: [[[-0.1]] [[-0.2]]] loop 6 the output is: [[[-0.1]] [[-0.2]]] loop 7 the output is: [[[-0.1]] [[-0.2]]] loop 8 the output is: [[[-0.1]] [[-0.2]]] loop 9 the output is: [[[-0.1]] [[-0.2]]] Start Backward Testing, error_mode 'cycle', flip_mode 'bitflip', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[1.]] [[1.]]] loop 1 the output is: [[[-1.]] [[ 1.]]] loop 2 the output is: [[[-1.]] [[ 1.]]] loop 3 the output is: [[[-1.]] [[ 1.]]] loop 4 the output is: [[[-1.]] [[ 1.]]] loop 5 the output is: [[[-1.]] [[ 1.]]] loop 6 the output is: [[[-1.]] [[ 1.]]] loop 7 the output is: [[[-1.]] [[ 1.]]] loop 8 the output is: [[[-1.]] [[ 1.]]] loop 9 the output is: [[[-1.]] [[ 1.]]] Start Forward Testing, error_mode 'cycle', flip_mode 'bitflip_designed', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[ 0.1]] [[-0.2]]] loop 1 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 2 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 3 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 4 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 5 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 6 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 7 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 8 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] loop 9 the output is: [[[ 3.4028237e+37]] [[-2.0000000e-01]]] Start Backward Testing, error_mode 'cycle', flip_mode 'bitflip_designed', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[1.]] [[1.]]] loop 1 the output is: [[[inf]] [[ 1.]]] loop 2 the output is: [[[inf]] [[ 1.]]] loop 3 the output is: [[[inf]] [[ 1.]]] loop 4 the output is: [[[inf]] [[ 1.]]] loop 5 the output is: [[[inf]] [[ 1.]]] loop 6 the output is: [[[inf]] [[ 1.]]] loop 7 the output is: [[[inf]] [[ 1.]]] loop 8 the output is: [[[inf]] [[ 1.]]] loop 9 the output is: [[[inf]] [[ 1.]]] Start Forward Testing, error_mode 'cycle', flip_mode 'multiply', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[ 0.1]] [[-0.2]]] loop 1 the output is: [[[ 0.2]] [[-0.2]]] loop 2 the output is: [[[ 0.2]] [[-0.2]]] loop 3 the output is: [[[ 0.2]] [[-0.2]]] loop 4 the output is: [[[ 0.2]] [[-0.2]]] loop 5 the output is: [[[ 0.2]] [[-0.2]]] loop 6 the output is: [[[ 0.2]] [[-0.2]]] loop 7 the output is: [[[ 0.2]] [[-0.2]]] loop 8 the output is: [[[ 0.2]] [[-0.2]]] loop 9 the output is: [[[ 0.2]] [[-0.2]]] Start Backward Testing, error_mode 'cycle', flip_mode 'multiply', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[1.]] [[1.]]] loop 1 the output is: [[[2.]] [[1.]]] loop 2 the output is: [[[2.]] [[1.]]] loop 3 the output is: [[[2.]] [[1.]]] loop 4 the output is: [[[2.]] [[1.]]] loop 5 the output is: [[[2.]] [[1.]]] loop 6 the output is: [[[2.]] [[1.]]] loop 7 the output is: [[[2.]] [[1.]]] loop 8 the output is: [[[2.]] [[1.]]] loop 9 the output is: [[[2.]] [[1.]]] Start Forward Testing, error_mode 'cycle', flip_mode 'multiply_max', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[ 0.1]] [[-0.2]]] loop 1 the output is: [[[ 0.1]] [[-0.4]]] loop 2 the output is: [[[ 0.1]] [[-0.4]]] loop 3 the output is: [[[ 0.1]] [[-0.4]]] loop 4 the output is: [[[ 0.1]] [[-0.4]]] loop 5 the output is: [[[ 0.1]] [[-0.4]]] loop 6 the output is: [[[ 0.1]] [[-0.4]]] loop 7 the output is: [[[ 0.1]] [[-0.4]]] loop 8 the output is: [[[ 0.1]] [[-0.4]]] loop 9 the output is: [[[ 0.1]] [[-0.4]]] Start Backward Testing, error_mode 'cycle', flip_mode 'multiply_max', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[1.]] [[1.]]] loop 1 the output is: [[[4.]] [[2.]]] loop 2 the output is: [[[4.]] [[2.]]] loop 3 the output is: [[[4.]] [[2.]]] loop 4 the output is: [[[4.]] [[2.]]] loop 5 the output is: [[[4.]] [[2.]]] loop 6 the output is: [[[4.]] [[2.]]] loop 7 the output is: [[[4.]] [[2.]]] loop 8 the output is: [[[4.]] [[2.]]] loop 9 the output is: [[[4.]] [[2.]]] Start Forward Testing, error_mode 'specific', flip_mode 'bitflip', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[ 0.1]] [[-0.2]]] loop 1 the output is: [[[ 0.1]] [[-0.2]]] loop 2 the output is: [[[ 0.1]] [[-0.2]]] loop 3 the output is: [[[ 0.1]] [[-0.2]]] loop 4 the output is: [[[-0.1]] [[-0.2]]] loop 5 the output is: [[[-0.1]] [[-0.2]]] loop 6 the output is: [[[ 0.1]] [[-0.2]]] loop 7 the output is: [[[ 0.1]] [[-0.2]]] loop 8 the output is: [[[-0.1]] [[-0.2]]] loop 9 the output is: [[[ 0.1]] [[-0.2]]] Start Backward Testing, error_mode 'specific', flip_mode 'bitflip', flip_probability '0.0', ele_pos '0' loop 0 the output is: [[[1.]] [[1.]]] loop 1 the output is: [[[1.]] [[1.]]] loop 2 the output is: [[[1.]] [[1.]]] loop 3 the output is: [[[1.]] [[1.]]] loop 4 the output is: [[[-1.]] [[ 1.]]] loop 5 the output is: [[[-1.]] [[ 1.]]] loop 6 the output is: [[[1.]] [[1.]]] loop 7 the output is: [[[1.]] [[1.]]] loop 8 the output is: [[[-1.]] [[ 1.]]] loop 9 the output is: [[[1.]] [[1.]]] Start Forward Testing, error_mode 'cycle', flip_mode 'bitflip', flip_probability '1.0', ele_pos '0' loop 0 the output is: [[[ 0.1]] [[-0.2]]] loop 1 the output is: [[[-0.1]] [[ 0.2]]] loop 2 the output is: [[[-0.1]] [[ 0.2]]] loop 3 the output is: [[[-0.1]] [[ 0.2]]] loop 4 the output is: [[[-0.1]] [[ 0.2]]] loop 5 the output is: [[[-0.1]] [[ 0.2]]] loop 6 the output is: [[[-0.1]] [[ 0.2]]] loop 7 the output is: [[[-0.1]] [[ 0.2]]] loop 8 the output is: [[[-0.1]] [[ 0.2]]] loop 9 the output is: [[[-0.1]] [[ 0.2]]] Start Backward Testing, error_mode 'cycle', flip_mode 'bitflip', flip_probability '1.0', ele_pos '0' loop 0 the output is: [[[1.]] [[1.]]] loop 1 the output is: [[[-1.]] [[-1.]]] loop 2 the output is: [[[-1.]] [[-1.]]] loop 3 the output is: [[[-1.]] [[-1.]]] loop 4 the output is: [[[-1.]] [[-1.]]] loop 5 the output is: [[[-1.]] [[-1.]]] loop 6 the output is: [[[-1.]] [[-1.]]] loop 7 the output is: [[[-1.]] [[-1.]]] loop 8 the output is: [[[-1.]] [[-1.]]] loop 9 the output is: [[[-1.]] [[-1.]]] group_cases_8 have all been run, results of sub cases are below: case: (1,) {} pass. case: ('pynative',) {} pass. case: (0,) {} pass. case: ('KBK',) {} pass. case: (1,) {} pass. case: ('GRAPH',) {} pass. case: ('PYBOOST',) {} pass. case: ('KBK',) {} pass. ops group_cases_9 with 8 cases start to running, all cases are below: case: (, 0) case: (, 1) case: (, 0) case: (, 1) case: (, 'pynative') case: (, 'KBK') case: (, 'pynative') case: (, 'KBK') ops group_cases_9 total running memory: 96M, memory threshold: 51200M [LOG_WARNING] can not open file, file: /home/jenkins/ascend/log/debug/plog/plog-171921_20260129173659633.log, possible reason: Permission denied. TotalTime = 2.52735, [24] [bootstrap]: 0.00088889 [type_inference]: 0.0585466 [event_method]: 0.0001144 [auto_monad]: 0.0001524 [graph_reusing]: 5.85002e-06 [inline]: 2.61e-06 [add_attr]: 0.00680901, [1] [add_attr_with_inline]: 0.00679066, [1] [Cycle 1]: 0.00014908, [2] [tag_attr]: 4.615e-05 [meta_addattr_fg_expand]: 1.74e-05 [parallel-infer-symbol]: 3.68e-06 [pre_auto_parallel]: 6.472e-05 [insert-virtual-dataset]: 2.70002e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.19999e-06 [pipeline_split]: 1.96998e-06 [optimize]: 0.00632267, [53] [py_interpret_to_execute]: 4.62e-06 [rewriter_before_opt_a]: 0.00028262 [opt_a]: 0.00378261, [2] [Cycle 1]: 0.00306108, [45] [expand_dump_flag]: 3.35e-06 [switch_simplify]: 0.00013186 [loop_unroll]: 4.39e-05 [a_1]: 0.00084129 [with_stream_mark]: 1.186e-05 [recompute_prepare]: 8.75999e-06 [updatestate_depend_eliminate]: 1.457e-05 [updatestate_assign_eliminate]: 1.205e-05 [updatestate_loads_eliminate]: 3.01999e-06 [parameter_eliminate]: 1.04998e-06 [a_2]: 8.534e-05 [accelerated_algorithm]: 6.94999e-06 [shard]: 1.86e-06 [meta_shard_fg_expand]: 1.62999e-06 [shard_inline]: 6.72002e-06 [merge_send_recv]: 4.884e-05 [auto_parallel]: 9.75002e-06 [parallel]: 0.00010728 [flash_sp]: 3.893e-05 [merge_comm]: 5.15999e-06 [allreduce_fusion]: 1.343e-05 [matmul_add_comm_reduction]: 1.875e-05 [allreduce_slice_to_reducescatter]: 1.002e-05 [virtual_shard_identity]: 9.62999e-06 [virtual_dataset]: 7.55e-06 [get_grad_eliminate_]: 6.54999e-06 [virtual_output]: 6.69001e-06 [merge_forward]: 3.86999e-06 [cell_reuse_recompute_pass]: 1.32999e-06 [offload_activation]: 1.924e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.271e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 1.079e-05 [set_forward_comm_id_for_comm_node_pass]: 1.373e-05 [meta_fg_expand]: 2.89999e-06 [flash_sp_send_recv_attached]: 2.48e-06 [receive_attached]: 2.111e-05 [after_resolve]: 1.2e-05 [a_after_grad]: 1.063e-05 [renormalize]: 0.00108505 [add_forward_monad_depend]: 4.92e-06 [auto_monad_grad]: 2.59001e-06 [auto_monad_eliminator]: 2.833e-05 [cse]: 5.821e-05 [a_3]: 4.924e-05 [Cycle 2]: 0.00070811, [45] [expand_dump_flag]: 2.58e-06 [switch_simplify]: 7.58999e-06 [loop_unroll]: 6.21e-06 [a_1]: 0.00012148 [with_stream_mark]: 1.214e-05 [recompute_prepare]: 6.17001e-06 [updatestate_depend_eliminate]: 3.57002e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 2.66e-06 [parameter_eliminate]: 1.13001e-06 [a_2]: 7.243e-05 [accelerated_algorithm]: 5.99e-06 [shard]: 1.67001e-06 [meta_shard_fg_expand]: 1.71e-06 [shard_inline]: 5.69999e-06 [merge_send_recv]: 6.74999e-06 [auto_parallel]: 7.06999e-06 [parallel]: 6.56e-06 [flash_sp]: 3.53999e-06 [merge_comm]: 3.21999e-06 [allreduce_fusion]: 3.20002e-06 [matmul_add_comm_reduction]: 7.15998e-06 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 6.81999e-06 [virtual_dataset]: 6.07001e-06 [get_grad_eliminate_]: 6.08998e-06 [virtual_output]: 5.63002e-06 [merge_forward]: 3.68999e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 7.61999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.212e-05 [merge_recompute_call_nodes]: 1.17e-06 [before_grad]: 8.64003e-06 [set_forward_comm_id_for_comm_node_pass]: 3.89002e-06 [meta_fg_expand]: 2.54001e-06 [flash_sp_send_recv_attached]: 1.78002e-06 [receive_attached]: 1.71002e-06 [after_resolve]: 1.019e-05 [a_after_grad]: 9.19e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.15999e-06 [auto_monad_grad]: 8.70001e-07 [auto_monad_eliminator]: 6.77002e-06 [cse]: 1.666e-05 [a_3]: 3.642e-05 [py_interpret_to_execute_after_opt_a]: 4.70001e-06 [slice_cell_reuse_recomputed_activation]: 1.91e-06 [rewriter_after_opt_a]: 3.371e-05 [convert_after_rewriter]: 1.30001e-06 [order_py_execute_after_rewriter]: 1.77999e-06 [mutable_eliminate]: 0.00067231 [opt_b]: 0.00020198, [1] [Cycle 1]: 0.00019526, [7] [b_1]: 0.00012362 [b_2]: 7.62002e-06 [updatestate_depend_eliminate]: 5.20001e-06 [updatestate_assign_eliminate]: 2.64001e-06 [updatestate_loads_eliminate]: 2.56e-06 [renormalize]: 5.00004e-07 [cse]: 1.937e-05 [optimize_parallel_all_gather_comm]: 3.095e-05 [overlap_param_gather]: 1.248e-05 [cconv]: 1.343e-05 [loop_unroll]: 0.00042849 [opt_after_cconv]: 9.635e-05, [1] [Cycle 1]: 9.048e-05, [7] [c_1]: 2.869e-05 [parameter_eliminate]: 1.84998e-06 [updatestate_depend_eliminate]: 4.53999e-06 [updatestate_assign_eliminate]: 2.62001e-06 [updatestate_loads_eliminate]: 2.36e-06 [cse]: 1.876e-05 [renormalize]: 3.4002e-07 [remove_dup_value]: 1.172e-05 [tuple_transform]: 6.852e-05, [1] [Cycle 1]: 6.477e-05, [4] [d_1]: 3.952e-05 [none_parameter_eliminate]: 8.19971e-07 [renormalize]: 1.60013e-07 [switch_simplify]: 6.91001e-06 [partial_unused_args_eliminate]: 9.29984e-07 [add_recomputation]: 4.854e-05 [cse_after_recomputation]: 2.316e-05, [1] [Cycle 1]: 1.875e-05, [1] [cse]: 1.32e-05 [environ_conv]: 2.249e-05 [swap_dp_allreduce_reducescatter]: 2.686e-05 [bias_add_comm_swap]: 1.225e-05 [label_micro_interleaved_index]: 1.508e-05 [label_fine_grained_interleaved_index]: 3.06001e-06 [merge_cast_opt]: 1.58002e-06 [slice_recompute_activation]: 2.39999e-06 [micro_interleaved_order_control]: 2.58e-06 [assign_add_opt]: 1.50999e-06 [ForceFp32Comm]: 9.89996e-07 [remove_cast_before_assign_add]: 1.102e-05 [full_micro_interleaved_order_control]: 1.186e-05 [reorder_send_recv_between_fp_bp]: 2.74999e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.025e-05 [overlap_opt_shard_in_pipeline]: 2.061e-05 [overlap_opt_shard_grad_in_pipeline]: 1.84e-06 [control_data_broadcast_order]: 1.426e-05 [grouped_pairwise_exchange_alltoall]: 1.51998e-06 [offloading_packed_experts]: 3.71001e-06 [overlap_recompute_and_grad_model_parallel]: 1.478e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.27e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.73e-06 [overlap_grad_ring_attention]: 2.33e-05 [overlap_grad_flash_sp]: 5.013e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 1.179e-05 [split_layernorm_comm]: 1.97001e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 7.502e-05, [1] [Cycle 1]: 7.064e-05, [6] [build]: 1.96e-06 [elim_shapecalc]: 9.64e-06 [elim_not_effective]: 1.273e-05 [opt_reshape]: 7.41999e-06 [fold_const_symbol]: 9.96e-06 [renormalize]: 2.10013e-07 [detach_backward]: 2.54001e-06 [pipeline_parallel_scheduler]: 1.45001e-06 [auto_monad_reorder]: 1.872e-05 [get_jit_bprop_graph]: 1.56998e-06 [rewriter_after_jit_bprop_graph]: 2.84999e-06 [opt_after_jit_grad]: 0.00045203 [validate]: 5.368e-05 [backend_pass]: 9.30013e-07 [task_emit]: 2.45356 [execute]: 5.76e-06 Sums bootstrap : 0.000889s : 0.04% type_inference : 0.058547s : 2.32% event_method : 0.000114s : 0.00% auto_monad : 0.000152s : 0.01% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000046s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000017s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000065s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000283s : 0.01% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000139s : 0.01% optimize.opt_a.loop_unroll : 0.000050s : 0.00% optimize.opt_a.a_1 : 0.000963s : 0.04% optimize.opt_a.with_stream_mark : 0.000024s : 0.00% optimize.opt_a.recompute_prepare : 0.000015s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000018s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000015s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000158s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000012s : 0.00% optimize.opt_a.merge_send_recv : 0.000056s : 0.00% optimize.opt_a.auto_parallel : 0.000017s : 0.00% optimize.opt_a.parallel : 0.000114s : 0.00% optimize.opt_a.flash_sp : 0.000042s : 0.00% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000017s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000011s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.00% optimize.opt_a.virtual_dataset : 0.000014s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000027s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000018s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000023s : 0.00% optimize.opt_a.after_resolve : 0.000022s : 0.00% optimize.opt_a.a_after_grad : 0.000020s : 0.00% optimize.opt_a.renormalize : 0.001085s : 0.04% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.00% optimize.opt_a.cse : 0.000075s : 0.00% optimize.opt_a.a_3 : 0.000086s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000034s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000672s : 0.03% optimize.opt_b.b_1 : 0.000124s : 0.00% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000013s : 0.00% optimize.loop_unroll : 0.000428s : 0.02% optimize.opt_after_cconv.c_1 : 0.000029s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000012s : 0.00% optimize.tuple_transform.d_1 : 0.000040s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000049s : 0.00% optimize.cse_after_recomputation.cse : 0.000013s : 0.00% optimize.environ_conv : 0.000022s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000027s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000015s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000011s : 0.00% optimize.full_micro_interleaved_order_control : 0.000012s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000010s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000021s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000015s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000023s : 0.00% optimize.overlap_grad_flash_sp : 0.000050s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000012s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000019s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000452s : 0.02% validate : 0.000054s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.453560s : 97.39% execute : 0.000006s : 0.00% Time group info: ------[substitution.] 0.000265 33 0.75% : 0.000002s : 2: substitution.elim_not_effective 0.48% : 0.000001s : 2: substitution.fold_const_symbol 1.49% : 0.000004s : 4: substitution.graph_param_transform 72.70% : 0.000193s : 8: substitution.inline 1.18% : 0.000003s : 4: substitution.j_node_and_user_rematch 5.65% : 0.000015s : 4: substitution.remove_not_recompute_node 1.60% : 0.000004s : 4: substitution.replace_old_param 6.21% : 0.000016s : 1: substitution.switch_simplify 9.94% : 0.000026s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.058460 2 96.80% : 0.056589s : 1: type_inference.infer 3.20% : 0.001870s : 1: type_inference.specialize ------[replace.] 0.000109 13 58.51% : 0.000064s : 8: replace.inline 15.43% : 0.000017s : 1: replace.switch_simplify 26.06% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000229 13 82.45% : 0.000189s : 8: match.inline 6.85% : 0.000016s : 1: match.switch_simplify 10.69% : 0.000024s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000224 1528 1.00% : 0.000002s : 17: predicate.accumulaten_eliminater 0.63% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 8: predicate.addn_check_dump 0.99% : 0.000002s : 17: predicate.addn_zero_filter 0.91% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.24% : 0.000005s : 25: predicate.arithmetic_simplify 1.04% : 0.000002s : 17: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.45% : 0.000001s : 8: predicate.depend_value_elim 1.03% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.23% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.07% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.78% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.33% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.27% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.18% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 21: predicate.environ_get_depend_swap 1.70% : 0.000004s : 29: predicate.environ_get_eliminate 1.14% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.79% : 0.000004s : 29: predicate.exchange_switch_depend_value 2.36% : 0.000005s : 29: predicate.float_depend_g_call 0.44% : 0.000001s : 8: predicate.float_environ_get_switch 0.65% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.69% : 0.000002s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.47% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 5.76% : 0.000013s : 70: predicate.inline 0.66% : 0.000001s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.66% : 0.000001s : 8: predicate.less_batch_normalization 2.04% : 0.000005s : 29: predicate.list_to_tuple_eliminator_ 2.64% : 0.000006s : 46: predicate.load_eliminater 0.72% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.09% : 0.000007s : 52: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 17: predicate.minmaximum_grad 0.77% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 2.37% : 0.000005s : 29: predicate.partial_defer_inline 1.57% : 0.000004s : 25: predicate.partial_eliminate 1.09% : 0.000002s : 17: predicate.print_const_string_wrapper 0.45% : 0.000001s : 8: predicate.reduce_all_const_elim 1.33% : 0.000003s : 17: predicate.reduce_eliminate 2.60% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 8: predicate.remove_not_recompute_node 1.40% : 0.000003s : 29: predicate.replace_applicator 0.39% : 0.000001s : 8: predicate.replace_old_param 0.21% : 0.000000s : 4: predicate.reset_defer_inline 0.98% : 0.000002s : 17: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.29% : 0.000001s : 4: predicate.row_tensor_eliminate 0.56% : 0.000001s : 8: predicate.same_eliminate 0.36% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 8: predicate.shard_identity_eliminate 0.59% : 0.000001s : 8: predicate.special_op_eliminate 0.57% : 0.000001s : 8: predicate.specialize_transform 0.89% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.90% : 0.000004s : 29: predicate.switch_defer_inline 2.25% : 0.000005s : 37: predicate.switch_layer_defer_inline 6.09% : 0.000014s : 95: predicate.switch_simplify 1.04% : 0.000002s : 17: predicate.tile_eliminate 1.00% : 0.000002s : 17: predicate.transpose_eliminate 1.65% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000007s : 37: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000005s : 33: predicate.tuple_list_set_item_eliminator 1.81% : 0.000004s : 29: predicate.tuple_to_list_eliminator_ 2.52% : 0.000006s : 46: predicate.updatestate_pure_node_eliminater 3.05% : 0.000007s : 54: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 4: predicate.value_based_eliminate 0.59% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.21% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.33% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000772 15 34.23% : 0.000264s : 5: func_graph_cloner_run.FuncGraphClonerGraph 65.77% : 0.000508s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.543191 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.27% : 0.006815s : 1: add_attr 0.27% : 0.006795s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000053s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000162s : 1: auto_monad 0.00% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.04% : 0.000944s : 1: bootstrap 0.00% : 0.000017s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000027s : 1: environ_conv 0.00% : 0.000125s : 1: event_method 0.00% : 0.000016s : 1: execute 0.00% : 0.000015s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000018s : 1: label_micro_interleaved_index 0.02% : 0.000437s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.03% : 0.000682s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000015s : 1: opt.transform.mutable_eliminate 0.06% : 0.001504s : 78: opt.transform.opt_a 0.00% : 0.000028s : 1: opt.transform.opt_after_cconv 0.00% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000105s : 28: opt.transform.opt_b 0.00% : 0.000044s : 2: opt.transform.opt_trans_graph 0.00% : 0.000036s : 4: opt.transform.symbol_engine_opt 0.15% : 0.003786s : 1: opt_a 0.00% : 0.000100s : 1: opt_after_cconv 0.02% : 0.000462s : 1: opt_after_jit_grad 0.01% : 0.000205s : 1: opt_b 0.25% : 0.006328s : 1: optimize 0.00% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000005s : 1: order_py_execute_after_rewriter 0.00% : 0.000054s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000026s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000024s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000016s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000018s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000117s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000014s : 1: remove_cast_before_assign_add 0.00% : 0.000015s : 1: remove_dup_value 0.02% : 0.000542s : 1: renormalize.infer 0.02% : 0.000535s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000038s : 1: rewriter_after_opt_a 0.01% : 0.000289s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000015s : 1: split_matmul_comm_elemetwise 0.00% : 0.000030s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000078s : 1: symbol_engine_optimizer 96.48% : 2.453589s : 1: task_emit 0.00% : 0.000071s : 1: tuple_transform 2.30% : 0.058564s : 1: type_inference 0.00% : 0.000081s : 1: validate TotalTime = 2.56603, [24] [bootstrap]: 0.00073699 [type_inference]: 0.0572219 [event_method]: 0.00010236 [auto_monad]: 0.00013715 [graph_reusing]: 5.10001e-06 [inline]: 2.68998e-06 [add_attr]: 0.00647313, [1] [add_attr_with_inline]: 0.00645676, [1] [Cycle 1]: 0.00013503, [2] [tag_attr]: 4.054e-05 [meta_addattr_fg_expand]: 1.675e-05 [parallel-infer-symbol]: 3.28e-06 [pre_auto_parallel]: 6.18e-05 [insert-virtual-dataset]: 2.27001e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 1.81998e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.00642897, [53] [py_interpret_to_execute]: 4.10998e-06 [rewriter_before_opt_a]: 0.00027293 [opt_a]: 0.00376936, [2] [Cycle 1]: 0.00309207, [45] [expand_dump_flag]: 3.46001e-06 [switch_simplify]: 0.00012887 [loop_unroll]: 4.724e-05 [a_1]: 0.00089802 [with_stream_mark]: 1.349e-05 [recompute_prepare]: 8.54e-06 [updatestate_depend_eliminate]: 1.33e-05 [updatestate_assign_eliminate]: 1.147e-05 [updatestate_loads_eliminate]: 2.36998e-06 [parameter_eliminate]: 1.15999e-06 [a_2]: 8.972e-05 [accelerated_algorithm]: 7.50998e-06 [shard]: 1.82999e-06 [meta_shard_fg_expand]: 1.70001e-06 [shard_inline]: 6.84001e-06 [merge_send_recv]: 4.771e-05 [auto_parallel]: 6.61e-06 [parallel]: 0.00014347 [flash_sp]: 3.558e-05 [merge_comm]: 4.4e-06 [allreduce_fusion]: 1.212e-05 [matmul_add_comm_reduction]: 1.603e-05 [allreduce_slice_to_reducescatter]: 9.22001e-06 [virtual_shard_identity]: 9.87001e-06 [virtual_dataset]: 7.06999e-06 [get_grad_eliminate_]: 6.78e-06 [virtual_output]: 7.25e-06 [merge_forward]: 3.41001e-06 [cell_reuse_recompute_pass]: 1.07998e-06 [offload_activation]: 1.893e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.348e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.043e-05 [set_forward_comm_id_for_comm_node_pass]: 1.262e-05 [meta_fg_expand]: 3.68e-06 [flash_sp_send_recv_attached]: 2.29001e-06 [receive_attached]: 1.967e-05 [after_resolve]: 1.311e-05 [a_after_grad]: 1.102e-05 [renormalize]: 0.0010585 [add_forward_monad_depend]: 6.11e-06 [auto_monad_grad]: 1.94e-06 [auto_monad_eliminator]: 2.502e-05 [cse]: 4.256e-05 [a_3]: 5.409e-05 [Cycle 2]: 0.00066658, [45] [expand_dump_flag]: 1.72001e-06 [switch_simplify]: 7.92e-06 [loop_unroll]: 6.61999e-06 [a_1]: 0.00014983 [with_stream_mark]: 1.305e-05 [recompute_prepare]: 7.14001e-06 [updatestate_depend_eliminate]: 3.04001e-06 [updatestate_assign_eliminate]: 3.38999e-06 [updatestate_loads_eliminate]: 2.76999e-06 [parameter_eliminate]: 9.30013e-07 [a_2]: 7.943e-05 [accelerated_algorithm]: 6.89999e-06 [shard]: 1.37999e-06 [meta_shard_fg_expand]: 1.30999e-06 [shard_inline]: 6.49999e-06 [merge_send_recv]: 5.38002e-06 [auto_parallel]: 6.28e-06 [parallel]: 6.34999e-06 [flash_sp]: 2.99001e-06 [merge_comm]: 3.11001e-06 [allreduce_fusion]: 3.05998e-06 [matmul_add_comm_reduction]: 5.64998e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 7.55e-06 [virtual_dataset]: 6.71e-06 [get_grad_eliminate_]: 6.11e-06 [virtual_output]: 6.07001e-06 [merge_forward]: 3.23998e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 7.26999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.419e-05 [merge_recompute_call_nodes]: 7.50006e-07 [before_grad]: 9.52999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.63e-06 [meta_fg_expand]: 2.16998e-06 [flash_sp_send_recv_attached]: 1.02998e-06 [receive_attached]: 1.57999e-06 [after_resolve]: 1.296e-05 [a_after_grad]: 1.013e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.41002e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 6.76e-06 [cse]: 1.386e-05 [a_3]: 3.878e-05 [py_interpret_to_execute_after_opt_a]: 4.63001e-06 [slice_cell_reuse_recomputed_activation]: 1.94e-06 [rewriter_after_opt_a]: 3.047e-05 [convert_after_rewriter]: 1.22999e-06 [order_py_execute_after_rewriter]: 2.01998e-06 [mutable_eliminate]: 0.00065339 [opt_b]: 0.00021404, [1] [Cycle 1]: 0.00020704, [7] [b_1]: 0.00013532 [b_2]: 8.22e-06 [updatestate_depend_eliminate]: 5.38002e-06 [updatestate_assign_eliminate]: 2.60002e-06 [updatestate_loads_eliminate]: 2.39001e-06 [renormalize]: 5.29981e-07 [cse]: 2.028e-05 [optimize_parallel_all_gather_comm]: 2.825e-05 [overlap_param_gather]: 1.222e-05 [cconv]: 2.574e-05 [loop_unroll]: 0.00045498 [opt_after_cconv]: 0.00010563, [1] [Cycle 1]: 9.891e-05, [7] [c_1]: 3.378e-05 [parameter_eliminate]: 3.38e-06 [updatestate_depend_eliminate]: 5.42999e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.42001e-06 [cse]: 1.938e-05 [renormalize]: 3.60014e-07 [remove_dup_value]: 1.339e-05 [tuple_transform]: 0.00014371, [1] [Cycle 1]: 0.00013881, [4] [d_1]: 4.727e-05 [none_parameter_eliminate]: 1.75001e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 7.35e-06 [partial_unused_args_eliminate]: 1.77001e-06 [add_recomputation]: 6.84e-05 [cse_after_recomputation]: 2.306e-05, [1] [Cycle 1]: 1.881e-05, [1] [cse]: 1.323e-05 [environ_conv]: 2.926e-05 [swap_dp_allreduce_reducescatter]: 2.616e-05 [bias_add_comm_swap]: 1.216e-05 [label_micro_interleaved_index]: 1.364e-05 [label_fine_grained_interleaved_index]: 2.64001e-06 [merge_cast_opt]: 1.44e-06 [slice_recompute_activation]: 1.92999e-06 [micro_interleaved_order_control]: 2.79001e-06 [assign_add_opt]: 1.33002e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.09e-05 [full_micro_interleaved_order_control]: 1.114e-05 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.21002e-06 [interleave_parallel_branches]: 9.91e-06 [overlap_opt_shard_in_pipeline]: 1.866e-05 [overlap_opt_shard_grad_in_pipeline]: 1.91e-06 [control_data_broadcast_order]: 1.389e-05 [grouped_pairwise_exchange_alltoall]: 1.35001e-06 [offloading_packed_experts]: 3.91999e-06 [overlap_recompute_and_grad_model_parallel]: 1.422e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.37e-06 [overlap_recompute_allgather_and_fa_grad]: 1.25001e-06 [overlap_recompute_comm]: 2.73e-06 [overlap_grad_ring_attention]: 2.276e-05 [overlap_grad_flash_sp]: 4.639e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 1.13e-05 [split_layernorm_comm]: 1.65001e-06 [handle_group_info]: 1.30999e-06 [symbol_engine_optimizer]: 7.997e-05, [1] [Cycle 1]: 7.526e-05, [6] [build]: 2.63998e-06 [elim_shapecalc]: 1.16e-05 [elim_not_effective]: 1.38e-05 [opt_reshape]: 7.58999e-06 [fold_const_symbol]: 1.077e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.13998e-06 [pipeline_parallel_scheduler]: 1.44e-06 [auto_monad_reorder]: 2.124e-05 [get_jit_bprop_graph]: 1.84e-06 [rewriter_after_jit_bprop_graph]: 4.15e-06 [opt_after_jit_grad]: 0.00050346 [validate]: 6.415e-05 [backend_pass]: 9.30013e-07 [task_emit]: 2.49395 [execute]: 7.39002e-06 Sums bootstrap : 0.000737s : 0.03% type_inference : 0.057222s : 2.24% event_method : 0.000102s : 0.00% auto_monad : 0.000137s : 0.01% graph_reusing : 0.000005s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000041s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000017s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000062s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000273s : 0.01% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000137s : 0.01% optimize.opt_a.loop_unroll : 0.000054s : 0.00% optimize.opt_a.a_1 : 0.001048s : 0.04% optimize.opt_a.with_stream_mark : 0.000027s : 0.00% optimize.opt_a.recompute_prepare : 0.000016s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000015s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000169s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000053s : 0.00% optimize.opt_a.auto_parallel : 0.000013s : 0.00% optimize.opt_a.parallel : 0.000150s : 0.01% optimize.opt_a.flash_sp : 0.000039s : 0.00% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000015s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000010s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.00% optimize.opt_a.virtual_dataset : 0.000014s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000021s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.00% optimize.opt_a.a_after_grad : 0.000021s : 0.00% optimize.opt_a.renormalize : 0.001059s : 0.04% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.00% optimize.opt_a.cse : 0.000056s : 0.00% optimize.opt_a.a_3 : 0.000093s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000030s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000653s : 0.03% optimize.opt_b.b_1 : 0.000135s : 0.01% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000026s : 0.00% optimize.loop_unroll : 0.000455s : 0.02% optimize.opt_after_cconv.c_1 : 0.000034s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.00% optimize.tuple_transform.d_1 : 0.000047s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000068s : 0.00% optimize.cse_after_recomputation.cse : 0.000013s : 0.00% optimize.environ_conv : 0.000029s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000026s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000014s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000011s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000010s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000019s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000014s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000023s : 0.00% optimize.overlap_grad_flash_sp : 0.000046s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000021s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000503s : 0.02% validate : 0.000064s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.493955s : 97.48% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.000283 37 0.62% : 0.000002s : 2: substitution.elim_not_effective 0.47% : 0.000001s : 2: substitution.fold_const_symbol 2.16% : 0.000006s : 5: substitution.graph_param_transform 73.83% : 0.000209s : 7: substitution.inline 1.05% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.87% : 0.000014s : 4: substitution.remove_not_recompute_node 1.93% : 0.000005s : 6: substitution.replace_old_param 5.01% : 0.000014s : 1: substitution.switch_simplify 10.06% : 0.000028s : 6: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.057137 2 97.35% : 0.055623s : 1: type_inference.infer 2.65% : 0.001514s : 1: type_inference.specialize ------[replace.] 0.000107 14 50.73% : 0.000055s : 7: replace.inline 18.30% : 0.000020s : 1: replace.switch_simplify 30.98% : 0.000033s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000245 14 83.90% : 0.000206s : 7: match.inline 5.46% : 0.000013s : 1: match.switch_simplify 10.63% : 0.000026s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000249 1785 0.94% : 0.000002s : 19: predicate.accumulaten_eliminater 0.80% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 10: predicate.addn_check_dump 0.94% : 0.000002s : 19: predicate.addn_zero_filter 0.88% : 0.000002s : 19: predicate.adjust_all_reduce_mul_add 2.23% : 0.000006s : 29: predicate.arithmetic_simplify 0.96% : 0.000002s : 19: predicate.cast_eliminate 0.49% : 0.000001s : 10: predicate.check_bprop_eliminate 0.46% : 0.000001s : 10: predicate.compare_switch_simplify 0.17% : 0.000000s : 5: predicate.const_output_eliminate 0.51% : 0.000001s : 10: predicate.depend_value_elim 1.01% : 0.000003s : 19: predicate.dict_get_item_const_eliminator 1.45% : 0.000004s : 19: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 19: predicate.dict_set_item_eliminator 0.76% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000003s : 24: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 24: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 24: predicate.environ_get_depend_swap 1.73% : 0.000004s : 34: predicate.environ_get_eliminate 1.13% : 0.000003s : 24: predicate.environ_get_set_eliminate 1.64% : 0.000004s : 32: predicate.exchange_switch_depend_value 2.53% : 0.000006s : 32: predicate.float_depend_g_call 0.45% : 0.000001s : 10: predicate.float_environ_get_switch 0.65% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.56% : 0.000001s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.49% : 0.000001s : 10: predicate.incorporate_call 0.43% : 0.000001s : 10: predicate.incorporate_call_switch 5.61% : 0.000014s : 81: predicate.inline 0.60% : 0.000001s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 10: predicate.less_batch_normalization 1.84% : 0.000005s : 35: predicate.list_to_tuple_eliminator_ 2.69% : 0.000007s : 54: predicate.load_eliminater 0.72% : 0.000002s : 5: predicate.loop_unroll_after_grad 3.13% : 0.000008s : 57: predicate.loop_unroll_before_grad 1.57% : 0.000004s : 29: predicate.make_slice_get_slice_eliminator 0.62% : 0.000002s : 10: predicate.merge_addn 0.60% : 0.000002s : 10: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 19: predicate.minmaximum_grad 0.80% : 0.000002s : 5: predicate.mutable_eliminate 0.30% : 0.000001s : 5: predicate.opt_reshape 0.30% : 0.000001s : 5: predicate.parallel_virtual_node 2.04% : 0.000005s : 32: predicate.partial_defer_inline 1.67% : 0.000004s : 30: predicate.partial_eliminate 1.06% : 0.000003s : 19: predicate.print_const_string_wrapper 0.51% : 0.000001s : 10: predicate.reduce_all_const_elim 1.40% : 0.000003s : 19: predicate.reduce_eliminate 2.68% : 0.000007s : 54: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 10: predicate.remove_not_recompute_node 1.55% : 0.000004s : 35: predicate.replace_applicator 0.39% : 0.000001s : 10: predicate.replace_old_param 0.22% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000002s : 19: predicate.reshape_eliminate 0.52% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 5: predicate.row_tensor_eliminate 0.60% : 0.000001s : 10: predicate.same_eliminate 0.39% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.61% : 0.000002s : 10: predicate.specialize_transform 0.79% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.80% : 0.000004s : 32: predicate.switch_defer_inline 2.30% : 0.000006s : 42: predicate.switch_layer_defer_inline 6.10% : 0.000015s : 106: predicate.switch_simplify 1.15% : 0.000003s : 19: predicate.tile_eliminate 0.94% : 0.000002s : 19: predicate.transpose_eliminate 1.47% : 0.000004s : 29: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000004s : 29: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000004s : 29: predicate.tuple_list_get_item_depend_reorder 2.96% : 0.000007s : 45: predicate.tuple_list_get_item_eliminator 1.45% : 0.000004s : 29: predicate.tuple_list_get_set_item_eliminator 2.09% : 0.000005s : 39: predicate.tuple_list_set_item_eliminator 1.85% : 0.000005s : 35: predicate.tuple_to_list_eliminator_ 2.65% : 0.000007s : 54: predicate.updatestate_pure_node_eliminater 3.25% : 0.000008s : 64: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.58% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 10: predicate.virtual_output_eliminate 0.23% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000762 14 37.51% : 0.000286s : 5: func_graph_cloner_run.FuncGraphClonerGraph 62.49% : 0.000476s : 9: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.581772 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.25% : 0.006479s : 1: add_attr 0.25% : 0.006461s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000073s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000146s : 1: auto_monad 0.00% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.03% : 0.000771s : 1: bootstrap 0.00% : 0.000029s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000016s : 1: convert_after_rewriter 0.00% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000033s : 1: environ_conv 0.00% : 0.000111s : 1: event_method 0.00% : 0.000020s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000017s : 1: label_micro_interleaved_index 0.02% : 0.000464s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.03% : 0.000663s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000016s : 1: opt.transform.mutable_eliminate 0.06% : 0.001624s : 78: opt.transform.opt_a 0.00% : 0.000033s : 1: opt.transform.opt_after_cconv 0.00% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000117s : 28: opt.transform.opt_b 0.00% : 0.000052s : 2: opt.transform.opt_trans_graph 0.00% : 0.000040s : 4: opt.transform.symbol_engine_opt 0.15% : 0.003773s : 1: opt_a 0.00% : 0.000109s : 1: opt_after_cconv 0.02% : 0.000512s : 1: opt_after_jit_grad 0.01% : 0.000218s : 1: opt_b 0.25% : 0.006434s : 1: optimize 0.00% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000005s : 1: order_py_execute_after_rewriter 0.00% : 0.000050s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000026s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000022s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000016s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000066s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000014s : 1: remove_cast_before_assign_add 0.00% : 0.000017s : 1: remove_dup_value 0.02% : 0.000550s : 1: renormalize.infer 0.02% : 0.000499s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000034s : 1: rewriter_after_opt_a 0.01% : 0.000279s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000014s : 1: split_matmul_comm_elemetwise 0.00% : 0.000030s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000083s : 1: symbol_engine_optimizer 96.60% : 2.494000s : 1: task_emit 0.01% : 0.000147s : 1: tuple_transform 2.22% : 0.057238s : 1: type_inference 0.00% : 0.000093s : 1: validate TotalTime = 2.59266, [24] [bootstrap]: 0.00088697 [type_inference]: 0.0366556 [event_method]: 1.748e-05 [auto_monad]: 0.00013987 [graph_reusing]: 6.07001e-06 [inline]: 2.23002e-06 [add_attr]: 0.00775102, [1] [add_attr_with_inline]: 0.00773627, [1] [Cycle 1]: 0.00013528, [2] [tag_attr]: 3.226e-05 [meta_addattr_fg_expand]: 1.758e-05 [parallel-infer-symbol]: 2.92002e-06 [pre_auto_parallel]: 5.484e-05 [insert-virtual-dataset]: 2.53e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 2.17999e-06 [pipeline_split]: 1.76e-06 [optimize]: 0.00494253, [53] [py_interpret_to_execute]: 4.06001e-06 [rewriter_before_opt_a]: 0.00020201 [opt_a]: 0.00270589, [2] [Cycle 1]: 0.00216503, [45] [expand_dump_flag]: 3.88001e-06 [switch_simplify]: 8.059e-05 [loop_unroll]: 2.865e-05 [a_1]: 0.0005449 [with_stream_mark]: 1.37e-05 [recompute_prepare]: 6.84999e-06 [updatestate_depend_eliminate]: 5.225e-05 [updatestate_assign_eliminate]: 1.385e-05 [updatestate_loads_eliminate]: 2.92002e-06 [parameter_eliminate]: 2.15002e-06 [a_2]: 6.811e-05 [accelerated_algorithm]: 5.77001e-06 [shard]: 1.99999e-06 [meta_shard_fg_expand]: 1.79e-06 [shard_inline]: 5.27001e-06 [merge_send_recv]: 5.393e-05 [auto_parallel]: 6.26e-06 [parallel]: 9.142e-05 [flash_sp]: 4.155e-05 [merge_comm]: 3.62002e-06 [allreduce_fusion]: 1.365e-05 [matmul_add_comm_reduction]: 1.915e-05 [allreduce_slice_to_reducescatter]: 1.197e-05 [virtual_shard_identity]: 7.56999e-06 [virtual_dataset]: 5.51e-06 [get_grad_eliminate_]: 5.59e-06 [virtual_output]: 5.22e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 1.21997e-06 [offload_activation]: 2.034e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.168e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 8.84e-06 [set_forward_comm_id_for_comm_node_pass]: 1.346e-05 [meta_fg_expand]: 2.54001e-06 [flash_sp_send_recv_attached]: 2.51e-06 [receive_attached]: 2.303e-05 [after_resolve]: 8.61002e-06 [a_after_grad]: 1.641e-05 [renormalize]: 0.00058782 [add_forward_monad_depend]: 5.24998e-06 [auto_monad_grad]: 2.50002e-06 [auto_monad_eliminator]: 2.566e-05 [cse]: 5.12e-05 [a_3]: 3.88e-05 [Cycle 2]: 0.00052962, [45] [expand_dump_flag]: 1.43002e-06 [switch_simplify]: 6.04001e-06 [loop_unroll]: 4.67e-06 [a_1]: 9.268e-05 [with_stream_mark]: 1.103e-05 [recompute_prepare]: 5.30001e-06 [updatestate_depend_eliminate]: 2.47001e-06 [updatestate_assign_eliminate]: 2.32001e-06 [updatestate_loads_eliminate]: 2.09999e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 5.572e-05 [accelerated_algorithm]: 5.19e-06 [shard]: 1.17999e-06 [meta_shard_fg_expand]: 1.20999e-06 [shard_inline]: 4.87e-06 [merge_send_recv]: 4.28999e-06 [auto_parallel]: 4.86997e-06 [parallel]: 4.82998e-06 [flash_sp]: 3.13998e-06 [merge_comm]: 3.18998e-06 [allreduce_fusion]: 2.48e-06 [matmul_add_comm_reduction]: 4.80001e-06 [allreduce_slice_to_reducescatter]: 3.99974e-07 [virtual_shard_identity]: 6.07999e-06 [virtual_dataset]: 4.99998e-06 [get_grad_eliminate_]: 5.04e-06 [virtual_output]: 4.85999e-06 [merge_forward]: 2.24001e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 5.20001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.069e-05 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 8.06001e-06 [set_forward_comm_id_for_comm_node_pass]: 2.68e-06 [meta_fg_expand]: 1.49e-06 [flash_sp_send_recv_attached]: 7.89994e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 7.64002e-06 [a_after_grad]: 6.96001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.10999e-06 [auto_monad_grad]: 1.02998e-06 [auto_monad_eliminator]: 4.84e-06 [cse]: 1.155e-05 [a_3]: 2.77e-05 [py_interpret_to_execute_after_opt_a]: 3.92002e-06 [slice_cell_reuse_recomputed_activation]: 2.34001e-06 [rewriter_after_opt_a]: 2.779e-05 [convert_after_rewriter]: 1.42e-06 [order_py_execute_after_rewriter]: 1.19998e-06 [mutable_eliminate]: 0.00053235 [opt_b]: 0.00016314, [1] [Cycle 1]: 0.00015712, [7] [b_1]: 9.491e-05 [b_2]: 6.67002e-06 [updatestate_depend_eliminate]: 4.72998e-06 [updatestate_assign_eliminate]: 2.02999e-06 [updatestate_loads_eliminate]: 2.02999e-06 [renormalize]: 4.30009e-07 [cse]: 1.452e-05 [optimize_parallel_all_gather_comm]: 2.78e-05 [overlap_param_gather]: 1.372e-05 [cconv]: 2.312e-05 [loop_unroll]: 0.00040786 [opt_after_cconv]: 8.688e-05, [1] [Cycle 1]: 8.093e-05, [7] [c_1]: 2.352e-05 [parameter_eliminate]: 2.01003e-06 [updatestate_depend_eliminate]: 4.72e-06 [updatestate_assign_eliminate]: 2.31e-06 [updatestate_loads_eliminate]: 2.11e-06 [cse]: 1.477e-05 [renormalize]: 2.79979e-07 [remove_dup_value]: 1.336e-05 [tuple_transform]: 6.173e-05, [1] [Cycle 1]: 5.735e-05, [4] [d_1]: 3.278e-05 [none_parameter_eliminate]: 1.60999e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 5.81e-06 [partial_unused_args_eliminate]: 1.79998e-06 [add_recomputation]: 5.814e-05 [cse_after_recomputation]: 1.851e-05, [1] [Cycle 1]: 1.434e-05, [1] [cse]: 9.21998e-06 [environ_conv]: 3.938e-05 [swap_dp_allreduce_reducescatter]: 2.722e-05 [bias_add_comm_swap]: 1.345e-05 [label_micro_interleaved_index]: 1.541e-05 [label_fine_grained_interleaved_index]: 2.49999e-06 [merge_cast_opt]: 1.20999e-06 [slice_recompute_activation]: 2.08002e-06 [micro_interleaved_order_control]: 2.68998e-06 [assign_add_opt]: 1.56998e-06 [ForceFp32Comm]: 9.5999e-07 [remove_cast_before_assign_add]: 1.107e-05 [full_micro_interleaved_order_control]: 1.347e-05 [reorder_send_recv_between_fp_bp]: 2.72001e-06 [comm_op_add_attrs]: 1.36002e-06 [add_comm_op_reuse_tag]: 1.40001e-06 [interleave_split_concat_branches]: 1.16002e-06 [interleave_parallel_branches]: 1.102e-05 [overlap_opt_shard_in_pipeline]: 1.847e-05 [overlap_opt_shard_grad_in_pipeline]: 1.79998e-06 [control_data_broadcast_order]: 1.152e-05 [grouped_pairwise_exchange_alltoall]: 1.51002e-06 [offloading_packed_experts]: 3.43999e-06 [overlap_recompute_and_grad_model_parallel]: 1.496e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.50001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.05002e-06 [overlap_grad_ring_attention]: 2.46e-05 [overlap_grad_flash_sp]: 4.853e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 1.303e-05 [split_layernorm_comm]: 1.84998e-06 [handle_group_info]: 1.01997e-06 [symbol_engine_optimizer]: 6.927e-05, [1] [Cycle 1]: 6.488e-05, [6] [build]: 2.30002e-06 [elim_shapecalc]: 9.14e-06 [elim_not_effective]: 1.129e-05 [opt_reshape]: 5.76003e-06 [fold_const_symbol]: 8.87e-06 [renormalize]: 2.9002e-07 [detach_backward]: 1.54e-06 [pipeline_parallel_scheduler]: 1.86e-06 [auto_monad_reorder]: 2.246e-05 [get_jit_bprop_graph]: 1.37e-06 [rewriter_after_jit_bprop_graph]: 2.83e-06 [opt_after_jit_grad]: 0.00044363 [validate]: 5.593e-05 [backend_pass]: 9.79984e-07 [task_emit]: 2.54108 [execute]: 1.093e-05 Sums bootstrap : 0.000887s : 0.03% type_inference : 0.036656s : 1.42% event_method : 0.000017s : 0.00% auto_monad : 0.000140s : 0.01% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000032s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000018s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000055s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000202s : 0.01% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000087s : 0.00% optimize.opt_a.loop_unroll : 0.000033s : 0.00% optimize.opt_a.a_1 : 0.000638s : 0.02% optimize.opt_a.with_stream_mark : 0.000025s : 0.00% optimize.opt_a.recompute_prepare : 0.000012s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000055s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000124s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000010s : 0.00% optimize.opt_a.merge_send_recv : 0.000058s : 0.00% optimize.opt_a.auto_parallel : 0.000011s : 0.00% optimize.opt_a.parallel : 0.000096s : 0.00% optimize.opt_a.flash_sp : 0.000045s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000016s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000012s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.00% optimize.opt_a.virtual_dataset : 0.000011s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.00% optimize.opt_a.virtual_output : 0.000010s : 0.00% optimize.opt_a.merge_forward : 0.000006s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000017s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.00% optimize.opt_a.meta_fg_expand : 0.000004s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000024s : 0.00% optimize.opt_a.after_resolve : 0.000016s : 0.00% optimize.opt_a.a_after_grad : 0.000023s : 0.00% optimize.opt_a.renormalize : 0.000588s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.00% optimize.opt_a.cse : 0.000063s : 0.00% optimize.opt_a.a_3 : 0.000066s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000532s : 0.02% optimize.opt_b.b_1 : 0.000095s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000015s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.00% optimize.overlap_param_gather : 0.000014s : 0.00% optimize.cconv : 0.000023s : 0.00% optimize.loop_unroll : 0.000408s : 0.02% optimize.opt_after_cconv.c_1 : 0.000024s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000015s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.00% optimize.tuple_transform.d_1 : 0.000033s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000058s : 0.00% optimize.cse_after_recomputation.cse : 0.000009s : 0.00% optimize.environ_conv : 0.000039s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000027s : 0.00% optimize.bias_add_comm_swap : 0.000013s : 0.00% optimize.label_micro_interleaved_index : 0.000015s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000011s : 0.00% optimize.full_micro_interleaved_order_control : 0.000013s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000011s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000018s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000015s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000025s : 0.00% optimize.overlap_grad_flash_sp : 0.000049s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000013s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000011s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000444s : 0.02% validate : 0.000056s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.541080s : 98.35% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000185 24 0.96% : 0.000002s : 2: substitution.elim_not_effective 0.77% : 0.000001s : 2: substitution.fold_const_symbol 2.68% : 0.000005s : 3: substitution.graph_param_transform 71.01% : 0.000132s : 5: substitution.inline 1.80% : 0.000003s : 4: substitution.j_node_and_user_rematch 8.26% : 0.000015s : 4: substitution.remove_not_recompute_node 1.70% : 0.000003s : 2: substitution.replace_old_param 12.82% : 0.000024s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.036569 2 97.72% : 0.035736s : 1: type_inference.infer 2.28% : 0.000833s : 1: type_inference.specialize ------[replace.] 0.000058 7 76.88% : 0.000045s : 5: replace.inline 23.12% : 0.000013s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000152 7 85.09% : 0.000129s : 5: match.inline 14.91% : 0.000023s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000162 1031 0.94% : 0.000002s : 11: predicate.accumulaten_eliminater 0.86% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 6: predicate.addn_check_dump 1.16% : 0.000002s : 11: predicate.addn_zero_filter 0.85% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.20% : 0.000004s : 17: predicate.arithmetic_simplify 1.01% : 0.000002s : 11: predicate.cast_eliminate 0.62% : 0.000001s : 6: predicate.check_bprop_eliminate 0.51% : 0.000001s : 6: predicate.compare_switch_simplify 0.18% : 0.000000s : 3: predicate.const_output_eliminate 0.46% : 0.000001s : 6: predicate.depend_value_elim 0.96% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.26% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 11: predicate.dict_set_item_eliminator 0.91% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 3: predicate.elim_not_effective 0.38% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.14% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.14% : 0.000002s : 14: predicate.environ_get_depend_swap 1.68% : 0.000003s : 20: predicate.environ_get_eliminate 1.09% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.52% : 0.000002s : 18: predicate.exchange_switch_depend_value 2.59% : 0.000004s : 18: predicate.float_depend_g_call 0.49% : 0.000001s : 6: predicate.float_environ_get_switch 0.72% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 3: predicate.fold_const_symbol 0.76% : 0.000001s : 6: predicate.get_grad_eliminate 0.23% : 0.000000s : 3: predicate.graph_param_transform 0.54% : 0.000001s : 6: predicate.incorporate_call 0.46% : 0.000001s : 6: predicate.incorporate_call_switch 5.77% : 0.000009s : 47: predicate.inline 0.73% : 0.000001s : 6: predicate.inline_without_move 0.28% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.91% : 0.000001s : 6: predicate.less_batch_normalization 1.78% : 0.000003s : 19: predicate.list_to_tuple_eliminator_ 2.41% : 0.000004s : 30: predicate.load_eliminater 1.07% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.84% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.81% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 6: predicate.merge_addn 0.49% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.84% : 0.000001s : 11: predicate.minmaximum_grad 1.15% : 0.000002s : 3: predicate.mutable_eliminate 0.42% : 0.000001s : 3: predicate.opt_reshape 0.34% : 0.000001s : 3: predicate.parallel_virtual_node 2.11% : 0.000003s : 18: predicate.partial_defer_inline 1.44% : 0.000002s : 16: predicate.partial_eliminate 0.93% : 0.000001s : 11: predicate.print_const_string_wrapper 0.56% : 0.000001s : 6: predicate.reduce_all_const_elim 1.36% : 0.000002s : 11: predicate.reduce_eliminate 2.42% : 0.000004s : 30: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 6: predicate.remove_not_recompute_node 1.29% : 0.000002s : 19: predicate.replace_applicator 0.41% : 0.000001s : 6: predicate.replace_old_param 0.27% : 0.000000s : 3: predicate.reset_defer_inline 1.11% : 0.000002s : 11: predicate.reshape_eliminate 0.60% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 3: predicate.row_tensor_eliminate 0.86% : 0.000001s : 6: predicate.same_eliminate 0.41% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.70% : 0.000001s : 6: predicate.shard_identity_eliminate 0.67% : 0.000001s : 6: predicate.special_op_eliminate 0.74% : 0.000001s : 6: predicate.specialize_transform 0.89% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.31% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.59% : 0.000003s : 18: predicate.switch_defer_inline 2.05% : 0.000003s : 24: predicate.switch_layer_defer_inline 5.79% : 0.000009s : 61: predicate.switch_simplify 0.94% : 0.000002s : 11: predicate.tile_eliminate 0.96% : 0.000002s : 11: predicate.transpose_eliminate 1.66% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000002s : 17: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.05% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.46% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.18% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.68% : 0.000003s : 19: predicate.tuple_to_list_eliminator_ 2.31% : 0.000004s : 30: predicate.updatestate_pure_node_eliminater 2.97% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.30% : 0.000000s : 3: predicate.value_based_eliminate 0.73% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 6: predicate.virtual_output_eliminate 0.25% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000499 12 49.86% : 0.000249s : 5: func_graph_cloner_run.FuncGraphClonerGraph 50.14% : 0.000250s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.606932 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.30% : 0.007756s : 1: add_attr 0.30% : 0.007740s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000062s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000146s : 1: auto_monad 0.00% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000016s : 1: bias_add_comm_swap 0.04% : 0.000935s : 1: bootstrap 0.00% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000021s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000043s : 1: environ_conv 0.00% : 0.000023s : 1: event_method 0.00% : 0.000040s : 1: execute 0.00% : 0.000016s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000014s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000018s : 1: label_micro_interleaved_index 0.02% : 0.000417s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000541s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000012s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000012s : 1: opt.transform.mutable_eliminate 0.04% : 0.001033s : 78: opt.transform.opt_a 0.00% : 0.000022s : 1: opt.transform.opt_after_cconv 0.00% : 0.000020s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000076s : 28: opt.transform.opt_b 0.00% : 0.000037s : 2: opt.transform.opt_trans_graph 0.00% : 0.000032s : 4: opt.transform.symbol_engine_opt 0.10% : 0.002709s : 1: opt_a 0.00% : 0.000090s : 1: opt_after_cconv 0.02% : 0.000453s : 1: opt_after_jit_grad 0.01% : 0.000166s : 1: opt_b 0.19% : 0.004947s : 1: optimize 0.00% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000052s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000027s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000022s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000017s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000018s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000055s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000059s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000014s : 1: remove_cast_before_assign_add 0.00% : 0.000017s : 1: remove_dup_value 0.01% : 0.000275s : 1: renormalize.infer 0.01% : 0.000305s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000031s : 1: rewriter_after_opt_a 0.01% : 0.000208s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000016s : 1: split_matmul_comm_elemetwise 0.00% : 0.000030s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000072s : 1: symbol_engine_optimizer 97.48% : 2.541215s : 1: task_emit 0.00% : 0.000065s : 1: tuple_transform 1.41% : 0.036677s : 1: type_inference 0.00% : 0.000082s : 1: validate TotalTime = 2.73805, [24] [bootstrap]: 0.0009313 [type_inference]: 0.0243865 [event_method]: 2.239e-05 [auto_monad]: 0.00012247 [graph_reusing]: 6.26e-06 [inline]: 1.97999e-06 [add_attr]: 0.00767234, [1] [add_attr_with_inline]: 0.00765738, [1] [Cycle 1]: 0.00011463, [2] [tag_attr]: 3.23e-05 [meta_addattr_fg_expand]: 1.394e-05 [parallel-infer-symbol]: 2.81999e-06 [pre_auto_parallel]: 5.149e-05 [insert-virtual-dataset]: 2.47001e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 1.71e-06 [pipeline_split]: 1.82999e-06 [optimize]: 0.00495856, [53] [py_interpret_to_execute]: 3.85998e-06 [rewriter_before_opt_a]: 0.00023102 [opt_a]: 0.00277593, [2] [Cycle 1]: 0.00220099, [45] [expand_dump_flag]: 2.54999e-06 [switch_simplify]: 7.099e-05 [loop_unroll]: 3.128e-05 [a_1]: 0.0005734 [with_stream_mark]: 1.405e-05 [recompute_prepare]: 7.33e-06 [updatestate_depend_eliminate]: 1.287e-05 [updatestate_assign_eliminate]: 1.046e-05 [updatestate_loads_eliminate]: 3.41999e-06 [parameter_eliminate]: 2.23002e-06 [a_2]: 7.611e-05 [accelerated_algorithm]: 6.09001e-06 [shard]: 1.86e-06 [meta_shard_fg_expand]: 2.22001e-06 [shard_inline]: 5.72001e-06 [merge_send_recv]: 3.911e-05 [auto_parallel]: 6.64001e-06 [parallel]: 8.899e-05 [flash_sp]: 3.321e-05 [merge_comm]: 4.15e-06 [allreduce_fusion]: 1.049e-05 [matmul_add_comm_reduction]: 1.48e-05 [allreduce_slice_to_reducescatter]: 8.72e-06 [virtual_shard_identity]: 7.96001e-06 [virtual_dataset]: 6.54001e-06 [get_grad_eliminate_]: 5.67001e-06 [virtual_output]: 3.855e-05 [merge_forward]: 4.34997e-06 [cell_reuse_recompute_pass]: 1.13001e-06 [offload_activation]: 1.56e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.076e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 9.44e-06 [set_forward_comm_id_for_comm_node_pass]: 1.082e-05 [meta_fg_expand]: 2.98998e-06 [flash_sp_send_recv_attached]: 2.76e-06 [receive_attached]: 1.667e-05 [after_resolve]: 1.179e-05 [a_after_grad]: 8.42e-06 [renormalize]: 0.00065519 [add_forward_monad_depend]: 5.50001e-06 [auto_monad_grad]: 1.72001e-06 [auto_monad_eliminator]: 2.393e-05 [cse]: 4.749e-05 [a_3]: 4.15e-05 [Cycle 2]: 0.00056542, [45] [expand_dump_flag]: 9.20001e-07 [switch_simplify]: 6.94999e-06 [loop_unroll]: 6.32001e-06 [a_1]: 9.517e-05 [with_stream_mark]: 1.107e-05 [recompute_prepare]: 5.56998e-06 [updatestate_depend_eliminate]: 3.12002e-06 [updatestate_assign_eliminate]: 2.50002e-06 [updatestate_loads_eliminate]: 2.58e-06 [parameter_eliminate]: 9.5999e-07 [a_2]: 6.293e-05 [accelerated_algorithm]: 5.22999e-06 [shard]: 1.10999e-06 [meta_shard_fg_expand]: 1.22e-06 [shard_inline]: 5.20001e-06 [merge_send_recv]: 4.43001e-06 [auto_parallel]: 5.42999e-06 [parallel]: 4.39998e-06 [flash_sp]: 3.03998e-06 [merge_comm]: 3.22002e-06 [allreduce_fusion]: 2.79001e-06 [matmul_add_comm_reduction]: 5.57001e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 5.87999e-06 [virtual_dataset]: 5.44e-06 [get_grad_eliminate_]: 6.19001e-06 [virtual_output]: 5.30999e-06 [merge_forward]: 2.79999e-06 [cell_reuse_recompute_pass]: 1.24998e-06 [offload_activation]: 5.65001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.172e-05 [merge_recompute_call_nodes]: 8.70001e-07 [before_grad]: 7.98999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.26001e-06 [meta_fg_expand]: 2.01e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.04998e-06 [after_resolve]: 8.3e-06 [a_after_grad]: 7.58001e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.23002e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 5.86e-06 [cse]: 1.479e-05 [a_3]: 3.161e-05 [py_interpret_to_execute_after_opt_a]: 3.66001e-06 [slice_cell_reuse_recomputed_activation]: 2.11e-06 [rewriter_after_opt_a]: 2.583e-05 [convert_after_rewriter]: 1.22999e-06 [order_py_execute_after_rewriter]: 1.25999e-06 [mutable_eliminate]: 0.00048334 [opt_b]: 0.00018627, [1] [Cycle 1]: 0.00017968, [7] [b_1]: 0.00010965 [b_2]: 7.11999e-06 [updatestate_depend_eliminate]: 5.36998e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.59999e-06 [renormalize]: 3.29979e-07 [cse]: 1.96e-05 [optimize_parallel_all_gather_comm]: 2.605e-05 [overlap_param_gather]: 1.024e-05 [cconv]: 2.304e-05 [loop_unroll]: 0.00040667 [opt_after_cconv]: 9.236e-05, [1] [Cycle 1]: 8.695e-05, [7] [c_1]: 2.436e-05 [parameter_eliminate]: 2.19999e-06 [updatestate_depend_eliminate]: 4.92e-06 [updatestate_assign_eliminate]: 2.66e-06 [updatestate_loads_eliminate]: 2.41998e-06 [cse]: 1.878e-05 [renormalize]: 3.9002e-07 [remove_dup_value]: 1.636e-05 [tuple_transform]: 6.329e-05, [1] [Cycle 1]: 5.911e-05, [4] [d_1]: 3.381e-05 [none_parameter_eliminate]: 1.44998e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 6.28998e-06 [partial_unused_args_eliminate]: 1.86e-06 [add_recomputation]: 5.729e-05 [cse_after_recomputation]: 2.297e-05, [1] [Cycle 1]: 1.862e-05, [1] [cse]: 1.316e-05 [environ_conv]: 1.839e-05 [swap_dp_allreduce_reducescatter]: 2.171e-05 [bias_add_comm_swap]: 9.74e-06 [label_micro_interleaved_index]: 1.201e-05 [label_fine_grained_interleaved_index]: 2.55997e-06 [merge_cast_opt]: 1.32e-06 [slice_recompute_activation]: 2.26998e-06 [micro_interleaved_order_control]: 2.60002e-06 [assign_add_opt]: 1.18001e-06 [ForceFp32Comm]: 7.10017e-07 [remove_cast_before_assign_add]: 8.08001e-06 [full_micro_interleaved_order_control]: 9.94001e-06 [reorder_send_recv_between_fp_bp]: 2.53003e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 7.6e-06 [overlap_opt_shard_in_pipeline]: 1.993e-05 [overlap_opt_shard_grad_in_pipeline]: 1.88002e-06 [control_data_broadcast_order]: 1.24e-05 [grouped_pairwise_exchange_alltoall]: 1.87001e-06 [offloading_packed_experts]: 3.8e-06 [overlap_recompute_and_grad_model_parallel]: 1.222e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.31002e-06 [overlap_recompute_comm]: 2.20002e-06 [overlap_grad_ring_attention]: 1.932e-05 [overlap_grad_flash_sp]: 3.813e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 9.66e-06 [split_layernorm_comm]: 1.86e-06 [handle_group_info]: 9.69972e-07 [symbol_engine_optimizer]: 7.159e-05, [1] [Cycle 1]: 6.724e-05, [6] [build]: 2.61e-06 [elim_shapecalc]: 9.37001e-06 [elim_not_effective]: 1.242e-05 [opt_reshape]: 6.41e-06 [fold_const_symbol]: 9.17999e-06 [renormalize]: 1.80007e-07 [detach_backward]: 1.69e-06 [pipeline_parallel_scheduler]: 1.50001e-06 [auto_monad_reorder]: 2.335e-05 [get_jit_bprop_graph]: 9.79984e-07 [rewriter_after_jit_bprop_graph]: 3.21001e-06 [opt_after_jit_grad]: 0.00047281 [validate]: 5.733e-05 [backend_pass]: 9.70002e-07 [task_emit]: 2.69901 [execute]: 1.08e-05 Sums bootstrap : 0.000931s : 0.03% type_inference : 0.024386s : 0.89% event_method : 0.000022s : 0.00% auto_monad : 0.000122s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000032s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000051s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000231s : 0.01% optimize.opt_a.expand_dump_flag : 0.000003s : 0.00% optimize.opt_a.switch_simplify : 0.000078s : 0.00% optimize.opt_a.loop_unroll : 0.000038s : 0.00% optimize.opt_a.a_1 : 0.000669s : 0.02% optimize.opt_a.with_stream_mark : 0.000025s : 0.00% optimize.opt_a.recompute_prepare : 0.000013s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000013s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000139s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.00% optimize.opt_a.merge_send_recv : 0.000044s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000093s : 0.00% optimize.opt_a.flash_sp : 0.000036s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000013s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.00% optimize.opt_a.virtual_dataset : 0.000012s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.00% optimize.opt_a.virtual_output : 0.000044s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000017s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000014s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000018s : 0.00% optimize.opt_a.after_resolve : 0.000020s : 0.00% optimize.opt_a.a_after_grad : 0.000016s : 0.00% optimize.opt_a.renormalize : 0.000655s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.00% optimize.opt_a.cse : 0.000062s : 0.00% optimize.opt_a.a_3 : 0.000073s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000026s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000483s : 0.02% optimize.opt_b.b_1 : 0.000110s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.00% optimize.overlap_param_gather : 0.000010s : 0.00% optimize.cconv : 0.000023s : 0.00% optimize.loop_unroll : 0.000407s : 0.01% optimize.opt_after_cconv.c_1 : 0.000024s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.00% optimize.tuple_transform.d_1 : 0.000034s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000057s : 0.00% optimize.cse_after_recomputation.cse : 0.000013s : 0.00% optimize.environ_conv : 0.000018s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000022s : 0.00% optimize.bias_add_comm_swap : 0.000010s : 0.00% optimize.label_micro_interleaved_index : 0.000012s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000008s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000020s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.00% optimize.overlap_grad_flash_sp : 0.000038s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000473s : 0.02% validate : 0.000057s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.699011s : 98.89% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000196 24 0.90% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000001s : 2: substitution.fold_const_symbol 2.50% : 0.000005s : 3: substitution.graph_param_transform 75.41% : 0.000148s : 5: substitution.inline 1.59% : 0.000003s : 4: substitution.j_node_and_user_rematch 6.48% : 0.000013s : 4: substitution.remove_not_recompute_node 2.03% : 0.000004s : 2: substitution.replace_old_param 10.41% : 0.000020s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.024312 2 95.40% : 0.023194s : 1: type_inference.infer 4.60% : 0.001118s : 1: type_inference.specialize ------[replace.] 0.000069 7 73.93% : 0.000051s : 5: replace.inline 26.07% : 0.000018s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000165 7 88.29% : 0.000145s : 5: match.inline 11.71% : 0.000019s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000164 1031 0.97% : 0.000002s : 11: predicate.accumulaten_eliminater 0.87% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 6: predicate.addn_check_dump 0.98% : 0.000002s : 11: predicate.addn_zero_filter 0.86% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.17% : 0.000004s : 17: predicate.arithmetic_simplify 0.95% : 0.000002s : 11: predicate.cast_eliminate 0.59% : 0.000001s : 6: predicate.check_bprop_eliminate 0.51% : 0.000001s : 6: predicate.compare_switch_simplify 0.17% : 0.000000s : 3: predicate.const_output_eliminate 0.51% : 0.000001s : 6: predicate.depend_value_elim 0.96% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.14% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 11: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.39% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 14: predicate.environ_get_depend_swap 1.72% : 0.000003s : 20: predicate.environ_get_eliminate 1.09% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.52% : 0.000002s : 18: predicate.exchange_switch_depend_value 2.50% : 0.000004s : 18: predicate.float_depend_g_call 0.45% : 0.000001s : 6: predicate.float_environ_get_switch 0.68% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 3: predicate.fold_const_symbol 0.67% : 0.000001s : 6: predicate.get_grad_eliminate 0.17% : 0.000000s : 3: predicate.graph_param_transform 0.53% : 0.000001s : 6: predicate.incorporate_call 0.45% : 0.000001s : 6: predicate.incorporate_call_switch 6.17% : 0.000010s : 47: predicate.inline 0.75% : 0.000001s : 6: predicate.inline_without_move 0.29% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.73% : 0.000001s : 6: predicate.less_batch_normalization 1.70% : 0.000003s : 19: predicate.list_to_tuple_eliminator_ 2.39% : 0.000004s : 30: predicate.load_eliminater 0.93% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.88% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.63% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 6: predicate.merge_addn 0.56% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.85% : 0.000001s : 11: predicate.minmaximum_grad 1.06% : 0.000002s : 3: predicate.mutable_eliminate 0.38% : 0.000001s : 3: predicate.opt_reshape 0.40% : 0.000001s : 3: predicate.parallel_virtual_node 1.98% : 0.000003s : 18: predicate.partial_defer_inline 1.46% : 0.000002s : 16: predicate.partial_eliminate 0.98% : 0.000002s : 11: predicate.print_const_string_wrapper 0.52% : 0.000001s : 6: predicate.reduce_all_const_elim 1.22% : 0.000002s : 11: predicate.reduce_eliminate 2.50% : 0.000004s : 30: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 6: predicate.remove_not_recompute_node 1.33% : 0.000002s : 19: predicate.replace_applicator 0.43% : 0.000001s : 6: predicate.replace_old_param 0.28% : 0.000000s : 3: predicate.reset_defer_inline 1.02% : 0.000002s : 11: predicate.reshape_eliminate 0.58% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.54% : 0.000001s : 3: predicate.row_tensor_eliminate 0.70% : 0.000001s : 6: predicate.same_eliminate 0.38% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.68% : 0.000001s : 6: predicate.shard_identity_eliminate 0.75% : 0.000001s : 6: predicate.special_op_eliminate 0.67% : 0.000001s : 6: predicate.specialize_transform 0.91% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.64% : 0.000003s : 18: predicate.switch_defer_inline 2.21% : 0.000004s : 24: predicate.switch_layer_defer_inline 5.87% : 0.000010s : 61: predicate.switch_simplify 1.01% : 0.000002s : 11: predicate.tile_eliminate 0.95% : 0.000002s : 11: predicate.transpose_eliminate 1.60% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.47% : 0.000002s : 17: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.51% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.71% : 0.000003s : 19: predicate.tuple_to_list_eliminator_ 2.29% : 0.000004s : 30: predicate.updatestate_pure_node_eliminater 3.00% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 3: predicate.value_based_eliminate 0.75% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.68% : 0.000001s : 6: predicate.virtual_output_eliminate 0.25% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000710 13 48.23% : 0.000343s : 6: func_graph_cloner_run.FuncGraphClonerGraph 51.77% : 0.000368s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.752580 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.28% : 0.007678s : 1: add_attr 0.28% : 0.007662s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000061s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000128s : 1: auto_monad 0.00% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000013s : 1: bias_add_comm_swap 0.04% : 0.000983s : 1: bootstrap 0.00% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000022s : 1: environ_conv 0.00% : 0.000028s : 1: event_method 0.00% : 0.000027s : 1: execute 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000010s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000015s : 1: label_micro_interleaved_index 0.02% : 0.000415s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000492s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000012s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000013s : 1: opt.transform.mutable_eliminate 0.04% : 0.001118s : 78: opt.transform.opt_a 0.00% : 0.000023s : 1: opt.transform.opt_after_cconv 0.00% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000091s : 28: opt.transform.opt_b 0.00% : 0.000038s : 2: opt.transform.opt_trans_graph 0.00% : 0.000033s : 4: opt.transform.symbol_engine_opt 0.10% : 0.002779s : 1: opt_a 0.00% : 0.000096s : 1: opt_after_cconv 0.02% : 0.000483s : 1: opt_after_jit_grad 0.01% : 0.000189s : 1: opt_b 0.18% : 0.004963s : 1: optimize 0.00% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000042s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000022s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000023s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000056s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000011s : 1: remove_cast_before_assign_add 0.00% : 0.000020s : 1: remove_dup_value 0.01% : 0.000302s : 1: renormalize.infer 0.01% : 0.000346s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000030s : 1: rewriter_after_opt_a 0.01% : 0.000237s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000012s : 1: split_matmul_comm_elemetwise 0.00% : 0.000025s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000074s : 1: symbol_engine_optimizer 98.06% : 2.699054s : 1: task_emit 0.00% : 0.000066s : 1: tuple_transform 0.89% : 0.024403s : 1: type_inference 0.00% : 0.000098s : 1: validate TotalTime = 0.09786, [24] [bootstrap]: 0.0004561 [type_inference]: 0.063301 [event_method]: 5.607e-05 [auto_monad]: 0.00016994 [graph_reusing]: 1.023e-05 [inline]: 2.46e-06 [add_attr]: 0.00348213, [1] [add_attr_with_inline]: 0.00347288, [1] [Cycle 1]: 9.002e-05, [2] [tag_attr]: 4.738e-05 [meta_addattr_fg_expand]: 1.266e-05 [parallel-infer-symbol]: 3.38e-06 [pre_auto_parallel]: 6.634e-05 [insert-virtual-dataset]: 2.46e-06 [parallel-infer-symbol-second]: 9.39996e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.96998e-06 [optimize]: 0.0238086, [53] [py_interpret_to_execute]: 4.47e-06 [rewriter_before_opt_a]: 0.00042975 [opt_a]: 0.0215209, [3] [Cycle 1]: 0.0181644, [45] [expand_dump_flag]: 4.90001e-06 [switch_simplify]: 0.00017789 [loop_unroll]: 8.186e-05 [a_1]: 0.00166965 [with_stream_mark]: 2.336e-05 [recompute_prepare]: 2.01e-05 [updatestate_depend_eliminate]: 7.96001e-06 [updatestate_assign_eliminate]: 7.66999e-06 [updatestate_loads_eliminate]: 6.94999e-06 [parameter_eliminate]: 2.21998e-06 [a_2]: 0.00021618 [accelerated_algorithm]: 1.471e-05 [shard]: 1.62001e-06 [meta_shard_fg_expand]: 4.55001e-06 [shard_inline]: 1.419e-05 [merge_send_recv]: 1.567e-05 [auto_parallel]: 1.161e-05 [parallel]: 2.7e-05 [flash_sp]: 9.91e-06 [merge_comm]: 8.65001e-06 [allreduce_fusion]: 8.06001e-06 [matmul_add_comm_reduction]: 2.533e-05 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 1.659e-05 [virtual_dataset]: 1.401e-05 [get_grad_eliminate_]: 1.39e-05 [virtual_output]: 1.417e-05 [merge_forward]: 8.78001e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 1.709e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.473e-05 [merge_recompute_call_nodes]: 1.86e-06 [before_grad]: 2.416e-05 [set_forward_comm_id_for_comm_node_pass]: 8.24998e-06 [meta_fg_expand]: 0.00176607 [flash_sp_send_recv_attached]: 4.35e-06 [receive_attached]: 2.17999e-06 [after_resolve]: 6.84e-05 [a_after_grad]: 8.73e-05 [renormalize]: 0.0127288 [add_forward_monad_depend]: 1.056e-05 [auto_monad_grad]: 5.89999e-06 [auto_monad_eliminator]: 5.892e-05 [cse]: 0.0003115 [a_3]: 0.00033148 [Cycle 2]: 0.00279116, [45] [expand_dump_flag]: 2.39999e-06 [switch_simplify]: 4.389e-05 [loop_unroll]: 4.113e-05 [a_1]: 0.00117581 [with_stream_mark]: 1.813e-05 [recompute_prepare]: 7.97e-06 [updatestate_depend_eliminate]: 4.12e-06 [updatestate_assign_eliminate]: 3.76999e-06 [updatestate_loads_eliminate]: 3.37002e-06 [parameter_eliminate]: 1.88002e-06 [a_2]: 7e-05 [accelerated_algorithm]: 6.41e-06 [shard]: 1.97001e-06 [meta_shard_fg_expand]: 2.37001e-06 [shard_inline]: 5.50001e-06 [merge_send_recv]: 9.09e-06 [auto_parallel]: 9.66e-06 [parallel]: 8.19002e-06 [flash_sp]: 3.84002e-06 [merge_comm]: 3.6e-06 [allreduce_fusion]: 3.33e-06 [matmul_add_comm_reduction]: 8.27e-06 [allreduce_slice_to_reducescatter]: 8.59989e-07 [virtual_shard_identity]: 7.20998e-06 [virtual_dataset]: 5.98998e-06 [get_grad_eliminate_]: 6.12001e-06 [virtual_output]: 5.79e-06 [merge_forward]: 4.13001e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 1.019e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.158e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 8.95999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.49001e-06 [meta_fg_expand]: 0.00013655 [flash_sp_send_recv_attached]: 2.22001e-06 [receive_attached]: 2.83e-06 [after_resolve]: 1.159e-05 [a_after_grad]: 8.48001e-06 [renormalize]: 0.00081925 [add_forward_monad_depend]: 3.43999e-06 [auto_monad_grad]: 1.68002e-06 [auto_monad_eliminator]: 1.053e-05 [cse]: 2.577e-05 [a_3]: 4.044e-05 [Cycle 3]: 0.00055065, [45] [expand_dump_flag]: 1.76e-06 [switch_simplify]: 6.64999e-06 [loop_unroll]: 5.55001e-06 [a_1]: 9.313e-05 [with_stream_mark]: 7.68001e-06 [recompute_prepare]: 5.82999e-06 [updatestate_depend_eliminate]: 3.01999e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.24001e-06 [parameter_eliminate]: 8.00006e-07 [a_2]: 6.611e-05 [accelerated_algorithm]: 5.64998e-06 [shard]: 1.03001e-06 [meta_shard_fg_expand]: 1.29e-06 [shard_inline]: 5.74e-06 [merge_send_recv]: 4.62e-06 [auto_parallel]: 6.84001e-06 [parallel]: 4.69002e-06 [flash_sp]: 8.50006e-07 [merge_comm]: 3.02002e-06 [allreduce_fusion]: 2.88e-06 [matmul_add_comm_reduction]: 5.77001e-06 [allreduce_slice_to_reducescatter]: 4.90021e-07 [virtual_shard_identity]: 6.28e-06 [virtual_dataset]: 5.36998e-06 [get_grad_eliminate_]: 5.23002e-06 [virtual_output]: 5.30999e-06 [merge_forward]: 2.61999e-06 [cell_reuse_recompute_pass]: 1.25001e-06 [offload_activation]: 8.07e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.152e-05 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 8.60001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.04999e-06 [meta_fg_expand]: 2.16998e-06 [flash_sp_send_recv_attached]: 9.49978e-07 [receive_attached]: 1.15001e-06 [after_resolve]: 8.08999e-06 [a_after_grad]: 7.31999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.00999e-06 [auto_monad_grad]: 6.69999e-07 [auto_monad_eliminator]: 5.20999e-06 [cse]: 1.309e-05 [a_3]: 3.193e-05 [py_interpret_to_execute_after_opt_a]: 4.38999e-06 [slice_cell_reuse_recomputed_activation]: 2.02999e-06 [rewriter_after_opt_a]: 2.08e-05 [convert_after_rewriter]: 1.38002e-06 [order_py_execute_after_rewriter]: 1.49e-06 [mutable_eliminate]: 0.00060869 [opt_b]: 0.00020323, [1] [Cycle 1]: 0.0001967, [7] [b_1]: 0.00012493 [b_2]: 7.48e-06 [updatestate_depend_eliminate]: 5.36998e-06 [updatestate_assign_eliminate]: 2.66e-06 [updatestate_loads_eliminate]: 2.48e-06 [renormalize]: 2.89991e-07 [cse]: 2.044e-05 [optimize_parallel_all_gather_comm]: 1.657e-05 [overlap_param_gather]: 1.95001e-06 [cconv]: 1.21e-05 [loop_unroll]: 0.000387 [opt_after_cconv]: 8.596e-05, [1] [Cycle 1]: 8.096e-05, [7] [c_1]: 2.282e-05 [parameter_eliminate]: 1.45999e-06 [updatestate_depend_eliminate]: 3.82002e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.19001e-06 [cse]: 1.7e-05 [renormalize]: 3.60014e-07 [remove_dup_value]: 1.68e-05 [tuple_transform]: 5.838e-05, [1] [Cycle 1]: 5.437e-05, [4] [d_1]: 3.061e-05 [none_parameter_eliminate]: 7.40023e-07 [renormalize]: 1.40019e-07 [switch_simplify]: 6.12001e-06 [partial_unused_args_eliminate]: 8.70001e-07 [add_recomputation]: 2.882e-05 [cse_after_recomputation]: 2.223e-05, [1] [Cycle 1]: 1.801e-05, [1] [cse]: 1.251e-05 [environ_conv]: 3.47997e-06 [swap_dp_allreduce_reducescatter]: 5.53002e-06 [bias_add_comm_swap]: 2.83e-06 [label_micro_interleaved_index]: 4.75001e-06 [label_fine_grained_interleaved_index]: 2.60002e-06 [merge_cast_opt]: 1.44e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.53e-06 [assign_add_opt]: 1.34e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.34e-06 [full_micro_interleaved_order_control]: 2.49999e-06 [reorder_send_recv_between_fp_bp]: 2.68003e-06 [comm_op_add_attrs]: 1.11002e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.09e-06 [interleave_parallel_branches]: 1.10999e-06 [overlap_opt_shard_in_pipeline]: 6.86001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.11998e-06 [control_data_broadcast_order]: 1.269e-05 [grouped_pairwise_exchange_alltoall]: 1.54e-06 [offloading_packed_experts]: 3.64002e-06 [overlap_recompute_and_grad_model_parallel]: 4.92e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.45001e-06 [overlap_recompute_comm]: 2.56e-06 [overlap_grad_ring_attention]: 4.13999e-06 [overlap_grad_flash_sp]: 1.955e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.21998e-06 [split_layernorm_comm]: 1.82001e-06 [handle_group_info]: 1.82999e-06 [symbol_engine_optimizer]: 6.708e-05, [1] [Cycle 1]: 6.298e-05, [6] [build]: 2.14e-06 [elim_shapecalc]: 8.33999e-06 [elim_not_effective]: 1.146e-05 [opt_reshape]: 6.52001e-06 [fold_const_symbol]: 8.98002e-06 [renormalize]: 1.69995e-07 [detach_backward]: 1.94e-06 [pipeline_parallel_scheduler]: 1.81998e-06 [auto_monad_reorder]: 1.244e-05 [get_jit_bprop_graph]: 1.22999e-06 [rewriter_after_jit_bprop_graph]: 2.92002e-06 [opt_after_jit_grad]: 0.00042806 [validate]: 2.913e-05 [backend_pass]: 5.69999e-07 [task_emit]: 0.00582216 [execute]: 7.20003e-06 Sums bootstrap : 0.000456s : 0.49% type_inference : 0.063301s : 67.92% event_method : 0.000056s : 0.06% auto_monad : 0.000170s : 0.18% graph_reusing : 0.000010s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000047s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000066s : 0.07% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000430s : 0.46% optimize.opt_a.expand_dump_flag : 0.000009s : 0.01% optimize.opt_a.switch_simplify : 0.000228s : 0.25% optimize.opt_a.loop_unroll : 0.000129s : 0.14% optimize.opt_a.a_1 : 0.002939s : 3.15% optimize.opt_a.with_stream_mark : 0.000049s : 0.05% optimize.opt_a.recompute_prepare : 0.000034s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000015s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000013s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000352s : 0.38% optimize.opt_a.accelerated_algorithm : 0.000027s : 0.03% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000008s : 0.01% optimize.opt_a.shard_inline : 0.000025s : 0.03% optimize.opt_a.merge_send_recv : 0.000029s : 0.03% optimize.opt_a.auto_parallel : 0.000028s : 0.03% optimize.opt_a.parallel : 0.000040s : 0.04% optimize.opt_a.flash_sp : 0.000015s : 0.02% optimize.opt_a.merge_comm : 0.000015s : 0.02% optimize.opt_a.allreduce_fusion : 0.000014s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000039s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000030s : 0.03% optimize.opt_a.virtual_dataset : 0.000025s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000025s : 0.03% optimize.opt_a.virtual_output : 0.000025s : 0.03% optimize.opt_a.merge_forward : 0.000016s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000035s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000048s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000042s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000015s : 0.02% optimize.opt_a.meta_fg_expand : 0.001905s : 2.04% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.01% optimize.opt_a.receive_attached : 0.000006s : 0.01% optimize.opt_a.after_resolve : 0.000088s : 0.09% optimize.opt_a.a_after_grad : 0.000103s : 0.11% optimize.opt_a.renormalize : 0.013548s : 14.54% optimize.opt_a.add_forward_monad_depend : 0.000015s : 0.02% optimize.opt_a.auto_monad_grad : 0.000008s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000075s : 0.08% optimize.opt_a.cse : 0.000350s : 0.38% optimize.opt_a.a_3 : 0.000404s : 0.43% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000021s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000609s : 0.65% optimize.opt_b.b_1 : 0.000125s : 0.13% optimize.opt_b.b_2 : 0.000007s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000012s : 0.01% optimize.loop_unroll : 0.000387s : 0.42% optimize.opt_after_cconv.c_1 : 0.000023s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000001s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000017s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.02% optimize.tuple_transform.d_1 : 0.000031s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000029s : 0.03% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000003s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000007s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000020s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000002s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000008s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000011s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000012s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000428s : 0.46% validate : 0.000029s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.005822s : 6.25% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.000770 170 0.24% : 0.000002s : 2: substitution.elim_not_effective 1.25% : 0.000010s : 13: substitution.float_depend_g_call 0.45% : 0.000003s : 2: substitution.float_tuple_getitem_switch 0.17% : 0.000001s : 2: substitution.fold_const_symbol 0.36% : 0.000003s : 3: substitution.graph_param_transform 0.38% : 0.000003s : 2: substitution.incorporate_call 0.31% : 0.000002s : 2: substitution.incorporate_call_switch 66.69% : 0.000513s : 24: substitution.inline 2.10% : 0.000016s : 2: substitution.inline_without_move 1.11% : 0.000009s : 12: substitution.j_node_and_user_rematch 1.31% : 0.000010s : 7: substitution.minmaximum_grad 3.27% : 0.000025s : 13: substitution.partial_eliminate 1.32% : 0.000010s : 12: substitution.remove_not_recompute_node 3.13% : 0.000024s : 9: substitution.replace_applicator 1.48% : 0.000011s : 14: substitution.replace_old_param 0.37% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.82% : 0.000014s : 4: substitution.switch_simplify 2.78% : 0.000021s : 7: substitution.tuple_list_convert_item_index_to_positive 1.25% : 0.000010s : 7: substitution.tuple_list_get_item_const_eliminator 1.78% : 0.000014s : 7: substitution.tuple_list_get_item_depend_reorder 6.50% : 0.000050s : 18: substitution.tuple_list_get_item_eliminator 1.91% : 0.000015s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.063204 2 94.39% : 0.059661s : 1: type_inference.infer 5.61% : 0.003543s : 1: type_inference.specialize ------[replace.] 0.000289 37 59.62% : 0.000172s : 24: replace.inline 16.22% : 0.000047s : 4: replace.switch_simplify 24.16% : 0.000070s : 9: replace.tuple_list_get_item_eliminator ------[match.] 0.000537 37 93.58% : 0.000502s : 24: match.inline 2.08% : 0.000011s : 4: match.switch_simplify 4.34% : 0.000023s : 9: match.tuple_list_get_item_eliminator ------[predicate.] 0.000607 4428 1.17% : 0.000007s : 57: predicate.accumulaten_eliminater 0.19% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.40% : 0.000002s : 19: predicate.addn_check_dump 1.17% : 0.000007s : 57: predicate.addn_zero_filter 1.09% : 0.000007s : 57: predicate.adjust_all_reduce_mul_add 1.94% : 0.000012s : 76: predicate.arithmetic_simplify 1.18% : 0.000007s : 57: predicate.cast_eliminate 1.05% : 0.000006s : 49: predicate.check_bprop_eliminate 0.47% : 0.000003s : 19: predicate.compare_switch_simplify 0.05% : 0.000000s : 3: predicate.const_output_eliminate 0.41% : 0.000002s : 19: predicate.depend_value_elim 1.27% : 0.000008s : 57: predicate.dict_get_item_const_eliminator 1.32% : 0.000008s : 57: predicate.dict_get_item_eliminator 1.27% : 0.000008s : 57: predicate.dict_set_item_eliminator 0.23% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.06% : 0.000000s : 3: predicate.elim_not_effective 0.11% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.30% : 0.000008s : 60: predicate.environ_add_const_eliminate 1.16% : 0.000007s : 60: predicate.environ_get_add_eliminate 1.18% : 0.000007s : 60: predicate.environ_get_depend_swap 1.62% : 0.000010s : 79: predicate.environ_get_eliminate 1.18% : 0.000007s : 60: predicate.environ_get_set_eliminate 1.94% : 0.000012s : 90: predicate.exchange_switch_depend_value 2.57% : 0.000016s : 90: predicate.float_depend_g_call 0.40% : 0.000002s : 19: predicate.float_environ_get_switch 0.47% : 0.000003s : 22: predicate.float_tuple_getitem_switch 0.04% : 0.000000s : 3: predicate.fold_const_symbol 0.50% : 0.000003s : 19: predicate.get_grad_eliminate 0.05% : 0.000000s : 3: predicate.graph_param_transform 0.42% : 0.000003s : 19: predicate.incorporate_call 0.38% : 0.000002s : 19: predicate.incorporate_call_switch 5.73% : 0.000035s : 191: predicate.inline 1.35% : 0.000008s : 48: predicate.inline_without_move 0.23% : 0.000001s : 19: predicate.j_node_and_user_rematch 0.55% : 0.000003s : 19: predicate.less_batch_normalization 1.53% : 0.000009s : 72: predicate.list_to_tuple_eliminator_ 2.62% : 0.000016s : 129: predicate.load_eliminater 0.23% : 0.000001s : 3: predicate.loop_unroll_after_grad 3.16% : 0.000019s : 144: predicate.loop_unroll_before_grad 1.33% : 0.000008s : 63: predicate.make_slice_get_slice_eliminator 0.44% : 0.000003s : 19: predicate.merge_addn 1.04% : 0.000006s : 49: predicate.micro_step_allgather_replace 1.04% : 0.000006s : 49: predicate.mini_step_allgather_replace 1.12% : 0.000007s : 57: predicate.minmaximum_grad 0.30% : 0.000002s : 3: predicate.mutable_eliminate 0.11% : 0.000001s : 3: predicate.opt_reshape 0.11% : 0.000001s : 3: predicate.parallel_virtual_node 2.41% : 0.000015s : 90: predicate.partial_defer_inline 1.63% : 0.000010s : 69: predicate.partial_eliminate 1.19% : 0.000007s : 57: predicate.print_const_string_wrapper 0.52% : 0.000003s : 19: predicate.reduce_all_const_elim 1.35% : 0.000008s : 57: predicate.reduce_eliminate 2.60% : 0.000016s : 129: predicate.redundant_stop_gradient_eliminater 0.30% : 0.000002s : 19: predicate.remove_not_recompute_node 1.83% : 0.000011s : 115: predicate.replace_applicator 0.68% : 0.000004s : 48: predicate.replace_old_param 0.05% : 0.000000s : 3: predicate.reset_defer_inline 1.16% : 0.000007s : 57: predicate.reshape_eliminate 1.09% : 0.000007s : 49: predicate.row_tensor_add_zeros_like 0.15% : 0.000001s : 3: predicate.row_tensor_eliminate 1.22% : 0.000007s : 49: predicate.same_eliminate 0.29% : 0.000002s : 19: predicate.set_cell_output_no_recompute 0.62% : 0.000004s : 19: predicate.shard_identity_eliminate 0.21% : 0.000001s : 6: predicate.special_op_eliminate 0.48% : 0.000003s : 19: predicate.specialize_transform 1.18% : 0.000007s : 49: predicate.split_environ_get_set_with_tuple_value 1.34% : 0.000008s : 48: predicate.stack_unstack_eliminate 0.08% : 0.000000s : 3: predicate.switch_call_monad_eliminater 2.12% : 0.000013s : 90: predicate.switch_defer_inline 3.14% : 0.000019s : 139: predicate.switch_layer_defer_inline 6.26% : 0.000038s : 264: predicate.switch_simplify 1.15% : 0.000007s : 57: predicate.tile_eliminate 1.13% : 0.000007s : 57: predicate.transpose_eliminate 1.30% : 0.000008s : 63: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000009s : 63: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000008s : 63: predicate.tuple_list_get_item_depend_reorder 2.54% : 0.000015s : 91: predicate.tuple_list_get_item_eliminator 1.43% : 0.000009s : 63: predicate.tuple_list_get_set_item_eliminator 1.89% : 0.000011s : 82: predicate.tuple_list_set_item_eliminator 1.57% : 0.000010s : 72: predicate.tuple_to_list_eliminator_ 2.51% : 0.000015s : 129: predicate.updatestate_pure_node_eliminater 2.99% : 0.000018s : 148: predicate.updatestate_useless_node_eliminater 0.07% : 0.000000s : 3: predicate.value_based_eliminate 0.49% : 0.000003s : 19: predicate.virtual_dataset_eliminate 0.48% : 0.000003s : 19: predicate.virtual_output_eliminate 0.06% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.13% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002717 47 57.70% : 0.001568s : 19: func_graph_cloner_run.FuncGraphClonerGraph 42.30% : 0.001149s : 28: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.143239 237 0.00% : 0.000004s : 1: ForceFp32Comm 2.43% : 0.003487s : 1: add_attr 2.43% : 0.003477s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000033s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.12% : 0.000176s : 1: auto_monad 0.01% : 0.000016s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.33% : 0.000475s : 1: bootstrap 0.01% : 0.000016s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000006s : 1: environ_conv 0.04% : 0.000062s : 1: event_method 0.01% : 0.000012s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.27% : 0.000393s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.43% : 0.000617s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000011s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000013s : 1: opt.transform.mutable_eliminate 3.07% : 0.004398s : 117: opt.transform.opt_a 0.02% : 0.000022s : 1: opt.transform.opt_after_cconv 0.01% : 0.000019s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000106s : 28: opt.transform.opt_b 0.02% : 0.000035s : 2: opt.transform.opt_trans_graph 0.02% : 0.000032s : 4: opt.transform.symbol_engine_opt 15.03% : 0.021525s : 1: opt_a 0.06% : 0.000089s : 1: opt_after_cconv 0.30% : 0.000436s : 1: opt_after_jit_grad 0.14% : 0.000207s : 1: opt_b 16.63% : 0.023814s : 1: optimize 0.01% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000023s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000010s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000071s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 7.92% : 0.011343s : 2: renormalize.infer 1.53% : 0.002189s : 2: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000024s : 1: rewriter_after_opt_a 0.30% : 0.000436s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000070s : 1: symbol_engine_optimizer 4.08% : 0.005839s : 1: task_emit 0.04% : 0.000061s : 1: tuple_transform 44.20% : 0.063317s : 1: type_inference 0.04% : 0.000053s : 1: validate TotalTime = 0.0928175, [24] [bootstrap]: 0.0006071 [type_inference]: 0.0617051 [event_method]: 0.00017175 [auto_monad]: 0.00016066 [graph_reusing]: 9.28002e-06 [inline]: 2.39999e-06 [add_attr]: 0.00490386, [1] [add_attr_with_inline]: 0.00489035, [1] [Cycle 1]: 8.68e-05, [2] [tag_attr]: 4.802e-05 [meta_addattr_fg_expand]: 1.056e-05 [parallel-infer-symbol]: 3.41999e-06 [pre_auto_parallel]: 6.218e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 1.61002e-06 [dataset_repeat_opt]: 2.03997e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.014503, [53] [py_interpret_to_execute]: 5.11002e-06 [rewriter_before_opt_a]: 0.00030819 [opt_a]: 0.0122022, [3] [Cycle 1]: 0.00919283, [45] [expand_dump_flag]: 4.77e-06 [switch_simplify]: 0.00017027 [loop_unroll]: 7.459e-05 [a_1]: 0.00155643 [with_stream_mark]: 2.367e-05 [recompute_prepare]: 3.048e-05 [updatestate_depend_eliminate]: 8.24998e-06 [updatestate_assign_eliminate]: 6.76999e-06 [updatestate_loads_eliminate]: 5.99e-06 [parameter_eliminate]: 2.11998e-06 [a_2]: 0.0002153 [accelerated_algorithm]: 1.545e-05 [shard]: 1.77999e-06 [meta_shard_fg_expand]: 4.80001e-06 [shard_inline]: 1.32e-05 [merge_send_recv]: 1.664e-05 [auto_parallel]: 1.171e-05 [parallel]: 2.507e-05 [flash_sp]: 1.041e-05 [merge_comm]: 8.01001e-06 [allreduce_fusion]: 1.268e-05 [matmul_add_comm_reduction]: 2.302e-05 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 1.702e-05 [virtual_dataset]: 1.531e-05 [get_grad_eliminate_]: 1.501e-05 [virtual_output]: 1.452e-05 [merge_forward]: 8.42e-06 [cell_reuse_recompute_pass]: 1.00001e-06 [offload_activation]: 1.633e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.673e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 2.571e-05 [set_forward_comm_id_for_comm_node_pass]: 7.68001e-06 [meta_fg_expand]: 0.00202711 [flash_sp_send_recv_attached]: 4.55001e-06 [receive_attached]: 2.11e-06 [after_resolve]: 5.925e-05 [a_after_grad]: 8.729e-05 [renormalize]: 0.00370626 [add_forward_monad_depend]: 1.015e-05 [auto_monad_grad]: 5.12e-06 [auto_monad_eliminator]: 4.96e-05 [cse]: 0.00021237 [a_3]: 0.00030009 [Cycle 2]: 0.00244489, [45] [expand_dump_flag]: 2.63998e-06 [switch_simplify]: 3.987e-05 [loop_unroll]: 4.355e-05 [a_1]: 0.00118528 [with_stream_mark]: 1.371e-05 [recompute_prepare]: 7.68001e-06 [updatestate_depend_eliminate]: 3.18e-06 [updatestate_assign_eliminate]: 2.77002e-06 [updatestate_loads_eliminate]: 2.22001e-06 [parameter_eliminate]: 1.35001e-06 [a_2]: 6.457e-05 [accelerated_algorithm]: 5.82001e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 1.89e-06 [shard_inline]: 5.36002e-06 [merge_send_recv]: 7.98999e-06 [auto_parallel]: 1.117e-05 [parallel]: 6.73e-06 [flash_sp]: 3.76001e-06 [merge_comm]: 3.28998e-06 [allreduce_fusion]: 3.03e-06 [matmul_add_comm_reduction]: 6.07001e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 6.44001e-06 [virtual_dataset]: 5.05001e-06 [get_grad_eliminate_]: 5.97001e-06 [virtual_output]: 5.10999e-06 [merge_forward]: 2.78e-06 [cell_reuse_recompute_pass]: 9.40025e-07 [offload_activation]: 1.224e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.021e-05 [merge_recompute_call_nodes]: 1.21002e-06 [before_grad]: 8.23001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.41001e-06 [meta_fg_expand]: 6.885e-05 [flash_sp_send_recv_attached]: 2.02999e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 1.024e-05 [a_after_grad]: 8.32998e-06 [renormalize]: 0.00056287 [add_forward_monad_depend]: 4.32e-06 [auto_monad_grad]: 1.92999e-06 [auto_monad_eliminator]: 1.077e-05 [cse]: 1.667e-05 [a_3]: 3.979e-05 [Cycle 3]: 0.0005495, [45] [expand_dump_flag]: 1.22e-06 [switch_simplify]: 6.99001e-06 [loop_unroll]: 5.31002e-06 [a_1]: 9.866e-05 [with_stream_mark]: 8.03999e-06 [recompute_prepare]: 5.51e-06 [updatestate_depend_eliminate]: 2.78e-06 [updatestate_assign_eliminate]: 2.20002e-06 [updatestate_loads_eliminate]: 2.57001e-06 [parameter_eliminate]: 8.90024e-07 [a_2]: 6.241e-05 [accelerated_algorithm]: 5.47001e-06 [shard]: 1.04998e-06 [meta_shard_fg_expand]: 1.37e-06 [shard_inline]: 5.35001e-06 [merge_send_recv]: 3.85e-06 [auto_parallel]: 5.37001e-06 [parallel]: 4.02e-06 [flash_sp]: 8.30012e-07 [merge_comm]: 2.66999e-06 [allreduce_fusion]: 2.58e-06 [matmul_add_comm_reduction]: 4.72e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 6.05002e-06 [virtual_dataset]: 5.17e-06 [get_grad_eliminate_]: 4.89998e-06 [virtual_output]: 5.09998e-06 [merge_forward]: 2.37001e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 5.86998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.091e-05 [merge_recompute_call_nodes]: 6.99976e-07 [before_grad]: 8.76002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.15002e-06 [meta_fg_expand]: 1.99e-06 [flash_sp_send_recv_attached]: 7.90023e-07 [receive_attached]: 1.44e-06 [after_resolve]: 8.18999e-06 [a_after_grad]: 7.58001e-06 [renormalize]: 5.00004e-08 [add_forward_monad_depend]: 1.09e-06 [auto_monad_grad]: 8.40024e-07 [auto_monad_eliminator]: 6.16e-06 [cse]: 1.246e-05 [a_3]: 3.109e-05 [py_interpret_to_execute_after_opt_a]: 4.80001e-06 [slice_cell_reuse_recomputed_activation]: 2.09999e-06 [rewriter_after_opt_a]: 1.61e-05 [convert_after_rewriter]: 1.41002e-06 [order_py_execute_after_rewriter]: 1.12e-06 [mutable_eliminate]: 0.00061632 [opt_b]: 0.00017992, [1] [Cycle 1]: 0.00017366, [7] [b_1]: 0.00010651 [b_2]: 6.86001e-06 [updatestate_depend_eliminate]: 5.15999e-06 [updatestate_assign_eliminate]: 2.38002e-06 [updatestate_loads_eliminate]: 2.13998e-06 [renormalize]: 4.80009e-07 [cse]: 1.725e-05 [optimize_parallel_all_gather_comm]: 4.414e-05 [overlap_param_gather]: 2.14999e-06 [cconv]: 1.867e-05 [loop_unroll]: 0.00045491 [opt_after_cconv]: 9.396e-05, [1] [Cycle 1]: 8.799e-05, [7] [c_1]: 2.6e-05 [parameter_eliminate]: 2.11e-06 [updatestate_depend_eliminate]: 4.72998e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.57001e-06 [cse]: 1.783e-05 [renormalize]: 4.10015e-07 [remove_dup_value]: 1.18e-05 [tuple_transform]: 6.4e-05, [1] [Cycle 1]: 5.993e-05, [4] [d_1]: 3.487e-05 [none_parameter_eliminate]: 9.80013e-07 [renormalize]: 1.50001e-07 [switch_simplify]: 6.35002e-06 [partial_unused_args_eliminate]: 1.49e-06 [add_recomputation]: 5.815e-05 [cse_after_recomputation]: 2.341e-05, [1] [Cycle 1]: 1.896e-05, [1] [cse]: 1.29e-05 [environ_conv]: 4.53999e-06 [swap_dp_allreduce_reducescatter]: 5.39e-06 [bias_add_comm_swap]: 2.93e-06 [label_micro_interleaved_index]: 5.02e-06 [label_fine_grained_interleaved_index]: 2.66e-06 [merge_cast_opt]: 1.80001e-06 [slice_recompute_activation]: 2.31998e-06 [micro_interleaved_order_control]: 2.36e-06 [assign_add_opt]: 1.37999e-06 [ForceFp32Comm]: 9.20001e-07 [remove_cast_before_assign_add]: 1.37e-06 [full_micro_interleaved_order_control]: 2.81e-06 [reorder_send_recv_between_fp_bp]: 2.99999e-06 [comm_op_add_attrs]: 1.29998e-06 [add_comm_op_reuse_tag]: 1.12e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.27999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.33998e-06 [control_data_broadcast_order]: 1.261e-05 [grouped_pairwise_exchange_alltoall]: 1.67001e-06 [offloading_packed_experts]: 4.05e-06 [overlap_recompute_and_grad_model_parallel]: 4.69998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.23002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49e-06 [overlap_recompute_comm]: 2.12001e-06 [overlap_grad_ring_attention]: 3.91999e-06 [overlap_grad_flash_sp]: 1.723e-05 [begin_end_overlap_inline]: 8.39995e-07 [split_matmul_comm_elemetwise]: 2.16e-06 [split_layernorm_comm]: 2.10002e-06 [handle_group_info]: 1.24e-06 [symbol_engine_optimizer]: 6.951e-05, [1] [Cycle 1]: 6.547e-05, [6] [build]: 2.06e-06 [elim_shapecalc]: 9.31998e-06 [elim_not_effective]: 1.065e-05 [opt_reshape]: 6.63e-06 [fold_const_symbol]: 8.87e-06 [renormalize]: 2.10013e-07 [detach_backward]: 2.23002e-06 [pipeline_parallel_scheduler]: 1.37e-06 [auto_monad_reorder]: 1.305e-05 [get_jit_bprop_graph]: 1.22e-06 [rewriter_after_jit_bprop_graph]: 3.41001e-06 [opt_after_jit_grad]: 0.00046945 [validate]: 3.011e-05 [backend_pass]: 6.59988e-07 [task_emit]: 0.00993453 [execute]: 6.74999e-06 Sums bootstrap : 0.000607s : 0.70% type_inference : 0.061705s : 71.24% event_method : 0.000172s : 0.20% auto_monad : 0.000161s : 0.19% graph_reusing : 0.000009s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000048s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000011s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000062s : 0.07% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000002s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000308s : 0.36% optimize.opt_a.expand_dump_flag : 0.000009s : 0.01% optimize.opt_a.switch_simplify : 0.000217s : 0.25% optimize.opt_a.loop_unroll : 0.000123s : 0.14% optimize.opt_a.a_1 : 0.002840s : 3.28% optimize.opt_a.with_stream_mark : 0.000045s : 0.05% optimize.opt_a.recompute_prepare : 0.000044s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000014s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000012s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000011s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000342s : 0.40% optimize.opt_a.accelerated_algorithm : 0.000027s : 0.03% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000008s : 0.01% optimize.opt_a.shard_inline : 0.000024s : 0.03% optimize.opt_a.merge_send_recv : 0.000028s : 0.03% optimize.opt_a.auto_parallel : 0.000028s : 0.03% optimize.opt_a.parallel : 0.000036s : 0.04% optimize.opt_a.flash_sp : 0.000015s : 0.02% optimize.opt_a.merge_comm : 0.000014s : 0.02% optimize.opt_a.allreduce_fusion : 0.000018s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000034s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000030s : 0.03% optimize.opt_a.virtual_dataset : 0.000026s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000026s : 0.03% optimize.opt_a.virtual_output : 0.000025s : 0.03% optimize.opt_a.merge_forward : 0.000014s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000034s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000048s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000043s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000014s : 0.02% optimize.opt_a.meta_fg_expand : 0.002098s : 2.42% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.01% optimize.opt_a.receive_attached : 0.000006s : 0.01% optimize.opt_a.after_resolve : 0.000078s : 0.09% optimize.opt_a.a_after_grad : 0.000103s : 0.12% optimize.opt_a.renormalize : 0.004269s : 4.93% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.02% optimize.opt_a.auto_monad_grad : 0.000008s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000067s : 0.08% optimize.opt_a.cse : 0.000242s : 0.28% optimize.opt_a.a_3 : 0.000371s : 0.43% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000016s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000616s : 0.71% optimize.opt_b.b_1 : 0.000107s : 0.12% optimize.opt_b.b_2 : 0.000007s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000017s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000044s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000019s : 0.02% optimize.loop_unroll : 0.000455s : 0.53% optimize.opt_after_cconv.c_1 : 0.000026s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000012s : 0.01% optimize.tuple_transform.d_1 : 0.000035s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000058s : 0.07% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000017s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000011s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000013s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000469s : 0.54% validate : 0.000030s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.009935s : 11.47% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.000779 152 0.17% : 0.000001s : 2: substitution.elim_not_effective 0.99% : 0.000008s : 11: substitution.float_depend_g_call 0.51% : 0.000004s : 2: substitution.float_tuple_getitem_switch 0.17% : 0.000001s : 2: substitution.fold_const_symbol 0.50% : 0.000004s : 3: substitution.graph_param_transform 0.51% : 0.000004s : 2: substitution.incorporate_call 0.29% : 0.000002s : 2: substitution.incorporate_call_switch 63.56% : 0.000495s : 20: substitution.inline 2.26% : 0.000018s : 2: substitution.inline_without_move 1.11% : 0.000009s : 12: substitution.j_node_and_user_rematch 1.37% : 0.000011s : 7: substitution.minmaximum_grad 2.97% : 0.000023s : 11: substitution.partial_eliminate 1.20% : 0.000009s : 12: substitution.remove_not_recompute_node 3.97% : 0.000031s : 9: substitution.replace_applicator 0.89% : 0.000007s : 7: substitution.replace_old_param 1.28% : 0.000010s : 1: substitution.set_cell_output_no_recompute 3.44% : 0.000027s : 3: substitution.switch_simplify 2.87% : 0.000022s : 7: substitution.tuple_list_convert_item_index_to_positive 1.57% : 0.000012s : 7: substitution.tuple_list_get_item_const_eliminator 2.04% : 0.000016s : 7: substitution.tuple_list_get_item_depend_reorder 6.29% : 0.000049s : 16: substitution.tuple_list_get_item_eliminator 2.05% : 0.000016s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.061614 2 95.16% : 0.058630s : 1: type_inference.infer 4.84% : 0.002983s : 1: type_inference.specialize ------[replace.] 0.000246 30 60.01% : 0.000148s : 20: replace.inline 16.86% : 0.000041s : 3: replace.switch_simplify 23.13% : 0.000057s : 7: replace.tuple_list_get_item_eliminator ------[match.] 0.000527 30 91.98% : 0.000485s : 20: match.inline 4.18% : 0.000022s : 3: match.switch_simplify 3.84% : 0.000020s : 7: match.tuple_list_get_item_eliminator ------[predicate.] 0.000569 3814 1.14% : 0.000006s : 49: predicate.accumulaten_eliminater 0.24% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.42% : 0.000002s : 17: predicate.addn_check_dump 1.15% : 0.000007s : 49: predicate.addn_zero_filter 1.08% : 0.000006s : 49: predicate.adjust_all_reduce_mul_add 2.14% : 0.000012s : 66: predicate.arithmetic_simplify 1.19% : 0.000007s : 49: predicate.cast_eliminate 1.15% : 0.000007s : 44: predicate.check_bprop_eliminate 0.43% : 0.000002s : 17: predicate.compare_switch_simplify 0.04% : 0.000000s : 3: predicate.const_output_eliminate 0.42% : 0.000002s : 17: predicate.depend_value_elim 1.23% : 0.000007s : 49: predicate.dict_get_item_const_eliminator 1.39% : 0.000008s : 49: predicate.dict_get_item_eliminator 1.13% : 0.000006s : 49: predicate.dict_set_item_eliminator 0.27% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.06% : 0.000000s : 3: predicate.elim_not_effective 0.11% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000007s : 52: predicate.environ_add_const_eliminate 1.20% : 0.000007s : 52: predicate.environ_get_add_eliminate 1.16% : 0.000007s : 52: predicate.environ_get_depend_swap 1.57% : 0.000009s : 69: predicate.environ_get_eliminate 1.16% : 0.000007s : 52: predicate.environ_get_set_eliminate 1.87% : 0.000011s : 76: predicate.exchange_switch_depend_value 2.52% : 0.000014s : 76: predicate.float_depend_g_call 0.41% : 0.000002s : 17: predicate.float_environ_get_switch 0.51% : 0.000003s : 20: predicate.float_tuple_getitem_switch 0.05% : 0.000000s : 3: predicate.fold_const_symbol 0.51% : 0.000003s : 17: predicate.get_grad_eliminate 0.08% : 0.000000s : 3: predicate.graph_param_transform 0.47% : 0.000003s : 17: predicate.incorporate_call 0.40% : 0.000002s : 17: predicate.incorporate_call_switch 5.51% : 0.000031s : 165: predicate.inline 1.28% : 0.000007s : 38: predicate.inline_without_move 0.24% : 0.000001s : 17: predicate.j_node_and_user_rematch 0.57% : 0.000003s : 17: predicate.less_batch_normalization 1.51% : 0.000009s : 62: predicate.list_to_tuple_eliminator_ 2.55% : 0.000014s : 111: predicate.load_eliminater 0.31% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.99% : 0.000017s : 113: predicate.loop_unroll_before_grad 1.50% : 0.000009s : 55: predicate.make_slice_get_slice_eliminator 0.43% : 0.000002s : 17: predicate.merge_addn 1.13% : 0.000006s : 44: predicate.micro_step_allgather_replace 1.16% : 0.000007s : 44: predicate.mini_step_allgather_replace 1.10% : 0.000006s : 49: predicate.minmaximum_grad 0.31% : 0.000002s : 3: predicate.mutable_eliminate 0.12% : 0.000001s : 3: predicate.opt_reshape 0.12% : 0.000001s : 3: predicate.parallel_virtual_node 2.39% : 0.000014s : 76: predicate.partial_defer_inline 1.59% : 0.000009s : 59: predicate.partial_eliminate 1.19% : 0.000007s : 49: predicate.print_const_string_wrapper 0.46% : 0.000003s : 17: predicate.reduce_all_const_elim 1.48% : 0.000008s : 49: predicate.reduce_eliminate 2.58% : 0.000015s : 111: predicate.redundant_stop_gradient_eliminater 0.25% : 0.000001s : 17: predicate.remove_not_recompute_node 1.78% : 0.000010s : 100: predicate.replace_applicator 0.59% : 0.000003s : 38: predicate.replace_old_param 0.07% : 0.000000s : 3: predicate.reset_defer_inline 1.16% : 0.000007s : 49: predicate.reshape_eliminate 1.13% : 0.000006s : 44: predicate.row_tensor_add_zeros_like 0.12% : 0.000001s : 3: predicate.row_tensor_eliminate 1.39% : 0.000008s : 44: predicate.same_eliminate 0.30% : 0.000002s : 17: predicate.set_cell_output_no_recompute 0.57% : 0.000003s : 17: predicate.shard_identity_eliminate 0.22% : 0.000001s : 6: predicate.special_op_eliminate 0.47% : 0.000003s : 17: predicate.specialize_transform 1.27% : 0.000007s : 44: predicate.split_environ_get_set_with_tuple_value 1.22% : 0.000007s : 38: predicate.stack_unstack_eliminate 0.12% : 0.000001s : 3: predicate.switch_call_monad_eliminater 2.06% : 0.000012s : 76: predicate.switch_defer_inline 3.11% : 0.000018s : 120: predicate.switch_layer_defer_inline 5.94% : 0.000034s : 215: predicate.switch_simplify 1.10% : 0.000006s : 49: predicate.tile_eliminate 1.13% : 0.000006s : 49: predicate.transpose_eliminate 1.52% : 0.000009s : 55: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000009s : 55: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000008s : 55: predicate.tuple_list_get_item_depend_reorder 2.61% : 0.000015s : 79: predicate.tuple_list_get_item_eliminator 1.49% : 0.000008s : 55: predicate.tuple_list_get_set_item_eliminator 2.06% : 0.000012s : 72: predicate.tuple_list_set_item_eliminator 1.45% : 0.000008s : 62: predicate.tuple_to_list_eliminator_ 2.50% : 0.000014s : 111: predicate.updatestate_pure_node_eliminater 2.93% : 0.000017s : 128: predicate.updatestate_useless_node_eliminater 0.10% : 0.000001s : 3: predicate.value_based_eliminate 0.51% : 0.000003s : 17: predicate.virtual_dataset_eliminate 0.46% : 0.000003s : 17: predicate.virtual_output_eliminate 0.07% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.12% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002918 38 54.63% : 0.001594s : 14: func_graph_cloner_run.FuncGraphClonerGraph 45.37% : 0.001324s : 24: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.120884 237 0.00% : 0.000004s : 1: ForceFp32Comm 4.06% : 0.004909s : 1: add_attr 4.05% : 0.004895s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000063s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.14% : 0.000172s : 1: auto_monad 0.01% : 0.000017s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.52% : 0.000630s : 1: bootstrap 0.02% : 0.000022s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000008s : 1: environ_conv 0.15% : 0.000183s : 1: event_method 0.01% : 0.000012s : 1: execute 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.38% : 0.000464s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.52% : 0.000625s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000044s : 1: opt.transform.mutable_eliminate 3.50% : 0.004237s : 117: opt.transform.opt_a 0.02% : 0.000025s : 1: opt.transform.opt_after_cconv 0.02% : 0.000020s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000087s : 28: opt.transform.opt_b 0.03% : 0.000039s : 2: opt.transform.opt_trans_graph 0.03% : 0.000032s : 4: opt.transform.symbol_engine_opt 10.10% : 0.012206s : 1: opt_a 0.08% : 0.000098s : 1: opt_after_cconv 0.40% : 0.000479s : 1: opt_after_jit_grad 0.15% : 0.000183s : 1: opt_b 12.00% : 0.014509s : 1: optimize 0.04% : 0.000049s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000020s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000005s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.06% : 0.000067s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000015s : 1: remove_dup_value 1.76% : 0.002128s : 2: renormalize.infer 1.76% : 0.002127s : 2: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000019s : 1: rewriter_after_opt_a 0.26% : 0.000315s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000072s : 1: symbol_engine_optimizer 8.23% : 0.009949s : 1: task_emit 0.06% : 0.000067s : 1: tuple_transform 51.06% : 0.061725s : 1: type_inference 0.05% : 0.000056s : 1: validate TotalTime = 0.104776, [24] [bootstrap]: 0.00084879 [type_inference]: 0.0647374 [event_method]: 0.00021086 [auto_monad]: 0.00017566 [graph_reusing]: 9.37999e-06 [inline]: 3.36999e-06 [add_attr]: 0.0035398, [1] [add_attr_with_inline]: 0.00353054, [1] [Cycle 1]: 9.448e-05, [2] [tag_attr]: 4.916e-05 [meta_addattr_fg_expand]: 1.292e-05 [parallel-infer-symbol]: 3.14999e-06 [pre_auto_parallel]: 6.784e-05 [insert-virtual-dataset]: 2.84001e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 1.82999e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.0250904, [53] [py_interpret_to_execute]: 6.45002e-06 [rewriter_before_opt_a]: 0.00045682 [opt_a]: 0.0219258, [3] [Cycle 1]: 0.0170435, [45] [expand_dump_flag]: 3.76001e-06 [switch_simplify]: 0.00017734 [loop_unroll]: 8.849e-05 [a_1]: 0.00189114 [with_stream_mark]: 2.508e-05 [recompute_prepare]: 2.33e-05 [updatestate_depend_eliminate]: 9.74e-06 [updatestate_assign_eliminate]: 8.36002e-06 [updatestate_loads_eliminate]: 8.22e-06 [parameter_eliminate]: 2.57001e-06 [a_2]: 0.00026166 [accelerated_algorithm]: 7.036e-05 [shard]: 1.64e-06 [meta_shard_fg_expand]: 5.08002e-06 [shard_inline]: 1.766e-05 [merge_send_recv]: 1.788e-05 [auto_parallel]: 1.256e-05 [parallel]: 3.342e-05 [flash_sp]: 1.056e-05 [merge_comm]: 9.81e-06 [allreduce_fusion]: 9.54999e-06 [matmul_add_comm_reduction]: 2.679e-05 [allreduce_slice_to_reducescatter]: 6.49976e-07 [virtual_shard_identity]: 1.925e-05 [virtual_dataset]: 1.711e-05 [get_grad_eliminate_]: 1.768e-05 [virtual_output]: 1.764e-05 [merge_forward]: 9.41998e-06 [cell_reuse_recompute_pass]: 1.17999e-06 [offload_activation]: 1.979e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.057e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 2.884e-05 [set_forward_comm_id_for_comm_node_pass]: 9.74e-06 [meta_fg_expand]: 0.00175811 [flash_sp_send_recv_attached]: 7.23e-06 [receive_attached]: 2.44001e-06 [after_resolve]: 7.181e-05 [a_after_grad]: 9.334e-05 [renormalize]: 0.0111018 [add_forward_monad_depend]: 1.107e-05 [auto_monad_grad]: 6.29999e-06 [auto_monad_eliminator]: 7.036e-05 [cse]: 0.00032986 [a_3]: 0.00040586 [Cycle 2]: 0.00379145, [45] [expand_dump_flag]: 1.76e-06 [switch_simplify]: 5.467e-05 [loop_unroll]: 5.103e-05 [a_1]: 0.00160293 [with_stream_mark]: 1.777e-05 [recompute_prepare]: 1.379e-05 [updatestate_depend_eliminate]: 7.71001e-06 [updatestate_assign_eliminate]: 7.04001e-06 [updatestate_loads_eliminate]: 6.39001e-06 [parameter_eliminate]: 2.03002e-06 [a_2]: 0.00016559 [accelerated_algorithm]: 1.611e-05 [shard]: 1.94999e-06 [meta_shard_fg_expand]: 2.64001e-06 [shard_inline]: 1.133e-05 [merge_send_recv]: 1.239e-05 [auto_parallel]: 1.339e-05 [parallel]: 8.59e-06 [flash_sp]: 4.4e-06 [merge_comm]: 7.71999e-06 [allreduce_fusion]: 7.58001e-06 [matmul_add_comm_reduction]: 1.322e-05 [allreduce_slice_to_reducescatter]: 6.80011e-07 [virtual_shard_identity]: 1.249e-05 [virtual_dataset]: 1.109e-05 [get_grad_eliminate_]: 1.107e-05 [virtual_output]: 1.101e-05 [merge_forward]: 5.91998e-06 [cell_reuse_recompute_pass]: 9.20001e-07 [offload_activation]: 1.558e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.179e-05 [merge_recompute_call_nodes]: 1.38002e-06 [before_grad]: 1.842e-05 [set_forward_comm_id_for_comm_node_pass]: 7.21001e-06 [meta_fg_expand]: 0.0001202 [flash_sp_send_recv_attached]: 1.86e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 1.671e-05 [a_after_grad]: 1.718e-05 [renormalize]: 0.00102756 [add_forward_monad_depend]: 4.4e-06 [auto_monad_grad]: 1.51002e-06 [auto_monad_eliminator]: 1.983e-05 [cse]: 0.00010964 [a_3]: 8.447e-05 [Cycle 3]: 0.00107457, [45] [expand_dump_flag]: 1.64998e-06 [switch_simplify]: 1.352e-05 [loop_unroll]: 1.158e-05 [a_1]: 0.00026451 [with_stream_mark]: 1.226e-05 [recompute_prepare]: 1.096e-05 [updatestate_depend_eliminate]: 6.36998e-06 [updatestate_assign_eliminate]: 5.46e-06 [updatestate_loads_eliminate]: 6.00002e-06 [parameter_eliminate]: 1.02e-06 [a_2]: 0.00016119 [accelerated_algorithm]: 4.73e-05 [shard]: 1.07e-06 [meta_shard_fg_expand]: 2.10002e-06 [shard_inline]: 1.148e-05 [merge_send_recv]: 9.14e-06 [auto_parallel]: 9.56998e-06 [parallel]: 5.47999e-06 [flash_sp]: 1.09e-06 [merge_comm]: 6.61999e-06 [allreduce_fusion]: 6.89001e-06 [matmul_add_comm_reduction]: 1.047e-05 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 1.247e-05 [virtual_dataset]: 1.103e-05 [get_grad_eliminate_]: 1.081e-05 [virtual_output]: 1.061e-05 [merge_forward]: 6.07999e-06 [cell_reuse_recompute_pass]: 1.52001e-06 [offload_activation]: 1.232e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.085e-05 [merge_recompute_call_nodes]: 7.79983e-07 [before_grad]: 1.835e-05 [set_forward_comm_id_for_comm_node_pass]: 6.91999e-06 [meta_fg_expand]: 4.1e-06 [flash_sp_send_recv_attached]: 8.80013e-07 [receive_attached]: 9.60019e-07 [after_resolve]: 1.399e-05 [a_after_grad]: 1.712e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.49e-06 [auto_monad_grad]: 1.07e-06 [auto_monad_eliminator]: 1.444e-05 [cse]: 3.698e-05 [a_3]: 7.694e-05 [py_interpret_to_execute_after_opt_a]: 5.15999e-06 [slice_cell_reuse_recomputed_activation]: 1.94e-06 [rewriter_after_opt_a]: 3.716e-05 [convert_after_rewriter]: 1.15999e-06 [order_py_execute_after_rewriter]: 1.04e-06 [mutable_eliminate]: 0.00058664 [opt_b]: 0.00040339, [1] [Cycle 1]: 0.00039692, [7] [b_1]: 0.00028447 [b_2]: 1.364e-05 [updatestate_depend_eliminate]: 8.75999e-06 [updatestate_assign_eliminate]: 5.77001e-06 [updatestate_loads_eliminate]: 5.82001e-06 [renormalize]: 3.69997e-07 [cse]: 4.285e-05 [optimize_parallel_all_gather_comm]: 2.639e-05 [overlap_param_gather]: 2.26998e-06 [cconv]: 1.851e-05 [loop_unroll]: 0.00047412 [opt_after_cconv]: 0.0001552, [1] [Cycle 1]: 0.00014948, [7] [c_1]: 5.296e-05 [parameter_eliminate]: 2.49999e-06 [updatestate_depend_eliminate]: 9.07001e-06 [updatestate_assign_eliminate]: 5.88998e-06 [updatestate_loads_eliminate]: 5.76998e-06 [cse]: 4.022e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 4.885e-05 [tuple_transform]: 0.00012771, [1] [Cycle 1]: 0.00012313, [4] [d_1]: 8.956e-05 [none_parameter_eliminate]: 1.42999e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 1.252e-05 [partial_unused_args_eliminate]: 1.32999e-06 [add_recomputation]: 6.132e-05 [cse_after_recomputation]: 3.887e-05, [1] [Cycle 1]: 3.421e-05, [1] [cse]: 2.859e-05 [environ_conv]: 8.88002e-06 [swap_dp_allreduce_reducescatter]: 9.25999e-06 [bias_add_comm_swap]: 2.76e-06 [label_micro_interleaved_index]: 4.45e-06 [label_fine_grained_interleaved_index]: 2.52001e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.01e-06 [micro_interleaved_order_control]: 2.74999e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.04e-06 [full_micro_interleaved_order_control]: 2.39001e-06 [reorder_send_recv_between_fp_bp]: 3.09001e-06 [comm_op_add_attrs]: 1.35001e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 7.13998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.89999e-06 [control_data_broadcast_order]: 2.168e-05 [grouped_pairwise_exchange_alltoall]: 1.58002e-06 [offloading_packed_experts]: 5.78002e-06 [overlap_recompute_and_grad_model_parallel]: 6.91001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.77002e-06 [overlap_grad_ring_attention]: 6.43e-06 [overlap_grad_flash_sp]: 2.924e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.27001e-06 [split_layernorm_comm]: 1.84e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 0.0003509, [1] [Cycle 1]: 0.00034583, [6] [build]: 0.0002076 [elim_shapecalc]: 1.963e-05 [elim_not_effective]: 4.068e-05 [opt_reshape]: 1.269e-05 [fold_const_symbol]: 2.923e-05 [renormalize]: 2.09984e-07 [detach_backward]: 2.41998e-06 [pipeline_parallel_scheduler]: 1.44998e-06 [auto_monad_reorder]: 2.244e-05 [get_jit_bprop_graph]: 1.50999e-06 [rewriter_after_jit_bprop_graph]: 4.92999e-06 [opt_after_jit_grad]: 0.00050031 [validate]: 5.175e-05 [backend_pass]: 7.39994e-07 [task_emit]: 0.00924349 [execute]: 8.15999e-06 Sums bootstrap : 0.000849s : 0.85% type_inference : 0.064737s : 64.78% event_method : 0.000211s : 0.21% auto_monad : 0.000176s : 0.18% graph_reusing : 0.000009s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000049s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000068s : 0.07% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.01% optimize.rewriter_before_opt_a : 0.000457s : 0.46% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000246s : 0.25% optimize.opt_a.loop_unroll : 0.000151s : 0.15% optimize.opt_a.a_1 : 0.003759s : 3.76% optimize.opt_a.with_stream_mark : 0.000055s : 0.06% optimize.opt_a.recompute_prepare : 0.000048s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000024s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000021s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000021s : 0.02% optimize.opt_a.parameter_eliminate : 0.000006s : 0.01% optimize.opt_a.a_2 : 0.000588s : 0.59% optimize.opt_a.accelerated_algorithm : 0.000134s : 0.13% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000010s : 0.01% optimize.opt_a.shard_inline : 0.000040s : 0.04% optimize.opt_a.merge_send_recv : 0.000039s : 0.04% optimize.opt_a.auto_parallel : 0.000036s : 0.04% optimize.opt_a.parallel : 0.000047s : 0.05% optimize.opt_a.flash_sp : 0.000016s : 0.02% optimize.opt_a.merge_comm : 0.000024s : 0.02% optimize.opt_a.allreduce_fusion : 0.000024s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000050s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000044s : 0.04% optimize.opt_a.virtual_dataset : 0.000039s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000040s : 0.04% optimize.opt_a.virtual_output : 0.000039s : 0.04% optimize.opt_a.merge_forward : 0.000021s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000048s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000073s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000066s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000024s : 0.02% optimize.opt_a.meta_fg_expand : 0.001882s : 1.88% optimize.opt_a.flash_sp_send_recv_attached : 0.000010s : 0.01% optimize.opt_a.receive_attached : 0.000006s : 0.01% optimize.opt_a.after_resolve : 0.000103s : 0.10% optimize.opt_a.a_after_grad : 0.000128s : 0.13% optimize.opt_a.renormalize : 0.012129s : 12.14% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.02% optimize.opt_a.auto_monad_grad : 0.000009s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000105s : 0.10% optimize.opt_a.cse : 0.000476s : 0.48% optimize.opt_a.a_3 : 0.000567s : 0.57% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000037s : 0.04% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000587s : 0.59% optimize.opt_b.b_1 : 0.000284s : 0.28% optimize.opt_b.b_2 : 0.000014s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000043s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000019s : 0.02% optimize.loop_unroll : 0.000474s : 0.47% optimize.opt_after_cconv.c_1 : 0.000053s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.cse : 0.000040s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000049s : 0.05% optimize.tuple_transform.d_1 : 0.000090s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000013s : 0.01% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000061s : 0.06% optimize.cse_after_recomputation.cse : 0.000029s : 0.03% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000007s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000022s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000006s : 0.01% optimize.overlap_grad_flash_sp : 0.000029s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000208s : 0.21% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000020s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000041s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000013s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000029s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000022s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000500s : 0.50% validate : 0.000052s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.009243s : 9.25% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.001016 272 1.96% : 0.000020s : 7: substitution.elim_not_effective 0.92% : 0.000009s : 14: substitution.float_depend_g_call 1.05% : 0.000011s : 6: substitution.float_tuple_getitem_switch 1.17% : 0.000012s : 7: substitution.fold_const_symbol 0.64% : 0.000006s : 8: substitution.graph_param_transform 0.26% : 0.000003s : 2: substitution.incorporate_call 0.18% : 0.000002s : 2: substitution.incorporate_call_switch 55.66% : 0.000566s : 23: substitution.inline 1.72% : 0.000017s : 2: substitution.inline_without_move 1.21% : 0.000012s : 24: substitution.j_node_and_user_rematch 5.25% : 0.000053s : 3: substitution.less_batch_normalization 1.65% : 0.000017s : 13: substitution.minmaximum_grad 2.52% : 0.000026s : 14: substitution.partial_eliminate 1.59% : 0.000016s : 24: substitution.remove_not_recompute_node 2.45% : 0.000025s : 9: substitution.replace_applicator 0.99% : 0.000010s : 15: substitution.replace_old_param 0.24% : 0.000002s : 1: substitution.set_cell_output_no_recompute 1.26% : 0.000013s : 4: substitution.switch_simplify 4.07% : 0.000041s : 15: substitution.tuple_list_convert_item_index_to_positive 1.92% : 0.000020s : 15: substitution.tuple_list_get_item_const_eliminator 2.56% : 0.000026s : 15: substitution.tuple_list_get_item_depend_reorder 8.03% : 0.000082s : 34: substitution.tuple_list_get_item_eliminator 2.71% : 0.000028s : 15: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.064644 2 94.81% : 0.061290s : 1: type_inference.infer 5.19% : 0.003353s : 1: type_inference.specialize ------[replace.] 0.000300 41 56.83% : 0.000170s : 23: replace.inline 14.20% : 0.000043s : 4: replace.switch_simplify 28.97% : 0.000087s : 14: replace.tuple_list_get_item_eliminator ------[match.] 0.000593 41 93.59% : 0.000555s : 23: match.inline 1.70% : 0.000010s : 4: match.switch_simplify 4.71% : 0.000028s : 14: match.tuple_list_get_item_eliminator ------[predicate.] 0.000876 6274 1.05% : 0.000009s : 76: predicate.accumulaten_eliminater 0.28% : 0.000002s : 8: predicate.ad_related_special_op_eliminate 0.47% : 0.000004s : 33: predicate.addn_check_dump 1.12% : 0.000010s : 76: predicate.addn_zero_filter 1.05% : 0.000009s : 76: predicate.adjust_all_reduce_mul_add 1.85% : 0.000016s : 109: predicate.arithmetic_simplify 1.09% : 0.000010s : 76: predicate.cast_eliminate 1.03% : 0.000009s : 70: predicate.check_bprop_eliminate 0.48% : 0.000004s : 33: predicate.compare_switch_simplify 0.07% : 0.000001s : 8: predicate.const_output_eliminate 0.50% : 0.000004s : 33: predicate.depend_value_elim 1.16% : 0.000010s : 76: predicate.dict_get_item_const_eliminator 1.18% : 0.000010s : 76: predicate.dict_get_item_eliminator 1.09% : 0.000010s : 76: predicate.dict_set_item_eliminator 0.34% : 0.000003s : 16: predicate.dumpgradient_eliminate 0.10% : 0.000001s : 8: predicate.elim_not_effective 0.21% : 0.000002s : 8: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000011s : 84: predicate.environ_add_const_eliminate 1.17% : 0.000010s : 84: predicate.environ_get_add_eliminate 1.19% : 0.000010s : 84: predicate.environ_get_depend_swap 1.70% : 0.000015s : 117: predicate.environ_get_eliminate 1.17% : 0.000010s : 84: predicate.environ_get_set_eliminate 1.66% : 0.000015s : 113: predicate.exchange_switch_depend_value 2.22% : 0.000019s : 113: predicate.float_depend_g_call 0.47% : 0.000004s : 33: predicate.float_environ_get_switch 0.64% : 0.000006s : 41: predicate.float_tuple_getitem_switch 0.08% : 0.000001s : 8: predicate.fold_const_symbol 0.55% : 0.000005s : 33: predicate.get_grad_eliminate 0.08% : 0.000001s : 8: predicate.graph_param_transform 0.51% : 0.000004s : 33: predicate.incorporate_call 0.46% : 0.000004s : 33: predicate.incorporate_call_switch 5.33% : 0.000047s : 271: predicate.inline 1.21% : 0.000011s : 61: predicate.inline_without_move 0.27% : 0.000002s : 33: predicate.j_node_and_user_rematch 0.65% : 0.000006s : 33: predicate.less_batch_normalization 1.57% : 0.000014s : 106: predicate.list_to_tuple_eliminator_ 2.55% : 0.000022s : 182: predicate.load_eliminater 0.30% : 0.000003s : 8: predicate.loop_unroll_after_grad 2.62% : 0.000023s : 168: predicate.loop_unroll_before_grad 1.42% : 0.000012s : 92: predicate.make_slice_get_slice_eliminator 0.50% : 0.000004s : 33: predicate.merge_addn 1.03% : 0.000009s : 70: predicate.micro_step_allgather_replace 1.03% : 0.000009s : 70: predicate.mini_step_allgather_replace 1.08% : 0.000009s : 76: predicate.minmaximum_grad 0.29% : 0.000003s : 8: predicate.mutable_eliminate 0.16% : 0.000001s : 8: predicate.opt_reshape 0.23% : 0.000002s : 8: predicate.parallel_virtual_node 4.77% : 0.000042s : 113: predicate.partial_defer_inline 1.61% : 0.000014s : 98: predicate.partial_eliminate 1.09% : 0.000010s : 76: predicate.print_const_string_wrapper 0.50% : 0.000004s : 33: predicate.reduce_all_const_elim 1.25% : 0.000011s : 76: predicate.reduce_eliminate 2.57% : 0.000022s : 182: predicate.redundant_stop_gradient_eliminater 0.29% : 0.000003s : 33: predicate.remove_not_recompute_node 1.73% : 0.000015s : 160: predicate.replace_applicator 0.60% : 0.000005s : 61: predicate.replace_old_param 0.08% : 0.000001s : 8: predicate.reset_defer_inline 1.12% : 0.000010s : 76: predicate.reshape_eliminate 1.05% : 0.000009s : 70: predicate.row_tensor_add_zeros_like 0.14% : 0.000001s : 8: predicate.row_tensor_eliminate 1.21% : 0.000011s : 70: predicate.same_eliminate 0.33% : 0.000003s : 33: predicate.set_cell_output_no_recompute 0.61% : 0.000005s : 33: predicate.shard_identity_eliminate 0.30% : 0.000003s : 16: predicate.special_op_eliminate 0.56% : 0.000005s : 33: predicate.specialize_transform 1.17% : 0.000010s : 70: predicate.split_environ_get_set_with_tuple_value 1.15% : 0.000010s : 61: predicate.stack_unstack_eliminate 0.14% : 0.000001s : 8: predicate.switch_call_monad_eliminater 1.82% : 0.000016s : 113: predicate.switch_defer_inline 2.81% : 0.000025s : 183: predicate.switch_layer_defer_inline 5.37% : 0.000047s : 330: predicate.switch_simplify 1.06% : 0.000009s : 76: predicate.tile_eliminate 1.07% : 0.000009s : 76: predicate.transpose_eliminate 1.43% : 0.000012s : 92: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000013s : 92: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000013s : 92: predicate.tuple_list_get_item_depend_reorder 2.56% : 0.000022s : 139: predicate.tuple_list_get_item_eliminator 1.51% : 0.000013s : 92: predicate.tuple_list_get_set_item_eliminator 1.95% : 0.000017s : 125: predicate.tuple_list_set_item_eliminator 1.55% : 0.000014s : 106: predicate.tuple_to_list_eliminator_ 2.56% : 0.000022s : 182: predicate.updatestate_pure_node_eliminater 3.07% : 0.000027s : 215: predicate.updatestate_useless_node_eliminater 0.15% : 0.000001s : 8: predicate.value_based_eliminate 0.55% : 0.000005s : 33: predicate.virtual_dataset_eliminate 0.56% : 0.000005s : 33: predicate.virtual_output_eliminate 0.14% : 0.000001s : 8: predicate.virtual_view_grad_eliminate 0.17% : 0.000002s : 8: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002720 44 55.73% : 0.001516s : 17: func_graph_cloner_run.FuncGraphClonerGraph 44.27% : 0.001204s : 27: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.151933 237 0.00% : 0.000003s : 1: ForceFp32Comm 2.33% : 0.003544s : 1: add_attr 2.33% : 0.003535s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000065s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.12% : 0.000187s : 1: auto_monad 0.02% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.59% : 0.000892s : 1: bootstrap 0.01% : 0.000022s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000025s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000042s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.15% : 0.000221s : 1: event_method 0.01% : 0.000014s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.32% : 0.000483s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.39% : 0.000596s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000021s : 1: opt.transform.mutable_eliminate 3.88% : 0.005901s : 117: opt.transform.opt_a 0.03% : 0.000052s : 1: opt.transform.opt_after_cconv 0.03% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000269s : 28: opt.transform.opt_b 0.07% : 0.000100s : 2: opt.transform.opt_trans_graph 0.06% : 0.000098s : 4: opt.transform.symbol_engine_opt 14.43% : 0.021930s : 1: opt_a 0.10% : 0.000158s : 1: opt_after_cconv 0.34% : 0.000511s : 1: opt_after_jit_grad 0.27% : 0.000407s : 1: opt_b 16.52% : 0.025096s : 1: optimize 0.02% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000033s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000010s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000073s : 1: pre_auto_parallel 0.01% : 0.000010s : 1: py_interpret_to_execute 0.01% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000053s : 1: remove_dup_value 6.39% : 0.009711s : 2: renormalize.infer 1.58% : 0.002402s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000041s : 1: rewriter_after_opt_a 0.31% : 0.000463s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.23% : 0.000354s : 1: symbol_engine_optimizer 6.10% : 0.009262s : 1: task_emit 0.09% : 0.000131s : 1: tuple_transform 42.62% : 0.064755s : 1: type_inference 0.06% : 0.000087s : 1: validate .group_cases_9 have all been run, results of sub cases are below: case: (1,) {} pass. case: (1,) {} pass. case: ('pynative',) {} pass. case: (0,) {} pass. case: ('KBK',) {} pass. case: (0,) {} pass. case: ('KBK',) {} pass. case: ('pynative',) {} pass. ops group_cases_10 with 8 cases start to running, all cases are below: case: (, 0) case: (, 1) case: (, 0) case: (, 1) case: (, mindspore.float32) case: (, mindspore.float16) case: (, 0) case: (, 1) ops group_cases_10 total running memory: 32M, memory threshold: 51200M [WARNING] SESSION(15834,ffffbf434f30,python3.9):2026-01-29-17:47:02.694.161 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:266] IbTryExpandCNode] After expanding cnode Default/InplaceIndexPut-op0, the new abstract of Expand/_InplaceIndexPut/InnerInplaceIndexPut-op0 does not match original cnode's abstract. new: AbstractTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), old: AbstractRefTensor(key: 0xaaaae028c7b01, ref_value: AbstractRefTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), value: ValueAny, is_inplace) [WARNING] SESSION(15834,ffffbf434f30,python3.9):2026-01-29-17:47:02.694.263 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:274] IbTryExpandCNode] Restore new abstract to AbstractRefTensor new:AbstractRefTensor(key: 0xaaaae028c7b01, ref_value: AbstractRefTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), value: ValueAny) [WARNING] SESSION(15824,ffffbf434f30,python3.9):2026-01-29-17:47:02.739.126 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:266] IbTryExpandCNode] After expanding cnode Gradients/Default/Grad_Index/InplaceIndexPut-op0, the new abstract of Expand/_InplaceIndexPut/InnerInplaceIndexPut-op0 does not match original cnode's abstract. new: AbstractTensor(shape: (4, 3, 2, 4), element: AbstractScalar(Type: Float64, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), old: AbstractRefTensor(key: 0xaaaae01d15e01, ref_value: AbstractRefTensor(shape: (4, 3, 2, 4), element: AbstractScalar(Type: Float64, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), value: ValueAny, is_inplace) [WARNING] SESSION(15824,ffffbf434f30,python3.9):2026-01-29-17:47:02.739.269 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:274] IbTryExpandCNode] Restore new abstract to AbstractRefTensor new:AbstractRefTensor(key: 0xaaaae01d15e01, ref_value: AbstractRefTensor(shape: (4, 3, 2, 4), element: AbstractScalar(Type: Float64, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), value: ValueAny) TotalTime = 0.684413, [24] [bootstrap]: 0.00109911 [type_inference]: 0.16367 [event_method]: 2.121e-05 [auto_monad]: 0.00048474 [graph_reusing]: 6.74001e-06 [inline]: 2.07001e-06 [add_attr]: 0.00790281, [1] [add_attr_with_inline]: 0.00788107, [1] [Cycle 1]: 0.00012374, [2] [tag_attr]: 3.036e-05 [meta_addattr_fg_expand]: 1.302e-05 [parallel-infer-symbol]: 2.92002e-06 [pre_auto_parallel]: 4.754e-05 [insert-virtual-dataset]: 2.44999e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.69e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.00668773, [53] [py_interpret_to_execute]: 3.86999e-06 [rewriter_before_opt_a]: 0.00021949 [opt_a]: 0.00420761, [2] [Cycle 1]: 0.00356962, [45] [expand_dump_flag]: 3.31001e-06 [switch_simplify]: 7.19e-05 [loop_unroll]: 3.396e-05 [a_1]: 0.00057052 [with_stream_mark]: 1.528e-05 [recompute_prepare]: 7.61999e-06 [updatestate_depend_eliminate]: 1.221e-05 [updatestate_assign_eliminate]: 1.011e-05 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 1.72001e-06 [a_2]: 8.331e-05 [accelerated_algorithm]: 6.60997e-06 [shard]: 1.75001e-06 [meta_shard_fg_expand]: 2.07001e-06 [shard_inline]: 6.06e-06 [merge_send_recv]: 3.947e-05 [auto_parallel]: 5.82999e-06 [parallel]: 8.703e-05 [flash_sp]: 2.934e-05 [merge_comm]: 4.05e-06 [allreduce_fusion]: 1.06e-05 [matmul_add_comm_reduction]: 1.568e-05 [allreduce_slice_to_reducescatter]: 7.48e-06 [virtual_shard_identity]: 9.14e-06 [virtual_dataset]: 7.03e-06 [get_grad_eliminate_]: 6.40002e-06 [virtual_output]: 6.20002e-06 [merge_forward]: 3.83001e-06 [cell_reuse_recompute_pass]: 1.14e-06 [offload_activation]: 1.538e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.134e-05 [merge_recompute_call_nodes]: 1.59998e-06 [before_grad]: 1.033e-05 [set_forward_comm_id_for_comm_node_pass]: 1.062e-05 [meta_fg_expand]: 3.26001e-06 [flash_sp_send_recv_attached]: 2.50002e-06 [receive_attached]: 1.608e-05 [after_resolve]: 1.039e-05 [a_after_grad]: 9.71e-06 [renormalize]: 0.00204785 [add_forward_monad_depend]: 5.96e-06 [auto_monad_grad]: 2.12999e-06 [auto_monad_eliminator]: 2.374e-05 [cse]: 4.414e-05 [a_3]: 4.597e-05 [Cycle 2]: 0.00062761, [45] [expand_dump_flag]: 1.04003e-06 [switch_simplify]: 8.05999e-06 [loop_unroll]: 6.24001e-06 [a_1]: 0.00012289 [with_stream_mark]: 1.12e-05 [recompute_prepare]: 6.07999e-06 [updatestate_depend_eliminate]: 2.83e-06 [updatestate_assign_eliminate]: 2.20002e-06 [updatestate_loads_eliminate]: 2.56e-06 [parameter_eliminate]: 9.09989e-07 [a_2]: 7.142e-05 [accelerated_algorithm]: 6.14001e-06 [shard]: 1.22999e-06 [meta_shard_fg_expand]: 1.30001e-06 [shard_inline]: 6.23e-06 [merge_send_recv]: 4.51002e-06 [auto_parallel]: 5.07999e-06 [parallel]: 3.89002e-06 [flash_sp]: 2.83e-06 [merge_comm]: 2.74999e-06 [allreduce_fusion]: 2.57001e-06 [matmul_add_comm_reduction]: 4.77e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 6.97002e-06 [virtual_dataset]: 5.99999e-06 [get_grad_eliminate_]: 5.65001e-06 [virtual_output]: 5.71e-06 [merge_forward]: 2.76999e-06 [cell_reuse_recompute_pass]: 1.22999e-06 [offload_activation]: 6.02001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.331e-05 [merge_recompute_call_nodes]: 6.59988e-07 [before_grad]: 8.67e-06 [set_forward_comm_id_for_comm_node_pass]: 3.43999e-06 [meta_fg_expand]: 1.80001e-06 [flash_sp_send_recv_attached]: 8.79983e-07 [receive_attached]: 1.02e-06 [after_resolve]: 9.19e-06 [a_after_grad]: 1.029e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.35001e-06 [auto_monad_grad]: 7.89994e-07 [auto_monad_eliminator]: 6.81999e-06 [cse]: 1.406e-05 [a_3]: 4.071e-05 [py_interpret_to_execute_after_opt_a]: 4.60001e-06 [slice_cell_reuse_recomputed_activation]: 2.26e-06 [rewriter_after_opt_a]: 2.771e-05 [convert_after_rewriter]: 2.54999e-06 [order_py_execute_after_rewriter]: 1.21002e-06 [mutable_eliminate]: 0.0006529 [opt_b]: 0.00022357, [1] [Cycle 1]: 0.00021708, [7] [b_1]: 0.00014139 [b_2]: 8.72e-06 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 2.82002e-06 [updatestate_loads_eliminate]: 2.47001e-06 [renormalize]: 5.3001e-07 [cse]: 1.877e-05 [optimize_parallel_all_gather_comm]: 2.766e-05 [overlap_param_gather]: 1.044e-05 [cconv]: 2.343e-05 [loop_unroll]: 0.00045096 [opt_after_cconv]: 0.00010105, [1] [Cycle 1]: 9.517e-05, [7] [c_1]: 3.007e-05 [parameter_eliminate]: 2.19001e-06 [updatestate_depend_eliminate]: 5.12999e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.37999e-06 [cse]: 1.913e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.416e-05 [tuple_transform]: 7.952e-05, [1] [Cycle 1]: 7.5e-05, [4] [d_1]: 4.813e-05 [none_parameter_eliminate]: 1.61002e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 6.63e-06 [partial_unused_args_eliminate]: 1.57001e-06 [add_recomputation]: 5.423e-05 [cse_after_recomputation]: 2.117e-05, [1] [Cycle 1]: 1.697e-05, [1] [cse]: 1.166e-05 [environ_conv]: 2.367e-05 [swap_dp_allreduce_reducescatter]: 2.292e-05 [bias_add_comm_swap]: 1.028e-05 [label_micro_interleaved_index]: 1.338e-05 [label_fine_grained_interleaved_index]: 2.76999e-06 [merge_cast_opt]: 1.75001e-06 [slice_recompute_activation]: 2.01e-06 [micro_interleaved_order_control]: 2.34999e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 8.92e-06 [full_micro_interleaved_order_control]: 9.42001e-06 [reorder_send_recv_between_fp_bp]: 2.56e-06 [comm_op_add_attrs]: 1.37e-06 [add_comm_op_reuse_tag]: 1.30999e-06 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 8.13999e-06 [overlap_opt_shard_in_pipeline]: 2.279e-05 [overlap_opt_shard_grad_in_pipeline]: 1.77001e-06 [control_data_broadcast_order]: 1.157e-05 [grouped_pairwise_exchange_alltoall]: 1.80001e-06 [offloading_packed_experts]: 3.74002e-06 [overlap_recompute_and_grad_model_parallel]: 1.134e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 2.79001e-06 [overlap_grad_ring_attention]: 1.782e-05 [overlap_grad_flash_sp]: 3.78e-05 [begin_end_overlap_inline]: 4.39992e-07 [split_matmul_comm_elemetwise]: 9.36e-06 [split_layernorm_comm]: 1.77999e-06 [handle_group_info]: 9.30013e-07 [symbol_engine_optimizer]: 7.475e-05, [1] [Cycle 1]: 6.955e-05, [6] [build]: 2.17001e-06 [elim_shapecalc]: 9.97999e-06 [elim_not_effective]: 1.329e-05 [opt_reshape]: 7.14001e-06 [fold_const_symbol]: 9.74e-06 [renormalize]: 3.10014e-07 [detach_backward]: 1.60001e-06 [pipeline_parallel_scheduler]: 1.55001e-06 [auto_monad_reorder]: 2.019e-05 [get_jit_bprop_graph]: 1.02e-06 [rewriter_after_jit_bprop_graph]: 3.21001e-06 [opt_after_jit_grad]: 0.00050269 [validate]: 7.55e-05 [backend_pass]: 1.09998e-06 [task_emit]: 0.503605 [execute]: 8.84998e-06 Sums bootstrap : 0.001099s : 0.16% type_inference : 0.163670s : 24.23% event_method : 0.000021s : 0.00% auto_monad : 0.000485s : 0.07% graph_reusing : 0.000007s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000048s : 0.01% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000219s : 0.03% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000080s : 0.01% optimize.opt_a.loop_unroll : 0.000040s : 0.01% optimize.opt_a.a_1 : 0.000693s : 0.10% optimize.opt_a.with_stream_mark : 0.000026s : 0.00% optimize.opt_a.recompute_prepare : 0.000014s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000012s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000155s : 0.02% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000012s : 0.00% optimize.opt_a.merge_send_recv : 0.000044s : 0.01% optimize.opt_a.auto_parallel : 0.000011s : 0.00% optimize.opt_a.parallel : 0.000091s : 0.01% optimize.opt_a.flash_sp : 0.000032s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000013s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000008s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.00% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000014s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000017s : 0.00% optimize.opt_a.after_resolve : 0.000020s : 0.00% optimize.opt_a.a_after_grad : 0.000020s : 0.00% optimize.opt_a.renormalize : 0.002048s : 0.30% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.00% optimize.opt_a.cse : 0.000058s : 0.01% optimize.opt_a.a_3 : 0.000087s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.00% optimize.convert_after_rewriter : 0.000003s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000653s : 0.10% optimize.opt_b.b_1 : 0.000141s : 0.02% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.00% optimize.overlap_param_gather : 0.000010s : 0.00% optimize.cconv : 0.000023s : 0.00% optimize.loop_unroll : 0.000451s : 0.07% optimize.opt_after_cconv.c_1 : 0.000030s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.00% optimize.tuple_transform.d_1 : 0.000048s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000054s : 0.01% optimize.cse_after_recomputation.cse : 0.000012s : 0.00% optimize.environ_conv : 0.000024s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000023s : 0.00% optimize.bias_add_comm_swap : 0.000010s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000009s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000023s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000011s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000018s : 0.00% optimize.overlap_grad_flash_sp : 0.000038s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000009s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000503s : 0.07% validate : 0.000075s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 0.503605s : 74.56% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000175 25 1.24% : 0.000002s : 2: substitution.elim_not_effective 0.75% : 0.000001s : 2: substitution.fold_const_symbol 3.35% : 0.000006s : 4: substitution.graph_param_transform 76.53% : 0.000134s : 5: substitution.inline 1.72% : 0.000003s : 4: substitution.j_node_and_user_rematch 7.76% : 0.000014s : 4: substitution.remove_not_recompute_node 1.87% : 0.000003s : 2: substitution.replace_old_param 6.77% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.163584 2 98.56% : 0.161233s : 1: type_inference.infer 1.44% : 0.002351s : 1: type_inference.specialize ------[replace.] 0.000058 7 74.24% : 0.000043s : 5: replace.inline 25.76% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000142 7 92.50% : 0.000131s : 5: match.inline 7.50% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000193 1267 0.82% : 0.000002s : 13: predicate.accumulaten_eliminater 0.87% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 13: predicate.addn_zero_filter 0.81% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.18% : 0.000004s : 21: predicate.arithmetic_simplify 0.90% : 0.000002s : 13: predicate.cast_eliminate 0.65% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.53% : 0.000001s : 8: predicate.depend_value_elim 0.87% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.03% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.16% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.18% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_depend_swap 1.71% : 0.000003s : 25: predicate.environ_get_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.35% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.17% : 0.000004s : 20: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000001s : 8: predicate.get_grad_eliminate 0.23% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.49% : 0.000001s : 8: predicate.incorporate_call_switch 5.67% : 0.000011s : 57: predicate.inline 0.85% : 0.000002s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 8: predicate.less_batch_normalization 1.89% : 0.000004s : 23: predicate.list_to_tuple_eliminator_ 2.35% : 0.000005s : 36: predicate.load_eliminater 0.94% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.84% : 0.000005s : 41: predicate.loop_unroll_before_grad 1.69% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 8: predicate.merge_addn 0.59% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.73% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 13: predicate.minmaximum_grad 1.20% : 0.000002s : 4: predicate.mutable_eliminate 0.41% : 0.000001s : 4: predicate.opt_reshape 0.33% : 0.000001s : 4: predicate.parallel_virtual_node 1.90% : 0.000004s : 20: predicate.partial_defer_inline 1.36% : 0.000003s : 19: predicate.partial_eliminate 0.85% : 0.000002s : 13: predicate.print_const_string_wrapper 0.72% : 0.000001s : 8: predicate.reduce_all_const_elim 1.20% : 0.000002s : 13: predicate.reduce_eliminate 2.44% : 0.000005s : 36: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 8: predicate.remove_not_recompute_node 1.37% : 0.000003s : 23: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.37% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.49% : 0.000001s : 4: predicate.row_tensor_eliminate 0.72% : 0.000001s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000001s : 8: predicate.shard_identity_eliminate 0.72% : 0.000001s : 8: predicate.special_op_eliminate 0.73% : 0.000001s : 8: predicate.specialize_transform 0.81% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.79% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.52% : 0.000003s : 20: predicate.switch_defer_inline 2.11% : 0.000004s : 28: predicate.switch_layer_defer_inline 5.57% : 0.000011s : 73: predicate.switch_simplify 0.89% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.69% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000007s : 31: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.44% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.77% : 0.000003s : 23: predicate.tuple_to_list_eliminator_ 2.34% : 0.000005s : 36: predicate.updatestate_pure_node_eliminater 3.13% : 0.000006s : 44: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001941 20 67.46% : 0.001309s : 13: func_graph_cloner_run.FuncGraphClonerGraph 32.54% : 0.000632s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.702370 196 0.00% : 0.000004s : 1: ForceFp32Comm 1.13% : 0.007907s : 1: add_attr 1.12% : 0.007885s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000059s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.07% : 0.000498s : 1: auto_monad 0.00% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.16% : 0.001153s : 1: bootstrap 0.00% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000006s : 1: convert_after_rewriter 0.00% : 0.000024s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000028s : 1: environ_conv 0.00% : 0.000027s : 1: event_method 0.00% : 0.000016s : 1: execute 0.00% : 0.000012s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000017s : 1: label_micro_interleaved_index 0.07% : 0.000460s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.09% : 0.000663s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000017s : 1: opt.transform.mutable_eliminate 0.16% : 0.001152s : 78: opt.transform.opt_a 0.00% : 0.000029s : 1: opt.transform.opt_after_cconv 0.00% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000114s : 28: opt.transform.opt_b 0.01% : 0.000053s : 2: opt.transform.opt_trans_graph 0.01% : 0.000036s : 4: opt.transform.symbol_engine_opt 0.60% : 0.004211s : 1: opt_a 0.01% : 0.000105s : 1: opt_after_cconv 0.07% : 0.000512s : 1: opt_after_jit_grad 0.03% : 0.000227s : 1: opt_b 0.95% : 0.006692s : 1: optimize 0.00% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000041s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000021s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000026s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.01% : 0.000052s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000018s : 1: remove_dup_value 0.19% : 0.001349s : 1: renormalize.infer 0.10% : 0.000691s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000032s : 1: rewriter_after_opt_a 0.03% : 0.000225s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000012s : 1: split_matmul_comm_elemetwise 0.00% : 0.000026s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000077s : 1: symbol_engine_optimizer 71.70% : 0.503629s : 1: task_emit 0.01% : 0.000082s : 1: tuple_transform 23.31% : 0.163692s : 1: type_inference 0.01% : 0.000105s : 1: validate TotalTime = 2.69315, [24] [bootstrap]: 0.00083526 [type_inference]: 0.17597 [event_method]: 0.00036497 [auto_monad]: 0.00020813 [graph_reusing]: 6.00002e-06 [inline]: 2.27999e-06 [add_attr]: 0.00731228, [1] [add_attr_with_inline]: 0.00729639, [1] [Cycle 1]: 0.00011417, [2] [tag_attr]: 3.778e-05 [meta_addattr_fg_expand]: 1.329e-05 [parallel-infer-symbol]: 2.98998e-06 [pre_auto_parallel]: 5.153e-05 [insert-virtual-dataset]: 1.36002e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.64e-06 [pipeline_split]: 1.50999e-06 [optimize]: 0.00896882, [53] [py_interpret_to_execute]: 4.98001e-06 [rewriter_before_opt_a]: 0.00030411 [opt_a]: 0.00590648, [2] [Cycle 1]: 0.00489, [45] [expand_dump_flag]: 3.03e-06 [switch_simplify]: 8.516e-05 [loop_unroll]: 5.73e-05 [a_1]: 0.00114782 [with_stream_mark]: 1.657e-05 [recompute_prepare]: 1.395e-05 [updatestate_depend_eliminate]: 1.232e-05 [updatestate_assign_eliminate]: 5.27999e-06 [updatestate_loads_eliminate]: 4.94e-06 [parameter_eliminate]: 1.14e-06 [a_2]: 0.00015701 [accelerated_algorithm]: 1.187e-05 [shard]: 1.81e-06 [meta_shard_fg_expand]: 2.88998e-06 [shard_inline]: 1.194e-05 [merge_send_recv]: 3.413e-05 [auto_parallel]: 1.01e-05 [parallel]: 6.541e-05 [flash_sp]: 2.438e-05 [merge_comm]: 6.95002e-06 [allreduce_fusion]: 1.125e-05 [matmul_add_comm_reduction]: 1.593e-05 [allreduce_slice_to_reducescatter]: 4.42e-06 [virtual_shard_identity]: 1.372e-05 [virtual_dataset]: 1.06e-05 [get_grad_eliminate_]: 1.074e-05 [virtual_output]: 1.047e-05 [merge_forward]: 5.10999e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 1.862e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.694e-05 [merge_recompute_call_nodes]: 8.79983e-07 [before_grad]: 1.668e-05 [set_forward_comm_id_for_comm_node_pass]: 1.294e-05 [meta_fg_expand]: 4.66002e-06 [flash_sp_send_recv_attached]: 1.82999e-06 [receive_attached]: 1.276e-05 [after_resolve]: 1.518e-05 [a_after_grad]: 1.705e-05 [renormalize]: 0.00249603 [add_forward_monad_depend]: 6.43998e-06 [auto_monad_grad]: 2.44001e-06 [auto_monad_eliminator]: 3.659e-05 [cse]: 0.00012305 [a_3]: 8.106e-05 [Cycle 2]: 0.00100518, [45] [expand_dump_flag]: 1.91e-06 [switch_simplify]: 1.256e-05 [loop_unroll]: 1.017e-05 [a_1]: 0.00026626 [with_stream_mark]: 1.653e-05 [recompute_prepare]: 1.097e-05 [updatestate_depend_eliminate]: 6.39999e-06 [updatestate_assign_eliminate]: 5.34e-06 [updatestate_loads_eliminate]: 5.20999e-06 [parameter_eliminate]: 9.29984e-07 [a_2]: 0.00014899 [accelerated_algorithm]: 1.102e-05 [shard]: 1.43002e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 1.074e-05 [merge_send_recv]: 8.19002e-06 [auto_parallel]: 9.61998e-06 [parallel]: 6.24001e-06 [flash_sp]: 3.26999e-06 [merge_comm]: 5.71e-06 [allreduce_fusion]: 5.34e-06 [matmul_add_comm_reduction]: 9.40001e-06 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 1.164e-05 [virtual_dataset]: 1.025e-05 [get_grad_eliminate_]: 1.021e-05 [virtual_output]: 1.001e-05 [merge_forward]: 4.61002e-06 [cell_reuse_recompute_pass]: 3.01999e-06 [offload_activation]: 1.153e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.792e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 1.63e-05 [set_forward_comm_id_for_comm_node_pass]: 5.64e-06 [meta_fg_expand]: 3.66999e-06 [flash_sp_send_recv_attached]: 1.01997e-06 [receive_attached]: 1.79e-06 [after_resolve]: 1.465e-05 [a_after_grad]: 1.561e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.66e-06 [auto_monad_grad]: 1.18001e-06 [auto_monad_eliminator]: 1.389e-05 [cse]: 2.91e-05 [a_3]: 6.469e-05 [py_interpret_to_execute_after_opt_a]: 5.53002e-06 [slice_cell_reuse_recomputed_activation]: 2.14e-06 [rewriter_after_opt_a]: 4.48e-05 [convert_after_rewriter]: 1.28002e-06 [order_py_execute_after_rewriter]: 8.90001e-06 [mutable_eliminate]: 0.00071022 [opt_b]: 0.00034378, [1] [Cycle 1]: 0.00033648, [7] [b_1]: 0.00023205 [b_2]: 1.233e-05 [updatestate_depend_eliminate]: 9.32001e-06 [updatestate_assign_eliminate]: 5.09998e-06 [updatestate_loads_eliminate]: 5.03002e-06 [renormalize]: 4.80009e-07 [cse]: 3.643e-05 [optimize_parallel_all_gather_comm]: 3.426e-05 [overlap_param_gather]: 1.058e-05 [cconv]: 2.576e-05 [loop_unroll]: 0.00048354 [opt_after_cconv]: 0.00016118, [1] [Cycle 1]: 0.00015461, [7] [c_1]: 6.495e-05 [parameter_eliminate]: 3.18e-06 [updatestate_depend_eliminate]: 8.47e-06 [updatestate_assign_eliminate]: 4.85001e-06 [updatestate_loads_eliminate]: 4.65001e-06 [cse]: 3.33e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.781e-05 [tuple_transform]: 0.00010604, [1] [Cycle 1]: 0.00010147, [4] [d_1]: 6.912e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 1.161e-05 [partial_unused_args_eliminate]: 1.76e-06 [add_recomputation]: 8.833e-05 [cse_after_recomputation]: 3.375e-05, [1] [Cycle 1]: 2.865e-05, [1] [cse]: 2.324e-05 [environ_conv]: 3.144e-05 [swap_dp_allreduce_reducescatter]: 2.749e-05 [bias_add_comm_swap]: 1.07e-05 [label_micro_interleaved_index]: 1.163e-05 [label_fine_grained_interleaved_index]: 2.69999e-06 [merge_cast_opt]: 1.35999e-06 [slice_recompute_activation]: 2.04e-06 [micro_interleaved_order_control]: 2.71e-06 [assign_add_opt]: 1.46002e-06 [ForceFp32Comm]: 9.80013e-07 [remove_cast_before_assign_add]: 8.95001e-06 [full_micro_interleaved_order_control]: 1.008e-05 [reorder_send_recv_between_fp_bp]: 2.83e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.09998e-06 [interleave_split_concat_branches]: 1.08001e-06 [interleave_parallel_branches]: 8.61002e-06 [overlap_opt_shard_in_pipeline]: 1.911e-05 [overlap_opt_shard_grad_in_pipeline]: 1.95001e-06 [control_data_broadcast_order]: 2.044e-05 [grouped_pairwise_exchange_alltoall]: 2.24001e-06 [offloading_packed_experts]: 6.02999e-06 [overlap_recompute_and_grad_model_parallel]: 1.461e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.27e-06 [overlap_recompute_allgather_and_fa_grad]: 1.24e-06 [overlap_recompute_comm]: 2.27999e-06 [overlap_grad_ring_attention]: 2.16e-05 [overlap_grad_flash_sp]: 5.087e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 9.37001e-06 [split_layernorm_comm]: 1.71002e-06 [handle_group_info]: 1.00999e-06 [symbol_engine_optimizer]: 0.00010089, [1] [Cycle 1]: 9.598e-05, [6] [build]: 3.64002e-06 [elim_shapecalc]: 1.577e-05 [elim_not_effective]: 1.99e-05 [opt_reshape]: 1.118e-05 [fold_const_symbol]: 1.681e-05 [renormalize]: 2.80008e-07 [detach_backward]: 2.58e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 3.659e-05 [get_jit_bprop_graph]: 2.01998e-06 [rewriter_after_jit_bprop_graph]: 3.90998e-06 [opt_after_jit_grad]: 0.00053689 [validate]: 8.012e-05 [backend_pass]: 9.90025e-07 [task_emit]: 2.49846 [execute]: 7.7e-06 Sums bootstrap : 0.000835s : 0.03% type_inference : 0.175970s : 6.55% event_method : 0.000365s : 0.01% auto_monad : 0.000208s : 0.01% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000038s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000052s : 0.00% insert-virtual-dataset : 0.000001s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000304s : 0.01% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000098s : 0.00% optimize.opt_a.loop_unroll : 0.000067s : 0.00% optimize.opt_a.a_1 : 0.001414s : 0.05% optimize.opt_a.with_stream_mark : 0.000033s : 0.00% optimize.opt_a.recompute_prepare : 0.000025s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000011s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000010s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000306s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000023s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000023s : 0.00% optimize.opt_a.merge_send_recv : 0.000042s : 0.00% optimize.opt_a.auto_parallel : 0.000020s : 0.00% optimize.opt_a.parallel : 0.000072s : 0.00% optimize.opt_a.flash_sp : 0.000028s : 0.00% optimize.opt_a.merge_comm : 0.000013s : 0.00% optimize.opt_a.allreduce_fusion : 0.000017s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000005s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000025s : 0.00% optimize.opt_a.virtual_dataset : 0.000021s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000021s : 0.00% optimize.opt_a.virtual_output : 0.000020s : 0.00% optimize.opt_a.merge_forward : 0.000010s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000030s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000045s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000033s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000019s : 0.00% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000015s : 0.00% optimize.opt_a.after_resolve : 0.000030s : 0.00% optimize.opt_a.a_after_grad : 0.000033s : 0.00% optimize.opt_a.renormalize : 0.002496s : 0.09% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000050s : 0.00% optimize.opt_a.cse : 0.000152s : 0.01% optimize.opt_a.a_3 : 0.000146s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000045s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000009s : 0.00% optimize.mutable_eliminate : 0.000710s : 0.03% optimize.opt_b.b_1 : 0.000232s : 0.01% optimize.opt_b.b_2 : 0.000012s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000036s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000034s : 0.00% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000026s : 0.00% optimize.loop_unroll : 0.000484s : 0.02% optimize.opt_after_cconv.c_1 : 0.000065s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.cse : 0.000033s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.00% optimize.tuple_transform.d_1 : 0.000069s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000012s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000088s : 0.00% optimize.cse_after_recomputation.cse : 0.000023s : 0.00% optimize.environ_conv : 0.000031s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000027s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000012s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000019s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000020s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000015s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000022s : 0.00% optimize.overlap_grad_flash_sp : 0.000051s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000009s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000017s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000037s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000537s : 0.02% validate : 0.000080s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.498458s : 93.06% execute : 0.000008s : 0.00% Time group info: ------[substitution.] 0.000382 75 12.00% : 0.000046s : 5: substitution.arithmetic_simplify 1.81% : 0.000007s : 2: substitution.depend_value_elim 0.76% : 0.000003s : 5: substitution.elim_not_effective 0.63% : 0.000002s : 5: substitution.fold_const_symbol 2.09% : 0.000008s : 8: substitution.graph_param_transform 61.96% : 0.000237s : 8: substitution.inline 1.43% : 0.000005s : 10: substitution.j_node_and_user_rematch 3.66% : 0.000014s : 10: substitution.remove_not_recompute_node 1.10% : 0.000004s : 4: substitution.replace_old_param 6.13% : 0.000023s : 4: substitution.tuple_list_get_item_eliminator 3.58% : 0.000014s : 6: substitution.updatestate_pure_node_eliminater 4.86% : 0.000019s : 8: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.175884 2 98.45% : 0.173159s : 1: type_inference.infer 1.55% : 0.002725s : 1: type_inference.specialize ------[replace.] 0.000099 12 69.18% : 0.000068s : 8: replace.inline 30.82% : 0.000030s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000254 12 91.57% : 0.000232s : 8: match.inline 8.43% : 0.000021s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000364 2596 0.98% : 0.000004s : 28: predicate.accumulaten_eliminater 0.65% : 0.000002s : 8: predicate.ad_related_special_op_eliminate 0.53% : 0.000002s : 16: predicate.addn_check_dump 0.97% : 0.000004s : 28: predicate.addn_zero_filter 0.89% : 0.000003s : 28: predicate.adjust_all_reduce_mul_add 2.30% : 0.000008s : 44: predicate.arithmetic_simplify 0.98% : 0.000004s : 28: predicate.cast_eliminate 0.56% : 0.000002s : 16: predicate.check_bprop_eliminate 0.54% : 0.000002s : 16: predicate.compare_switch_simplify 0.19% : 0.000001s : 8: predicate.const_output_eliminate 0.60% : 0.000002s : 16: predicate.depend_value_elim 1.05% : 0.000004s : 28: predicate.dict_get_item_const_eliminator 1.14% : 0.000004s : 28: predicate.dict_get_item_eliminator 0.97% : 0.000004s : 28: predicate.dict_set_item_eliminator 0.83% : 0.000003s : 16: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 8: predicate.elim_not_effective 0.37% : 0.000001s : 8: predicate.elim_shapecalc_of_broadcastargs 1.28% : 0.000005s : 36: predicate.environ_add_const_eliminate 1.21% : 0.000004s : 36: predicate.environ_get_add_eliminate 1.17% : 0.000004s : 36: predicate.environ_get_depend_swap 1.78% : 0.000006s : 52: predicate.environ_get_eliminate 1.22% : 0.000004s : 36: predicate.environ_get_set_eliminate 1.42% : 0.000005s : 40: predicate.exchange_switch_depend_value 2.08% : 0.000008s : 40: predicate.float_depend_g_call 0.55% : 0.000002s : 16: predicate.float_environ_get_switch 0.80% : 0.000003s : 24: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 8: predicate.fold_const_symbol 0.64% : 0.000002s : 16: predicate.get_grad_eliminate 0.24% : 0.000001s : 8: predicate.graph_param_transform 0.61% : 0.000002s : 16: predicate.incorporate_call 0.53% : 0.000002s : 16: predicate.incorporate_call_switch 5.85% : 0.000021s : 116: predicate.inline 0.71% : 0.000003s : 16: predicate.inline_without_move 0.33% : 0.000001s : 16: predicate.j_node_and_user_rematch 0.72% : 0.000003s : 16: predicate.less_batch_normalization 1.70% : 0.000006s : 48: predicate.list_to_tuple_eliminator_ 2.64% : 0.000010s : 76: predicate.load_eliminater 0.86% : 0.000003s : 8: predicate.loop_unroll_after_grad 2.55% : 0.000009s : 74: predicate.loop_unroll_before_grad 1.59% : 0.000006s : 44: predicate.make_slice_get_slice_eliminator 0.60% : 0.000002s : 16: predicate.merge_addn 0.58% : 0.000002s : 16: predicate.micro_step_allgather_replace 0.57% : 0.000002s : 16: predicate.mini_step_allgather_replace 0.90% : 0.000003s : 28: predicate.minmaximum_grad 1.08% : 0.000004s : 8: predicate.mutable_eliminate 0.32% : 0.000001s : 8: predicate.opt_reshape 0.38% : 0.000001s : 8: predicate.parallel_virtual_node 1.80% : 0.000007s : 40: predicate.partial_defer_inline 1.54% : 0.000006s : 40: predicate.partial_eliminate 0.99% : 0.000004s : 28: predicate.print_const_string_wrapper 0.57% : 0.000002s : 16: predicate.reduce_all_const_elim 1.31% : 0.000005s : 28: predicate.reduce_eliminate 2.55% : 0.000009s : 76: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000002s : 16: predicate.remove_not_recompute_node 1.40% : 0.000005s : 48: predicate.replace_applicator 0.38% : 0.000001s : 16: predicate.replace_old_param 0.27% : 0.000001s : 8: predicate.reset_defer_inline 1.01% : 0.000004s : 28: predicate.reshape_eliminate 0.57% : 0.000002s : 16: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 8: predicate.row_tensor_eliminate 0.75% : 0.000003s : 16: predicate.same_eliminate 0.48% : 0.000002s : 16: predicate.set_cell_output_no_recompute 0.76% : 0.000003s : 16: predicate.shard_identity_eliminate 0.64% : 0.000002s : 16: predicate.special_op_eliminate 0.69% : 0.000003s : 16: predicate.specialize_transform 0.80% : 0.000003s : 16: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000003s : 16: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 8: predicate.switch_call_monad_eliminater 1.56% : 0.000006s : 40: predicate.switch_defer_inline 2.11% : 0.000008s : 56: predicate.switch_layer_defer_inline 5.29% : 0.000019s : 138: predicate.switch_simplify 0.95% : 0.000003s : 28: predicate.tile_eliminate 0.98% : 0.000004s : 28: predicate.transpose_eliminate 1.63% : 0.000006s : 44: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000006s : 44: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000006s : 44: predicate.tuple_list_get_item_depend_reorder 2.89% : 0.000011s : 64: predicate.tuple_list_get_item_eliminator 1.62% : 0.000006s : 44: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000008s : 60: predicate.tuple_list_set_item_eliminator 1.76% : 0.000006s : 48: predicate.tuple_to_list_eliminator_ 2.62% : 0.000010s : 76: predicate.updatestate_pure_node_eliminater 3.53% : 0.000013s : 92: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 8: predicate.value_based_eliminate 0.64% : 0.000002s : 16: predicate.virtual_dataset_eliminate 0.63% : 0.000002s : 16: predicate.virtual_output_eliminate 0.31% : 0.000001s : 8: predicate.virtual_view_grad_eliminate 0.32% : 0.000001s : 8: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002202 26 67.38% : 0.001484s : 16: func_graph_cloner_run.FuncGraphClonerGraph 32.62% : 0.000718s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.714542 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.27% : 0.007317s : 1: add_attr 0.27% : 0.007300s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000093s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000222s : 1: auto_monad 0.00% : 0.000042s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.03% : 0.000886s : 1: bootstrap 0.00% : 0.000030s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000024s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000037s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000035s : 1: environ_conv 0.01% : 0.000378s : 1: event_method 0.00% : 0.000019s : 1: execute 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000015s : 1: label_micro_interleaved_index 0.02% : 0.000494s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.03% : 0.000722s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.00% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000023s : 1: opt.transform.mutable_eliminate 0.08% : 0.002238s : 78: opt.transform.opt_a 0.00% : 0.000063s : 1: opt.transform.opt_after_cconv 0.00% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000215s : 28: opt.transform.opt_b 0.00% : 0.000078s : 2: opt.transform.opt_trans_graph 0.00% : 0.000060s : 4: opt.transform.symbol_engine_opt 0.22% : 0.005910s : 1: opt_a 0.01% : 0.000165s : 1: opt_after_cconv 0.02% : 0.000547s : 1: opt_after_jit_grad 0.01% : 0.000347s : 1: opt_b 0.33% : 0.008974s : 1: optimize 0.00% : 0.000038s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000012s : 1: order_py_execute_after_rewriter 0.00% : 0.000054s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000025s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000023s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000018s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000056s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000021s : 1: remove_dup_value 0.06% : 0.001680s : 1: renormalize.infer 0.03% : 0.000807s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000050s : 1: rewriter_after_opt_a 0.01% : 0.000311s : 1: rewriter_before_opt_a 0.00% : 0.000057s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000012s : 1: split_matmul_comm_elemetwise 0.00% : 0.000031s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000104s : 1: symbol_engine_optimizer 92.04% : 2.498493s : 1: task_emit 0.00% : 0.000109s : 1: tuple_transform 6.48% : 0.175985s : 1: type_inference 0.00% : 0.000118s : 1: validate TotalTime = 2.73911, [24] [bootstrap]: 0.00095655 [type_inference]: 0.196813 [event_method]: 0.00046096 [auto_monad]: 0.00023157 [graph_reusing]: 9.73002e-06 [inline]: 2.82002e-06 [add_attr]: 0.00748246, [1] [add_attr_with_inline]: 0.00746811, [1] [Cycle 1]: 0.00014752, [2] [tag_attr]: 5.107e-05 [meta_addattr_fg_expand]: 2.019e-05 [parallel-infer-symbol]: 3.13e-06 [pre_auto_parallel]: 7.22e-05 [insert-virtual-dataset]: 2.49999e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 1.94999e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.034454, [53] [py_interpret_to_execute]: 4.47e-06 [rewriter_before_opt_a]: 0.00035438 [opt_a]: 0.0317608, [3] [Cycle 1]: 0.0277968, [45] [expand_dump_flag]: 4.08999e-06 [switch_simplify]: 0.00018601 [loop_unroll]: 7.006e-05 [a_1]: 0.00146473 [with_stream_mark]: 2.395e-05 [recompute_prepare]: 2.091e-05 [updatestate_depend_eliminate]: 1.782e-05 [updatestate_assign_eliminate]: 1.449e-05 [updatestate_loads_eliminate]: 6.89001e-06 [parameter_eliminate]: 3.14001e-06 [a_2]: 0.00022021 [accelerated_algorithm]: 1.526e-05 [shard]: 1.84e-06 [meta_shard_fg_expand]: 4.935e-05 [shard_inline]: 1.54e-05 [merge_send_recv]: 4.852e-05 [auto_parallel]: 1.068e-05 [parallel]: 8.183e-05 [flash_sp]: 3.368e-05 [merge_comm]: 9.12001e-06 [allreduce_fusion]: 1.548e-05 [matmul_add_comm_reduction]: 3.29e-05 [allreduce_slice_to_reducescatter]: 8.80999e-06 [virtual_shard_identity]: 1.71e-05 [virtual_dataset]: 1.506e-05 [get_grad_eliminate_]: 1.43e-05 [virtual_output]: 1.415e-05 [merge_forward]: 8.79e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 2.393e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.631e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 3.246e-05 [set_forward_comm_id_for_comm_node_pass]: 1.626e-05 [meta_fg_expand]: 0.00160448 [flash_sp_send_recv_attached]: 3.79002e-06 [receive_attached]: 2.027e-05 [after_resolve]: 6.087e-05 [a_after_grad]: 8.147e-05 [renormalize]: 0.022448 [add_forward_monad_depend]: 1.189e-05 [auto_monad_grad]: 5.79999e-06 [auto_monad_eliminator]: 5.713e-05 [cse]: 0.00032518 [a_3]: 0.00034876 [Cycle 2]: 0.00324818, [45] [expand_dump_flag]: 3.18998e-06 [switch_simplify]: 4.617e-05 [loop_unroll]: 4.364e-05 [a_1]: 0.00128568 [with_stream_mark]: 1.693e-05 [recompute_prepare]: 1.019e-05 [updatestate_depend_eliminate]: 4.61002e-06 [updatestate_assign_eliminate]: 3.98999e-06 [updatestate_loads_eliminate]: 3.88999e-06 [parameter_eliminate]: 1.39e-06 [a_2]: 9.832e-05 [accelerated_algorithm]: 8.22e-06 [shard]: 1.78002e-06 [meta_shard_fg_expand]: 2.49999e-06 [shard_inline]: 7.61001e-06 [merge_send_recv]: 9.10999e-06 [auto_parallel]: 1.033e-05 [parallel]: 9.09e-06 [flash_sp]: 3.97e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 4.02e-06 [matmul_add_comm_reduction]: 8.55001e-06 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 8.48999e-06 [virtual_dataset]: 7.48999e-06 [get_grad_eliminate_]: 7.46999e-06 [virtual_output]: 7.26999e-06 [merge_forward]: 4.10998e-06 [cell_reuse_recompute_pass]: 1.09003e-06 [offload_activation]: 9.70002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.482e-05 [merge_recompute_call_nodes]: 1.67999e-06 [before_grad]: 1.234e-05 [set_forward_comm_id_for_comm_node_pass]: 4.12998e-06 [meta_fg_expand]: 6.966e-05 [flash_sp_send_recv_attached]: 1.91e-06 [receive_attached]: 3.26999e-06 [after_resolve]: 9.44e-06 [a_after_grad]: 1.167e-05 [renormalize]: 0.00115505 [add_forward_monad_depend]: 5.10999e-06 [auto_monad_grad]: 1.82001e-06 [auto_monad_eliminator]: 1.278e-05 [cse]: 2.172e-05 [a_3]: 5.56e-05 [Cycle 3]: 0.00069958, [45] [expand_dump_flag]: 1.44e-06 [switch_simplify]: 9.71998e-06 [loop_unroll]: 7.62998e-06 [a_1]: 0.00015473 [with_stream_mark]: 9.15001e-06 [recompute_prepare]: 7.4e-06 [updatestate_depend_eliminate]: 4.29002e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 3.29001e-06 [parameter_eliminate]: 1.15001e-06 [a_2]: 9.439e-05 [accelerated_algorithm]: 7.38e-06 [shard]: 1.05001e-06 [meta_shard_fg_expand]: 1.75001e-06 [shard_inline]: 7.51999e-06 [merge_send_recv]: 5.67999e-06 [auto_parallel]: 5.92999e-06 [parallel]: 4.02e-06 [flash_sp]: 9.09989e-07 [merge_comm]: 4.02e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 5.72999e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 8.33999e-06 [virtual_dataset]: 7.16999e-06 [get_grad_eliminate_]: 7.09001e-06 [virtual_output]: 7.51999e-06 [merge_forward]: 3.46999e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 6.86999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.521e-05 [merge_recompute_call_nodes]: 7.7e-07 [before_grad]: 1.169e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48999e-06 [meta_fg_expand]: 2.49001e-06 [flash_sp_send_recv_attached]: 8.10018e-07 [receive_attached]: 1.02e-06 [after_resolve]: 7.11999e-06 [a_after_grad]: 1.057e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.10001e-06 [auto_monad_grad]: 1.10001e-06 [auto_monad_eliminator]: 7.35998e-06 [cse]: 1.716e-05 [a_3]: 4.639e-05 [py_interpret_to_execute_after_opt_a]: 4.20999e-06 [slice_cell_reuse_recomputed_activation]: 2.24999e-06 [rewriter_after_opt_a]: 3.978e-05 [convert_after_rewriter]: 1.24998e-06 [order_py_execute_after_rewriter]: 1.10999e-06 [mutable_eliminate]: 0.00058821 [opt_b]: 0.00024131, [1] [Cycle 1]: 0.00023406, [7] [b_1]: 0.00015604 [b_2]: 9.22001e-06 [updatestate_depend_eliminate]: 6.17001e-06 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 3.08e-06 [renormalize]: 4.39992e-07 [cse]: 2.267e-05 [optimize_parallel_all_gather_comm]: 4.458e-05 [overlap_param_gather]: 1.206e-05 [cconv]: 2.389e-05 [loop_unroll]: 0.00044304 [opt_after_cconv]: 0.00011026, [1] [Cycle 1]: 0.00010437, [7] [c_1]: 3.536e-05 [parameter_eliminate]: 2.37001e-06 [updatestate_depend_eliminate]: 6.32001e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 3.01999e-06 [cse]: 2.233e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.618e-05 [tuple_transform]: 8.215e-05, [1] [Cycle 1]: 7.799e-05, [4] [d_1]: 5.058e-05 [none_parameter_eliminate]: 1.20001e-06 [renormalize]: 1.30007e-07 [switch_simplify]: 8.10999e-06 [partial_unused_args_eliminate]: 1.54998e-06 [add_recomputation]: 5.922e-05 [cse_after_recomputation]: 2.741e-05, [1] [Cycle 1]: 2.331e-05, [1] [cse]: 1.783e-05 [environ_conv]: 1.269e-05 [swap_dp_allreduce_reducescatter]: 2.745e-05 [bias_add_comm_swap]: 1.282e-05 [label_micro_interleaved_index]: 1.345e-05 [label_fine_grained_interleaved_index]: 2.32001e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 1.99999e-06 [micro_interleaved_order_control]: 2.40002e-06 [assign_add_opt]: 1.45999e-06 [ForceFp32Comm]: 8.39995e-07 [remove_cast_before_assign_add]: 1.084e-05 [full_micro_interleaved_order_control]: 1.22e-05 [reorder_send_recv_between_fp_bp]: 2.75002e-06 [comm_op_add_attrs]: 1.06997e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.29e-06 [interleave_parallel_branches]: 1.029e-05 [overlap_opt_shard_in_pipeline]: 1.938e-05 [overlap_opt_shard_grad_in_pipeline]: 1.84e-06 [control_data_broadcast_order]: 1.759e-05 [grouped_pairwise_exchange_alltoall]: 1.96998e-06 [offloading_packed_experts]: 4.33999e-06 [overlap_recompute_and_grad_model_parallel]: 1.44e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 2.09999e-06 [overlap_grad_ring_attention]: 2.233e-05 [overlap_grad_flash_sp]: 5.05e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 1.085e-05 [split_layernorm_comm]: 1.81e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 8.713e-05, [1] [Cycle 1]: 8.158e-05, [6] [build]: 2.93e-06 [elim_shapecalc]: 1.308e-05 [elim_not_effective]: 1.676e-05 [opt_reshape]: 9.05999e-06 [fold_const_symbol]: 1.173e-05 [renormalize]: 2.60014e-07 [detach_backward]: 2.30002e-06 [pipeline_parallel_scheduler]: 1.46998e-06 [auto_monad_reorder]: 2.434e-05 [get_jit_bprop_graph]: 1.75001e-06 [rewriter_after_jit_bprop_graph]: 2.71e-06 [opt_after_jit_grad]: 0.00050888 [validate]: 6.633e-05 [backend_pass]: 9.5999e-07 [task_emit]: 2.49769 [execute]: 1.098e-05 Sums bootstrap : 0.000957s : 0.04% type_inference : 0.196813s : 7.21% event_method : 0.000461s : 0.02% auto_monad : 0.000232s : 0.01% graph_reusing : 0.000010s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000051s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000020s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000072s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000354s : 0.01% optimize.opt_a.expand_dump_flag : 0.000009s : 0.00% optimize.opt_a.switch_simplify : 0.000242s : 0.01% optimize.opt_a.loop_unroll : 0.000121s : 0.00% optimize.opt_a.a_1 : 0.002905s : 0.11% optimize.opt_a.with_stream_mark : 0.000050s : 0.00% optimize.opt_a.recompute_prepare : 0.000038s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000027s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000022s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000014s : 0.00% optimize.opt_a.parameter_eliminate : 0.000006s : 0.00% optimize.opt_a.a_2 : 0.000413s : 0.02% optimize.opt_a.accelerated_algorithm : 0.000031s : 0.00% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000054s : 0.00% optimize.opt_a.shard_inline : 0.000031s : 0.00% optimize.opt_a.merge_send_recv : 0.000063s : 0.00% optimize.opt_a.auto_parallel : 0.000027s : 0.00% optimize.opt_a.parallel : 0.000095s : 0.00% optimize.opt_a.flash_sp : 0.000039s : 0.00% optimize.opt_a.merge_comm : 0.000017s : 0.00% optimize.opt_a.allreduce_fusion : 0.000023s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000047s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000010s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000034s : 0.00% optimize.opt_a.virtual_dataset : 0.000030s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000029s : 0.00% optimize.opt_a.virtual_output : 0.000029s : 0.00% optimize.opt_a.merge_forward : 0.000016s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000040s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000056s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000056s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000025s : 0.00% optimize.opt_a.meta_fg_expand : 0.001677s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000025s : 0.00% optimize.opt_a.after_resolve : 0.000077s : 0.00% optimize.opt_a.a_after_grad : 0.000104s : 0.00% optimize.opt_a.renormalize : 0.023603s : 0.86% optimize.opt_a.add_forward_monad_depend : 0.000018s : 0.00% optimize.opt_a.auto_monad_grad : 0.000009s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000077s : 0.00% optimize.opt_a.cse : 0.000364s : 0.01% optimize.opt_a.a_3 : 0.000451s : 0.02% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000040s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000588s : 0.02% optimize.opt_b.b_1 : 0.000156s : 0.01% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000045s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000024s : 0.00% optimize.loop_unroll : 0.000443s : 0.02% optimize.opt_after_cconv.c_1 : 0.000035s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.00% optimize.tuple_transform.d_1 : 0.000051s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000059s : 0.00% optimize.cse_after_recomputation.cse : 0.000018s : 0.00% optimize.environ_conv : 0.000013s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000027s : 0.00% optimize.bias_add_comm_swap : 0.000013s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000011s : 0.00% optimize.full_micro_interleaved_order_control : 0.000012s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000010s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000019s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000014s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000022s : 0.00% optimize.overlap_grad_flash_sp : 0.000051s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000024s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000509s : 0.02% validate : 0.000066s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.497690s : 91.48% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000738 160 0.33% : 0.000002s : 3: substitution.elim_not_effective 1.01% : 0.000007s : 11: substitution.float_depend_g_call 0.53% : 0.000004s : 2: substitution.float_tuple_getitem_switch 0.18% : 0.000001s : 3: substitution.fold_const_symbol 0.78% : 0.000006s : 5: substitution.graph_param_transform 0.36% : 0.000003s : 2: substitution.incorporate_call 0.26% : 0.000002s : 2: substitution.incorporate_call_switch 64.48% : 0.000476s : 20: substitution.inline 2.09% : 0.000015s : 2: substitution.inline_without_move 2.34% : 0.000017s : 14: substitution.j_node_and_user_rematch 1.37% : 0.000010s : 7: substitution.minmaximum_grad 3.56% : 0.000026s : 11: substitution.partial_eliminate 1.50% : 0.000011s : 14: substitution.remove_not_recompute_node 3.32% : 0.000025s : 9: substitution.replace_applicator 0.64% : 0.000005s : 7: substitution.replace_old_param 0.35% : 0.000003s : 1: substitution.set_cell_output_no_recompute 2.49% : 0.000018s : 3: substitution.switch_simplify 2.89% : 0.000021s : 7: substitution.tuple_list_convert_item_index_to_positive 1.31% : 0.000010s : 7: substitution.tuple_list_get_item_const_eliminator 1.88% : 0.000014s : 7: substitution.tuple_list_get_item_depend_reorder 6.38% : 0.000047s : 16: substitution.tuple_list_get_item_eliminator 1.96% : 0.000014s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.196694 2 98.07% : 0.192903s : 1: type_inference.infer 1.93% : 0.003791s : 1: type_inference.specialize ------[replace.] 0.000245 30 60.75% : 0.000149s : 20: replace.inline 14.53% : 0.000036s : 3: replace.switch_simplify 24.72% : 0.000061s : 7: replace.tuple_list_get_item_eliminator ------[match.] 0.000502 30 92.69% : 0.000466s : 20: match.inline 3.29% : 0.000017s : 3: match.switch_simplify 4.02% : 0.000020s : 7: match.tuple_list_get_item_eliminator ------[predicate.] 0.000641 4601 1.13% : 0.000007s : 57: predicate.accumulaten_eliminater 0.26% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.47% : 0.000003s : 23: predicate.addn_check_dump 1.14% : 0.000007s : 57: predicate.addn_zero_filter 1.05% : 0.000007s : 57: predicate.adjust_all_reduce_mul_add 2.15% : 0.000014s : 80: predicate.arithmetic_simplify 1.13% : 0.000007s : 57: predicate.cast_eliminate 1.16% : 0.000007s : 55: predicate.check_bprop_eliminate 0.48% : 0.000003s : 23: predicate.compare_switch_simplify 0.07% : 0.000000s : 5: predicate.const_output_eliminate 0.47% : 0.000003s : 23: predicate.depend_value_elim 1.17% : 0.000007s : 57: predicate.dict_get_item_const_eliminator 1.34% : 0.000009s : 57: predicate.dict_get_item_eliminator 1.07% : 0.000007s : 57: predicate.dict_set_item_eliminator 0.34% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.09% : 0.000001s : 5: predicate.elim_not_effective 0.13% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000008s : 62: predicate.environ_add_const_eliminate 1.20% : 0.000008s : 62: predicate.environ_get_add_eliminate 1.12% : 0.000007s : 62: predicate.environ_get_depend_swap 1.63% : 0.000010s : 85: predicate.environ_get_eliminate 1.22% : 0.000008s : 62: predicate.environ_get_set_eliminate 1.74% : 0.000011s : 84: predicate.exchange_switch_depend_value 2.50% : 0.000016s : 84: predicate.float_depend_g_call 0.48% : 0.000003s : 23: predicate.float_environ_get_switch 0.57% : 0.000004s : 28: predicate.float_tuple_getitem_switch 0.07% : 0.000000s : 5: predicate.fold_const_symbol 0.53% : 0.000003s : 23: predicate.get_grad_eliminate 0.12% : 0.000001s : 5: predicate.graph_param_transform 0.48% : 0.000003s : 23: predicate.incorporate_call 0.43% : 0.000003s : 23: predicate.incorporate_call_switch 5.45% : 0.000035s : 197: predicate.inline 1.31% : 0.000008s : 47: predicate.inline_without_move 0.28% : 0.000002s : 23: predicate.j_node_and_user_rematch 0.61% : 0.000004s : 23: predicate.less_batch_normalization 1.51% : 0.000010s : 74: predicate.list_to_tuple_eliminator_ 2.46% : 0.000016s : 131: predicate.load_eliminater 0.30% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.79% : 0.000018s : 131: predicate.loop_unroll_before_grad 1.37% : 0.000009s : 67: predicate.make_slice_get_slice_eliminator 0.49% : 0.000003s : 23: predicate.merge_addn 1.10% : 0.000007s : 55: predicate.micro_step_allgather_replace 1.11% : 0.000007s : 55: predicate.mini_step_allgather_replace 1.11% : 0.000007s : 57: predicate.minmaximum_grad 0.35% : 0.000002s : 5: predicate.mutable_eliminate 0.19% : 0.000001s : 5: predicate.opt_reshape 0.18% : 0.000001s : 5: predicate.parallel_virtual_node 2.30% : 0.000015s : 84: predicate.partial_defer_inline 1.58% : 0.000010s : 69: predicate.partial_eliminate 1.07% : 0.000007s : 57: predicate.print_const_string_wrapper 0.51% : 0.000003s : 23: predicate.reduce_all_const_elim 1.38% : 0.000009s : 57: predicate.reduce_eliminate 2.51% : 0.000016s : 131: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000002s : 23: predicate.remove_not_recompute_node 1.90% : 0.000012s : 119: predicate.replace_applicator 0.64% : 0.000004s : 47: predicate.replace_old_param 0.09% : 0.000001s : 5: predicate.reset_defer_inline 1.09% : 0.000007s : 57: predicate.reshape_eliminate 1.19% : 0.000008s : 55: predicate.row_tensor_add_zeros_like 0.13% : 0.000001s : 5: predicate.row_tensor_eliminate 1.41% : 0.000009s : 55: predicate.same_eliminate 0.35% : 0.000002s : 23: predicate.set_cell_output_no_recompute 0.55% : 0.000004s : 23: predicate.shard_identity_eliminate 0.24% : 0.000002s : 10: predicate.special_op_eliminate 0.55% : 0.000004s : 23: predicate.specialize_transform 1.27% : 0.000008s : 55: predicate.split_environ_get_set_with_tuple_value 1.25% : 0.000008s : 47: predicate.stack_unstack_eliminate 0.12% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.91% : 0.000012s : 84: predicate.switch_defer_inline 2.94% : 0.000019s : 139: predicate.switch_layer_defer_inline 5.75% : 0.000037s : 249: predicate.switch_simplify 1.17% : 0.000007s : 57: predicate.tile_eliminate 1.15% : 0.000007s : 57: predicate.transpose_eliminate 1.47% : 0.000009s : 67: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000010s : 67: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000009s : 67: predicate.tuple_list_get_item_depend_reorder 2.66% : 0.000017s : 97: predicate.tuple_list_get_item_eliminator 1.48% : 0.000009s : 67: predicate.tuple_list_get_set_item_eliminator 2.11% : 0.000014s : 90: predicate.tuple_list_set_item_eliminator 1.57% : 0.000010s : 74: predicate.tuple_to_list_eliminator_ 2.44% : 0.000016s : 131: predicate.updatestate_pure_node_eliminater 2.95% : 0.000019s : 154: predicate.updatestate_useless_node_eliminater 0.12% : 0.000001s : 5: predicate.value_based_eliminate 0.53% : 0.000003s : 23: predicate.virtual_dataset_eliminate 0.55% : 0.000004s : 23: predicate.virtual_output_eliminate 0.11% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.14% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004118 60 69.33% : 0.002855s : 29: func_graph_cloner_run.FuncGraphClonerGraph 30.67% : 0.001263s : 31: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.809303 237 0.00% : 0.000004s : 1: ForceFp32Comm 0.27% : 0.007487s : 1: add_attr 0.27% : 0.007472s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000064s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.01% : 0.000244s : 1: auto_monad 0.00% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000016s : 1: bias_add_comm_swap 0.04% : 0.001004s : 1: bootstrap 0.00% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000016s : 1: environ_conv 0.02% : 0.000473s : 1: event_method 0.00% : 0.000021s : 1: execute 0.00% : 0.000015s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.02% : 0.000452s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000597s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000017s : 1: opt.transform.mutable_eliminate 0.16% : 0.004511s : 117: opt.transform.opt_a 0.00% : 0.000034s : 1: opt.transform.opt_after_cconv 0.00% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000138s : 28: opt.transform.opt_b 0.00% : 0.000056s : 2: opt.transform.opt_trans_graph 0.00% : 0.000047s : 4: opt.transform.symbol_engine_opt 1.13% : 0.031764s : 1: opt_a 0.00% : 0.000113s : 1: opt_after_cconv 0.02% : 0.000519s : 1: opt_after_jit_grad 0.01% : 0.000245s : 1: opt_b 1.23% : 0.034459s : 1: optimize 0.00% : 0.000049s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000054s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000025s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000023s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000077s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000014s : 1: remove_cast_before_assign_add 0.00% : 0.000020s : 1: remove_dup_value 0.72% : 0.020308s : 2: renormalize.infer 0.12% : 0.003277s : 2: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000043s : 1: rewriter_after_opt_a 0.01% : 0.000361s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000014s : 1: split_matmul_comm_elemetwise 0.00% : 0.000031s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000090s : 1: symbol_engine_optimizer 88.91% : 2.497733s : 1: task_emit 0.00% : 0.000085s : 1: tuple_transform 7.01% : 0.196833s : 1: type_inference 0.00% : 0.000095s : 1: validate [WARNING] SESSION(15834,ffffbf434f30,python3.9):2026-01-29-17:47:03.647.575 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:266] IbTryExpandCNode] After expanding cnode Default/InplaceIndexPut-op0, the new abstract of Expand/_InplaceIndexPut/InnerInplaceIndexPut-op0 does not match original cnode's abstract. new: AbstractTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), old: AbstractRefTensor(key: 0xaaaaf53b82e03, ref_value: AbstractRefTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), value: ValueAny, is_inplace) [WARNING] SESSION(15834,ffffbf434f30,python3.9):2026-01-29-17:47:03.647.636 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:274] IbTryExpandCNode] Restore new abstract to AbstractRefTensor new:AbstractRefTensor(key: 0xaaaaf53b82e03, ref_value: AbstractRefTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), value: ValueAny) TotalTime = 0.131273, [24] [bootstrap]: 0.00049111 [type_inference]: 0.107279 [event_method]: 0.00044474 [auto_monad]: 0.00017346 [graph_reusing]: 7.77002e-06 [inline]: 2.62001e-06 [add_attr]: 0.00354545, [1] [add_attr_with_inline]: 0.00353757, [1] [Cycle 1]: 6.881e-05, [2] [tag_attr]: 3.246e-05 [meta_addattr_fg_expand]: 9.37001e-06 [parallel-infer-symbol]: 3.33998e-06 [pre_auto_parallel]: 0.00010593 [insert-virtual-dataset]: 2.83998e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.10002e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.0081374, [53] [py_interpret_to_execute]: 4.76002e-06 [rewriter_before_opt_a]: 0.00030075 [opt_a]: 0.00562138, [2] [Cycle 1]: 0.00458829, [45] [expand_dump_flag]: 4.04002e-06 [switch_simplify]: 7.056e-05 [loop_unroll]: 5.834e-05 [a_1]: 0.00113296 [with_stream_mark]: 1.649e-05 [recompute_prepare]: 1.229e-05 [updatestate_depend_eliminate]: 6.66999e-06 [updatestate_assign_eliminate]: 6.51999e-06 [updatestate_loads_eliminate]: 5.37001e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 0.00015915 [accelerated_algorithm]: 1.128e-05 [shard]: 1.74e-06 [meta_shard_fg_expand]: 3.18e-06 [shard_inline]: 1.067e-05 [merge_send_recv]: 1.054e-05 [auto_parallel]: 8.67e-06 [parallel]: 2.356e-05 [flash_sp]: 8.62998e-06 [merge_comm]: 6.69001e-06 [allreduce_fusion]: 5.61e-06 [matmul_add_comm_reduction]: 1.149e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 1.214e-05 [virtual_dataset]: 1.068e-05 [get_grad_eliminate_]: 1.087e-05 [virtual_output]: 1.042e-05 [merge_forward]: 6.20002e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.365e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.954e-05 [merge_recompute_call_nodes]: 1.37e-06 [before_grad]: 1.743e-05 [set_forward_comm_id_for_comm_node_pass]: 6.21e-06 [meta_fg_expand]: 5.10001e-06 [flash_sp_send_recv_attached]: 2.86999e-06 [receive_attached]: 2.46998e-06 [after_resolve]: 1.678e-05 [a_after_grad]: 1.815e-05 [renormalize]: 0.00239609 [add_forward_monad_depend]: 5.34e-06 [auto_monad_grad]: 1.94e-06 [auto_monad_eliminator]: 2.415e-05 [cse]: 0.00010525 [a_3]: 7.778e-05 [Cycle 2]: 0.00102283, [45] [expand_dump_flag]: 1.34e-06 [switch_simplify]: 1.21e-05 [loop_unroll]: 1.052e-05 [a_1]: 0.00026422 [with_stream_mark]: 1.355e-05 [recompute_prepare]: 1.119e-05 [updatestate_depend_eliminate]: 5.85002e-06 [updatestate_assign_eliminate]: 4.65001e-06 [updatestate_loads_eliminate]: 5.41998e-06 [parameter_eliminate]: 9.80013e-07 [a_2]: 0.00014792 [accelerated_algorithm]: 1.073e-05 [shard]: 1.04e-06 [meta_shard_fg_expand]: 2.20002e-06 [shard_inline]: 1.015e-05 [merge_send_recv]: 7.29001e-06 [auto_parallel]: 7.79002e-06 [parallel]: 4.23001e-06 [flash_sp]: 3.14001e-06 [merge_comm]: 5.39e-06 [allreduce_fusion]: 5.22e-06 [matmul_add_comm_reduction]: 8.57998e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 1.086e-05 [virtual_dataset]: 1.002e-05 [get_grad_eliminate_]: 1.029e-05 [virtual_output]: 9.62999e-06 [merge_forward]: 4.82e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 1.059e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.772e-05 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 4.251e-05 [set_forward_comm_id_for_comm_node_pass]: 5.94e-06 [meta_fg_expand]: 4.03001e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 2.18e-05 [a_after_grad]: 1.553e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.67999e-06 [auto_monad_grad]: 1.04998e-06 [auto_monad_eliminator]: 1.462e-05 [cse]: 2.783e-05 [a_3]: 6.804e-05 [py_interpret_to_execute_after_opt_a]: 4.33001e-06 [slice_cell_reuse_recomputed_activation]: 1.87999e-06 [rewriter_after_opt_a]: 2.889e-05 [convert_after_rewriter]: 1.30001e-06 [order_py_execute_after_rewriter]: 1.28002e-06 [mutable_eliminate]: 0.00049065 [opt_b]: 0.00033852, [1] [Cycle 1]: 0.00033261, [7] [b_1]: 0.0002298 [b_2]: 1.29e-05 [updatestate_depend_eliminate]: 7.6e-06 [updatestate_assign_eliminate]: 4.90999e-06 [updatestate_loads_eliminate]: 4.97e-06 [renormalize]: 3.50003e-07 [cse]: 3.709e-05 [optimize_parallel_all_gather_comm]: 2.178e-05 [overlap_param_gather]: 2.16e-06 [cconv]: 2.438e-05 [loop_unroll]: 0.00044291 [opt_after_cconv]: 0.00015277, [1] [Cycle 1]: 0.00014706, [7] [c_1]: 6.278e-05 [parameter_eliminate]: 2.25002e-06 [updatestate_depend_eliminate]: 7.56001e-06 [updatestate_assign_eliminate]: 4.97e-06 [updatestate_loads_eliminate]: 4.70001e-06 [cse]: 3.069e-05 [renormalize]: 3.09985e-07 [remove_dup_value]: 1.714e-05 [tuple_transform]: 0.00010712, [1] [Cycle 1]: 0.00010244, [4] [d_1]: 7.075e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 1.19995e-07 [switch_simplify]: 1.142e-05 [partial_unused_args_eliminate]: 1.79e-06 [add_recomputation]: 6.767e-05 [cse_after_recomputation]: 3.252e-05, [1] [Cycle 1]: 2.802e-05, [1] [cse]: 2.236e-05 [environ_conv]: 9.59e-06 [swap_dp_allreduce_reducescatter]: 8.48001e-06 [bias_add_comm_swap]: 2.27999e-06 [label_micro_interleaved_index]: 4.54002e-06 [label_fine_grained_interleaved_index]: 2.69001e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.02999e-06 [micro_interleaved_order_control]: 2.29999e-06 [assign_add_opt]: 1.31998e-06 [ForceFp32Comm]: 7.60017e-07 [remove_cast_before_assign_add]: 1.67001e-06 [full_micro_interleaved_order_control]: 2.26998e-06 [reorder_send_recv_between_fp_bp]: 2.59999e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 1.03001e-06 [interleave_split_concat_branches]: 1.47999e-06 [interleave_parallel_branches]: 1.10999e-06 [overlap_opt_shard_in_pipeline]: 1.30999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.99e-06 [control_data_broadcast_order]: 1.846e-05 [grouped_pairwise_exchange_alltoall]: 1.40001e-06 [offloading_packed_experts]: 5.05999e-06 [overlap_recompute_and_grad_model_parallel]: 6.26e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.24e-06 [overlap_recompute_comm]: 2.61e-06 [overlap_grad_ring_attention]: 5.52001e-06 [overlap_grad_flash_sp]: 2.406e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.16e-06 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 1.05001e-06 [symbol_engine_optimizer]: 9.686e-05, [1] [Cycle 1]: 9.248e-05, [6] [build]: 3.71001e-06 [elim_shapecalc]: 1.476e-05 [elim_not_effective]: 1.909e-05 [opt_reshape]: 1.148e-05 [fold_const_symbol]: 1.602e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.68002e-06 [pipeline_parallel_scheduler]: 1.44e-06 [auto_monad_reorder]: 3.144e-05 [get_jit_bprop_graph]: 1.05999e-06 [rewriter_after_jit_bprop_graph]: 3.47997e-06 [opt_after_jit_grad]: 0.00048912 [validate]: 5.333e-05 [backend_pass]: 9.50007e-07 [task_emit]: 0.0102734 [execute]: 5.79999e-06 Sums bootstrap : 0.000491s : 0.39% type_inference : 0.107279s : 84.67% event_method : 0.000445s : 0.35% auto_monad : 0.000173s : 0.14% graph_reusing : 0.000008s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000032s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000106s : 0.08% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000301s : 0.24% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000083s : 0.07% optimize.opt_a.loop_unroll : 0.000069s : 0.05% optimize.opt_a.a_1 : 0.001397s : 1.10% optimize.opt_a.with_stream_mark : 0.000030s : 0.02% optimize.opt_a.recompute_prepare : 0.000023s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000011s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000011s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000307s : 0.24% optimize.opt_a.accelerated_algorithm : 0.000022s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000021s : 0.02% optimize.opt_a.merge_send_recv : 0.000018s : 0.01% optimize.opt_a.auto_parallel : 0.000016s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.02% optimize.opt_a.flash_sp : 0.000012s : 0.01% optimize.opt_a.merge_comm : 0.000012s : 0.01% optimize.opt_a.allreduce_fusion : 0.000011s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.02% optimize.opt_a.virtual_dataset : 0.000021s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000021s : 0.02% optimize.opt_a.virtual_output : 0.000020s : 0.02% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000060s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.01% optimize.opt_a.meta_fg_expand : 0.000009s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000039s : 0.03% optimize.opt_a.a_after_grad : 0.000034s : 0.03% optimize.opt_a.renormalize : 0.002396s : 1.89% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.03% optimize.opt_a.cse : 0.000133s : 0.11% optimize.opt_a.a_3 : 0.000146s : 0.12% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000029s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000491s : 0.39% optimize.opt_b.b_1 : 0.000230s : 0.18% optimize.opt_b.b_2 : 0.000013s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000037s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000024s : 0.02% optimize.loop_unroll : 0.000443s : 0.35% optimize.opt_after_cconv.c_1 : 0.000063s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.cse : 0.000031s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000071s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000011s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000068s : 0.05% optimize.cse_after_recomputation.cse : 0.000022s : 0.02% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000006s : 0.00% optimize.overlap_grad_flash_sp : 0.000024s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000031s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000489s : 0.39% validate : 0.000053s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.010273s : 8.11% execute : 0.000006s : 0.00% Time group info: ------[substitution.] 0.000355 75 9.73% : 0.000035s : 5: substitution.arithmetic_simplify 1.99% : 0.000007s : 2: substitution.depend_value_elim 0.80% : 0.000003s : 5: substitution.elim_not_effective 0.62% : 0.000002s : 5: substitution.fold_const_symbol 2.30% : 0.000008s : 8: substitution.graph_param_transform 68.82% : 0.000244s : 8: substitution.inline 1.62% : 0.000006s : 10: substitution.j_node_and_user_rematch 2.32% : 0.000008s : 10: substitution.remove_not_recompute_node 1.16% : 0.000004s : 4: substitution.replace_old_param 4.28% : 0.000015s : 4: substitution.tuple_list_get_item_eliminator 3.44% : 0.000012s : 6: substitution.updatestate_pure_node_eliminater 2.90% : 0.000010s : 8: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.107199 2 97.52% : 0.104542s : 1: type_inference.infer 2.48% : 0.002657s : 1: type_inference.specialize ------[replace.] 0.000098 12 71.13% : 0.000070s : 8: replace.inline 28.87% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000253 12 94.75% : 0.000239s : 8: match.inline 5.25% : 0.000013s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000362 2596 1.01% : 0.000004s : 28: predicate.accumulaten_eliminater 0.63% : 0.000002s : 8: predicate.ad_related_special_op_eliminate 0.55% : 0.000002s : 16: predicate.addn_check_dump 0.94% : 0.000003s : 28: predicate.addn_zero_filter 0.90% : 0.000003s : 28: predicate.adjust_all_reduce_mul_add 2.29% : 0.000008s : 44: predicate.arithmetic_simplify 1.05% : 0.000004s : 28: predicate.cast_eliminate 0.58% : 0.000002s : 16: predicate.check_bprop_eliminate 0.54% : 0.000002s : 16: predicate.compare_switch_simplify 0.20% : 0.000001s : 8: predicate.const_output_eliminate 0.60% : 0.000002s : 16: predicate.depend_value_elim 1.04% : 0.000004s : 28: predicate.dict_get_item_const_eliminator 1.17% : 0.000004s : 28: predicate.dict_get_item_eliminator 0.97% : 0.000004s : 28: predicate.dict_set_item_eliminator 0.71% : 0.000003s : 16: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 8: predicate.elim_not_effective 0.33% : 0.000001s : 8: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000004s : 36: predicate.environ_add_const_eliminate 1.20% : 0.000004s : 36: predicate.environ_get_add_eliminate 1.23% : 0.000004s : 36: predicate.environ_get_depend_swap 1.92% : 0.000007s : 52: predicate.environ_get_eliminate 1.20% : 0.000004s : 36: predicate.environ_get_set_eliminate 1.45% : 0.000005s : 40: predicate.exchange_switch_depend_value 2.05% : 0.000007s : 40: predicate.float_depend_g_call 0.55% : 0.000002s : 16: predicate.float_environ_get_switch 0.83% : 0.000003s : 24: predicate.float_tuple_getitem_switch 0.19% : 0.000001s : 8: predicate.fold_const_symbol 0.63% : 0.000002s : 16: predicate.get_grad_eliminate 0.21% : 0.000001s : 8: predicate.graph_param_transform 0.61% : 0.000002s : 16: predicate.incorporate_call 0.53% : 0.000002s : 16: predicate.incorporate_call_switch 5.82% : 0.000021s : 116: predicate.inline 0.77% : 0.000003s : 16: predicate.inline_without_move 0.34% : 0.000001s : 16: predicate.j_node_and_user_rematch 0.73% : 0.000003s : 16: predicate.less_batch_normalization 1.79% : 0.000006s : 48: predicate.list_to_tuple_eliminator_ 2.58% : 0.000009s : 76: predicate.load_eliminater 0.59% : 0.000002s : 8: predicate.loop_unroll_after_grad 2.62% : 0.000009s : 74: predicate.loop_unroll_before_grad 1.62% : 0.000006s : 44: predicate.make_slice_get_slice_eliminator 0.61% : 0.000002s : 16: predicate.merge_addn 0.55% : 0.000002s : 16: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 16: predicate.mini_step_allgather_replace 0.92% : 0.000003s : 28: predicate.minmaximum_grad 0.75% : 0.000003s : 8: predicate.mutable_eliminate 0.33% : 0.000001s : 8: predicate.opt_reshape 0.32% : 0.000001s : 8: predicate.parallel_virtual_node 1.86% : 0.000007s : 40: predicate.partial_defer_inline 1.55% : 0.000006s : 40: predicate.partial_eliminate 0.98% : 0.000004s : 28: predicate.print_const_string_wrapper 0.58% : 0.000002s : 16: predicate.reduce_all_const_elim 1.27% : 0.000005s : 28: predicate.reduce_eliminate 2.63% : 0.000010s : 76: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 16: predicate.remove_not_recompute_node 1.38% : 0.000005s : 48: predicate.replace_applicator 0.44% : 0.000002s : 16: predicate.replace_old_param 0.25% : 0.000001s : 8: predicate.reset_defer_inline 1.00% : 0.000004s : 28: predicate.reshape_eliminate 0.61% : 0.000002s : 16: predicate.row_tensor_add_zeros_like 0.43% : 0.000002s : 8: predicate.row_tensor_eliminate 0.69% : 0.000002s : 16: predicate.same_eliminate 0.43% : 0.000002s : 16: predicate.set_cell_output_no_recompute 0.66% : 0.000002s : 16: predicate.shard_identity_eliminate 0.73% : 0.000003s : 16: predicate.special_op_eliminate 0.68% : 0.000002s : 16: predicate.specialize_transform 0.73% : 0.000003s : 16: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000003s : 16: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 8: predicate.switch_call_monad_eliminater 1.61% : 0.000006s : 40: predicate.switch_defer_inline 2.18% : 0.000008s : 56: predicate.switch_layer_defer_inline 5.30% : 0.000019s : 138: predicate.switch_simplify 0.98% : 0.000004s : 28: predicate.tile_eliminate 1.02% : 0.000004s : 28: predicate.transpose_eliminate 1.68% : 0.000006s : 44: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000006s : 44: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000006s : 44: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000011s : 64: predicate.tuple_list_get_item_eliminator 1.59% : 0.000006s : 44: predicate.tuple_list_get_set_item_eliminator 2.41% : 0.000009s : 60: predicate.tuple_list_set_item_eliminator 1.79% : 0.000006s : 48: predicate.tuple_to_list_eliminator_ 2.56% : 0.000009s : 76: predicate.updatestate_pure_node_eliminater 3.49% : 0.000013s : 92: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 8: predicate.value_based_eliminate 0.62% : 0.000002s : 16: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 16: predicate.virtual_output_eliminate 0.32% : 0.000001s : 8: predicate.virtual_view_grad_eliminate 0.43% : 0.000002s : 8: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002266 26 60.90% : 0.001380s : 16: func_graph_cloner_run.FuncGraphClonerGraph 39.10% : 0.000886s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.147986 196 0.00% : 0.000003s : 1: ForceFp32Comm 2.40% : 0.003550s : 1: add_attr 2.39% : 0.003541s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000072s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.12% : 0.000185s : 1: auto_monad 0.02% : 0.000036s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.34% : 0.000510s : 1: bootstrap 0.02% : 0.000028s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000022s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000035s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.31% : 0.000460s : 1: event_method 0.01% : 0.000011s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.03% : 0.000041s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.31% : 0.000452s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.34% : 0.000499s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 1.51% : 0.002233s : 78: opt.transform.opt_a 0.04% : 0.000061s : 1: opt.transform.opt_after_cconv 0.03% : 0.000038s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000214s : 28: opt.transform.opt_b 0.05% : 0.000080s : 2: opt.transform.opt_trans_graph 0.04% : 0.000058s : 4: opt.transform.symbol_engine_opt 3.80% : 0.005625s : 1: opt_a 0.11% : 0.000156s : 1: opt_after_cconv 0.34% : 0.000499s : 1: opt_after_jit_grad 0.23% : 0.000342s : 1: opt_b 5.50% : 0.008142s : 1: optimize 0.02% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000027s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.08% : 0.000111s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.98% : 0.001456s : 1: renormalize.infer 0.63% : 0.000930s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000032s : 1: rewriter_after_opt_a 0.21% : 0.000306s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000100s : 1: symbol_engine_optimizer 6.95% : 0.010283s : 1: task_emit 0.07% : 0.000110s : 1: tuple_transform 72.51% : 0.107298s : 1: type_inference 0.06% : 0.000087s : 1: validate TotalTime = 0.267879, [24] [bootstrap]: 0.0006038 [type_inference]: 0.099269 [event_method]: 1.908e-05 [auto_monad]: 6.434e-05 [graph_reusing]: 6.37001e-06 [inline]: 2.81999e-06 [add_attr]: 0.00404192, [1] [add_attr_with_inline]: 0.00403427, [1] [Cycle 1]: 5.855e-05, [2] [tag_attr]: 1.807e-05 [meta_addattr_fg_expand]: 6.11e-06 [parallel-infer-symbol]: 3.11001e-06 [pre_auto_parallel]: 3e-05 [insert-virtual-dataset]: 2.42001e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 1.71e-06 [pipeline_split]: 1.82999e-06 [optimize]: 0.00566194, [53] [py_interpret_to_execute]: 4.3e-06 [rewriter_before_opt_a]: 0.00022002 [opt_a]: 0.0035766, [2] [Cycle 1]: 0.00296956, [45] [expand_dump_flag]: 3.48e-06 [switch_simplify]: 4.449e-05 [loop_unroll]: 3.244e-05 [a_1]: 0.00053342 [with_stream_mark]: 1.391e-05 [recompute_prepare]: 8.36002e-06 [updatestate_depend_eliminate]: 3.68e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 7.732e-05 [accelerated_algorithm]: 7.48e-06 [shard]: 1.91e-06 [meta_shard_fg_expand]: 1.74e-06 [shard_inline]: 6.19001e-06 [merge_send_recv]: 8.81002e-06 [auto_parallel]: 5.47999e-06 [parallel]: 2.61e-05 [flash_sp]: 6.93998e-06 [merge_comm]: 3.53e-06 [allreduce_fusion]: 3.28e-06 [matmul_add_comm_reduction]: 8.55001e-06 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 7.18998e-06 [virtual_dataset]: 6.06998e-06 [get_grad_eliminate_]: 6.02999e-06 [virtual_output]: 6.17999e-06 [merge_forward]: 3.75e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 9.44e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.132e-05 [merge_recompute_call_nodes]: 1.32e-06 [before_grad]: 1.027e-05 [set_forward_comm_id_for_comm_node_pass]: 3.22002e-06 [meta_fg_expand]: 2.70002e-06 [flash_sp_send_recv_attached]: 2.34999e-06 [receive_attached]: 2.36e-06 [after_resolve]: 9.77999e-06 [a_after_grad]: 9.34e-06 [renormalize]: 0.0017502 [add_forward_monad_depend]: 5.08002e-06 [auto_monad_grad]: 1.71998e-06 [auto_monad_eliminator]: 1.431e-05 [cse]: 2.82e-05 [a_3]: 4.403e-05 [Cycle 2]: 0.00059735, [45] [expand_dump_flag]: 9.80013e-07 [switch_simplify]: 7.41999e-06 [loop_unroll]: 6.09001e-06 [a_1]: 0.00012227 [with_stream_mark]: 1.005e-05 [recompute_prepare]: 5.94999e-06 [updatestate_depend_eliminate]: 2.80002e-06 [updatestate_assign_eliminate]: 2.19001e-06 [updatestate_loads_eliminate]: 2.54999e-06 [parameter_eliminate]: 8.29983e-07 [a_2]: 6.831e-05 [accelerated_algorithm]: 5.75001e-06 [shard]: 1.13001e-06 [meta_shard_fg_expand]: 1.17e-06 [shard_inline]: 5.84e-06 [merge_send_recv]: 4.01001e-06 [auto_parallel]: 4.99e-06 [parallel]: 3.81001e-06 [flash_sp]: 2.82002e-06 [merge_comm]: 2.83998e-06 [allreduce_fusion]: 2.52001e-06 [matmul_add_comm_reduction]: 5.04e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 6.26998e-06 [virtual_dataset]: 5.52999e-06 [get_grad_eliminate_]: 5.27001e-06 [virtual_output]: 6.04999e-06 [merge_forward]: 2.64999e-06 [cell_reuse_recompute_pass]: 1.15001e-06 [offload_activation]: 5.40999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.345e-05 [merge_recompute_call_nodes]: 6.50005e-07 [before_grad]: 8.32e-06 [set_forward_comm_id_for_comm_node_pass]: 2.93e-06 [meta_fg_expand]: 1.65001e-06 [flash_sp_send_recv_attached]: 7.89994e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 8.12e-06 [a_after_grad]: 8.55999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.09e-06 [auto_monad_grad]: 9.10019e-07 [auto_monad_eliminator]: 5.63997e-06 [cse]: 1.365e-05 [a_3]: 3.42e-05 [py_interpret_to_execute_after_opt_a]: 3.93001e-06 [slice_cell_reuse_recomputed_activation]: 2.24999e-06 [rewriter_after_opt_a]: 1.611e-05 [convert_after_rewriter]: 1.27999e-06 [order_py_execute_after_rewriter]: 1.17e-06 [mutable_eliminate]: 0.00053466 [opt_b]: 0.00019383, [1] [Cycle 1]: 0.00018783, [7] [b_1]: 0.00011818 [b_2]: 7.36999e-06 [updatestate_depend_eliminate]: 5.17999e-06 [updatestate_assign_eliminate]: 2.24001e-06 [updatestate_loads_eliminate]: 2.21e-06 [renormalize]: 3.39991e-07 [cse]: 1.834e-05 [optimize_parallel_all_gather_comm]: 1.565e-05 [overlap_param_gather]: 2.34001e-06 [cconv]: 2.296e-05 [loop_unroll]: 0.00044 [opt_after_cconv]: 9.601e-05, [1] [Cycle 1]: 9.034e-05, [7] [c_1]: 2.849e-05 [parameter_eliminate]: 2.13002e-06 [updatestate_depend_eliminate]: 4.95001e-06 [updatestate_assign_eliminate]: 2.32999e-06 [updatestate_loads_eliminate]: 2.24001e-06 [cse]: 1.889e-05 [renormalize]: 3.19997e-07 [remove_dup_value]: 1.481e-05 [tuple_transform]: 6.973e-05, [1] [Cycle 1]: 6.571e-05, [4] [d_1]: 4.044e-05 [none_parameter_eliminate]: 1.55001e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.29999e-06 [partial_unused_args_eliminate]: 1.57999e-06 [add_recomputation]: 4.038e-05 [cse_after_recomputation]: 2.154e-05, [1] [Cycle 1]: 1.739e-05, [1] [cse]: 1.256e-05 [environ_conv]: 7.22002e-06 [swap_dp_allreduce_reducescatter]: 4.91002e-06 [bias_add_comm_swap]: 2.99999e-06 [label_micro_interleaved_index]: 4.82e-06 [label_fine_grained_interleaved_index]: 2.81999e-06 [merge_cast_opt]: 1.22e-06 [slice_recompute_activation]: 2.07999e-06 [micro_interleaved_order_control]: 2.68e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 9.20001e-07 [remove_cast_before_assign_add]: 1.07998e-06 [full_micro_interleaved_order_control]: 2.23002e-06 [reorder_send_recv_between_fp_bp]: 2.40002e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.23002e-06 [overlap_opt_shard_in_pipeline]: 5.34e-06 [overlap_opt_shard_grad_in_pipeline]: 1.74e-06 [control_data_broadcast_order]: 1.137e-05 [grouped_pairwise_exchange_alltoall]: 1.49998e-06 [offloading_packed_experts]: 3.81001e-06 [overlap_recompute_and_grad_model_parallel]: 4.74e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.54999e-06 [overlap_grad_ring_attention]: 3.66999e-06 [overlap_grad_flash_sp]: 1.632e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.14e-06 [split_layernorm_comm]: 1.63002e-06 [handle_group_info]: 1.03001e-06 [symbol_engine_optimizer]: 7.027e-05, [1] [Cycle 1]: 6.575e-05, [6] [build]: 2.22999e-06 [elim_shapecalc]: 9.17999e-06 [elim_not_effective]: 1.232e-05 [opt_reshape]: 6.52001e-06 [fold_const_symbol]: 9.45001e-06 [renormalize]: 1.59984e-07 [detach_backward]: 1.60001e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 1.542e-05 [get_jit_bprop_graph]: 1.14e-06 [rewriter_after_jit_bprop_graph]: 3.38e-06 [opt_after_jit_grad]: 0.00049033 [validate]: 3.988e-05 [backend_pass]: 1.02e-06 [task_emit]: 0.157402 [execute]: 8.52e-06 Sums bootstrap : 0.000604s : 0.23% type_inference : 0.099269s : 37.76% event_method : 0.000019s : 0.01% auto_monad : 0.000064s : 0.02% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000018s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000030s : 0.01% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000220s : 0.08% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000052s : 0.02% optimize.opt_a.loop_unroll : 0.000039s : 0.01% optimize.opt_a.a_1 : 0.000656s : 0.25% optimize.opt_a.with_stream_mark : 0.000024s : 0.01% optimize.opt_a.recompute_prepare : 0.000014s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000146s : 0.06% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000012s : 0.00% optimize.opt_a.merge_send_recv : 0.000013s : 0.00% optimize.opt_a.auto_parallel : 0.000010s : 0.00% optimize.opt_a.parallel : 0.000030s : 0.01% optimize.opt_a.flash_sp : 0.000010s : 0.00% optimize.opt_a.merge_comm : 0.000006s : 0.00% optimize.opt_a.allreduce_fusion : 0.000006s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.01% optimize.opt_a.virtual_dataset : 0.000012s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000006s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000015s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000006s : 0.00% optimize.opt_a.meta_fg_expand : 0.000004s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000018s : 0.01% optimize.opt_a.a_after_grad : 0.000018s : 0.01% optimize.opt_a.renormalize : 0.001750s : 0.67% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000020s : 0.01% optimize.opt_a.cse : 0.000042s : 0.02% optimize.opt_a.a_3 : 0.000078s : 0.03% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000016s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000535s : 0.20% optimize.opt_b.b_1 : 0.000118s : 0.04% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.01% optimize.loop_unroll : 0.000440s : 0.17% optimize.opt_after_cconv.c_1 : 0.000028s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000040s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000040s : 0.02% optimize.cse_after_recomputation.cse : 0.000013s : 0.00% optimize.environ_conv : 0.000007s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000011s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000016s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000015s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000490s : 0.19% validate : 0.000040s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.157402s : 59.88% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000147 25 1.26% : 0.000002s : 2: substitution.elim_not_effective 0.86% : 0.000001s : 2: substitution.fold_const_symbol 3.78% : 0.000006s : 4: substitution.graph_param_transform 79.04% : 0.000116s : 5: substitution.inline 2.19% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.83% : 0.000006s : 4: substitution.remove_not_recompute_node 2.03% : 0.000003s : 2: substitution.replace_old_param 7.02% : 0.000010s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.099198 2 97.97% : 0.097181s : 1: type_inference.infer 2.03% : 0.002017s : 1: type_inference.specialize ------[replace.] 0.000055 7 73.82% : 0.000041s : 5: replace.inline 26.18% : 0.000014s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000123 7 92.45% : 0.000114s : 5: match.inline 7.55% : 0.000009s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000188 1267 0.93% : 0.000002s : 13: predicate.accumulaten_eliminater 0.79% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.82% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.06% : 0.000004s : 21: predicate.arithmetic_simplify 0.95% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.58% : 0.000001s : 8: predicate.depend_value_elim 0.91% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.13% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.94% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 17: predicate.environ_get_depend_swap 1.75% : 0.000003s : 25: predicate.environ_get_eliminate 1.07% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.20% : 0.000004s : 20: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 5.75% : 0.000011s : 57: predicate.inline 0.79% : 0.000001s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.19% : 0.000002s : 8: predicate.less_batch_normalization 1.67% : 0.000003s : 23: predicate.list_to_tuple_eliminator_ 2.47% : 0.000005s : 36: predicate.load_eliminater 0.88% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.91% : 0.000005s : 41: predicate.loop_unroll_before_grad 1.72% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 13: predicate.minmaximum_grad 1.10% : 0.000002s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 1.81% : 0.000003s : 20: predicate.partial_defer_inline 1.42% : 0.000003s : 19: predicate.partial_eliminate 0.98% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.29% : 0.000002s : 13: predicate.reduce_eliminate 2.40% : 0.000005s : 36: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 23: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.31% : 0.000001s : 4: predicate.reset_defer_inline 1.05% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.73% : 0.000001s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 8: predicate.shard_identity_eliminate 0.72% : 0.000001s : 8: predicate.special_op_eliminate 0.68% : 0.000001s : 8: predicate.specialize_transform 0.72% : 0.000001s : 8: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.59% : 0.000003s : 20: predicate.switch_defer_inline 2.18% : 0.000004s : 28: predicate.switch_layer_defer_inline 5.65% : 0.000011s : 73: predicate.switch_simplify 1.03% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.59% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000006s : 31: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.69% : 0.000003s : 23: predicate.tuple_to_list_eliminator_ 2.31% : 0.000004s : 36: predicate.updatestate_pure_node_eliminater 3.08% : 0.000006s : 44: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.70% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001808 20 69.05% : 0.001248s : 13: func_graph_cloner_run.FuncGraphClonerGraph 30.95% : 0.000560s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.280558 196 0.00% : 0.000004s : 1: ForceFp32Comm 1.44% : 0.004047s : 1: add_attr 1.44% : 0.004038s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000044s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.02% : 0.000070s : 1: auto_monad 0.01% : 0.000019s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.22% : 0.000629s : 1: bootstrap 0.01% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000014s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000024s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000010s : 1: environ_conv 0.01% : 0.000025s : 1: event_method 0.01% : 0.000015s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.16% : 0.000448s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.19% : 0.000544s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.38% : 0.001059s : 78: opt.transform.opt_a 0.01% : 0.000027s : 1: opt.transform.opt_after_cconv 0.01% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000097s : 28: opt.transform.opt_b 0.02% : 0.000045s : 2: opt.transform.opt_trans_graph 0.01% : 0.000034s : 4: opt.transform.symbol_engine_opt 1.28% : 0.003580s : 1: opt_a 0.04% : 0.000099s : 1: opt_after_cconv 0.18% : 0.000500s : 1: opt_after_jit_grad 0.07% : 0.000197s : 1: opt_b 2.02% : 0.005666s : 1: optimize 0.01% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000020s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000006s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000034s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000018s : 1: remove_dup_value 0.41% : 0.001148s : 1: renormalize.infer 0.21% : 0.000593s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000019s : 1: rewriter_after_opt_a 0.08% : 0.000226s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000073s : 1: symbol_engine_optimizer 56.11% : 0.157421s : 1: task_emit 0.03% : 0.000073s : 1: tuple_transform 35.39% : 0.099284s : 1: type_inference 0.02% : 0.000068s : 1: validate TotalTime = 0.136445, [24] [bootstrap]: 0.00085524 [type_inference]: 0.112927 [event_method]: 2.174e-05 [auto_monad]: 7.172e-05 [graph_reusing]: 6.58e-06 [inline]: 3.53999e-06 [add_attr]: 0.00514166, [1] [add_attr_with_inline]: 0.00512924, [1] [Cycle 1]: 9.951e-05, [2] [tag_attr]: 2.278e-05 [meta_addattr_fg_expand]: 5.76003e-06 [parallel-infer-symbol]: 4.4e-06 [pre_auto_parallel]: 3.945e-05 [insert-virtual-dataset]: 2.32001e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.00721394, [53] [py_interpret_to_execute]: 6.17001e-06 [rewriter_before_opt_a]: 0.00022618 [opt_a]: 0.00473348, [2] [Cycle 1]: 0.00408491, [45] [expand_dump_flag]: 3.48e-06 [switch_simplify]: 4.722e-05 [loop_unroll]: 3.358e-05 [a_1]: 0.00058766 [with_stream_mark]: 1.594e-05 [recompute_prepare]: 8.75999e-06 [updatestate_depend_eliminate]: 4.42e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 8.066e-05 [accelerated_algorithm]: 7.81001e-06 [shard]: 1.77001e-06 [meta_shard_fg_expand]: 2.20002e-06 [shard_inline]: 6.94999e-06 [merge_send_recv]: 9.30001e-06 [auto_parallel]: 5.88998e-06 [parallel]: 7.504e-05 [flash_sp]: 9.42999e-06 [merge_comm]: 4.28001e-06 [allreduce_fusion]: 4.1e-06 [matmul_add_comm_reduction]: 9.59e-06 [allreduce_slice_to_reducescatter]: 9.5999e-07 [virtual_shard_identity]: 9.74999e-06 [virtual_dataset]: 6.64999e-06 [get_grad_eliminate_]: 7.03998e-06 [virtual_output]: 6.33e-06 [merge_forward]: 4.37e-06 [cell_reuse_recompute_pass]: 1.81003e-06 [offload_activation]: 1.16e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.433e-05 [merge_recompute_call_nodes]: 1.60001e-06 [before_grad]: 1.12e-05 [set_forward_comm_id_for_comm_node_pass]: 3.88999e-06 [meta_fg_expand]: 3.17002e-06 [flash_sp_send_recv_attached]: 2.68e-06 [receive_attached]: 2.91e-06 [after_resolve]: 1.065e-05 [a_after_grad]: 1.019e-05 [renormalize]: 0.00262804 [add_forward_monad_depend]: 6.98e-06 [auto_monad_grad]: 2.11e-06 [auto_monad_eliminator]: 1.714e-05 [cse]: 3.211e-05 [a_3]: 5.357e-05 [Cycle 2]: 0.00063639, [45] [expand_dump_flag]: 1.86e-06 [switch_simplify]: 7.83001e-06 [loop_unroll]: 6.84999e-06 [a_1]: 0.00013329 [with_stream_mark]: 1.389e-05 [recompute_prepare]: 6.76e-06 [updatestate_depend_eliminate]: 3.09001e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.63e-06 [parameter_eliminate]: 1.19998e-06 [a_2]: 7.31e-05 [accelerated_algorithm]: 6.34001e-06 [shard]: 1.36002e-06 [meta_shard_fg_expand]: 1.59998e-06 [shard_inline]: 6.16e-06 [merge_send_recv]: 5.32001e-06 [auto_parallel]: 5.82999e-06 [parallel]: 5.75001e-06 [flash_sp]: 3.61001e-06 [merge_comm]: 2.87002e-06 [allreduce_fusion]: 3.08e-06 [matmul_add_comm_reduction]: 6.69999e-06 [allreduce_slice_to_reducescatter]: 6.20028e-07 [virtual_shard_identity]: 7.19001e-06 [virtual_dataset]: 5.94e-06 [get_grad_eliminate_]: 6.19999e-06 [virtual_output]: 5.64998e-06 [merge_forward]: 3.65e-06 [cell_reuse_recompute_pass]: 2.01998e-06 [offload_activation]: 7.63999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.348e-05 [merge_recompute_call_nodes]: 1.14e-06 [before_grad]: 9.44e-06 [set_forward_comm_id_for_comm_node_pass]: 3.09999e-06 [meta_fg_expand]: 2.13998e-06 [flash_sp_send_recv_attached]: 9.39996e-07 [receive_attached]: 1.57001e-06 [after_resolve]: 9.59e-06 [a_after_grad]: 9.04e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 6.39999e-06 [cse]: 1.432e-05 [a_3]: 3.549e-05 [py_interpret_to_execute_after_opt_a]: 6.83998e-06 [slice_cell_reuse_recomputed_activation]: 2.29001e-06 [rewriter_after_opt_a]: 1.841e-05 [convert_after_rewriter]: 1.52999e-06 [order_py_execute_after_rewriter]: 3.945e-05 [mutable_eliminate]: 0.00072744 [opt_b]: 0.00020959, [1] [Cycle 1]: 0.0002028, [7] [b_1]: 0.00012808 [b_2]: 8.57998e-06 [updatestate_depend_eliminate]: 5.96e-06 [updatestate_assign_eliminate]: 2.34999e-06 [updatestate_loads_eliminate]: 2.17999e-06 [renormalize]: 6.10016e-07 [cse]: 2.102e-05 [optimize_parallel_all_gather_comm]: 1.702e-05 [overlap_param_gather]: 3.63999e-06 [cconv]: 2.591e-05 [loop_unroll]: 0.00047521 [opt_after_cconv]: 0.00010278, [1] [Cycle 1]: 9.634e-05, [7] [c_1]: 3.25e-05 [parameter_eliminate]: 2.97002e-06 [updatestate_depend_eliminate]: 5.20999e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.10002e-06 [cse]: 1.841e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.542e-05 [tuple_transform]: 7.689e-05, [1] [Cycle 1]: 7.235e-05, [4] [d_1]: 4.465e-05 [none_parameter_eliminate]: 1.98002e-06 [renormalize]: 1.30007e-07 [switch_simplify]: 7.08e-06 [partial_unused_args_eliminate]: 1.86003e-06 [add_recomputation]: 4.785e-05 [cse_after_recomputation]: 2.147e-05, [1] [Cycle 1]: 1.651e-05, [1] [cse]: 1.133e-05 [environ_conv]: 8.74e-06 [swap_dp_allreduce_reducescatter]: 5.87001e-06 [bias_add_comm_swap]: 3.10002e-06 [label_micro_interleaved_index]: 4.62e-06 [label_fine_grained_interleaved_index]: 2.61999e-06 [merge_cast_opt]: 1.33002e-06 [slice_recompute_activation]: 2.11e-06 [micro_interleaved_order_control]: 2.71e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 1.09998e-06 [remove_cast_before_assign_add]: 1.27999e-06 [full_micro_interleaved_order_control]: 2.59001e-06 [reorder_send_recv_between_fp_bp]: 2.78003e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 4.131e-05 [overlap_opt_shard_grad_in_pipeline]: 2.02001e-06 [control_data_broadcast_order]: 1.266e-05 [grouped_pairwise_exchange_alltoall]: 1.97001e-06 [offloading_packed_experts]: 3.43999e-06 [overlap_recompute_and_grad_model_parallel]: 5.09003e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.48e-06 [overlap_grad_ring_attention]: 4.57e-06 [overlap_grad_flash_sp]: 2.124e-05 [begin_end_overlap_inline]: 4.59986e-07 [split_matmul_comm_elemetwise]: 2.05002e-06 [split_layernorm_comm]: 1.87001e-06 [handle_group_info]: 1.28002e-06 [symbol_engine_optimizer]: 7.796e-05, [1] [Cycle 1]: 7.286e-05, [6] [build]: 3.04999e-06 [elim_shapecalc]: 1.057e-05 [elim_not_effective]: 1.35e-05 [opt_reshape]: 7.66999e-06 [fold_const_symbol]: 1.077e-05 [renormalize]: 1.69995e-07 [detach_backward]: 2.37999e-06 [pipeline_parallel_scheduler]: 1.81e-06 [auto_monad_reorder]: 1.567e-05 [get_jit_bprop_graph]: 2.27999e-06 [rewriter_after_jit_bprop_graph]: 4.22e-06 [opt_after_jit_grad]: 0.00051148 [validate]: 5.424e-05 [backend_pass]: 9.70002e-07 [task_emit]: 0.00926587 [execute]: 8.89e-06 Sums bootstrap : 0.000855s : 0.66% type_inference : 0.112927s : 86.76% event_method : 0.000022s : 0.02% auto_monad : 0.000072s : 0.06% graph_reusing : 0.000007s : 0.01% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000039s : 0.03% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000226s : 0.17% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000055s : 0.04% optimize.opt_a.loop_unroll : 0.000040s : 0.03% optimize.opt_a.a_1 : 0.000721s : 0.55% optimize.opt_a.with_stream_mark : 0.000030s : 0.02% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000154s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000015s : 0.01% optimize.opt_a.auto_parallel : 0.000012s : 0.01% optimize.opt_a.parallel : 0.000081s : 0.06% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000020s : 0.02% optimize.opt_a.a_after_grad : 0.000019s : 0.01% optimize.opt_a.renormalize : 0.002628s : 2.02% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.02% optimize.opt_a.cse : 0.000046s : 0.04% optimize.opt_a.a_3 : 0.000089s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000018s : 0.01% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000039s : 0.03% optimize.mutable_eliminate : 0.000727s : 0.56% optimize.opt_b.b_1 : 0.000128s : 0.10% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.01% optimize.overlap_param_gather : 0.000004s : 0.00% optimize.cconv : 0.000026s : 0.02% optimize.loop_unroll : 0.000475s : 0.37% optimize.opt_after_cconv.c_1 : 0.000032s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000045s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000048s : 0.04% optimize.cse_after_recomputation.cse : 0.000011s : 0.01% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000041s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000021s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000016s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000511s : 0.39% validate : 0.000054s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.009266s : 7.12% execute : 0.000009s : 0.01% Time group info: ------[substitution.] 0.000180 25 1.14% : 0.000002s : 2: substitution.elim_not_effective 0.92% : 0.000002s : 2: substitution.fold_const_symbol 3.47% : 0.000006s : 4: substitution.graph_param_transform 80.34% : 0.000145s : 5: substitution.inline 2.07% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.25% : 0.000006s : 4: substitution.remove_not_recompute_node 2.52% : 0.000005s : 2: substitution.replace_old_param 6.28% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.112826 2 97.54% : 0.110049s : 1: type_inference.infer 2.46% : 0.002777s : 1: type_inference.specialize ------[replace.] 0.000062 7 75.92% : 0.000047s : 5: replace.inline 24.08% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000152 7 93.31% : 0.000142s : 5: match.inline 6.69% : 0.000010s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000203 1267 1.11% : 0.000002s : 13: predicate.accumulaten_eliminater 0.77% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 1.04% : 0.000002s : 13: predicate.addn_zero_filter 0.79% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.13% : 0.000004s : 21: predicate.arithmetic_simplify 1.34% : 0.000003s : 13: predicate.cast_eliminate 0.83% : 0.000002s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.63% : 0.000001s : 8: predicate.depend_value_elim 0.98% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.33% : 0.000001s : 4: predicate.elim_not_effective 0.29% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 17: predicate.environ_get_depend_swap 1.81% : 0.000004s : 25: predicate.environ_get_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.31% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.18% : 0.000004s : 20: predicate.float_depend_g_call 0.60% : 0.000001s : 8: predicate.float_environ_get_switch 0.88% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.99% : 0.000002s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 5.33% : 0.000011s : 57: predicate.inline 0.66% : 0.000001s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.95% : 0.000002s : 8: predicate.less_batch_normalization 1.69% : 0.000003s : 23: predicate.list_to_tuple_eliminator_ 2.24% : 0.000005s : 36: predicate.load_eliminater 0.99% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.89% : 0.000006s : 41: predicate.loop_unroll_before_grad 1.90% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 8: predicate.merge_addn 0.91% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.66% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000002s : 13: predicate.minmaximum_grad 1.41% : 0.000003s : 4: predicate.mutable_eliminate 0.44% : 0.000001s : 4: predicate.opt_reshape 0.67% : 0.000001s : 4: predicate.parallel_virtual_node 1.70% : 0.000003s : 20: predicate.partial_defer_inline 1.31% : 0.000003s : 19: predicate.partial_eliminate 0.89% : 0.000002s : 13: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.30% : 0.000003s : 13: predicate.reduce_eliminate 2.25% : 0.000005s : 36: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 8: predicate.remove_not_recompute_node 1.35% : 0.000003s : 23: predicate.replace_applicator 0.48% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000000s : 4: predicate.reset_defer_inline 1.08% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.52% : 0.000001s : 4: predicate.row_tensor_eliminate 1.00% : 0.000002s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.98% : 0.000002s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 0.87% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 20: predicate.switch_defer_inline 1.98% : 0.000004s : 28: predicate.switch_layer_defer_inline 5.56% : 0.000011s : 73: predicate.switch_simplify 0.98% : 0.000002s : 13: predicate.tile_eliminate 0.92% : 0.000002s : 13: predicate.transpose_eliminate 1.43% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000006s : 31: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.34% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.72% : 0.000003s : 23: predicate.tuple_to_list_eliminator_ 2.22% : 0.000005s : 36: predicate.updatestate_pure_node_eliminater 2.79% : 0.000006s : 44: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.63% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.65% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002074 20 64.01% : 0.001328s : 13: func_graph_cloner_run.FuncGraphClonerGraph 35.99% : 0.000746s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.152755 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.37% : 0.005147s : 1: add_attr 3.36% : 0.005134s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000052s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.05% : 0.000077s : 1: auto_monad 0.01% : 0.000019s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.60% : 0.000918s : 1: bootstrap 0.02% : 0.000029s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000024s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000029s : 1: event_method 0.01% : 0.000016s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.32% : 0.000484s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.48% : 0.000738s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000017s : 1: opt.transform.mutable_eliminate 0.76% : 0.001154s : 78: opt.transform.opt_a 0.02% : 0.000031s : 1: opt.transform.opt_after_cconv 0.02% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000104s : 28: opt.transform.opt_b 0.03% : 0.000049s : 2: opt.transform.opt_trans_graph 0.03% : 0.000039s : 4: opt.transform.symbol_engine_opt 3.10% : 0.004737s : 1: opt_a 0.07% : 0.000106s : 1: opt_after_cconv 0.34% : 0.000522s : 1: opt_after_jit_grad 0.14% : 0.000214s : 1: opt_b 4.73% : 0.007220s : 1: optimize 0.01% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.03% : 0.000043s : 1: order_py_execute_after_rewriter 0.02% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.03% : 0.000045s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000007s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000044s : 1: pre_auto_parallel 0.01% : 0.000010s : 1: py_interpret_to_execute 0.01% : 0.000010s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 1.19% : 0.001824s : 1: renormalize.infer 0.52% : 0.000793s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000022s : 1: rewriter_after_opt_a 0.15% : 0.000233s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000081s : 1: symbol_engine_optimizer 6.08% : 0.009287s : 1: task_emit 0.05% : 0.000080s : 1: tuple_transform 73.95% : 0.112957s : 1: type_inference 0.06% : 0.000096s : 1: validate TotalTime = 0.135964, [24] [bootstrap]: 0.00053828 [type_inference]: 0.114176 [event_method]: 2.027e-05 [auto_monad]: 6.505e-05 [graph_reusing]: 5.64e-06 [inline]: 2.14999e-06 [add_attr]: 0.00404299, [1] [add_attr_with_inline]: 0.00403435, [1] [Cycle 1]: 5.248e-05, [2] [tag_attr]: 1.994e-05 [meta_addattr_fg_expand]: 5.20999e-06 [parallel-infer-symbol]: 3.37002e-06 [pre_auto_parallel]: 3.172e-05 [insert-virtual-dataset]: 2.12999e-06 [parallel-infer-symbol-second]: 6.19999e-07 [dataset_repeat_opt]: 2.74001e-06 [pipeline_split]: 1.47001e-06 [optimize]: 0.00685267, [53] [py_interpret_to_execute]: 4.16001e-06 [rewriter_before_opt_a]: 0.00021549 [opt_a]: 0.0046652, [2] [Cycle 1]: 0.00401418, [45] [expand_dump_flag]: 3.56999e-06 [switch_simplify]: 4.965e-05 [loop_unroll]: 3.78e-05 [a_1]: 0.00070966 [with_stream_mark]: 1.513e-05 [recompute_prepare]: 9.44998e-06 [updatestate_depend_eliminate]: 3.78001e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 4.02e-06 [parameter_eliminate]: 1.60001e-06 [a_2]: 9.03e-05 [accelerated_algorithm]: 7.51999e-06 [shard]: 1.68997e-06 [meta_shard_fg_expand]: 1.81998e-06 [shard_inline]: 6.46e-06 [merge_send_recv]: 8.33999e-06 [auto_parallel]: 5.95002e-06 [parallel]: 1.689e-05 [flash_sp]: 7.33e-06 [merge_comm]: 3.38e-06 [allreduce_fusion]: 3.22002e-06 [matmul_add_comm_reduction]: 9.52001e-06 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 7.45998e-06 [virtual_dataset]: 7.48e-06 [get_grad_eliminate_]: 6.74001e-06 [virtual_output]: 6.78e-06 [merge_forward]: 3.91001e-06 [cell_reuse_recompute_pass]: 1.14e-06 [offload_activation]: 9.19e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.365e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.073e-05 [set_forward_comm_id_for_comm_node_pass]: 3.38e-06 [meta_fg_expand]: 2.62001e-06 [flash_sp_send_recv_attached]: 2.74001e-06 [receive_attached]: 2.05002e-06 [after_resolve]: 1.045e-05 [a_after_grad]: 1.182e-05 [renormalize]: 0.00257513 [add_forward_monad_depend]: 4.73001e-06 [auto_monad_grad]: 2.18998e-06 [auto_monad_eliminator]: 1.473e-05 [cse]: 2.816e-05 [a_3]: 4.738e-05 [Cycle 2]: 0.0006408, [45] [expand_dump_flag]: 1.20001e-06 [switch_simplify]: 7.56999e-06 [loop_unroll]: 6.01e-06 [a_1]: 0.00012737 [with_stream_mark]: 1.092e-05 [recompute_prepare]: 6.35002e-06 [updatestate_depend_eliminate]: 2.81999e-06 [updatestate_assign_eliminate]: 2.27999e-06 [updatestate_loads_eliminate]: 2.74001e-06 [parameter_eliminate]: 9.50007e-07 [a_2]: 7.188e-05 [accelerated_algorithm]: 5.94e-06 [shard]: 1.26002e-06 [meta_shard_fg_expand]: 1.29e-06 [shard_inline]: 6.34999e-06 [merge_send_recv]: 4.05998e-06 [auto_parallel]: 4.90001e-06 [parallel]: 4.79998e-06 [flash_sp]: 2.73e-06 [merge_comm]: 2.79001e-06 [allreduce_fusion]: 2.94001e-06 [matmul_add_comm_reduction]: 5.37001e-06 [allreduce_slice_to_reducescatter]: 4.60015e-07 [virtual_shard_identity]: 6.23e-06 [virtual_dataset]: 6.01998e-06 [get_grad_eliminate_]: 6.07001e-06 [virtual_output]: 5.66003e-06 [merge_forward]: 2.44999e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 6.12999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.211e-05 [merge_recompute_call_nodes]: 7.50006e-07 [before_grad]: 9.12001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.51001e-06 [meta_fg_expand]: 1.87999e-06 [flash_sp_send_recv_attached]: 7.90023e-07 [receive_attached]: 1.22e-06 [after_resolve]: 8.89e-06 [a_after_grad]: 9.00999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.31002e-06 [auto_monad_grad]: 9.29984e-07 [auto_monad_eliminator]: 6.83e-06 [cse]: 1.487e-05 [a_3]: 3.579e-05 [py_interpret_to_execute_after_opt_a]: 4.42e-06 [slice_cell_reuse_recomputed_activation]: 2.24001e-06 [rewriter_after_opt_a]: 1.709e-05 [convert_after_rewriter]: 1.18001e-06 [order_py_execute_after_rewriter]: 1.40001e-06 [mutable_eliminate]: 0.00061826 [opt_b]: 0.00019868, [1] [Cycle 1]: 0.00019245, [7] [b_1]: 0.00012271 [b_2]: 7.50003e-06 [updatestate_depend_eliminate]: 5.24e-06 [updatestate_assign_eliminate]: 2.29001e-06 [updatestate_loads_eliminate]: 2.89999e-06 [renormalize]: 4.19997e-07 [cse]: 1.861e-05 [optimize_parallel_all_gather_comm]: 1.465e-05 [overlap_param_gather]: 2.11998e-06 [cconv]: 2.301e-05 [loop_unroll]: 0.00044194 [opt_after_cconv]: 0.00010007, [1] [Cycle 1]: 9.435e-05, [7] [c_1]: 3.12e-05 [parameter_eliminate]: 2.48e-06 [updatestate_depend_eliminate]: 4.92e-06 [updatestate_assign_eliminate]: 2.27999e-06 [updatestate_loads_eliminate]: 2.47001e-06 [cse]: 1.856e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 1.582e-05 [tuple_transform]: 7.39e-05, [1] [Cycle 1]: 6.976e-05, [4] [d_1]: 4.334e-05 [none_parameter_eliminate]: 1.49e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 7.83999e-06 [partial_unused_args_eliminate]: 1.72999e-06 [add_recomputation]: 4.046e-05 [cse_after_recomputation]: 2.26e-05, [1] [Cycle 1]: 1.879e-05, [1] [cse]: 1.354e-05 [environ_conv]: 6.61e-06 [swap_dp_allreduce_reducescatter]: 4.79e-06 [bias_add_comm_swap]: 2.41e-06 [label_micro_interleaved_index]: 4.24002e-06 [label_fine_grained_interleaved_index]: 2.79999e-06 [merge_cast_opt]: 1.30999e-06 [slice_recompute_activation]: 2.14999e-06 [micro_interleaved_order_control]: 2.35002e-06 [assign_add_opt]: 1.16002e-06 [ForceFp32Comm]: 7.00005e-07 [remove_cast_before_assign_add]: 1.05001e-06 [full_micro_interleaved_order_control]: 1.96998e-06 [reorder_send_recv_between_fp_bp]: 2.44999e-06 [comm_op_add_attrs]: 9.39996e-07 [add_comm_op_reuse_tag]: 9.30013e-07 [interleave_split_concat_branches]: 1.16002e-06 [interleave_parallel_branches]: 1.29e-06 [overlap_opt_shard_in_pipeline]: 1.09998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.74e-06 [control_data_broadcast_order]: 1.073e-05 [grouped_pairwise_exchange_alltoall]: 1.53002e-06 [offloading_packed_experts]: 3.36001e-06 [overlap_recompute_and_grad_model_parallel]: 4.27e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29003e-06 [overlap_recompute_comm]: 2.11e-06 [overlap_grad_ring_attention]: 3.83001e-06 [overlap_grad_flash_sp]: 1.611e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.46998e-06 [split_layernorm_comm]: 1.55001e-06 [handle_group_info]: 9.30013e-07 [symbol_engine_optimizer]: 7.476e-05, [1] [Cycle 1]: 7.016e-05, [6] [build]: 2.32999e-06 [elim_shapecalc]: 8.70999e-06 [elim_not_effective]: 1.164e-05 [opt_reshape]: 7.67002e-06 [fold_const_symbol]: 1.008e-05 [renormalize]: 1.79978e-07 [detach_backward]: 1.52001e-06 [pipeline_parallel_scheduler]: 1.25999e-06 [auto_monad_reorder]: 1.484e-05 [get_jit_bprop_graph]: 9.20001e-07 [rewriter_after_jit_bprop_graph]: 3.08e-06 [opt_after_jit_grad]: 0.00048073 [validate]: 4.083e-05 [backend_pass]: 9.70002e-07 [task_emit]: 0.00946148 [execute]: 6.84001e-06 Sums bootstrap : 0.000538s : 0.41% type_inference : 0.114176s : 87.22% event_method : 0.000020s : 0.02% auto_monad : 0.000065s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000005s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000032s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000215s : 0.16% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000057s : 0.04% optimize.opt_a.loop_unroll : 0.000044s : 0.03% optimize.opt_a.a_1 : 0.000837s : 0.64% optimize.opt_a.with_stream_mark : 0.000026s : 0.02% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000162s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000012s : 0.01% optimize.opt_a.auto_parallel : 0.000011s : 0.01% optimize.opt_a.parallel : 0.000022s : 0.02% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000006s : 0.00% optimize.opt_a.allreduce_fusion : 0.000006s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000006s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000015s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.01% optimize.opt_a.a_after_grad : 0.000021s : 0.02% optimize.opt_a.renormalize : 0.002575s : 1.97% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.02% optimize.opt_a.cse : 0.000043s : 0.03% optimize.opt_a.a_3 : 0.000083s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000017s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000618s : 0.47% optimize.opt_b.b_1 : 0.000123s : 0.09% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.02% optimize.loop_unroll : 0.000442s : 0.34% optimize.opt_after_cconv.c_1 : 0.000031s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.01% optimize.tuple_transform.d_1 : 0.000043s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000040s : 0.03% optimize.cse_after_recomputation.cse : 0.000014s : 0.01% optimize.environ_conv : 0.000007s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000011s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000016s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000015s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000481s : 0.37% validate : 0.000041s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.009461s : 7.23% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.000161 25 0.98% : 0.000002s : 2: substitution.elim_not_effective 1.06% : 0.000002s : 2: substitution.fold_const_symbol 3.58% : 0.000006s : 4: substitution.graph_param_transform 79.43% : 0.000128s : 5: substitution.inline 2.06% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.19% : 0.000005s : 4: substitution.remove_not_recompute_node 1.74% : 0.000003s : 2: substitution.replace_old_param 7.95% : 0.000013s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.114103 2 97.66% : 0.111430s : 1: type_inference.infer 2.34% : 0.002673s : 1: type_inference.specialize ------[replace.] 0.000063 7 72.93% : 0.000046s : 5: replace.inline 27.07% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000136 7 91.75% : 0.000125s : 5: match.inline 8.25% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000192 1267 0.95% : 0.000002s : 13: predicate.accumulaten_eliminater 0.82% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.65% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.85% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.14% : 0.000004s : 21: predicate.arithmetic_simplify 0.91% : 0.000002s : 13: predicate.cast_eliminate 0.77% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.54% : 0.000001s : 8: predicate.depend_value_elim 0.94% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.16% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.27% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.17% : 0.000002s : 17: predicate.environ_get_depend_swap 1.80% : 0.000003s : 25: predicate.environ_get_eliminate 1.15% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.18% : 0.000004s : 20: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.86% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.72% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 5.63% : 0.000011s : 57: predicate.inline 0.68% : 0.000001s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.93% : 0.000002s : 8: predicate.less_batch_normalization 1.65% : 0.000003s : 23: predicate.list_to_tuple_eliminator_ 2.52% : 0.000005s : 36: predicate.load_eliminater 0.85% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.78% : 0.000005s : 41: predicate.loop_unroll_before_grad 1.61% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 1.08% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 1.82% : 0.000003s : 20: predicate.partial_defer_inline 1.37% : 0.000003s : 19: predicate.partial_eliminate 0.93% : 0.000002s : 13: predicate.print_const_string_wrapper 0.59% : 0.000001s : 8: predicate.reduce_all_const_elim 1.39% : 0.000003s : 13: predicate.reduce_eliminate 2.47% : 0.000005s : 36: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 23: predicate.replace_applicator 0.56% : 0.000001s : 8: predicate.replace_old_param 0.30% : 0.000001s : 4: predicate.reset_defer_inline 0.93% : 0.000002s : 13: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 0.72% : 0.000001s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.74% : 0.000001s : 8: predicate.shard_identity_eliminate 0.73% : 0.000001s : 8: predicate.special_op_eliminate 0.74% : 0.000001s : 8: predicate.specialize_transform 0.78% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.57% : 0.000003s : 20: predicate.switch_defer_inline 2.18% : 0.000004s : 28: predicate.switch_layer_defer_inline 5.51% : 0.000011s : 73: predicate.switch_simplify 0.98% : 0.000002s : 13: predicate.tile_eliminate 0.87% : 0.000002s : 13: predicate.transpose_eliminate 1.54% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000006s : 31: predicate.tuple_list_get_item_eliminator 1.65% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 2.02% : 0.000004s : 23: predicate.tuple_to_list_eliminator_ 2.38% : 0.000005s : 36: predicate.updatestate_pure_node_eliminater 3.10% : 0.000006s : 44: predicate.updatestate_useless_node_eliminater 0.47% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002213 20 64.21% : 0.001421s : 13: func_graph_cloner_run.FuncGraphClonerGraph 35.79% : 0.000792s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.150891 196 0.00% : 0.000003s : 1: ForceFp32Comm 2.68% : 0.004047s : 1: add_attr 2.68% : 0.004038s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.03% : 0.000044s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.05% : 0.000070s : 1: auto_monad 0.01% : 0.000018s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.38% : 0.000566s : 1: bootstrap 0.02% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000014s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.02% : 0.000026s : 1: event_method 0.01% : 0.000012s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.30% : 0.000451s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.42% : 0.000628s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.01% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000015s : 1: opt.transform.mutable_eliminate 0.85% : 0.001280s : 78: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.02% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000103s : 28: opt.transform.opt_b 0.03% : 0.000049s : 2: opt.transform.opt_trans_graph 0.02% : 0.000035s : 4: opt.transform.symbol_engine_opt 3.09% : 0.004668s : 1: opt_a 0.07% : 0.000104s : 1: opt_after_cconv 0.33% : 0.000490s : 1: opt_after_jit_grad 0.13% : 0.000202s : 1: opt_b 4.54% : 0.006857s : 1: optimize 0.01% : 0.000018s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000019s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000006s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000003s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.02% : 0.000036s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 1.17% : 0.001762s : 1: renormalize.infer 0.53% : 0.000804s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000020s : 1: rewriter_after_opt_a 0.15% : 0.000222s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000078s : 1: symbol_engine_optimizer 6.28% : 0.009473s : 1: task_emit 0.05% : 0.000077s : 1: tuple_transform 75.68% : 0.114195s : 1: type_inference 0.05% : 0.000073s : 1: validate TotalTime = 0.138618, [24] [bootstrap]: 0.00051633 [type_inference]: 0.116697 [event_method]: 2.048e-05 [auto_monad]: 6.927e-05 [graph_reusing]: 5.76e-06 [inline]: 2.43002e-06 [add_attr]: 0.00401006, [1] [add_attr_with_inline]: 0.00400166, [1] [Cycle 1]: 5.218e-05, [2] [tag_attr]: 1.869e-05 [meta_addattr_fg_expand]: 5.96e-06 [parallel-infer-symbol]: 3.85e-06 [pre_auto_parallel]: 3.085e-05 [insert-virtual-dataset]: 3.01999e-06 [parallel-infer-symbol-second]: 6.39993e-07 [dataset_repeat_opt]: 1.92999e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.00672326, [53] [py_interpret_to_execute]: 4.42998e-06 [rewriter_before_opt_a]: 0.00022654 [opt_a]: 0.00448699, [2] [Cycle 1]: 0.00386127, [45] [expand_dump_flag]: 3.6e-06 [switch_simplify]: 4.629e-05 [loop_unroll]: 3.378e-05 [a_1]: 0.00068019 [with_stream_mark]: 1.437e-05 [recompute_prepare]: 9.46998e-06 [updatestate_depend_eliminate]: 3.78001e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 1.76e-06 [a_2]: 9.289e-05 [accelerated_algorithm]: 7.56001e-06 [shard]: 2.10002e-06 [meta_shard_fg_expand]: 1.91998e-06 [shard_inline]: 7.21001e-06 [merge_send_recv]: 8.54002e-06 [auto_parallel]: 6.14001e-06 [parallel]: 1.782e-05 [flash_sp]: 6.73e-06 [merge_comm]: 3.66001e-06 [allreduce_fusion]: 3.38e-06 [matmul_add_comm_reduction]: 9.22999e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 8.52998e-06 [virtual_dataset]: 7.12002e-06 [get_grad_eliminate_]: 7.63001e-06 [virtual_output]: 6.93998e-06 [merge_forward]: 3.80998e-06 [cell_reuse_recompute_pass]: 1.07e-06 [offload_activation]: 9.07001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.311e-05 [merge_recompute_call_nodes]: 1.74e-06 [before_grad]: 1.13e-05 [set_forward_comm_id_for_comm_node_pass]: 3.61999e-06 [meta_fg_expand]: 2.89001e-06 [flash_sp_send_recv_attached]: 2.67001e-06 [receive_attached]: 2.67001e-06 [after_resolve]: 1.033e-05 [a_after_grad]: 1.1e-05 [renormalize]: 0.00238611 [add_forward_monad_depend]: 5.67999e-06 [auto_monad_grad]: 1.85001e-06 [auto_monad_eliminator]: 1.467e-05 [cse]: 2.991e-05 [a_3]: 4.763e-05 [Cycle 2]: 0.00061529, [45] [expand_dump_flag]: 1.13001e-06 [switch_simplify]: 7.31001e-06 [loop_unroll]: 6.06e-06 [a_1]: 0.00013007 [with_stream_mark]: 1.127e-05 [recompute_prepare]: 6.39001e-06 [updatestate_depend_eliminate]: 2.73998e-06 [updatestate_assign_eliminate]: 2.19001e-06 [updatestate_loads_eliminate]: 2.71e-06 [parameter_eliminate]: 8.40024e-07 [a_2]: 7.267e-05 [accelerated_algorithm]: 6.24001e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 1.27e-06 [shard_inline]: 6.14999e-06 [merge_send_recv]: 4.32e-06 [auto_parallel]: 5.49e-06 [parallel]: 4.52e-06 [flash_sp]: 3.21001e-06 [merge_comm]: 2.91999e-06 [allreduce_fusion]: 2.61e-06 [matmul_add_comm_reduction]: 5.24e-06 [allreduce_slice_to_reducescatter]: 3.4002e-07 [virtual_shard_identity]: 6.68998e-06 [virtual_dataset]: 5.82999e-06 [get_grad_eliminate_]: 5.66e-06 [virtual_output]: 5.69e-06 [merge_forward]: 2.53e-06 [cell_reuse_recompute_pass]: 1.05001e-06 [offload_activation]: 5.51998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.237e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 9.18002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.46001e-06 [meta_fg_expand]: 1.79e-06 [flash_sp_send_recv_attached]: 9.39996e-07 [receive_attached]: 1.09e-06 [after_resolve]: 9.14e-06 [a_after_grad]: 9.12001e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.15999e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 6.09001e-06 [cse]: 1.495e-05 [a_3]: 3.576e-05 [py_interpret_to_execute_after_opt_a]: 4.24997e-06 [slice_cell_reuse_recomputed_activation]: 2.25002e-06 [rewriter_after_opt_a]: 1.676e-05 [convert_after_rewriter]: 1.15001e-06 [order_py_execute_after_rewriter]: 1.05001e-06 [mutable_eliminate]: 0.00064226 [opt_b]: 0.00019938, [1] [Cycle 1]: 0.0001929, [7] [b_1]: 0.00012178 [b_2]: 7.83001e-06 [updatestate_depend_eliminate]: 5.13002e-06 [updatestate_assign_eliminate]: 2.40002e-06 [updatestate_loads_eliminate]: 2.09999e-06 [renormalize]: 3.80009e-07 [cse]: 2.082e-05 [optimize_parallel_all_gather_comm]: 1.532e-05 [overlap_param_gather]: 1.84e-06 [cconv]: 2.277e-05 [loop_unroll]: 0.00044709 [opt_after_cconv]: 0.00010018, [1] [Cycle 1]: 9.446e-05, [7] [c_1]: 3.097e-05 [parameter_eliminate]: 2.44001e-06 [updatestate_depend_eliminate]: 4.99003e-06 [updatestate_assign_eliminate]: 2.29999e-06 [updatestate_loads_eliminate]: 2.68003e-06 [cse]: 1.904e-05 [renormalize]: 3.4002e-07 [remove_dup_value]: 1.484e-05 [tuple_transform]: 7.376e-05, [1] [Cycle 1]: 6.993e-05, [4] [d_1]: 4.378e-05 [none_parameter_eliminate]: 1.57999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.51999e-06 [partial_unused_args_eliminate]: 1.77999e-06 [add_recomputation]: 4.153e-05 [cse_after_recomputation]: 2.327e-05, [1] [Cycle 1]: 1.921e-05, [1] [cse]: 1.368e-05 [environ_conv]: 7.11001e-06 [swap_dp_allreduce_reducescatter]: 5.05999e-06 [bias_add_comm_swap]: 2.55002e-06 [label_micro_interleaved_index]: 4.53001e-06 [label_fine_grained_interleaved_index]: 2.84999e-06 [merge_cast_opt]: 1.30001e-06 [slice_recompute_activation]: 2.02999e-06 [micro_interleaved_order_control]: 2.08002e-06 [assign_add_opt]: 1.10999e-06 [ForceFp32Comm]: 7.00005e-07 [remove_cast_before_assign_add]: 1.04e-06 [full_micro_interleaved_order_control]: 2.78e-06 [reorder_send_recv_between_fp_bp]: 2.34999e-06 [comm_op_add_attrs]: 1.17e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.07e-06 [interleave_parallel_branches]: 1.04003e-06 [overlap_opt_shard_in_pipeline]: 1.22e-06 [overlap_opt_shard_grad_in_pipeline]: 1.91e-06 [control_data_broadcast_order]: 1.161e-05 [grouped_pairwise_exchange_alltoall]: 1.39e-06 [offloading_packed_experts]: 3.18e-06 [overlap_recompute_and_grad_model_parallel]: 5.04e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.46002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 2.01e-06 [overlap_grad_ring_attention]: 3.66999e-06 [overlap_grad_flash_sp]: 1.615e-05 [begin_end_overlap_inline]: 4.7998e-07 [split_matmul_comm_elemetwise]: 2.02999e-06 [split_layernorm_comm]: 1.59998e-06 [handle_group_info]: 9.49978e-07 [symbol_engine_optimizer]: 7.694e-05, [1] [Cycle 1]: 7.228e-05, [6] [build]: 2.15002e-06 [elim_shapecalc]: 1.325e-05 [elim_not_effective]: 1.217e-05 [opt_reshape]: 7.6e-06 [fold_const_symbol]: 1.021e-05 [renormalize]: 2.19996e-07 [detach_backward]: 1.92999e-06 [pipeline_parallel_scheduler]: 1.30999e-06 [auto_monad_reorder]: 1.573e-05 [get_jit_bprop_graph]: 1.14e-06 [rewriter_after_jit_bprop_graph]: 3.36001e-06 [opt_after_jit_grad]: 0.00049045 [validate]: 4.059e-05 [backend_pass]: 9.99979e-07 [task_emit]: 0.00976402 [execute]: 7.32997e-06 Sums bootstrap : 0.000516s : 0.39% type_inference : 0.116697s : 87.37% event_method : 0.000020s : 0.02% auto_monad : 0.000069s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000031s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000227s : 0.17% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000054s : 0.04% optimize.opt_a.loop_unroll : 0.000040s : 0.03% optimize.opt_a.a_1 : 0.000810s : 0.61% optimize.opt_a.with_stream_mark : 0.000026s : 0.02% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000166s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000013s : 0.01% optimize.opt_a.auto_parallel : 0.000012s : 0.01% optimize.opt_a.parallel : 0.000022s : 0.02% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000006s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000006s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000015s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.01% optimize.opt_a.a_after_grad : 0.000020s : 0.02% optimize.opt_a.renormalize : 0.002386s : 1.79% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.02% optimize.opt_a.cse : 0.000045s : 0.03% optimize.opt_a.a_3 : 0.000083s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000017s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000642s : 0.48% optimize.opt_b.b_1 : 0.000122s : 0.09% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.02% optimize.loop_unroll : 0.000447s : 0.33% optimize.opt_after_cconv.c_1 : 0.000031s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000044s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000042s : 0.03% optimize.cse_after_recomputation.cse : 0.000014s : 0.01% optimize.environ_conv : 0.000007s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000016s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000016s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000490s : 0.37% validate : 0.000041s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.009764s : 7.31% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.000161 25 0.98% : 0.000002s : 2: substitution.elim_not_effective 1.06% : 0.000002s : 2: substitution.fold_const_symbol 3.41% : 0.000005s : 4: substitution.graph_param_transform 80.65% : 0.000130s : 5: substitution.inline 1.88% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.75% : 0.000004s : 4: substitution.remove_not_recompute_node 1.83% : 0.000003s : 2: substitution.replace_old_param 7.45% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.116624 2 97.71% : 0.113949s : 1: type_inference.infer 2.29% : 0.002675s : 1: type_inference.specialize ------[replace.] 0.000063 7 73.40% : 0.000047s : 5: replace.inline 26.60% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000137 7 92.45% : 0.000127s : 5: match.inline 7.55% : 0.000010s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000196 1267 0.92% : 0.000002s : 13: predicate.accumulaten_eliminater 0.80% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 8: predicate.addn_check_dump 1.11% : 0.000002s : 13: predicate.addn_zero_filter 0.86% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.24% : 0.000004s : 21: predicate.arithmetic_simplify 0.94% : 0.000002s : 13: predicate.cast_eliminate 0.60% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.53% : 0.000001s : 8: predicate.depend_value_elim 0.98% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.22% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.87% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_depend_swap 1.77% : 0.000003s : 25: predicate.environ_get_eliminate 1.11% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.24% : 0.000004s : 20: predicate.float_depend_g_call 0.58% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 4: predicate.fold_const_symbol 0.72% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 5.64% : 0.000011s : 57: predicate.inline 0.73% : 0.000001s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 8: predicate.less_batch_normalization 1.81% : 0.000004s : 23: predicate.list_to_tuple_eliminator_ 2.28% : 0.000004s : 36: predicate.load_eliminater 1.01% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.69% : 0.000005s : 41: predicate.loop_unroll_before_grad 1.60% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 8: predicate.merge_addn 0.61% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 13: predicate.minmaximum_grad 1.18% : 0.000002s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.87% : 0.000004s : 20: predicate.partial_defer_inline 1.36% : 0.000003s : 19: predicate.partial_eliminate 0.92% : 0.000002s : 13: predicate.print_const_string_wrapper 0.62% : 0.000001s : 8: predicate.reduce_all_const_elim 1.28% : 0.000003s : 13: predicate.reduce_eliminate 2.53% : 0.000005s : 36: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.36% : 0.000003s : 23: predicate.replace_applicator 0.53% : 0.000001s : 8: predicate.replace_old_param 0.30% : 0.000001s : 4: predicate.reset_defer_inline 1.22% : 0.000002s : 13: predicate.reshape_eliminate 0.80% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 4: predicate.row_tensor_eliminate 0.68% : 0.000001s : 8: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 8: predicate.shard_identity_eliminate 0.79% : 0.000002s : 8: predicate.special_op_eliminate 0.71% : 0.000001s : 8: predicate.specialize_transform 0.85% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.49% : 0.000003s : 20: predicate.switch_defer_inline 2.04% : 0.000004s : 28: predicate.switch_layer_defer_inline 5.50% : 0.000011s : 73: predicate.switch_simplify 0.90% : 0.000002s : 13: predicate.tile_eliminate 0.94% : 0.000002s : 13: predicate.transpose_eliminate 1.52% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000006s : 31: predicate.tuple_list_get_item_eliminator 1.57% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.84% : 0.000004s : 23: predicate.tuple_to_list_eliminator_ 2.32% : 0.000005s : 36: predicate.updatestate_pure_node_eliminater 3.08% : 0.000006s : 44: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.31% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002098 20 64.09% : 0.001345s : 13: func_graph_cloner_run.FuncGraphClonerGraph 35.91% : 0.000753s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.153167 196 0.00% : 0.000003s : 1: ForceFp32Comm 2.62% : 0.004015s : 1: add_attr 2.61% : 0.004005s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.03% : 0.000045s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.05% : 0.000075s : 1: auto_monad 0.01% : 0.000019s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.36% : 0.000545s : 1: bootstrap 0.02% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.02% : 0.000026s : 1: event_method 0.01% : 0.000012s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.30% : 0.000456s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.43% : 0.000652s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 0.82% : 0.001248s : 78: opt.transform.opt_a 0.02% : 0.000029s : 1: opt.transform.opt_after_cconv 0.02% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000103s : 28: opt.transform.opt_b 0.03% : 0.000049s : 2: opt.transform.opt_trans_graph 0.03% : 0.000040s : 4: opt.transform.symbol_engine_opt 2.93% : 0.004490s : 1: opt_a 0.07% : 0.000104s : 1: opt_after_cconv 0.33% : 0.000500s : 1: opt_after_jit_grad 0.13% : 0.000203s : 1: opt_b 4.39% : 0.006728s : 1: optimize 0.01% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000019s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000006s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000035s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000018s : 1: remove_dup_value 1.04% : 0.001590s : 1: renormalize.infer 0.51% : 0.000787s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000020s : 1: rewriter_after_opt_a 0.15% : 0.000232s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000080s : 1: symbol_engine_optimizer 6.38% : 0.009775s : 1: task_emit 0.05% : 0.000076s : 1: tuple_transform 76.20% : 0.116718s : 1: type_inference 0.05% : 0.000069s : 1: validate TotalTime = 0.141263, [24] [bootstrap]: 0.00049908 [type_inference]: 0.11737 [event_method]: 2.041e-05 [auto_monad]: 6.767e-05 [graph_reusing]: 5.82999e-06 [inline]: 2.04e-06 [add_attr]: 0.00409116, [1] [add_attr_with_inline]: 0.00408299, [1] [Cycle 1]: 5.56e-05, [2] [tag_attr]: 2.047e-05 [meta_addattr_fg_expand]: 5.44998e-06 [parallel-infer-symbol]: 4.28001e-06 [pre_auto_parallel]: 3.14e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.52999e-06 [optimize]: 0.00679688, [53] [py_interpret_to_execute]: 4.4e-06 [rewriter_before_opt_a]: 0.00022402 [opt_a]: 0.00450492, [2] [Cycle 1]: 0.00387612, [45] [expand_dump_flag]: 3.99002e-06 [switch_simplify]: 4.493e-05 [loop_unroll]: 3.393e-05 [a_1]: 0.00069026 [with_stream_mark]: 1.473e-05 [recompute_prepare]: 9.65002e-06 [updatestate_depend_eliminate]: 4.05e-06 [updatestate_assign_eliminate]: 3.21999e-06 [updatestate_loads_eliminate]: 3.08e-06 [parameter_eliminate]: 1.71e-06 [a_2]: 0.00012825 [accelerated_algorithm]: 8.54998e-06 [shard]: 1.87999e-06 [meta_shard_fg_expand]: 2.04999e-06 [shard_inline]: 7.36001e-06 [merge_send_recv]: 8.14002e-06 [auto_parallel]: 5.91e-06 [parallel]: 1.809e-05 [flash_sp]: 7.26001e-06 [merge_comm]: 3.48999e-06 [allreduce_fusion]: 3.48e-06 [matmul_add_comm_reduction]: 8.72e-06 [allreduce_slice_to_reducescatter]: 6.90023e-07 [virtual_shard_identity]: 1.025e-05 [virtual_dataset]: 8.37e-06 [get_grad_eliminate_]: 9.39998e-06 [virtual_output]: 9.29e-06 [merge_forward]: 3.98001e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 9.36002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.347e-05 [merge_recompute_call_nodes]: 1.55001e-06 [before_grad]: 1.141e-05 [set_forward_comm_id_for_comm_node_pass]: 3.75e-06 [meta_fg_expand]: 2.99001e-06 [flash_sp_send_recv_attached]: 2.73003e-06 [receive_attached]: 2.58e-06 [after_resolve]: 1.113e-05 [a_after_grad]: 1.105e-05 [renormalize]: 0.00240203 [add_forward_monad_depend]: 5.22999e-06 [auto_monad_grad]: 2.10002e-06 [auto_monad_eliminator]: 1.486e-05 [cse]: 3.008e-05 [a_3]: 4.691e-05 [Cycle 2]: 0.00061864, [45] [expand_dump_flag]: 1.24e-06 [switch_simplify]: 7.55e-06 [loop_unroll]: 6.61999e-06 [a_1]: 0.00012888 [with_stream_mark]: 1.062e-05 [recompute_prepare]: 6.69999e-06 [updatestate_depend_eliminate]: 3.04001e-06 [updatestate_assign_eliminate]: 2.21e-06 [updatestate_loads_eliminate]: 2.74001e-06 [parameter_eliminate]: 8.90024e-07 [a_2]: 7.141e-05 [accelerated_algorithm]: 6.24001e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 1.23002e-06 [shard_inline]: 6.34999e-06 [merge_send_recv]: 4.35999e-06 [auto_parallel]: 5.29e-06 [parallel]: 4.18001e-06 [flash_sp]: 2.94999e-06 [merge_comm]: 3.23998e-06 [allreduce_fusion]: 2.57001e-06 [matmul_add_comm_reduction]: 5.64e-06 [allreduce_slice_to_reducescatter]: 3.9002e-07 [virtual_shard_identity]: 6.78e-06 [virtual_dataset]: 6.01e-06 [get_grad_eliminate_]: 5.89e-06 [virtual_output]: 5.86998e-06 [merge_forward]: 2.66999e-06 [cell_reuse_recompute_pass]: 1.55001e-06 [offload_activation]: 5.49e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.237e-05 [merge_recompute_call_nodes]: 6.99976e-07 [before_grad]: 9.64e-06 [set_forward_comm_id_for_comm_node_pass]: 3.70998e-06 [meta_fg_expand]: 1.91e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 1.19e-06 [after_resolve]: 9.27999e-06 [a_after_grad]: 8.87e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.01997e-06 [auto_monad_grad]: 9.70002e-07 [auto_monad_eliminator]: 6.11e-06 [cse]: 1.383e-05 [a_3]: 3.609e-05 [py_interpret_to_execute_after_opt_a]: 4.63999e-06 [slice_cell_reuse_recomputed_activation]: 2.34001e-06 [rewriter_after_opt_a]: 1.645e-05 [convert_after_rewriter]: 1.45001e-06 [order_py_execute_after_rewriter]: 1.39e-06 [mutable_eliminate]: 0.00067347 [opt_b]: 0.00020052, [1] [Cycle 1]: 0.00019444, [7] [b_1]: 0.00012338 [b_2]: 8.28001e-06 [updatestate_depend_eliminate]: 5.27001e-06 [updatestate_assign_eliminate]: 2.31998e-06 [updatestate_loads_eliminate]: 2.11e-06 [renormalize]: 3.09985e-07 [cse]: 1.994e-05 [optimize_parallel_all_gather_comm]: 1.527e-05 [overlap_param_gather]: 1.87999e-06 [cconv]: 2.321e-05 [loop_unroll]: 0.00046052 [opt_after_cconv]: 0.00010181, [1] [Cycle 1]: 9.607e-05, [7] [c_1]: 3.153e-05 [parameter_eliminate]: 2.49001e-06 [updatestate_depend_eliminate]: 4.99e-06 [updatestate_assign_eliminate]: 2.29001e-06 [updatestate_loads_eliminate]: 2.68e-06 [cse]: 1.965e-05 [renormalize]: 2.60014e-07 [remove_dup_value]: 1.554e-05 [tuple_transform]: 7.465e-05, [1] [Cycle 1]: 7.034e-05, [4] [d_1]: 4.404e-05 [none_parameter_eliminate]: 1.66002e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.34002e-06 [partial_unused_args_eliminate]: 1.81e-06 [add_recomputation]: 4.156e-05 [cse_after_recomputation]: 2.197e-05, [1] [Cycle 1]: 1.81e-05, [1] [cse]: 1.279e-05 [environ_conv]: 7.56999e-06 [swap_dp_allreduce_reducescatter]: 4.96002e-06 [bias_add_comm_swap]: 2.43e-06 [label_micro_interleaved_index]: 4.28999e-06 [label_fine_grained_interleaved_index]: 2.89001e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 2.05002e-06 [micro_interleaved_order_control]: 2.94999e-06 [assign_add_opt]: 1.32999e-06 [ForceFp32Comm]: 7.39994e-07 [remove_cast_before_assign_add]: 1.05001e-06 [full_micro_interleaved_order_control]: 2.27999e-06 [reorder_send_recv_between_fp_bp]: 2.68998e-06 [comm_op_add_attrs]: 1.37999e-06 [add_comm_op_reuse_tag]: 1.03001e-06 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.29e-06 [overlap_opt_shard_in_pipeline]: 1.37e-06 [overlap_opt_shard_grad_in_pipeline]: 1.84e-06 [control_data_broadcast_order]: 1.631e-05 [grouped_pairwise_exchange_alltoall]: 1.74998e-06 [offloading_packed_experts]: 3.53e-06 [overlap_recompute_and_grad_model_parallel]: 4.54002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.59e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.48e-06 [overlap_grad_ring_attention]: 3.65998e-06 [overlap_grad_flash_sp]: 1.494e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.19999e-06 [split_layernorm_comm]: 1.72001e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 7.766e-05, [1] [Cycle 1]: 7.295e-05, [6] [build]: 2.32999e-06 [elim_shapecalc]: 1.119e-05 [elim_not_effective]: 1.35e-05 [opt_reshape]: 7.71001e-06 [fold_const_symbol]: 1.072e-05 [renormalize]: 2.29978e-07 [detach_backward]: 1.74998e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 1.596e-05 [get_jit_bprop_graph]: 9.39996e-07 [rewriter_after_jit_bprop_graph]: 4.85001e-06 [opt_after_jit_grad]: 0.00066022 [validate]: 4.176e-05 [backend_pass]: 9.09989e-07 [task_emit]: 0.0114254 [execute]: 6.63003e-06 Sums bootstrap : 0.000499s : 0.37% type_inference : 0.117370s : 86.19% event_method : 0.000020s : 0.01% auto_monad : 0.000068s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000005s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000031s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000224s : 0.16% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000052s : 0.04% optimize.opt_a.loop_unroll : 0.000041s : 0.03% optimize.opt_a.a_1 : 0.000819s : 0.60% optimize.opt_a.with_stream_mark : 0.000025s : 0.02% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000200s : 0.15% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000013s : 0.01% optimize.opt_a.auto_parallel : 0.000011s : 0.01% optimize.opt_a.parallel : 0.000022s : 0.02% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000006s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000015s : 0.01% optimize.opt_a.virtual_output : 0.000015s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000015s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000020s : 0.01% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.002402s : 1.76% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.02% optimize.opt_a.cse : 0.000044s : 0.03% optimize.opt_a.a_3 : 0.000083s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000016s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000673s : 0.49% optimize.opt_b.b_1 : 0.000123s : 0.09% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.02% optimize.loop_unroll : 0.000461s : 0.34% optimize.opt_after_cconv.c_1 : 0.000032s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.01% optimize.tuple_transform.d_1 : 0.000044s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000042s : 0.03% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000015s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000016s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000660s : 0.48% validate : 0.000042s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.011425s : 8.39% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.000164 25 1.03% : 0.000002s : 2: substitution.elim_not_effective 1.08% : 0.000002s : 2: substitution.fold_const_symbol 3.48% : 0.000006s : 4: substitution.graph_param_transform 80.13% : 0.000131s : 5: substitution.inline 1.85% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.86% : 0.000005s : 4: substitution.remove_not_recompute_node 2.12% : 0.000003s : 2: substitution.replace_old_param 7.45% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.117296 2 97.70% : 0.114593s : 1: type_inference.infer 2.30% : 0.002703s : 1: type_inference.specialize ------[replace.] 0.000064 7 73.64% : 0.000047s : 5: replace.inline 26.36% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000139 7 92.37% : 0.000128s : 5: match.inline 7.63% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000196 1267 0.98% : 0.000002s : 13: predicate.accumulaten_eliminater 0.90% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.89% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.24% : 0.000004s : 21: predicate.arithmetic_simplify 0.92% : 0.000002s : 13: predicate.cast_eliminate 0.60% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.55% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.86% : 0.000002s : 13: predicate.dict_set_item_eliminator 1.13% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 4: predicate.elim_not_effective 0.32% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.15% : 0.000002s : 17: predicate.environ_get_depend_swap 1.80% : 0.000004s : 25: predicate.environ_get_eliminate 1.20% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.37% : 0.000005s : 20: predicate.float_depend_g_call 0.56% : 0.000001s : 8: predicate.float_environ_get_switch 0.84% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.79% : 0.000002s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.60% : 0.000001s : 8: predicate.incorporate_call 0.51% : 0.000001s : 8: predicate.incorporate_call_switch 5.56% : 0.000011s : 57: predicate.inline 0.74% : 0.000001s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 8: predicate.less_batch_normalization 1.85% : 0.000004s : 23: predicate.list_to_tuple_eliminator_ 2.48% : 0.000005s : 36: predicate.load_eliminater 0.89% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.81% : 0.000005s : 41: predicate.loop_unroll_before_grad 1.65% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.64% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 13: predicate.minmaximum_grad 1.15% : 0.000002s : 4: predicate.mutable_eliminate 0.42% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.92% : 0.000004s : 20: predicate.partial_defer_inline 1.34% : 0.000003s : 19: predicate.partial_eliminate 0.91% : 0.000002s : 13: predicate.print_const_string_wrapper 0.62% : 0.000001s : 8: predicate.reduce_all_const_elim 1.18% : 0.000002s : 13: predicate.reduce_eliminate 2.36% : 0.000005s : 36: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000003s : 23: predicate.replace_applicator 0.41% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 13: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000001s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.95% : 0.000002s : 8: predicate.shard_identity_eliminate 0.80% : 0.000002s : 8: predicate.special_op_eliminate 0.64% : 0.000001s : 8: predicate.specialize_transform 0.73% : 0.000001s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.53% : 0.000003s : 20: predicate.switch_defer_inline 2.10% : 0.000004s : 28: predicate.switch_layer_defer_inline 5.57% : 0.000011s : 73: predicate.switch_simplify 0.94% : 0.000002s : 13: predicate.tile_eliminate 0.94% : 0.000002s : 13: predicate.transpose_eliminate 1.62% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000006s : 31: predicate.tuple_list_get_item_eliminator 1.65% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000005s : 29: predicate.tuple_list_set_item_eliminator 1.62% : 0.000003s : 23: predicate.tuple_to_list_eliminator_ 2.22% : 0.000004s : 36: predicate.updatestate_pure_node_eliminater 3.12% : 0.000006s : 44: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.73% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.73% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002130 20 63.18% : 0.001346s : 13: func_graph_cloner_run.FuncGraphClonerGraph 36.82% : 0.000784s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.156176 196 0.00% : 0.000003s : 1: ForceFp32Comm 2.62% : 0.004096s : 1: add_attr 2.62% : 0.004086s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000046s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.05% : 0.000073s : 1: auto_monad 0.01% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.34% : 0.000526s : 1: bootstrap 0.02% : 0.000027s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.01% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.02% : 0.000027s : 1: event_method 0.01% : 0.000012s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000009s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.30% : 0.000470s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000011s : 1: micro_interleaved_order_control 0.44% : 0.000684s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 0.81% : 0.001269s : 78: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.13% : 0.000197s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000104s : 28: opt.transform.opt_b 0.03% : 0.000049s : 2: opt.transform.opt_trans_graph 0.03% : 0.000039s : 4: opt.transform.symbol_engine_opt 2.89% : 0.004508s : 1: opt_a 0.07% : 0.000106s : 1: opt_after_cconv 0.43% : 0.000672s : 1: opt_after_jit_grad 0.13% : 0.000204s : 1: opt_b 4.35% : 0.006801s : 1: optimize 0.01% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000018s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000006s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000036s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 1.01% : 0.001585s : 1: renormalize.infer 0.52% : 0.000808s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000020s : 1: rewriter_after_opt_a 0.15% : 0.000230s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000080s : 1: symbol_engine_optimizer 7.32% : 0.011437s : 1: task_emit 0.05% : 0.000077s : 1: tuple_transform 75.17% : 0.117390s : 1: type_inference 0.04% : 0.000070s : 1: validate [WARNING] SESSION(15774,ffffbf434f30,python3.9):2026-01-29-17:47:05.432.186 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:266] IbTryExpandCNode] After expanding cnode Default/Index-op0, the new abstract of Expand/_Index/InnerIndex-op0 does not match original cnode's abstract. new: AbstractTensor(shape: (2, 5, 6, 7), element: AbstractScalar(Type: Float64, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), old: AbstractTensor(shape: (-2), element: AbstractScalar(Type: Float64, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny) TotalTime = 0.148338, [24] [bootstrap]: 0.00068702 [type_inference]: 0.124305 [event_method]: 2.053e-05 [auto_monad]: 6.551e-05 [graph_reusing]: 5.90002e-06 [inline]: 2.70002e-06 [add_attr]: 0.00444315, [1] [add_attr_with_inline]: 0.0044334, [1] [Cycle 1]: 5.285e-05, [2] [tag_attr]: 1.918e-05 [meta_addattr_fg_expand]: 5.91998e-06 [parallel-infer-symbol]: 3.46999e-06 [pre_auto_parallel]: 3.099e-05 [insert-virtual-dataset]: 2.98e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 1.84e-06 [pipeline_split]: 1.52001e-06 [optimize]: 0.00705505, [53] [py_interpret_to_execute]: 4.82e-06 [rewriter_before_opt_a]: 0.00019859 [opt_a]: 0.00462796, [2] [Cycle 1]: 0.00400802, [45] [expand_dump_flag]: 3.48e-06 [switch_simplify]: 4.472e-05 [loop_unroll]: 3.323e-05 [a_1]: 0.00069173 [with_stream_mark]: 1.468e-05 [recompute_prepare]: 9.70002e-06 [updatestate_depend_eliminate]: 4.71002e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 1.67001e-06 [a_2]: 9.127e-05 [accelerated_algorithm]: 7.58999e-06 [shard]: 1.92001e-06 [meta_shard_fg_expand]: 1.97001e-06 [shard_inline]: 6.62002e-06 [merge_send_recv]: 8.16002e-06 [auto_parallel]: 5.57999e-06 [parallel]: 2.154e-05 [flash_sp]: 7.01001e-06 [merge_comm]: 4.49002e-06 [allreduce_fusion]: 3.04999e-06 [matmul_add_comm_reduction]: 8.92999e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 7.93999e-06 [virtual_dataset]: 7.4e-06 [get_grad_eliminate_]: 6.76999e-06 [virtual_output]: 7.87003e-06 [merge_forward]: 3.80998e-06 [cell_reuse_recompute_pass]: 1.39998e-06 [offload_activation]: 9.02e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.276e-05 [merge_recompute_call_nodes]: 1.28002e-06 [before_grad]: 1.165e-05 [set_forward_comm_id_for_comm_node_pass]: 3.54002e-06 [meta_fg_expand]: 2.83998e-06 [flash_sp_send_recv_attached]: 2.66999e-06 [receive_attached]: 2.46998e-06 [after_resolve]: 1.036e-05 [a_after_grad]: 1.167e-05 [renormalize]: 0.00258244 [add_forward_monad_depend]: 5.44998e-06 [auto_monad_grad]: 2.50997e-06 [auto_monad_eliminator]: 1.573e-05 [cse]: 2.807e-05 [a_3]: 4.723e-05 [Cycle 2]: 0.00060915, [45] [expand_dump_flag]: 1.15999e-06 [switch_simplify]: 7.33999e-06 [loop_unroll]: 6.11e-06 [a_1]: 0.000128 [with_stream_mark]: 1.111e-05 [recompute_prepare]: 6.58998e-06 [updatestate_depend_eliminate]: 3.01999e-06 [updatestate_assign_eliminate]: 2.32001e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 8.2e-07 [a_2]: 7.112e-05 [accelerated_algorithm]: 6.04999e-06 [shard]: 1.10001e-06 [meta_shard_fg_expand]: 1.40999e-06 [shard_inline]: 6.38e-06 [merge_send_recv]: 4.52e-06 [auto_parallel]: 4.90999e-06 [parallel]: 4.48001e-06 [flash_sp]: 2.84001e-06 [merge_comm]: 2.98998e-06 [allreduce_fusion]: 2.53e-06 [matmul_add_comm_reduction]: 5.14e-06 [allreduce_slice_to_reducescatter]: 3.9002e-07 [virtual_shard_identity]: 6.61e-06 [virtual_dataset]: 5.84e-06 [get_grad_eliminate_]: 5.81e-06 [virtual_output]: 5.53002e-06 [merge_forward]: 3.08998e-06 [cell_reuse_recompute_pass]: 1.10999e-06 [offload_activation]: 5.96998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.223e-05 [merge_recompute_call_nodes]: 6.60017e-07 [before_grad]: 8.67e-06 [set_forward_comm_id_for_comm_node_pass]: 3.4e-06 [meta_fg_expand]: 1.94e-06 [flash_sp_send_recv_attached]: 9.29984e-07 [receive_attached]: 1.06002e-06 [after_resolve]: 8.85999e-06 [a_after_grad]: 9.19998e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.09e-06 [auto_monad_grad]: 7.10017e-07 [auto_monad_eliminator]: 5.77999e-06 [cse]: 1.338e-05 [a_3]: 3.55e-05 [py_interpret_to_execute_after_opt_a]: 4.05e-06 [slice_cell_reuse_recomputed_activation]: 2.07001e-06 [rewriter_after_opt_a]: 1.554e-05 [convert_after_rewriter]: 1.19003e-06 [order_py_execute_after_rewriter]: 1.14e-06 [mutable_eliminate]: 0.00049582 [opt_b]: 0.00019635, [1] [Cycle 1]: 0.0001899, [7] [b_1]: 0.00012105 [b_2]: 7.48999e-06 [updatestate_depend_eliminate]: 4.90001e-06 [updatestate_assign_eliminate]: 2.34001e-06 [updatestate_loads_eliminate]: 2.17999e-06 [renormalize]: 4.89992e-07 [cse]: 1.883e-05 [optimize_parallel_all_gather_comm]: 1.482e-05 [overlap_param_gather]: 1.94e-06 [cconv]: 2.25e-05 [loop_unroll]: 0.00051002 [opt_after_cconv]: 9.931e-05, [1] [Cycle 1]: 9.362e-05, [7] [c_1]: 3.092e-05 [parameter_eliminate]: 2.23998e-06 [updatestate_depend_eliminate]: 5.00001e-06 [updatestate_assign_eliminate]: 2.27999e-06 [updatestate_loads_eliminate]: 2.74999e-06 [cse]: 1.75e-05 [renormalize]: 3.4002e-07 [remove_dup_value]: 1.334e-05 [tuple_transform]: 7.26e-05, [1] [Cycle 1]: 6.872e-05, [4] [d_1]: 4.213e-05 [none_parameter_eliminate]: 1.75001e-06 [renormalize]: 1.19995e-07 [switch_simplify]: 6.94999e-06 [partial_unused_args_eliminate]: 1.69e-06 [add_recomputation]: 4.534e-05 [cse_after_recomputation]: 2.142e-05, [1] [Cycle 1]: 1.756e-05, [1] [cse]: 1.241e-05 [environ_conv]: 8.02998e-06 [swap_dp_allreduce_reducescatter]: 4.82e-06 [bias_add_comm_swap]: 2.37999e-06 [label_micro_interleaved_index]: 4.67e-06 [label_fine_grained_interleaved_index]: 2.38998e-06 [merge_cast_opt]: 1.35999e-06 [slice_recompute_activation]: 1.94999e-06 [micro_interleaved_order_control]: 2.31e-06 [assign_add_opt]: 1.15001e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.05001e-06 [full_micro_interleaved_order_control]: 3.44001e-06 [reorder_send_recv_between_fp_bp]: 2.78e-06 [comm_op_add_attrs]: 1.20999e-06 [add_comm_op_reuse_tag]: 1.09003e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.09998e-06 [overlap_opt_shard_in_pipeline]: 1.12e-06 [overlap_opt_shard_grad_in_pipeline]: 1.87999e-06 [control_data_broadcast_order]: 1.372e-05 [grouped_pairwise_exchange_alltoall]: 1.42999e-06 [offloading_packed_experts]: 3.91999e-06 [overlap_recompute_and_grad_model_parallel]: 4.92e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.24999e-06 [overlap_grad_ring_attention]: 3.88001e-06 [overlap_grad_flash_sp]: 1.664e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.27001e-06 [split_layernorm_comm]: 1.62999e-06 [handle_group_info]: 9.49978e-07 [symbol_engine_optimizer]: 0.00020113, [1] [Cycle 1]: 0.00019535, [6] [build]: 9.651e-05 [elim_shapecalc]: 1.422e-05 [elim_not_effective]: 2.671e-05 [opt_reshape]: 7.94002e-06 [fold_const_symbol]: 1.458e-05 [renormalize]: 2.59985e-07 [detach_backward]: 2.06998e-06 [pipeline_parallel_scheduler]: 1.63002e-06 [auto_monad_reorder]: 1.694e-05 [get_jit_bprop_graph]: 1.12999e-06 [rewriter_after_jit_bprop_graph]: 3.25998e-06 [opt_after_jit_grad]: 0.00048991 [validate]: 3.794e-05 [backend_pass]: 9.60019e-07 [task_emit]: 0.0109406 [execute]: 7.55e-06 Sums bootstrap : 0.000687s : 0.48% type_inference : 0.124305s : 87.10% event_method : 0.000021s : 0.01% auto_monad : 0.000066s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000031s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000199s : 0.14% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000052s : 0.04% optimize.opt_a.loop_unroll : 0.000039s : 0.03% optimize.opt_a.a_1 : 0.000820s : 0.57% optimize.opt_a.with_stream_mark : 0.000026s : 0.02% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000162s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000013s : 0.01% optimize.opt_a.auto_parallel : 0.000010s : 0.01% optimize.opt_a.parallel : 0.000026s : 0.02% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000015s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.01% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.002583s : 1.81% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.02% optimize.opt_a.cse : 0.000041s : 0.03% optimize.opt_a.a_3 : 0.000083s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000016s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000496s : 0.35% optimize.opt_b.b_1 : 0.000121s : 0.08% optimize.opt_b.b_2 : 0.000007s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000022s : 0.02% optimize.loop_unroll : 0.000510s : 0.36% optimize.opt_after_cconv.c_1 : 0.000031s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000017s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.01% optimize.tuple_transform.d_1 : 0.000042s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000045s : 0.03% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000017s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000097s : 0.07% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000027s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000490s : 0.34% validate : 0.000038s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.010941s : 7.67% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.000176 25 8.22% : 0.000014s : 2: substitution.elim_not_effective 3.05% : 0.000005s : 2: substitution.fold_const_symbol 2.93% : 0.000005s : 4: substitution.graph_param_transform 73.08% : 0.000129s : 5: substitution.inline 2.16% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.41% : 0.000004s : 4: substitution.remove_not_recompute_node 1.64% : 0.000003s : 2: substitution.replace_old_param 6.51% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.124228 2 97.88% : 0.121597s : 1: type_inference.infer 2.12% : 0.002631s : 1: type_inference.specialize ------[replace.] 0.000064 7 74.80% : 0.000048s : 5: replace.inline 25.20% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000135 7 92.67% : 0.000125s : 5: match.inline 7.33% : 0.000010s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000190 1267 0.95% : 0.000002s : 13: predicate.accumulaten_eliminater 0.89% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 13: predicate.addn_zero_filter 0.88% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.23% : 0.000004s : 21: predicate.arithmetic_simplify 1.00% : 0.000002s : 13: predicate.cast_eliminate 0.64% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.56% : 0.000001s : 8: predicate.depend_value_elim 0.94% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.20% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.98% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.13% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_depend_swap 1.90% : 0.000004s : 25: predicate.environ_get_eliminate 1.19% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.43% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.24% : 0.000004s : 20: predicate.float_depend_g_call 0.54% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.74% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.50% : 0.000001s : 8: predicate.incorporate_call_switch 5.91% : 0.000011s : 57: predicate.inline 0.74% : 0.000001s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 8: predicate.less_batch_normalization 1.68% : 0.000003s : 23: predicate.list_to_tuple_eliminator_ 2.50% : 0.000005s : 36: predicate.load_eliminater 0.94% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.83% : 0.000005s : 41: predicate.loop_unroll_before_grad 1.64% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 8: predicate.merge_addn 0.56% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 13: predicate.minmaximum_grad 1.04% : 0.000002s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.42% : 0.000001s : 4: predicate.parallel_virtual_node 1.87% : 0.000004s : 20: predicate.partial_defer_inline 1.39% : 0.000003s : 19: predicate.partial_eliminate 1.04% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.23% : 0.000002s : 13: predicate.reduce_eliminate 2.35% : 0.000004s : 36: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.39% : 0.000003s : 23: predicate.replace_applicator 0.57% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 1.04% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.41% : 0.000001s : 4: predicate.row_tensor_eliminate 0.68% : 0.000001s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.75% : 0.000001s : 8: predicate.special_op_eliminate 0.70% : 0.000001s : 8: predicate.specialize_transform 0.79% : 0.000001s : 8: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.47% : 0.000003s : 20: predicate.switch_defer_inline 2.14% : 0.000004s : 28: predicate.switch_layer_defer_inline 5.53% : 0.000010s : 73: predicate.switch_simplify 0.92% : 0.000002s : 13: predicate.tile_eliminate 0.90% : 0.000002s : 13: predicate.transpose_eliminate 1.48% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000006s : 31: predicate.tuple_list_get_item_eliminator 1.58% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.71% : 0.000003s : 23: predicate.tuple_to_list_eliminator_ 2.26% : 0.000004s : 36: predicate.updatestate_pure_node_eliminater 2.99% : 0.000006s : 44: predicate.updatestate_useless_node_eliminater 0.54% : 0.000001s : 4: predicate.value_based_eliminate 0.76% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.74% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002773 20 66.51% : 0.001845s : 13: func_graph_cloner_run.FuncGraphClonerGraph 33.49% : 0.000929s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.163865 196 0.00% : 0.000003s : 1: ForceFp32Comm 2.71% : 0.004448s : 1: add_attr 2.71% : 0.004437s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000050s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000071s : 1: auto_monad 0.01% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.44% : 0.000714s : 1: bootstrap 0.02% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000024s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.02% : 0.000026s : 1: event_method 0.01% : 0.000013s : 1: execute 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.32% : 0.000519s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.31% : 0.000505s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000015s : 1: opt.transform.mutable_eliminate 0.76% : 0.001253s : 78: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.02% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000101s : 28: opt.transform.opt_b 0.03% : 0.000047s : 2: opt.transform.opt_trans_graph 0.04% : 0.000059s : 4: opt.transform.symbol_engine_opt 2.83% : 0.004631s : 1: opt_a 0.06% : 0.000103s : 1: opt_after_cconv 0.31% : 0.000500s : 1: opt_after_jit_grad 0.12% : 0.000200s : 1: opt_b 4.31% : 0.007060s : 1: optimize 0.01% : 0.000018s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000020s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000035s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.11% : 0.000181s : 1: remove_cast_before_assign_add 0.01% : 0.000017s : 1: remove_dup_value 1.01% : 0.001650s : 1: renormalize.infer 0.56% : 0.000924s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000019s : 1: rewriter_after_opt_a 0.12% : 0.000205s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000004s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000204s : 1: symbol_engine_optimizer 6.69% : 0.010955s : 1: task_emit 0.05% : 0.000075s : 1: tuple_transform 75.87% : 0.124324s : 1: type_inference 0.04% : 0.000070s : 1: validate [WARNING] SESSION(15774,ffffbf434f30,python3.9):2026-01-29-17:47:05.810.232 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:266] IbTryExpandCNode] After expanding cnode Default/Index-op0, the new abstract of Expand/_Index/InnerIndex-op0 does not match original cnode's abstract. new: AbstractTensor(shape: (2, 5, 6, 7), element: AbstractScalar(Type: Float64, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), old: AbstractTensor(shape: (-2), element: AbstractScalar(Type: Float64, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny) TotalTime = 0.13962, [24] [bootstrap]: 0.00050426 [type_inference]: 0.117155 [event_method]: 2.075e-05 [auto_monad]: 6.596e-05 [graph_reusing]: 6.29001e-06 [inline]: 1.86e-06 [add_attr]: 0.00395333, [1] [add_attr_with_inline]: 0.00394526, [1] [Cycle 1]: 5.284e-05, [2] [tag_attr]: 1.96e-05 [meta_addattr_fg_expand]: 5.57001e-06 [parallel-infer-symbol]: 3.6e-06 [pre_auto_parallel]: 3.248e-05 [insert-virtual-dataset]: 2.74999e-06 [parallel-infer-symbol-second]: 6.69999e-07 [dataset_repeat_opt]: 1.78002e-06 [pipeline_split]: 1.48002e-06 [optimize]: 0.00668267, [53] [py_interpret_to_execute]: 4.97e-06 [rewriter_before_opt_a]: 0.00019474 [opt_a]: 0.00459633, [2] [Cycle 1]: 0.00397483, [45] [expand_dump_flag]: 3.41999e-06 [switch_simplify]: 4.419e-05 [loop_unroll]: 3.365e-05 [a_1]: 0.00067919 [with_stream_mark]: 1.522e-05 [recompute_prepare]: 9.69e-06 [updatestate_depend_eliminate]: 4.30999e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.64999e-06 [parameter_eliminate]: 1.74998e-06 [a_2]: 9.13e-05 [accelerated_algorithm]: 7.63001e-06 [shard]: 1.65001e-06 [meta_shard_fg_expand]: 2.54999e-06 [shard_inline]: 6.79999e-06 [merge_send_recv]: 8.02998e-06 [auto_parallel]: 5.79999e-06 [parallel]: 1.738e-05 [flash_sp]: 6.88e-06 [merge_comm]: 4.08999e-06 [allreduce_fusion]: 3.21001e-06 [matmul_add_comm_reduction]: 8.89e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 8.11002e-06 [virtual_dataset]: 6.82002e-06 [get_grad_eliminate_]: 6.81999e-06 [virtual_output]: 7.54002e-06 [merge_forward]: 3.83999e-06 [cell_reuse_recompute_pass]: 1.03001e-06 [offload_activation]: 8.72e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.252e-05 [merge_recompute_call_nodes]: 2.01e-06 [before_grad]: 1.156e-05 [set_forward_comm_id_for_comm_node_pass]: 3.33e-06 [meta_fg_expand]: 2.77002e-06 [flash_sp_send_recv_attached]: 2.44999e-06 [receive_attached]: 2.33002e-06 [after_resolve]: 1.023e-05 [a_after_grad]: 1.191e-05 [renormalize]: 0.00257298 [add_forward_monad_depend]: 5.11002e-06 [auto_monad_grad]: 1.67001e-06 [auto_monad_eliminator]: 1.521e-05 [cse]: 2.837e-05 [a_3]: 4.65e-05 [Cycle 2]: 0.00061119, [45] [expand_dump_flag]: 1.03001e-06 [switch_simplify]: 7.63001e-06 [loop_unroll]: 6.06e-06 [a_1]: 0.0001285 [with_stream_mark]: 1.058e-05 [recompute_prepare]: 6.13998e-06 [updatestate_depend_eliminate]: 2.88e-06 [updatestate_assign_eliminate]: 2.29999e-06 [updatestate_loads_eliminate]: 2.94001e-06 [parameter_eliminate]: 1.10001e-06 [a_2]: 7.118e-05 [accelerated_algorithm]: 6.04999e-06 [shard]: 1.15001e-06 [meta_shard_fg_expand]: 1.24003e-06 [shard_inline]: 6.34999e-06 [merge_send_recv]: 4.15e-06 [auto_parallel]: 4.94e-06 [parallel]: 4.16001e-06 [flash_sp]: 3.14001e-06 [merge_comm]: 2.79999e-06 [allreduce_fusion]: 3.14001e-06 [matmul_add_comm_reduction]: 5.07e-06 [allreduce_slice_to_reducescatter]: 4.09986e-07 [virtual_shard_identity]: 6.80002e-06 [virtual_dataset]: 5.92999e-06 [get_grad_eliminate_]: 5.89e-06 [virtual_output]: 5.58002e-06 [merge_forward]: 2.67001e-06 [cell_reuse_recompute_pass]: 1.02e-06 [offload_activation]: 5.64998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.215e-05 [merge_recompute_call_nodes]: 8.39995e-07 [before_grad]: 8.75001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.57997e-06 [meta_fg_expand]: 1.97001e-06 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 1.25001e-06 [after_resolve]: 8.88002e-06 [a_after_grad]: 9.07999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.16997e-06 [auto_monad_grad]: 7.60017e-07 [auto_monad_eliminator]: 5.79e-06 [cse]: 1.396e-05 [a_3]: 3.565e-05 [py_interpret_to_execute_after_opt_a]: 4.14002e-06 [slice_cell_reuse_recomputed_activation]: 1.73002e-06 [rewriter_after_opt_a]: 1.606e-05 [convert_after_rewriter]: 1.12e-06 [order_py_execute_after_rewriter]: 1.06002e-06 [mutable_eliminate]: 0.0004905 [opt_b]: 0.00019794, [1] [Cycle 1]: 0.00019159, [7] [b_1]: 0.00012185 [b_2]: 7.92e-06 [updatestate_depend_eliminate]: 4.90999e-06 [updatestate_assign_eliminate]: 2.26998e-06 [updatestate_loads_eliminate]: 2.10002e-06 [renormalize]: 4.60015e-07 [cse]: 1.952e-05 [optimize_parallel_all_gather_comm]: 1.615e-05 [overlap_param_gather]: 2.12001e-06 [cconv]: 2.192e-05 [loop_unroll]: 0.00043928 [opt_after_cconv]: 0.00010006, [1] [Cycle 1]: 9.438e-05, [7] [c_1]: 3.118e-05 [parameter_eliminate]: 2.43998e-06 [updatestate_depend_eliminate]: 5.02999e-06 [updatestate_assign_eliminate]: 2.29999e-06 [updatestate_loads_eliminate]: 2.51998e-06 [cse]: 1.794e-05 [renormalize]: 5.3001e-07 [remove_dup_value]: 1.426e-05 [tuple_transform]: 7.683e-05, [1] [Cycle 1]: 7.254e-05, [4] [d_1]: 4.402e-05 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 7.83001e-06 [partial_unused_args_eliminate]: 1.52001e-06 [add_recomputation]: 4.152e-05 [cse_after_recomputation]: 2.241e-05, [1] [Cycle 1]: 1.824e-05, [1] [cse]: 1.303e-05 [environ_conv]: 6.18998e-06 [swap_dp_allreduce_reducescatter]: 4.70001e-06 [bias_add_comm_swap]: 2.22001e-06 [label_micro_interleaved_index]: 4.20999e-06 [label_fine_grained_interleaved_index]: 2.48e-06 [merge_cast_opt]: 1.28002e-06 [slice_recompute_activation]: 1.85001e-06 [micro_interleaved_order_control]: 2.43e-06 [assign_add_opt]: 1.25001e-06 [ForceFp32Comm]: 8.39995e-07 [remove_cast_before_assign_add]: 1.02998e-06 [full_micro_interleaved_order_control]: 2.54001e-06 [reorder_send_recv_between_fp_bp]: 2.74999e-06 [comm_op_add_attrs]: 1.10999e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.30999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.63002e-06 [control_data_broadcast_order]: 1.131e-05 [grouped_pairwise_exchange_alltoall]: 1.43002e-06 [offloading_packed_experts]: 3.71999e-06 [overlap_recompute_and_grad_model_parallel]: 4.33999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.27999e-06 [overlap_recompute_comm]: 2.25002e-06 [overlap_grad_ring_attention]: 3.83001e-06 [overlap_grad_flash_sp]: 1.574e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.21e-06 [split_layernorm_comm]: 1.84e-06 [handle_group_info]: 1.19998e-06 [symbol_engine_optimizer]: 0.00012828, [1] [Cycle 1]: 0.00012356, [6] [build]: 4.839e-05 [elim_shapecalc]: 1.007e-05 [elim_not_effective]: 1.37e-05 [opt_reshape]: 7.50998e-06 [fold_const_symbol]: 1.445e-05 [renormalize]: 1.80007e-07 [detach_backward]: 1.39e-06 [pipeline_parallel_scheduler]: 1.35001e-06 [auto_monad_reorder]: 1.545e-05 [get_jit_bprop_graph]: 1.01002e-06 [rewriter_after_jit_bprop_graph]: 2.93998e-06 [opt_after_jit_grad]: 0.00048807 [validate]: 3.908e-05 [backend_pass]: 1.07e-06 [task_emit]: 0.0104289 [execute]: 6.16e-06 Sums bootstrap : 0.000504s : 0.37% type_inference : 0.117155s : 86.98% event_method : 0.000021s : 0.02% auto_monad : 0.000066s : 0.05% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000032s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000195s : 0.14% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000052s : 0.04% optimize.opt_a.loop_unroll : 0.000040s : 0.03% optimize.opt_a.a_1 : 0.000808s : 0.60% optimize.opt_a.with_stream_mark : 0.000026s : 0.02% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000162s : 0.12% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000012s : 0.01% optimize.opt_a.auto_parallel : 0.000011s : 0.01% optimize.opt_a.parallel : 0.000022s : 0.02% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000014s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.01% optimize.opt_a.a_after_grad : 0.000021s : 0.02% optimize.opt_a.renormalize : 0.002573s : 1.91% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.02% optimize.opt_a.cse : 0.000042s : 0.03% optimize.opt_a.a_3 : 0.000082s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000016s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000491s : 0.36% optimize.opt_b.b_1 : 0.000122s : 0.09% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000022s : 0.02% optimize.loop_unroll : 0.000439s : 0.33% optimize.opt_after_cconv.c_1 : 0.000031s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000014s : 0.01% optimize.tuple_transform.d_1 : 0.000044s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000042s : 0.03% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000006s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000011s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000016s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000048s : 0.04% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000015s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000488s : 0.36% validate : 0.000039s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.010429s : 7.74% execute : 0.000006s : 0.00% Time group info: ------[substitution.] 0.000169 25 1.80% : 0.000003s : 2: substitution.elim_not_effective 3.43% : 0.000006s : 2: substitution.fold_const_symbol 3.59% : 0.000006s : 4: substitution.graph_param_transform 77.39% : 0.000131s : 5: substitution.inline 2.36% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.57% : 0.000004s : 4: substitution.remove_not_recompute_node 1.69% : 0.000003s : 2: substitution.replace_old_param 7.17% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.117083 2 97.80% : 0.114505s : 1: type_inference.infer 2.20% : 0.002578s : 1: type_inference.specialize ------[replace.] 0.000063 7 72.47% : 0.000046s : 5: replace.inline 27.53% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000138 7 92.43% : 0.000128s : 5: match.inline 7.57% : 0.000010s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000194 1267 0.96% : 0.000002s : 13: predicate.accumulaten_eliminater 0.80% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.98% : 0.000002s : 13: predicate.addn_zero_filter 0.89% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.15% : 0.000004s : 21: predicate.arithmetic_simplify 1.00% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.55% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.19% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.96% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 17: predicate.environ_get_depend_swap 1.85% : 0.000004s : 25: predicate.environ_get_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.32% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.16% : 0.000004s : 20: predicate.float_depend_g_call 0.57% : 0.000001s : 8: predicate.float_environ_get_switch 0.86% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.72% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.48% : 0.000001s : 8: predicate.incorporate_call_switch 5.82% : 0.000011s : 57: predicate.inline 0.77% : 0.000001s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 8: predicate.less_batch_normalization 1.68% : 0.000003s : 23: predicate.list_to_tuple_eliminator_ 2.49% : 0.000005s : 36: predicate.load_eliminater 0.99% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.75% : 0.000005s : 41: predicate.loop_unroll_before_grad 1.81% : 0.000004s : 21: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.62% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 13: predicate.minmaximum_grad 1.02% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.39% : 0.000001s : 4: predicate.parallel_virtual_node 2.00% : 0.000004s : 20: predicate.partial_defer_inline 1.40% : 0.000003s : 19: predicate.partial_eliminate 0.95% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.29% : 0.000003s : 13: predicate.reduce_eliminate 2.40% : 0.000005s : 36: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 23: predicate.replace_applicator 0.42% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 13: predicate.reshape_eliminate 0.68% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.78% : 0.000002s : 8: predicate.same_eliminate 0.44% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.72% : 0.000001s : 8: predicate.shard_identity_eliminate 0.76% : 0.000001s : 8: predicate.special_op_eliminate 0.65% : 0.000001s : 8: predicate.specialize_transform 0.81% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.49% : 0.000003s : 20: predicate.switch_defer_inline 2.20% : 0.000004s : 28: predicate.switch_layer_defer_inline 5.45% : 0.000011s : 73: predicate.switch_simplify 0.97% : 0.000002s : 13: predicate.tile_eliminate 0.91% : 0.000002s : 13: predicate.transpose_eliminate 1.70% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.65% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 3.02% : 0.000006s : 31: predicate.tuple_list_get_item_eliminator 1.63% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.72% : 0.000003s : 23: predicate.tuple_to_list_eliminator_ 2.34% : 0.000005s : 36: predicate.updatestate_pure_node_eliminater 3.11% : 0.000006s : 44: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.75% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002241 20 57.43% : 0.001287s : 13: func_graph_cloner_run.FuncGraphClonerGraph 42.57% : 0.000954s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.154255 196 0.00% : 0.000004s : 1: ForceFp32Comm 2.57% : 0.003958s : 1: add_attr 2.56% : 0.003949s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000045s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.05% : 0.000071s : 1: auto_monad 0.01% : 0.000019s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.35% : 0.000533s : 1: bootstrap 0.02% : 0.000025s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000014s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000004s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.02% : 0.000026s : 1: event_method 0.01% : 0.000011s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.29% : 0.000448s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.32% : 0.000499s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000014s : 1: opt.transform.mutable_eliminate 0.80% : 0.001239s : 78: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.02% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000102s : 28: opt.transform.opt_b 0.03% : 0.000050s : 2: opt.transform.opt_trans_graph 0.03% : 0.000042s : 4: opt.transform.symbol_engine_opt 2.98% : 0.004600s : 1: opt_a 0.07% : 0.000104s : 1: opt_after_cconv 0.32% : 0.000498s : 1: opt_after_jit_grad 0.13% : 0.000201s : 1: opt_b 4.33% : 0.006687s : 1: optimize 0.01% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000019s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000036s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000018s : 1: remove_dup_value 1.04% : 0.001597s : 1: renormalize.infer 0.63% : 0.000967s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000019s : 1: rewriter_after_opt_a 0.13% : 0.000201s : 1: rewriter_before_opt_a 0.00% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000004s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000131s : 1: symbol_engine_optimizer 6.77% : 0.010440s : 1: task_emit 0.05% : 0.000080s : 1: tuple_transform 75.96% : 0.117174s : 1: type_inference 0.04% : 0.000069s : 1: validate [ERROR] ANALYZER(15774,ffffbf434f30,python3.9):2026-01-29-17:47:06.154.298 [mindspore/ccsrc/frontend/jit/ps/static_analysis/evaluator.cc:724] Run] Primitive: infer failed, failed info: For 'Index', too many indices for tensor of dimension 5 (got 6) ---------------------------------------------------- - C++ Call Stack: (For framework developers) ---------------------------------------------------- mindspore/ops/infer/ops_func_impl//index.cc:77 CheckAndCalOutputShapeInTupleCase ---------------------------------------------------- - The Traceback of Net Construct Code: ---------------------------------------------------- # 0 In file /home/jenkins/mindspore/testcases/testcases/tests/st/utils/test_utils.py:42, 15~43 return self.func(*inputs, **kwargs) ^~~~~~~~~~~~~~~~~~~~~~~~~~~~ # 1 In file /home/jenkins/mindspore/testcases/testcases/tests/st/ops/test_ops_index.py:31, 11~28 return index(x, indices) ^~~~~~~~~~~~~~~~~ # 2 In file /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/auto_generate/gen_ops_def.py:6944, 11~35 return index_op(input, indices) ^~~~~~~~~~~~~~~~~~~~~~~~ (See file '/home/jenkins/mindspore/testcases/testcases/tests/st/ops/allcases_onecard/rank_0/om/analyze_fail.ir' for more details. Get instructions about `analyze_fail.ir` at https://www.mindspore.cn/search?inputValue=analyze_fail.ir) group_cases_10 have all been run, results of sub cases are below: case: (1,) {} pass. case: (1,) {} pass. case: (1,) {} pass. case: (mindspore.float32,) {} pass. case: (mindspore.float16,) {} pass. case: (0,) {} pass. case: (0,) {} pass. case: (0,) {} pass. ops group_cases_11 with 8 cases start to running, all cases are below: case: (, 0) case: (, 1) case: (, 0, mindspore.float16, 'BSH') case: (, 0, mindspore.float16, 'BNSD') case: (, 0, mindspore.bfloat16, 'BSH') case: (, 0, mindspore.bfloat16, 'BNSD') case: (, 1, mindspore.float16, 'BSH') case: (, 1, mindspore.float16, 'BNSD') ops group_cases_11 total running memory: 236M, memory threshold: 51200M [WARNING] ME(17948:281473890602800,ForkProcess-91):2026-01-29-17:47:06.737.952 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(17959:281473890602800,ForkProcess-93):2026-01-29-17:47:06.737.951 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(17963:281473890602800,ForkProcess-95):2026-01-29-17:47:06.738.139 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(17966:281473890602800,ForkProcess-96):2026-01-29-17:47:06.738.149 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(17956:281473890602800,ForkProcess-92):2026-01-29-17:47:06.738.316 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(17961:281473890602800,ForkProcess-94):2026-01-29-17:47:06.738.501 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] SESSION(17919,ffffbf434f30,python3.9):2026-01-29-17:47:09.614.536 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:266] IbTryExpandCNode] After expanding cnode Gradients/Default/network-InplaceIndexPutNet/Grad_InplaceIndexPut/InplaceIndexPut-op0, the new abstract of Expand/_InplaceIndexPut/InnerInplaceIndexPut-op0 does not match original cnode's abstract. new: AbstractTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), old: AbstractRefTensor(key: 0xaaaae01d36c03, ref_value: AbstractRefTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), value: ValueAny, is_inplace) [WARNING] SESSION(17919,ffffbf434f30,python3.9):2026-01-29-17:47:09.614.638 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:274] IbTryExpandCNode] Restore new abstract to AbstractRefTensor new:AbstractRefTensor(key: 0xaaaae01d36c03, ref_value: AbstractRefTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), value: ValueAny) [WARNING] SESSION(17919,ffffbf434f30,python3.9):2026-01-29-17:47:09.615.197 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:266] IbTryExpandCNode] After expanding cnode Default/network-InplaceIndexPutNet/InplaceIndexPut-op0, the new abstract of Expand/_InplaceIndexPut/InnerInplaceIndexPut-op1 does not match original cnode's abstract. new: AbstractTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), old: AbstractRefTensor(key: 0xaaaae02311204, ref_value: AbstractRefTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), value: ValueAny, is_inplace) [WARNING] SESSION(17919,ffffbf434f30,python3.9):2026-01-29-17:47:09.615.236 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:274] IbTryExpandCNode] Restore new abstract to AbstractRefTensor new:AbstractRefTensor(key: 0xaaaae02311204, ref_value: AbstractRefTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), value: ValueAny) TotalTime = 3.01884, [24] [bootstrap]: 0.00088163 [type_inference]: 0.0643379 [event_method]: 4.045e-05 [auto_monad]: 0.0001341 [graph_reusing]: 6.04001e-06 [inline]: 2.27999e-06 [add_attr]: 0.0076874, [1] [add_attr_with_inline]: 0.00767331, [1] [Cycle 1]: 0.00015532, [2] [tag_attr]: 5.394e-05 [meta_addattr_fg_expand]: 2.255e-05 [parallel-infer-symbol]: 3.06999e-06 [pre_auto_parallel]: 7.845e-05 [insert-virtual-dataset]: 2.61999e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.00710281, [53] [py_interpret_to_execute]: 4.06001e-06 [rewriter_before_opt_a]: 0.00050276 [opt_a]: 0.00428008, [2] [Cycle 1]: 0.00350187, [45] [expand_dump_flag]: 2.19999e-06 [switch_simplify]: 0.00010098 [loop_unroll]: 4.91e-05 [a_1]: 0.00086389 [with_stream_mark]: 1.685e-05 [recompute_prepare]: 1.036e-05 [updatestate_depend_eliminate]: 1.755e-05 [updatestate_assign_eliminate]: 1.615e-05 [updatestate_loads_eliminate]: 5.03002e-06 [parameter_eliminate]: 2.50997e-06 [a_2]: 0.00011857 [accelerated_algorithm]: 8.67998e-06 [shard]: 4.79e-06 [meta_shard_fg_expand]: 6.34001e-06 [shard_inline]: 8.51002e-06 [merge_send_recv]: 4.815e-05 [auto_parallel]: 8.42998e-06 [parallel]: 9.51e-05 [flash_sp]: 4.057e-05 [merge_comm]: 5.78002e-06 [allreduce_fusion]: 1.683e-05 [matmul_add_comm_reduction]: 2.158e-05 [allreduce_slice_to_reducescatter]: 1.19e-05 [virtual_shard_identity]: 1.054e-05 [virtual_dataset]: 8.97999e-06 [get_grad_eliminate_]: 8.13999e-06 [virtual_output]: 8.40999e-06 [merge_forward]: 7.48999e-06 [cell_reuse_recompute_pass]: 3.71001e-06 [offload_activation]: 1.967e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.639e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 1.255e-05 [set_forward_comm_id_for_comm_node_pass]: 1.73e-05 [meta_fg_expand]: 5.59e-06 [flash_sp_send_recv_attached]: 5.81e-06 [receive_attached]: 2.501e-05 [after_resolve]: 1.418e-05 [a_after_grad]: 1.211e-05 [renormalize]: 0.00140812 [add_forward_monad_depend]: 4.73001e-06 [auto_monad_grad]: 1.29998e-06 [auto_monad_eliminator]: 2.512e-05 [cse]: 7.67e-05 [a_3]: 6.408e-05 [Cycle 2]: 0.00076915, [45] [expand_dump_flag]: 1.23002e-06 [switch_simplify]: 9.62999e-06 [loop_unroll]: 1.112e-05 [a_1]: 0.00015082 [with_stream_mark]: 1.19e-05 [recompute_prepare]: 8.33999e-06 [updatestate_depend_eliminate]: 5.01997e-06 [updatestate_assign_eliminate]: 4.25999e-06 [updatestate_loads_eliminate]: 4.16001e-06 [parameter_eliminate]: 1.00999e-06 [a_2]: 0.00010745 [accelerated_algorithm]: 7.97e-06 [shard]: 8.39995e-07 [meta_shard_fg_expand]: 1.71e-06 [shard_inline]: 8.42e-06 [merge_send_recv]: 5.87999e-06 [auto_parallel]: 7.12002e-06 [parallel]: 4e-06 [flash_sp]: 5.92999e-06 [merge_comm]: 4.99e-06 [allreduce_fusion]: 4.68001e-06 [matmul_add_comm_reduction]: 7.63001e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 8.89e-06 [virtual_dataset]: 8.3e-06 [get_grad_eliminate_]: 7.65998e-06 [virtual_output]: 7.86001e-06 [merge_forward]: 4.67998e-06 [cell_reuse_recompute_pass]: 1.23002e-06 [offload_activation]: 9.05999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.469e-05 [merge_recompute_call_nodes]: 5.60016e-07 [before_grad]: 1.059e-05 [set_forward_comm_id_for_comm_node_pass]: 4.82e-06 [meta_fg_expand]: 2.93e-06 [flash_sp_send_recv_attached]: 8.89995e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 1.257e-05 [a_after_grad]: 1.128e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.17999e-06 [auto_monad_grad]: 8.2e-07 [auto_monad_eliminator]: 8.60999e-06 [cse]: 3.139e-05 [a_3]: 5.558e-05 [py_interpret_to_execute_after_opt_a]: 3.61001e-06 [slice_cell_reuse_recomputed_activation]: 4.74e-06 [rewriter_after_opt_a]: 3.628e-05 [convert_after_rewriter]: 1.15999e-06 [order_py_execute_after_rewriter]: 9.70002e-07 [mutable_eliminate]: 0.00050865 [opt_b]: 0.0002794, [1] [Cycle 1]: 0.00027386, [7] [b_1]: 0.0001797 [b_2]: 9.53002e-06 [updatestate_depend_eliminate]: 6.29001e-06 [updatestate_assign_eliminate]: 4.27e-06 [updatestate_loads_eliminate]: 4.25e-06 [renormalize]: 2.89991e-07 [cse]: 3.75e-05 [optimize_parallel_all_gather_comm]: 2.559e-05 [overlap_param_gather]: 2.083e-05 [cconv]: 1.994e-05 [loop_unroll]: 0.00042358 [opt_after_cconv]: 0.00012831, [1] [Cycle 1]: 0.0001225, [7] [c_1]: 3.422e-05 [parameter_eliminate]: 2.53e-06 [updatestate_depend_eliminate]: 7.35998e-06 [updatestate_assign_eliminate]: 4.53001e-06 [updatestate_loads_eliminate]: 4.35e-06 [cse]: 3.716e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 7.08e-05 [tuple_transform]: 8.194e-05, [1] [Cycle 1]: 7.758e-05, [4] [d_1]: 4.855e-05 [none_parameter_eliminate]: 1.68002e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.20001e-06 [partial_unused_args_eliminate]: 2.14999e-06 [add_recomputation]: 7.088e-05 [cse_after_recomputation]: 2.661e-05, [1] [Cycle 1]: 2.229e-05, [1] [cse]: 1.712e-05 [environ_conv]: 1.866e-05 [swap_dp_allreduce_reducescatter]: 3.055e-05 [bias_add_comm_swap]: 1.246e-05 [label_micro_interleaved_index]: 2.299e-05 [label_fine_grained_interleaved_index]: 1.17999e-06 [merge_cast_opt]: 4.7998e-07 [slice_recompute_activation]: 1.65001e-06 [micro_interleaved_order_control]: 2.81e-06 [assign_add_opt]: 1.26002e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.973e-05 [full_micro_interleaved_order_control]: 1.433e-05 [reorder_send_recv_between_fp_bp]: 2.27999e-06 [comm_op_add_attrs]: 1.08001e-06 [add_comm_op_reuse_tag]: 3.81999e-06 [interleave_split_concat_branches]: 7.10017e-07 [interleave_parallel_branches]: 1.059e-05 [overlap_opt_shard_in_pipeline]: 1.893e-05 [overlap_opt_shard_grad_in_pipeline]: 1.76998e-06 [control_data_broadcast_order]: 1.545e-05 [grouped_pairwise_exchange_alltoall]: 4.22e-06 [offloading_packed_experts]: 6.49001e-06 [overlap_recompute_and_grad_model_parallel]: 1.164e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.53002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.66e-06 [overlap_recompute_comm]: 2.38002e-06 [overlap_grad_ring_attention]: 2.672e-05 [overlap_grad_flash_sp]: 5.206e-05 [begin_end_overlap_inline]: 3.7998e-07 [split_matmul_comm_elemetwise]: 1.209e-05 [split_layernorm_comm]: 1.45999e-06 [handle_group_info]: 1.30999e-06 [symbol_engine_optimizer]: 8.264e-05, [1] [Cycle 1]: 7.832e-05, [6] [build]: 2.77002e-06 [elim_shapecalc]: 1.245e-05 [elim_not_effective]: 1.498e-05 [opt_reshape]: 9.21998e-06 [fold_const_symbol]: 1.162e-05 [renormalize]: 2.29978e-07 [detach_backward]: 1.69e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 2.615e-05 [get_jit_bprop_graph]: 1.18001e-06 [rewriter_after_jit_bprop_graph]: 3.09999e-06 [opt_after_jit_grad]: 0.00047368 [validate]: 5.981e-05 [backend_pass]: 1.04998e-06 [task_emit]: 2.9377 [execute]: 1.096e-05 Sums bootstrap : 0.000882s : 0.03% type_inference : 0.064338s : 2.14% event_method : 0.000040s : 0.00% auto_monad : 0.000134s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000054s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000023s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000078s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000503s : 0.02% optimize.opt_a.expand_dump_flag : 0.000003s : 0.00% optimize.opt_a.switch_simplify : 0.000111s : 0.00% optimize.opt_a.loop_unroll : 0.000060s : 0.00% optimize.opt_a.a_1 : 0.001015s : 0.03% optimize.opt_a.with_stream_mark : 0.000029s : 0.00% optimize.opt_a.recompute_prepare : 0.000019s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000023s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000020s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000226s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.00% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000008s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.00% optimize.opt_a.merge_send_recv : 0.000054s : 0.00% optimize.opt_a.auto_parallel : 0.000016s : 0.00% optimize.opt_a.parallel : 0.000099s : 0.00% optimize.opt_a.flash_sp : 0.000046s : 0.00% optimize.opt_a.merge_comm : 0.000011s : 0.00% optimize.opt_a.allreduce_fusion : 0.000022s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000029s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000012s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.00% optimize.opt_a.virtual_dataset : 0.000017s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.00% optimize.opt_a.virtual_output : 0.000016s : 0.00% optimize.opt_a.merge_forward : 0.000012s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000029s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000041s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000022s : 0.00% optimize.opt_a.meta_fg_expand : 0.000009s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000026s : 0.00% optimize.opt_a.after_resolve : 0.000027s : 0.00% optimize.opt_a.a_after_grad : 0.000023s : 0.00% optimize.opt_a.renormalize : 0.001408s : 0.05% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.00% optimize.opt_a.cse : 0.000108s : 0.00% optimize.opt_a.a_3 : 0.000120s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000036s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000509s : 0.02% optimize.opt_b.b_1 : 0.000180s : 0.01% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000037s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.00% optimize.overlap_param_gather : 0.000021s : 0.00% optimize.cconv : 0.000020s : 0.00% optimize.loop_unroll : 0.000424s : 0.01% optimize.opt_after_cconv.c_1 : 0.000034s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000037s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000071s : 0.00% optimize.tuple_transform.d_1 : 0.000049s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000071s : 0.00% optimize.cse_after_recomputation.cse : 0.000017s : 0.00% optimize.environ_conv : 0.000019s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000031s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000023s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000001s : 0.00% optimize.merge_cast_opt : 0.000000s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000020s : 0.00% optimize.full_micro_interleaved_order_control : 0.000014s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000004s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000011s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000019s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000027s : 0.00% optimize.overlap_grad_flash_sp : 0.000052s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000012s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000026s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000474s : 0.02% validate : 0.000060s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.937701s : 97.59% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000266 34 0.74% : 0.000002s : 2: substitution.elim_not_effective 0.34% : 0.000001s : 2: substitution.fold_const_symbol 2.31% : 0.000006s : 5: substitution.graph_param_transform 82.70% : 0.000220s : 5: substitution.inline 1.23% : 0.000003s : 4: substitution.j_node_and_user_rematch 5.99% : 0.000016s : 4: substitution.remove_not_recompute_node 1.80% : 0.000005s : 6: substitution.replace_old_param 4.89% : 0.000013s : 6: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.064243 2 96.54% : 0.062019s : 1: type_inference.infer 3.46% : 0.002224s : 1: type_inference.specialize ------[replace.] 0.000138 11 60.01% : 0.000083s : 5: replace.inline 39.99% : 0.000055s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000227 11 95.44% : 0.000217s : 5: match.inline 4.56% : 0.000010s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000226 1659 0.93% : 0.000002s : 17: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 10: predicate.addn_check_dump 0.97% : 0.000002s : 17: predicate.addn_zero_filter 0.88% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.16% : 0.000005s : 27: predicate.arithmetic_simplify 1.00% : 0.000002s : 17: predicate.cast_eliminate 0.57% : 0.000001s : 10: predicate.check_bprop_eliminate 0.50% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.52% : 0.000001s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.09% : 0.000002s : 17: predicate.dict_get_item_eliminator 1.00% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.34% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 22: predicate.environ_get_depend_swap 1.75% : 0.000004s : 32: predicate.environ_get_eliminate 1.11% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.62% : 0.000004s : 28: predicate.exchange_switch_depend_value 2.43% : 0.000005s : 28: predicate.float_depend_g_call 0.50% : 0.000001s : 10: predicate.float_environ_get_switch 0.75% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 5: predicate.fold_const_symbol 0.63% : 0.000001s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.53% : 0.000001s : 10: predicate.incorporate_call 0.47% : 0.000001s : 10: predicate.incorporate_call_switch 5.60% : 0.000013s : 75: predicate.inline 0.62% : 0.000001s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.64% : 0.000001s : 10: predicate.less_batch_normalization 1.89% : 0.000004s : 33: predicate.list_to_tuple_eliminator_ 2.70% : 0.000006s : 50: predicate.load_eliminater 0.93% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.82% : 0.000006s : 48: predicate.loop_unroll_before_grad 1.59% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 10: predicate.merge_addn 0.50% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.57% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 17: predicate.minmaximum_grad 1.00% : 0.000002s : 5: predicate.mutable_eliminate 0.38% : 0.000001s : 5: predicate.opt_reshape 0.32% : 0.000001s : 5: predicate.parallel_virtual_node 2.09% : 0.000005s : 28: predicate.partial_defer_inline 1.70% : 0.000004s : 28: predicate.partial_eliminate 1.04% : 0.000002s : 17: predicate.print_const_string_wrapper 0.57% : 0.000001s : 10: predicate.reduce_all_const_elim 1.27% : 0.000003s : 17: predicate.reduce_eliminate 2.66% : 0.000006s : 50: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 10: predicate.remove_not_recompute_node 1.47% : 0.000003s : 33: predicate.replace_applicator 0.43% : 0.000001s : 10: predicate.replace_old_param 0.21% : 0.000000s : 5: predicate.reset_defer_inline 1.07% : 0.000002s : 17: predicate.reshape_eliminate 0.61% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 5: predicate.row_tensor_eliminate 0.83% : 0.000002s : 10: predicate.same_eliminate 0.43% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.65% : 0.000001s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.65% : 0.000001s : 10: predicate.specialize_transform 0.66% : 0.000001s : 10: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.68% : 0.000004s : 28: predicate.switch_defer_inline 2.25% : 0.000005s : 38: predicate.switch_layer_defer_inline 5.61% : 0.000013s : 91: predicate.switch_simplify 0.99% : 0.000002s : 17: predicate.tile_eliminate 0.99% : 0.000002s : 17: predicate.transpose_eliminate 1.49% : 0.000003s : 27: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000003s : 27: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000003s : 27: predicate.tuple_list_get_item_depend_reorder 2.99% : 0.000007s : 43: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 27: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000005s : 37: predicate.tuple_list_set_item_eliminator 1.96% : 0.000004s : 33: predicate.tuple_to_list_eliminator_ 2.56% : 0.000006s : 50: predicate.updatestate_pure_node_eliminater 3.29% : 0.000007s : 60: predicate.updatestate_useless_node_eliminater 0.29% : 0.000001s : 5: predicate.value_based_eliminate 0.69% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000001s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001486 15 55.45% : 0.000824s : 8: func_graph_cloner_run.FuncGraphClonerGraph 44.55% : 0.000662s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.036960 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.25% : 0.007692s : 1: add_attr 0.25% : 0.007677s : 1: add_attr_with_inline 0.00% : 0.000007s : 1: add_comm_op_reuse_tag 0.00% : 0.000075s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000140s : 1: auto_monad 0.00% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.03% : 0.000925s : 1: bootstrap 0.00% : 0.000024s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000023s : 1: environ_conv 0.00% : 0.000047s : 1: event_method 0.00% : 0.000032s : 1: execute 0.00% : 0.000017s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000003s : 1: interleave_split_concat_branches 0.00% : 0.000004s : 1: label_fine_grained_interleaved_index 0.00% : 0.000026s : 1: label_micro_interleaved_index 0.01% : 0.000432s : 1: loop_unroll 0.00% : 0.000003s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.02% : 0.000518s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000018s : 1: opt.transform.mutable_eliminate 0.06% : 0.001683s : 78: opt.transform.opt_a 0.00% : 0.000033s : 1: opt.transform.opt_after_cconv 0.00% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000162s : 28: opt.transform.opt_b 0.00% : 0.000055s : 2: opt.transform.opt_trans_graph 0.00% : 0.000045s : 4: opt.transform.symbol_engine_opt 0.14% : 0.004283s : 1: opt_a 0.00% : 0.000132s : 1: opt_after_cconv 0.02% : 0.000483s : 1: opt_after_jit_grad 0.01% : 0.000283s : 1: opt_b 0.23% : 0.007107s : 1: optimize 0.00% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000055s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000030s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000022s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000024s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000083s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000023s : 1: remove_cast_before_assign_add 0.00% : 0.000075s : 1: remove_dup_value 0.02% : 0.000721s : 1: renormalize.infer 0.02% : 0.000678s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000040s : 1: rewriter_after_opt_a 0.02% : 0.000509s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000004s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000015s : 1: split_matmul_comm_elemetwise 0.00% : 0.000034s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000085s : 1: symbol_engine_optimizer 96.73% : 2.937742s : 1: task_emit 0.00% : 0.000085s : 1: tuple_transform 2.12% : 0.064357s : 1: type_inference 0.00% : 0.000088s : 1: validate TotalTime = 3.01345, [24] [bootstrap]: 0.0009386 [type_inference]: 0.065603 [event_method]: 4.29e-05 [auto_monad]: 0.00013068 [graph_reusing]: 6.29001e-06 [inline]: 2.10002e-06 [add_attr]: 0.0076583, [1] [add_attr_with_inline]: 0.00764364, [1] [Cycle 1]: 0.00015869, [2] [tag_attr]: 5.191e-05 [meta_addattr_fg_expand]: 2.134e-05 [parallel-infer-symbol]: 3.05998e-06 [pre_auto_parallel]: 7.437e-05 [insert-virtual-dataset]: 2.53003e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.17999e-06 [pipeline_split]: 1.86e-06 [optimize]: 0.00706601, [53] [py_interpret_to_execute]: 4.25999e-06 [rewriter_before_opt_a]: 0.00055216 [opt_a]: 0.00425055, [2] [Cycle 1]: 0.00340116, [45] [expand_dump_flag]: 3.41999e-06 [switch_simplify]: 9.479e-05 [loop_unroll]: 4.923e-05 [a_1]: 0.00086454 [with_stream_mark]: 1.659e-05 [recompute_prepare]: 1.014e-05 [updatestate_depend_eliminate]: 1.614e-05 [updatestate_assign_eliminate]: 1.475e-05 [updatestate_loads_eliminate]: 4.95999e-06 [parameter_eliminate]: 2.46e-06 [a_2]: 0.00011848 [accelerated_algorithm]: 8.57e-06 [shard]: 1.64998e-06 [meta_shard_fg_expand]: 3.33e-06 [shard_inline]: 7.9e-06 [merge_send_recv]: 5.007e-05 [auto_parallel]: 8.28999e-06 [parallel]: 8.333e-05 [flash_sp]: 3.74e-05 [merge_comm]: 5.76e-06 [allreduce_fusion]: 1.49e-05 [matmul_add_comm_reduction]: 2.019e-05 [allreduce_slice_to_reducescatter]: 9.51e-06 [virtual_shard_identity]: 1.037e-05 [virtual_dataset]: 9.01998e-06 [get_grad_eliminate_]: 8.18999e-06 [virtual_output]: 8.06001e-06 [merge_forward]: 5.94e-06 [cell_reuse_recompute_pass]: 1.20001e-06 [offload_activation]: 2.126e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.437e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 1.215e-05 [set_forward_comm_id_for_comm_node_pass]: 1.475e-05 [meta_fg_expand]: 5.25999e-06 [flash_sp_send_recv_attached]: 2.58e-06 [receive_attached]: 2.041e-05 [after_resolve]: 1.391e-05 [a_after_grad]: 1.208e-05 [renormalize]: 0.0013296 [add_forward_monad_depend]: 5.71e-06 [auto_monad_grad]: 2.54999e-06 [auto_monad_eliminator]: 2.902e-05 [cse]: 0.00010124 [a_3]: 6.454e-05 [Cycle 2]: 0.00083949, [45] [expand_dump_flag]: 1.16002e-06 [switch_simplify]: 9.98002e-06 [loop_unroll]: 8.05e-06 [a_1]: 0.0001528 [with_stream_mark]: 1.322e-05 [recompute_prepare]: 8.55001e-06 [updatestate_depend_eliminate]: 5.17e-06 [updatestate_assign_eliminate]: 4.42998e-06 [updatestate_loads_eliminate]: 4.38001e-06 [parameter_eliminate]: 9.39996e-07 [a_2]: 0.00017518 [accelerated_algorithm]: 8.08999e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 1.79998e-06 [shard_inline]: 8.08999e-06 [merge_send_recv]: 6.93e-06 [auto_parallel]: 7.73001e-06 [parallel]: 4.23001e-06 [flash_sp]: 3.01999e-06 [merge_comm]: 5.00001e-06 [allreduce_fusion]: 4.59998e-06 [matmul_add_comm_reduction]: 7.86001e-06 [allreduce_slice_to_reducescatter]: 3.7998e-07 [virtual_shard_identity]: 8.99e-06 [virtual_dataset]: 8.03001e-06 [get_grad_eliminate_]: 7.77e-06 [virtual_output]: 7.5e-06 [merge_forward]: 4.53999e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 9.57001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.446e-05 [merge_recompute_call_nodes]: 6.39993e-07 [before_grad]: 1.065e-05 [set_forward_comm_id_for_comm_node_pass]: 4.98001e-06 [meta_fg_expand]: 2.71e-06 [flash_sp_send_recv_attached]: 8.29983e-07 [receive_attached]: 1.12e-06 [after_resolve]: 1.215e-05 [a_after_grad]: 1.112e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.23002e-06 [auto_monad_grad]: 8.99978e-07 [auto_monad_eliminator]: 8.82e-06 [cse]: 3.287e-05 [a_3]: 5.498e-05 [py_interpret_to_execute_after_opt_a]: 3.86001e-06 [slice_cell_reuse_recomputed_activation]: 2.11e-06 [rewriter_after_opt_a]: 4.311e-05 [convert_after_rewriter]: 1.22999e-06 [order_py_execute_after_rewriter]: 1.24e-06 [mutable_eliminate]: 0.00050908 [opt_b]: 0.00027812, [1] [Cycle 1]: 0.00027218, [7] [b_1]: 0.00017742 [b_2]: 9.27001e-06 [updatestate_depend_eliminate]: 7.47002e-06 [updatestate_assign_eliminate]: 4.58999e-06 [updatestate_loads_eliminate]: 4.37e-06 [renormalize]: 4.19997e-07 [cse]: 3.65e-05 [optimize_parallel_all_gather_comm]: 2.956e-05 [overlap_param_gather]: 1.213e-05 [cconv]: 2.311e-05 [loop_unroll]: 0.00041671 [opt_after_cconv]: 0.00012705, [1] [Cycle 1]: 0.0001214, [7] [c_1]: 3.462e-05 [parameter_eliminate]: 2.34001e-06 [updatestate_depend_eliminate]: 7.38e-06 [updatestate_assign_eliminate]: 4.82998e-06 [updatestate_loads_eliminate]: 4.39002e-06 [cse]: 3.509e-05 [renormalize]: 3.60014e-07 [remove_dup_value]: 7.265e-05 [tuple_transform]: 8.081e-05, [1] [Cycle 1]: 7.639e-05, [4] [d_1]: 4.762e-05 [none_parameter_eliminate]: 1.84998e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 8.90001e-06 [partial_unused_args_eliminate]: 1.76998e-06 [add_recomputation]: 6.455e-05 [cse_after_recomputation]: 2.838e-05, [1] [Cycle 1]: 2.386e-05, [1] [cse]: 1.851e-05 [environ_conv]: 2.114e-05 [swap_dp_allreduce_reducescatter]: 2.7e-05 [bias_add_comm_swap]: 1.217e-05 [label_micro_interleaved_index]: 1.312e-05 [label_fine_grained_interleaved_index]: 2.42001e-06 [merge_cast_opt]: 1.28002e-06 [slice_recompute_activation]: 2.18998e-06 [micro_interleaved_order_control]: 2.33002e-06 [assign_add_opt]: 1.19998e-06 [ForceFp32Comm]: 7.49977e-07 [remove_cast_before_assign_add]: 9.86998e-06 [full_micro_interleaved_order_control]: 1.099e-05 [reorder_send_recv_between_fp_bp]: 2.51e-06 [comm_op_add_attrs]: 9.60019e-07 [add_comm_op_reuse_tag]: 9.30013e-07 [interleave_split_concat_branches]: 1.00001e-06 [interleave_parallel_branches]: 1.061e-05 [overlap_opt_shard_in_pipeline]: 1.61e-05 [overlap_opt_shard_grad_in_pipeline]: 2.02001e-06 [control_data_broadcast_order]: 1.504e-05 [grouped_pairwise_exchange_alltoall]: 1.75001e-06 [offloading_packed_experts]: 4.4e-06 [overlap_recompute_and_grad_model_parallel]: 1.483e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.35001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.15999e-06 [overlap_recompute_comm]: 2.16e-06 [overlap_grad_ring_attention]: 2.296e-05 [overlap_grad_flash_sp]: 4.288e-05 [begin_end_overlap_inline]: 6.60017e-07 [split_matmul_comm_elemetwise]: 1.111e-05 [split_layernorm_comm]: 1.58002e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 7.951e-05, [1] [Cycle 1]: 7.521e-05, [6] [build]: 2.32999e-06 [elim_shapecalc]: 1.17e-05 [elim_not_effective]: 1.432e-05 [opt_reshape]: 8.59e-06 [fold_const_symbol]: 1.147e-05 [renormalize]: 2.10013e-07 [detach_backward]: 1.87001e-06 [pipeline_parallel_scheduler]: 1.38002e-06 [auto_monad_reorder]: 2.361e-05 [get_jit_bprop_graph]: 1.33002e-06 [rewriter_after_jit_bprop_graph]: 3.15002e-06 [opt_after_jit_grad]: 0.00046087 [validate]: 5.932e-05 [backend_pass]: 8.60018e-07 [task_emit]: 2.93112 [execute]: 7.71999e-06 Sums bootstrap : 0.000939s : 0.03% type_inference : 0.065603s : 2.18% event_method : 0.000043s : 0.00% auto_monad : 0.000131s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000052s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000021s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000074s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000552s : 0.02% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000105s : 0.00% optimize.opt_a.loop_unroll : 0.000057s : 0.00% optimize.opt_a.a_1 : 0.001017s : 0.03% optimize.opt_a.with_stream_mark : 0.000030s : 0.00% optimize.opt_a.recompute_prepare : 0.000019s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000021s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000294s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.00% optimize.opt_a.merge_send_recv : 0.000057s : 0.00% optimize.opt_a.auto_parallel : 0.000016s : 0.00% optimize.opt_a.parallel : 0.000088s : 0.00% optimize.opt_a.flash_sp : 0.000040s : 0.00% optimize.opt_a.merge_comm : 0.000011s : 0.00% optimize.opt_a.allreduce_fusion : 0.000019s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000028s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000010s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.00% optimize.opt_a.virtual_dataset : 0.000017s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.00% optimize.opt_a.virtual_output : 0.000016s : 0.00% optimize.opt_a.merge_forward : 0.000010s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000031s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000020s : 0.00% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000022s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.00% optimize.opt_a.a_after_grad : 0.000023s : 0.00% optimize.opt_a.renormalize : 0.001330s : 0.04% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.00% optimize.opt_a.cse : 0.000134s : 0.00% optimize.opt_a.a_3 : 0.000120s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000043s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000509s : 0.02% optimize.opt_b.b_1 : 0.000177s : 0.01% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000036s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000030s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000023s : 0.00% optimize.loop_unroll : 0.000417s : 0.01% optimize.opt_after_cconv.c_1 : 0.000035s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000035s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000073s : 0.00% optimize.tuple_transform.d_1 : 0.000048s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000065s : 0.00% optimize.cse_after_recomputation.cse : 0.000019s : 0.00% optimize.environ_conv : 0.000021s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000027s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000011s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000016s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000015s : 0.00% optimiz TotalTime = 3.03063, [24] [bootstrap]: 0.0008872 [type_inference]: 0.0643359 [event_method]: 5.282e-05 [auto_monad]: 0.00012036 [graph_reusing]: 4.57e-06 [inline]: 2.22001e-06 [add_attr]: 0.00768951, [1] [add_attr_with_inline]: 0.0076767, [1] [Cycle 1]: 0.00015897, [2] [tag_attr]: 5.294e-05 [meta_addattr_fg_expand]: 2.39e-05 [parallel-infer-symbol]: 1.72999e-06 [pre_auto_parallel]: 7.763e-05 [insert-virtual-dataset]: 2.59001e-06 [parallel-infer-symbol-second]: 7.40023e-07 [dataset_repeat_opt]: 2.21e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.00710266, [53] [py_interpret_to_execute]: 4.16001e-06 [rewriter_before_opt_a]: 0.00050046 [opt_a]: 0.00426535, [2] [Cycle 1]: 0.00349227, [45] [expand_dump_flag]: 3.30003e-06 [switch_simplify]: 0.00010182 [loop_unroll]: 4.948e-05 [a_1]: 0.00086394 [with_stream_mark]: 1.524e-05 [recompute_prepare]: 1.035e-05 [updatestate_depend_eliminate]: 1.742e-05 [updatestate_assign_eliminate]: 1.658e-05 [updatestate_loads_eliminate]: 8.48001e-06 [parameter_eliminate]: 1.32e-06 [a_2]: 0.00011597 [accelerated_algorithm]: 8.75001e-06 [shard]: 1.94e-06 [meta_shard_fg_expand]: 3.21001e-06 [shard_inline]: 8.35001e-06 [merge_send_recv]: 5.402e-05 [auto_parallel]: 7.98999e-06 [parallel]: 9.831e-05 [flash_sp]: 3.755e-05 [merge_comm]: 5.91e-06 [allreduce_fusion]: 1.711e-05 [matmul_add_comm_reduction]: 2.138e-05 [allreduce_slice_to_reducescatter]: 1.188e-05 [virtual_shard_identity]: 1.085e-05 [virtual_dataset]: 8.70999e-06 [get_grad_eliminate_]: 8.35001e-06 [virtual_output]: 8.2e-06 [merge_forward]: 8.75999e-06 [cell_reuse_recompute_pass]: 1.02e-06 [offload_activation]: 2.561e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.145e-05 [merge_recompute_call_nodes]: 1.37e-06 [before_grad]: 1.237e-05 [set_forward_comm_id_for_comm_node_pass]: 1.7e-05 [meta_fg_expand]: 5.08002e-06 [flash_sp_send_recv_attached]: 6.54001e-06 [receive_attached]: 2.525e-05 [after_resolve]: 1.36e-05 [a_after_grad]: 1.184e-05 [renormalize]: 0.00137382 [add_forward_monad_depend]: 5.67001e-06 [auto_monad_grad]: 1.89e-06 [auto_monad_eliminator]: 2.947e-05 [cse]: 9.55e-05 [a_3]: 6.407e-05 [Cycle 2]: 0.00076375, [45] [expand_dump_flag]: 1.24998e-06 [switch_simplify]: 1.004e-05 [loop_unroll]: 7.81001e-06 [a_1]: 0.00014787 [with_stream_mark]: 1.252e-05 [recompute_prepare]: 8.04002e-06 [updatestate_depend_eliminate]: 5.24e-06 [updatestate_assign_eliminate]: 4.47e-06 [updatestate_loads_eliminate]: 4.28999e-06 [parameter_eliminate]: 1.00999e-06 [a_2]: 0.00010852 [accelerated_algorithm]: 7.80998e-06 [shard]: 1.37999e-06 [meta_shard_fg_expand]: 1.82001e-06 [shard_inline]: 8.03001e-06 [merge_send_recv]: 6.44001e-06 [auto_parallel]: 7.11001e-06 [parallel]: 4.02998e-06 [flash_sp]: 3.2e-06 [merge_comm]: 4.96002e-06 [allreduce_fusion]: 4.45e-06 [matmul_add_comm_reduction]: 7.32002e-06 [allreduce_slice_to_reducescatter]: 4.30009e-07 [virtual_shard_identity]: 8.55001e-06 [virtual_dataset]: 8.23001e-06 [get_grad_eliminate_]: 7.98999e-06 [virtual_output]: 7.89002e-06 [merge_forward]: 4.56002e-06 [cell_reuse_recompute_pass]: 1.10999e-06 [offload_activation]: 9.09998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.415e-05 [merge_recompute_call_nodes]: 6.69999e-07 [before_grad]: 1.085e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87e-06 [meta_fg_expand]: 2.83e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 1.283e-05 [a_after_grad]: 1.103e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.43002e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 8.35999e-06 [cse]: 3.257e-05 [a_3]: 5.385e-05 [py_interpret_to_execute_after_opt_a]: 4.12e-06 [slice_cell_reuse_recomputed_activation]: 2.39999e-06 [rewriter_after_opt_a]: 4.577e-05 [convert_after_rewriter]: 1.34998e-06 [order_py_execute_after_rewriter]: 1.22999e-06 [mutable_eliminate]: 0.0005137 [opt_b]: 0.00028003, [1] [Cycle 1]: 0.0002742, [7] [b_1]: 0.00017719 [b_2]: 9.24e-06 [updatestate_depend_eliminate]: 7.52002e-06 [updatestate_assign_eliminate]: 4.42998e-06 [updatestate_loads_eliminate]: 4.20999e-06 [renormalize]: 5.09986e-07 [cse]: 3.878e-05 [optimize_parallel_all_gather_comm]: 2.781e-05 [overlap_param_gather]: 1.669e-05 [cconv]: 2.391e-05 [loop_unroll]: 0.00043164 [opt_after_cconv]: 0.00013071, [1] [Cycle 1]: 0.00012471, [7] [c_1]: 3.509e-05 [parameter_eliminate]: 2.43e-06 [updatestate_depend_eliminate]: 7.38999e-06 [updatestate_assign_eliminate]: 4.47e-06 [updatestate_loads_eliminate]: 4.2e-06 [cse]: 3.816e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 6.237e-05 [tuple_transform]: 7.998e-05, [1] [Cycle 1]: 7.547e-05, [4] [d_1]: 4.723e-05 [none_parameter_eliminate]: 1.20999e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 9.04e-06 [partial_unused_args_eliminate]: 1.15999e-06 [add_recomputation]: 7.087e-05 [cse_after_recomputation]: 2.814e-05, [1] [Cycle 1]: 2.365e-05, [1] [cse]: 1.83e-05 [environ_conv]: 1.63e-05 [swap_dp_allreduce_reducescatter]: 3.125e-05 [bias_add_comm_swap]: 1.317e-05 [label_micro_interleaved_index]: 2.046e-05 [label_fine_grained_interleaved_index]: 5.34998e-06 [merge_cast_opt]: 6.00005e-07 [slice_recompute_activation]: 3.65e-06 [micro_interleaved_order_control]: 1.20001e-06 [assign_add_opt]: 6.19999e-07 [ForceFp32Comm]: 3.70026e-07 [remove_cast_before_assign_add]: 1.829e-05 [full_micro_interleaved_order_control]: 1.441e-05 [reorder_send_recv_between_fp_bp]: 2.21e-06 [comm_op_add_attrs]: 9.80013e-07 [add_comm_op_reuse_tag]: 1.06002e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.268e-05 [overlap_opt_shard_in_pipeline]: 1.927e-05 [overlap_opt_shard_grad_in_pipeline]: 5.96e-06 [control_data_broadcast_order]: 1.791e-05 [grouped_pairwise_exchange_alltoall]: 6.00005e-07 [offloading_packed_experts]: 3.65e-06 [overlap_recompute_and_grad_model_parallel]: 1.102e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.61998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.62999e-06 [overlap_recompute_comm]: 5.57001e-06 [overlap_grad_ring_attention]: 2.334e-05 [overlap_grad_flash_sp]: 5.034e-05 [begin_end_overlap_inline]: 3.30008e-07 [split_matmul_comm_elemetwise]: 1.174e-05 [split_layernorm_comm]: 4.70001e-06 [handle_group_info]: 4.10015e-07 [symbol_engine_optimizer]: 8.105e-05, [1] [Cycle 1]: 7.69e-05, [6] [build]: 2.26e-06 [elim_shapecalc]: 1.315e-05 [elim_not_effective]: 1.367e-05 [opt_reshape]: 8.73001e-06 [fold_const_symbol]: 1.16e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.82999e-06 [pipeline_parallel_scheduler]: 1.57999e-06 [auto_monad_reorder]: 2.61e-05 [get_jit_bprop_graph]: 1.09998e-06 [rewriter_after_jit_bprop_graph]: 3.4e-06 [opt_after_jit_grad]: 0.00047047 [validate]: 6.242e-05 [backend_pass]: 7.00005e-07 [task_emit]: 2.94952 [execute]: 7.73999e-06 Sums bootstrap : 0.000887s : 0.03% type_inference : 0.064336s : 2.13% event_method : 0.000053s : 0.00% auto_monad : 0.000120s : 0.00% graph_reusing : 0.000005s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000053s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000024s : 0.00% parallel-infer-symbol : 0.000002s : 0.00% pre_auto_parallel : 0.000078s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000500s : 0.02% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000112s : 0.00% optimize.opt_a.loop_unroll : 0.000057s : 0.00% optimize.opt_a.a_1 : 0.001012s : 0.03% optimize.opt_a.with_stream_mark : 0.000028s : 0.00% optimize.opt_a.recompute_prepare : 0.000018s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000023s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000021s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000013s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000224s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.00% optimize.opt_a.merge_send_recv : 0.000060s : 0.00% optimize.opt_a.auto_parallel : 0.000015s : 0.00% optimize.opt_a.parallel : 0.000102s : 0.00% optimize.opt_a.flash_sp : 0.000041s : 0.00% optimize.opt_a.merge_comm : 0.000011s : 0.00% optimize.opt_a.allreduce_fusion : 0.000022s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000029s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000012s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.00% optimize.opt_a.virtual_dataset : 0.000017s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.00% optimize.opt_a.virtual_output : 0.000016s : 0.00% optimize.opt_a.merge_forward : 0.000013s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000035s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000022s : 0.00% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000026s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.00% optimize.opt_a.a_after_grad : 0.000023s : 0.00% optimize.opt_a.renormalize : 0.001374s : 0.05% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.00% optimize.opt_a.cse : 0.000128s : 0.00% optimize.opt_a.a_3 : 0.000118s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000046s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000514s : 0.02% optimize.opt_b.b_1 : 0.000177s : 0.01% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000039s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.00% optimize.overlap_param_gather : 0.000017s : 0.00% optimize.cconv : 0.000024s : 0.00% optimize.loop_unroll : 0.000432s : 0.01% optimize.opt_after_cconv.c_1 : 0.000035s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000038s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000062s : 0.00% optimize.tuple_transform.d_1 : 0.000047s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000071s : 0.00% optimize.cse_after_recomputation.cse : 0.000018s : 0.00% optimize.environ_conv : 0.000016s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000031s : 0.00% optimize.bias_add_comm_swap : 0.000013s : 0.00% optimize.label_micro_interleaved_index : 0.000020s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000004s : 0.00% optimize.micro_interleaved_order_control : 0.000001s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000000s : 0.00% optimize.remove_cast_before_assign_add : 0.000018s : 0.00% optimize.full_micro_interleaved_order_control : 0.000014s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000013s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000019s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000006s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000011s : 0.00% optimize.e.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000023s : 0.00% optimize.overlap_grad_flash_sp : 0.000043s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000024s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000461s : 0.02% validate : 0.000059s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.931116s : 97.55% execute : 0.000008s : 0.00% overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000006s : 0.00% optimize.overlap_grad_ring_attention : 0.000023s : 0.00% optimize.overlap_grad_flash_sp : 0.000050s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000012s : 0.00% optimize.split_layernorm_comm : 0.000005s : 0.00% optimize.handle_group_info : 0.000000s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000026s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000470s : 0.02% validate : 0.000062s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.949524s : 97.60% execute : 0.000008s : 0.00% Time group info: ------[substitution.] 0.000263 34 0.63% : 0.000002s : 2: substitution.elim_not_effective 0.49% : 0.000001s : 2: substitution.fold_const_symbol 2.15% : 0.000006s : 5: substitution.graph_param_transform 81.83% : 0.000215s : 5: substitution.inline 1.29% : 0.000003s : 4: substitution.j_node_and_user_rematch 5.29% : 0.000014s : 4: substitution.remove_not_recompute_node 1.67% : 0.000004s : 6: substitution.replace_old_param 6.64% : 0.000017s : 6: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.065498 2 96.68% : 0.063320s : 1: type_inference.infer 3.32% : 0.002178s : 1: type_inference.specialize ------[replace.] 0.000138 11 59.78% : 0.000082s : 5: replace.inline 40.22% : 0.000055s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000227 11 93.46% : 0.000212s : 5: match.inline 6.54% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000223 1659 1.01% : 0.000002s : 17: predicate.accumulaten_eliminater 0.69% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 10: predicate.addn_check_dump 0.92% : 0.000002s : 17: predicate.addn_zero_filter 0.85% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.19% : 0.000005s : 27: predicate.arithmetic_simplify 0.99% : 0.000002s : 17: predicate.cast_eliminate 0.57% : 0.000001s : 10: predicate.check_bprop_eliminate 0.48% : 0.000001s : 10: predicate.compare_switch_simplify 0.20% : 0.000000s : 5: predicate.const_output_eliminate 0.48% : 0.000001s : 10: predicate.depend_value_elim 0.98% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.09% : 0.000002s : 17: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.80% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 22: predicate.environ_get_add_eliminate 1.11% : 0.000002s : 22: predicate.environ_get_depend_swap 1.75% : 0.000004s : 32: predicate.environ_get_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.60% : 0.000004s : 28: predicate.exchange_switch_depend_value 2.45% : 0.000005s : 28: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.76% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 5: predicate.fold_const_symbol 0.64% : 0.000001s : 10: predicate.get_grad_eliminate 0.25% : 0.000001s : 5: predicate.graph_param_transform 0.57% : 0.000001s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 5.72% : 0.000013s : 75: predicate.inline 0.65% : 0.000001s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.68% : 0.000002s : 10: predicate.less_batch_normalization 1.90% : 0.000004s : 33: predicate.list_to_tuple_eliminator_ 2.69% : 0.000006s : 50: predicate.load_eliminater 0.65% : 0.000001s : 5: predicate.loop_unroll_after_grad 2.85% : 0.000006s : 48: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 10: predicate.merge_addn 0.50% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 17: predicate.minmaximum_grad 0.83% : 0.000002s : 5: predicate.mutable_eliminate 0.33% : 0.000001s : 5: predicate.opt_reshape 0.33% Time group info: ------[substitution.] 0.000262 34 0.46% : 0.000001s : 2: substitution.elim_not_effective 0.52% : 0.000001s : 2: substitution.fold_const_symbol 1.59% : 0.000004s : 5: substitution.graph_param_transform 83.24% : 0.000218s : 5: substitution.inline 1.32% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.32% : 0.000011s : 4: substitution.remove_not_recompute_node 1.67% : 0.000004s : 6: substitution.replace_old_param 6.88% : 0.000018s : 6: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.064240 2 96.54% : 0.062020s : 1: type_inference.infer 3.46% : 0.002220s : 1: type_inference.specialize ------[replace.] 0.000132 11 58.87% : 0.000078s : 5: replace.inline 41.13% : 0.000054s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000230 11 93.30% : 0.000215s : 5: match.inline 6.70% : 0.000015s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000225 1659 0.95% : 0.000002s : 17: predicate.accumulaten_eliminater 0.77% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 10: predicate.addn_check_dump 1.05% : 0.000002s : 17: predicate.addn_zero_filter 0.84% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.24% : 0.000005s : 27: predicate.arithmetic_simplify 0.99% : 0.000002s : 17: predicate.cast_eliminate 0.55% : 0.000001s : 10: predicate.check_bprop_eliminate 0.52% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.52% : 0.000001s : 10: predicate.depend_value_elim 1.04% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 17: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.84% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.25% : 0.000001s : 5: predicate.elim_not_effective 0.33% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 22: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_depend_swap 1.72% : 0.000004s : 32: predicate.environ_get_eliminate 1.15% : 0.000003s : 22: predicate.environ_get_set_eliminate 1.59% : 0.000004s : 28: predicate.exchange_switch_depend_value 2.27% : 0.000005s : 28: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.76% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.65% : 0.000001s : 10: predicate.get_grad_eliminate 0.23% : 0.000001s : 5: predicate.graph_param_transform 0.53% : 0.000001s : 10: predicate.incorporate_call 0.48% : 0.000001s : 10: predicate.incorporate_call_switch 5.88% : 0.000013s : 75: predicate.inline 0.66% : 0.000001s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.66% : 0.000001s : 10: predicate.less_batch_normalization 1.86% : 0.000004s : 33: predicate.list_to_tuple_eliminator_ 2.76% : 0.000006s : 50: predicate.load_eliminater 0.84% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.92% : 0.000007s : 48: predicate.loop_unroll_before_grad 1.52% : 0.000003s : 27: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 10: predicate.merge_addn 0.53% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 17: predicate.minmaximum_grad 0.88% : 0.000002s : 5: predicate.mutable_eliminate 0.32% : 0.000001s : 5: predicate.opt_reshape 0.31% : 0.000001s : 5: predicate.parallel_virtual_node 2.07% : 0.000005s : 28: predicate.partial_defer_inline 1.69% : 0.000004s : 28: predicate.partial_eliminate 0.98% : 0.000002s : 17: predicate.print_const_string_wrapper 0.59% : 0.000001s : 10: predicate.reduce_all_const_elim 1.27% : 0.000003s : 17: predicate.reduce_eliminate 2.66% : 0.000006s : 50: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 10: predicate.remove_not_recompute_node 1.51% : 0.000003s : 33: predicate.replace_applicator 0.50% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 0.99% : 0.000002s : 17: predicate.reshape_eliminate 0.73% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 5: predicate.row_tensor_eliminate 0.67% : 0.000001s : 10: predicate.same_eliminate 0.46% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 10: predicate.shard_identity_eliminate 0.68% : 0.000002s : 10: predicate.special_op_eliminate 0.64% : 0.000001s : 10: predicate.specialize_transform 0.69% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.72% : 0.000004s : 28: predicate.switch_defer_inline 2.28% : 0.000005s : 38: predicate.switch_layer_defer_inline 5.67% : 0.000013s : 91: predicate.switch_simplify 0.96% : 0.000002s : 17: predicate.tile_eliminate 0.93% : 0.000002s : 17: predicate.transpose_eliminate 1.52% : 0.000003s : 27: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 27: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 27: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000007s : 43: predicate.tuple_list_get_item_eliminator 1.52% : 0.000003s : 27: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000005s : 37: predicate.tuple_list_set_item_eliminator 1.85% : 0.000004s : 33: predicate.tuple_to_list_eliminator_ 2.57% : 0.000006s : 50: predicate.updatestate_pure_node_eliminater 3.34% : 0.000007s : 60: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 5: predicate.value_based_eliminate 0.66% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 10: predicate.virtual_output_eliminate 0.29% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001491 15 57.24% : 0.000853s : 8: func_graph_cloner_run.FuncGraphClonerGraph 42.76% : 0.000638s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.031479 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.25% : 0.007663s : 1: add_attr 0.25% : 0.007647s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000069s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000137s : 1: auto_monad 0.00% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.03% : 0.000975s : 1: bootstrap 0.00% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% :: 0.000001s : 5: predicate.parallel_virtual_node 2.03% : 0.000005s : 28: predicate.partial_defer_inline 1.69% : 0.000004s : 28: predicate.partial_eliminate 0.96% : 0.000002s : 17: predicate.print_const_string_wrapper 0.54% : 0.000001s : 10: predicate.reduce_all_const_elim 1.23% : 0.000003s : 17: predicate.reduce_eliminate 2.54% : 0.000006s : 50: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 10: predicate.remove_not_recompute_node 1.49% : 0.000003s : 33: predicate.replace_applicator 0.44% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 0.95% : 0.000002s : 17: predicate.reshape_eliminate 0.61% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.30% : 0.000001s : 5: predicate.row_tensor_eliminate 0.71% : 0.000002s : 10: predicate.same_eliminate 0.46% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.71% : 0.000002s : 10: predicate.shard_identity_eliminate 0.64% : 0.000001s : 10: predicate.special_op_eliminate 0.63% : 0.000001s : 10: predicate.specialize_transform 0.75% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.75% : 0.000004s : 28: predicate.switch_defer_inline 2.20% : 0.000005s : 38: predicate.switch_layer_defer_inline 5.88% : 0.000013s : 91: predicate.switch_simplify 0.96% : 0.000002s : 17: predicate.tile_eliminate 0.90% : 0.000002s : 17: predicate.transpose_eliminate 1.50% : 0.000003s : 27: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000003s : 27: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 27: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000007s : 43: predicate.tuple_list_get_item_eliminator 1.52% : 0.000003s : 27: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 37: predicate.tuple_list_set_item_eliminator 1.94% : 0.000004s : 33: predicate.tuple_to_list_eliminator_ 2.63% : 0.000006s : 50: predicate.updatestate_pure_node_eliminater 3.26% : 0.000007s : 60: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 5: predicate.value_based_eliminate 0.64% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.65% : 0.000001s : 10: predicate.virtual_output_eliminate 0.24% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001485 15 56.45% : 0.000839s : 8: func_graph_cloner_run.FuncGraphClonerGraph 43.55% : 0.000647s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.048705 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.25% : 0.007694s : 1: add_attr 0.25% : 0.007680s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000075s : 1: add_recomputation 0.00% : 0.000003s : 1: assign_add_opt 0.00% : 0.000127s : 1: auto_monad 0.00% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000016s : 1: bias_add_comm_swap 0.03% : 0.000925s : 1: bootstrap 0.00% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000025s : 1: environ_conv 0.00% : 0.000050s : 1: event_method 0.00% : 0.000016s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.01% : 0.000425s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000518s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000017s : 1: opt.transform.mutable_eliminate 0.06% : 0.001738s : 78: opt.transform.opt_a 0.00% : 0.000033s : 1: opt.transform.opt_after_cconv 0.00% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000160s : 28: opt.transform.opt_b 0.00% : 0.000054s : 2: opt.transform.opt_trans_graph 0.00% : 0.000043s : 4: opt.transform.symbol_engine_opt 0.14% : 0.004254s : 1: opt_a 0.00% : 0.000130s : 1: opt_after_cconv 0.02% : 0.000470s : 1: opt_after_jit_grad 0.01% : 0.000281s : 1: opt_b 0.23% : 0.007070s : 1: optimize 0.00% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000046s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000026s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000019s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000018s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000079s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.00% : 0.000077s : 1: remove_dup_value 0.02% : 0.000658s : 1: renormalize.infer 0.02% : 0.000663s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000047s : 1: rewriter_after_opt_a 0.02% : 0.000559s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000014s : 1: split_matmul_comm_elemetwise 0.00% : 0.000030s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000082s : 1: symbol_engine_optimizer 96.69% : 2.931144s : 1: task_emit 0.00% : 0.000084s : 1: tuple_transform 2.16% : 0.065620s : 1: type_inference 0.00% : 0.000087s : 1: validate 0.000020s : 1: environ_conv 0.00% : 0.000060s : 1: event_method 0.00% : 0.000018s : 1: execute 0.00% : 0.000018s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000008s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000015s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000024s : 1: label_micro_interleaved_index 0.01% : 0.000441s : 1: loop_unroll 0.00% : 0.000003s : 1: merge_cast_opt 0.00% : 0.000004s : 1: micro_interleaved_order_control 0.02% : 0.000523s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000017s : 1: opt.transform.mutable_eliminate 0.05% : 0.001669s : 78: opt.transform.opt_a 0.00% : 0.000034s : 1: opt.transform.opt_after_cconv 0.00% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000161s : 28: opt.transform.opt_b 0.00% : 0.000054s : 2: opt.transform.opt_trans_graph 0.00% : 0.000044s : 4: opt.transform.symbol_engine_opt 0.14% : 0.004269s : 1: opt_a 0.00% : 0.000134s : 1: opt_after_cconv 0.02% : 0.000480s : 1: opt_after_jit_grad 0.01% : 0.000283s : 1: opt_b 0.23% : 0.007107s : 1: optimize 0.00% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000054s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000026s : 1: overlap_grad_ring_attention 0.00% : 0.000009s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000023s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000020s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000005s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000083s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000022s : 1: remove_cast_before_assign_add 0.00% : 0.000067s : 1: remove_dup_value 0.02% : 0.000713s : 1: renormalize.infer 0.02% : 0.000652s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000049s : 1: rewriter_after_opt_a 0.02% : 0.000507s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000015s : 1: split_matmul_comm_elemetwise 0.00% : 0.000035s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000084s : 1: symbol_engine_optimizer 96.75% : 2.949551s : 1: task_emit 0.00% : 0.000083s : 1: tuple_transform 2.11% : 0.064357s : 1: type_inference 0.00% : 0.000091s : 1: validate TotalTime = 3.07528, [24] [bootstrap]: 0.00088167 [type_inference]: 0.0643377 [event_method]: 4.266e-05 [auto_monad]: 0.00013195 [graph_reusing]: 6.22001e-06 [inline]: 2.36e-06 [add_attr]: 0.00768967, [1] [add_attr_with_inline]: 0.00767707, [1] [Cycle 1]: 0.0001597, [2] [tag_attr]: 5.556e-05 [meta_addattr_fg_expand]: 2.534e-05 [parallel-infer-symbol]: 1.83002e-06 [pre_auto_parallel]: 8.954e-05 [insert-virtual-dataset]: 9.50007e-07 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 9.30013e-07 [pipeline_split]: 7.39994e-07 [optimize]: 0.00709518, [53] [py_interpret_to_execute]: 3.06001e-06 [rewriter_before_opt_a]: 0.00049925 [opt_a]: 0.00427785, [2] [Cycle 1]: 0.00350144, [45] [expand_dump_flag]: 2.16998e-06 [switch_simplify]: 0.00010092 [loop_unroll]: 4.95e-05 [a_1]: 0.00088131 [with_stream_mark]: 1.024e-05 [recompute_prepare]: 9.77001e-06 [updatestate_depend_eliminate]: 9.61003e-06 [updatestate_assign_eliminate]: 1.542e-05 [updatestate_loads_eliminate]: 4.67e-06 [parameter_eliminate]: 2.40002e-06 [a_2]: 0.00011832 [accelerated_algorithm]: 8.80001e-06 [shard]: 5.05001e-06 [meta_shard_fg_expand]: 3.46999e-06 [shard_inline]: 8.37e-06 [merge_send_recv]: 5.047e-05 [auto_parallel]: 8.42998e-06 [parallel]: 9.521e-05 [flash_sp]: 4.079e-05 [merge_comm]: 5.70001e-06 [allreduce_fusion]: 1.682e-05 [matmul_add_comm_reduction]: 2.152e-05 [allreduce_slice_to_reducescatter]: 1.214e-05 [virtual_shard_identity]: 1.081e-05 [virtual_dataset]: 8.75999e-06 [get_grad_eliminate_]: 8.69003e-06 [virtual_output]: 8.38999e-06 [merge_forward]: 5.99e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 2.482e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.994e-05 [merge_recompute_call_nodes]: 1.212e-05 [before_grad]: 1.458e-05 [set_forward_comm_id_for_comm_node_pass]: 8.67e-06 [meta_fg_expand]: 4.95999e-06 [flash_sp_send_recv_attached]: 1.47001e-06 [receive_attached]: 2.019e-05 [after_resolve]: 1.397e-05 [a_after_grad]: 1.217e-05 [renormalize]: 0.00140831 [add_forward_monad_depend]: 4.68001e-06 [auto_monad_grad]: 1.22e-06 [auto_monad_eliminator]: 2.52e-05 [cse]: 7.745e-05 [a_3]: 6.389e-05 [Cycle 2]: 0.00076688, [45] [expand_dump_flag]: 1.19e-06 [switch_simplify]: 9.57999e-06 [loop_unroll]: 8.65001e-06 [a_1]: 0.00014974 [with_stream_mark]: 1.225e-05 [recompute_prepare]: 8.3e-06 [updatestate_depend_eliminate]: 4.99e-06 [updatestate_assign_eliminate]: 4.39002e-06 [updatestate_loads_eliminate]: 4.24002e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 0.00010708 [accelerated_algorithm]: 7.93001e-06 [shard]: 1.17e-06 [meta_shard_fg_expand]: 1.87999e-06 [shard_inline]: 8.02e-06 [merge_send_recv]: 6.33998e-06 [auto_parallel]: 7.52998e-06 [parallel]: 3.90998e-06 [flash_sp]: 8.79e-06 [merge_comm]: 4.85999e-06 [allreduce_fusion]: 4.52e-06 [matmul_add_comm_reduction]: 6.29999e-06 [allreduce_slice_to_reducescatter]: 2.80008e-07 [virtual_shard_identity]: 9.18002e-06 [virtual_dataset]: 7.93999e-06 [get_grad_eliminate_]: 7.81001e-06 [virtual_output]: 7.92e-06 [merge_forward]: 4.47e-06 [cell_reuse_recompute_pass]: 1.04e-06 [offload_activation]: 8.68001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.379e-05 [merge_recompute_call_nodes]: 6.80011e-07 [before_grad]: 1.051e-05 [set_forward_comm_id_for_comm_node_pass]: 4.90001e-06 [meta_fg_expand]: 2.84001e-06 [flash_sp_send_recv_attached]: 8.29983e-07 [receive_attached]: 7.30011e-07 [after_resolve]: 1.234e-05 [a_after_grad]: 1.151e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.17e-06 [auto_monad_grad]: 9.89996e-07 [auto_monad_eliminator]: 8.48001e-06 [cse]: 3.151e-05 [a_3]: 5.348e-05 [py_interpret_to_execute_after_opt_a]: 4.47e-06 [slice_cell_reuse_recomputed_activation]: 1.02e-06 [rewriter_after_opt_a]: 3.513e-05 [convert_after_rewriter]: 1.23002e-06 [order_py_execute_after_rewriter]: 1.02e-06 [mutable_eliminate]: 0.00051441 [opt_b]: 0.00027957, [1] [Cycle 1]: 0.00027377, [7] [b_1]: 0.00017757 [b_2]: 9.81e-06 [updatestate_depend_eliminate]: 7.30003e-06 [updatestate_assign_eliminate]: 4.43999e-06 [updatestate_loads_eliminate]: 4.28001e-06 [renormalize]: 3.89991e-07 [cse]: 3.755e-05 [optimize_parallel_all_gather_comm]: 2.543e-05 [overlap_param_gather]: 1.561e-05 [cconv]: 3.283e-05 [loop_unroll]: 0.0004161 [opt_after_cconv]: 0.00012825, [1] [Cycle 1]: 0.00012256, [7] [c_1]: 3.44e-05 [parameter_eliminate]: 2.43e-06 [updatestate_depend_eliminate]: 7.16001e-06 [updatestate_assign_eliminate]: 4.55999e-06 [updatestate_loads_eliminate]: 4.44002e-06 [cse]: 3.703e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 7.087e-05 [tuple_transform]: 8.175e-05, [1] [Cycle 1]: 7.745e-05, [4] [d_1]: 4.861e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 9.05001e-06 [partial_unused_args_eliminate]: 2.06998e-06 [add_recomputation]: 7.088e-05 [cse_after_recomputation]: 2.67e-05, [1] [Cycle 1]: 2.248e-05, [1] [cse]: 1.728e-05 [environ_conv]: 1.776e-05 [swap_dp_allreduce_reducescatter]: 3.414e-05 [bias_add_comm_swap]: 1.151e-05 [label_micro_interleaved_index]: 1.798e-05 [label_fine_grained_interleaved_index]: 2.39001e-06 [merge_cast_opt]: 1.60999e-06 [slice_recompute_activation]: 2.28002e-06 [micro_interleaved_order_control]: 4.17e-06 [assign_add_opt]: 4.80009e-07 [ForceFp32Comm]: 3.16999e-06 [remove_cast_before_assign_add]: 1.67e-05 [full_micro_interleaved_order_control]: 1.427e-05 [reorder_send_recv_between_fp_bp]: 2.14e-06 [comm_op_add_attrs]: 3.23998e-06 [add_comm_op_reuse_tag]: 3.89991e-07 [interleave_split_concat_branches]: 8.00006e-07 [interleave_parallel_branches]: 1.167e-05 [overlap_opt_shard_in_pipeline]: 1.919e-05 [overlap_opt_shard_grad_in_pipeline]: 3.86001e-06 [control_data_broadcast_order]: 1.459e-05 [grouped_pairwise_exchange_alltoall]: 1.30999e-06 [offloading_packed_experts]: 4.63999e-06 [overlap_recompute_and_grad_model_parallel]: 1.855e-05 [overlap_grad_matmul_and_grad_allreduce]: 7.79983e-07 [overlap_recompute_allgather_and_fa_grad]: 6.50005e-07 [overlap_recompute_comm]: 1.32e-06 [overlap_grad_ring_attention]: 2.67e-05 [overlap_grad_flash_sp]: 5.002e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 1.33e-05 [split_layernorm_comm]: 1.49998e-06 [handle_group_info]: 1.35999e-06 [symbol_engine_optimizer]: 8.314e-05, [1] [Cycle 1]: 7.89e-05, [6] [build]: 6.28002e-06 [elim_shapecalc]: 1.203e-05 [elim_not_effective]: 1.302e-05 [opt_reshape]: 8.82e-06 [fold_const_symbol]: 1.147e-05 [renormalize]: 1.79978e-07 [detach_backward]: 1.77001e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 2.623e-05 [get_jit_bprop_graph]: 9.09989e-07 [rewriter_after_jit_bprop_graph]: 2.88998e-06 [opt_after_jit_grad]: 0.00047345 [validate]: 5.99e-05 [backend_pass]: 8.79983e-07 [task_emit]: 2.99416 [execute]: 1.022e-05 Sums bootstrap : 0.000882s : 0.03% type_inference : 0.064338s : 2.10% event_method : 0.000043s : 0.00% auto_monad : 0.000132s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000056s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000025s : 0.00% parallel-infer-symbol : 0.000002s : 0.00% pre_auto_parallel : 0.000090s : 0.00% insert-virtual-dataset : 0.000001s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000001s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000003s : 0.00% optimize.rewriter_before_opt_a : 0.000499s : 0.02% optimize.opt_a.expand_dump_flag : 0.000003s : 0.00% optimize.opt_a.switch_simplify : 0.000110s : 0.00% optimize.opt_a.loop_unroll : 0.000058s : 0.00% optimize.opt_a.a_1 : 0.001031s : 0.03% optimize.opt_a.with_stream_mark : 0.000022s : 0.00% optimize.opt_a.recompute_prepare : 0.000018s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000020s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000225s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.00% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.00% optimize.opt_a.merge_send_recv : 0.000057s : 0.00% optimize.opt_a.auto_parallel : 0.000016s : 0.00% optimize.opt_a.parallel : 0.000099s : 0.00% optimize.opt_a.flash_sp : 0.000050s : 0.00% optimize.opt_a.merge_comm : 0.000011s : 0.00% optimize.opt_a.allreduce_fusion : 0.000021s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000028s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000012s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.00% optimize.opt_a.virtual_dataset : 0.000017s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000017s : 0.00% optimize.opt_a.virtual_output : 0.000016s : 0.00% optimize.opt_a.merge_forward : 0.000010s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000034s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000044s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000013s : 0.00% optimize.opt_a.before_grad : 0.000025s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000014s : 0.00% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000002s : 0.00% optimize.opt_a.receive_attached : 0.000021s : 0.00% optimize.opt_a.after_resolve : 0.000026s : 0.00% optimize.opt_a.a_after_grad : 0.000024s : 0.00% optimize.opt_a.renormalize : 0.001408s : 0.05% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.00% optimize.opt_a.cse : 0.000109s : 0.00% optimize.opt_a.a_3 : 0.000117s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000001s : 0.00% optimize.rewriter_after_opt_a : 0.000035s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000514s : 0.02% optimize.opt_b.b_1 : 0.000178s : 0.01% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000038s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.00% optimize.overlap_param_gather : 0.000016s : 0.00% optimize.cconv : 0.000033s : 0.00% optimize.loop_unroll : 0.000416s : 0.01% optimize.opt_after_cconv.c_1 : 0.000034s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000037s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000071s : 0.00% optimize.tuple_transform.d_1 : 0.000049s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000071s : 0.00% optimize.cse_after_recomputation.cse : 0.000017s : 0.00% optimize.environ_conv : 0.000018s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000034s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000018s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000004s : 0.00% optimize.assign_add_opt : 0.000000s : 0.00% optimize.ForceFp32Comm : 0.000003s : 0.00% optimize.remove_cast_before_assign_add : 0.000017s : 0.00% optimize.full_micro_interleaved_order_control : 0.000014s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000003s : 0.00% optimize.add_comm_op_reuse_tag : 0.000000s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000012s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000019s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000019s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000001s : 0.00% optimize.overlap_grad_ring_attention : 0.000027s : 0.00% optimize.overlap_grad_flash_sp : 0.000050s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000013s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000006s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000026s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000473s : 0.02% validate : 0.000060s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.994158s : 97.64% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000284 34 0.39% : 0.000001s : 2: substitution.elim_not_effective 0.32% : 0.000001s : 2: substitution.fold_const_symbol 2.12% : 0.000006s : 5: substitution.graph_param_transform 77.44% : 0.000220s : 5: substitution.inline 0.89% : 0.000003s : 4: substitution.j_node_and_user_rematch 7.04% : 0.000020s : 4: substitution.remove_not_recompute_node 1.52% : 0.000004s : 6: substitution.replace_old_param 10.27% : 0.000029s : 6: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.064243 2 96.54% : 0.062020s : 1: type_inference.infer 3.46% : 0.002224s : 1: type_inference.specialize ------[replace.] 0.000138 11 58.45% : 0.000081s : 5: replace.inline 41.55% : 0.000057s : 6: replace.tuple_list_get_item_eliminator ------[match.] 0.000243 11 89.17% : 0.000217s : 5: match.inline 10.83% : 0.000026s : 6: match.tuple_list_get_item_eliminator ------[predicate.] 0.000225 1659 1.06% : 0.000002s : 17: predicate.accumulaten_eliminater 0.71% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 10: predicate.addn_check_dump 0.89% : 0.000002s : 17: predicate.addn_zero_filter 0.86% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.13% : 0.000005s : 27: predicate.arithmetic_simplify 0.95% : 0.000002s : 17: predicate.cast_eliminate 0.57% : 0.000001s : 10: predicate.check_bprop_eliminate 0.50% : 0.000001s : 10: predicate.compare_switch_simplify 0.19% : 0.000000s : 5: predicate.const_output_eliminate 0.52% : 0.000001s : 10: predicate.depend_value_elim 0.98% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 17: predicate.dict_get_item_eliminator 0.88% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 22: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 22: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 22: predicate.environ_get_depend_swap 1.70% : 0.000004s : 32: predicate.environ_get_eliminate 1.10% : 0.000002s : 22: predicate.environ_get_set_eliminate 1.56% : 0.000004s : 28: predicate.exchange_switch_depend_value 2.41% : 0.000005s : 28: predicate.float_depend_g_call 0.51% : 0.000001s : 10: predicate.float_environ_get_switch 0.74% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.20% : 0.000000s : 5: predicate.fold_const_symbol 0.71% : 0.000002s : 10: predicate.get_grad_eliminate 0.24% : 0.000001s : 5: predicate.graph_param_transform 0.53% : 0.000001s : 10: predicate.incorporate_call 0.46% : 0.000001s : 10: predicate.incorporate_call_switch 5.71% : 0.000013s : 75: predicate.inline 0.69% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 10: predicate.less_batch_normalization 2.08% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.66% : 0.000006s : 50: predicate.load_eliminater 0.91% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.84% : 0.000006s : 48: predicate.loop_unroll_before_grad 1.60% : 0.000004s : 27: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 10: predicate.merge_addn 0.53% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 17: predicate.minmaximum_grad 0.86% : 0.000002s : 5: predicate.mutable_eliminate 0.36% : 0.000001s : 5: predicate.opt_reshape 0.41% : 0.000001s : 5: predicate.parallel_virtual_node 2.08% : 0.000005s : 28: predicate.partial_defer_inline 1.67% : 0.000004s : 28: predicate.partial_eliminate 0.94% : 0.000002s : 17: predicate.print_const_string_wrapper 0.57% : 0.000001s : 10: predicate.reduce_all_const_elim 1.28% : 0.000003s : 17: predicate.reduce_eliminate 2.71% : 0.000006s : 50: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 10: predicate.remove_not_recompute_node 1.47% : 0.000003s : 33: predicate.replace_applicator 0.40% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 0.98% : 0.000002s : 17: predicate.reshape_eliminate 0.60% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 5: predicate.row_tensor_eliminate 0.71% : 0.000002s : 10: predicate.same_eliminate 0.44% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 10: predicate.shard_identity_eliminate 0.63% : 0.000001s : 10: predicate.special_op_eliminate 0.65% : 0.000001s : 10: predicate.specialize_transform 0.81% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.68% : 0.000004s : 28: predicate.switch_defer_inline 2.28% : 0.000005s : 38: predicate.switch_layer_defer_inline 5.74% : 0.000013s : 91: predicate.switch_simplify 0.96% : 0.000002s : 17: predicate.tile_eliminate 0.91% : 0.000002s : 17: predicate.transpose_eliminate 1.49% : 0.000003s : 27: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 27: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 27: predicate.tuple_list_get_item_depend_reorder 3.05% : 0.000007s : 43: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 27: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000005s : 37: predicate.tuple_list_set_item_eliminator 1.90% : 0.000004s : 33: predicate.tuple_to_list_eliminator_ 2.58% : 0.000006s : 50: predicate.updatestate_pure_node_eliminater 3.20% : 0.000007s : 60: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 5: predicate.value_based_eliminate 0.70% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.77% : 0.000002s : 10: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.30% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001480 15 55.86% : 0.000827s : 8: func_graph_cloner_run.FuncGraphClonerGraph 44.14% : 0.000653s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.093412 196 0.00% : 0.000006s : 1: ForceFp32Comm 0.25% : 0.007694s : 1: add_attr 0.25% : 0.007680s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000075s : 1: add_recomputation 0.00% : 0.000003s : 1: assign_add_opt 0.00% : 0.000137s : 1: auto_monad 0.00% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.03% : 0.000925s : 1: bootstrap 0.00% : 0.000036s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.00% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000004s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000022s : 1: environ_conv 0.00% : 0.000049s : 1: event_method 0.00% : 0.000021s : 1: execute 0.00% : 0.000018s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000004s : 1: insert-virtual-dataset 0.00% : 0.000014s : 1: interleave_parallel_branches 0.00% : 0.000003s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000021s : 1: label_micro_interleaved_index 0.01% : 0.000424s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000007s : 1: micro_interleaved_order_control 0.02% : 0.000524s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000018s : 1: opt.transform.mutable_eliminate 0.05% : 0.001700s : 78: opt.transform.opt_a 0.00% : 0.000033s : 1: opt.transform.opt_after_cconv 0.00% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000161s : 28: opt.transform.opt_b 0.00% : 0.000055s : 2: opt.transform.opt_trans_graph 0.00% : 0.000042s : 4: opt.transform.symbol_engine_opt 0.14% : 0.004281s : 1: opt_a 0.00% : 0.000131s : 1: opt_after_cconv 0.02% : 0.000483s : 1: opt_after_jit_grad 0.01% : 0.000283s : 1: opt_b 0.23% : 0.007099s : 1: optimize 0.00% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000053s : 1: overlap_grad_flash_sp 0.00% : 0.000003s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000030s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000023s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000019s : 1: overlap_param_gather 0.00% : 0.000003s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000021s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000004s : 1: overlap_recompute_comm 0.00% : 0.000005s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000094s : 1: pre_auto_parallel 0.00% : 0.000006s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000020s : 1: remove_cast_before_assign_add 0.00% : 0.000075s : 1: remove_dup_value 0.02% : 0.000721s : 1: renormalize.infer 0.02% : 0.000677s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000039s : 1: rewriter_after_opt_a 0.02% : 0.000505s : 1: rewriter_before_opt_a 0.00% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000016s : 1: split_matmul_comm_elemetwise 0.00% : 0.000037s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000086s : 1: symbol_engine_optimizer 96.79% : 2.994192s : 1: task_emit 0.00% : 0.000085s : 1: tuple_transform 2.08% : 0.064357s : 1: type_inference 0.00% : 0.000088s : 1: validate TotalTime = 3.12987, [24] [bootstrap]: 0.0008705 [type_inference]: 0.180434 [event_method]: 0.00034059 [auto_monad]: 0.0003809 [graph_reusing]: 8.43001e-06 [inline]: 2.44999e-06 [add_attr]: 0.00750152, [1] [add_attr_with_inline]: 0.00748692, [1] [Cycle 1]: 0.00016052, [2] [tag_attr]: 5.593e-05 [meta_addattr_fg_expand]: 2.312e-05 [parallel-infer-symbol]: 3.53e-06 [pre_auto_parallel]: 7.907e-05 [insert-virtual-dataset]: 2.35002e-06 [parallel-infer-symbol-second]: 8.50006e-07 [dataset_repeat_opt]: 1.85001e-06 [pipeline_split]: 1.74e-06 [optimize]: 0.0719359, [53] [py_interpret_to_execute]: 4.00998e-06 [rewriter_before_opt_a]: 0.00015771 [opt_a]: 0.0685568, [3] [Cycle 1]: 0.0602211, [45] [expand_dump_flag]: 3.78001e-06 [switch_simplify]: 0.00011943 [loop_unroll]: 7.715e-05 [a_1]: 0.00184677 [with_stream_mark]: 2.547e-05 [recompute_prepare]: 3.194e-05 [updatestate_depend_eliminate]: 2.389e-05 [updatestate_assign_eliminate]: 1.178e-05 [updatestate_loads_eliminate]: 1.103e-05 [parameter_eliminate]: 1.83002e-06 [a_2]: 0.00037475 [accelerated_algorithm]: 5.788e-05 [shard]: 1.73002e-06 [meta_shard_fg_expand]: 5.79e-06 [shard_inline]: 2.362e-05 [merge_send_recv]: 6.034e-05 [auto_parallel]: 1.525e-05 [parallel]: 8.319e-05 [flash_sp]: 4.155e-05 [merge_comm]: 1.46e-05 [allreduce_fusion]: 2.313e-05 [matmul_add_comm_reduction]: 3.556e-05 [allreduce_slice_to_reducescatter]: 1.06e-05 [virtual_shard_identity]: 2.651e-05 [virtual_dataset]: 2.347e-05 [get_grad_eliminate_]: 2.3e-05 [virtual_output]: 2.295e-05 [merge_forward]: 1.165e-05 [cell_reuse_recompute_pass]: 1.12999e-06 [offload_activation]: 3.166e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.271e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 4.823e-05 [set_forward_comm_id_for_comm_node_pass]: 2.356e-05 [meta_fg_expand]: 0.0219429 [flash_sp_send_recv_attached]: 7.64002e-06 [receive_attached]: 2.585e-05 [after_resolve]: 0.00014777 [a_after_grad]: 0.00020948 [renormalize]: 0.0319068 [add_forward_monad_depend]: 1.754e-05 [auto_monad_grad]: 1.188e-05 [auto_monad_eliminator]: 0.00014739 [cse]: 0.00046879 [a_3]: 0.00181625 [Cycle 2]: 0.00671402, [45] [expand_dump_flag]: 4.46002e-06 [switch_simplify]: 0.00011951 [loop_unroll]: 0.00011661 [a_1]: 0.00295194 [with_stream_mark]: 2.264e-05 [recompute_prepare]: 2.377e-05 [updatestate_depend_eliminate]: 3.119e-05 [updatestate_assign_eliminate]: 1.012e-05 [updatestate_loads_eliminate]: 9.31998e-06 [parameter_eliminate]: 1.99e-06 [a_2]: 0.00028943 [accelerated_algorithm]: 2.445e-05 [shard]: 1.82999e-06 [meta_shard_fg_expand]: 5.14998e-06 [shard_inline]: 1.89e-05 [merge_send_recv]: 1.586e-05 [auto_parallel]: 1.571e-05 [parallel]: 7.68001e-06 [flash_sp]: 3.98001e-06 [merge_comm]: 1.046e-05 [allreduce_fusion]: 9.93998e-06 [matmul_add_comm_reduction]: 1.679e-05 [allreduce_slice_to_reducescatter]: 6.50005e-07 [virtual_shard_identity]: 2.063e-05 [virtual_dataset]: 1.886e-05 [get_grad_eliminate_]: 1.862e-05 [virtual_output]: 1.847e-05 [merge_forward]: 9.54e-06 [cell_reuse_recompute_pass]: 1.14998e-06 [offload_activation]: 1.709e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.22e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 3.005e-05 [set_forward_comm_id_for_comm_node_pass]: 1.013e-05 [meta_fg_expand]: 0.00011207 [flash_sp_send_recv_attached]: 1.99999e-06 [receive_attached]: 2.71e-06 [after_resolve]: 2.616e-05 [a_after_grad]: 3.086e-05 [renormalize]: 0.00203399 [add_forward_monad_depend]: 5.34e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 3.014e-05 [cse]: 0.00018704 [a_3]: 0.00013505 [Cycle 3]: 0.00160503, [45] [expand_dump_flag]: 1.20999e-06 [switch_simplify]: 1.988e-05 [loop_unroll]: 1.799e-05 [a_1]: 0.00050416 [with_stream_mark]: 1.597e-05 [recompute_prepare]: 1.818e-05 [updatestate_depend_eliminate]: 9.70002e-06 [updatestate_assign_eliminate]: 8.96002e-06 [updatestate_loads_eliminate]: 8.3e-06 [parameter_eliminate]: 1.15001e-06 [a_2]: 0.0002672 [accelerated_algorithm]: 2.115e-05 [shard]: 1.02e-06 [meta_shard_fg_expand]: 3.63999e-06 [shard_inline]: 1.763e-05 [merge_send_recv]: 1.171e-05 [auto_parallel]: 1.198e-05 [parallel]: 4.23001e-06 [flash_sp]: 8.80013e-07 [merge_comm]: 1.001e-05 [allreduce_fusion]: 9.59e-06 [matmul_add_comm_reduction]: 1.356e-05 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 1.946e-05 [virtual_dataset]: 1.766e-05 [get_grad_eliminate_]: 1.723e-05 [virtual_output]: 1.736e-05 [merge_forward]: 8.53001e-06 [cell_reuse_recompute_pass]: 1.68002e-06 [offload_activation]: 1.429e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.192e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 2.836e-05 [set_forward_comm_id_for_comm_node_pass]: 1.016e-05 [meta_fg_expand]: 6.64999e-06 [flash_sp_send_recv_attached]: 7.89994e-07 [receive_attached]: 1.02e-06 [after_resolve]: 2.116e-05 [a_after_grad]: 2.899e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.40999e-06 [auto_monad_grad]: 1.30999e-06 [auto_monad_eliminator]: 2.287e-05 [cse]: 5.214e-05 [a_3]: 0.00012363 [py_interpret_to_execute_after_opt_a]: 5.00001e-06 [slice_cell_reuse_recomputed_activation]: 2.27001e-06 [rewriter_after_opt_a]: 6.619e-05 [convert_after_rewriter]: 1.25001e-06 [order_py_execute_after_rewriter]: 1.74e-06 [mutable_eliminate]: 0.0006313 [opt_b]: 0.00056651, [1] [Cycle 1]: 0.00055917, [7] [b_1]: 0.00042029 [b_2]: 1.981e-05 [updatestate_depend_eliminate]: 1.232e-05 [updatestate_assign_eliminate]: 8.62e-06 [updatestate_loads_eliminate]: 8.64e-06 [renormalize]: 5.90022e-07 [cse]: 5.354e-05 [optimize_parallel_all_gather_comm]: 4.247e-05 [overlap_param_gather]: 1.315e-05 [cconv]: 2.615e-05 [loop_unroll]: 0.00051594 [opt_after_cconv]: 0.00024151, [1] [Cycle 1]: 0.00023574, [7] [c_1]: 0.00010533 [parameter_eliminate]: 2.26e-06 [updatestate_depend_eliminate]: 1.42e-05 [updatestate_assign_eliminate]: 1.046e-05 [updatestate_loads_eliminate]: 9.60001e-06 [cse]: 5.695e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 3.561e-05 [tuple_transform]: 0.00017625, [1] [Cycle 1]: 0.00017127, [4] [d_1]: 0.00013083 [none_parameter_eliminate]: 2.14999e-06 [renormalize]: 1.40019e-07 [switch_simplify]: 1.971e-05 [partial_unused_args_eliminate]: 1.67999e-06 [add_recomputation]: 0.0001207 [cse_after_recomputation]: 5.881e-05, [1] [Cycle 1]: 5.331e-05, [1] [cse]: 4.752e-05 [environ_conv]: 1.862e-05 [swap_dp_allreduce_reducescatter]: 3.593e-05 [bias_add_comm_swap]: 1.144e-05 [label_micro_interleaved_index]: 1.389e-05 [label_fine_grained_interleaved_index]: 2.60002e-06 [merge_cast_opt]: 1.41002e-06 [slice_recompute_activation]: 1.91998e-06 [micro_interleaved_order_control]: 2.39001e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 8.29983e-07 [remove_cast_before_assign_add]: 1.001e-05 [full_micro_interleaved_order_control]: 1.182e-05 [reorder_send_recv_between_fp_bp]: 2.79999e-06 [comm_op_add_attrs]: 1.37999e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.013e-05 [overlap_opt_shard_in_pipeline]: 2.06e-05 [overlap_opt_shard_grad_in_pipeline]: 1.81e-06 [control_data_broadcast_order]: 3.083e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 7.92e-06 [overlap_recompute_and_grad_model_parallel]: 1.788e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.25001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.56e-06 [overlap_grad_ring_attention]: 2.608e-05 [overlap_grad_flash_sp]: 7.114e-05 [begin_end_overlap_inline]: 5.49975e-07 [split_matmul_comm_elemetwise]: 1.145e-05 [split_layernorm_comm]: 1.65001e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 0.00015384, [1] [Cycle 1]: 0.00014896, [6] [build]: 8.53001e-06 [elim_shapecalc]: 2.494e-05 [elim_not_effective]: 3.347e-05 [opt_reshape]: 2.27e-05 [fold_const_symbol]: 2.93e-05 [renormalize]: 2.60014e-07 [detach_backward]: 2.29001e-06 [pipeline_parallel_scheduler]: 1.41998e-06 [auto_monad_reorder]: 4.701e-05 [get_jit_bprop_graph]: 1.63002e-06 [rewriter_after_jit_bprop_graph]: 3.52002e-06 [opt_after_jit_grad]: 0.00054533 [validate]: 0.00010909 [backend_pass]: 1.37e-06 [task_emit]: 2.8673 [execute]: 8.81997e-06 Sums bootstrap : 0.000870s : 0.03% type_inference : 0.180434s : 5.78% event_method : 0.000341s : 0.01% auto_monad : 0.000381s : 0.01% graph_reusing : 0.000008s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000056s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000023s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000079s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000158s : 0.01% optimize.opt_a.expand_dump_flag : 0.000009s : 0.00% optimize.opt_a.switch_simplify : 0.000259s : 0.01% optimize.opt_a.loop_unroll : 0.000212s : 0.01% optimize.opt_a.a_1 : 0.005303s : 0.17% optimize.opt_a.with_stream_mark : 0.000064s : 0.00% optimize.opt_a.recompute_prepare : 0.000074s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000065s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000031s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000029s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000931s : 0.03% optimize.opt_a.accelerated_algorithm : 0.000103s : 0.00% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000015s : 0.00% optimize.opt_a.shard_inline : 0.000060s : 0.00% optimize.opt_a.merge_send_recv : 0.000088s : 0.00% optimize.opt_a.auto_parallel : 0.000043s : 0.00% optimize.opt_a.parallel : 0.000095s : 0.00% optimize.opt_a.flash_sp : 0.000046s : 0.00% optimize.opt_a.merge_comm : 0.000035s : 0.00% optimize.opt_a.allreduce_fusion : 0.000043s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000066s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000012s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000067s : 0.00% optimize.opt_a.virtual_dataset : 0.000060s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000059s : 0.00% optimize.opt_a.virtual_output : 0.000059s : 0.00% optimize.opt_a.merge_forward : 0.000030s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000063s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000107s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000107s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000044s : 0.00% optimize.opt_a.meta_fg_expand : 0.022062s : 0.71% optimize.opt_a.flash_sp_send_recv_attached : 0.000010s : 0.00% optimize.opt_a.receive_attached : 0.000030s : 0.00% optimize.opt_a.after_resolve : 0.000195s : 0.01% optimize.opt_a.a_after_grad : 0.000269s : 0.01% optimize.opt_a.renormalize : 0.033941s : 1.09% optimize.opt_a.add_forward_monad_depend : 0.000024s : 0.00% optimize.opt_a.auto_monad_grad : 0.000015s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000200s : 0.01% optimize.opt_a.cse : 0.000708s : 0.02% optimize.opt_a.a_3 : 0.002075s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000066s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000631s : 0.02% optimize.opt_b.b_1 : 0.000420s : 0.01% optimize.opt_b.b_2 : 0.000020s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000054s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000042s : 0.00% optimize.overlap_param_gather : 0.000013s : 0.00% optimize.cconv : 0.000026s : 0.00% optimize.loop_unroll : 0.000516s : 0.02% optimize.opt_after_cconv.c_1 : 0.000105s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000010s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000010s : 0.00% optimize.opt_after_cconv.cse : 0.000057s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000036s : 0.00% optimize.tuple_transform.d_1 : 0.000131s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000020s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000121s : 0.00% optimize.cse_after_recomputation.cse : 0.000048s : 0.00% optimize.environ_conv : 0.000019s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000036s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000014s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000012s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000010s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000021s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000031s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000018s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000026s : 0.00% optimize.overlap_grad_flash_sp : 0.000071s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000025s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000033s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000023s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000029s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000047s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000545s : 0.02% validate : 0.000109s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.867302s : 91.87% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.002057 380 3.25% : 0.000067s : 7: substitution.arithmetic_simplify 0.55% : 0.000011s : 4: substitution.depend_value_elim 0.23% : 0.000005s : 11: substitution.elim_not_effective 0.26% : 0.000005s : 6: substitution.float_depend_g_call 0.20% : 0.000004s : 3: substitution.float_tuple_getitem_switch 0.20% : 0.000004s : 11: substitution.fold_const_symbol 34.80% : 0.000716s : 4: substitution.getattr_setattr_resolve 0.67% : 0.000014s : 16: substitution.graph_param_transform 0.14% : 0.000003s : 2: substitution.incorporate_call 0.10% : 0.000002s : 2: substitution.incorporate_call_switch 34.55% : 0.000711s : 25: substitution.inline 1.65% : 0.000034s : 5: substitution.inline_without_move 1.21% : 0.000025s : 37: substitution.j_node_and_user_rematch 1.69% : 0.000035s : 3: substitution.less_batch_normalization 0.76% : 0.000016s : 12: substitution.minmaximum_grad 0.07% : 0.000001s : 1: substitution.opt_reshape 1.31% : 0.000027s : 6: substitution.partial_eliminate 1.15% : 0.000024s : 37: substitution.remove_not_recompute_node 3.70% : 0.000076s : 37: substitution.replace_applicator 0.78% : 0.000016s : 26: substitution.replace_old_param 0.70% : 0.000014s : 3: substitution.reshape_eliminate 0.13% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.59% : 0.000033s : 12: substitution.tuple_list_convert_item_index_to_positive 0.74% : 0.000015s : 12: substitution.tuple_list_get_item_const_eliminator 1.03% : 0.000021s : 12: substitution.tuple_list_get_item_depend_reorder 4.37% : 0.000090s : 36: substitution.tuple_list_get_item_eliminator 1.01% : 0.000021s : 12: substitution.tuple_list_get_set_item_eliminator 1.05% : 0.000022s : 16: substitution.updatestate_pure_node_eliminater 2.09% : 0.000043s : 21: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.180317 2 98.22% : 0.177113s : 1: type_inference.infer 1.78% : 0.003204s : 1: type_inference.specialize ------[replace.] 0.000480 52 0.93% : 0.000004s : 1: replace.arithmetic_simplify 11.31% : 0.000054s : 3: replace.getattr_setattr_resolve 41.05% : 0.000197s : 25: replace.inline 6.38% : 0.000031s : 1: replace.replace_applicator 38.48% : 0.000185s : 21: replace.tuple_list_get_item_eliminator 1.86% : 0.000009s : 1: replace.updatestate_useless_node_eliminater ------[match.] 0.001437 52 0.31% : 0.000004s : 1: match.arithmetic_simplify 46.57% : 0.000669s : 3: match.getattr_setattr_resolve 48.45% : 0.000696s : 25: match.inline 0.64% : 0.000009s : 1: match.replace_applicator 3.45% : 0.000050s : 21: match.tuple_list_get_item_eliminator 0.58% : 0.000008s : 1: match.updatestate_useless_node_eliminater ------[predicate.] 0.001486 11077 0.88% : 0.000013s : 107: predicate.accumulaten_eliminater 0.28% : 0.000004s : 16: predicate.ad_related_special_op_eliminate 0.47% : 0.000007s : 56: predicate.addn_check_dump 0.89% : 0.000013s : 107: predicate.addn_zero_filter 0.88% : 0.000013s : 107: predicate.adjust_all_reduce_mul_add 1.92% : 0.000028s : 164: predicate.arithmetic_simplify 0.98% : 0.000015s : 108: predicate.cast_eliminate 2.55% : 0.000038s : 294: predicate.check_bprop_eliminate 0.47% : 0.000007s : 56: predicate.compare_switch_simplify 0.09% : 0.000001s : 16: predicate.const_output_eliminate 0.50% : 0.000007s : 56: predicate.depend_value_elim 0.98% : 0.000015s : 108: predicate.dict_get_item_const_eliminator 1.11% : 0.000016s : 108: predicate.dict_get_item_eliminator 0.90% : 0.000013s : 108: predicate.dict_set_item_eliminator 0.34% : 0.000005s : 32: predicate.dumpgradient_eliminate 0.09% : 0.000001s : 16: predicate.elim_not_effective 0.16% : 0.000002s : 16: predicate.elim_shapecalc_of_broadcastargs 1.03% : 0.000015s : 124: predicate.environ_add_const_eliminate 1.01% : 0.000015s : 124: predicate.environ_get_add_eliminate 1.02% : 0.000015s : 124: predicate.environ_get_depend_swap 1.51% : 0.000022s : 180: predicate.environ_get_eliminate 1.01% : 0.000015s : 124: predicate.environ_get_set_eliminate 1.39% : 0.000021s : 155: predicate.exchange_switch_depend_value 1.80% : 0.000027s : 155: predicate.float_depend_g_call 0.48% : 0.000007s : 56: predicate.float_environ_get_switch 0.62% : 0.000009s : 72: predicate.float_tuple_getitem_switch 0.08% : 0.000001s : 16: predicate.fold_const_symbol 0.52% : 0.000008s : 56: predicate.get_grad_eliminate 0.36% : 0.000005s : 20: predicate.getattr_setattr_resolve 0.09% : 0.000001s : 16: predicate.graph_param_transform 0.49% : 0.000007s : 56: predicate.incorporate_call 0.45% : 0.000007s : 56: predicate.incorporate_call_switch 4.64% : 0.000069s : 406: predicate.inline 1.65% : 0.000025s : 139: predicate.inline_without_move 0.27% : 0.000004s : 56: predicate.j_node_and_user_rematch 0.59% : 0.000009s : 56: predicate.less_batch_normalization 1.43% : 0.000021s : 161: predicate.list_to_tuple_eliminator_ 2.26% : 0.000034s : 268: predicate.load_eliminater 0.29% : 0.000004s : 16: predicate.loop_unroll_after_grad 2.11% : 0.000031s : 237: predicate.loop_unroll_before_grad 1.19% : 0.000018s : 140: predicate.make_slice_get_slice_eliminator 0.49% : 0.000007s : 56: predicate.merge_addn 2.51% : 0.000037s : 290: predicate.micro_step_allgather_replace 2.52% : 0.000037s : 290: predicate.mini_step_allgather_replace 0.90% : 0.000013s : 108: predicate.minmaximum_grad 0.30% : 0.000004s : 16: predicate.mutable_eliminate 0.16% : 0.000002s : 16: predicate.opt_reshape 0.16% : 0.000002s : 16: predicate.parallel_virtual_node 1.83% : 0.000027s : 155: predicate.partial_defer_inline 1.43% : 0.000021s : 145: predicate.partial_eliminate 0.89% : 0.000013s : 107: predicate.print_const_string_wrapper 0.49% : 0.000007s : 56: predicate.reduce_all_const_elim 1.13% : 0.000017s : 108: predicate.reduce_eliminate 2.23% : 0.000033s : 268: predicate.redundant_stop_gradient_eliminater 0.29% : 0.000004s : 56: predicate.remove_not_recompute_node 2.50% : 0.000037s : 421: predicate.replace_applicator 0.73% : 0.000011s : 139: predicate.replace_old_param 0.10% : 0.000001s : 16: predicate.reset_defer_inline 0.95% : 0.000014s : 108: predicate.reshape_eliminate 2.56% : 0.000038s : 290: predicate.row_tensor_add_zeros_like 0.17% : 0.000003s : 16: predicate.row_tensor_eliminate 2.73% : 0.000041s : 294: predicate.same_eliminate 0.34% : 0.000005s : 58: predicate.set_cell_output_no_recompute 0.54% : 0.000008s : 56: predicate.shard_identity_eliminate 0.31% : 0.000005s : 32: predicate.special_op_eliminate 0.56% : 0.000008s : 56: predicate.specialize_transform 2.64% : 0.000039s : 290: predicate.split_environ_get_set_with_tuple_value 1.44% : 0.000021s : 139: predicate.stack_unstack_eliminate 0.16% : 0.000002s : 16: predicate.switch_call_monad_eliminater 1.51% : 0.000022s : 155: predicate.switch_defer_inline 4.06% : 0.000060s : 449: predicate.switch_layer_defer_inline 4.25% : 0.000063s : 464: predicate.switch_simplify 0.91% : 0.000014s : 108: predicate.tile_eliminate 0.91% : 0.000013s : 108: predicate.transpose_eliminate 1.29% : 0.000019s : 140: predicate.tuple_list_convert_item_index_to_positive 1.40% : 0.000021s : 140: predicate.tuple_list_get_item_const_eliminator 1.27% : 0.000019s : 140: predicate.tuple_list_get_item_depend_reorder 2.36% : 0.000035s : 217: predicate.tuple_list_get_item_eliminator 1.34% : 0.000020s : 140: predicate.tuple_list_get_set_item_eliminator 1.91% : 0.000028s : 196: predicate.tuple_list_set_item_eliminator 1.40% : 0.000021s : 161: predicate.tuple_to_list_eliminator_ 2.24% : 0.000033s : 268: predicate.updatestate_pure_node_eliminater 2.85% : 0.000042s : 325: predicate.updatestate_useless_node_eliminater 0.15% : 0.000002s : 16: predicate.value_based_eliminate 0.53% : 0.000008s : 56: predicate.virtual_dataset_eliminate 0.51% : 0.000008s : 56: predicate.virtual_output_eliminate 0.15% : 0.000002s : 16: predicate.virtual_view_grad_eliminate 0.16% : 0.000002s : 16: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005070 78 66.96% : 0.003395s : 41: func_graph_cloner_run.FuncGraphClonerGraph 33.04% : 0.001675s : 37: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.254633 247 0.00% : 0.000003s : 1: ForceFp32Comm 0.23% : 0.007506s : 1: add_attr 0.23% : 0.007491s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000126s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000394s : 1: auto_monad 0.00% : 0.000051s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.03% : 0.000917s : 1: bootstrap 0.00% : 0.000029s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000034s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000062s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000022s : 1: environ_conv 0.01% : 0.000352s : 1: event_method 0.00% : 0.000018s : 1: execute 0.00% : 0.000015s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000017s : 1: label_micro_interleaved_index 0.02% : 0.000525s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000639s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.00% : 0.000030s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000030s : 1: opt.transform.mutable_eliminate 0.30% : 0.009787s : 125: opt.transform.opt_a 0.00% : 0.000104s : 1: opt.transform.opt_after_cconv 0.00% : 0.000068s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000412s : 28: opt.transform.opt_b 0.03% : 0.000826s : 2: opt.transform.opt_resolve 0.00% : 0.000148s : 2: opt.transform.opt_trans_graph 0.00% : 0.000106s : 4: opt.transform.symbol_engine_opt 2.11% : 0.068560s : 1: opt_a 0.01% : 0.000245s : 1: opt_after_cconv 0.02% : 0.000554s : 1: opt_after_jit_grad 0.02% : 0.000570s : 1: opt_b 2.21% : 0.071942s : 1: optimize 0.00% : 0.000046s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000075s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000029s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000024s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000016s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000021s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000084s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.00% : 0.000040s : 1: remove_dup_value 0.88% : 0.028666s : 2: renormalize.infer 0.16% : 0.005256s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000070s : 1: rewriter_after_opt_a 0.00% : 0.000162s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000014s : 1: split_matmul_comm_elemetwise 0.00% : 0.000039s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000157s : 1: symbol_engine_optimizer 88.10% : 2.867334s : 1: task_emit 0.01% : 0.000179s : 1: tuple_transform 5.54% : 0.180450s : 1: type_inference 0.00% : 0.000154s : 1: validate [WARNING] ME(17919:281473890602800,ForkProcess-89):2026-01-29-17:47:10.227.740 [mindspore/graph/api.py:102] Constant value tensor are detected in tuple or list, which might cause recompiling when tensor value changes. You can use mutable(Tensor) or mutable(tuple(Tensor)) to set tensor's value as variable to to avoid recompiling. The tuple or list arg is: [Tensor(shape=[2], dtype=Int64, value= [0, 1]), Tensor(shape=[2], dtype=Int64, value= [1, 2]), Tensor(shape=[2], dtype=Int64, value= [2, 3])] . [WARNING] SESSION(17919,ffffbf434f30,python3.9):2026-01-29-17:47:10.675.157 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:266] IbTryExpandCNode] After expanding cnode Default/network-InplaceIndexPutNet/InplaceIndexPut-op0, the new abstract of Expand/_InplaceIndexPut/InnerInplaceIndexPut-op0 does not match original cnode's abstract. new: AbstractTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), old: AbstractRefTensor(key: 0xaaaaf59256a07, ref_value: AbstractRefTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), value: ValueAny, is_inplace) [WARNING] SESSION(17919,ffffbf434f30,python3.9):2026-01-29-17:47:10.675.227 [mindspore/ccsrc/backend/common/expander/fallback/expander_fallback.cc:274] IbTryExpandCNode] Restore new abstract to AbstractRefTensor new:AbstractRefTensor(key: 0xaaaaf59256a07, ref_value: AbstractRefTensor(shape: (2, 3, 4), element: AbstractScalar(Type: Float32, Value: ValueAny, Shape: NoShape), value_ptr: 0xaaaac97b39f0, value: ValueAny), value: ValueAny) TotalTime = 0.173298, [24] [bootstrap]: 0.00057728 [type_inference]: 0.112146 [event_method]: 0.00042952 [auto_monad]: 0.00034753 [graph_reusing]: 1.021e-05 [inline]: 2.53e-06 [add_attr]: 0.00376017, [1] [add_attr_with_inline]: 0.00375163, [1] [Cycle 1]: 8.549e-05, [2] [tag_attr]: 4.452e-05 [meta_addattr_fg_expand]: 1.276e-05 [parallel-infer-symbol]: 3.14001e-06 [pre_auto_parallel]: 6.117e-05 [insert-virtual-dataset]: 2.48998e-06 [parallel-infer-symbol-second]: 8.49977e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.0422196, [53] [py_interpret_to_execute]: 4.20999e-06 [rewriter_before_opt_a]: 0.00015813 [opt_a]: 0.0395761, [3] [Cycle 1]: 0.0323326, [45] [expand_dump_flag]: 3.97e-06 [switch_simplify]: 9.171e-05 [loop_unroll]: 7.847e-05 [a_1]: 0.00185409 [with_stream_mark]: 2.816e-05 [recompute_prepare]: 3.07e-05 [updatestate_depend_eliminate]: 1.444e-05 [updatestate_assign_eliminate]: 1.246e-05 [updatestate_loads_eliminate]: 1.216e-05 [parameter_eliminate]: 2.83e-06 [a_2]: 0.00038081 [accelerated_algorithm]: 4.69e-05 [shard]: 1.87001e-06 [meta_shard_fg_expand]: 5.44e-06 [shard_inline]: 2.431e-05 [merge_send_recv]: 2.177e-05 [auto_parallel]: 1.581e-05 [parallel]: 2.304e-05 [flash_sp]: 1.115e-05 [merge_comm]: 1.402e-05 [allreduce_fusion]: 1.303e-05 [matmul_add_comm_reduction]: 3.194e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 2.62e-05 [virtual_dataset]: 2.395e-05 [get_grad_eliminate_]: 2.367e-05 [virtual_output]: 2.34e-05 [merge_forward]: 1.368e-05 [cell_reuse_recompute_pass]: 1.04998e-06 [offload_activation]: 2.292e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.116e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 4.044e-05 [set_forward_comm_id_for_comm_node_pass]: 1.38e-05 [meta_fg_expand]: 0.00592367 [flash_sp_send_recv_attached]: 6.66999e-06 [receive_attached]: 2.53998e-06 [after_resolve]: 0.00014027 [a_after_grad]: 0.00020574 [renormalize]: 0.0205365 [add_forward_monad_depend]: 1.687e-05 [auto_monad_grad]: 1.14e-05 [auto_monad_eliminator]: 0.00013699 [cse]: 0.00036896 [a_3]: 0.00173689 [Cycle 2]: 0.00594711, [45] [expand_dump_flag]: 3.34001e-06 [switch_simplify]: 0.00011509 [loop_unroll]: 0.00011259 [a_1]: 0.00279686 [with_stream_mark]: 1.709e-05 [recompute_prepare]: 1.882e-05 [updatestate_depend_eliminate]: 2.918e-05 [updatestate_assign_eliminate]: 7.55e-06 [updatestate_loads_eliminate]: 6.64999e-06 [parameter_eliminate]: 1.93002e-06 [a_2]: 0.00021309 [accelerated_algorithm]: 1.893e-05 [shard]: 1.05999e-06 [meta_shard_fg_expand]: 3.95e-06 [shard_inline]: 1.412e-05 [merge_send_recv]: 1.052e-05 [auto_parallel]: 1.043e-05 [parallel]: 6.38e-06 [flash_sp]: 4.18999e-06 [merge_comm]: 7.78001e-06 [allreduce_fusion]: 7.38999e-06 [matmul_add_comm_reduction]: 1.188e-05 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 1.601e-05 [virtual_dataset]: 1.452e-05 [get_grad_eliminate_]: 1.364e-05 [virtual_output]: 1.37e-05 [merge_forward]: 6.47001e-06 [cell_reuse_recompute_pass]: 1.29998e-06 [offload_activation]: 1.304e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.49e-05 [merge_recompute_call_nodes]: 9.00007e-07 [before_grad]: 2.266e-05 [set_forward_comm_id_for_comm_node_pass]: 7.46001e-06 [meta_fg_expand]: 0.0001164 [flash_sp_send_recv_attached]: 1.05001e-06 [receive_attached]: 1.35999e-06 [after_resolve]: 2.038e-05 [a_after_grad]: 2.296e-05 [renormalize]: 0.00168277 [add_forward_monad_depend]: 5.52001e-06 [auto_monad_grad]: 1.25999e-06 [auto_monad_eliminator]: 2.377e-05 [cse]: 0.00016323 [a_3]: 9.955e-05 [Cycle 3]: 0.00128093, [45] [expand_dump_flag]: 1.79e-06 [switch_simplify]: 1.484e-05 [loop_unroll]: 1.339e-05 [a_1]: 0.00036011 [with_stream_mark]: 1.44e-05 [recompute_prepare]: 1.414e-05 [updatestate_depend_eliminate]: 7.49002e-06 [updatestate_assign_eliminate]: 6.38998e-06 [updatestate_loads_eliminate]: 6.22001e-06 [parameter_eliminate]: 9.20001e-07 [a_2]: 0.00019159 [accelerated_algorithm]: 1.675e-05 [shard]: 1.00999e-06 [meta_shard_fg_expand]: 2.61999e-06 [shard_inline]: 1.3e-05 [merge_send_recv]: 9.32001e-06 [auto_parallel]: 9.39998e-06 [parallel]: 3.91001e-06 [flash_sp]: 1.27e-06 [merge_comm]: 7.31001e-06 [allreduce_fusion]: 6.93e-06 [matmul_add_comm_reduction]: 1.064e-05 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 1.426e-05 [virtual_dataset]: 1.323e-05 [get_grad_eliminate_]: 1.263e-05 [virtual_output]: 1.29e-05 [merge_forward]: 6.48e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.136e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.342e-05 [merge_recompute_call_nodes]: 7.60017e-07 [before_grad]: 2.179e-05 [set_forward_comm_id_for_comm_node_pass]: 7.66999e-06 [meta_fg_expand]: 5.18002e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 9.50007e-07 [after_resolve]: 1.737e-05 [a_after_grad]: 2.088e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.50001e-06 [auto_monad_grad]: 1.03001e-06 [auto_monad_eliminator]: 1.799e-05 [cse]: 3.446e-05 [a_3]: 8.668e-05 [py_interpret_to_execute_after_opt_a]: 4.18001e-06 [slice_cell_reuse_recomputed_activation]: 1.99e-06 [rewriter_after_opt_a]: 3.636e-05 [convert_after_rewriter]: 1.24e-06 [order_py_execute_after_rewriter]: 1.57001e-06 [mutable_eliminate]: 0.00055189 [opt_b]: 0.00041687, [1] [Cycle 1]: 0.00041045, [7] [b_1]: 0.00029898 [b_2]: 1.483e-05 [updatestate_depend_eliminate]: 9.53997e-06 [updatestate_assign_eliminate]: 6.64999e-06 [updatestate_loads_eliminate]: 6.39001e-06 [renormalize]: 3.60014e-07 [cse]: 3.806e-05 [optimize_parallel_all_gather_comm]: 2.669e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 2.312e-05 [loop_unroll]: 0.00044477 [opt_after_cconv]: 0.00019092, [1] [Cycle 1]: 0.00018489, [7] [c_1]: 8.183e-05 [parameter_eliminate]: 2.41e-06 [updatestate_depend_eliminate]: 1.076e-05 [updatestate_assign_eliminate]: 7.46001e-06 [updatestate_loads_eliminate]: 6.41e-06 [cse]: 4.06e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 2.595e-05 [tuple_transform]: 0.00013561, [1] [Cycle 1]: 0.00013092, [4] [d_1]: 9.607e-05 [none_parameter_eliminate]: 1.72999e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 1.497e-05 [partial_unused_args_eliminate]: 1.86e-06 [add_recomputation]: 8.621e-05 [cse_after_recomputation]: 4.451e-05, [1] [Cycle 1]: 3.978e-05, [1] [cse]: 3.366e-05 [environ_conv]: 1.172e-05 [swap_dp_allreduce_reducescatter]: 1.085e-05 [bias_add_comm_swap]: 2.55002e-06 [label_micro_interleaved_index]: 4.00998e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.45999e-06 [slice_recompute_activation]: 1.99999e-06 [micro_interleaved_order_control]: 2.17999e-06 [assign_add_opt]: 1.39998e-06 [ForceFp32Comm]: 9.10019e-07 [remove_cast_before_assign_add]: 1.30999e-06 [full_micro_interleaved_order_control]: 2.49999e-06 [reorder_send_recv_between_fp_bp]: 2.66e-06 [comm_op_add_attrs]: 1.06002e-06 [add_comm_op_reuse_tag]: 1.14998e-06 [interleave_split_concat_branches]: 1.22999e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 1.82999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.74e-06 [control_data_broadcast_order]: 2.34e-05 [grouped_pairwise_exchange_alltoall]: 1.50001e-06 [offloading_packed_experts]: 6.28e-06 [overlap_recompute_and_grad_model_parallel]: 7.13e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.29001e-06 [overlap_grad_ring_attention]: 6.60997e-06 [overlap_grad_flash_sp]: 3.315e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.19001e-06 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 1.30999e-06 [symbol_engine_optimizer]: 0.00012309, [1] [Cycle 1]: 0.0001183, [6] [build]: 8.05e-06 [elim_shapecalc]: 1.847e-05 [elim_not_effective]: 2.495e-05 [opt_reshape]: 1.452e-05 [fold_const_symbol]: 2.313e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.14999e-06 [pipeline_parallel_scheduler]: 1.44003e-06 [auto_monad_reorder]: 3.422e-05 [get_jit_bprop_graph]: 1.29e-06 [rewriter_after_jit_bprop_graph]: 3.55998e-06 [opt_after_jit_grad]: 0.00049612 [validate]: 6.38e-05 [backend_pass]: 1.00001e-06 [task_emit]: 0.0128919 [execute]: 6.16e-06 Sums bootstrap : 0.000577s : 0.34% type_inference : 0.112146s : 66.68% event_method : 0.000430s : 0.26% auto_monad : 0.000348s : 0.21% graph_reusing : 0.000010s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000045s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000061s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000158s : 0.09% optimize.opt_a.expand_dump_flag : 0.000009s : 0.01% optimize.opt_a.switch_simplify : 0.000222s : 0.13% optimize.opt_a.loop_unroll : 0.000204s : 0.12% optimize.opt_a.a_1 : 0.005011s : 2.98% optimize.opt_a.with_stream_mark : 0.000060s : 0.04% optimize.opt_a.recompute_prepare : 0.000064s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000051s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000026s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000025s : 0.01% optimize.opt_a.parameter_eliminate : 0.000006s : 0.00% optimize.opt_a.a_2 : 0.000785s : 0.47% optimize.opt_a.accelerated_algorithm : 0.000083s : 0.05% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000012s : 0.01% optimize.opt_a.shard_inline : 0.000051s : 0.03% optimize.opt_a.merge_send_recv : 0.000042s : 0.02% optimize.opt_a.auto_parallel : 0.000036s : 0.02% optimize.opt_a.parallel : 0.000033s : 0.02% optimize.opt_a.flash_sp : 0.000017s : 0.01% optimize.opt_a.merge_comm : 0.000029s : 0.02% optimize.opt_a.allreduce_fusion : 0.000027s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000054s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000056s : 0.03% optimize.opt_a.virtual_dataset : 0.000052s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000050s : 0.03% optimize.opt_a.virtual_output : 0.000050s : 0.03% optimize.opt_a.merge_forward : 0.000027s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000047s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000089s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000085s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000029s : 0.02% optimize.opt_a.meta_fg_expand : 0.006045s : 3.59% optimize.opt_a.flash_sp_send_recv_attached : 0.000009s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000178s : 0.11% optimize.opt_a.a_after_grad : 0.000250s : 0.15% optimize.opt_a.renormalize : 0.022219s : 13.21% optimize.opt_a.add_forward_monad_depend : 0.000024s : 0.01% optimize.opt_a.auto_monad_grad : 0.000014s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000179s : 0.11% optimize.opt_a.cse : 0.000567s : 0.34% optimize.opt_a.a_3 : 0.001923s : 1.14% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000036s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000552s : 0.33% optimize.opt_b.b_1 : 0.000299s : 0.18% optimize.opt_b.b_2 : 0.000015s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000038s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.01% optimize.loop_unroll : 0.000445s : 0.26% optimize.opt_after_cconv.c_1 : 0.000082s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.cse : 0.000041s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000026s : 0.02% optimize.tuple_transform.d_1 : 0.000096s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000015s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000086s : 0.05% optimize.cse_after_recomputation.cse : 0.000034s : 0.02% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000023s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000033s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000008s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000025s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000015s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000023s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000034s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000496s : 0.30% validate : 0.000064s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.012892s : 7.67% execute : 0.000006s : 0.00% Time group info: ------[substitution.] 0.001957 353 2.81% : 0.000055s : 7: substitution.arithmetic_simplify 0.57% : 0.000011s : 4: substitution.depend_value_elim 0.20% : 0.000004s : 8: substitution.elim_not_effective 0.24% : 0.000005s : 6: substitution.float_depend_g_call 0.22% : 0.000004s : 3: substitution.float_tuple_getitem_switch 0.17% : 0.000003s : 8: substitution.fold_const_symbol 36.09% : 0.000706s : 4: substitution.getattr_setattr_resolve 0.53% : 0.000010s : 11: substitution.graph_param_transform 0.15% : 0.000003s : 2: substitution.incorporate_call 0.11% : 0.000002s : 2: substitution.incorporate_call_switch 37.59% : 0.000736s : 25: substitution.inline 1.53% : 0.000030s : 5: substitution.inline_without_move 0.74% : 0.000014s : 31: substitution.j_node_and_user_rematch 1.15% : 0.000023s : 3: substitution.less_batch_normalization 0.74% : 0.000015s : 12: substitution.minmaximum_grad 0.33% : 0.000006s : 6: substitution.partial_eliminate 1.02% : 0.000020s : 31: substitution.remove_not_recompute_node 3.78% : 0.000074s : 37: substitution.replace_applicator 0.75% : 0.000015s : 26: substitution.replace_old_param 0.16% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.60% : 0.000031s : 12: substitution.tuple_list_convert_item_index_to_positive 0.73% : 0.000014s : 12: substitution.tuple_list_get_item_const_eliminator 1.05% : 0.000021s : 12: substitution.tuple_list_get_item_depend_reorder 4.02% : 0.000079s : 36: substitution.tuple_list_get_item_eliminator 1.01% : 0.000020s : 12: substitution.tuple_list_get_set_item_eliminator 1.07% : 0.000021s : 16: substitution.updatestate_pure_node_eliminater 1.63% : 0.000032s : 21: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.112040 2 96.58% : 0.108205s : 1: type_inference.infer 3.42% : 0.003835s : 1: type_inference.specialize ------[replace.] 0.000471 52 0.97% : 0.000005s : 1: replace.arithmetic_simplify 9.33% : 0.000044s : 3: replace.getattr_setattr_resolve 42.74% : 0.000201s : 25: replace.inline 5.60% : 0.000026s : 1: replace.replace_applicator 39.43% : 0.000186s : 21: replace.tuple_list_get_item_eliminator 1.93% : 0.000009s : 1: replace.updatestate_useless_node_eliminater ------[match.] 0.001445 52 0.29% : 0.000004s : 1: match.arithmetic_simplify 45.76% : 0.000661s : 3: match.getattr_setattr_resolve 49.87% : 0.000720s : 25: match.inline 0.61% : 0.000009s : 1: match.replace_applicator 2.90% : 0.000042s : 21: match.tuple_list_get_item_eliminator 0.57% : 0.000008s : 1: match.updatestate_useless_node_eliminater ------[predicate.] 0.001348 9867 0.91% : 0.000012s : 97: predicate.accumulaten_eliminater 0.25% : 0.000003s : 11: predicate.ad_related_special_op_eliminate 0.44% : 0.000006s : 46: predicate.addn_check_dump 0.98% : 0.000013s : 97: predicate.addn_zero_filter 0.87% : 0.000012s : 97: predicate.adjust_all_reduce_mul_add 1.98% : 0.000027s : 144: predicate.arithmetic_simplify 0.93% : 0.000013s : 98: predicate.cast_eliminate 2.66% : 0.000036s : 274: predicate.check_bprop_eliminate 0.44% : 0.000006s : 46: predicate.compare_switch_simplify 0.07% : 0.000001s : 11: predicate.const_output_eliminate 0.47% : 0.000006s : 46: predicate.depend_value_elim 0.98% : 0.000013s : 98: predicate.dict_get_item_const_eliminator 1.06% : 0.000014s : 98: predicate.dict_get_item_eliminator 0.94% : 0.000013s : 98: predicate.dict_set_item_eliminator 0.27% : 0.000004s : 22: predicate.dumpgradient_eliminate 0.07% : 0.000001s : 11: predicate.elim_not_effective 0.14% : 0.000002s : 11: predicate.elim_shapecalc_of_broadcastargs 1.06% : 0.000014s : 109: predicate.environ_add_const_eliminate 0.99% : 0.000013s : 109: predicate.environ_get_add_eliminate 1.06% : 0.000014s : 109: predicate.environ_get_depend_swap 1.48% : 0.000020s : 155: predicate.environ_get_eliminate 1.01% : 0.000014s : 109: predicate.environ_get_set_eliminate 1.41% : 0.000019s : 145: predicate.exchange_switch_depend_value 1.85% : 0.000025s : 145: predicate.float_depend_g_call 0.43% : 0.000006s : 46: predicate.float_environ_get_switch 0.53% : 0.000007s : 57: predicate.float_tuple_getitem_switch 0.06% : 0.000001s : 11: predicate.fold_const_symbol 0.47% : 0.000006s : 46: predicate.get_grad_eliminate 0.39% : 0.000005s : 20: predicate.getattr_setattr_resolve 0.07% : 0.000001s : 11: predicate.graph_param_transform 0.45% : 0.000006s : 46: predicate.incorporate_call 0.42% : 0.000006s : 46: predicate.incorporate_call_switch 4.61% : 0.000062s : 356: predicate.inline 1.70% : 0.000023s : 129: predicate.inline_without_move 0.24% : 0.000003s : 46: predicate.j_node_and_user_rematch 0.57% : 0.000008s : 46: predicate.less_batch_normalization 1.36% : 0.000018s : 141: predicate.list_to_tuple_eliminator_ 2.26% : 0.000030s : 238: predicate.load_eliminater 0.23% : 0.000003s : 11: predicate.loop_unroll_after_grad 2.27% : 0.000031s : 227: predicate.loop_unroll_before_grad 1.14% : 0.000015s : 120: predicate.make_slice_get_slice_eliminator 0.46% : 0.000006s : 46: predicate.merge_addn 2.62% : 0.000035s : 270: predicate.micro_step_allgather_replace 2.61% : 0.000035s : 270: predicate.mini_step_allgather_replace 0.95% : 0.000013s : 98: predicate.minmaximum_grad 0.24% : 0.000003s : 11: predicate.mutable_eliminate 0.13% : 0.000002s : 11: predicate.opt_reshape 0.12% : 0.000002s : 11: predicate.parallel_virtual_node 1.89% : 0.000025s : 145: predicate.partial_defer_inline 1.41% : 0.000019s : 130: predicate.partial_eliminate 0.96% : 0.000013s : 97: predicate.print_const_string_wrapper 0.44% : 0.000006s : 46: predicate.reduce_all_const_elim 1.20% : 0.000016s : 98: predicate.reduce_eliminate 2.23% : 0.000030s : 238: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000004s : 46: predicate.remove_not_recompute_node 2.57% : 0.000035s : 391: predicate.replace_applicator 0.73% : 0.000010s : 129: predicate.replace_old_param 0.08% : 0.000001s : 11: predicate.reset_defer_inline 1.00% : 0.000013s : 98: predicate.reshape_eliminate 2.66% : 0.000036s : 270: predicate.row_tensor_add_zeros_like 0.13% : 0.000002s : 11: predicate.row_tensor_eliminate 2.78% : 0.000037s : 274: predicate.same_eliminate 0.30% : 0.000004s : 48: predicate.set_cell_output_no_recompute 0.50% : 0.000007s : 46: predicate.shard_identity_eliminate 0.26% : 0.000004s : 22: predicate.special_op_eliminate 0.50% : 0.000007s : 46: predicate.specialize_transform 2.68% : 0.000036s : 270: predicate.split_environ_get_set_with_tuple_value 1.46% : 0.000020s : 129: predicate.stack_unstack_eliminate 0.12% : 0.000002s : 11: predicate.switch_call_monad_eliminater 1.54% : 0.000021s : 145: predicate.switch_defer_inline 4.19% : 0.000056s : 419: predicate.switch_layer_defer_inline 4.41% : 0.000059s : 429: predicate.switch_simplify 1.00% : 0.000013s : 98: predicate.tile_eliminate 1.02% : 0.000014s : 98: predicate.transpose_eliminate 1.30% : 0.000017s : 120: predicate.tuple_list_convert_item_index_to_positive 1.32% : 0.000018s : 120: predicate.tuple_list_get_item_const_eliminator 1.22% : 0.000016s : 120: predicate.tuple_list_get_item_depend_reorder 2.35% : 0.000032s : 187: predicate.tuple_list_get_item_eliminator 1.30% : 0.000018s : 120: predicate.tuple_list_get_set_item_eliminator 1.87% : 0.000025s : 166: predicate.tuple_list_set_item_eliminator 1.35% : 0.000018s : 141: predicate.tuple_to_list_eliminator_ 2.24% : 0.000030s : 238: predicate.updatestate_pure_node_eliminater 2.75% : 0.000037s : 285: predicate.updatestate_useless_node_eliminater 0.14% : 0.000002s : 11: predicate.value_based_eliminate 0.48% : 0.000006s : 46: predicate.virtual_dataset_eliminate 0.47% : 0.000006s : 46: predicate.virtual_output_eliminate 0.10% : 0.000001s : 11: predicate.virtual_view_grad_eliminate 0.17% : 0.000002s : 11: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005009 78 67.16% : 0.003364s : 41: func_graph_cloner_run.FuncGraphClonerGraph 32.84% : 0.001645s : 37: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.251838 247 0.00% : 0.000004s : 1: ForceFp32Comm 1.50% : 0.003765s : 1: add_attr 1.49% : 0.003755s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000091s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.14% : 0.000361s : 1: auto_monad 0.02% : 0.000038s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.24% : 0.000603s : 1: bootstrap 0.01% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000027s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000048s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.18% : 0.000443s : 1: event_method 0.00% : 0.000011s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000015s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.18% : 0.000453s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.22% : 0.000560s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000023s : 1: opt.transform.mutable_eliminate 3.57% : 0.009002s : 125: opt.transform.opt_a 0.03% : 0.000080s : 1: opt.transform.opt_after_cconv 0.02% : 0.000047s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000285s : 28: opt.transform.opt_b 0.32% : 0.000797s : 2: opt.transform.opt_resolve 0.04% : 0.000109s : 2: opt.transform.opt_trans_graph 0.03% : 0.000077s : 4: opt.transform.symbol_engine_opt 15.72% : 0.039579s : 1: opt_a 0.08% : 0.000194s : 1: opt_after_cconv 0.20% : 0.000505s : 1: opt_after_jit_grad 0.17% : 0.000420s : 1: opt_b 16.77% : 0.042224s : 1: optimize 0.01% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000036s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000066s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000030s : 1: remove_dup_value 6.83% : 0.017200s : 2: renormalize.infer 1.99% : 0.005002s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000040s : 1: rewriter_after_opt_a 0.06% : 0.000163s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000126s : 1: symbol_engine_optimizer 5.12% : 0.012903s : 1: task_emit 0.05% : 0.000138s : 1: tuple_transform 44.54% : 0.112166s : 1: type_inference 0.04% : 0.000104s : 1: validate group_cases_11 have all been run, results of sub cases are below: case: (1, mindspore.float16, 'BSH') {} pass. case: (1, mindspore.float16, 'BNSD') {} pass. case: (1,) {} pass. case: (0, mindspore.bfloat16, 'BNSD') {} pass. case: (0, mindspore.float16, 'BSH') {} pass. case: (0, mindspore.bfloat16, 'BSH') {} pass. case: (0, mindspore.float16, 'BNSD') {} pass. case: (0,) {} pass. ops group_cases_12 with 8 cases start to running, all cases are below: case: (, 1, mindspore.bfloat16, 'BSH') case: (, 1, mindspore.bfloat16, 'BNSD') case: (, 'pynative', ) case: (, 'pynative', ) case: (, 'KBK', ) case: (, 'KBK', ) case: (, 0, mindspore.float32, 0) case: (, 0, mindspore.float32, 1) ops group_cases_12 total running memory: 212M, memory threshold: 51200M [WARNING] ME(18993:281473890602800,ForkProcess-97):2026-01-29-17:47:11.543.205 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. [WARNING] ME(18995:281473890602800,ForkProcess-98):2026-01-29-17:47:11.552.620 [mindspore/context.py:1334] For 'context.set_context', the parameter 'device_target' will be deprecated and removed in a future version. Please use the api mindspore.set_device() instead. TotalTime = 0.556733, [33] [bootstrap]: 0.00075022 [type_inference]: 0.0705879 [event_method]: 0.00016442 [auto_monad]: 0.00017989 [graph_reusing]: 7.72998e-06 [pre_auto_parallel]: 1.183e-05 [py_interpret_to_execute]: 4.932e-05 [rewriter_before_opt_a]: 0.00016798 [expand_dump_flag]: 3.36999e-06 [jit_opt_a]: 0.02716, [3] [Cycle 1]: 0.0152028, [27] [switch_simplify]: 0.0001317 [loop_unroll]: 5.83e-05 [a_1]: 0.00136913 [with_stream_mark]: 3.303e-05 [recompute_prepare]: 2.503e-05 [updatestate_depend_eliminate]: 2.09e-05 [updatestate_assign_eliminate]: 1.663e-05 [updatestate_loads_eliminate]: 9.22001e-06 [parameter_eliminate]: 2.98998e-06 [specialize_transform]: 1.976e-05 [updatestate_useless_node_eliminater]: 1.821e-05 [accelerated_algorithm]: 5.392e-05 [meta_shard_fg_expand]: 5.64e-06 [get_grad_eliminate_]: 1.896e-05 [merge_forward]: 1.116e-05 [cell_reuse_recompute_pass]: 1.10999e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.899e-05 [j_node_and_user_rematch]: 3.922e-05 [meta_fg_expand]: 0.00206676 [replace_old_param]: 7.73e-05 [inline_without_move]: 6.64e-05 [renormalize]: 0.0103505 [add_forward_monad_depend]: 2.305e-05 [auto_monad_grad]: 6.96001e-06 [auto_monad_eliminator]: 6.853e-05 [cse]: 0.00030041 [replace_applicator]: 8.788e-05 [Cycle 2]: 0.00327483, [27] [switch_simplify]: 5.166e-05 [loop_unroll]: 5.068e-05 [a_1]: 0.00164937 [with_stream_mark]: 1.753e-05 [recompute_prepare]: 1.203e-05 [updatestate_depend_eliminate]: 6.22001e-06 [updatestate_assign_eliminate]: 6.09001e-06 [updatestate_loads_eliminate]: 5.20999e-06 [parameter_eliminate]: 2.00002e-06 [specialize_transform]: 1.042e-05 [updatestate_useless_node_eliminater]: 1.001e-05 [accelerated_algorithm]: 1.479e-05 [meta_shard_fg_expand]: 3.35e-06 [get_grad_eliminate_]: 1.049e-05 [merge_forward]: 6.61999e-06 [cell_reuse_recompute_pass]: 1.34e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.361e-05 [j_node_and_user_rematch]: 1.759e-05 [meta_fg_expand]: 7.688e-05 [replace_old_param]: 1.478e-05 [inline_without_move]: 1.004e-05 [renormalize]: 0.00098972 [add_forward_monad_depend]: 4.99e-06 [auto_monad_grad]: 1.55001e-06 [auto_monad_eliminator]: 1.674e-05 [cse]: 8.372e-05 [replace_applicator]: 1.811e-05 [Cycle 3]: 0.00060433, [27] [switch_simplify]: 1.079e-05 [loop_unroll]: 1.019e-05 [a_1]: 0.00025878 [with_stream_mark]: 1.141e-05 [recompute_prepare]: 1.006e-05 [updatestate_depend_eliminate]: 5.67999e-06 [updatestate_assign_eliminate]: 5.27999e-06 [updatestate_loads_eliminate]: 4.53999e-06 [parameter_eliminate]: 1.20001e-06 [specialize_transform]: 1.055e-05 [updatestate_useless_node_eliminater]: 9.81e-06 [accelerated_algorithm]: 1.348e-05 [meta_shard_fg_expand]: 1.93002e-06 [get_grad_eliminate_]: 9.91e-06 [merge_forward]: 5.12e-06 [cell_reuse_recompute_pass]: 1.66e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.003e-05 [j_node_and_user_rematch]: 1.617e-05 [meta_fg_expand]: 3.8e-06 [replace_old_param]: 1.376e-05 [inline_without_move]: 9.89001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.40001e-06 [auto_monad_grad]: 9.90025e-07 [auto_monad_eliminator]: 1.187e-05 [cse]: 2.999e-05 [replace_applicator]: 9.74999e-06 [py_interpret_to_execute_after_opt_a]: 1.708e-05 [rewriter_after_opt_a]: 0.00013242 [convert_after_rewriter]: 1.24e-05 [order_py_execute_after_rewriter]: 8.22998e-06 [mutable_eliminate]: 0.00064371 [jit_opt_b]: 0.00024298, [2] [Cycle 1]: 0.00017226, [2] [frontend_op_eliminate]: 0.00012187 [inline_after_opt_a]: 3.276e-05 [Cycle 2]: 5.883e-05, [2] [frontend_op_eliminate]: 2.316e-05 [inline_after_opt_a]: 2.619e-05 [cconv]: 2.497e-05 [loop_unroll]: 0.00048219 [jit_opt_after_cconv]: 0.00029077, [1] [Cycle 1]: 0.00028331, [11] [c_1]: 3.882e-05 [parameter_eliminate]: 2.63e-06 [updatestate_depend_eliminate]: 8.87e-06 [updatestate_assign_eliminate]: 4.36002e-06 [updatestate_loads_eliminate]: 4.05998e-06 [cse]: 5.305e-05 [call_graph_tuple_transform]: 2.835e-05 [tuple_list_get_item_eliminator]: 1.459e-05 [none_parameter_eliminate]: 1.97999e-06 [renormalize]: 5.3001e-07 [switch_simplify]: 9.44998e-06 [remove_dup_value]: 5.045e-05 [partial_unused_args_eliminate]: 2.34999e-06 [environ_conv]: 1.446e-05 [add_recomputation]: 7.746e-05 [cse_after_recomputation]: 3.91e-05, [1] [Cycle 1]: 3.269e-05, [1] [cse]: 2.573e-05 [auto_monad_reorder]: 3.043e-05 [get_jit_bprop_graph]: 1.40999e-06 [rewriter_after_jit_bprop_graph]: 3.38e-06 [opt_after_jit_grad]: 0.00049752 [symbol_engine_optimizer]: 0.00011379, [1] [Cycle 1]: 0.00010711, [6] [build]: 2.107e-05 [elim_shapecalc]: 1.328e-05 [elim_not_effective]: 1.884e-05 [opt_reshape]: 9.92001e-06 [fold_const_symbol]: 1.479e-05 [renormalize]: 3.69997e-07 [validate]: 8.039e-05 [backend_pass]: 1.09e-06 [task_emit]: 0.454558 [execute]: 7.68001e-06 Sums bootstrap : 0.000750s : 0.14% type_inference : 0.070588s : 12.89% event_method : 0.000164s : 0.03% auto_monad : 0.000180s : 0.03% graph_reusing : 0.000008s : 0.00% pre_auto_parallel : 0.000012s : 0.00% py_interpret_to_execute : 0.000049s : 0.01% rewriter_before_opt_a : 0.000168s : 0.03% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000194s : 0.04% jit_opt_a.loop_unroll : 0.000119s : 0.02% jit_opt_a.a_1 : 0.003277s : 0.60% jit_opt_a.with_stream_mark : 0.000062s : 0.01% jit_opt_a.recompute_prepare : 0.000047s : 0.01% jit_opt_a.updatestate_depend_eliminate : 0.000033s : 0.01% jit_opt_a.updatestate_assign_eliminate : 0.000028s : 0.01% jit_opt_a.updatestate_loads_eliminate : 0.000019s : 0.00% jit_opt_a.parameter_eliminate : 0.000006s : 0.00% jit_opt_a.specialize_transform : 0.000041s : 0.01% jit_opt_a.updatestate_useless_node_eliminater : 0.000038s : 0.01% jit_opt_a.accelerated_algorithm : 0.000082s : 0.02% jit_opt_a.meta_shard_fg_expand : 0.000011s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000039s : 0.01% jit_opt_a.merge_forward : 0.000023s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000083s : 0.02% jit_opt_a.j_node_and_user_rematch : 0.000073s : 0.01% jit_opt_a.meta_fg_expand : 0.002147s : 0.39% jit_opt_a.replace_old_param : 0.000106s : 0.02% jit_opt_a.inline_without_move : 0.000086s : 0.02% jit_opt_a.renormalize : 0.011340s : 2.07% jit_opt_a.add_forward_monad_depend : 0.000029s : 0.01% jit_opt_a.auto_monad_grad : 0.000010s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000097s : 0.02% jit_opt_a.cse : 0.000414s : 0.08% jit_opt_a.replace_applicator : 0.000116s : 0.02% py_interpret_to_execute_after_opt_a : 0.000017s : 0.00% rewriter_after_opt_a : 0.000132s : 0.02% convert_after_rewriter : 0.000012s : 0.00% order_py_execute_after_rewriter : 0.000008s : 0.00% mutable_eliminate : 0.000644s : 0.12% jit_opt_b.frontend_op_eliminate : 0.000145s : 0.03% jit_opt_b.inline_after_opt_a : 0.000059s : 0.01% cconv : 0.000025s : 0.00% loop_unroll : 0.000482s : 0.09% jit_opt_after_cconv.c_1 : 0.000039s : 0.01% jit_opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.cse : 0.000053s : 0.01% jit_opt_after_cconv.call_graph_tuple_transform : 0.000028s : 0.01% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000015s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000009s : 0.00% remove_dup_value : 0.000050s : 0.01% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000014s : 0.00% add_recomputation : 0.000077s : 0.01% cse_after_recomputation.cse : 0.000026s : 0.00% auto_monad_reorder : 0.000030s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000498s : 0.09% symbol_engine_optimizer.build : 0.000021s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000080s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 0.454558s : 83.01% execute : 0.000008s : 0.00% Time group info: ------[substitution.] 0.000884 201 0.32% : 0.000003s : 4: substitution.elim_not_effective 0.23% : 0.000002s : 4: substitution.fold_const_symbol 0.84% : 0.000007s : 6: substitution.graph_param_transform 52.27% : 0.000462s : 13: substitution.inline 2.06% : 0.000018s : 2: substitution.inline_without_move 2.37% : 0.000021s : 23: substitution.j_node_and_user_rematch 4.16% : 0.000037s : 3: substitution.less_batch_normalization 3.37% : 0.000030s : 17: substitution.minmaximum_grad 2.75% : 0.000024s : 5: substitution.partial_eliminate 1.82% : 0.000016s : 23: substitution.remove_not_recompute_node 3.16% : 0.000028s : 10: substitution.replace_applicator 1.38% : 0.000012s : 17: substitution.replace_old_param 0.31% : 0.000003s : 1: substitution.set_cell_output_no_recompute 4.79% : 0.000042s : 17: substitution.tuple_list_convert_item_index_to_positive 3.36% : 0.000030s : 17: substitution.tuple_list_get_item_depend_reorder 10.88% : 0.000096s : 37: substitution.tuple_list_get_item_eliminator 5.93% : 0.000052s : 2: substitution.zero_like_fill_zero ------[type_inference.] 0.070471 2 97.16% : 0.068469s : 1: type_inference.infer 2.84% : 0.002002s : 1: type_inference.specialize ------[replace.] 0.000256 32 49.33% : 0.000126s : 13: replace.inline 41.78% : 0.000107s : 17: replace.tuple_list_get_item_eliminator 8.90% : 0.000023s : 2: replace.zero_like_fill_zero ------[match.] 0.000549 32 82.85% : 0.000455s : 13: match.inline 7.91% : 0.000043s : 17: match.tuple_list_get_item_eliminator 9.24% : 0.000051s : 2: match.zero_like_fill_zero ------[predicate.] 0.000624 4164 1.35% : 0.000008s : 68: predicate.accumulaten_eliminater 0.35% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 1.29% : 0.000008s : 68: predicate.addn_check_dump 1.50% : 0.000009s : 68: predicate.addn_zero_filter 1.91% : 0.000012s : 68: predicate.arithmetic_simplify 1.43% : 0.000009s : 68: predicate.cast_eliminate 0.33% : 0.000002s : 13: predicate.check_bprop_eliminate 1.27% : 0.000008s : 68: predicate.compare_switch_simplify 1.31% : 0.000008s : 68: predicate.depend_value_elim 1.33% : 0.000008s : 68: predicate.dict_get_item_const_eliminator 1.42% : 0.000009s : 68: predicate.dict_get_item_eliminator 1.33% : 0.000008s : 68: predicate.dict_set_item_eliminator 0.23% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.09% : 0.000001s : 6: predicate.elim_not_effective 0.21% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.36% : 0.000008s : 68: predicate.environ_add_const_eliminate 1.27% : 0.000008s : 68: predicate.environ_get_add_eliminate 1.32% : 0.000008s : 68: predicate.environ_get_depend_swap 1.37% : 0.000009s : 68: predicate.environ_get_eliminate 1.31% : 0.000008s : 68: predicate.environ_get_set_eliminate 0.08% : 0.000001s : 6: predicate.fold_const_symbol 0.84% : 0.000005s : 34: predicate.get_grad_eliminate 0.08% : 0.000001s : 6: predicate.graph_param_transform 3.81% : 0.000024s : 116: predicate.inline 1.55% : 0.000010s : 62: predicate.inline_without_move 0.39% : 0.000002s : 34: predicate.j_node_and_user_rematch 0.94% : 0.000006s : 34: predicate.less_batch_normalization 1.77% : 0.000011s : 85: predicate.list_to_tuple_eliminator_ 1.86% : 0.000012s : 91: predicate.load_eliminater 0.38% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.74% : 0.000017s : 127: predicate.loop_unroll_before_grad 1.57% : 0.000010s : 74: predicate.make_slice_get_slice_eliminator 1.31% : 0.000008s : 68: predicate.merge_addn 1.39% : 0.000009s : 68: predicate.minmaximum_grad 0.45% : 0.000003s : 8: predicate.mutable_eliminate 0.18% : 0.000001s : 6: predicate.opt_reshape 11.26% : 0.000070s : 91: predicate.partial_eliminate 1.32% : 0.000008s : 68: predicate.print_const_string_wrapper 1.75% : 0.000011s : 68: predicate.reduce_eliminate 1.76% : 0.000011s : 85: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000003s : 34: predicate.remove_not_recompute_node 2.36% : 0.000015s : 156: predicate.replace_applicator 0.88% : 0.000005s : 62: predicate.replace_old_param 0.22% : 0.000001s : 12: predicate.reset_defer_inline 1.40% : 0.000009s : 68: predicate.reshape_eliminate 1.35% : 0.000008s : 68: predicate.row_tensor_add_zeros_like 0.39% : 0.000002s : 13: predicate.row_tensor_eliminate 1.34% : 0.000008s : 68: predicate.same_eliminate 0.47% : 0.000003s : 34: predicate.set_cell_output_no_recompute 0.49% : 0.000003s : 19: predicate.special_op_eliminate 0.81% : 0.000005s : 34: predicate.specialize_transform 1.52% : 0.000009s : 68: predicate.split_environ_get_set_with_tuple_value 1.33% : 0.000008s : 68: predicate.stack_unstack_eliminate 0.16% : 0.000001s : 6: predicate.switch_call_monad_eliminater 2.49% : 0.000016s : 98: predicate.switch_defer_inline 2.27% : 0.000014s : 98: predicate.switch_layer_defer_inline 5.29% : 0.000033s : 231: predicate.switch_simplify 1.33% : 0.000008s : 68: predicate.tile_eliminate 1.33% : 0.000008s : 68: predicate.transpose_eliminate 1.72% : 0.000011s : 68: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000010s : 68: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000020s : 103: predicate.tuple_list_get_item_eliminator 1.76% : 0.000011s : 68: predicate.tuple_list_set_item_eliminator 1.72% : 0.000011s : 85: predicate.tuple_to_list_eliminator_ 1.80% : 0.000011s : 91: predicate.updatestate_pure_node_eliminater 2.71% : 0.000017s : 125: predicate.updatestate_useless_node_eliminater 1.68% : 0.000010s : 68: predicate.value_based_eliminate 0.12% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.47% : 0.000003s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002190 33 59.98% : 0.001313s : 16: func_graph_cloner_run.FuncGraphClonerGraph 40.02% : 0.000876s : 17: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.572573 95 0.01% : 0.000081s : 1: add_recomputation 0.03% : 0.000187s : 1: auto_monad 0.01% : 0.000033s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.13% : 0.000770s : 1: bootstrap 0.00% : 0.000028s : 1: cconv 0.00% : 0.000015s : 1: convert_after_rewriter 0.01% : 0.000041s : 1: cse_after_recomputation 0.00% : 0.000017s : 1: environ_conv 0.03% : 0.000172s : 1: event_method 0.00% : 0.000014s : 1: execute 0.00% : 0.000006s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 4.74% : 0.027163s : 1: jit_opt_a 0.05% : 0.000293s : 1: jit_opt_after_cconv 0.04% : 0.000246s : 1: jit_opt_b 0.09% : 0.000490s : 1: loop_unroll 0.11% : 0.000652s : 1: mutable_eliminate 0.74% : 0.004237s : 39: opt.transform.jit_opt_a 0.02% : 0.000087s : 4: opt.transform.jit_opt_after_cconv 0.03% : 0.000191s : 8: opt.transform.jit_opt_b 0.00% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000022s : 1: opt.transform.mutable_eliminate 0.01% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000053s : 4: opt.transform.symbol_engine_opt 0.09% : 0.000506s : 1: opt_after_jit_grad 0.00% : 0.000010s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000014s : 1: pre_auto_parallel 0.01% : 0.000054s : 1: py_interpret_to_execute 0.00% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000054s : 1: remove_dup_value 1.61% : 0.009229s : 2: renormalize.infer 0.37% : 0.002093s : 2: renormalize.specialize 0.00% : 0.000005s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000137s : 1: rewriter_after_opt_a 0.03% : 0.000171s : 1: rewriter_before_opt_a 0.02% : 0.000116s : 1: symbol_engine_optimizer 79.39% : 0.454578s : 1: task_emit 12.33% : 0.070605s : 1: type_inference 0.02% : 0.000112s : 1: validate TotalTime = 0.597505, [33] [bootstrap]: 0.00086349 [type_inference]: 0.0637404 [event_method]: 0.00016829 [auto_monad]: 0.0002414 [graph_reusing]: 8.06001e-06 [pre_auto_parallel]: 1.343e-05 [py_interpret_to_execute]: 4.628e-05 [rewriter_before_opt_a]: 0.00016958 [expand_dump_flag]: 3.46001e-06 [jit_opt_a]: 0.0250773, [3] [Cycle 1]: 0.0138699, [27] [switch_simplify]: 0.00012349 [loop_unroll]: 5.779e-05 [a_1]: 0.00124222 [with_stream_mark]: 2.529e-05 [recompute_prepare]: 2.419e-05 [updatestate_depend_eliminate]: 2.058e-05 [updatestate_assign_eliminate]: 1.731e-05 [updatestate_loads_eliminate]: 7.99002e-06 [parameter_eliminate]: 1.79e-06 [specialize_transform]: 1.919e-05 [updatestate_useless_node_eliminater]: 1.764e-05 [accelerated_algorithm]: 5.308e-05 [meta_shard_fg_expand]: 4.29002e-06 [get_grad_eliminate_]: 1.794e-05 [merge_forward]: 1.041e-05 [cell_reuse_recompute_pass]: 8.59989e-07 [cell_reuse_handle_not_recompute_node_pass]: 3.509e-05 [j_node_and_user_rematch]: 4.066e-05 [meta_fg_expand]: 0.00178028 [replace_old_param]: 7.202e-05 [inline_without_move]: 6.298e-05 [renormalize]: 0.00954288 [add_forward_monad_depend]: 2.103e-05 [auto_monad_grad]: 6.54001e-06 [auto_monad_eliminator]: 6.134e-05 [cse]: 0.00027469 [replace_applicator]: 7.997e-05 [Cycle 2]: 0.00301539, [27] [switch_simplify]: 5.177e-05 [loop_unroll]: 5.039e-05 [a_1]: 0.0015936 [with_stream_mark]: 1.359e-05 [recompute_prepare]: 1.17e-05 [updatestate_depend_eliminate]: 5.82001e-06 [updatestate_assign_eliminate]: 5.05001e-06 [updatestate_loads_eliminate]: 4.80001e-06 [parameter_eliminate]: 1.07e-06 [specialize_transform]: 1.064e-05 [updatestate_useless_node_eliminater]: 9.99001e-06 [accelerated_algorithm]: 1.349e-05 [meta_shard_fg_expand]: 2.46e-06 [get_grad_eliminate_]: 9.79999e-06 [merge_forward]: 5.03002e-06 [cell_reuse_recompute_pass]: 8.99978e-07 [cell_reuse_handle_not_recompute_node_pass]: 1.96e-05 [j_node_and_user_rematch]: 1.633e-05 [meta_fg_expand]: 6.234e-05 [replace_old_param]: 1.518e-05 [inline_without_move]: 1.007e-05 [renormalize]: 0.00083758 [add_forward_monad_depend]: 4.48999e-06 [auto_monad_grad]: 1.39003e-06 [auto_monad_eliminator]: 1.616e-05 [cse]: 7.323e-05 [replace_applicator]: 1.72e-05 [Cycle 3]: 0.00057997, [27] [switch_simplify]: 1.064e-05 [loop_unroll]: 9.98998e-06 [a_1]: 0.00024773 [with_stream_mark]: 1.17e-05 [recompute_prepare]: 9.59e-06 [updatestate_depend_eliminate]: 5.40999e-06 [updatestate_assign_eliminate]: 4.83001e-06 [updatestate_loads_eliminate]: 4.47998e-06 [parameter_eliminate]: 9.89996e-07 [specialize_transform]: 9.92001e-06 [updatestate_useless_node_eliminater]: 9.78002e-06 [accelerated_algorithm]: 1.409e-05 [meta_shard_fg_expand]: 1.96e-06 [get_grad_eliminate_]: 9.79999e-06 [merge_forward]: 5.17e-06 [cell_reuse_recompute_pass]: 1.24e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.769e-05 [j_node_and_user_rematch]: 1.568e-05 [meta_fg_expand]: 3.54002e-06 [replace_old_param]: 1.318e-05 [inline_without_move]: 9.56998e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.17999e-06 [auto_monad_grad]: 9.89996e-07 [auto_monad_eliminator]: 1.146e-05 [cse]: 2.792e-05 [replace_applicator]: 9.46e-06 [py_interpret_to_execute_after_opt_a]: 1.345e-05 [rewriter_after_opt_a]: 0.00012063 [convert_after_rewriter]: 1.09e-05 [order_py_execute_after_rewriter]: 7.53e-06 [mutable_eliminate]: 0.00052542 [jit_opt_b]: 0.00023206, [2] [Cycle 1]: 0.00016407, [2] [frontend_op_eliminate]: 0.00011366 [inline_after_opt_a]: 3.255e-05 [Cycle 2]: 5.769e-05, [2] [frontend_op_eliminate]: 2.269e-05 [inline_after_opt_a]: 2.578e-05 [cconv]: 1.992e-05 [loop_unroll]: 0.00045151 [jit_opt_after_cconv]: 0.0002114, [1] [Cycle 1]: 0.0002053, [11] [c_1]: 3.662e-05 [parameter_eliminate]: 2.56998e-06 [updatestate_depend_eliminate]: 8.59002e-06 [updatestate_assign_eliminate]: 4.57e-06 [updatestate_loads_eliminate]: 4.42e-06 [cse]: 4.598e-05 [call_graph_tuple_transform]: 2.603e-05 [tuple_list_get_item_eliminator]: 1.434e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 3.9002e-07 [switch_simplify]: 9.62001e-06 [remove_dup_value]: 4.053e-05 [partial_unused_args_eliminate]: 2.02999e-06 [environ_conv]: 1.371e-05 [add_recomputation]: 6.796e-05 [cse_after_recomputation]: 3.775e-05, [1] [Cycle 1]: 3.187e-05, [1] [cse]: 2.482e-05 [auto_monad_reorder]: 2.817e-05 [get_jit_bprop_graph]: 1.97001e-06 [rewriter_after_jit_bprop_graph]: 3.4e-06 [opt_after_jit_grad]: 0.00048409 [symbol_engine_optimizer]: 0.00018578, [1] [Cycle 1]: 0.00017955, [6] [build]: 1.927e-05 [elim_shapecalc]: 1.285e-05 [elim_not_effective]: 1.919e-05 [opt_reshape]: 9.76e-06 [fold_const_symbol]: 1.634e-05 [renormalize]: 3.50003e-07 [validate]: 8.226e-05 [backend_pass]: 9.60019e-07 [task_emit]: 0.504237 [execute]: 9.14e-06 Sums bootstrap : 0.000863s : 0.15% type_inference : 0.063740s : 10.83% event_method : 0.000168s : 0.03% auto_monad : 0.000241s : 0.04% graph_reusing : 0.000008s : 0.00% pre_auto_parallel : 0.000013s : 0.00% py_interpret_to_execute : 0.000046s : 0.01% rewriter_before_opt_a : 0.000170s : 0.03% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000186s : 0.03% jit_opt_a.loop_unroll : 0.000118s : 0.02% jit_opt_a.a_1 : 0.003084s : 0.52% jit_opt_a.with_stream_mark : 0.000051s : 0.01% jit_opt_a.recompute_prepare : 0.000045s : 0.01% jit_opt_a.updatestate_depend_eliminate : 0.000032s : 0.01% jit_opt_a.updatestate_assign_eliminate : 0.000027s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000017s : 0.00% jit_opt_a.parameter_eliminate : 0.000004s : 0.00% jit_opt_a.specialize_transform : 0.000040s : 0.01% jit_opt_a.updatestate_useless_node_eliminater : 0.000037s : 0.01% jit_opt_a.accelerated_algorithm : 0.000081s : 0.01% jit_opt_a.meta_shard_fg_expand : 0.000009s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000038s : 0.01% jit_opt_a.merge_forward : 0.000021s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000072s : 0.01% jit_opt_a.j_node_and_user_rematch : 0.000073s : 0.01% jit_opt_a.meta_fg_expand : 0.001846s : 0.31% jit_opt_a.replace_old_param : 0.000100s : 0.02% jit_opt_a.inline_without_move : 0.000083s : 0.01% jit_opt_a.renormalize : 0.010381s : 1.76% jit_opt_a.add_forward_monad_depend : 0.000027s : 0.00% jit_opt_a.auto_monad_grad : 0.000009s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000089s : 0.02% jit_opt_a.cse : 0.000376s : 0.06% jit_opt_a.replace_applicator : 0.000107s : 0.02% py_interpret_to_execute_after_opt_a : 0.000013s : 0.00% rewriter_after_opt_a : 0.000121s : 0.02% convert_after_rewriter : 0.000011s : 0.00% order_py_execute_after_rewriter : 0.000008s : 0.00% mutable_eliminate : 0.000525s : 0.09% jit_opt_b.frontend_op_eliminate : 0.000136s : 0.02% jit_opt_b.inline_after_opt_a : 0.000058s : 0.01% cconv : 0.000020s : 0.00% loop_unroll : 0.000452s : 0.08% jit_opt_after_cconv.c_1 : 0.000037s : 0.01% jit_opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.cse : 0.000046s : 0.01% jit_opt_after_cconv.call_graph_tuple_transform : 0.000026s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000014s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000010s : 0.00% remove_dup_value : 0.000041s : 0.01% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000014s : 0.00% add_recomputation : 0.000068s : 0.01% cse_after_recomputation.cse : 0.000025s : 0.00% auto_monad_reorder : 0.000028s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000484s : 0.08% symbol_engine_optimizer.build : 0.000019s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000082s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 0.504237s : 85.64% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000802 201 0.31% : 0.000002s : 4: substitution.elim_not_effective 0.25% : 0.000002s : 4: substitution.fold_const_symbol 0.83% : 0.000007s : 6: substitution.graph_param_transform 50.61% : 0.000406s : 13: substitution.inline 1.91% : 0.000015s : 2: substitution.inline_without_move 2.62% : 0.000021s : 23: substitution.j_node_and_user_rematch 4.49% : 0.000036s : 3: substitution.less_batch_normalization 3.70% : 0.000030s : 17: substitution.minmaximum_grad 3.31% : 0.000027s : 5: substitution.partial_eliminate 1.89% : 0.000015s : 23: substitution.remove_not_recompute_node 3.14% : 0.000025s : 10: substitution.replace_applicator 1.44% : 0.000012s : 17: substitution.replace_old_param 0.31% : 0.000003s : 1: substitution.set_cell_output_no_recompute 4.91% : 0.000039s : 17: substitution.tuple_list_convert_item_index_to_positive 3.40% : 0.000027s : 17: substitution.tuple_list_get_item_depend_reorder 10.94% : 0.000088s : 37: substitution.tuple_list_get_item_eliminator 5.93% : 0.000048s : 2: substitution.zero_like_fill_zero ------[type_inference.] 0.063624 2 97.14% : 0.061805s : 1: type_inference.infer 2.86% : 0.001819s : 1: type_inference.specialize ------[replace.] 0.000239 32 48.78% : 0.000117s : 13: replace.inline 42.26% : 0.000101s : 17: replace.tuple_list_get_item_eliminator 8.95% : 0.000021s : 2: replace.zero_like_fill_zero ------[match.] 0.000486 32 82.09% : 0.000399s : 13: match.inline 8.44% : 0.000041s : 17: match.tuple_list_get_item_eliminator 9.48% : 0.000046s : 2: match.zero_like_fill_zero ------[predicate.] 0.000552 4164 1.45% : 0.000008s : 68: predicate.accumulaten_eliminater 0.41% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 1.45% : 0.000008s : 68: predicate.addn_check_dump 1.49% : 0.000008s : 68: predicate.addn_zero_filter 1.93% : 0.000011s : 68: predicate.arithmetic_simplify 1.52% : 0.000008s : 68: predicate.cast_eliminate 0.35% : 0.000002s : 13: predicate.check_bprop_eliminate 1.43% : 0.000008s : 68: predicate.compare_switch_simplify 1.46% : 0.000008s : 68: predicate.depend_value_elim 1.49% : 0.000008s : 68: predicate.dict_get_item_const_eliminator 1.53% : 0.000008s : 68: predicate.dict_get_item_eliminator 1.48% : 0.000008s : 68: predicate.dict_set_item_eliminator 0.24% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.11% : 0.000001s : 6: predicate.elim_not_effective 0.20% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.52% : 0.000008s : 68: predicate.environ_add_const_eliminate 1.43% : 0.000008s : 68: predicate.environ_get_add_eliminate 1.46% : 0.000008s : 68: predicate.environ_get_depend_swap 1.46% : 0.000008s : 68: predicate.environ_get_eliminate 1.45% : 0.000008s : 68: predicate.environ_get_set_eliminate 0.09% : 0.000000s : 6: predicate.fold_const_symbol 0.87% : 0.000005s : 34: predicate.get_grad_eliminate 0.10% : 0.000001s : 6: predicate.graph_param_transform 4.19% : 0.000023s : 116: predicate.inline 1.73% : 0.000010s : 62: predicate.inline_without_move 0.47% : 0.000003s : 34: predicate.j_node_and_user_rematch 1.07% : 0.000006s : 34: predicate.less_batch_normalization 1.92% : 0.000011s : 85: predicate.list_to_tuple_eliminator_ 2.17% : 0.000012s : 91: predicate.load_eliminater 0.42% : 0.000002s : 6: predicate.loop_unroll_after_grad 3.15% : 0.000017s : 127: predicate.loop_unroll_before_grad 1.65% : 0.000009s : 74: predicate.make_slice_get_slice_eliminator 1.43% : 0.000008s : 68: predicate.merge_addn 1.50% : 0.000008s : 68: predicate.minmaximum_grad 0.57% : 0.000003s : 8: predicate.mutable_eliminate 0.18% : 0.000001s : 6: predicate.opt_reshape 2.46% : 0.000014s : 91: predicate.partial_eliminate 1.45% : 0.000008s : 68: predicate.print_const_string_wrapper 1.87% : 0.000010s : 68: predicate.reduce_eliminate 1.98% : 0.000011s : 85: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000003s : 34: predicate.remove_not_recompute_node 2.68% : 0.000015s : 156: predicate.replace_applicator 0.95% : 0.000005s : 62: predicate.replace_old_param 0.22% : 0.000001s : 12: predicate.reset_defer_inline 1.54% : 0.000009s : 68: predicate.reshape_eliminate 1.47% : 0.000008s : 68: predicate.row_tensor_add_zeros_like 0.47% : 0.000003s : 13: predicate.row_tensor_eliminate 1.48% : 0.000008s : 68: predicate.same_eliminate 0.49% : 0.000003s : 34: predicate.set_cell_output_no_recompute 0.52% : 0.000003s : 19: predicate.special_op_eliminate 0.91% : 0.000005s : 34: predicate.specialize_transform 1.71% : 0.000009s : 68: predicate.split_environ_get_set_with_tuple_value 1.50% : 0.000008s : 68: predicate.stack_unstack_eliminate 0.16% : 0.000001s : 6: predicate.switch_call_monad_eliminater 2.67% : 0.000015s : 98: predicate.switch_defer_inline 2.54% : 0.000014s : 98: predicate.switch_layer_defer_inline 5.85% : 0.000032s : 231: predicate.switch_simplify 1.47% : 0.000008s : 68: predicate.tile_eliminate 1.54% : 0.000009s : 68: predicate.transpose_eliminate 1.83% : 0.000010s : 68: predicate.tuple_list_convert_item_index_to_positive 1.76% : 0.000010s : 68: predicate.tuple_list_get_item_depend_reorder 3.45% : 0.000019s : 103: predicate.tuple_list_get_item_eliminator 1.81% : 0.000010s : 68: predicate.tuple_list_set_item_eliminator 1.88% : 0.000010s : 85: predicate.tuple_to_list_eliminator_ 2.03% : 0.000011s : 91: predicate.updatestate_pure_node_eliminater 3.00% : 0.000017s : 125: predicate.updatestate_useless_node_eliminater 1.85% : 0.000010s : 68: predicate.value_based_eliminate 0.13% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.50% : 0.000003s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002024 33 60.35% : 0.001221s : 16: func_graph_cloner_run.FuncGraphClonerGraph 39.65% : 0.000802s : 17: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.612129 95 0.01% : 0.000071s : 1: add_recomputation 0.04% : 0.000249s : 1: auto_monad 0.01% : 0.000031s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.14% : 0.000886s : 1: bootstrap 0.00% : 0.000022s : 1: cconv 0.00% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000040s : 1: cse_after_recomputation 0.00% : 0.000016s : 1: environ_conv 0.03% : 0.000175s : 1: event_method 0.00% : 0.000025s : 1: execute 0.00% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 4.10% : 0.025080s : 1: jit_opt_a 0.03% : 0.000214s : 1: jit_opt_after_cconv 0.04% : 0.000235s : 1: jit_opt_b 0.08% : 0.000459s : 1: loop_unroll 0.09% : 0.000533s : 1: mutable_eliminate 0.66% : 0.004010s : 39: opt.transform.jit_opt_a 0.01% : 0.000083s : 4: opt.transform.jit_opt_after_cconv 0.03% : 0.000183s : 8: opt.transform.jit_opt_b 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000020s : 1: opt.transform.mutable_eliminate 0.01% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000054s : 4: opt.transform.symbol_engine_opt 0.08% : 0.000492s : 1: opt_after_jit_grad 0.00% : 0.000010s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000016s : 1: pre_auto_parallel 0.01% : 0.000050s : 1: py_interpret_to_execute 0.00% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000044s : 1: remove_dup_value 1.38% : 0.008420s : 2: renormalize.infer 0.32% : 0.001946s : 2: renormalize.specialize 0.00% : 0.000005s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000125s : 1: rewriter_after_opt_a 0.03% : 0.000173s : 1: rewriter_before_opt_a 0.03% : 0.000189s : 1: symbol_engine_optimizer 82.38% : 0.504255s : 1: task_emit 10.42% : 0.063799s : 1: type_inference 0.02% : 0.000113s : 1: validate group_cases_12 have all been run, results of sub cases are below: case: (1, mindspore.bfloat16, 'BSH') {} pass. case: (1, mindspore.bfloat16, 'BNSD') {} pass. case: ('pynative', ) {} pass. case: ('pynative', ) {} pass. case: ('KBK', ) {} pass. case: ('KBK', ) {} pass. case: (0, mindspore.float32, 0) {} pass. case: (0, mindspore.float32, 1) {} pass. ops group_cases_13 with 8 cases start to running, all cases are below: case: (, 0, mindspore.float16, 0) case: (, 0, mindspore.float16, 1) case: (, 0, mindspore.bfloat16, 0) case: (, 0, mindspore.bfloat16, 1) case: (, 1, mindspore.float32, 0) case: (, 1, mindspore.float32, 1) case: (, 1, mindspore.float16, 0) case: (, 1, mindspore.float16, 1) ops group_cases_13 total running memory: 288M, memory threshold: 51200M TotalTime = 0.523879, [33] [bootstrap]: 0.00075102 [type_inference]: 0.0630598 [event_method]: 0.00015878 [auto_monad]: 0.00018695 [graph_reusing]: 7.71001e-06 [pre_auto_parallel]: 1.213e-05 [py_interpret_to_execute]: 4.638e-05 [rewriter_before_opt_a]: 0.00016584 [expand_dump_flag]: 3.67002e-06 [jit_opt_a]: 0.0255276, [3] [Cycle 1]: 0.0139921, [27] [switch_simplify]: 0.00012542 [loop_unroll]: 5.754e-05 [a_1]: 0.00126055 [with_stream_mark]: 2.892e-05 [recompute_prepare]: 2.414e-05 [updatestate_depend_eliminate]: 1.891e-05 [updatestate_assign_eliminate]: 1.768e-05 [updatestate_loads_eliminate]: 9.24e-06 [parameter_eliminate]: 3.31999e-06 [specialize_transform]: 2.011e-05 [updatestate_useless_node_eliminater]: 1.779e-05 [accelerated_algorithm]: 5.601e-05 [meta_shard_fg_expand]: 4.91002e-06 [get_grad_eliminate_]: 1.866e-05 [merge_forward]: 1.095e-05 [cell_reuse_recompute_pass]: 1.03001e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.683e-05 [j_node_and_user_rematch]: 3.835e-05 [meta_fg_expand]: 0.00181325 [replace_old_param]: 7.151e-05 [inline_without_move]: 6.291e-05 [renormalize]: 0.00944248 [add_forward_monad_depend]: 2.142e-05 [auto_monad_grad]: 6.71999e-06 [auto_monad_eliminator]: 6.308e-05 [cse]: 0.00037313 [replace_applicator]: 8.93e-05 [Cycle 2]: 0.00320477, [27] [switch_simplify]: 5.814e-05 [loop_unroll]: 5.726e-05 [a_1]: 0.00175576 [with_stream_mark]: 1.434e-05 [recompute_prepare]: 1.24e-05 [updatestate_depend_eliminate]: 6.49001e-06 [updatestate_assign_eliminate]: 5.27001e-06 [updatestate_loads_eliminate]: 5.05999e-06 [parameter_eliminate]: 1.20001e-06 [specialize_transform]: 1.189e-05 [updatestate_useless_node_eliminater]: 1.097e-05 [accelerated_algorithm]: 1.534e-05 [meta_shard_fg_expand]: 3.13e-06 [get_grad_eliminate_]: 1.078e-05 [merge_forward]: 5.89e-06 [cell_reuse_recompute_pass]: 1.00001e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.159e-05 [j_node_and_user_rematch]: 1.839e-05 [meta_fg_expand]: 6.355e-05 [replace_old_param]: 1.573e-05 [inline_without_move]: 9.82999e-06 [renormalize]: 0.00082963 [add_forward_monad_depend]: 4.42e-06 [auto_monad_grad]: 1.30999e-06 [auto_monad_eliminator]: 1.615e-05 [cse]: 7.116e-05 [replace_applicator]: 1.683e-05 [Cycle 3]: 0.00064221, [27] [switch_simplify]: 1.074e-05 [loop_unroll]: 1.001e-05 [a_1]: 0.00025197 [with_stream_mark]: 1.252e-05 [recompute_prepare]: 9.89001e-06 [updatestate_depend_eliminate]: 5.43002e-06 [updatestate_assign_eliminate]: 5.01997e-06 [updatestate_loads_eliminate]: 4.56002e-06 [parameter_eliminate]: 1.14998e-06 [specialize_transform]: 9.75002e-06 [updatestate_useless_node_eliminater]: 9.67001e-06 [accelerated_algorithm]: 1.334e-05 [meta_shard_fg_expand]: 1.94e-06 [get_grad_eliminate_]: 1.228e-05 [merge_forward]: 5.58002e-06 [cell_reuse_recompute_pass]: 1.24e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.865e-05 [j_node_and_user_rematch]: 1.594e-05 [meta_fg_expand]: 3.4e-06 [replace_old_param]: 1.278e-05 [inline_without_move]: 9.34998e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.44e-06 [auto_monad_grad]: 9.00007e-07 [auto_monad_eliminator]: 1.208e-05 [cse]: 3.045e-05 [replace_applicator]: 1.088e-05 [py_interpret_to_execute_after_opt_a]: 1.509e-05 [rewriter_after_opt_a]: 0.00014852 [convert_after_rewriter]: 1.093e-05 [order_py_execute_after_rewriter]: 8.18001e-06 [mutable_eliminate]: 0.00056036 [jit_opt_b]: 0.00023999, [2] [Cycle 1]: 0.00017006, [2] [frontend_op_eliminate]: 0.00012125 [inline_after_opt_a]: 3.187e-05 [Cycle 2]: 5.883e-05, [2] [frontend_op_eliminate]: 2.315e-05 [inline_after_opt_a]: 2.622e-05 [cconv]: 2.261e-05 [loop_unroll]: 0.00044408 [jit_opt_after_cconv]: 0.00021115, [1] [Cycle 1]: 0.00020453, [11] [c_1]: 3.723e-05 [parameter_eliminate]: 2.43e-06 [updatestate_depend_eliminate]: 8.30999e-06 [updatestate_assign_eliminate]: 4.53999e-06 [updatestate_loads_eliminate]: 4.37e-06 [cse]: 4.444e-05 [call_graph_tuple_transform]: 2.529e-05 [tuple_list_get_item_eliminator]: 1.448e-05 [none_parameter_eliminate]: 2.07001e-06 [renormalize]: 5.00004e-07 [switch_simplify]: 9.49e-06 [remove_dup_value]: 4.352e-05 [partial_unused_args_eliminate]: 2.17001e-06 [environ_conv]: 1.344e-05 [add_recomputation]: 7.496e-05 [cse_after_recomputation]: 3.772e-05, [1] [Cycle 1]: 3.201e-05, [1] [cse]: 2.536e-05 [auto_monad_reorder]: 2.963e-05 [get_jit_bprop_graph]: 1.92001e-06 [rewriter_after_jit_bprop_graph]: 3.25002e-06 [opt_after_jit_grad]: 0.00047985 [symbol_engine_optimizer]: 0.00011052, [1] [Cycle 1]: 0.0001042, [6] [build]: 2.008e-05 [elim_shapecalc]: 1.271e-05 [elim_not_effective]: 1.85e-05 [opt_reshape]: 9.84001e-06 [fold_const_symbol]: 1.462e-05 [renormalize]: 3.80009e-07 [validate]: 8.923e-05 [backend_pass]: 1.12e-06 [task_emit]: 0.431103 [execute]: 8.52998e-06 Sums bootstrap : 0.000751s : 0.15% type_inference : 0.063060s : 12.24% event_method : 0.000159s : 0.03% auto_monad : 0.000187s : 0.04% graph_reusing : 0.000008s : 0.00% pre_auto_parallel : 0.000012s : 0.00% py_interpret_to_execute : 0.000046s : 0.01% rewriter_before_opt_a : 0.000166s : 0.03% expand_dump_flag : 0.000004s : 0.00% jit_opt_a.switch_simplify : 0.000194s : 0.04% jit_opt_a.loop_unroll : 0.000125s : 0.02% jit_opt_a.a_1 : 0.003268s : 0.63% jit_opt_a.with_stream_mark : 0.000056s : 0.01% jit_opt_a.recompute_prepare : 0.000046s : 0.01% jit_opt_a.updatestate_depend_eliminate : 0.000031s : 0.01% jit_opt_a.updatestate_assign_eliminate : 0.000028s : 0.01% jit_opt_a.updatestate_loads_eliminate : 0.000019s : 0.00% jit_opt_a.parameter_eliminate : 0.000006s : 0.00% jit_opt_a.specialize_transform : 0.000042s : 0.01% jit_opt_a.updatestate_useless_node_eliminater : 0.000038s : 0.01% jit_opt_a.accelerated_algorithm : 0.000085s : 0.02% jit_opt_a.meta_shard_fg_expand : 0.000010s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000042s : 0.01% jit_opt_a.merge_forward : 0.000022s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000077s : 0.01% jit_opt_a.j_node_and_user_rematch : 0.000073s : 0.01% jit_opt_a.meta_fg_expand : 0.001880s : 0.36% jit_opt_a.replace_old_param : 0.000100s : 0.02% jit_opt_a.inline_without_move : 0.000082s : 0.02% jit_opt_a.renormalize : 0.010272s : 1.99% jit_opt_a.add_forward_monad_depend : 0.000027s : 0.01% jit_opt_a.auto_monad_grad : 0.000009s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000091s : 0.02% jit_opt_a.cse : 0.000475s : 0.09% jit_opt_a.replace_applicator : 0.000117s : 0.02% py_interpret_to_execute_after_opt_a : 0.000015s : 0.00% rewriter_after_opt_a : 0.000149s : 0.03% convert_after_rewriter : 0.000011s : 0.00% order_py_execute_after_rewriter : 0.000008s : 0.00% mutable_eliminate : 0.000560s : 0.11% jit_opt_b.frontend_op_eliminate : 0.000144s : 0.03% jit_opt_b.inline_after_opt_a : 0.000058s : 0.01% cconv : 0.000023s : 0.00% loop_unroll : 0.000444s : 0.09% jit_opt_after_cconv.c_1 : 0.000037s : 0.01% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.cse : 0.000044s : 0.01% jit_opt_after_cconv.call_graph_tuple_transform : 0.000025s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000014s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000009s : 0.00% remove_dup_value : 0.000044s : 0.01% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000013s : 0.00% add_recomputation : 0.000075s : 0.01% cse_after_recomputation.cse : 0.000025s : 0.00% auto_monad_reorder : 0.000030s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000480s : 0.09% symbol_engine_optimizer.build : 0.000020s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000089s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.431103s : 83.69% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000836 201 0.32% : 0.000003s : 4: substitution.elim_not_effective 0.29% : 0.000002s : 4: substitution.fold_const_symbol 0.80% : 0.000007s : 6: substitution.graph_param_transform 50.93% : 0.000426s : 13: substitution.inline 1.83% : 0.000015s : 2: substitution.inline_without_move 2.33% : 0.000020s : 23: substitution.j_node_and_user_rematch 4.51% : 0.000038s : 3: substitution.less_batch_normalization 3.45% : 0.000029s : 17: substitution.minmaximum_grad 2.74% : 0.000023s : 5: substitution.partial_eliminate 1.81% : 0.000015s : 23: substitution.remove_not_recompute_node 3.17% : 0.000027s : 10: substitution.replace_applicator 1.36% : 0.000011s : 17: substitution.replace_old_param 0.29% : 0.000002s : 1: substitution.set_cell_output_no_recompute 5.09% : 0.000043s : 17: substitution.tuple_list_convert_item_index_to_positive 3.49% : 0.000029s : 17: substitution.tuple_list_get_item_depend_reorder 11.25% : 0.000094s : 37: substitution.tuple_list_get_item_eliminator 6.32% : 0.000053s : 2: substitution.zero_like_fill_zero ------[type_inference.] 0.062960 2 97.16% : 0.061171s : 1: type_inference.infer 2.84% : 0.001788s : 1: type_inference.specialize ------[replace.] 0.000245 32 48.48% : 0.000119s : 13: replace.inline 42.10% : 0.000103s : 17: replace.tuple_list_get_item_eliminator 9.42% : 0.000023s : 2: replace.zero_like_fill_zero ------[match.] 0.000514 32 81.44% : 0.000418s : 13: match.inline 8.57% : 0.000044s : 17: match.tuple_list_get_item_eliminator 9.99% : 0.000051s : 2: match.zero_like_fill_zero ------[predicate.] 0.000592 4164 1.48% : 0.000009s : 68: predicate.accumulaten_eliminater 0.36% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 1.43% : 0.000008s : 68: predicate.addn_check_dump 1.47% : 0.000009s : 68: predicate.addn_zero_filter 2.07% : 0.000012s : 68: predicate.arithmetic_simplify 1.55% : 0.000009s : 68: predicate.cast_eliminate 0.31% : 0.000002s : 13: predicate.check_bprop_eliminate 1.48% : 0.000009s : 68: predicate.compare_switch_simplify 1.48% : 0.000009s : 68: predicate.depend_value_elim 1.45% : 0.000009s : 68: predicate.dict_get_item_const_eliminator 1.56% : 0.000009s : 68: predicate.dict_get_item_eliminator 1.52% : 0.000009s : 68: predicate.dict_set_item_eliminator 0.23% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.10% : 0.000001s : 6: predicate.elim_not_effective 0.18% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.48% : 0.000009s : 68: predicate.environ_add_const_eliminate 1.45% : 0.000009s : 68: predicate.environ_get_add_eliminate 1.53% : 0.000009s : 68: predicate.environ_get_depend_swap 1.50% : 0.000009s : 68: predicate.environ_get_eliminate 1.45% : 0.000009s : 68: predicate.environ_get_set_eliminate 0.09% : 0.000001s : 6: predicate.fold_const_symbol 1.04% : 0.000006s : 34: predicate.get_grad_eliminate 0.08% : 0.000000s : 6: predicate.graph_param_transform 4.22% : 0.000025s : 116: predicate.inline 1.63% : 0.000010s : 62: predicate.inline_without_move 0.42% : 0.000002s : 34: predicate.j_node_and_user_rematch 1.07% : 0.000006s : 34: predicate.less_batch_normalization 1.96% : 0.000012s : 85: predicate.list_to_tuple_eliminator_ 2.16% : 0.000013s : 91: predicate.load_eliminater 0.36% : 0.000002s : 6: predicate.loop_unroll_after_grad 3.08% : 0.000018s : 127: predicate.loop_unroll_before_grad 1.69% : 0.000010s : 74: predicate.make_slice_get_slice_eliminator 1.43% : 0.000009s : 68: predicate.merge_addn 1.55% : 0.000009s : 68: predicate.minmaximum_grad 0.52% : 0.000003s : 8: predicate.mutable_eliminate 0.18% : 0.000001s : 6: predicate.opt_reshape 2.46% : 0.000015s : 91: predicate.partial_eliminate 1.46% : 0.000009s : 68: predicate.print_const_string_wrapper 1.91% : 0.000011s : 68: predicate.reduce_eliminate 2.06% : 0.000012s : 85: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000003s : 34: predicate.remove_not_recompute_node 2.56% : 0.000015s : 156: predicate.replace_applicator 0.84% : 0.000005s : 62: predicate.replace_old_param 0.19% : 0.000001s : 12: predicate.reset_defer_inline 1.49% : 0.000009s : 68: predicate.reshape_eliminate 1.53% : 0.000009s : 68: predicate.row_tensor_add_zeros_like 0.42% : 0.000002s : 13: predicate.row_tensor_eliminate 1.47% : 0.000009s : 68: predicate.same_eliminate 0.49% : 0.000003s : 34: predicate.set_cell_output_no_recompute 0.51% : 0.000003s : 19: predicate.special_op_eliminate 0.92% : 0.000005s : 34: predicate.specialize_transform 1.71% : 0.000010s : 68: predicate.split_environ_get_set_with_tuple_value 1.51% : 0.000009s : 68: predicate.stack_unstack_eliminate 0.15% : 0.000001s : 6: predicate.switch_call_monad_eliminater 2.78% : 0.000016s : 98: predicate.switch_defer_inline 2.49% : 0.000015s : 98: predicate.switch_layer_defer_inline 5.94% : 0.000035s : 231: predicate.switch_simplify 1.47% : 0.000009s : 68: predicate.tile_eliminate 1.48% : 0.000009s : 68: predicate.transpose_eliminate 1.86% : 0.000011s : 68: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000010s : 68: predicate.tuple_list_get_item_depend_reorder 3.36% : 0.000020s : 103: predicate.tuple_list_get_item_eliminator 1.72% : 0.000010s : 68: predicate.tuple_list_set_item_eliminator 1.95% : 0.000012s : 85: predicate.tuple_to_list_eliminator_ 2.02% : 0.000012s : 91: predicate.updatestate_pure_node_eliminater 3.03% : 0.000018s : 125: predicate.updatestate_useless_node_eliminater 1.82% : 0.000011s : 68: predicate.value_based_eliminate 0.12% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.52% : 0.000003s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002064 33 62.06% : 0.001281s : 16: func_graph_cloner_run.FuncGraphClonerGraph 37.94% : 0.000783s : 17: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.538651 95 0.01% : 0.000078s : 1: add_recomputation 0.04% : 0.000195s : 1: auto_monad 0.01% : 0.000032s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.14% : 0.000772s : 1: bootstrap 0.00% : 0.000025s : 1: cconv 0.00% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000040s : 1: cse_after_recomputation 0.00% : 0.000016s : 1: environ_conv 0.03% : 0.000165s : 1: event_method 0.00% : 0.000014s : 1: execute 0.00% : 0.000006s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 4.74% : 0.025530s : 1: jit_opt_a 0.04% : 0.000214s : 1: jit_opt_after_cconv 0.05% : 0.000243s : 1: jit_opt_b 0.08% : 0.000452s : 1: loop_unroll 0.11% : 0.000568s : 1: mutable_eliminate 0.79% : 0.004232s : 39: opt.transform.jit_opt_a 0.02% : 0.000083s : 4: opt.transform.jit_opt_after_cconv 0.04% : 0.000190s : 8: opt.transform.jit_opt_b 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000022s : 1: opt.transform.mutable_eliminate 0.01% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000052s : 4: opt.transform.symbol_engine_opt 0.09% : 0.000488s : 1: opt_after_jit_grad 0.00% : 0.000010s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000014s : 1: pre_auto_parallel 0.01% : 0.000050s : 1: py_interpret_to_execute 0.00% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000047s : 1: remove_dup_value 1.56% : 0.008383s : 2: renormalize.infer 0.35% : 0.001875s : 2: renormalize.specialize 0.00% : 0.000005s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000153s : 1: rewriter_after_opt_a 0.03% : 0.000169s : 1: rewriter_before_opt_a 0.02% : 0.000113s : 1: symbol_engine_optimizer 80.04% : 0.431119s : 1: task_emit 11.71% : 0.063073s : 1: type_inference 0.02% : 0.000121s : 1: validate TotalTime = 0.60407, [33] [bootstrap]: 0.00088647 [type_inference]: 0.0683649 [event_method]: 0.00015802 [auto_monad]: 0.00021438 [graph_reusing]: 7.11001e-06 [pre_auto_parallel]: 1.298e-05 [py_interpret_to_execute]: 4.726e-05 [rewriter_before_opt_a]: 0.00017045 [expand_dump_flag]: 3.33998e-06 [jit_opt_a]: 0.0267978, [3] [Cycle 1]: 0.0151281, [27] [switch_simplify]: 0.00013074 [loop_unroll]: 5.766e-05 [a_1]: 0.00132125 [with_stream_mark]: 2.932e-05 [recompute_prepare]: 2.497e-05 [updatestate_depend_eliminate]: 2.247e-05 [updatestate_assign_eliminate]: 1.824e-05 [updatestate_loads_eliminate]: 8.69e-06 [parameter_eliminate]: 2.91e-06 [specialize_transform]: 1.956e-05 [updatestate_useless_node_eliminater]: 1.8e-05 [accelerated_algorithm]: 5.715e-05 [meta_shard_fg_expand]: 4.43999e-06 [get_grad_eliminate_]: 1.794e-05 [merge_forward]: 1.123e-05 [cell_reuse_recompute_pass]: 9.70002e-07 [cell_reuse_handle_not_recompute_node_pass]: 3.787e-05 [j_node_and_user_rematch]: 4.136e-05 [meta_fg_expand]: 0.0019295 [replace_old_param]: 7.461e-05 [inline_without_move]: 6.412e-05 [renormalize]: 0.0104543 [add_forward_monad_depend]: 3.109e-05 [auto_monad_grad]: 6.18998e-06 [auto_monad_eliminator]: 6.536e-05 [cse]: 0.00030828 [replace_applicator]: 9.325e-05 [Cycle 2]: 0.00323277, [27] [switch_simplify]: 5.295e-05 [loop_unroll]: 5.011e-05 [a_1]: 0.00163008 [with_stream_mark]: 1.839e-05 [recompute_prepare]: 1.321e-05 [updatestate_depend_eliminate]: 6.96001e-06 [updatestate_assign_eliminate]: 5.72001e-06 [updatestate_loads_eliminate]: 5.17999e-06 [parameter_eliminate]: 2.69001e-06 [specialize_transform]: 1.046e-05 [updatestate_useless_node_eliminater]: 9.84001e-06 [accelerated_algorithm]: 1.492e-05 [meta_shard_fg_expand]: 3.03e-06 [get_grad_eliminate_]: 9.64999e-06 [merge_forward]: 7.16999e-06 [cell_reuse_recompute_pass]: 1.62001e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.314e-05 [j_node_and_user_rematch]: 1.807e-05 [meta_fg_expand]: 0.00011607 [replace_old_param]: 1.672e-05 [inline_without_move]: 1.006e-05 [renormalize]: 0.00092685 [add_forward_monad_depend]: 5.28002e-06 [auto_monad_grad]: 2.14e-06 [auto_monad_eliminator]: 1.793e-05 [cse]: 7.853e-05 [replace_applicator]: 1.759e-05 [Cycle 3]: 0.00058717, [27] [switch_simplify]: 1.134e-05 [loop_unroll]: 9.81998e-06 [a_1]: 0.00024788 [with_stream_mark]: 1.199e-05 [recompute_prepare]: 9.66e-06 [updatestate_depend_eliminate]: 6.01e-06 [updatestate_assign_eliminate]: 5.07999e-06 [updatestate_loads_eliminate]: 4.50001e-06 [parameter_eliminate]: 1.13001e-06 [specialize_transform]: 9.81998e-06 [updatestate_useless_node_eliminater]: 9.88998e-06 [accelerated_algorithm]: 1.367e-05 [meta_shard_fg_expand]: 2.04e-06 [get_grad_eliminate_]: 9.31002e-06 [merge_forward]: 5.70001e-06 [cell_reuse_recompute_pass]: 1.20001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.838e-05 [j_node_and_user_rematch]: 1.581e-05 [meta_fg_expand]: 3.63999e-06 [replace_old_param]: 1.3e-05 [inline_without_move]: 9.49999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.17e-06 [auto_monad_grad]: 8.30012e-07 [auto_monad_eliminator]: 1.217e-05 [cse]: 2.927e-05 [replace_applicator]: 9.74999e-06 [py_interpret_to_execute_after_opt_a]: 1.532e-05 [rewriter_after_opt_a]: 0.00014485 [convert_after_rewriter]: 1.096e-05 [order_py_execute_after_rewriter]: 8.52e-06 [mutable_eliminate]: 0.00067392 [jit_opt_b]: 0.00025557, [2] [Cycle 1]: 0.00018675, [2] [frontend_op_eliminate]: 0.00013752 [inline_after_opt_a]: 3.168e-05 [Cycle 2]: 5.761e-05, [2] [frontend_op_eliminate]: 2.206e-05 [inline_after_opt_a]: 2.584e-05 [cconv]: 2.46e-05 [loop_unroll]: 0.00045051 [jit_opt_after_cconv]: 0.00021815, [1] [Cycle 1]: 0.00021101, [11] [c_1]: 3.753e-05 [parameter_eliminate]: 2.87002e-06 [updatestate_depend_eliminate]: 8.74998e-06 [updatestate_assign_eliminate]: 4.53999e-06 [updatestate_loads_eliminate]: 4.50001e-06 [cse]: 4.884e-05 [call_graph_tuple_transform]: 2.57e-05 [tuple_list_get_item_eliminator]: 1.434e-05 [none_parameter_eliminate]: 1.84998e-06 [renormalize]: 8.90024e-07 [switch_simplify]: 9.47001e-06 [remove_dup_value]: 4.98e-05 [partial_unused_args_eliminate]: 2.70002e-06 [environ_conv]: 1.44e-05 [add_recomputation]: 8.182e-05 [cse_after_recomputation]: 3.9e-05, [1] [Cycle 1]: 3.268e-05, [1] [cse]: 2.572e-05 [auto_monad_reorder]: 2.933e-05 [get_jit_bprop_graph]: 1.87999e-06 [rewriter_after_jit_bprop_graph]: 3.35003e-06 [opt_after_jit_grad]: 0.0005382 [symbol_engine_optimizer]: 0.00011498, [1] [Cycle 1]: 0.00010886, [6] [build]: 2.314e-05 [elim_shapecalc]: 1.209e-05 [elim_not_effective]: 1.899e-05 [opt_reshape]: 9.71998e-06 [fold_const_symbol]: 1.52e-05 [renormalize]: 4.30009e-07 [validate]: 0.00010999 [backend_pass]: 1.12e-06 [task_emit]: 0.504299 [execute]: 8.65001e-06 Sums bootstrap : 0.000886s : 0.15% type_inference : 0.068365s : 11.49% event_method : 0.000158s : 0.03% auto_monad : 0.000214s : 0.04% graph_reusing : 0.000007s : 0.00% pre_auto_parallel : 0.000013s : 0.00% py_interpret_to_execute : 0.000047s : 0.01% rewriter_before_opt_a : 0.000170s : 0.03% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000195s : 0.03% jit_opt_a.loop_unroll : 0.000118s : 0.02% jit_opt_a.a_1 : 0.003199s : 0.54% jit_opt_a.with_stream_mark : 0.000060s : 0.01% jit_opt_a.recompute_prepare : 0.000048s : 0.01% jit_opt_a.updatestate_depend_eliminate : 0.000035s : 0.01% jit_opt_a.updatestate_assign_eliminate : 0.000029s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000018s : 0.00% jit_opt_a.parameter_eliminate : 0.000007s : 0.00% jit_opt_a.specialize_transform : 0.000040s : 0.01% jit_opt_a.updatestate_useless_node_eliminater : 0.000038s : 0.01% jit_opt_a.accelerated_algorithm : 0.000086s : 0.01% jit_opt_a.meta_shard_fg_expand : 0.000010s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000037s : 0.01% jit_opt_a.merge_forward : 0.000024s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000079s : 0.01% jit_opt_a.j_node_and_user_rematch : 0.000075s : 0.01% jit_opt_a.meta_fg_expand : 0.002049s : 0.34% jit_opt_a.replace_old_param : 0.000104s : 0.02% jit_opt_a.inline_without_move : 0.000084s : 0.01% jit_opt_a.renormalize : 0.011381s : 1.91% jit_opt_a.add_forward_monad_depend : 0.000038s : 0.01% jit_opt_a.auto_monad_grad : 0.000009s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000095s : 0.02% jit_opt_a.cse : 0.000416s : 0.07% jit_opt_a.replace_applicator : 0.000121s : 0.02% py_interpret_to_execute_after_opt_a : 0.000015s : 0.00% rewriter_after_opt_a : 0.000145s : 0.02% convert_after_rewriter : 0.000011s : 0.00% order_py_execute_after_rewriter : 0.000009s : 0.00% mutable_eliminate : 0.000674s : 0.11% jit_opt_b.frontend_op_eliminate : 0.000160s : 0.03% jit_opt_b.inline_after_opt_a : 0.000058s : 0.01% cconv : 0.000025s : 0.00% loop_unroll : 0.000451s : 0.08% jit_opt_after_cconv.c_1 : 0.000038s : 0.01% jit_opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.cse : 0.000049s : 0.01% jit_opt_after_cconv.call_graph_tuple_transform : 0.000026s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000014s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000009s : 0.00% remove_dup_value : 0.000050s : 0.01% partial_unused_args_eliminate : 0.000003s : 0.00% environ_conv : 0.000014s : 0.00% add_recomputation : 0.000082s : 0.01% cse_after_recomputation.cse : 0.000026s : 0.00% auto_monad_reorder : 0.000029s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000538s : 0.09% symbol_engine_optimizer.build : 0.000023s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000110s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.504299s : 84.73% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000923 201 0.28% : 0.000003s : 4: substitution.elim_not_effective 0.24% : 0.000002s : 4: substitution.fold_const_symbol 0.76% : 0.000007s : 6: substitution.graph_param_transform 48.27% : 0.000445s : 13: substitution.inline 1.66% : 0.000015s : 2: substitution.inline_without_move 2.48% : 0.000023s : 23: substitution.j_node_and_user_rematch 4.37% : 0.000040s : 3: substitution.less_batch_normalization 7.11% : 0.000066s : 17: substitution.minmaximum_grad 2.98% : 0.000027s : 5: substitution.partial_eliminate 1.83% : 0.000017s : 23: substitution.remove_not_recompute_node 3.17% : 0.000029s : 10: substitution.replace_applicator 1.37% : 0.000013s : 17: substitution.replace_old_param 0.29% : 0.000003s : 1: substitution.set_cell_output_no_recompute 4.70% : 0.000043s : 17: substitution.tuple_list_convert_item_index_to_positive 3.23% : 0.000030s : 17: substitution.tuple_list_get_item_depend_reorder 10.35% : 0.000096s : 37: substitution.tuple_list_get_item_eliminator 6.91% : 0.000064s : 2: substitution.zero_like_fill_zero ------[type_inference.] 0.068252 2 97.42% : 0.066493s : 1: type_inference.infer 2.58% : 0.001758s : 1: type_inference.specialize ------[replace.] 0.000260 32 48.39% : 0.000126s : 13: replace.inline 40.62% : 0.000106s : 17: replace.tuple_list_get_item_eliminator 10.99% : 0.000029s : 2: replace.zero_like_fill_zero ------[match.] 0.000545 32 80.33% : 0.000438s : 13: match.inline 8.25% : 0.000045s : 17: match.tuple_list_get_item_eliminator 11.42% : 0.000062s : 2: match.zero_like_fill_zero ------[predicate.] 0.000563 4164 1.49% : 0.000008s : 68: predicate.accumulaten_eliminater 0.39% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 1.41% : 0.000008s : 68: predicate.addn_check_dump 1.48% : 0.000008s : 68: predicate.addn_zero_filter 1.90% : 0.000011s : 68: predicate.arithmetic_simplify 1.50% : 0.000008s : 68: predicate.cast_eliminate 0.33% : 0.000002s : 13: predicate.check_bprop_eliminate 1.41% : 0.000008s : 68: predicate.compare_switch_simplify 1.48% : 0.000008s : 68: predicate.depend_value_elim 1.46% : 0.000008s : 68: predicate.dict_get_item_const_eliminator 1.58% : 0.000009s : 68: predicate.dict_get_item_eliminator 1.47% : 0.000008s : 68: predicate.dict_set_item_eliminator 0.26% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.10% : 0.000001s : 6: predicate.elim_not_effective 0.20% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.48% : 0.000008s : 68: predicate.environ_add_const_eliminate 1.46% : 0.000008s : 68: predicate.environ_get_add_eliminate 1.47% : 0.000008s : 68: predicate.environ_get_depend_swap 1.47% : 0.000008s : 68: predicate.environ_get_eliminate 1.44% : 0.000008s : 68: predicate.environ_get_set_eliminate 0.09% : 0.000001s : 6: predicate.fold_const_symbol 0.90% : 0.000005s : 34: predicate.get_grad_eliminate 0.10% : 0.000001s : 6: predicate.graph_param_transform 4.22% : 0.000024s : 116: predicate.inline 1.72% : 0.000010s : 62: predicate.inline_without_move 0.44% : 0.000002s : 34: predicate.j_node_and_user_rematch 1.11% : 0.000006s : 34: predicate.less_batch_normalization 1.93% : 0.000011s : 85: predicate.list_to_tuple_eliminator_ 2.08% : 0.000012s : 91: predicate.load_eliminater 0.38% : 0.000002s : 6: predicate.loop_unroll_after_grad 3.06% : 0.000017s : 127: predicate.loop_unroll_before_grad 1.67% : 0.000009s : 74: predicate.make_slice_get_slice_eliminator 1.42% : 0.000008s : 68: predicate.merge_addn 1.53% : 0.000009s : 68: predicate.minmaximum_grad 0.49% : 0.000003s : 8: predicate.mutable_eliminate 0.19% : 0.000001s : 6: predicate.opt_reshape 2.49% : 0.000014s : 91: predicate.partial_eliminate 1.47% : 0.000008s : 68: predicate.print_const_string_wrapper 1.86% : 0.000010s : 68: predicate.reduce_eliminate 1.93% : 0.000011s : 85: predicate.redundant_stop_gradient_eliminater 0.48% : 0.000003s : 34: predicate.remove_not_recompute_node 2.65% : 0.000015s : 156: predicate.replace_applicator 0.91% : 0.000005s : 62: predicate.replace_old_param 0.21% : 0.000001s : 12: predicate.reset_defer_inline 1.50% : 0.000008s : 68: predicate.reshape_eliminate 1.46% : 0.000008s : 68: predicate.row_tensor_add_zeros_like 0.49% : 0.000003s : 13: predicate.row_tensor_eliminate 1.53% : 0.000009s : 68: predicate.same_eliminate 0.52% : 0.000003s : 34: predicate.set_cell_output_no_recompute 0.52% : 0.000003s : 19: predicate.special_op_eliminate 0.95% : 0.000005s : 34: predicate.specialize_transform 1.81% : 0.000010s : 68: predicate.split_environ_get_set_with_tuple_value 1.52% : 0.000009s : 68: predicate.stack_unstack_eliminate 0.16% : 0.000001s : 6: predicate.switch_call_monad_eliminater 2.65% : 0.000015s : 98: predicate.switch_defer_inline 2.45% : 0.000014s : 98: predicate.switch_layer_defer_inline 5.99% : 0.000034s : 231: predicate.switch_simplify 1.48% : 0.000008s : 68: predicate.tile_eliminate 1.46% : 0.000008s : 68: predicate.transpose_eliminate 1.89% : 0.000011s : 68: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000010s : 68: predicate.tuple_list_get_item_depend_reorder 3.53% : 0.000020s : 103: predicate.tuple_list_get_item_eliminator 1.83% : 0.000010s : 68: predicate.tuple_list_set_item_eliminator 1.93% : 0.000011s : 85: predicate.tuple_to_list_eliminator_ 1.97% : 0.000011s : 91: predicate.updatestate_pure_node_eliminater 3.01% : 0.000017s : 125: predicate.updatestate_useless_node_eliminater 1.80% : 0.000010s : 68: predicate.value_based_eliminate 0.14% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.52% : 0.000003s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002133 33 58.88% : 0.001256s : 16: func_graph_cloner_run.FuncGraphClonerGraph 41.12% : 0.000877s : 17: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.619884 95 0.01% : 0.000085s : 1: add_recomputation 0.04% : 0.000222s : 1: auto_monad 0.01% : 0.000032s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.15% : 0.000904s : 1: bootstrap 0.00% : 0.000027s : 1: cconv 0.00% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000041s : 1: cse_after_recomputation 0.00% : 0.000017s : 1: environ_conv 0.03% : 0.000164s : 1: event_method 0.00% : 0.000013s : 1: execute 0.00% : 0.000006s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 4.32% : 0.026802s : 1: jit_opt_a 0.04% : 0.000221s : 1: jit_opt_after_cconv 0.04% : 0.000258s : 1: jit_opt_b 0.07% : 0.000458s : 1: loop_unroll 0.11% : 0.000682s : 1: mutable_eliminate 0.67% : 0.004159s : 39: opt.transform.jit_opt_a 0.01% : 0.000083s : 4: opt.transform.jit_opt_after_cconv 0.03% : 0.000205s : 8: opt.transform.jit_opt_b 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000020s : 1: opt.transform.mutable_eliminate 0.01% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000053s : 4: opt.transform.symbol_engine_opt 0.09% : 0.000546s : 1: opt_after_jit_grad 0.00% : 0.000011s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000015s : 1: pre_auto_parallel 0.01% : 0.000051s : 1: py_interpret_to_execute 0.00% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000053s : 1: remove_dup_value 1.50% : 0.009292s : 2: renormalize.infer 0.33% : 0.002071s : 2: renormalize.specialize 0.00% : 0.000005s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000149s : 1: rewriter_after_opt_a 0.03% : 0.000174s : 1: rewriter_before_opt_a 0.02% : 0.000117s : 1: symbol_engine_optimizer 81.36% : 0.504322s : 1: task_emit 11.03% : 0.068379s : 1: type_inference 0.02% : 0.000146s : 1: validate TotalTime = 0.603006, [33] [bootstrap]: 0.00091014 [type_inference]: 0.0684129 [event_method]: 0.00019075 [auto_monad]: 0.00019665 [graph_reusing]: 7.73001e-06 [pre_auto_parallel]: 1.748e-05 [py_interpret_to_execute]: 5.288e-05 [rewriter_before_opt_a]: 0.00018796 [expand_dump_flag]: 3.44001e-06 [jit_opt_a]: 0.0277953, [3] [Cycle 1]: 0.0159062, [27] [switch_simplify]: 0.00014607 [loop_unroll]: 5.826e-05 [a_1]: 0.001324 [with_stream_mark]: 3.696e-05 [recompute_prepare]: 2.624e-05 [updatestate_depend_eliminate]: 2.5e-05 [updatestate_assign_eliminate]: 2.048e-05 [updatestate_loads_eliminate]: 9.19e-06 [parameter_eliminate]: 2.55997e-06 [specialize_transform]: 1.916e-05 [updatestate_useless_node_eliminater]: 1.837e-05 [accelerated_algorithm]: 6.002e-05 [meta_shard_fg_expand]: 4.3e-06 [get_grad_eliminate_]: 1.863e-05 [merge_forward]: 1.109e-05 [cell_reuse_recompute_pass]: 1.05999e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.95e-05 [j_node_and_user_rematch]: 4.426e-05 [meta_fg_expand]: 0.00216197 [replace_old_param]: 7.937e-05 [inline_without_move]: 6.686e-05 [renormalize]: 0.01087 [add_forward_monad_depend]: 2.315e-05 [auto_monad_grad]: 5.77999e-06 [auto_monad_eliminator]: 0.00013004 [cse]: 0.00032979 [replace_applicator]: 9.271e-05 [Cycle 2]: 0.00315314, [27] [switch_simplify]: 5.344e-05 [loop_unroll]: 5.154e-05 [a_1]: 0.00162091 [with_stream_mark]: 1.616e-05 [recompute_prepare]: 1.186e-05 [updatestate_depend_eliminate]: 6.60997e-06 [updatestate_assign_eliminate]: 5.59998e-06 [updatestate_loads_eliminate]: 5.35001e-06 [parameter_eliminate]: 1.67999e-06 [specialize_transform]: 1.116e-05 [updatestate_useless_node_eliminater]: 1.001e-05 [accelerated_algorithm]: 1.457e-05 [meta_shard_fg_expand]: 2.96999e-06 [get_grad_eliminate_]: 1.032e-05 [merge_forward]: 5.89e-06 [cell_reuse_recompute_pass]: 9.29984e-07 [cell_reuse_handle_not_recompute_node_pass]: 2.364e-05 [j_node_and_user_rematch]: 1.729e-05 [meta_fg_expand]: 7.293e-05 [replace_old_param]: 1.587e-05 [inline_without_move]: 1.026e-05 [renormalize]: 0.00091983 [add_forward_monad_depend]: 4.58001e-06 [auto_monad_grad]: 1.79998e-06 [auto_monad_eliminator]: 1.665e-05 [cse]: 7.205e-05 [replace_applicator]: 1.693e-05 [Cycle 3]: 0.00062261, [27] [switch_simplify]: 1.055e-05 [loop_unroll]: 9.72999e-06 [a_1]: 0.00024802 [with_stream_mark]: 1.178e-05 [recompute_prepare]: 1.099e-05 [updatestate_depend_eliminate]: 6.44999e-06 [updatestate_assign_eliminate]: 4.95001e-06 [updatestate_loads_eliminate]: 4.48999e-06 [parameter_eliminate]: 1.49e-06 [specialize_transform]: 1.03e-05 [updatestate_useless_node_eliminater]: 9.82001e-06 [accelerated_algorithm]: 1.455e-05 [meta_shard_fg_expand]: 2.16e-06 [get_grad_eliminate_]: 9.56e-06 [merge_forward]: 5.56998e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.967e-05 [j_node_and_user_rematch]: 1.595e-05 [meta_fg_expand]: 3.65e-06 [replace_old_param]: 1.388e-05 [inline_without_move]: 9.43002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.54e-06 [auto_monad_grad]: 1.10001e-06 [auto_monad_eliminator]: 1.318e-05 [cse]: 3.191e-05 [replace_applicator]: 9.82999e-06 [py_interpret_to_execute_after_opt_a]: 1.461e-05 [rewriter_after_opt_a]: 0.00015746 [convert_after_rewriter]: 1.133e-05 [order_py_execute_after_rewriter]: 8.08001e-06 [mutable_eliminate]: 0.00069149 [jit_opt_b]: 0.00024065, [2] [Cycle 1]: 0.00017162, [2] [frontend_op_eliminate]: 0.00012324 [inline_after_opt_a]: 3.208e-05 [Cycle 2]: 5.801e-05, [2] [frontend_op_eliminate]: 2.254e-05 [inline_after_opt_a]: 2.627e-05 [cconv]: 2.346e-05 [loop_unroll]: 0.00043899 [jit_opt_after_cconv]: 0.00020992, [1] [Cycle 1]: 0.00020381, [11] [c_1]: 3.588e-05 [parameter_eliminate]: 2.22999e-06 [updatestate_depend_eliminate]: 7.40998e-06 [updatestate_assign_eliminate]: 4.22e-06 [updatestate_loads_eliminate]: 3.97998e-06 [cse]: 4.611e-05 [call_graph_tuple_transform]: 2.689e-05 [tuple_list_get_item_eliminator]: 1.412e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 5.10016e-07 [switch_simplify]: 9.39e-06 [remove_dup_value]: 4.288e-05 [partial_unused_args_eliminate]: 2.36e-06 [environ_conv]: 1.386e-05 [add_recomputation]: 7.897e-05 [cse_after_recomputation]: 3.811e-05, [1] [Cycle 1]: 3.231e-05, [1] [cse]: 2.54e-05 [auto_monad_reorder]: 2.915e-05 [get_jit_bprop_graph]: 1.84e-06 [rewriter_after_jit_bprop_graph]: 3.02002e-06 [opt_after_jit_grad]: 0.00047079 [symbol_engine_optimizer]: 0.00011528, [1] [Cycle 1]: 0.000109, [6] [build]: 2.404e-05 [elim_shapecalc]: 1.274e-05 [elim_not_effective]: 1.869e-05 [opt_reshape]: 9.84999e-06 [fold_const_symbol]: 1.447e-05 [renormalize]: 5.69999e-07 [validate]: 7.877e-05 [backend_pass]: 9.70002e-07 [task_emit]: 0.502236 [execute]: 9.14998e-06 Sums bootstrap : 0.000910s : 0.15% type_inference : 0.068413s : 11.52% event_method : 0.000191s : 0.03% auto_monad : 0.000197s : 0.03% graph_reusing : 0.000008s : 0.00% pre_auto_parallel : 0.000017s : 0.00% py_interpret_to_execute : 0.000053s : 0.01% rewriter_before_opt_a : 0.000188s : 0.03% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000210s : 0.04% jit_opt_a.loop_unroll : 0.000120s : 0.02% jit_opt_a.a_1 : 0.003193s : 0.54% jit_opt_a.with_stream_mark : 0.000065s : 0.01% jit_opt_a.recompute_prepare : 0.000049s : 0.01% jit_opt_a.updatestate_depend_eliminate : 0.000038s : 0.01% jit_opt_a.updatestate_assign_eliminate : 0.000031s : 0.01% jit_opt_a.updatestate_loads_eliminate : 0.000019s : 0.00% jit_opt_a.parameter_eliminate : 0.000006s : 0.00% jit_opt_a.specialize_transform : 0.000041s : 0.01% jit_opt_a.updatestate_useless_node_eliminater : 0.000038s : 0.01% jit_opt_a.accelerated_algorithm : 0.000089s : 0.02% jit_opt_a.meta_shard_fg_expand : 0.000009s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000039s : 0.01% jit_opt_a.merge_forward : 0.000023s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000083s : 0.01% jit_opt_a.j_node_and_user_rematch : 0.000078s : 0.01% jit_opt_a.meta_fg_expand : 0.002239s : 0.38% jit_opt_a.replace_old_param : 0.000109s : 0.02% jit_opt_a.inline_without_move : 0.000087s : 0.01% jit_opt_a.renormalize : 0.011790s : 1.99% jit_opt_a.add_forward_monad_depend : 0.000029s : 0.00% jit_opt_a.auto_monad_grad : 0.000009s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000160s : 0.03% jit_opt_a.cse : 0.000434s : 0.07% jit_opt_a.replace_applicator : 0.000119s : 0.02% py_interpret_to_execute_after_opt_a : 0.000015s : 0.00% rewriter_after_opt_a : 0.000157s : 0.03% convert_after_rewriter : 0.000011s : 0.00% order_py_execute_after_rewriter : 0.000008s : 0.00% mutable_eliminate : 0.000691s : 0.12% jit_opt_b.frontend_op_eliminate : 0.000146s : 0.02% jit_opt_b.inline_after_opt_a : 0.000058s : 0.01% cconv : 0.000023s : 0.00% loop_unroll : 0.000439s : 0.07% jit_opt_after_cconv.c_1 : 0.000036s : 0.01% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.cse : 0.000046s : 0.01% jit_opt_after_cconv.call_graph_tuple_transform : 0.000027s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000014s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000009s : 0.00% remove_dup_value : 0.000043s : 0.01% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000014s : 0.00% add_recomputation : 0.000079s : 0.01% cse_after_recomputation.cse : 0.000025s : 0.00% auto_monad_reorder : 0.000029s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000471s : 0.08% symbol_engine_optimizer.build : 0.000024s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.00% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000079s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 0.502236s : 84.57% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000918 201 0.28% : 0.000003s : 4: substitution.elim_not_effective 0.23% : 0.000002s : 4: substitution.fold_const_symbol 0.81% : 0.000007s : 6: substitution.graph_param_transform 50.69% : 0.000465s : 13: substitution.inline 2.08% : 0.000019s : 2: substitution.inline_without_move 2.72% : 0.000025s : 23: substitution.j_node_and_user_rematch 4.74% : 0.000043s : 3: substitution.less_batch_normalization 3.86% : 0.000035s : 17: substitution.minmaximum_grad 3.39% : 0.000031s : 5: substitution.partial_eliminate 1.76% : 0.000016s : 23: substitution.remove_not_recompute_node 3.08% : 0.000028s : 10: substitution.replace_applicator 1.51% : 0.000014s : 17: substitution.replace_old_param 0.33% : 0.000003s : 1: substitution.set_cell_output_no_recompute 4.72% : 0.000043s : 17: substitution.tuple_list_convert_item_index_to_positive 3.11% : 0.000029s : 17: substitution.tuple_list_get_item_depend_reorder 10.43% : 0.000096s : 37: substitution.tuple_list_get_item_eliminator 6.27% : 0.000058s : 2: substitution.zero_like_fill_zero ------[type_inference.] 0.068298 2 97.24% : 0.066410s : 1: type_inference.infer 2.76% : 0.001888s : 1: type_inference.specialize ------[replace.] 0.000248 32 50.77% : 0.000126s : 13: replace.inline 40.45% : 0.000100s : 17: replace.tuple_list_get_item_eliminator 8.79% : 0.000022s : 2: replace.zero_like_fill_zero ------[match.] 0.000559 32 81.84% : 0.000457s : 13: match.inline 8.13% : 0.000045s : 17: match.tuple_list_get_item_eliminator 10.03% : 0.000056s : 2: match.zero_like_fill_zero ------[predicate.] 0.000563 4164 1.49% : 0.000008s : 68: predicate.accumulaten_eliminater 0.37% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 1.43% : 0.000008s : 68: predicate.addn_check_dump 1.57% : 0.000009s : 68: predicate.addn_zero_filter 1.86% : 0.000010s : 68: predicate.arithmetic_simplify 1.50% : 0.000008s : 68: predicate.cast_eliminate 0.34% : 0.000002s : 13: predicate.check_bprop_eliminate 1.42% : 0.000008s : 68: predicate.compare_switch_simplify 1.44% : 0.000008s : 68: predicate.depend_value_elim 1.44% : 0.000008s : 68: predicate.dict_get_item_const_eliminator 1.57% : 0.000009s : 68: predicate.dict_get_item_eliminator 1.48% : 0.000008s : 68: predicate.dict_set_item_eliminator 0.21% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.11% : 0.000001s : 6: predicate.elim_not_effective 0.20% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.47% : 0.000008s : 68: predicate.environ_add_const_eliminate 1.44% : 0.000008s : 68: predicate.environ_get_add_eliminate 1.42% : 0.000008s : 68: predicate.environ_get_depend_swap 1.48% : 0.000008s : 68: predicate.environ_get_eliminate 1.44% : 0.000008s : 68: predicate.environ_get_set_eliminate 0.09% : 0.000001s : 6: predicate.fold_const_symbol 0.99% : 0.000006s : 34: predicate.get_grad_eliminate 0.11% : 0.000001s : 6: predicate.graph_param_transform 4.31% : 0.000024s : 116: predicate.inline 1.75% : 0.000010s : 62: predicate.inline_without_move 0.44% : 0.000002s : 34: predicate.j_node_and_user_rematch 1.04% : 0.000006s : 34: predicate.less_batch_normalization 1.95% : 0.000011s : 85: predicate.list_to_tuple_eliminator_ 2.11% : 0.000012s : 91: predicate.load_eliminater 0.34% : 0.000002s : 6: predicate.loop_unroll_after_grad 3.00% : 0.000017s : 127: predicate.loop_unroll_before_grad 1.68% : 0.000009s : 74: predicate.make_slice_get_slice_eliminator 1.41% : 0.000008s : 68: predicate.merge_addn 1.52% : 0.000009s : 68: predicate.minmaximum_grad 0.45% : 0.000003s : 8: predicate.mutable_eliminate 0.19% : 0.000001s : 6: predicate.opt_reshape 2.50% : 0.000014s : 91: predicate.partial_eliminate 1.55% : 0.000009s : 68: predicate.print_const_string_wrapper 1.81% : 0.000010s : 68: predicate.reduce_eliminate 1.95% : 0.000011s : 85: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000003s : 34: predicate.remove_not_recompute_node 2.68% : 0.000015s : 156: predicate.replace_applicator 0.94% : 0.000005s : 62: predicate.replace_old_param 0.19% : 0.000001s : 12: predicate.reset_defer_inline 1.51% : 0.000008s : 68: predicate.reshape_eliminate 1.46% : 0.000008s : 68: predicate.row_tensor_add_zeros_like 0.42% : 0.000002s : 13: predicate.row_tensor_eliminate 1.51% : 0.000009s : 68: predicate.same_eliminate 0.54% : 0.000003s : 34: predicate.set_cell_output_no_recompute 0.52% : 0.000003s : 19: predicate.special_op_eliminate 0.94% : 0.000005s : 34: predicate.specialize_transform 1.67% : 0.000009s : 68: predicate.split_environ_get_set_with_tuple_value 1.50% : 0.000008s : 68: predicate.stack_unstack_eliminate 0.15% : 0.000001s : 6: predicate.switch_call_monad_eliminater 2.65% : 0.000015s : 98: predicate.switch_defer_inline 2.43% : 0.000014s : 98: predicate.switch_layer_defer_inline 6.00% : 0.000034s : 231: predicate.switch_simplify 1.47% : 0.000008s : 68: predicate.tile_eliminate 1.48% : 0.000008s : 68: predicate.transpose_eliminate 1.89% : 0.000011s : 68: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000010s : 68: predicate.tuple_list_get_item_depend_reorder 3.50% : 0.000020s : 103: predicate.tuple_list_get_item_eliminator 1.88% : 0.000011s : 68: predicate.tuple_list_set_item_eliminator 1.91% : 0.000011s : 85: predicate.tuple_to_list_eliminator_ 2.03% : 0.000011s : 91: predicate.updatestate_pure_node_eliminater 3.18% : 0.000018s : 125: predicate.updatestate_useless_node_eliminater 1.80% : 0.000010s : 68: predicate.value_based_eliminate 0.13% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.52% : 0.000003s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002313 33 63.90% : 0.001478s : 16: func_graph_cloner_run.FuncGraphClonerGraph 36.10% : 0.000835s : 17: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.619235 95 0.01% : 0.000082s : 1: add_recomputation 0.03% : 0.000204s : 1: auto_monad 0.01% : 0.000032s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.15% : 0.000930s : 1: bootstrap 0.00% : 0.000026s : 1: cconv 0.00% : 0.000014s : 1: convert_after_rewriter 0.01% : 0.000040s : 1: cse_after_recomputation 0.00% : 0.000016s : 1: environ_conv 0.03% : 0.000199s : 1: event_method 0.00% : 0.000015s : 1: execute 0.00% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 4.49% : 0.027798s : 1: jit_opt_a 0.03% : 0.000213s : 1: jit_opt_after_cconv 0.04% : 0.000243s : 1: jit_opt_b 0.07% : 0.000446s : 1: loop_unroll 0.11% : 0.000699s : 1: mutable_eliminate 0.68% : 0.004186s : 39: opt.transform.jit_opt_a 0.01% : 0.000083s : 4: opt.transform.jit_opt_after_cconv 0.03% : 0.000192s : 8: opt.transform.jit_opt_b 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000019s : 1: opt.transform.mutable_eliminate 0.00% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000052s : 4: opt.transform.symbol_engine_opt 0.08% : 0.000478s : 1: opt_after_jit_grad 0.00% : 0.000010s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000020s : 1: pre_auto_parallel 0.01% : 0.000056s : 1: py_interpret_to_execute 0.00% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000046s : 1: remove_dup_value 1.58% : 0.009758s : 2: renormalize.infer 0.33% : 0.002014s : 2: renormalize.specialize 0.00% : 0.000005s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000162s : 1: rewriter_after_opt_a 0.03% : 0.000191s : 1: rewriter_before_opt_a 0.02% : 0.000118s : 1: symbol_engine_optimizer 81.11% : 0.502253s : 1: task_emit 11.05% : 0.068430s : 1: type_inference 0.02% : 0.000112s : 1: validate TotalTime = 0.643868, [33] [bootstrap]: 0.00098509 [type_inference]: 0.0678114 [event_method]: 0.0001468 [auto_monad]: 0.00023533 [graph_reusing]: 6.89001e-06 [pre_auto_parallel]: 1.252e-05 [py_interpret_to_execute]: 4.866e-05 [rewriter_before_opt_a]: 0.00017031 [expand_dump_flag]: 2.98998e-06 [jit_opt_a]: 0.0266405, [3] [Cycle 1]: 0.0145559, [27] [switch_simplify]: 0.00012622 [loop_unroll]: 5.868e-05 [a_1]: 0.00129633 [with_stream_mark]: 3.034e-05 [recompute_prepare]: 2.436e-05 [updatestate_depend_eliminate]: 2.074e-05 [updatestate_assign_eliminate]: 1.705e-05 [updatestate_loads_eliminate]: 8.96998e-06 [parameter_eliminate]: 3.04999e-06 [specialize_transform]: 1.943e-05 [updatestate_useless_node_eliminater]: 1.759e-05 [accelerated_algorithm]: 5.595e-05 [meta_shard_fg_expand]: 4.63999e-06 [get_grad_eliminate_]: 1.828e-05 [merge_forward]: 1.004e-05 [cell_reuse_recompute_pass]: 1.03001e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.697e-05 [j_node_and_user_rematch]: 3.911e-05 [meta_fg_expand]: 0.00188067 [replace_old_param]: 7.319e-05 [inline_without_move]: 6.485e-05 [renormalize]: 0.0100193 [add_forward_monad_depend]: 3.192e-05 [auto_monad_grad]: 6.47001e-06 [auto_monad_eliminator]: 6.303e-05 [cse]: 0.00028034 [replace_applicator]: 8.253e-05 [Cycle 2]: 0.00304009, [27] [switch_simplify]: 5.278e-05 [loop_unroll]: 5.019e-05 [a_1]: 0.001578 [with_stream_mark]: 1.424e-05 [recompute_prepare]: 1.199e-05 [updatestate_depend_eliminate]: 6.14001e-06 [updatestate_assign_eliminate]: 4.99e-06 [updatestate_loads_eliminate]: 4.62998e-06 [parameter_eliminate]: 1.19e-06 [specialize_transform]: 1.053e-05 [updatestate_useless_node_eliminater]: 1.028e-05 [accelerated_algorithm]: 1.381e-05 [meta_shard_fg_expand]: 2.43998e-06 [get_grad_eliminate_]: 1.006e-05 [merge_forward]: 6.22001e-06 [cell_reuse_recompute_pass]: 9.20001e-07 [cell_reuse_handle_not_recompute_node_pass]: 3.796e-05 [j_node_and_user_rematch]: 1.747e-05 [meta_fg_expand]: 6.367e-05 [replace_old_param]: 1.591e-05 [inline_without_move]: 1.003e-05 [renormalize]: 0.00085241 [add_forward_monad_depend]: 4.41002e-06 [auto_monad_grad]: 1.17e-06 [auto_monad_eliminator]: 1.635e-05 [cse]: 6.821e-05 [replace_applicator]: 1.692e-05 [Cycle 3]: 0.00057991, [27] [switch_simplify]: 1.06e-05 [loop_unroll]: 9.99001e-06 [a_1]: 0.00024802 [with_stream_mark]: 1.122e-05 [recompute_prepare]: 9.66e-06 [updatestate_depend_eliminate]: 5.43002e-06 [updatestate_assign_eliminate]: 5.12999e-06 [updatestate_loads_eliminate]: 4.51002e-06 [parameter_eliminate]: 1.03001e-06 [specialize_transform]: 9.79e-06 [updatestate_useless_node_eliminater]: 9.62001e-06 [accelerated_algorithm]: 1.322e-05 [meta_shard_fg_expand]: 1.89e-06 [get_grad_eliminate_]: 9.47999e-06 [merge_forward]: 5.00001e-06 [cell_reuse_recompute_pass]: 1.22999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.852e-05 [j_node_and_user_rematch]: 1.583e-05 [meta_fg_expand]: 3.38e-06 [replace_old_param]: 1.293e-05 [inline_without_move]: 9.57001e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.22999e-06 [auto_monad_grad]: 1.01002e-06 [auto_monad_eliminator]: 1.147e-05 [cse]: 2.829e-05 [replace_applicator]: 1.018e-05 [py_interpret_to_execute_after_opt_a]: 1.41e-05 [rewriter_after_opt_a]: 0.00015682 [convert_after_rewriter]: 1.048e-05 [order_py_execute_after_rewriter]: 7.93001e-06 [mutable_eliminate]: 0.0005286 [jit_opt_b]: 0.00024157, [2] [Cycle 1]: 0.00017304, [2] [frontend_op_eliminate]: 0.00012283 [inline_after_opt_a]: 3.256e-05 [Cycle 2]: 5.807e-05, [2] [frontend_op_eliminate]: 2.255e-05 [inline_after_opt_a]: 2.595e-05 [cconv]: 2.142e-05 [loop_unroll]: 0.00044586 [jit_opt_after_cconv]: 0.00020997, [1] [Cycle 1]: 0.00020356, [11] [c_1]: 3.709e-05 [parameter_eliminate]: 2.28998e-06 [updatestate_depend_eliminate]: 7.72998e-06 [updatestate_assign_eliminate]: 4.57e-06 [updatestate_loads_eliminate]: 4.45999e-06 [cse]: 4.52e-05 [call_graph_tuple_transform]: 2.603e-05 [tuple_list_get_item_eliminator]: 1.392e-05 [none_parameter_eliminate]: 1.81e-06 [renormalize]: 4.50003e-07 [switch_simplify]: 9.47999e-06 [remove_dup_value]: 4.53e-05 [partial_unused_args_eliminate]: 2.40002e-06 [environ_conv]: 1.429e-05 [add_recomputation]: 7.597e-05 [cse_after_recomputation]: 3.823e-05, [1] [Cycle 1]: 3.227e-05, [1] [cse]: 2.524e-05 [auto_monad_reorder]: 2.865e-05 [get_jit_bprop_graph]: 1.92999e-06 [rewriter_after_jit_bprop_graph]: 3.56001e-06 [opt_after_jit_grad]: 0.00049288 [symbol_engine_optimizer]: 0.00011514, [1] [Cycle 1]: 0.0001084, [6] [build]: 2.518e-05 [elim_shapecalc]: 1.198e-05 [elim_not_effective]: 1.85e-05 [opt_reshape]: 9.64e-06 [fold_const_symbol]: 1.435e-05 [renormalize]: 3.19997e-07 [validate]: 8.422e-05 [backend_pass]: 1.05001e-06 [task_emit]: 0.544922 [execute]: 8.55001e-06 Sums bootstrap : 0.000985s : 0.16% type_inference : 0.067811s : 10.69% event_method : 0.000147s : 0.02% auto_monad : 0.000235s : 0.04% graph_reusing : 0.000007s : 0.00% pre_auto_parallel : 0.000013s : 0.00% py_interpret_to_execute : 0.000049s : 0.01% rewriter_before_opt_a : 0.000170s : 0.03% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000190s : 0.03% jit_opt_a.loop_unroll : 0.000119s : 0.02% jit_opt_a.a_1 : 0.003122s : 0.49% jit_opt_a.with_stream_mark : 0.000056s : 0.01% jit_opt_a.recompute_prepare : 0.000046s : 0.01% jit_opt_a.updatestate_depend_eliminate : 0.000032s : 0.01% jit_opt_a.updatestate_assign_eliminate : 0.000027s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000018s : 0.00% jit_opt_a.parameter_eliminate : 0.000005s : 0.00% jit_opt_a.specialize_transform : 0.000040s : 0.01% jit_opt_a.updatestate_useless_node_eliminater : 0.000037s : 0.01% jit_opt_a.accelerated_algorithm : 0.000083s : 0.01% jit_opt_a.meta_shard_fg_expand : 0.000009s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000038s : 0.01% jit_opt_a.merge_forward : 0.000021s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000093s : 0.01% jit_opt_a.j_node_and_user_rematch : 0.000072s : 0.01% jit_opt_a.meta_fg_expand : 0.001948s : 0.31% jit_opt_a.replace_old_param : 0.000102s : 0.02% jit_opt_a.inline_without_move : 0.000084s : 0.01% jit_opt_a.renormalize : 0.010872s : 1.71% jit_opt_a.add_forward_monad_depend : 0.000038s : 0.01% jit_opt_a.auto_monad_grad : 0.000009s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000091s : 0.01% jit_opt_a.cse : 0.000377s : 0.06% jit_opt_a.replace_applicator : 0.000110s : 0.02% py_interpret_to_execute_after_opt_a : 0.000014s : 0.00% rewriter_after_opt_a : 0.000157s : 0.02% convert_after_rewriter : 0.000010s : 0.00% order_py_execute_after_rewriter : 0.000008s : 0.00% mutable_eliminate : 0.000529s : 0.08% jit_opt_b.frontend_op_eliminate : 0.000145s : 0.02% jit_opt_b.inline_after_opt_a : 0.000059s : 0.01% cconv : 0.000021s : 0.00% loop_unroll : 0.000446s : 0.07% jit_opt_after_cconv.c_1 : 0.000037s : 0.01% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.cse : 0.000045s : 0.01% jit_opt_after_cconv.call_graph_tuple_transform : 0.000026s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000014s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000009s : 0.00% remove_dup_value : 0.000045s : 0.01% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000014s : 0.00% add_recomputation : 0.000076s : 0.01% cse_after_recomputation.cse : 0.000025s : 0.00% auto_monad_reorder : 0.000029s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000493s : 0.08% symbol_engine_optimizer.build : 0.000025s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000084s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 0.544922s : 85.90% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000855 201 0.30% : 0.000003s : 4: substitution.elim_not_effective 0.22% : 0.000002s : 4: substitution.fold_const_symbol 0.76% : 0.000007s : 6: substitution.graph_param_transform 51.72% : 0.000442s : 13: substitution.inline 1.87% : 0.000016s : 2: substitution.inline_without_move 2.37% : 0.000020s : 23: substitution.j_node_and_user_rematch 4.52% : 0.000039s : 3: substitution.less_batch_normalization 3.42% : 0.000029s : 17: substitution.minmaximum_grad 3.02% : 0.000026s : 5: substitution.partial_eliminate 1.79% : 0.000015s : 23: substitution.remove_not_recompute_node 2.98% : 0.000025s : 10: substitution.replace_applicator 1.40% : 0.000012s : 17: substitution.replace_old_param 0.32% : 0.000003s : 1: substitution.set_cell_output_no_recompute 4.77% : 0.000041s : 17: substitution.tuple_list_convert_item_index_to_positive 3.24% : 0.000028s : 17: substitution.tuple_list_get_item_depend_reorder 10.52% : 0.000090s : 37: substitution.tuple_list_get_item_eliminator 6.77% : 0.000058s : 2: substitution.zero_like_fill_zero ------[type_inference.] 0.067687 2 97.40% : 0.065930s : 1: type_inference.infer 2.60% : 0.001757s : 1: type_inference.specialize ------[replace.] 0.000241 32 49.89% : 0.000120s : 13: replace.inline 41.26% : 0.000099s : 17: replace.tuple_list_get_item_eliminator 8.85% : 0.000021s : 2: replace.zero_like_fill_zero ------[match.] 0.000534 32 81.44% : 0.000435s : 13: match.inline 8.03% : 0.000043s : 17: match.tuple_list_get_item_eliminator 10.53% : 0.000056s : 2: match.zero_like_fill_zero ------[predicate.] 0.000554 4164 1.53% : 0.000008s : 68: predicate.accumulaten_eliminater 0.36% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 1.44% : 0.000008s : 68: predicate.addn_check_dump 1.48% : 0.000008s : 68: predicate.addn_zero_filter 1.93% : 0.000011s : 68: predicate.arithmetic_simplify 1.57% : 0.000009s : 68: predicate.cast_eliminate 0.31% : 0.000002s : 13: predicate.check_bprop_eliminate 1.46% : 0.000008s : 68: predicate.compare_switch_simplify 1.49% : 0.000008s : 68: predicate.depend_value_elim 1.47% : 0.000008s : 68: predicate.dict_get_item_const_eliminator 1.57% : 0.000009s : 68: predicate.dict_get_item_eliminator 1.48% : 0.000008s : 68: predicate.dict_set_item_eliminator 0.26% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.12% : 0.000001s : 6: predicate.elim_not_effective 0.20% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.52% : 0.000008s : 68: predicate.environ_add_const_eliminate 1.44% : 0.000008s : 68: predicate.environ_get_add_eliminate 1.47% : 0.000008s : 68: predicate.environ_get_depend_swap 1.45% : 0.000008s : 68: predicate.environ_get_eliminate 1.42% : 0.000008s : 68: predicate.environ_get_set_eliminate 0.08% : 0.000000s : 6: predicate.fold_const_symbol 0.90% : 0.000005s : 34: predicate.get_grad_eliminate 0.09% : 0.000001s : 6: predicate.graph_param_transform 4.41% : 0.000024s : 116: predicate.inline 1.82% : 0.000010s : 62: predicate.inline_without_move 0.44% : 0.000002s : 34: predicate.j_node_and_user_rematch 0.99% : 0.000005s : 34: predicate.less_batch_normalization 1.97% : 0.000011s : 85: predicate.list_to_tuple_eliminator_ 2.11% : 0.000012s : 91: predicate.load_eliminater 0.36% : 0.000002s : 6: predicate.loop_unroll_after_grad 3.01% : 0.000017s : 127: predicate.loop_unroll_before_grad 1.67% : 0.000009s : 74: predicate.make_slice_get_slice_eliminator 1.43% : 0.000008s : 68: predicate.merge_addn 1.57% : 0.000009s : 68: predicate.minmaximum_grad 0.47% : 0.000003s : 8: predicate.mutable_eliminate 0.18% : 0.000001s : 6: predicate.opt_reshape 2.47% : 0.000014s : 91: predicate.partial_eliminate 1.44% : 0.000008s : 68: predicate.print_const_string_wrapper 1.85% : 0.000010s : 68: predicate.reduce_eliminate 1.99% : 0.000011s : 85: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000003s : 34: predicate.remove_not_recompute_node 2.59% : 0.000014s : 156: predicate.replace_applicator 0.90% : 0.000005s : 62: predicate.replace_old_param 0.21% : 0.000001s : 12: predicate.reset_defer_inline 1.47% : 0.000008s : 68: predicate.reshape_eliminate 1.48% : 0.000008s : 68: predicate.row_tensor_add_zeros_like 0.44% : 0.000002s : 13: predicate.row_tensor_eliminate 1.50% : 0.000008s : 68: predicate.same_eliminate 0.53% : 0.000003s : 34: predicate.set_cell_output_no_recompute 0.54% : 0.000003s : 19: predicate.special_op_eliminate 0.92% : 0.000005s : 34: predicate.specialize_transform 1.72% : 0.000010s : 68: predicate.split_environ_get_set_with_tuple_value 1.45% : 0.000008s : 68: predicate.stack_unstack_eliminate 0.15% : 0.000001s : 6: predicate.switch_call_monad_eliminater 2.74% : 0.000015s : 98: predicate.switch_defer_inline 2.45% : 0.000014s : 98: predicate.switch_layer_defer_inline 5.91% : 0.000033s : 231: predicate.switch_simplify 1.45% : 0.000008s : 68: predicate.tile_eliminate 1.45% : 0.000008s : 68: predicate.transpose_eliminate 1.91% : 0.000011s : 68: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000009s : 68: predicate.tuple_list_get_item_depend_reorder 3.50% : 0.000019s : 103: predicate.tuple_list_get_item_eliminator 1.83% : 0.000010s : 68: predicate.tuple_list_set_item_eliminator 1.92% : 0.000011s : 85: predicate.tuple_to_list_eliminator_ 2.03% : 0.000011s : 91: predicate.updatestate_pure_node_eliminater 3.05% : 0.000017s : 125: predicate.updatestate_useless_node_eliminater 1.85% : 0.000010s : 68: predicate.value_based_eliminate 0.14% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.49% : 0.000003s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002019 33 62.19% : 0.001256s : 16: func_graph_cloner_run.FuncGraphClonerGraph 37.81% : 0.000764s : 17: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.659068 95 0.01% : 0.000079s : 1: add_recomputation 0.04% : 0.000243s : 1: auto_monad 0.00% : 0.000031s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.15% : 0.001008s : 1: bootstrap 0.00% : 0.000024s : 1: cconv 0.00% : 0.000013s : 1: convert_after_rewriter 0.01% : 0.000040s : 1: cse_after_recomputation 0.00% : 0.000017s : 1: environ_conv 0.02% : 0.000154s : 1: event_method 0.00% : 0.000014s : 1: execute 0.00% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 4.04% : 0.026644s : 1: jit_opt_a 0.03% : 0.000213s : 1: jit_opt_after_cconv 0.04% : 0.000244s : 1: jit_opt_b 0.07% : 0.000454s : 1: loop_unroll 0.08% : 0.000536s : 1: mutable_eliminate 0.62% : 0.004075s : 39: opt.transform.jit_opt_a 0.01% : 0.000083s : 4: opt.transform.jit_opt_after_cconv 0.03% : 0.000192s : 8: opt.transform.jit_opt_b 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000020s : 1: opt.transform.mutable_eliminate 0.00% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000051s : 4: opt.transform.symbol_engine_opt 0.08% : 0.000502s : 1: opt_after_jit_grad 0.00% : 0.000010s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000015s : 1: pre_auto_parallel 0.01% : 0.000052s : 1: py_interpret_to_execute 0.00% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000048s : 1: remove_dup_value 1.37% : 0.009023s : 2: renormalize.infer 0.28% : 0.001833s : 2: renormalize.specialize 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000161s : 1: rewriter_after_opt_a 0.03% : 0.000174s : 1: rewriter_before_opt_a 0.02% : 0.000118s : 1: symbol_engine_optimizer 82.69% : 0.544956s : 1: task_emit 10.29% : 0.067828s : 1: type_inference 0.02% : 0.000116s : 1: validate group_cases_13 have all been run, results of sub cases are below: case: (1, mindspore.float32, 0) {} pass. case: (1, mindspore.float16, 0) {} pass. case: (1, mindspore.float16, 1) {} pass. case: (1, mindspore.float32, 1) {} pass. case: (0, mindspore.float16, 1) {} pass. case: (0, mindspore.bfloat16, 0) {} pass. case: (0, mindspore.bfloat16, 1) {} pass. case: (0, mindspore.float16, 0) {} pass. ops group_cases_14 with 8 cases start to running, all cases are below: case: (, 1, mindspore.bfloat16, 0) case: (, 1, mindspore.bfloat16, 1) case: (, 1) case: (, 'pynative', 'mean') case: (, 'pynative', 'sum') case: (, 'pynative', 'none') case: (, 'KBK', 'mean') case: (, 'KBK', 'sum') ops group_cases_14 total running memory: 96M, memory threshold: 51200M [LOG_WARNING] can not open file, file: /home/jenkins/ascend/log/debug/plog/plog-171921_20260129173659633.log, possible reason: Permission denied. TotalTime = 3.28799, [33] [bootstrap]: 0.0010409 [type_inference]: 0.0867981 [event_method]: 0.00012581 [auto_monad]: 0.00016266 [graph_reusing]: 8.40999e-06 [pre_auto_parallel]: 1.12e-05 [py_interpret_to_execute]: 3.806e-05 [rewriter_before_opt_a]: 0.00015353 [expand_dump_flag]: 3.14001e-06 [jit_opt_a]: 0.0101479, [2] [Cycle 1]: 0.00252613, [27] [switch_simplify]: 0.00015572 [loop_unroll]: 4.581e-05 [a_1]: 0.00078093 [with_stream_mark]: 1.619e-05 [recompute_prepare]: 8.48999e-06 [updatestate_depend_eliminate]: 1.224e-05 [updatestate_assign_eliminate]: 1.12e-05 [updatestate_loads_eliminate]: 3.28e-06 [parameter_eliminate]: 1.97001e-06 [specialize_transform]: 7.85998e-06 [updatestate_useless_node_eliminater]: 6.44001e-06 [accelerated_algorithm]: 6.44999e-06 [meta_shard_fg_expand]: 1.118e-05 [get_grad_eliminate_]: 6.76e-06 [merge_forward]: 4.33999e-06 [cell_reuse_recompute_pass]: 1.34e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.219e-05 [j_node_and_user_rematch]: 1.229e-05 [meta_fg_expand]: 3.01999e-06 [replace_old_param]: 1.286e-05 [inline_without_move]: 6.79999e-06 [renormalize]: 0.00097258 [add_forward_monad_depend]: 1.239e-05 [auto_monad_grad]: 1.84e-06 [auto_monad_eliminator]: 2.405e-05 [cse]: 5.185e-05 [replace_applicator]: 1.298e-05 [Cycle 2]: 0.00036066, [27] [switch_simplify]: 7.01999e-06 [loop_unroll]: 6.03998e-06 [a_1]: 0.00011821 [with_stream_mark]: 8.75999e-06 [recompute_prepare]: 6.23e-06 [updatestate_depend_eliminate]: 3.03e-06 [updatestate_assign_eliminate]: 2.47001e-06 [updatestate_loads_eliminate]: 2.41e-06 [parameter_eliminate]: 9.29984e-07 [specialize_transform]: 6.17999e-06 [updatestate_useless_node_eliminater]: 6.02999e-06 [accelerated_algorithm]: 6.22001e-06 [meta_shard_fg_expand]: 1.25999e-06 [get_grad_eliminate_]: 5.64e-06 [merge_forward]: 3.15998e-06 [cell_reuse_recompute_pass]: 1.12e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.322e-05 [j_node_and_user_rematch]: 8.57e-06 [meta_fg_expand]: 1.89e-06 [replace_old_param]: 9.19e-06 [inline_without_move]: 5.66e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.10001e-06 [auto_monad_grad]: 7.50006e-07 [auto_monad_eliminator]: 5.91e-06 [cse]: 1.301e-05 [replace_applicator]: 6.06e-06 [py_interpret_to_execute_after_opt_a]: 9.92999e-06 [rewriter_after_opt_a]: 8.706e-05 [convert_after_rewriter]: 7.65e-06 [order_py_execute_after_rewriter]: 5.84999e-06 [mutable_eliminate]: 0.00052049 [jit_opt_b]: 5.341e-05, [1] [Cycle 1]: 4.703e-05, [2] [frontend_op_eliminate]: 1.897e-05 [inline_after_opt_a]: 1.748e-05 [cconv]: 2.362e-05 [loop_unroll]: 0.00041342 [jit_opt_after_cconv]: 0.0001714, [1] [Cycle 1]: 0.0001651, [11] [c_1]: 2.608e-05 [parameter_eliminate]: 2.20002e-06 [updatestate_depend_eliminate]: 5.66e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.43e-06 [cse]: 2.068e-05 [call_graph_tuple_transform]: 2.014e-05 [tuple_list_get_item_eliminator]: 6.45002e-06 [none_parameter_eliminate]: 1.37e-06 [renormalize]: 3.69997e-07 [switch_simplify]: 7.56001e-06 [remove_dup_value]: 1.599e-05 [partial_unused_args_eliminate]: 2.53e-06 [environ_conv]: 2.123e-05 [add_recomputation]: 6.635e-05 [cse_after_recomputation]: 2.597e-05, [1] [Cycle 1]: 2.014e-05, [1] [cse]: 1.431e-05 [auto_monad_reorder]: 2.494e-05 [get_jit_bprop_graph]: 1.39e-06 [rewriter_after_jit_bprop_graph]: 2.93e-06 [opt_after_jit_grad]: 0.00046035 [symbol_engine_optimizer]: 7.544e-05, [1] [Cycle 1]: 6.954e-05, [6] [build]: 3.06001e-06 [elim_shapecalc]: 8.43001e-06 [elim_not_effective]: 1.437e-05 [opt_reshape]: 6.51e-06 [fold_const_symbol]: 9.36e-06 [renormalize]: 3.89991e-07 [validate]: 5.521e-05 [backend_pass]: 1.07e-06 [task_emit]: 3.18597 [execute]: 8.90001e-06 Sums bootstrap : 0.001041s : 0.03% type_inference : 0.086798s : 2.65% event_method : 0.000126s : 0.00% auto_monad : 0.000163s : 0.00% graph_reusing : 0.000008s : 0.00% pre_auto_parallel : 0.000011s : 0.00% py_interpret_to_execute : 0.000038s : 0.00% rewriter_before_opt_a : 0.000154s : 0.00% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000163s : 0.00% jit_opt_a.loop_unroll : 0.000052s : 0.00% jit_opt_a.a_1 : 0.000899s : 0.03% jit_opt_a.with_stream_mark : 0.000025s : 0.00% jit_opt_a.recompute_prepare : 0.000015s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000015s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000014s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% jit_opt_a.parameter_eliminate : 0.000003s : 0.00% jit_opt_a.specialize_transform : 0.000014s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000012s : 0.00% jit_opt_a.accelerated_algorithm : 0.000013s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000012s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000012s : 0.00% jit_opt_a.merge_forward : 0.000007s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000045s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000021s : 0.00% jit_opt_a.meta_fg_expand : 0.000005s : 0.00% jit_opt_a.replace_old_param : 0.000022s : 0.00% jit_opt_a.inline_without_move : 0.000012s : 0.00% jit_opt_a.renormalize : 0.000973s : 0.03% jit_opt_a.add_forward_monad_depend : 0.000013s : 0.00% jit_opt_a.auto_monad_grad : 0.000003s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000030s : 0.00% jit_opt_a.cse : 0.000065s : 0.00% jit_opt_a.replace_applicator : 0.000019s : 0.00% py_interpret_to_execute_after_opt_a : 0.000010s : 0.00% rewriter_after_opt_a : 0.000087s : 0.00% convert_after_rewriter : 0.000008s : 0.00% order_py_execute_after_rewriter : 0.000006s : 0.00% mutable_eliminate : 0.000520s : 0.02% jit_opt_b.frontend_op_eliminate : 0.000019s : 0.00% jit_opt_b.inline_after_opt_a : 0.000017s : 0.00% cconv : 0.000024s : 0.00% loop_unroll : 0.000413s : 0.01% jit_opt_after_cconv.c_1 : 0.000026s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.cse : 0.000021s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000020s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000006s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000008s : 0.00% remove_dup_value : 0.000016s : 0.00% partial_unused_args_eliminate : 0.000003s : 0.00% environ_conv : 0.000021s : 0.00% add_recomputation : 0.000066s : 0.00% cse_after_recomputation.cse : 0.000014s : 0.00% auto_monad_reorder : 0.000025s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000460s : 0.01% symbol_engine_optimizer.build : 0.000003s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000008s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000055s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 3.185971s : 97.17% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000265 33 0.80% : 0.000002s : 2: substitution.elim_not_effective 0.52% : 0.000001s : 2: substitution.fold_const_symbol 1.96% : 0.000005s : 4: substitution.graph_param_transform 73.06% : 0.000193s : 8: substitution.inline 1.19% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.83% : 0.000013s : 4: substitution.remove_not_recompute_node 1.77% : 0.000005s : 4: substitution.replace_old_param 6.17% : 0.000016s : 1: substitution.switch_simplify 9.70% : 0.000026s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.086707 2 97.91% : 0.084894s : 1: type_inference.infer 2.09% : 0.001813s : 1: type_inference.specialize ------[replace.] 0.000104 13 57.14% : 0.000060s : 8: replace.inline 20.77% : 0.000022s : 1: replace.switch_simplify 22.09% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000228 13 82.75% : 0.000189s : 8: match.inline 6.80% : 0.000016s : 1: match.switch_simplify 10.44% : 0.000024s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000171 1126 1.26% : 0.000002s : 17: predicate.accumulaten_eliminater 0.87% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 1.27% : 0.000002s : 17: predicate.addn_check_dump 1.45% : 0.000002s : 17: predicate.addn_zero_filter 1.94% : 0.000003s : 17: predicate.arithmetic_simplify 1.32% : 0.000002s : 17: predicate.cast_eliminate 0.32% : 0.000001s : 4: predicate.check_bprop_eliminate 1.20% : 0.000002s : 17: predicate.compare_switch_simplify 1.29% : 0.000002s : 17: predicate.depend_value_elim 1.26% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.45% : 0.000002s : 17: predicate.dict_get_item_eliminator 1.32% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.59% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.28% : 0.000000s : 4: predicate.elim_not_effective 0.42% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.29% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.22% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.27% : 0.000002s : 17: predicate.environ_get_depend_swap 1.28% : 0.000002s : 17: predicate.environ_get_eliminate 1.20% : 0.000002s : 17: predicate.environ_get_set_eliminate 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.77% : 0.000001s : 8: predicate.get_grad_eliminate 0.32% : 0.000001s : 4: predicate.graph_param_transform 5.18% : 0.000009s : 37: predicate.inline 0.76% : 0.000001s : 8: predicate.inline_without_move 0.40% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.76% : 0.000001s : 8: predicate.less_batch_normalization 1.87% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 1.93% : 0.000003s : 25: predicate.load_eliminater 1.06% : 0.000002s : 4: predicate.loop_unroll_after_grad 4.36% : 0.000007s : 52: predicate.loop_unroll_before_grad 1.80% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 1.25% : 0.000002s : 17: predicate.merge_addn 1.18% : 0.000002s : 17: predicate.minmaximum_grad 1.14% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 2.44% : 0.000004s : 25: predicate.partial_eliminate 1.37% : 0.000002s : 17: predicate.print_const_string_wrapper 1.80% : 0.000003s : 17: predicate.reduce_eliminate 1.62% : 0.000003s : 21: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.90% : 0.000003s : 29: predicate.replace_applicator 0.63% : 0.000001s : 8: predicate.replace_old_param 0.30% : 0.000001s : 4: predicate.reset_defer_inline 1.32% : 0.000002s : 17: predicate.reshape_eliminate 1.29% : 0.000002s : 17: predicate.row_tensor_add_zeros_like 0.71% : 0.000001s : 4: predicate.row_tensor_eliminate 1.36% : 0.000002s : 17: predicate.same_eliminate 0.52% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.84% : 0.000001s : 8: predicate.special_op_eliminate 0.86% : 0.000001s : 8: predicate.specialize_transform 1.51% : 0.000003s : 17: predicate.split_environ_get_set_with_tuple_value 1.39% : 0.000002s : 17: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.73% : 0.000005s : 29: predicate.switch_defer_inline 2.53% : 0.000004s : 29: predicate.switch_layer_defer_inline 8.37% : 0.000014s : 87: predicate.switch_simplify 1.34% : 0.000002s : 17: predicate.tile_eliminate 1.26% : 0.000002s : 17: predicate.transpose_eliminate 1.69% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.40% : 0.000006s : 29: predicate.tuple_list_get_item_eliminator 1.60% : 0.000003s : 17: predicate.tuple_list_set_item_eliminator 1.77% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 1.90% : 0.000003s : 25: predicate.updatestate_pure_node_eliminater 2.87% : 0.000005s : 33: predicate.updatestate_useless_node_eliminater 1.82% : 0.000003s : 17: predicate.value_based_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001233 19 57.56% : 0.000710s : 9: func_graph_cloner_run.FuncGraphClonerGraph 42.44% : 0.000523s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.289139 76 0.00% : 0.000070s : 1: add_recomputation 0.01% : 0.000170s : 1: auto_monad 0.00% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.03% : 0.001084s : 1: bootstrap 0.00% : 0.000026s : 1: cconv 0.00% : 0.000010s : 1: convert_after_rewriter 0.00% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000024s : 1: environ_conv 0.00% : 0.000133s : 1: event_method 0.00% : 0.000014s : 1: execute 0.00% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.31% : 0.010150s : 1: jit_opt_a 0.01% : 0.000174s : 1: jit_opt_after_cconv 0.00% : 0.000056s : 1: jit_opt_b 0.01% : 0.000421s : 1: loop_unroll 0.02% : 0.000528s : 1: mutable_eliminate 0.04% : 0.001254s : 26: opt.transform.jit_opt_a 0.00% : 0.000057s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000031s : 4: opt.transform.jit_opt_b 0.00% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.00% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000035s : 4: opt.transform.symbol_engine_opt 0.01% : 0.000468s : 1: opt_after_jit_grad 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000013s : 1: pre_auto_parallel 0.00% : 0.000041s : 1: py_interpret_to_execute 0.00% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000018s : 1: remove_dup_value 0.01% : 0.000465s : 1: renormalize.infer 0.02% : 0.000499s : 1: renormalize.specialize 0.00% : 0.000005s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000091s : 1: rewriter_after_opt_a 0.00% : 0.000157s : 1: rewriter_before_opt_a 0.00% : 0.000078s : 1: symbol_engine_optimizer 96.86% : 3.186015s : 1: task_emit 2.64% : 0.086814s : 1: type_inference 0.00% : 0.000081s : 1: validate TotalTime = 3.28071, [33] [bootstrap]: 0.00096199 [type_inference]: 0.08568 [event_method]: 0.00012588 [auto_monad]: 0.00012283 [graph_reusing]: 5.36998e-06 [pre_auto_parallel]: 6.68998e-06 [py_interpret_to_execute]: 3.195e-05 [rewriter_before_opt_a]: 0.00012482 [expand_dump_flag]: 2.63998e-06 [jit_opt_a]: 0.00974097, [2] [Cycle 1]: 0.0023507, [27] [switch_simplify]: 0.0001307 [loop_unroll]: 4.564e-05 [a_1]: 0.00075528 [with_stream_mark]: 1.257e-05 [recompute_prepare]: 8.11002e-06 [updatestate_depend_eliminate]: 8.06001e-06 [updatestate_assign_eliminate]: 6.48998e-06 [updatestate_loads_eliminate]: 2.48e-06 [parameter_eliminate]: 1.37999e-06 [specialize_transform]: 7.15e-06 [updatestate_useless_node_eliminater]: 6.56e-06 [accelerated_algorithm]: 6.42001e-06 [meta_shard_fg_expand]: 6.28002e-06 [get_grad_eliminate_]: 6.41998e-06 [merge_forward]: 3.41001e-06 [cell_reuse_recompute_pass]: 9.20001e-07 [cell_reuse_handle_not_recompute_node_pass]: 2.419e-05 [j_node_and_user_rematch]: 1.019e-05 [meta_fg_expand]: 2.54999e-06 [replace_old_param]: 1.057e-05 [inline_without_move]: 6.56e-06 [renormalize]: 0.00101747 [add_forward_monad_depend]: 1.42e-05 [auto_monad_grad]: 1.90001e-06 [auto_monad_eliminator]: 1.831e-05 [cse]: 3.165e-05 [replace_applicator]: 1.293e-05 [Cycle 2]: 0.00036575, [27] [switch_simplify]: 7.04001e-06 [loop_unroll]: 6.15002e-06 [a_1]: 0.0001204 [with_stream_mark]: 9.14e-06 [recompute_prepare]: 5.87001e-06 [updatestate_depend_eliminate]: 3.03e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.32001e-06 [parameter_eliminate]: 9.39996e-07 [specialize_transform]: 5.97999e-06 [updatestate_useless_node_eliminater]: 6.04999e-06 [accelerated_algorithm]: 6.17001e-06 [meta_shard_fg_expand]: 1.35999e-06 [get_grad_eliminate_]: 6.02999e-06 [merge_forward]: 2.78e-06 [cell_reuse_recompute_pass]: 1.46002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.358e-05 [j_node_and_user_rematch]: 8.57e-06 [meta_fg_expand]: 2.06e-06 [replace_old_param]: 9.02999e-06 [inline_without_move]: 5.82001e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.07998e-06 [auto_monad_grad]: 7.49977e-07 [auto_monad_eliminator]: 6.14999e-06 [cse]: 1.417e-05 [replace_applicator]: 6.07001e-06 [py_interpret_to_execute_after_opt_a]: 9.94001e-06 [rewriter_after_opt_a]: 5.57e-05 [convert_after_rewriter]: 5.50001e-06 [order_py_execute_after_rewriter]: 4.82e-06 [mutable_eliminate]: 0.00048748 [jit_opt_b]: 5.349e-05, [1] [Cycle 1]: 4.784e-05, [2] [frontend_op_eliminate]: 1.897e-05 [inline_after_opt_a]: 1.724e-05 [cconv]: 1.683e-05 [loop_unroll]: 0.0004223 [jit_opt_after_cconv]: 0.00014896, [1] [Cycle 1]: 0.00014315, [11] [c_1]: 2.686e-05 [parameter_eliminate]: 2.38002e-06 [updatestate_depend_eliminate]: 5.84e-06 [updatestate_assign_eliminate]: 2.88998e-06 [updatestate_loads_eliminate]: 2.91e-06 [cse]: 2.095e-05 [call_graph_tuple_transform]: 1.77e-05 [tuple_list_get_item_eliminator]: 6.61e-06 [none_parameter_eliminate]: 1.15001e-06 [renormalize]: 4.00003e-07 [switch_simplify]: 6.07001e-06 [remove_dup_value]: 1.163e-05 [partial_unused_args_eliminate]: 1.74998e-06 [environ_conv]: 1.814e-05 [add_recomputation]: 4.377e-05 [cse_after_recomputation]: 2.706e-05, [1] [Cycle 1]: 2.129e-05, [1] [cse]: 1.495e-05 [auto_monad_reorder]: 1.657e-05 [get_jit_bprop_graph]: 1.35999e-06 [rewriter_after_jit_bprop_graph]: 2.88e-06 [opt_after_jit_grad]: 0.00046966 [symbol_engine_optimizer]: 7.551e-05, [1] [Cycle 1]: 6.984e-05, [6] [build]: 2.89001e-06 [elim_shapecalc]: 8.62e-06 [elim_not_effective]: 1.455e-05 [opt_reshape]: 6.83e-06 [fold_const_symbol]: 9.49999e-06 [renormalize]: 3.9002e-07 [validate]: 4.661e-05 [backend_pass]: 8.29983e-07 [task_emit]: 3.17943 [execute]: 1.077e-05 Sums bootstrap : 0.000962s : 0.03% type_inference : 0.085680s : 2.62% event_method : 0.000126s : 0.00% auto_monad : 0.000123s : 0.00% graph_reusing : 0.000005s : 0.00% pre_auto_parallel : 0.000007s : 0.00% py_interpret_to_execute : 0.000032s : 0.00% rewriter_before_opt_a : 0.000125s : 0.00% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000138s : 0.00% jit_opt_a.loop_unroll : 0.000052s : 0.00% jit_opt_a.a_1 : 0.000876s : 0.03% jit_opt_a.with_stream_mark : 0.000022s : 0.00% jit_opt_a.recompute_prepare : 0.000014s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000011s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000009s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% jit_opt_a.parameter_eliminate : 0.000002s : 0.00% jit_opt_a.specialize_transform : 0.000013s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000013s : 0.00% jit_opt_a.accelerated_algorithm : 0.000013s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000008s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000012s : 0.00% jit_opt_a.merge_forward : 0.000006s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000019s : 0.00% jit_opt_a.meta_fg_expand : 0.000005s : 0.00% jit_opt_a.replace_old_param : 0.000020s : 0.00% jit_opt_a.inline_without_move : 0.000012s : 0.00% jit_opt_a.renormalize : 0.001018s : 0.03% jit_opt_a.add_forward_monad_depend : 0.000015s : 0.00% jit_opt_a.auto_monad_grad : 0.000003s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000024s : 0.00% jit_opt_a.cse : 0.000046s : 0.00% jit_opt_a.replace_applicator : 0.000019s : 0.00% py_interpret_to_execute_after_opt_a : 0.000010s : 0.00% rewriter_after_opt_a : 0.000056s : 0.00% convert_after_rewriter : 0.000006s : 0.00% order_py_execute_after_rewriter : 0.000005s : 0.00% mutable_eliminate : 0.000487s : 0.01% jit_opt_b.frontend_op_eliminate : 0.000019s : 0.00% jit_opt_b.inline_after_opt_a : 0.000017s : 0.00% cconv : 0.000017s : 0.00% loop_unroll : 0.000422s : 0.01% jit_opt_after_cconv.c_1 : 0.000027s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000021s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000018s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000007s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000006s : 0.00% remove_dup_value : 0.000012s : 0.00% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000018s : 0.00% add_recomputation : 0.000044s : 0.00% cse_after_recomputation.cse : 0.000015s : 0.00% auto_monad_reorder : 0.000017s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000470s : 0.01% symbol_engine_optimizer.build : 0.000003s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000047s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 3.179430s : 97.21% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000226 33 0.88% : 0.000002s : 2: substitution.elim_not_effective 0.63% : 0.000001s : 2: substitution.fold_const_symbol 1.63% : 0.000004s : 4: substitution.graph_param_transform 77.79% : 0.000176s : 8: substitution.inline 1.27% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.23% : 0.000007s : 4: substitution.remove_not_recompute_node 1.86% : 0.000004s : 4: substitution.replace_old_param 4.70% : 0.000011s : 1: substitution.switch_simplify 8.01% : 0.000018s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.085607 2 97.95% : 0.083851s : 1: type_inference.infer 2.05% : 0.001756s : 1: type_inference.specialize ------[replace.] 0.000105 13 55.46% : 0.000058s : 8: replace.inline 22.07% : 0.000023s : 1: replace.switch_simplify 22.47% : 0.000024s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000197 13 86.90% : 0.000172s : 8: match.inline 4.97% : 0.000010s : 1: match.switch_simplify 8.13% : 0.000016s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000169 1126 1.45% : 0.000002s : 17: predicate.accumulaten_eliminater 0.96% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 1.22% : 0.000002s : 17: predicate.addn_check_dump 1.29% : 0.000002s : 17: predicate.addn_zero_filter 2.24% : 0.000004s : 17: predicate.arithmetic_simplify 1.31% : 0.000002s : 17: predicate.cast_eliminate 0.37% : 0.000001s : 4: predicate.check_bprop_eliminate 1.25% : 0.000002s : 17: predicate.compare_switch_simplify 1.27% : 0.000002s : 17: predicate.depend_value_elim 1.22% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.34% : 0.000002s : 17: predicate.dict_get_item_eliminator 1.26% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.73% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.22% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.25% : 0.000002s : 17: predicate.environ_get_depend_swap 1.45% : 0.000002s : 17: predicate.environ_get_eliminate 1.24% : 0.000002s : 17: predicate.environ_get_set_eliminate 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.76% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 5.20% : 0.000009s : 37: predicate.inline 0.80% : 0.000001s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.84% : 0.000001s : 8: predicate.less_batch_normalization 1.73% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 1.90% : 0.000003s : 25: predicate.load_eliminater 1.17% : 0.000002s : 4: predicate.loop_unroll_after_grad 4.29% : 0.000007s : 52: predicate.loop_unroll_before_grad 1.75% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 1.24% : 0.000002s : 17: predicate.merge_addn 1.20% : 0.000002s : 17: predicate.minmaximum_grad 1.34% : 0.000002s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 2.38% : 0.000004s : 25: predicate.partial_eliminate 1.26% : 0.000002s : 17: predicate.print_const_string_wrapper 1.70% : 0.000003s : 17: predicate.reduce_eliminate 1.74% : 0.000003s : 21: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 8: predicate.remove_not_recompute_node 1.85% : 0.000003s : 29: predicate.replace_applicator 0.50% : 0.000001s : 8: predicate.replace_old_param 0.41% : 0.000001s : 4: predicate.reset_defer_inline 1.35% : 0.000002s : 17: predicate.reshape_eliminate 1.31% : 0.000002s : 17: predicate.row_tensor_add_zeros_like 0.53% : 0.000001s : 4: predicate.row_tensor_eliminate 1.45% : 0.000002s : 17: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.71% : 0.000001s : 8: predicate.special_op_eliminate 0.75% : 0.000001s : 8: predicate.specialize_transform 1.63% : 0.000003s : 17: predicate.split_environ_get_set_with_tuple_value 1.38% : 0.000002s : 17: predicate.stack_unstack_eliminate 0.39% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.75% : 0.000005s : 29: predicate.switch_defer_inline 2.47% : 0.000004s : 29: predicate.switch_layer_defer_inline 8.35% : 0.000014s : 87: predicate.switch_simplify 1.40% : 0.000002s : 17: predicate.tile_eliminate 1.35% : 0.000002s : 17: predicate.transpose_eliminate 1.59% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000005s : 29: predicate.tuple_list_get_item_eliminator 1.46% : 0.000002s : 17: predicate.tuple_list_set_item_eliminator 1.77% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 1.94% : 0.000003s : 25: predicate.updatestate_pure_node_eliminater 2.77% : 0.000005s : 33: predicate.updatestate_useless_node_eliminater 1.79% : 0.000003s : 17: predicate.value_based_eliminate 0.34% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001196 19 56.44% : 0.000675s : 9: func_graph_cloner_run.FuncGraphClonerGraph 43.56% : 0.000521s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.280910 76 0.00% : 0.000047s : 1: add_recomputation 0.00% : 0.000130s : 1: auto_monad 0.00% : 0.000019s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.03% : 0.000999s : 1: bootstrap 0.00% : 0.000019s : 1: cconv 0.00% : 0.000008s : 1: convert_after_rewriter 0.00% : 0.000029s : 1: cse_after_recomputation 0.00% : 0.000021s : 1: environ_conv 0.00% : 0.000132s : 1: event_method 0.00% : 0.000017s : 1: execute 0.00% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000008s : 1: graph_reusing 0.30% : 0.009743s : 1: jit_opt_a 0.01% : 0.000172s : 1: jit_opt_after_cconv 0.00% : 0.000056s : 1: jit_opt_b 0.01% : 0.000430s : 1: loop_unroll 0.02% : 0.000496s : 1: mutable_eliminate 0.04% : 0.001195s : 26: opt.transform.jit_opt_a 0.00% : 0.000054s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000030s : 4: opt.transform.jit_opt_b 0.00% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000015s : 1: opt.transform.mutable_eliminate 0.00% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000036s : 4: opt.transform.symbol_engine_opt 0.01% : 0.000478s : 1: opt_after_jit_grad 0.00% : 0.000007s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000009s : 1: pre_auto_parallel 0.00% : 0.000035s : 1: py_interpret_to_execute 0.00% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000015s : 1: remove_dup_value 0.01% : 0.000453s : 1: renormalize.infer 0.02% : 0.000557s : 1: renormalize.specialize 0.00% : 0.000005s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000059s : 1: rewriter_after_opt_a 0.00% : 0.000128s : 1: rewriter_before_opt_a 0.00% : 0.000078s : 1: symbol_engine_optimizer 96.91% : 3.179598s : 1: task_emit 2.61% : 0.085692s : 1: type_inference 0.00% : 0.000072s : 1: validate TotalTime = 0.128778, [33] [bootstrap]: 0.00045311 [type_inference]: 0.083606 [event_method]: 0.00038682 [auto_monad]: 0.00020224 [graph_reusing]: 1.11e-05 [pre_auto_parallel]: 4.43001e-06 [py_interpret_to_execute]: 6.521e-05 [rewriter_before_opt_a]: 0.00021547 [expand_dump_flag]: 5.05001e-06 [jit_opt_a]: 0.0264956, [3] [Cycle 1]: 0.0187365, [27] [switch_simplify]: 0.00023959 [loop_unroll]: 0.00011111 [a_1]: 0.00208993 [with_stream_mark]: 2.809e-05 [recompute_prepare]: 2.964e-05 [updatestate_depend_eliminate]: 1.031e-05 [updatestate_assign_eliminate]: 9.39998e-06 [updatestate_loads_eliminate]: 9.44e-06 [parameter_eliminate]: 2.81e-06 [specialize_transform]: 1.995e-05 [updatestate_useless_node_eliminater]: 2.056e-05 [accelerated_algorithm]: 7.527e-05 [meta_shard_fg_expand]: 5.72001e-06 [get_grad_eliminate_]: 1.963e-05 [merge_forward]: 1.147e-05 [cell_reuse_recompute_pass]: 1.24e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.782e-05 [j_node_and_user_rematch]: 3.235e-05 [meta_fg_expand]: 0.00202051 [replace_old_param]: 7.138e-05 [inline_without_move]: 6.525e-05 [renormalize]: 0.0131444 [add_forward_monad_depend]: 1.788e-05 [auto_monad_grad]: 6.74001e-06 [auto_monad_eliminator]: 6.458e-05 [cse]: 0.00026026 [replace_applicator]: 9.867e-05 [Cycle 2]: 0.00331469, [27] [switch_simplify]: 4.907e-05 [loop_unroll]: 4.635e-05 [a_1]: 0.00141724 [with_stream_mark]: 1.526e-05 [recompute_prepare]: 1.024e-05 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 5.00001e-06 [updatestate_loads_eliminate]: 4.05e-06 [parameter_eliminate]: 2.00002e-06 [specialize_transform]: 9.25001e-06 [updatestate_useless_node_eliminater]: 9.04e-06 [accelerated_algorithm]: 1.35e-05 [meta_shard_fg_expand]: 2.86e-06 [get_grad_eliminate_]: 8.54e-06 [merge_forward]: 5.91e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.883e-05 [j_node_and_user_rematch]: 1.485e-05 [meta_fg_expand]: 0.00037532 [replace_old_param]: 2.061e-05 [inline_without_move]: 9.66998e-06 [renormalize]: 0.00098679 [add_forward_monad_depend]: 4.27e-06 [auto_monad_grad]: 1.19e-06 [auto_monad_eliminator]: 1.351e-05 [cse]: 8.017e-05 [replace_applicator]: 1.609e-05 [Cycle 3]: 0.00049814, [27] [switch_simplify]: 1.033e-05 [loop_unroll]: 8.88002e-06 [a_1]: 0.00018166 [with_stream_mark]: 1.114e-05 [recompute_prepare]: 8.59002e-06 [updatestate_depend_eliminate]: 4.54998e-06 [updatestate_assign_eliminate]: 4.02e-06 [updatestate_loads_eliminate]: 3.6e-06 [parameter_eliminate]: 1.22e-06 [specialize_transform]: 8.45999e-06 [updatestate_useless_node_eliminater]: 8.32e-06 [accelerated_algorithm]: 1.479e-05 [meta_shard_fg_expand]: 1.86e-06 [get_grad_eliminate_]: 8.3e-06 [merge_forward]: 4.30999e-06 [cell_reuse_recompute_pass]: 1.73002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.824e-05 [j_node_and_user_rematch]: 1.333e-05 [meta_fg_expand]: 2.78e-06 [replace_old_param]: 1.186e-05 [inline_without_move]: 8.17e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.42999e-06 [auto_monad_grad]: 8.70001e-07 [auto_monad_eliminator]: 8.87e-06 [cse]: 2.57e-05 [replace_applicator]: 8.89e-06 [py_interpret_to_execute_after_opt_a]: 1.507e-05 [rewriter_after_opt_a]: 0.0002023 [convert_after_rewriter]: 1.258e-05 [order_py_execute_after_rewriter]: 6.61e-06 [mutable_eliminate]: 0.00062476 [jit_opt_b]: 7.394e-05, [1] [Cycle 1]: 6.629e-05, [2] [frontend_op_eliminate]: 2.831e-05 [inline_after_opt_a]: 2.551e-05 [cconv]: 2.503e-05 [loop_unroll]: 0.00043525 [jit_opt_after_cconv]: 0.00019416, [1] [Cycle 1]: 0.0001871, [11] [c_1]: 3.782e-05 [parameter_eliminate]: 2.66e-06 [updatestate_depend_eliminate]: 7.08e-06 [updatestate_assign_eliminate]: 4.92e-06 [updatestate_loads_eliminate]: 3.66999e-06 [cse]: 3.274e-05 [call_graph_tuple_transform]: 2.747e-05 [tuple_list_get_item_eliminator]: 8.70999e-06 [none_parameter_eliminate]: 1.84998e-06 [renormalize]: 3.7998e-07 [switch_simplify]: 9.12999e-06 [remove_dup_value]: 3.507e-05 [partial_unused_args_eliminate]: 3.38e-06 [environ_conv]: 8.74998e-06 [add_recomputation]: 6.561e-05 [cse_after_recomputation]: 3.338e-05, [1] [Cycle 1]: 2.78e-05, [1] [cse]: 2.155e-05 [auto_monad_reorder]: 2.098e-05 [get_jit_bprop_graph]: 1.82999e-06 [rewriter_after_jit_bprop_graph]: 5.39e-06 [opt_after_jit_grad]: 0.00048668 [symbol_engine_optimizer]: 0.00011383, [1] [Cycle 1]: 0.00010699, [6] [build]: 2.297e-05 [elim_shapecalc]: 1.239e-05 [elim_not_effective]: 1.859e-05 [opt_reshape]: 9.92001e-06 [fold_const_symbol]: 1.337e-05 [renormalize]: 3.50003e-07 [validate]: 4.56e-05 [backend_pass]: 1.10001e-06 [task_emit]: 0.0146816 [execute]: 8.25999e-06 Sums bootstrap : 0.000453s : 0.37% type_inference : 0.083606s : 67.45% event_method : 0.000387s : 0.31% auto_monad : 0.000202s : 0.16% graph_reusing : 0.000011s : 0.01% pre_auto_parallel : 0.000004s : 0.00% py_interpret_to_execute : 0.000065s : 0.05% rewriter_before_opt_a : 0.000215s : 0.17% expand_dump_flag : 0.000005s : 0.00% jit_opt_a.switch_simplify : 0.000299s : 0.24% jit_opt_a.loop_unroll : 0.000166s : 0.13% jit_opt_a.a_1 : 0.003689s : 2.98% jit_opt_a.with_stream_mark : 0.000054s : 0.04% jit_opt_a.recompute_prepare : 0.000048s : 0.04% jit_opt_a.updatestate_depend_eliminate : 0.000020s : 0.02% jit_opt_a.updatestate_assign_eliminate : 0.000018s : 0.01% jit_opt_a.updatestate_loads_eliminate : 0.000017s : 0.01% jit_opt_a.parameter_eliminate : 0.000006s : 0.00% jit_opt_a.specialize_transform : 0.000038s : 0.03% jit_opt_a.updatestate_useless_node_eliminater : 0.000038s : 0.03% jit_opt_a.accelerated_algorithm : 0.000104s : 0.08% jit_opt_a.meta_shard_fg_expand : 0.000010s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000036s : 0.03% jit_opt_a.merge_forward : 0.000022s : 0.02% jit_opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000075s : 0.06% jit_opt_a.j_node_and_user_rematch : 0.000061s : 0.05% jit_opt_a.meta_fg_expand : 0.002399s : 1.94% jit_opt_a.replace_old_param : 0.000104s : 0.08% jit_opt_a.inline_without_move : 0.000083s : 0.07% jit_opt_a.renormalize : 0.014131s : 11.40% jit_opt_a.add_forward_monad_depend : 0.000024s : 0.02% jit_opt_a.auto_monad_grad : 0.000009s : 0.01% jit_opt_a.auto_monad_eliminator : 0.000087s : 0.07% jit_opt_a.cse : 0.000366s : 0.30% jit_opt_a.replace_applicator : 0.000124s : 0.10% py_interpret_to_execute_after_opt_a : 0.000015s : 0.01% rewriter_after_opt_a : 0.000202s : 0.16% convert_after_rewriter : 0.000013s : 0.01% order_py_execute_after_rewriter : 0.000007s : 0.01% mutable_eliminate : 0.000625s : 0.50% jit_opt_b.frontend_op_eliminate : 0.000028s : 0.02% jit_opt_b.inline_after_opt_a : 0.000026s : 0.02% cconv : 0.000025s : 0.02% loop_unroll : 0.000435s : 0.35% jit_opt_after_cconv.c_1 : 0.000038s : 0.03% jit_opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.cse : 0.000033s : 0.03% jit_opt_after_cconv.call_graph_tuple_transform : 0.000027s : 0.02% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000009s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000009s : 0.01% remove_dup_value : 0.000035s : 0.03% partial_unused_args_eliminate : 0.000003s : 0.00% environ_conv : 0.000009s : 0.01% add_recomputation : 0.000066s : 0.05% cse_after_recomputation.cse : 0.000022s : 0.02% auto_monad_reorder : 0.000021s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000487s : 0.39% symbol_engine_optimizer.build : 0.000023s : 0.02% symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.01% symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000046s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.014682s : 11.84% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.000975 174 0.24% : 0.000002s : 4: substitution.elim_not_effective 0.20% : 0.000002s : 4: substitution.fold_const_symbol 0.70% : 0.000007s : 6: substitution.graph_param_transform 67.28% : 0.000656s : 23: substitution.inline 1.64% : 0.000016s : 2: substitution.inline_without_move 1.04% : 0.000010s : 18: substitution.j_node_and_user_rematch 5.23% : 0.000051s : 3: substitution.less_batch_normalization 1.51% : 0.000015s : 11: substitution.minmaximum_grad 2.44% : 0.000024s : 10: substitution.partial_eliminate 1.53% : 0.000015s : 18: substitution.remove_not_recompute_node 2.75% : 0.000027s : 9: substitution.replace_applicator 1.17% : 0.000011s : 16: substitution.replace_old_param 0.28% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.47% : 0.000014s : 4: substitution.switch_simplify 3.51% : 0.000034s : 11: substitution.tuple_list_convert_item_index_to_positive 2.33% : 0.000023s : 11: substitution.tuple_list_get_item_depend_reorder 6.67% : 0.000065s : 23: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.083478 2 94.31% : 0.078730s : 1: type_inference.infer 5.69% : 0.004748s : 1: type_inference.specialize ------[replace.] 0.000344 39 56.34% : 0.000194s : 23: replace.inline 18.16% : 0.000062s : 4: replace.switch_simplify 25.50% : 0.000088s : 12: replace.tuple_list_get_item_eliminator ------[match.] 0.000687 39 93.57% : 0.000643s : 23: match.inline 1.78% : 0.000012s : 4: match.switch_simplify 4.65% : 0.000032s : 12: match.tuple_list_get_item_eliminator ------[predicate.] 0.000570 3976 1.53% : 0.000009s : 66: predicate.accumulaten_eliminater 0.30% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 1.46% : 0.000008s : 66: predicate.addn_check_dump 1.60% : 0.000009s : 66: predicate.addn_zero_filter 2.24% : 0.000013s : 66: predicate.arithmetic_simplify 1.49% : 0.000008s : 66: predicate.cast_eliminate 0.15% : 0.000001s : 6: predicate.check_bprop_eliminate 1.43% : 0.000008s : 66: predicate.compare_switch_simplify 1.49% : 0.000009s : 66: predicate.depend_value_elim 1.43% : 0.000008s : 66: predicate.dict_get_item_const_eliminator 1.55% : 0.000009s : 66: predicate.dict_get_item_eliminator 1.50% : 0.000009s : 66: predicate.dict_set_item_eliminator 0.23% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.11% : 0.000001s : 6: predicate.elim_not_effective 0.20% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.48% : 0.000008s : 66: predicate.environ_add_const_eliminate 1.47% : 0.000008s : 66: predicate.environ_get_add_eliminate 1.50% : 0.000009s : 66: predicate.environ_get_depend_swap 1.53% : 0.000009s : 66: predicate.environ_get_eliminate 1.49% : 0.000008s : 66: predicate.environ_get_set_eliminate 0.09% : 0.000001s : 6: predicate.fold_const_symbol 0.77% : 0.000004s : 27: predicate.get_grad_eliminate 0.09% : 0.000001s : 6: predicate.graph_param_transform 4.32% : 0.000025s : 113: predicate.inline 1.64% : 0.000009s : 56: predicate.inline_without_move 0.35% : 0.000002s : 27: predicate.j_node_and_user_rematch 0.90% : 0.000005s : 27: predicate.less_batch_normalization 1.88% : 0.000011s : 78: predicate.list_to_tuple_eliminator_ 1.99% : 0.000011s : 84: predicate.load_eliminater 0.34% : 0.000002s : 6: predicate.loop_unroll_after_grad 3.70% : 0.000021s : 147: predicate.loop_unroll_before_grad 1.77% : 0.000010s : 72: predicate.make_slice_get_slice_eliminator 1.45% : 0.000008s : 66: predicate.merge_addn 1.48% : 0.000008s : 66: predicate.minmaximum_grad 0.39% : 0.000002s : 6: predicate.mutable_eliminate 0.18% : 0.000001s : 6: predicate.opt_reshape 2.40% : 0.000014s : 84: predicate.partial_eliminate 1.44% : 0.000008s : 66: predicate.print_const_string_wrapper 1.96% : 0.000011s : 66: predicate.reduce_eliminate 1.80% : 0.000010s : 78: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000002s : 27: predicate.remove_not_recompute_node 2.43% : 0.000014s : 137: predicate.replace_applicator 0.90% : 0.000005s : 56: predicate.replace_old_param 0.12% : 0.000001s : 6: predicate.reset_defer_inline 1.56% : 0.000009s : 66: predicate.reshape_eliminate 1.47% : 0.000008s : 66: predicate.row_tensor_add_zeros_like 0.25% : 0.000001s : 6: predicate.row_tensor_eliminate 1.48% : 0.000008s : 66: predicate.same_eliminate 0.43% : 0.000002s : 27: predicate.set_cell_output_no_recompute 0.41% : 0.000002s : 12: predicate.special_op_eliminate 0.74% : 0.000004s : 27: predicate.specialize_transform 1.74% : 0.000010s : 66: predicate.split_environ_get_set_with_tuple_value 1.53% : 0.000009s : 66: predicate.stack_unstack_eliminate 0.17% : 0.000001s : 6: predicate.switch_call_monad_eliminater 3.05% : 0.000017s : 101: predicate.switch_defer_inline 2.66% : 0.000015s : 101: predicate.switch_layer_defer_inline 7.28% : 0.000041s : 262: predicate.switch_simplify 1.47% : 0.000008s : 66: predicate.tile_eliminate 1.50% : 0.000009s : 66: predicate.transpose_eliminate 1.92% : 0.000011s : 66: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000010s : 66: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000018s : 90: predicate.tuple_list_get_item_eliminator 1.83% : 0.000010s : 66: predicate.tuple_list_set_item_eliminator 1.83% : 0.000010s : 78: predicate.tuple_to_list_eliminator_ 1.84% : 0.000010s : 84: predicate.updatestate_pure_node_eliminater 2.79% : 0.000016s : 111: predicate.updatestate_useless_node_eliminater 1.91% : 0.000011s : 66: predicate.value_based_eliminate 0.13% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.19% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004107 50 58.21% : 0.002391s : 23: func_graph_cloner_run.FuncGraphClonerGraph 41.79% : 0.001716s : 27: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.147899 91 0.05% : 0.000069s : 1: add_recomputation 0.14% : 0.000210s : 1: auto_monad 0.02% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.32% : 0.000471s : 1: bootstrap 0.02% : 0.000028s : 1: cconv 0.01% : 0.000015s : 1: convert_after_rewriter 0.02% : 0.000036s : 1: cse_after_recomputation 0.01% : 0.000011s : 1: environ_conv 0.27% : 0.000395s : 1: event_method 0.01% : 0.000012s : 1: execute 0.01% : 0.000007s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 17.92% : 0.026499s : 1: jit_opt_a 0.13% : 0.000197s : 1: jit_opt_after_cconv 0.05% : 0.000076s : 1: jit_opt_b 0.30% : 0.000443s : 1: loop_unroll 0.43% : 0.000633s : 1: mutable_eliminate 3.25% : 0.004809s : 39: opt.transform.jit_opt_a 0.05% : 0.000080s : 4: opt.transform.jit_opt_after_cconv 0.03% : 0.000047s : 4: opt.transform.jit_opt_b 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000019s : 1: opt.transform.mutable_eliminate 0.02% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000050s : 4: opt.transform.symbol_engine_opt 0.33% : 0.000495s : 1: opt_after_jit_grad 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pre_auto_parallel 0.05% : 0.000068s : 1: py_interpret_to_execute 0.01% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000038s : 1: remove_dup_value 7.84% : 0.011592s : 2: renormalize.infer 1.70% : 0.002521s : 2: renormalize.specialize 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.14% : 0.000207s : 1: rewriter_after_opt_a 0.15% : 0.000219s : 1: rewriter_before_opt_a 0.08% : 0.000116s : 1: symbol_engine_optimizer 9.94% : 0.014695s : 1: task_emit 56.54% : 0.083625s : 1: type_inference 0.05% : 0.000076s : 1: validate TotalTime = 0.136075, [33] [bootstrap]: 0.00058959 [type_inference]: 0.0880873 [event_method]: 0.00016402 [auto_monad]: 0.00019077 [graph_reusing]: 1.044e-05 [pre_auto_parallel]: 3.91999e-06 [py_interpret_to_execute]: 6.786e-05 [rewriter_before_opt_a]: 0.00021226 [expand_dump_flag]: 4.45e-06 [jit_opt_a]: 0.0271363, [3] [Cycle 1]: 0.0190207, [27] [switch_simplify]: 0.00021751 [loop_unroll]: 9.802e-05 [a_1]: 0.00206576 [with_stream_mark]: 2.883e-05 [recompute_prepare]: 2.71e-05 [updatestate_depend_eliminate]: 1.06e-05 [updatestate_assign_eliminate]: 8.85001e-06 [updatestate_loads_eliminate]: 9.20001e-06 [parameter_eliminate]: 2.71e-06 [specialize_transform]: 1.763e-05 [updatestate_useless_node_eliminater]: 1.952e-05 [accelerated_algorithm]: 6.493e-05 [meta_shard_fg_expand]: 5.35999e-06 [get_grad_eliminate_]: 1.755e-05 [merge_forward]: 1.057e-05 [cell_reuse_recompute_pass]: 8.99978e-07 [cell_reuse_handle_not_recompute_node_pass]: 3.707e-05 [j_node_and_user_rematch]: 3.234e-05 [meta_fg_expand]: 0.00206648 [replace_old_param]: 7.097e-05 [inline_without_move]: 6.529e-05 [renormalize]: 0.0134558 [add_forward_monad_depend]: 3.989e-05 [auto_monad_grad]: 6.35002e-06 [auto_monad_eliminator]: 6.432e-05 [cse]: 0.00026535 [replace_applicator]: 8.499e-05 [Cycle 2]: 0.00336954, [27] [switch_simplify]: 4.876e-05 [loop_unroll]: 4.632e-05 [a_1]: 0.00142175 [with_stream_mark]: 1.55e-05 [recompute_prepare]: 1.088e-05 [updatestate_depend_eliminate]: 5.16002e-06 [updatestate_assign_eliminate]: 4.69998e-06 [updatestate_loads_eliminate]: 3.97e-06 [parameter_eliminate]: 1.44e-06 [specialize_transform]: 9.31e-06 [updatestate_useless_node_eliminater]: 9.00001e-06 [accelerated_algorithm]: 1.309e-05 [meta_shard_fg_expand]: 2.66e-06 [get_grad_eliminate_]: 8.59e-06 [merge_forward]: 5.32001e-06 [cell_reuse_recompute_pass]: 9.5999e-07 [cell_reuse_handle_not_recompute_node_pass]: 1.811e-05 [j_node_and_user_rematch]: 1.375e-05 [meta_fg_expand]: 0.00038062 [replace_old_param]: 1.998e-05 [inline_without_move]: 9.77001e-06 [renormalize]: 0.00103537 [add_forward_monad_depend]: 4.2e-06 [auto_monad_grad]: 1.59998e-06 [auto_monad_eliminator]: 1.383e-05 [cse]: 8.051e-05 [replace_applicator]: 1.664e-05 [Cycle 3]: 0.00050431, [27] [switch_simplify]: 9.93002e-06 [loop_unroll]: 9.00999e-06 [a_1]: 0.00018258 [with_stream_mark]: 1.143e-05 [recompute_prepare]: 8.64e-06 [updatestate_depend_eliminate]: 4.72998e-06 [updatestate_assign_eliminate]: 4e-06 [updatestate_loads_eliminate]: 3.58e-06 [parameter_eliminate]: 9.00007e-07 [specialize_transform]: 8.49002e-06 [updatestate_useless_node_eliminater]: 8.33001e-06 [accelerated_algorithm]: 1.468e-05 [meta_shard_fg_expand]: 1.79e-06 [get_grad_eliminate_]: 8.38999e-06 [merge_forward]: 4.33999e-06 [cell_reuse_recompute_pass]: 1.52999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.708e-05 [j_node_and_user_rematch]: 1.367e-05 [meta_fg_expand]: 3.00002e-06 [replace_old_param]: 1.217e-05 [inline_without_move]: 8.60999e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.62999e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 9.34998e-06 [cse]: 3.14e-05 [replace_applicator]: 9.15999e-06 [py_interpret_to_execute_after_opt_a]: 1.306e-05 [rewriter_after_opt_a]: 0.0001775 [convert_after_rewriter]: 1.056e-05 [order_py_execute_after_rewriter]: 6.51e-06 [mutable_eliminate]: 0.00061868 [jit_opt_b]: 7.422e-05, [1] [Cycle 1]: 6.632e-05, [2] [frontend_op_eliminate]: 2.772e-05 [inline_after_opt_a]: 2.541e-05 [cconv]: 2.457e-05 [loop_unroll]: 0.00042974 [jit_opt_after_cconv]: 0.00019246, [1] [Cycle 1]: 0.0001851, [11] [c_1]: 3.739e-05 [parameter_eliminate]: 2.24999e-06 [updatestate_depend_eliminate]: 7.12997e-06 [updatestate_assign_eliminate]: 5.12e-06 [updatestate_loads_eliminate]: 3.64002e-06 [cse]: 3.096e-05 [call_graph_tuple_transform]: 2.641e-05 [tuple_list_get_item_eliminator]: 9.12999e-06 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 7.00005e-07 [switch_simplify]: 9.34e-06 [remove_dup_value]: 3.498e-05 [partial_unused_args_eliminate]: 3.18e-06 [environ_conv]: 9.42999e-06 [add_recomputation]: 9.088e-05 [cse_after_recomputation]: 3.24e-05, [1] [Cycle 1]: 2.641e-05, [1] [cse]: 1.979e-05 [auto_monad_reorder]: 2.045e-05 [get_jit_bprop_graph]: 1.95001e-06 [rewriter_after_jit_bprop_graph]: 4.28999e-06 [opt_after_jit_grad]: 0.00047689 [symbol_engine_optimizer]: 0.0001172, [1] [Cycle 1]: 0.00010957, [6] [build]: 2.6e-05 [elim_shapecalc]: 1.209e-05 [elim_not_effective]: 1.845e-05 [opt_reshape]: 1.038e-05 [fold_const_symbol]: 1.405e-05 [renormalize]: 4.30009e-07 [validate]: 7.957e-05 [backend_pass]: 1.09998e-06 [task_emit]: 0.0169494 [execute]: 7.67002e-06 Sums bootstrap : 0.000590s : 0.45% type_inference : 0.088087s : 67.26% event_method : 0.000164s : 0.13% auto_monad : 0.000191s : 0.15% graph_reusing : 0.000010s : 0.01% pre_auto_parallel : 0.000004s : 0.00% py_interpret_to_execute : 0.000068s : 0.05% rewriter_before_opt_a : 0.000212s : 0.16% expand_dump_flag : 0.000004s : 0.00% jit_opt_a.switch_simplify : 0.000276s : 0.21% jit_opt_a.loop_unroll : 0.000153s : 0.12% jit_opt_a.a_1 : 0.003670s : 2.80% jit_opt_a.with_stream_mark : 0.000056s : 0.04% jit_opt_a.recompute_prepare : 0.000047s : 0.04% jit_opt_a.updatestate_depend_eliminate : 0.000020s : 0.02% jit_opt_a.updatestate_assign_eliminate : 0.000018s : 0.01% jit_opt_a.updatestate_loads_eliminate : 0.000017s : 0.01% jit_opt_a.parameter_eliminate : 0.000005s : 0.00% jit_opt_a.specialize_transform : 0.000035s : 0.03% jit_opt_a.updatestate_useless_node_eliminater : 0.000037s : 0.03% jit_opt_a.accelerated_algorithm : 0.000093s : 0.07% jit_opt_a.meta_shard_fg_expand : 0.000010s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000035s : 0.03% jit_opt_a.merge_forward : 0.000020s : 0.02% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000072s : 0.06% jit_opt_a.j_node_and_user_rematch : 0.000060s : 0.05% jit_opt_a.meta_fg_expand : 0.002450s : 1.87% jit_opt_a.replace_old_param : 0.000103s : 0.08% jit_opt_a.inline_without_move : 0.000084s : 0.06% jit_opt_a.renormalize : 0.014491s : 11.07% jit_opt_a.add_forward_monad_depend : 0.000046s : 0.03% jit_opt_a.auto_monad_grad : 0.000009s : 0.01% jit_opt_a.auto_monad_eliminator : 0.000087s : 0.07% jit_opt_a.cse : 0.000377s : 0.29% jit_opt_a.replace_applicator : 0.000111s : 0.08% py_interpret_to_execute_after_opt_a : 0.000013s : 0.01% rewriter_after_opt_a : 0.000178s : 0.14% convert_after_rewriter : 0.000011s : 0.01% order_py_execute_after_rewriter : 0.000007s : 0.00% mutable_eliminate : 0.000619s : 0.47% jit_opt_b.frontend_op_eliminate : 0.000028s : 0.02% jit_opt_b.inline_after_opt_a : 0.000025s : 0.02% cconv : 0.000025s : 0.02% loop_unroll : 0.000430s : 0.33% jit_opt_after_cconv.c_1 : 0.000037s : 0.03% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.cse : 0.000031s : 0.02% jit_opt_after_cconv.call_graph_tuple_transform : 0.000026s : 0.02% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000009s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000009s : 0.01% remove_dup_value : 0.000035s : 0.03% partial_unused_args_eliminate : 0.000003s : 0.00% environ_conv : 0.000009s : 0.01% add_recomputation : 0.000091s : 0.07% cse_after_recomputation.cse : 0.000020s : 0.02% auto_monad_reorder : 0.000020s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000477s : 0.36% symbol_engine_optimizer.build : 0.000026s : 0.02% symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.01% symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000080s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.016949s : 12.94% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.000986 174 0.22% : 0.000002s : 4: substitution.elim_not_effective 0.20% : 0.000002s : 4: substitution.fold_const_symbol 0.66% : 0.000006s : 6: substitution.graph_param_transform 65.60% : 0.000647s : 23: substitution.inline 1.65% : 0.000016s : 2: substitution.inline_without_move 1.01% : 0.000010s : 18: substitution.j_node_and_user_rematch 4.52% : 0.000045s : 3: substitution.less_batch_normalization 1.49% : 0.000015s : 11: substitution.minmaximum_grad 2.55% : 0.000025s : 10: substitution.partial_eliminate 1.44% : 0.000014s : 18: substitution.remove_not_recompute_node 2.56% : 0.000025s : 9: substitution.replace_applicator 1.16% : 0.000011s : 16: substitution.replace_old_param 0.28% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.41% : 0.000014s : 4: substitution.switch_simplify 3.20% : 0.000032s : 11: substitution.tuple_list_convert_item_index_to_positive 5.76% : 0.000057s : 11: substitution.tuple_list_get_item_depend_reorder 6.29% : 0.000062s : 23: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.087934 2 94.26% : 0.082889s : 1: type_inference.infer 5.74% : 0.005045s : 1: type_inference.specialize ------[replace.] 0.000343 39 56.39% : 0.000194s : 23: replace.inline 18.40% : 0.000063s : 4: replace.switch_simplify 25.21% : 0.000087s : 12: replace.tuple_list_get_item_eliminator ------[match.] 0.000678 39 93.63% : 0.000634s : 23: match.inline 1.77% : 0.000012s : 4: match.switch_simplify 4.60% : 0.000031s : 12: match.tuple_list_get_item_eliminator ------[predicate.] 0.000564 3976 1.46% : 0.000008s : 66: predicate.accumulaten_eliminater 0.40% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 1.40% : 0.000008s : 66: predicate.addn_check_dump 1.53% : 0.000009s : 66: predicate.addn_zero_filter 2.07% : 0.000012s : 66: predicate.arithmetic_simplify 1.58% : 0.000009s : 66: predicate.cast_eliminate 0.18% : 0.000001s : 6: predicate.check_bprop_eliminate 1.43% : 0.000008s : 66: predicate.compare_switch_simplify 1.52% : 0.000009s : 66: predicate.depend_value_elim 1.45% : 0.000008s : 66: predicate.dict_get_item_const_eliminator 1.54% : 0.000009s : 66: predicate.dict_get_item_eliminator 1.46% : 0.000008s : 66: predicate.dict_set_item_eliminator 0.25% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.12% : 0.000001s : 6: predicate.elim_not_effective 0.19% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.54% : 0.000009s : 66: predicate.environ_add_const_eliminate 1.46% : 0.000008s : 66: predicate.environ_get_add_eliminate 1.47% : 0.000008s : 66: predicate.environ_get_depend_swap 1.47% : 0.000008s : 66: predicate.environ_get_eliminate 1.44% : 0.000008s : 66: predicate.environ_get_set_eliminate 0.09% : 0.000001s : 6: predicate.fold_const_symbol 0.77% : 0.000004s : 27: predicate.get_grad_eliminate 0.09% : 0.000001s : 6: predicate.graph_param_transform 4.49% : 0.000025s : 113: predicate.inline 1.64% : 0.000009s : 56: predicate.inline_without_move 0.35% : 0.000002s : 27: predicate.j_node_and_user_rematch 0.90% : 0.000005s : 27: predicate.less_batch_normalization 1.90% : 0.000011s : 78: predicate.list_to_tuple_eliminator_ 1.93% : 0.000011s : 84: predicate.load_eliminater 0.39% : 0.000002s : 6: predicate.loop_unroll_after_grad 3.63% : 0.000020s : 147: predicate.loop_unroll_before_grad 1.76% : 0.000010s : 72: predicate.make_slice_get_slice_eliminator 1.43% : 0.000008s : 66: predicate.merge_addn 1.52% : 0.000009s : 66: predicate.minmaximum_grad 0.47% : 0.000003s : 6: predicate.mutable_eliminate 0.20% : 0.000001s : 6: predicate.opt_reshape 2.41% : 0.000014s : 84: predicate.partial_eliminate 1.46% : 0.000008s : 66: predicate.print_const_string_wrapper 2.00% : 0.000011s : 66: predicate.reduce_eliminate 1.82% : 0.000010s : 78: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000002s : 27: predicate.remove_not_recompute_node 2.46% : 0.000014s : 137: predicate.replace_applicator 0.85% : 0.000005s : 56: predicate.replace_old_param 0.11% : 0.000001s : 6: predicate.reset_defer_inline 1.54% : 0.000009s : 66: predicate.reshape_eliminate 1.49% : 0.000008s : 66: predicate.row_tensor_add_zeros_like 0.22% : 0.000001s : 6: predicate.row_tensor_eliminate 1.51% : 0.000008s : 66: predicate.same_eliminate 0.45% : 0.000003s : 27: predicate.set_cell_output_no_recompute 0.37% : 0.000002s : 12: predicate.special_op_eliminate 0.80% : 0.000004s : 27: predicate.specialize_transform 1.76% : 0.000010s : 66: predicate.split_environ_get_set_with_tuple_value 1.50% : 0.000008s : 66: predicate.stack_unstack_eliminate 0.16% : 0.000001s : 6: predicate.switch_call_monad_eliminater 2.94% : 0.000017s : 101: predicate.switch_defer_inline 2.71% : 0.000015s : 101: predicate.switch_layer_defer_inline 7.07% : 0.000040s : 262: predicate.switch_simplify 1.57% : 0.000009s : 66: predicate.tile_eliminate 1.56% : 0.000009s : 66: predicate.transpose_eliminate 1.88% : 0.000011s : 66: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000010s : 66: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000018s : 90: predicate.tuple_list_get_item_eliminator 1.87% : 0.000011s : 66: predicate.tuple_list_set_item_eliminator 1.81% : 0.000010s : 78: predicate.tuple_to_list_eliminator_ 1.86% : 0.000010s : 84: predicate.updatestate_pure_node_eliminater 2.82% : 0.000016s : 111: predicate.updatestate_useless_node_eliminater 1.89% : 0.000011s : 66: predicate.value_based_eliminate 0.13% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.22% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004254 50 59.97% : 0.002551s : 23: func_graph_cloner_run.FuncGraphClonerGraph 40.03% : 0.001703s : 27: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.155470 91 0.06% : 0.000094s : 1: add_recomputation 0.13% : 0.000198s : 1: auto_monad 0.01% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.39% : 0.000607s : 1: bootstrap 0.02% : 0.000027s : 1: cconv 0.01% : 0.000013s : 1: convert_after_rewriter 0.02% : 0.000035s : 1: cse_after_recomputation 0.01% : 0.000012s : 1: environ_conv 0.11% : 0.000172s : 1: event_method 0.01% : 0.000012s : 1: execute 0.00% : 0.000007s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 17.46% : 0.027140s : 1: jit_opt_a 0.13% : 0.000195s : 1: jit_opt_after_cconv 0.05% : 0.000077s : 1: jit_opt_b 0.28% : 0.000437s : 1: loop_unroll 0.40% : 0.000627s : 1: mutable_eliminate 3.04% : 0.004725s : 39: opt.transform.jit_opt_a 0.05% : 0.000079s : 4: opt.transform.jit_opt_after_cconv 0.03% : 0.000047s : 4: opt.transform.jit_opt_b 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000018s : 1: opt.transform.mutable_eliminate 0.02% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000051s : 4: opt.transform.symbol_engine_opt 0.31% : 0.000484s : 1: opt_after_jit_grad 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pre_auto_parallel 0.05% : 0.000071s : 1: py_interpret_to_execute 0.01% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000038s : 1: remove_dup_value 7.65% : 0.011898s : 2: renormalize.infer 1.66% : 0.002575s : 2: renormalize.specialize 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.12% : 0.000182s : 1: rewriter_after_opt_a 0.14% : 0.000216s : 1: rewriter_before_opt_a 0.08% : 0.000120s : 1: symbol_engine_optimizer 10.91% : 0.016965s : 1: task_emit 56.67% : 0.088104s : 1: type_inference 0.07% : 0.000112s : 1: validate ..............group_cases_14 have all been run, results of sub cases are below: case: ('pynative', 'sum') {} pass. case: ('pynative', 'none') {} pass. case: ('pynative', 'mean') {} pass. case: (1, mindspore.bfloat16, 0) {} pass. case: (1, mindspore.bfloat16, 1) {} pass. case: ('KBK', 'mean') {} pass. case: ('KBK', 'sum') {} pass. case: (1,) {} pass. ops group_cases_15 with 8 cases start to running, all cases are below: case: (, 'KBK', 'none') case: (, 'graph', 'mean') case: (, 'graph', 'sum') case: (, 'graph', 'none') case: (, 0) case: (, 1) case: (, 'pynative', ) case: (, 'pynative', ) ops group_cases_15 total running memory: 32M, memory threshold: 51200M [WARNING] PARSER(50334,ffffbf434f30,python3.9):2026-01-29-17:49:57.888.201 [mindspore/ccsrc/frontend/jit/ps/parse/data_converter.cc:661] CheckAPI] The mint interface squeeze was called, and the operators under this interface have different view capabilities on pynative and graph mode. Use this interface with caution in graph mode, as it may produce unexpected results. For more information, please refer to: https://www.mindspore.cn/docs/en/master/features/view.html TotalTime = 2.13459, [24] [bootstrap]: 0.00094963 [type_inference]: 0.174389 [event_method]: 0.0003695 [auto_monad]: 0.00014197 [graph_reusing]: 7.60998e-06 [inline]: 2.49001e-06 [add_attr]: 0.0075443, [1] [add_attr_with_inline]: 0.00752801, [1] [Cycle 1]: 0.00013573, [2] [tag_attr]: 3.925e-05 [meta_addattr_fg_expand]: 1.667e-05 [parallel-infer-symbol]: 3.11001e-06 [pre_auto_parallel]: 5.602e-05 [insert-virtual-dataset]: 2.64001e-06 [parallel-infer-symbol-second]: 9.50007e-07 [dataset_repeat_opt]: 2.29999e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.007425, [53] [py_interpret_to_execute]: 4.05998e-06 [rewriter_before_opt_a]: 0.00026073 [opt_a]: 0.0048555, [2] [Cycle 1]: 0.00423981, [45] [expand_dump_flag]: 3.93001e-06 [switch_simplify]: 8.682e-05 [loop_unroll]: 4.062e-05 [a_1]: 0.00078518 [with_stream_mark]: 1.622e-05 [recompute_prepare]: 8.75001e-06 [updatestate_depend_eliminate]: 1.43e-05 [updatestate_assign_eliminate]: 1.247e-05 [updatestate_loads_eliminate]: 2.89001e-06 [parameter_eliminate]: 2.04e-06 [a_2]: 8.261e-05 [accelerated_algorithm]: 6.89999e-06 [shard]: 2.02999e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 6.37001e-06 [merge_send_recv]: 4.017e-05 [auto_parallel]: 6.61e-06 [parallel]: 7.987e-05 [flash_sp]: 3.363e-05 [merge_comm]: 4.40999e-06 [allreduce_fusion]: 1.057e-05 [matmul_add_comm_reduction]: 1.744e-05 [allreduce_slice_to_reducescatter]: 8.50999e-06 [virtual_shard_identity]: 9.37001e-06 [virtual_dataset]: 6.75998e-06 [get_grad_eliminate_]: 6.18002e-06 [virtual_output]: 6.54999e-06 [merge_forward]: 4.06001e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.676e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.363e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.055e-05 [set_forward_comm_id_for_comm_node_pass]: 1.065e-05 [meta_fg_expand]: 3.58e-06 [flash_sp_send_recv_attached]: 2.56998e-06 [receive_attached]: 1.848e-05 [after_resolve]: 1.07e-05 [a_after_grad]: 9.76998e-06 [renormalize]: 0.00244491 [add_forward_monad_depend]: 6.01e-06 [auto_monad_grad]: 2.42001e-06 [auto_monad_eliminator]: 2.707e-05 [cse]: 5.068e-05 [a_3]: 4.63e-05 [Cycle 2]: 0.00060452, [45] [expand_dump_flag]: 1.34003e-06 [switch_simplify]: 7.35e-06 [loop_unroll]: 6.21e-06 [a_1]: 0.00012529 [with_stream_mark]: 1.248e-05 [recompute_prepare]: 5.99999e-06 [updatestate_depend_eliminate]: 3.06999e-06 [updatestate_assign_eliminate]: 2.17999e-06 [updatestate_loads_eliminate]: 2.64001e-06 [parameter_eliminate]: 1.22e-06 [a_2]: 7.087e-05 [accelerated_algorithm]: 6.12999e-06 [shard]: 1.21002e-06 [meta_shard_fg_expand]: 1.39e-06 [shard_inline]: 5.96e-06 [merge_send_recv]: 4.62998e-06 [auto_parallel]: 5.29e-06 [parallel]: 4.56002e-06 [flash_sp]: 2.93e-06 [merge_comm]: 2.74999e-06 [allreduce_fusion]: 2.52001e-06 [matmul_add_comm_reduction]: 5.20999e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 6.56e-06 [virtual_dataset]: 5.73002e-06 [get_grad_eliminate_]: 5.79e-06 [virtual_output]: 5.73002e-06 [merge_forward]: 2.68e-06 [cell_reuse_recompute_pass]: 1.79e-06 [offload_activation]: 6.25997e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.217e-05 [merge_recompute_call_nodes]: 6.09987e-07 [before_grad]: 9.37999e-06 [set_forward_comm_id_for_comm_node_pass]: 2.78998e-06 [meta_fg_expand]: 2.00002e-06 [flash_sp_send_recv_attached]: 7.39994e-07 [receive_attached]: 1.07e-06 [after_resolve]: 9.15999e-06 [a_after_grad]: 8.65001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.35001e-06 [auto_monad_grad]: 7.50006e-07 [auto_monad_eliminator]: 6.12001e-06 [cse]: 1.271e-05 [a_3]: 3.453e-05 [py_interpret_to_execute_after_opt_a]: 4.62e-06 [slice_cell_reuse_recomputed_activation]: 2.04e-06 [rewriter_after_opt_a]: 2.769e-05 [convert_after_rewriter]: 1.24e-06 [order_py_execute_after_rewriter]: 1.10001e-06 [mutable_eliminate]: 0.00061545 [opt_b]: 0.00024207, [1] [Cycle 1]: 0.00023501, [7] [b_1]: 0.00011802 [b_2]: 5.283e-05 [updatestate_depend_eliminate]: 5.17999e-06 [updatestate_assign_eliminate]: 2.66999e-06 [updatestate_loads_eliminate]: 2.46e-06 [renormalize]: 3.60014e-07 [cse]: 1.753e-05 [optimize_parallel_all_gather_comm]: 2.758e-05 [overlap_param_gather]: 1.157e-05 [cconv]: 2.552e-05 [loop_unroll]: 0.00047617 [opt_after_cconv]: 9.838e-05, [1] [Cycle 1]: 9.212e-05, [7] [c_1]: 3.057e-05 [parameter_eliminate]: 2.79001e-06 [updatestate_depend_eliminate]: 4.81002e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.11e-06 [cse]: 1.688e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.415e-05 [tuple_transform]: 7.365e-05, [1] [Cycle 1]: 6.926e-05, [4] [d_1]: 4.312e-05 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 6.56e-06 [partial_unused_args_eliminate]: 1.75001e-06 [add_recomputation]: 0.00010967 [cse_after_recomputation]: 2.227e-05, [1] [Cycle 1]: 1.771e-05, [1] [cse]: 1.225e-05 [environ_conv]: 2.093e-05 [swap_dp_allreduce_reducescatter]: 2.359e-05 [bias_add_comm_swap]: 1.017e-05 [label_micro_interleaved_index]: 1.265e-05 [label_fine_grained_interleaved_index]: 2.62001e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 1.93997e-06 [micro_interleaved_order_control]: 2.36998e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 7.29982e-07 [remove_cast_before_assign_add]: 8.47e-06 [full_micro_interleaved_order_control]: 9.54999e-06 [reorder_send_recv_between_fp_bp]: 2.68e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.09998e-06 [interleave_parallel_branches]: 8.60001e-06 [overlap_opt_shard_in_pipeline]: 2.053e-05 [overlap_opt_shard_grad_in_pipeline]: 1.97001e-06 [control_data_broadcast_order]: 1.201e-05 [grouped_pairwise_exchange_alltoall]: 1.35001e-06 [offloading_packed_experts]: 3.95998e-06 [overlap_recompute_and_grad_model_parallel]: 1.221e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.27e-06 [overlap_recompute_comm]: 2.56e-06 [overlap_grad_ring_attention]: 1.912e-05 [overlap_grad_flash_sp]: 4.456e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 1.003e-05 [split_layernorm_comm]: 1.76e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 7.589e-05, [1] [Cycle 1]: 7.169e-05, [6] [build]: 2.86999e-06 [elim_shapecalc]: 1.063e-05 [elim_not_effective]: 1.29e-05 [opt_reshape]: 7.18e-06 [fold_const_symbol]: 1.022e-05 [renormalize]: 3.10014e-07 [detach_backward]: 1.62999e-06 [pipeline_parallel_scheduler]: 1.42999e-06 [auto_monad_reorder]: 2.123e-05 [get_jit_bprop_graph]: 1.49998e-06 [rewriter_after_jit_bprop_graph]: 3.14999e-06 [opt_after_jit_grad]: 0.00052186 [validate]: 7.587e-05 [backend_pass]: 1.06997e-06 [task_emit]: 1.9427 [execute]: 9.74e-06 Sums bootstrap : 0.000950s : 0.04% type_inference : 0.174389s : 8.20% event_method : 0.000369s : 0.02% auto_monad : 0.000142s : 0.01% graph_reusing : 0.000008s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000039s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000017s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000056s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000261s : 0.01% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000094s : 0.00% optimize.opt_a.loop_unroll : 0.000047s : 0.00% optimize.opt_a.a_1 : 0.000910s : 0.04% optimize.opt_a.with_stream_mark : 0.000029s : 0.00% optimize.opt_a.recompute_prepare : 0.000015s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000017s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000015s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000153s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000012s : 0.00% optimize.opt_a.merge_send_recv : 0.000045s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000084s : 0.00% optimize.opt_a.flash_sp : 0.000037s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000013s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.00% optimize.opt_a.virtual_dataset : 0.000012s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000020s : 0.00% optimize.opt_a.after_resolve : 0.000020s : 0.00% optimize.opt_a.a_after_grad : 0.000018s : 0.00% optimize.opt_a.renormalize : 0.002445s : 0.12% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.00% optimize.opt_a.cse : 0.000063s : 0.00% optimize.opt_a.a_3 : 0.000081s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000615s : 0.03% optimize.opt_b.b_1 : 0.000118s : 0.01% optimize.opt_b.b_2 : 0.000053s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000026s : 0.00% optimize.loop_unroll : 0.000476s : 0.02% optimize.opt_after_cconv.c_1 : 0.000031s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000017s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.00% optimize.tuple_transform.d_1 : 0.000043s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000110s : 0.01% optimize.cse_after_recomputation.cse : 0.000012s : 0.00% optimize.environ_conv : 0.000021s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000024s : 0.00% optimize.bias_add_comm_swap : 0.000010s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000008s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000021s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.00% optimize.overlap_grad_flash_sp : 0.000045s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000021s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000522s : 0.02% validate : 0.000076s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 1.942700s : 91.38% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000246 28 0.73% : 0.000002s : 2: substitution.elim_not_effective 0.59% : 0.000001s : 2: substitution.fold_const_symbol 2.36% : 0.000006s : 4: substitution.graph_param_transform 78.00% : 0.000192s : 7: substitution.inline 1.40% : 0.000003s : 4: substitution.j_node_and_user_rematch 5.39% : 0.000013s : 4: substitution.remove_not_recompute_node 1.24% : 0.000003s : 2: substitution.replace_old_param 10.28% : 0.000025s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.174298 2 98.49% : 0.171673s : 1: type_inference.infer 1.51% : 0.002625s : 1: type_inference.specialize ------[replace.] 0.000084 10 72.67% : 0.000061s : 7: replace.inline 27.33% : 0.000023s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000212 10 88.88% : 0.000188s : 7: match.inline 11.12% : 0.000024s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000218 1449 1.01% : 0.000002s : 16: predicate.accumulaten_eliminater 0.94% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 8: predicate.addn_check_dump 0.96% : 0.000002s : 16: predicate.addn_zero_filter 0.94% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.29% : 0.000005s : 24: predicate.arithmetic_simplify 1.02% : 0.000002s : 16: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.49% : 0.000001s : 8: predicate.depend_value_elim 0.98% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.16% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.29% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 20: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 20: predicate.environ_get_depend_swap 1.64% : 0.000004s : 28: predicate.environ_get_eliminate 1.20% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.62% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.43% : 0.000005s : 26: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.50% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 5.82% : 0.000013s : 66: predicate.inline 0.59% : 0.000001s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 8: predicate.less_batch_normalization 1.68% : 0.000004s : 27: predicate.list_to_tuple_eliminator_ 2.44% : 0.000005s : 43: predicate.load_eliminater 0.75% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.04% : 0.000007s : 48: predicate.loop_unroll_before_grad 1.68% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 16: predicate.minmaximum_grad 0.88% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 2.01% : 0.000004s : 26: predicate.partial_defer_inline 1.52% : 0.000003s : 23: predicate.partial_eliminate 1.05% : 0.000002s : 16: predicate.print_const_string_wrapper 0.54% : 0.000001s : 8: predicate.reduce_all_const_elim 1.29% : 0.000003s : 16: predicate.reduce_eliminate 2.62% : 0.000006s : 43: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000001s : 8: predicate.remove_not_recompute_node 1.45% : 0.000003s : 27: predicate.replace_applicator 0.46% : 0.000001s : 8: predicate.replace_old_param 0.23% : 0.000001s : 4: predicate.reset_defer_inline 1.07% : 0.000002s : 16: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.69% : 0.000002s : 8: predicate.same_eliminate 0.39% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 8: predicate.shard_identity_eliminate 0.69% : 0.000002s : 8: predicate.special_op_eliminate 0.59% : 0.000001s : 8: predicate.specialize_transform 0.71% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.65% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.77% : 0.000004s : 26: predicate.switch_defer_inline 2.16% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.79% : 0.000013s : 86: predicate.switch_simplify 1.12% : 0.000002s : 16: predicate.tile_eliminate 1.00% : 0.000002s : 16: predicate.transpose_eliminate 1.67% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 24: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000003s : 24: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000007s : 35: predicate.tuple_list_get_item_eliminator 1.65% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 27: predicate.tuple_to_list_eliminator_ 2.48% : 0.000005s : 43: predicate.updatestate_pure_node_eliminater 3.04% : 0.000007s : 51: predicate.updatestate_useless_node_eliminater 0.29% : 0.000001s : 4: predicate.value_based_eliminate 0.58% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.76% : 0.000002s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.32% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002181 25 67.32% : 0.001468s : 16: func_graph_cloner_run.FuncGraphClonerGraph 32.68% : 0.000713s : 9: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.153570 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.35% : 0.007548s : 1: add_attr 0.35% : 0.007532s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000114s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000152s : 1: auto_monad 0.00% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000013s : 1: bias_add_comm_swap 0.05% : 0.001001s : 1: bootstrap 0.00% : 0.000029s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000025s : 1: environ_conv 0.02% : 0.000382s : 1: event_method 0.00% : 0.000024s : 1: execute 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.02% : 0.000485s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.03% : 0.000624s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.06% : 0.001385s : 78: opt.transform.opt_a 0.00% : 0.000029s : 1: opt.transform.opt_after_cconv 0.00% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000144s : 28: opt.transform.opt_b 0.00% : 0.000048s : 2: opt.transform.opt_trans_graph 0.00% : 0.000037s : 4: opt.transform.symbol_engine_opt 0.23% : 0.004859s : 1: opt_a 0.00% : 0.000102s : 1: opt_after_cconv 0.02% : 0.000532s : 1: opt_after_jit_grad 0.01% : 0.000245s : 1: opt_b 0.34% : 0.007430s : 1: optimize 0.00% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000048s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000022s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000025s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000061s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000011s : 1: remove_cast_before_assign_add 0.00% : 0.000018s : 1: remove_dup_value 0.08% : 0.001669s : 1: renormalize.infer 0.04% : 0.000765s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000031s : 1: rewriter_after_opt_a 0.01% : 0.000267s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000027s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000079s : 1: symbol_engine_optimizer 90.21% : 1.942789s : 1: task_emit 0.00% : 0.000077s : 1: tuple_transform 8.10% : 0.174410s : 1: type_inference 0.01% : 0.000108s : 1: validate TotalTime = 2.46972, [33] [bootstrap]: 0.00071018 [type_inference]: 0.0924889 [event_method]: 0.00013214 [auto_monad]: 0.00016954 [graph_reusing]: 6.96001e-06 [pre_auto_parallel]: 1.149e-05 [py_interpret_to_execute]: 3.694e-05 [rewriter_before_opt_a]: 0.00015172 [expand_dump_flag]: 3.63e-06 [jit_opt_a]: 0.0118279, [2] [Cycle 1]: 0.00317701, [27] [switch_simplify]: 0.00021346 [loop_unroll]: 4.736e-05 [a_1]: 0.00091745 [with_stream_mark]: 2.909e-05 [recompute_prepare]: 1.138e-05 [updatestate_depend_eliminate]: 1.671e-05 [updatestate_assign_eliminate]: 1.333e-05 [updatestate_loads_eliminate]: 3.75998e-06 [parameter_eliminate]: 2.17001e-06 [specialize_transform]: 7.56999e-06 [updatestate_useless_node_eliminater]: 6.28e-06 [accelerated_algorithm]: 6.39001e-06 [meta_shard_fg_expand]: 1.188e-05 [get_grad_eliminate_]: 6.42001e-06 [merge_forward]: 4.89e-06 [cell_reuse_recompute_pass]: 1.03001e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.754e-05 [j_node_and_user_rematch]: 1.165e-05 [meta_fg_expand]: 2.96001e-06 [replace_old_param]: 1.195e-05 [inline_without_move]: 6.75002e-06 [renormalize]: 0.00142723 [add_forward_monad_depend]: 1.843e-05 [auto_monad_grad]: 3.12002e-06 [auto_monad_eliminator]: 3.036e-05 [cse]: 5.894e-05 [replace_applicator]: 1.779e-05 [Cycle 2]: 0.00038033, [27] [switch_simplify]: 7.28e-06 [loop_unroll]: 6.12999e-06 [a_1]: 0.00011685 [with_stream_mark]: 1.24e-05 [recompute_prepare]: 6.17999e-06 [updatestate_depend_eliminate]: 3.18e-06 [updatestate_assign_eliminate]: 2.81999e-06 [updatestate_loads_eliminate]: 2.65002e-06 [parameter_eliminate]: 1.57001e-06 [specialize_transform]: 5.92999e-06 [updatestate_useless_node_eliminater]: 6.12001e-06 [accelerated_algorithm]: 6.18002e-06 [meta_shard_fg_expand]: 2.01e-06 [get_grad_eliminate_]: 5.65001e-06 [merge_forward]: 2.91e-06 [cell_reuse_recompute_pass]: 1.96e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.773e-05 [j_node_and_user_rematch]: 9.10999e-06 [meta_fg_expand]: 2.04999e-06 [replace_old_param]: 9.52999e-06 [inline_without_move]: 5.96998e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.96e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 7.48e-06 [cse]: 1.625e-05 [replace_applicator]: 7.02002e-06 [py_interpret_to_execute_after_opt_a]: 1.533e-05 [rewriter_after_opt_a]: 8.897e-05 [convert_after_rewriter]: 8.90001e-06 [order_py_execute_after_rewriter]: 5.56e-06 [mutable_eliminate]: 0.00072298 [jit_opt_b]: 5.712e-05, [1] [Cycle 1]: 4.941e-05, [2] [frontend_op_eliminate]: 1.842e-05 [inline_after_opt_a]: 1.973e-05 [cconv]: 3.208e-05 [loop_unroll]: 0.00048663 [jit_opt_after_cconv]: 0.00018595, [1] [Cycle 1]: 0.00017823, [11] [c_1]: 2.588e-05 [parameter_eliminate]: 3.86001e-06 [updatestate_depend_eliminate]: 8.11002e-06 [updatestate_assign_eliminate]: 2.91e-06 [updatestate_loads_eliminate]: 2.53e-06 [cse]: 2.827e-05 [call_graph_tuple_transform]: 2.243e-05 [tuple_list_get_item_eliminator]: 6.31998e-06 [none_parameter_eliminate]: 1.39e-06 [renormalize]: 7.50006e-07 [switch_simplify]: 7.00998e-06 [remove_dup_value]: 1.73e-05 [partial_unused_args_eliminate]: 2.44001e-06 [environ_conv]: 2.699e-05 [add_recomputation]: 7.109e-05 [cse_after_recomputation]: 2.977e-05, [1] [Cycle 1]: 2.287e-05, [1] [cse]: 1.579e-05 [auto_monad_reorder]: 2.67e-05 [get_jit_bprop_graph]: 2.37001e-06 [rewriter_after_jit_bprop_graph]: 3.76999e-06 [opt_after_jit_grad]: 0.00051429 [symbol_engine_optimizer]: 8.043e-05, [1] [Cycle 1]: 7.347e-05, [6] [build]: 3.93001e-06 [elim_shapecalc]: 9.14e-06 [elim_not_effective]: 1.522e-05 [opt_reshape]: 6.69001e-06 [fold_const_symbol]: 9.80002e-06 [renormalize]: 8.2e-07 [validate]: 6.486e-05 [backend_pass]: 1.11002e-06 [task_emit]: 2.36009 [execute]: 1.066e-05 Sums bootstrap : 0.000710s : 0.03% type_inference : 0.092489s : 3.76% event_method : 0.000132s : 0.01% auto_monad : 0.000170s : 0.01% graph_reusing : 0.000007s : 0.00% pre_auto_parallel : 0.000011s : 0.00% py_interpret_to_execute : 0.000037s : 0.00% rewriter_before_opt_a : 0.000152s : 0.01% expand_dump_flag : 0.000004s : 0.00% jit_opt_a.switch_simplify : 0.000221s : 0.01% jit_opt_a.loop_unroll : 0.000053s : 0.00% jit_opt_a.a_1 : 0.001034s : 0.04% jit_opt_a.with_stream_mark : 0.000041s : 0.00% jit_opt_a.recompute_prepare : 0.000018s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000020s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% jit_opt_a.parameter_eliminate : 0.000004s : 0.00% jit_opt_a.specialize_transform : 0.000013s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000012s : 0.00% jit_opt_a.accelerated_algorithm : 0.000013s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000014s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000012s : 0.00% jit_opt_a.merge_forward : 0.000008s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000055s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000021s : 0.00% jit_opt_a.meta_fg_expand : 0.000005s : 0.00% jit_opt_a.replace_old_param : 0.000021s : 0.00% jit_opt_a.inline_without_move : 0.000013s : 0.00% jit_opt_a.renormalize : 0.001427s : 0.06% jit_opt_a.add_forward_monad_depend : 0.000020s : 0.00% jit_opt_a.auto_monad_grad : 0.000004s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000038s : 0.00% jit_opt_a.cse : 0.000075s : 0.00% jit_opt_a.replace_applicator : 0.000025s : 0.00% py_interpret_to_execute_after_opt_a : 0.000015s : 0.00% rewriter_after_opt_a : 0.000089s : 0.00% convert_after_rewriter : 0.000009s : 0.00% order_py_execute_after_rewriter : 0.000006s : 0.00% mutable_eliminate : 0.000723s : 0.03% jit_opt_b.frontend_op_eliminate : 0.000018s : 0.00% jit_opt_b.inline_after_opt_a : 0.000020s : 0.00% cconv : 0.000032s : 0.00% loop_unroll : 0.000487s : 0.02% jit_opt_after_cconv.c_1 : 0.000026s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000028s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000022s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000006s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000007s : 0.00% remove_dup_value : 0.000017s : 0.00% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000027s : 0.00% add_recomputation : 0.000071s : 0.00% cse_after_recomputation.cse : 0.000016s : 0.00% auto_monad_reorder : 0.000027s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000514s : 0.02% symbol_engine_optimizer.build : 0.000004s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000065s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.360088s : 95.97% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000323 33 0.65% : 0.000002s : 2: substitution.elim_not_effective 0.47% : 0.000002s : 2: substitution.fold_const_symbol 1.77% : 0.000006s : 4: substitution.graph_param_transform 74.03% : 0.000239s : 8: substitution.inline 1.12% : 0.000004s : 4: substitution.j_node_and_user_rematch 4.56% : 0.000015s : 4: substitution.remove_not_recompute_node 1.77% : 0.000006s : 4: substitution.replace_old_param 6.56% : 0.000021s : 1: substitution.switch_simplify 9.06% : 0.000029s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.092392 2 97.90% : 0.090447s : 1: type_inference.infer 2.10% : 0.001944s : 1: type_inference.specialize ------[replace.] 0.000204 13 62.75% : 0.000128s : 8: replace.inline 23.60% : 0.000048s : 1: replace.switch_simplify 13.65% : 0.000028s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000282 13 83.08% : 0.000235s : 8: match.inline 7.21% : 0.000020s : 1: match.switch_simplify 9.70% : 0.000027s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000175 1126 1.22% : 0.000002s : 17: predicate.accumulaten_eliminater 1.00% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 1.18% : 0.000002s : 17: predicate.addn_check_dump 1.36% : 0.000002s : 17: predicate.addn_zero_filter 1.97% : 0.000003s : 17: predicate.arithmetic_simplify 1.30% : 0.000002s : 17: predicate.cast_eliminate 0.31% : 0.000001s : 4: predicate.check_bprop_eliminate 1.20% : 0.000002s : 17: predicate.compare_switch_simplify 1.31% : 0.000002s : 17: predicate.depend_value_elim 1.24% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.52% : 0.000003s : 17: predicate.dict_get_item_eliminator 1.26% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.56% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.64% : 0.000001s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.16% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.17% : 0.000002s : 17: predicate.environ_get_depend_swap 1.28% : 0.000002s : 17: predicate.environ_get_eliminate 1.18% : 0.000002s : 17: predicate.environ_get_set_eliminate 0.24% : 0.000000s : 4: predicate.fold_const_symbol 0.83% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 5.88% : 0.000010s : 37: predicate.inline 0.70% : 0.000001s : 8: predicate.inline_without_move 0.37% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.79% : 0.000001s : 8: predicate.less_batch_normalization 1.63% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 2.01% : 0.000004s : 25: predicate.load_eliminater 1.17% : 0.000002s : 4: predicate.loop_unroll_after_grad 4.45% : 0.000008s : 52: predicate.loop_unroll_before_grad 1.87% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 1.17% : 0.000002s : 17: predicate.merge_addn 1.16% : 0.000002s : 17: predicate.minmaximum_grad 1.66% : 0.000003s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 2.28% : 0.000004s : 25: predicate.partial_eliminate 1.21% : 0.000002s : 17: predicate.print_const_string_wrapper 1.78% : 0.000003s : 17: predicate.reduce_eliminate 1.59% : 0.000003s : 21: predicate.redundant_stop_gradient_eliminater 0.66% : 0.000001s : 8: predicate.remove_not_recompute_node 2.19% : 0.000004s : 29: predicate.replace_applicator 0.60% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000000s : 4: predicate.reset_defer_inline 1.25% : 0.000002s : 17: predicate.reshape_eliminate 1.23% : 0.000002s : 17: predicate.row_tensor_add_zeros_like 0.52% : 0.000001s : 4: predicate.row_tensor_eliminate 1.20% : 0.000002s : 17: predicate.same_eliminate 0.63% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.73% : 0.000001s : 8: predicate.special_op_eliminate 0.76% : 0.000001s : 8: predicate.specialize_transform 1.52% : 0.000003s : 17: predicate.split_environ_get_set_with_tuple_value 1.26% : 0.000002s : 17: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.76% : 0.000005s : 29: predicate.switch_defer_inline 2.36% : 0.000004s : 29: predicate.switch_layer_defer_inline 8.42% : 0.000015s : 87: predicate.switch_simplify 1.22% : 0.000002s : 17: predicate.tile_eliminate 1.28% : 0.000002s : 17: predicate.transpose_eliminate 1.60% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.41% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.71% : 0.000006s : 29: predicate.tuple_list_get_item_eliminator 1.51% : 0.000003s : 17: predicate.tuple_list_set_item_eliminator 1.70% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 1.80% : 0.000003s : 25: predicate.updatestate_pure_node_eliminater 2.64% : 0.000005s : 33: predicate.updatestate_useless_node_eliminater 1.73% : 0.000003s : 17: predicate.value_based_eliminate 0.27% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001399 19 58.86% : 0.000823s : 9: func_graph_cloner_run.FuncGraphClonerGraph 41.14% : 0.000575s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.471448 76 0.00% : 0.000074s : 1: add_recomputation 0.01% : 0.000177s : 1: auto_monad 0.00% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.03% : 0.000736s : 1: bootstrap 0.00% : 0.000035s : 1: cconv 0.00% : 0.000012s : 1: convert_after_rewriter 0.00% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000030s : 1: environ_conv 0.01% : 0.000139s : 1: event_method 0.00% : 0.000017s : 1: execute 0.00% : 0.000006s : 1: expand_dump_flag 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.48% : 0.011831s : 1: jit_opt_a 0.01% : 0.000189s : 1: jit_opt_after_cconv 0.00% : 0.000060s : 1: jit_opt_b 0.02% : 0.000495s : 1: loop_unroll 0.03% : 0.000734s : 1: mutable_eliminate 0.06% : 0.001457s : 26: opt.transform.jit_opt_a 0.00% : 0.000058s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000031s : 4: opt.transform.jit_opt_b 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000018s : 1: opt.transform.mutable_eliminate 0.00% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000038s : 4: opt.transform.symbol_engine_opt 0.02% : 0.000524s : 1: opt_after_jit_grad 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000013s : 1: pre_auto_parallel 0.00% : 0.000040s : 1: py_interpret_to_execute 0.00% : 0.000018s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000020s : 1: remove_dup_value 0.03% : 0.000751s : 1: renormalize.infer 0.03% : 0.000667s : 1: renormalize.specialize 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000095s : 1: rewriter_after_opt_a 0.01% : 0.000155s : 1: rewriter_before_opt_a 0.00% : 0.000083s : 1: symbol_engine_optimizer 95.50% : 2.360201s : 1: task_emit 3.74% : 0.092503s : 1: type_inference 0.00% : 0.000103s : 1: validate TotalTime = 0.141895, [33] [bootstrap]: 0.00062072 [type_inference]: 0.0941432 [event_method]: 0.00041154 [auto_monad]: 0.00019654 [graph_reusing]: 1.119e-05 [pre_auto_parallel]: 4.23001e-06 [py_interpret_to_execute]: 6.168e-05 [rewriter_before_opt_a]: 0.00021312 [expand_dump_flag]: 4.47e-06 [jit_opt_a]: 0.0290138, [3] [Cycle 1]: 0.0202276, [27] [switch_simplify]: 0.00025424 [loop_unroll]: 9.601e-05 [a_1]: 0.00209152 [with_stream_mark]: 3.203e-05 [recompute_prepare]: 2.831e-05 [updatestate_depend_eliminate]: 1.045e-05 [updatestate_assign_eliminate]: 8.87e-06 [updatestate_loads_eliminate]: 9.81e-06 [parameter_eliminate]: 3.14999e-06 [specialize_transform]: 2.084e-05 [updatestate_useless_node_eliminater]: 2.174e-05 [accelerated_algorithm]: 6.451e-05 [meta_shard_fg_expand]: 6.12001e-06 [get_grad_eliminate_]: 1.97e-05 [merge_forward]: 1.055e-05 [cell_reuse_recompute_pass]: 1.09998e-06 [cell_reuse_handle_not_recompute_node_pass]: 4.152e-05 [j_node_and_user_rematch]: 3.578e-05 [meta_fg_expand]: 0.00226425 [replace_old_param]: 7.637e-05 [inline_without_move]: 6.771e-05 [renormalize]: 0.0141563 [add_forward_monad_depend]: 4.618e-05 [auto_monad_grad]: 6.88998e-06 [auto_monad_eliminator]: 6.622e-05 [cse]: 0.00043003 [replace_applicator]: 9.439e-05 [Cycle 2]: 0.00350566, [27] [switch_simplify]: 4.934e-05 [loop_unroll]: 4.657e-05 [a_1]: 0.00145115 [with_stream_mark]: 1.765e-05 [recompute_prepare]: 1.067e-05 [updatestate_depend_eliminate]: 6.21e-06 [updatestate_assign_eliminate]: 4.84e-06 [updatestate_loads_eliminate]: 4.25e-06 [parameter_eliminate]: 2.24001e-06 [specialize_transform]: 9.56e-06 [updatestate_useless_node_eliminater]: 9.56e-06 [accelerated_algorithm]: 1.378e-05 [meta_shard_fg_expand]: 2.96999e-06 [get_grad_eliminate_]: 9.19e-06 [merge_forward]: 6.53e-06 [cell_reuse_recompute_pass]: 1.04e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.926e-05 [j_node_and_user_rematch]: 1.408e-05 [meta_fg_expand]: 0.0003898 [replace_old_param]: 2.174e-05 [inline_without_move]: 9.91e-06 [renormalize]: 0.00109679 [add_forward_monad_depend]: 4.4e-06 [auto_monad_grad]: 1.81e-06 [auto_monad_eliminator]: 1.561e-05 [cse]: 9.508e-05 [replace_applicator]: 1.693e-05 [Cycle 3]: 0.00057093, [27] [switch_simplify]: 9.89999e-06 [loop_unroll]: 8.94e-06 [a_1]: 0.00018298 [with_stream_mark]: 1.123e-05 [recompute_prepare]: 8.86002e-06 [updatestate_depend_eliminate]: 4.77998e-06 [updatestate_assign_eliminate]: 4.03001e-06 [updatestate_loads_eliminate]: 3.88001e-06 [parameter_eliminate]: 1.09998e-06 [specialize_transform]: 8.54998e-06 [updatestate_useless_node_eliminater]: 8.42e-06 [accelerated_algorithm]: 1.572e-05 [meta_shard_fg_expand]: 1.69e-06 [get_grad_eliminate_]: 8.92e-06 [merge_forward]: 4.65001e-06 [cell_reuse_recompute_pass]: 1.72999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.817e-05 [j_node_and_user_rematch]: 1.322e-05 [meta_fg_expand]: 2.94001e-06 [replace_old_param]: 1.224e-05 [inline_without_move]: 8.48001e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.70001e-06 [auto_monad_grad]: 9.70002e-07 [auto_monad_eliminator]: 1.028e-05 [cse]: 2.434e-05 [replace_applicator]: 9.05001e-06 [py_interpret_to_execute_after_opt_a]: 1.458e-05 [rewriter_after_opt_a]: 0.0001689 [convert_after_rewriter]: 1.029e-05 [order_py_execute_after_rewriter]: 6.63998e-06 [mutable_eliminate]: 0.00069347 [jit_opt_b]: 7.442e-05, [1] [Cycle 1]: 6.718e-05, [2] [frontend_op_eliminate]: 2.882e-05 [inline_after_opt_a]: 2.619e-05 [cconv]: 2.769e-05 [loop_unroll]: 0.00042984 [jit_opt_after_cconv]: 0.00019833, [1] [Cycle 1]: 0.00019065, [11] [c_1]: 3.769e-05 [parameter_eliminate]: 2.69999e-06 [updatestate_depend_eliminate]: 7.65998e-06 [updatestate_assign_eliminate]: 5.08002e-06 [updatestate_loads_eliminate]: 3.78001e-06 [cse]: 3.405e-05 [call_graph_tuple_transform]: 2.836e-05 [tuple_list_get_item_eliminator]: 9.41e-06 [none_parameter_eliminate]: 1.48002e-06 [renormalize]: 5.00004e-07 [switch_simplify]: 8.99e-06 [remove_dup_value]: 3.642e-05 [partial_unused_args_eliminate]: 4.11001e-06 [environ_conv]: 9.46e-06 [add_recomputation]: 9.203e-05 [cse_after_recomputation]: 3.409e-05, [1] [Cycle 1]: 2.813e-05, [1] [cse]: 2.145e-05 [auto_monad_reorder]: 2.118e-05 [get_jit_bprop_graph]: 2.00002e-06 [rewriter_after_jit_bprop_graph]: 4.89e-06 [opt_after_jit_grad]: 0.00049111 [symbol_engine_optimizer]: 0.0001167, [1] [Cycle 1]: 0.00010982, [6] [build]: 2.463e-05 [elim_shapecalc]: 1.242e-05 [elim_not_effective]: 1.874e-05 [opt_reshape]: 1.018e-05 [fold_const_symbol]: 1.479e-05 [renormalize]: 3.80009e-07 [validate]: 8.658e-05 [backend_pass]: 1.19998e-06 [task_emit]: 0.0144173 [execute]: 8.28999e-06 Sums bootstrap : 0.000621s : 0.46% type_inference : 0.094143s : 69.14% event_method : 0.000412s : 0.30% auto_monad : 0.000197s : 0.14% graph_reusing : 0.000011s : 0.01% pre_auto_parallel : 0.000004s : 0.00% py_interpret_to_execute : 0.000062s : 0.05% rewriter_before_opt_a : 0.000213s : 0.16% expand_dump_flag : 0.000004s : 0.00% jit_opt_a.switch_simplify : 0.000313s : 0.23% jit_opt_a.loop_unroll : 0.000152s : 0.11% jit_opt_a.a_1 : 0.003726s : 2.74% jit_opt_a.with_stream_mark : 0.000061s : 0.04% jit_opt_a.recompute_prepare : 0.000048s : 0.04% jit_opt_a.updatestate_depend_eliminate : 0.000021s : 0.02% jit_opt_a.updatestate_assign_eliminate : 0.000018s : 0.01% jit_opt_a.updatestate_loads_eliminate : 0.000018s : 0.01% jit_opt_a.parameter_eliminate : 0.000006s : 0.00% jit_opt_a.specialize_transform : 0.000039s : 0.03% jit_opt_a.updatestate_useless_node_eliminater : 0.000040s : 0.03% jit_opt_a.accelerated_algorithm : 0.000094s : 0.07% jit_opt_a.meta_shard_fg_expand : 0.000011s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000038s : 0.03% jit_opt_a.merge_forward : 0.000022s : 0.02% jit_opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000079s : 0.06% jit_opt_a.j_node_and_user_rematch : 0.000063s : 0.05% jit_opt_a.meta_fg_expand : 0.002657s : 1.95% jit_opt_a.replace_old_param : 0.000110s : 0.08% jit_opt_a.inline_without_move : 0.000086s : 0.06% jit_opt_a.renormalize : 0.015253s : 11.20% jit_opt_a.add_forward_monad_depend : 0.000052s : 0.04% jit_opt_a.auto_monad_grad : 0.000010s : 0.01% jit_opt_a.auto_monad_eliminator : 0.000092s : 0.07% jit_opt_a.cse : 0.000549s : 0.40% jit_opt_a.replace_applicator : 0.000120s : 0.09% py_interpret_to_execute_after_opt_a : 0.000015s : 0.01% rewriter_after_opt_a : 0.000169s : 0.12% convert_after_rewriter : 0.000010s : 0.01% order_py_execute_after_rewriter : 0.000007s : 0.00% mutable_eliminate : 0.000693s : 0.51% jit_opt_b.frontend_op_eliminate : 0.000029s : 0.02% jit_opt_b.inline_after_opt_a : 0.000026s : 0.02% cconv : 0.000028s : 0.02% loop_unroll : 0.000430s : 0.32% jit_opt_after_cconv.c_1 : 0.000038s : 0.03% jit_opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.cse : 0.000034s : 0.03% jit_opt_after_cconv.call_graph_tuple_transform : 0.000028s : 0.02% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000009s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000009s : 0.01% remove_dup_value : 0.000036s : 0.03% partial_unused_args_eliminate : 0.000004s : 0.00% environ_conv : 0.000009s : 0.01% add_recomputation : 0.000092s : 0.07% cse_after_recomputation.cse : 0.000021s : 0.02% auto_monad_reorder : 0.000021s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000491s : 0.36% symbol_engine_optimizer.build : 0.000025s : 0.02% symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.01% symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.01% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000087s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.014417s : 10.59% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.000973 174 0.24% : 0.000002s : 4: substitution.elim_not_effective 0.22% : 0.000002s : 4: substitution.fold_const_symbol 0.78% : 0.000008s : 6: substitution.graph_param_transform 66.97% : 0.000652s : 23: substitution.inline 1.96% : 0.000019s : 2: substitution.inline_without_move 1.15% : 0.000011s : 18: substitution.j_node_and_user_rematch 4.22% : 0.000041s : 3: substitution.less_batch_normalization 1.51% : 0.000015s : 11: substitution.minmaximum_grad 2.41% : 0.000023s : 10: substitution.partial_eliminate 1.58% : 0.000015s : 18: substitution.remove_not_recompute_node 2.96% : 0.000029s : 9: substitution.replace_applicator 1.31% : 0.000013s : 16: substitution.replace_old_param 0.31% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.44% : 0.000014s : 4: substitution.switch_simplify 3.61% : 0.000035s : 11: substitution.tuple_list_convert_item_index_to_positive 2.37% : 0.000023s : 11: substitution.tuple_list_get_item_depend_reorder 6.95% : 0.000068s : 23: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.094009 2 95.19% : 0.089488s : 1: type_inference.infer 4.81% : 0.004521s : 1: type_inference.specialize ------[replace.] 0.000347 39 55.71% : 0.000193s : 23: replace.inline 18.78% : 0.000065s : 4: replace.switch_simplify 25.51% : 0.000089s : 12: replace.tuple_list_get_item_eliminator ------[match.] 0.000683 39 93.61% : 0.000639s : 23: match.inline 1.67% : 0.000011s : 4: match.switch_simplify 4.72% : 0.000032s : 12: match.tuple_list_get_item_eliminator ------[predicate.] 0.000583 3976 1.53% : 0.000009s : 66: predicate.accumulaten_eliminater 0.40% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 1.41% : 0.000008s : 66: predicate.addn_check_dump 1.51% : 0.000009s : 66: predicate.addn_zero_filter 2.23% : 0.000013s : 66: predicate.arithmetic_simplify 1.53% : 0.000009s : 66: predicate.cast_eliminate 0.16% : 0.000001s : 6: predicate.check_bprop_eliminate 1.42% : 0.000008s : 66: predicate.compare_switch_simplify 1.47% : 0.000009s : 66: predicate.depend_value_elim 1.50% : 0.000009s : 66: predicate.dict_get_item_const_eliminator 1.54% : 0.000009s : 66: predicate.dict_get_item_eliminator 1.46% : 0.000008s : 66: predicate.dict_set_item_eliminator 0.23% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.11% : 0.000001s : 6: predicate.elim_not_effective 0.21% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.45% : 0.000008s : 66: predicate.environ_add_const_eliminate 1.43% : 0.000008s : 66: predicate.environ_get_add_eliminate 1.47% : 0.000009s : 66: predicate.environ_get_depend_swap 1.46% : 0.000008s : 66: predicate.environ_get_eliminate 1.41% : 0.000008s : 66: predicate.environ_get_set_eliminate 0.09% : 0.000001s : 6: predicate.fold_const_symbol 0.79% : 0.000005s : 27: predicate.get_grad_eliminate 0.09% : 0.000001s : 6: predicate.graph_param_transform 4.40% : 0.000026s : 113: predicate.inline 1.59% : 0.000009s : 56: predicate.inline_without_move 0.35% : 0.000002s : 27: predicate.j_node_and_user_rematch 1.04% : 0.000006s : 27: predicate.less_batch_normalization 1.88% : 0.000011s : 78: predicate.list_to_tuple_eliminator_ 1.94% : 0.000011s : 84: predicate.load_eliminater 0.39% : 0.000002s : 6: predicate.loop_unroll_after_grad 3.68% : 0.000021s : 147: predicate.loop_unroll_before_grad 1.79% : 0.000010s : 72: predicate.make_slice_get_slice_eliminator 1.45% : 0.000008s : 66: predicate.merge_addn 1.51% : 0.000009s : 66: predicate.minmaximum_grad 0.44% : 0.000003s : 6: predicate.mutable_eliminate 0.20% : 0.000001s : 6: predicate.opt_reshape 2.33% : 0.000014s : 84: predicate.partial_eliminate 1.44% : 0.000008s : 66: predicate.print_const_string_wrapper 1.95% : 0.000011s : 66: predicate.reduce_eliminate 1.80% : 0.000011s : 78: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000002s : 27: predicate.remove_not_recompute_node 2.36% : 0.000014s : 137: predicate.replace_applicator 0.87% : 0.000005s : 56: predicate.replace_old_param 0.11% : 0.000001s : 6: predicate.reset_defer_inline 1.53% : 0.000009s : 66: predicate.reshape_eliminate 1.51% : 0.000009s : 66: predicate.row_tensor_add_zeros_like 0.24% : 0.000001s : 6: predicate.row_tensor_eliminate 1.53% : 0.000009s : 66: predicate.same_eliminate 0.44% : 0.000003s : 27: predicate.set_cell_output_no_recompute 0.37% : 0.000002s : 12: predicate.special_op_eliminate 0.77% : 0.000004s : 27: predicate.specialize_transform 1.76% : 0.000010s : 66: predicate.split_environ_get_set_with_tuple_value 1.53% : 0.000009s : 66: predicate.stack_unstack_eliminate 0.16% : 0.000001s : 6: predicate.switch_call_monad_eliminater 2.98% : 0.000017s : 101: predicate.switch_defer_inline 2.59% : 0.000015s : 101: predicate.switch_layer_defer_inline 7.47% : 0.000044s : 262: predicate.switch_simplify 1.49% : 0.000009s : 66: predicate.tile_eliminate 1.48% : 0.000009s : 66: predicate.transpose_eliminate 1.88% : 0.000011s : 66: predicate.tuple_list_convert_item_index_to_positive 1.78% : 0.000010s : 66: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000019s : 90: predicate.tuple_list_get_item_eliminator 1.86% : 0.000011s : 66: predicate.tuple_list_set_item_eliminator 1.86% : 0.000011s : 78: predicate.tuple_to_list_eliminator_ 1.81% : 0.000011s : 84: predicate.updatestate_pure_node_eliminater 2.80% : 0.000016s : 111: predicate.updatestate_useless_node_eliminater 1.87% : 0.000011s : 66: predicate.value_based_eliminate 0.13% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.21% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004119 50 61.38% : 0.002528s : 23: func_graph_cloner_run.FuncGraphClonerGraph 38.62% : 0.001590s : 27: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.162182 91 0.06% : 0.000095s : 1: add_recomputation 0.13% : 0.000204s : 1: auto_monad 0.01% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.40% : 0.000649s : 1: bootstrap 0.02% : 0.000030s : 1: cconv 0.01% : 0.000013s : 1: convert_after_rewriter 0.02% : 0.000036s : 1: cse_after_recomputation 0.01% : 0.000012s : 1: environ_conv 0.26% : 0.000420s : 1: event_method 0.01% : 0.000014s : 1: execute 0.00% : 0.000007s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 17.89% : 0.029018s : 1: jit_opt_a 0.12% : 0.000201s : 1: jit_opt_after_cconv 0.05% : 0.000077s : 1: jit_opt_b 0.27% : 0.000438s : 1: loop_unroll 0.43% : 0.000703s : 1: mutable_eliminate 2.99% : 0.004853s : 39: opt.transform.jit_opt_a 0.05% : 0.000081s : 4: opt.transform.jit_opt_after_cconv 0.03% : 0.000048s : 4: opt.transform.jit_opt_b 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000018s : 1: opt.transform.mutable_eliminate 0.02% : 0.000037s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000052s : 4: opt.transform.symbol_engine_opt 0.31% : 0.000500s : 1: opt_after_jit_grad 0.01% : 0.000009s : 1: order_py_execute_after_rewriter 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pre_auto_parallel 0.04% : 0.000065s : 1: py_interpret_to_execute 0.01% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000039s : 1: remove_dup_value 7.82% : 0.012687s : 2: renormalize.infer 1.57% : 0.002545s : 2: renormalize.specialize 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.11% : 0.000172s : 1: rewriter_after_opt_a 0.13% : 0.000217s : 1: rewriter_before_opt_a 0.07% : 0.000119s : 1: symbol_engine_optimizer 8.90% : 0.014434s : 1: task_emit 58.06% : 0.094165s : 1: type_inference 0.08% : 0.000124s : 1: validate TotalTime = 0.111723, [24] [bootstrap]: 0.00067138 [type_inference]: 0.0827056 [event_method]: 0.00028785 [auto_monad]: 8.74e-05 [graph_reusing]: 7.7e-06 [inline]: 2.74999e-06 [add_attr]: 0.00466608, [1] [add_attr_with_inline]: 0.00465357, [1] [Cycle 1]: 7.449e-05, [2] [tag_attr]: 2.981e-05 [meta_addattr_fg_expand]: 6.79999e-06 [parallel-infer-symbol]: 4.05e-06 [pre_auto_parallel]: 4.616e-05 [insert-virtual-dataset]: 2.76999e-06 [parallel-infer-symbol-second]: 9.49978e-07 [dataset_repeat_opt]: 2.36e-06 [pipeline_split]: 1.73002e-06 [optimize]: 0.0156293, [53] [py_interpret_to_execute]: 5.50001e-06 [rewriter_before_opt_a]: 0.00030744 [opt_a]: 0.0118779, [2] [Cycle 1]: 0.0111852, [45] [expand_dump_flag]: 3.78999e-06 [switch_simplify]: 5.814e-05 [loop_unroll]: 4.434e-05 [a_1]: 0.00080173 [with_stream_mark]: 1.795e-05 [recompute_prepare]: 8.34002e-06 [updatestate_depend_eliminate]: 4.56002e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 3.2e-06 [parameter_eliminate]: 2.07001e-06 [a_2]: 8.136e-05 [accelerated_algorithm]: 7.43999e-06 [shard]: 1.64e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 6.93e-06 [merge_send_recv]: 8.3e-06 [auto_parallel]: 6.67002e-06 [parallel]: 2.395e-05 [flash_sp]: 8.05e-06 [merge_comm]: 4.10998e-06 [allreduce_fusion]: 3.13e-06 [matmul_add_comm_reduction]: 9.36e-06 [allreduce_slice_to_reducescatter]: 9.5999e-07 [virtual_shard_identity]: 7.81001e-06 [virtual_dataset]: 6.77002e-06 [get_grad_eliminate_]: 6.28e-06 [virtual_output]: 6.36e-06 [merge_forward]: 4.15e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 1.015e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.275e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.092e-05 [set_forward_comm_id_for_comm_node_pass]: 3.26001e-06 [meta_fg_expand]: 2.71e-06 [flash_sp_send_recv_attached]: 2.22001e-06 [receive_attached]: 2.22999e-06 [after_resolve]: 1.043e-05 [a_after_grad]: 9.49e-06 [renormalize]: 0.00958483 [add_forward_monad_depend]: 9.05999e-06 [auto_monad_grad]: 2.44001e-06 [auto_monad_eliminator]: 2.049e-05 [cse]: 3.469e-05 [a_3]: 6.01e-05 [Cycle 2]: 0.00067947, [45] [expand_dump_flag]: 2.13002e-06 [switch_simplify]: 8.92e-06 [loop_unroll]: 6.70002e-06 [a_1]: 0.00013895 [with_stream_mark]: 1.818e-05 [recompute_prepare]: 6.48e-06 [updatestate_depend_eliminate]: 4.02e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 3.63999e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 7.212e-05 [accelerated_algorithm]: 6.53003e-06 [shard]: 3.11999e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 5.99e-06 [merge_send_recv]: 8.65999e-06 [auto_parallel]: 9.35001e-06 [parallel]: 8.28999e-06 [flash_sp]: 3.95998e-06 [merge_comm]: 2.88e-06 [allreduce_fusion]: 2.91e-06 [matmul_add_comm_reduction]: 1.003e-05 [allreduce_slice_to_reducescatter]: 6.89994e-07 [virtual_shard_identity]: 7.12002e-06 [virtual_dataset]: 6.62002e-06 [get_grad_eliminate_]: 6.09999e-06 [virtual_output]: 6.02001e-06 [merge_forward]: 4.08001e-06 [cell_reuse_recompute_pass]: 3.04001e-06 [offload_activation]: 9.56998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.528e-05 [merge_recompute_call_nodes]: 1.27e-06 [before_grad]: 9.84001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.21999e-06 [meta_fg_expand]: 2.85002e-06 [flash_sp_send_recv_attached]: 1.62999e-06 [receive_attached]: 2.31e-06 [after_resolve]: 1.084e-05 [a_after_grad]: 9.98998e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.32e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 7.21001e-06 [cse]: 1.317e-05 [a_3]: 3.509e-05 [py_interpret_to_execute_after_opt_a]: 6.31998e-06 [slice_cell_reuse_recomputed_activation]: 1.91998e-06 [rewriter_after_opt_a]: 1.994e-05 [convert_after_rewriter]: 1.60001e-06 [order_py_execute_after_rewriter]: 1.10001e-06 [mutable_eliminate]: 0.00192842 [opt_b]: 0.00025281, [1] [Cycle 1]: 0.00024467, [7] [b_1]: 0.00012523 [b_2]: 4.972e-05 [updatestate_depend_eliminate]: 6.55997e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.59001e-06 [renormalize]: 8.09989e-07 [cse]: 2.139e-05 [optimize_parallel_all_gather_comm]: 1.762e-05 [overlap_param_gather]: 2.06998e-06 [cconv]: 3.144e-05 [loop_unroll]: 0.00047198 [opt_after_cconv]: 0.0001023, [1] [Cycle 1]: 9.623e-05, [7] [c_1]: 3.23e-05 [parameter_eliminate]: 2.90998e-06 [updatestate_depend_eliminate]: 5.77999e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.22999e-06 [cse]: 1.788e-05 [renormalize]: 7.09988e-07 [remove_dup_value]: 1.541e-05 [tuple_transform]: 7.689e-05, [1] [Cycle 1]: 7.239e-05, [4] [d_1]: 4.451e-05 [none_parameter_eliminate]: 1.45001e-06 [renormalize]: 1.30007e-07 [switch_simplify]: 7.5e-06 [partial_unused_args_eliminate]: 1.94e-06 [add_recomputation]: 7.684e-05 [cse_after_recomputation]: 2.233e-05, [1] [Cycle 1]: 1.737e-05, [1] [cse]: 1.193e-05 [environ_conv]: 7.7e-06 [swap_dp_allreduce_reducescatter]: 5.01002e-06 [bias_add_comm_swap]: 2.52001e-06 [label_micro_interleaved_index]: 4.47e-06 [label_fine_grained_interleaved_index]: 2.52001e-06 [merge_cast_opt]: 1.19e-06 [slice_recompute_activation]: 1.87001e-06 [micro_interleaved_order_control]: 2.48e-06 [assign_add_opt]: 1.13001e-06 [ForceFp32Comm]: 1.09e-06 [remove_cast_before_assign_add]: 1.14998e-06 [full_micro_interleaved_order_control]: 2.22001e-06 [reorder_send_recv_between_fp_bp]: 2.84999e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.17e-06 [overlap_opt_shard_in_pipeline]: 1.52001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.69e-06 [control_data_broadcast_order]: 1.281e-05 [grouped_pairwise_exchange_alltoall]: 1.69e-06 [offloading_packed_experts]: 3.61999e-06 [overlap_recompute_and_grad_model_parallel]: 4.92e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.41e-06 [overlap_grad_ring_attention]: 4.18999e-06 [overlap_grad_flash_sp]: 2e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.29999e-06 [split_layernorm_comm]: 1.91003e-06 [handle_group_info]: 8.59989e-07 [symbol_engine_optimizer]: 7.554e-05, [1] [Cycle 1]: 7.135e-05, [6] [build]: 3.81999e-06 [elim_shapecalc]: 9.34998e-06 [elim_not_effective]: 1.278e-05 [opt_reshape]: 7.53e-06 [fold_const_symbol]: 9.69e-06 [renormalize]: 1.69995e-07 [detach_backward]: 2.27001e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 1.59e-05 [get_jit_bprop_graph]: 1.81e-06 [rewriter_after_jit_bprop_graph]: 5.25001e-06 [opt_after_jit_grad]: 0.00048007 [validate]: 7.692e-05 [backend_pass]: 1.04003e-06 [task_emit]: 0.00674199 [execute]: 8.82999e-06 Sums bootstrap : 0.000671s : 0.63% type_inference : 0.082706s : 78.04% event_method : 0.000288s : 0.27% auto_monad : 0.000087s : 0.08% graph_reusing : 0.000008s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000046s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.01% optimize.rewriter_before_opt_a : 0.000307s : 0.29% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000067s : 0.06% optimize.opt_a.loop_unroll : 0.000051s : 0.05% optimize.opt_a.a_1 : 0.000941s : 0.89% optimize.opt_a.with_stream_mark : 0.000036s : 0.03% optimize.opt_a.recompute_prepare : 0.000015s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000153s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000017s : 0.02% optimize.opt_a.auto_parallel : 0.000016s : 0.02% optimize.opt_a.parallel : 0.000032s : 0.03% optimize.opt_a.flash_sp : 0.000012s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000006s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000021s : 0.02% optimize.opt_a.a_after_grad : 0.000019s : 0.02% optimize.opt_a.renormalize : 0.009585s : 9.04% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.03% optimize.opt_a.cse : 0.000048s : 0.05% optimize.opt_a.a_3 : 0.000095s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000020s : 0.02% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.001928s : 1.82% optimize.opt_b.b_1 : 0.000125s : 0.12% optimize.opt_b.b_2 : 0.000050s : 0.05% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000031s : 0.03% optimize.loop_unroll : 0.000472s : 0.45% optimize.opt_after_cconv.c_1 : 0.000032s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000045s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000077s : 0.07% optimize.cse_after_recomputation.cse : 0.000012s : 0.01% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000020s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000016s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000480s : 0.45% validate : 0.000077s : 0.07% backend_pass : 0.000001s : 0.00% task_emit : 0.006742s : 6.36% execute : 0.000009s : 0.01% Time group info: ------[substitution.] 0.000232 28 0.86% : 0.000002s : 2: substitution.elim_not_effective 0.54% : 0.000001s : 2: substitution.fold_const_symbol 2.81% : 0.000007s : 4: substitution.graph_param_transform 83.03% : 0.000192s : 7: substitution.inline 1.92% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.55% : 0.000006s : 4: substitution.remove_not_recompute_node 2.09% : 0.000005s : 2: substitution.replace_old_param 6.20% : 0.000014s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.082561 2 97.25% : 0.080288s : 1: type_inference.infer 2.75% : 0.002273s : 1: type_inference.specialize ------[replace.] 0.000089 10 72.91% : 0.000065s : 7: replace.inline 27.09% : 0.000024s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000201 10 93.73% : 0.000188s : 7: match.inline 6.27% : 0.000013s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000236 1463 1.01% : 0.000002s : 16: predicate.accumulaten_eliminater 0.67% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 1.02% : 0.000002s : 16: predicate.addn_zero_filter 0.95% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.44% : 0.000006s : 24: predicate.arithmetic_simplify 0.98% : 0.000002s : 16: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.46% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.49% : 0.000001s : 8: predicate.depend_value_elim 1.00% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 4: predicate.elim_not_effective 0.31% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_depend_swap 1.61% : 0.000004s : 28: predicate.environ_get_eliminate 1.17% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.56% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.39% : 0.000006s : 26: predicate.float_depend_g_call 0.48% : 0.000001s : 8: predicate.float_environ_get_switch 0.67% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000002s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.46% : 0.000001s : 8: predicate.incorporate_call 0.39% : 0.000001s : 8: predicate.incorporate_call_switch 5.69% : 0.000013s : 66: predicate.inline 0.68% : 0.000002s : 8: predicate.inline_without_move 0.46% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 8: predicate.less_batch_normalization 1.79% : 0.000004s : 27: predicate.list_to_tuple_eliminator_ 2.46% : 0.000006s : 43: predicate.load_eliminater 0.81% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.10% : 0.000007s : 55: predicate.loop_unroll_before_grad 1.75% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.49% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.93% : 0.000002s : 16: predicate.minmaximum_grad 1.30% : 0.000003s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 2.07% : 0.000005s : 26: predicate.partial_defer_inline 1.41% : 0.000003s : 23: predicate.partial_eliminate 0.96% : 0.000002s : 16: predicate.print_const_string_wrapper 0.52% : 0.000001s : 8: predicate.reduce_all_const_elim 1.25% : 0.000003s : 16: predicate.reduce_eliminate 2.44% : 0.000006s : 43: predicate.redundant_stop_gradient_eliminater 0.56% : 0.000001s : 8: predicate.remove_not_recompute_node 1.39% : 0.000003s : 27: predicate.replace_applicator 0.59% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 1.04% : 0.000002s : 16: predicate.reshape_eliminate 0.55% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.75% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.69% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000002s : 8: predicate.special_op_eliminate 0.54% : 0.000001s : 8: predicate.specialize_transform 0.93% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.69% : 0.000004s : 26: predicate.switch_defer_inline 2.21% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.72% : 0.000013s : 93: predicate.switch_simplify 1.01% : 0.000002s : 16: predicate.tile_eliminate 1.02% : 0.000002s : 16: predicate.transpose_eliminate 1.40% : 0.000003s : 24: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000008s : 35: predicate.tuple_list_get_item_eliminator 1.51% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000006s : 32: predicate.tuple_list_set_item_eliminator 1.67% : 0.000004s : 27: predicate.tuple_to_list_eliminator_ 2.44% : 0.000006s : 43: predicate.updatestate_pure_node_eliminater 2.93% : 0.000007s : 51: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.69% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002723 29 73.58% : 0.002004s : 20: func_graph_cloner_run.FuncGraphClonerGraph 26.42% : 0.000719s : 9: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.143212 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.26% : 0.004672s : 1: add_attr 3.25% : 0.004658s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000082s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.07% : 0.000097s : 1: auto_monad 0.01% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.50% : 0.000717s : 1: bootstrap 0.02% : 0.000035s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.21% : 0.000302s : 1: event_method 0.01% : 0.000015s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.34% : 0.000481s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 1.35% : 0.001940s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.01% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000019s : 1: opt.transform.mutable_eliminate 0.98% : 0.001400s : 78: opt.transform.opt_a 0.02% : 0.000031s : 1: opt.transform.opt_after_cconv 0.02% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000145s : 28: opt.transform.opt_b 0.03% : 0.000050s : 2: opt.transform.opt_trans_graph 0.02% : 0.000036s : 4: opt.transform.symbol_engine_opt 8.30% : 0.011882s : 1: opt_a 0.07% : 0.000106s : 1: opt_after_cconv 0.34% : 0.000489s : 1: opt_after_jit_grad 0.18% : 0.000257s : 1: opt_b 10.92% : 0.015635s : 1: optimize 0.01% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000023s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000051s : 1: pre_auto_parallel 0.01% : 0.000010s : 1: py_interpret_to_execute 0.01% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 6.00% : 0.008596s : 1: renormalize.infer 0.68% : 0.000975s : 1: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000023s : 1: rewriter_after_opt_a 0.22% : 0.000314s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000078s : 1: symbol_engine_optimizer 4.72% : 0.006761s : 1: task_emit 0.06% : 0.000080s : 1: tuple_transform 57.77% : 0.082732s : 1: type_inference 0.08% : 0.000116s : 1: validate TotalTime = 0.182968, [24] [bootstrap]: 0.00070559 [type_inference]: 0.156086 [event_method]: 0.0005924 [auto_monad]: 0.00021072 [graph_reusing]: 1.051e-05 [inline]: 2.94001e-06 [add_attr]: 0.00413151, [1] [add_attr_with_inline]: 0.00410487, [1] [Cycle 1]: 0.00012221, [2] [tag_attr]: 4.808e-05 [meta_addattr_fg_expand]: 1.436e-05 [parallel-infer-symbol]: 3.43e-06 [pre_auto_parallel]: 6.747e-05 [insert-virtual-dataset]: 2.54999e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 1.88997e-06 [pipeline_split]: 1.92999e-06 [optimize]: 0.0202997, [53] [py_interpret_to_execute]: 5.34998e-06 [rewriter_before_opt_a]: 0.00038812 [opt_a]: 0.0179156, [2] [Cycle 1]: 0.0172791, [45] [expand_dump_flag]: 6.03998e-06 [switch_simplify]: 0.00019788 [loop_unroll]: 7.6e-05 [a_1]: 0.00164088 [with_stream_mark]: 5.289e-05 [recompute_prepare]: 2.318e-05 [updatestate_depend_eliminate]: 8.80999e-06 [updatestate_assign_eliminate]: 7.06001e-06 [updatestate_loads_eliminate]: 6.26e-06 [parameter_eliminate]: 2.71e-06 [a_2]: 0.00021276 [accelerated_algorithm]: 1.427e-05 [shard]: 1.68002e-06 [meta_shard_fg_expand]: 4.74002e-06 [shard_inline]: 1.426e-05 [merge_send_recv]: 1.599e-05 [auto_parallel]: 1.053e-05 [parallel]: 6.272e-05 [flash_sp]: 1.115e-05 [merge_comm]: 8.85999e-06 [allreduce_fusion]: 7.83999e-06 [matmul_add_comm_reduction]: 2.879e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 1.672e-05 [virtual_dataset]: 1.435e-05 [get_grad_eliminate_]: 1.453e-05 [virtual_output]: 1.389e-05 [merge_forward]: 8.52e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 1.703e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.681e-05 [merge_recompute_call_nodes]: 1.72999e-06 [before_grad]: 2.431e-05 [set_forward_comm_id_for_comm_node_pass]: 8.54e-06 [meta_fg_expand]: 0.0018616 [flash_sp_send_recv_attached]: 4.22e-06 [receive_attached]: 2.78e-06 [after_resolve]: 6.03e-05 [a_after_grad]: 8.346e-05 [renormalize]: 0.0123014 [add_forward_monad_depend]: 7.52002e-06 [auto_monad_grad]: 2.71999e-06 [auto_monad_eliminator]: 1.437e-05 [cse]: 2.705e-05 [a_3]: 4.334e-05 [Cycle 2]: 0.00062161, [45] [expand_dump_flag]: 2.16998e-06 [switch_simplify]: 6.14001e-06 [loop_unroll]: 4.2e-06 [a_1]: 5.063e-05 [with_stream_mark]: 1.564e-05 [recompute_prepare]: 4.43001e-06 [updatestate_depend_eliminate]: 4.22e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.44001e-06 [parameter_eliminate]: 1.49e-06 [a_2]: 3.871e-05 [accelerated_algorithm]: 4.27e-06 [shard]: 1.86e-06 [meta_shard_fg_expand]: 2.21003e-06 [shard_inline]: 3.96001e-06 [merge_send_recv]: 6.88e-06 [auto_parallel]: 7.74002e-06 [parallel]: 7.95e-06 [flash_sp]: 3.65e-06 [merge_comm]: 2.35002e-06 [allreduce_fusion]: 2.37999e-06 [matmul_add_comm_reduction]: 7.21999e-06 [allreduce_slice_to_reducescatter]: 9.99979e-07 [virtual_shard_identity]: 4.68001e-06 [virtual_dataset]: 4.3e-06 [get_grad_eliminate_]: 4.1e-06 [virtual_output]: 3.98001e-06 [merge_forward]: 3.18e-06 [cell_reuse_recompute_pass]: 2.78e-06 [offload_activation]: 6.44999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.154e-05 [merge_recompute_call_nodes]: 1.34e-06 [before_grad]: 6.52001e-06 [set_forward_comm_id_for_comm_node_pass]: 2.41e-06 [meta_fg_expand]: 0.00013151 [flash_sp_send_recv_attached]: 2.23002e-06 [receive_attached]: 2.87002e-06 [after_resolve]: 6.42001e-06 [a_after_grad]: 5.89999e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.88002e-06 [auto_monad_grad]: 9.60019e-07 [auto_monad_eliminator]: 4.85001e-06 [cse]: 1.217e-05 [a_3]: 2.083e-05 [py_interpret_to_execute_after_opt_a]: 4.52e-06 [slice_cell_reuse_recomputed_activation]: 2.42001e-06 [rewriter_after_opt_a]: 1.626e-05 [convert_after_rewriter]: 1.35001e-06 [order_py_execute_after_rewriter]: 1.19e-06 [mutable_eliminate]: 0.0006228 [opt_b]: 0.00019737, [1] [Cycle 1]: 0.00019002, [7] [b_1]: 0.00012246 [b_2]: 6.34001e-06 [updatestate_depend_eliminate]: 5.02e-06 [updatestate_assign_eliminate]: 1.98002e-06 [updatestate_loads_eliminate]: 1.64e-06 [renormalize]: 1.00001e-06 [cse]: 1.561e-05 [optimize_parallel_all_gather_comm]: 4.551e-05 [overlap_param_gather]: 2.12999e-06 [cconv]: 2.298e-05 [loop_unroll]: 0.00045157 [opt_after_cconv]: 7.974e-05, [1] [Cycle 1]: 7.377e-05, [7] [c_1]: 1.533e-05 [parameter_eliminate]: 2.63e-06 [updatestate_depend_eliminate]: 4.60001e-06 [updatestate_assign_eliminate]: 1.85001e-06 [updatestate_loads_eliminate]: 1.70001e-06 [cse]: 1.477e-05 [renormalize]: 3.09985e-07 [remove_dup_value]: 1.344e-05 [tuple_transform]: 5.234e-05, [1] [Cycle 1]: 4.805e-05, [4] [d_1]: 2.329e-05 [none_parameter_eliminate]: 1.55999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 4.82e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 3.508e-05 [cse_after_recomputation]: 1.855e-05, [1] [Cycle 1]: 1.44e-05, [1] [cse]: 8.85999e-06 [environ_conv]: 4.75001e-06 [swap_dp_allreduce_reducescatter]: 4.18999e-06 [bias_add_comm_swap]: 2.38002e-06 [label_micro_interleaved_index]: 4.02e-06 [label_fine_grained_interleaved_index]: 2.66e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 2.41e-06 [micro_interleaved_order_control]: 2.24999e-06 [assign_add_opt]: 1.12e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 1.09e-06 [full_micro_interleaved_order_control]: 2.13998e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 1.28002e-06 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.09e-06 [interleave_parallel_branches]: 1.01997e-06 [overlap_opt_shard_in_pipeline]: 2.789e-05 [overlap_opt_shard_grad_in_pipeline]: 1.87001e-06 [control_data_broadcast_order]: 9.81e-06 [grouped_pairwise_exchange_alltoall]: 1.89e-06 [offloading_packed_experts]: 2.36998e-06 [overlap_recompute_and_grad_model_parallel]: 3.63e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.27999e-06 [overlap_grad_ring_attention]: 3.58e-06 [overlap_grad_flash_sp]: 1.65e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 1.95001e-06 [split_layernorm_comm]: 1.55001e-06 [handle_group_info]: 1.07e-06 [symbol_engine_optimizer]: 6.29e-05, [1] [Cycle 1]: 5.862e-05, [6] [build]: 2.79001e-06 [elim_shapecalc]: 7.48e-06 [elim_not_effective]: 9.28002e-06 [opt_reshape]: 4.80001e-06 [fold_const_symbol]: 6.54001e-06 [renormalize]: 2.29978e-07 [detach_backward]: 2.14e-06 [pipeline_parallel_scheduler]: 1.77999e-06 [auto_monad_reorder]: 1.355e-05 [get_jit_bprop_graph]: 1.76e-06 [rewriter_after_jit_bprop_graph]: 3.71999e-06 [opt_after_jit_grad]: 0.00049749 [validate]: 3.136e-05 [backend_pass]: 9.00007e-07 [task_emit]: 4.037e-05 [execute]: 1.25999e-06 Sums bootstrap : 0.000706s : 0.40% type_inference : 0.156086s : 87.82% event_method : 0.000592s : 0.33% auto_monad : 0.000211s : 0.12% graph_reusing : 0.000011s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000048s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000067s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000388s : 0.22% optimize.opt_a.expand_dump_flag : 0.000008s : 0.00% optimize.opt_a.switch_simplify : 0.000204s : 0.11% optimize.opt_a.loop_unroll : 0.000080s : 0.05% optimize.opt_a.a_1 : 0.001692s : 0.95% optimize.opt_a.with_stream_mark : 0.000069s : 0.04% optimize.opt_a.recompute_prepare : 0.000028s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000251s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.00% optimize.opt_a.shard_inline : 0.000018s : 0.01% optimize.opt_a.merge_send_recv : 0.000023s : 0.01% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000071s : 0.04% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000011s : 0.01% optimize.opt_a.allreduce_fusion : 0.000010s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000036s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.01% optimize.opt_a.virtual_dataset : 0.000019s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000019s : 0.01% optimize.opt_a.virtual_output : 0.000018s : 0.01% optimize.opt_a.merge_forward : 0.000012s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000023s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000031s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.01% optimize.opt_a.meta_fg_expand : 0.001993s : 1.12% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000067s : 0.04% optimize.opt_a.a_after_grad : 0.000089s : 0.05% optimize.opt_a.renormalize : 0.012301s : 6.92% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000019s : 0.01% optimize.opt_a.cse : 0.000039s : 0.02% optimize.opt_a.a_3 : 0.000064s : 0.04% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000016s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000623s : 0.35% optimize.opt_b.b_1 : 0.000122s : 0.07% optimize.opt_b.b_2 : 0.000006s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000016s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000046s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.01% optimize.loop_unroll : 0.000452s : 0.25% optimize.opt_after_cconv.c_1 : 0.000015s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000015s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.01% optimize.tuple_transform.d_1 : 0.000023s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000005s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000035s : 0.02% optimize.cse_after_recomputation.cse : 0.000009s : 0.00% optimize.environ_conv : 0.000005s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000004s : 0.00% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000028s : 0.02% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000010s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000002s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000017s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000007s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000009s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000005s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000007s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000014s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000497s : 0.28% validate : 0.000031s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.000040s : 0.02% execute : 0.000001s : 0.00% Time group info: ------[substitution.] 0.000533 92 0.31% : 0.000002s : 1: substitution.elim_not_effective 1.19% : 0.000006s : 9: substitution.float_depend_g_call 0.64% : 0.000003s : 2: substitution.float_tuple_getitem_switch 0.19% : 0.000001s : 1: substitution.fold_const_symbol 0.82% : 0.000004s : 1: substitution.graph_param_transform 0.52% : 0.000003s : 2: substitution.incorporate_call 0.43% : 0.000002s : 2: substitution.incorporate_call_switch 70.68% : 0.000377s : 16: substitution.inline 3.29% : 0.000018s : 2: substitution.inline_without_move 1.22% : 0.000006s : 9: substitution.j_node_and_user_rematch 0.57% : 0.000003s : 2: substitution.minmaximum_grad 3.40% : 0.000018s : 9: substitution.partial_eliminate 1.61% : 0.000009s : 9: substitution.remove_not_recompute_node 0.50% : 0.000003s : 1: substitution.replace_applicator 0.94% : 0.000005s : 7: substitution.replace_old_param 0.54% : 0.000003s : 1: substitution.set_cell_output_no_recompute 5.02% : 0.000027s : 3: substitution.switch_simplify 1.34% : 0.000007s : 2: substitution.tuple_list_convert_item_index_to_positive 0.60% : 0.000003s : 2: substitution.tuple_list_get_item_const_eliminator 0.90% : 0.000005s : 2: substitution.tuple_list_get_item_depend_reorder 4.52% : 0.000024s : 7: substitution.tuple_list_get_item_eliminator 0.79% : 0.000004s : 2: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.155915 2 97.09% : 0.151382s : 1: type_inference.infer 2.91% : 0.004533s : 1: type_inference.specialize ------[replace.] 0.000188 22 64.16% : 0.000121s : 16: replace.inline 23.51% : 0.000044s : 3: replace.switch_simplify 12.34% : 0.000023s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000406 22 90.82% : 0.000369s : 16: match.inline 6.06% : 0.000025s : 3: match.switch_simplify 3.12% : 0.000013s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000333 2147 1.07% : 0.000004s : 28: predicate.accumulaten_eliminater 0.31% : 0.000001s : 1: predicate.ad_related_special_op_eliminate 0.53% : 0.000002s : 13: predicate.addn_check_dump 1.19% : 0.000004s : 28: predicate.addn_zero_filter 1.05% : 0.000003s : 28: predicate.adjust_all_reduce_mul_add 2.15% : 0.000007s : 41: predicate.arithmetic_simplify 1.19% : 0.000004s : 28: predicate.cast_eliminate 0.27% : 0.000001s : 2: predicate.check_bprop_eliminate 0.54% : 0.000002s : 13: predicate.compare_switch_simplify 0.04% : 0.000000s : 1: predicate.const_output_eliminate 0.55% : 0.000002s : 13: predicate.depend_value_elim 1.14% : 0.000004s : 28: predicate.dict_get_item_const_eliminator 1.31% : 0.000004s : 28: predicate.dict_get_item_eliminator 1.13% : 0.000004s : 28: predicate.dict_set_item_eliminator 0.36% : 0.000001s : 2: predicate.dumpgradient_eliminate 0.05% : 0.000000s : 1: predicate.elim_not_effective 0.08% : 0.000000s : 1: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.18% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.12% : 0.000004s : 29: predicate.environ_get_depend_swap 1.65% : 0.000005s : 42: predicate.environ_get_eliminate 1.08% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.94% : 0.000006s : 47: predicate.exchange_switch_depend_value 2.94% : 0.000010s : 47: predicate.float_depend_g_call 0.54% : 0.000002s : 13: predicate.float_environ_get_switch 0.72% : 0.000002s : 14: predicate.float_tuple_getitem_switch 0.03% : 0.000000s : 1: predicate.fold_const_symbol 0.75% : 0.000003s : 13: predicate.get_grad_eliminate 0.10% : 0.000000s : 1: predicate.graph_param_transform 0.54% : 0.000002s : 13: predicate.incorporate_call 0.48% : 0.000002s : 13: predicate.incorporate_call_switch 6.14% : 0.000020s : 103: predicate.inline 2.25% : 0.000007s : 37: predicate.inline_without_move 0.29% : 0.000001s : 13: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 13: predicate.less_batch_normalization 1.48% : 0.000005s : 33: predicate.list_to_tuple_eliminator_ 2.34% : 0.000008s : 61: predicate.load_eliminater 0.42% : 0.000001s : 1: predicate.loop_unroll_after_grad 3.76% : 0.000013s : 86: predicate.loop_unroll_before_grad 1.41% : 0.000005s : 30: predicate.make_slice_get_slice_eliminator 0.58% : 0.000002s : 13: predicate.merge_addn 0.14% : 0.000000s : 2: predicate.micro_step_allgather_replace 0.15% : 0.000001s : 2: predicate.mini_step_allgather_replace 1.06% : 0.000004s : 28: predicate.minmaximum_grad 0.49% : 0.000002s : 1: predicate.mutable_eliminate 0.11% : 0.000000s : 1: predicate.opt_reshape 0.29% : 0.000001s : 1: predicate.parallel_virtual_node 2.71% : 0.000009s : 47: predicate.partial_defer_inline 1.43% : 0.000005s : 32: predicate.partial_eliminate 1.11% : 0.000004s : 28: predicate.print_const_string_wrapper 0.56% : 0.000002s : 13: predicate.reduce_all_const_elim 1.47% : 0.000005s : 28: predicate.reduce_eliminate 2.32% : 0.000008s : 61: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 13: predicate.remove_not_recompute_node 1.19% : 0.000004s : 33: predicate.replace_applicator 1.03% : 0.000003s : 37: predicate.replace_old_param 0.07% : 0.000000s : 1: predicate.reset_defer_inline 1.21% : 0.000004s : 28: predicate.reshape_eliminate 0.21% : 0.000001s : 2: predicate.row_tensor_add_zeros_like 0.15% : 0.000001s : 1: predicate.row_tensor_eliminate 0.27% : 0.000001s : 2: predicate.same_eliminate 0.36% : 0.000001s : 13: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 13: predicate.shard_identity_eliminate 0.23% : 0.000001s : 2: predicate.special_op_eliminate 0.63% : 0.000002s : 13: predicate.specialize_transform 0.47% : 0.000002s : 2: predicate.split_environ_get_set_with_tuple_value 1.97% : 0.000007s : 37: predicate.stack_unstack_eliminate 0.07% : 0.000000s : 1: predicate.switch_call_monad_eliminater 2.15% : 0.000007s : 47: predicate.switch_defer_inline 2.39% : 0.000008s : 49: predicate.switch_layer_defer_inline 7.34% : 0.000024s : 153: predicate.switch_simplify 1.21% : 0.000004s : 28: predicate.tile_eliminate 1.08% : 0.000004s : 28: predicate.transpose_eliminate 1.48% : 0.000005s : 30: predicate.tuple_list_convert_item_index_to_positive 1.41% : 0.000005s : 30: predicate.tuple_list_get_item_const_eliminator 1.22% : 0.000004s : 30: predicate.tuple_list_get_item_depend_reorder 2.64% : 0.000009s : 46: predicate.tuple_list_get_item_eliminator 1.36% : 0.000005s : 30: predicate.tuple_list_get_set_item_eliminator 2.03% : 0.000007s : 43: predicate.tuple_list_set_item_eliminator 1.32% : 0.000004s : 33: predicate.tuple_to_list_eliminator_ 2.28% : 0.000008s : 61: predicate.updatestate_pure_node_eliminater 2.99% : 0.000010s : 74: predicate.updatestate_useless_node_eliminater 0.09% : 0.000000s : 1: predicate.value_based_eliminate 0.79% : 0.000003s : 13: predicate.virtual_dataset_eliminate 0.64% : 0.000002s : 13: predicate.virtual_output_eliminate 0.06% : 0.000000s : 1: predicate.virtual_view_grad_eliminate 0.13% : 0.000000s : 1: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003602 46 79.89% : 0.002878s : 27: func_graph_cloner_run.FuncGraphClonerGraph 20.11% : 0.000724s : 19: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.222326 196 0.00% : 0.000004s : 1: ForceFp32Comm 1.86% : 0.004137s : 1: add_attr 1.85% : 0.004109s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000039s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.10% : 0.000223s : 1: auto_monad 0.01% : 0.000017s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.34% : 0.000762s : 1: bootstrap 0.01% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000013s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000022s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000008s : 1: environ_conv 0.27% : 0.000606s : 1: event_method 0.00% : 0.000005s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000015s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.21% : 0.000460s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.28% : 0.000633s : 1: mutable_eliminate 0.00% : 0.000005s : 1: offloading_packed_experts 0.00% : 0.000010s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000012s : 1: opt.transform.mutable_eliminate 1.15% : 0.002561s : 78: opt.transform.opt_a 0.01% : 0.000014s : 1: opt.transform.opt_after_cconv 0.01% : 0.000016s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000099s : 28: opt.transform.opt_b 0.01% : 0.000026s : 2: opt.transform.opt_trans_graph 0.01% : 0.000024s : 4: opt.transform.symbol_engine_opt 8.06% : 0.017920s : 1: opt_a 0.04% : 0.000083s : 1: opt_after_cconv 0.23% : 0.000508s : 1: opt_after_jit_grad 0.09% : 0.000201s : 1: opt_b 9.13% : 0.020306s : 1: optimize 0.02% : 0.000050s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000020s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000006s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000032s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000006s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.03% : 0.000072s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000017s : 1: remove_dup_value 5.16% : 0.011474s : 1: renormalize.infer 0.37% : 0.000814s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000019s : 1: rewriter_after_opt_a 0.18% : 0.000395s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000007s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000066s : 1: symbol_engine_optimizer 0.02% : 0.000047s : 1: task_emit 0.02% : 0.000055s : 1: tuple_transform 70.22% : 0.156113s : 1: type_inference 0.03% : 0.000060s : 1: validate ... TotalTime = 23.8267, [33] [bootstrap]: 0.00079138 [type_inference]: 0.075848 [event_method]: 0.00015082 [auto_monad]: 0.00018348 [graph_reusing]: 7.80998e-06 [pre_auto_parallel]: 2.326e-05 [py_interpret_to_execute]: 3.78e-05 [rewriter_before_opt_a]: 0.00016704 [expand_dump_flag]: 3.4e-06 [jit_opt_a]: 0.0115741, [2] [Cycle 1]: 0.00291207, [27] [switch_simplify]: 0.00018827 [loop_unroll]: 4.745e-05 [a_1]: 0.00081676 [with_stream_mark]: 2.958e-05 [recompute_prepare]: 7.87e-06 [updatestate_depend_eliminate]: 1.066e-05 [updatestate_assign_eliminate]: 1.383e-05 [updatestate_loads_eliminate]: 3.13e-06 [parameter_eliminate]: 1.60999e-06 [specialize_transform]: 7.82002e-06 [updatestate_useless_node_eliminater]: 6.60002e-06 [accelerated_algorithm]: 6.75998e-06 [meta_shard_fg_expand]: 1.646e-05 [get_grad_eliminate_]: 6.59001e-06 [merge_forward]: 3.33998e-06 [cell_reuse_recompute_pass]: 1.03001e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.869e-05 [j_node_and_user_rematch]: 1.031e-05 [meta_fg_expand]: 2.78e-06 [replace_old_param]: 1.139e-05 [inline_without_move]: 6.54001e-06 [renormalize]: 0.00131883 [add_forward_monad_depend]: 3.081e-05 [auto_monad_grad]: 1.04998e-06 [auto_monad_eliminator]: 1.352e-05 [cse]: 6.189e-05 [replace_applicator]: 1.615e-05 [Cycle 2]: 0.00037211, [27] [switch_simplify]: 7.35e-06 [loop_unroll]: 6.42001e-06 [a_1]: 0.00011999 [with_stream_mark]: 1.034e-05 [recompute_prepare]: 6.05002e-06 [updatestate_depend_eliminate]: 3.33e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 2.41e-06 [parameter_eliminate]: 1.53002e-06 [specialize_transform]: 5.94999e-06 [updatestate_useless_node_eliminater]: 6.33e-06 [accelerated_algorithm]: 6.23998e-06 [meta_shard_fg_expand]: 1.86e-06 [get_grad_eliminate_]: 5.78002e-06 [merge_forward]: 3.56001e-06 [cell_reuse_recompute_pass]: 1.29e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.539e-05 [j_node_and_user_rematch]: 9.12001e-06 [meta_fg_expand]: 2.01998e-06 [replace_old_param]: 9.83002e-06 [inline_without_move]: 5.88002e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 9.79984e-07 [auto_monad_grad]: 6.00005e-07 [auto_monad_eliminator]: 5.55001e-06 [cse]: 1.282e-05 [replace_applicator]: 5.79e-06 [py_interpret_to_execute_after_opt_a]: 1.237e-05 [rewriter_after_opt_a]: 8.475e-05 [convert_after_rewriter]: 5.35001e-06 [order_py_execute_after_rewriter]: 5.24e-06 [mutable_eliminate]: 0.00065372 [jit_opt_b]: 5.459e-05, [1] [Cycle 1]: 4.663e-05, [2] [frontend_op_eliminate]: 1.843e-05 [inline_after_opt_a]: 1.71e-05 [cconv]: 2.623e-05 [loop_unroll]: 0.00041789 [jit_opt_after_cconv]: 0.00018979, [1] [Cycle 1]: 0.00018291, [11] [c_1]: 2.541e-05 [parameter_eliminate]: 2.61e-06 [updatestate_depend_eliminate]: 5.47999e-06 [updatestate_assign_eliminate]: 2.91e-06 [updatestate_loads_eliminate]: 2.63e-06 [cse]: 2.573e-05 [call_graph_tuple_transform]: 5.322e-05 [tuple_list_get_item_eliminator]: 6.58e-06 [none_parameter_eliminate]: 7.39994e-07 [renormalize]: 2.30008e-07 [switch_simplify]: 6.75002e-06 [remove_dup_value]: 7.75e-06 [partial_unused_args_eliminate]: 1.79e-06 [environ_conv]: 2.227e-05 [add_recomputation]: 6.758e-05 [cse_after_recomputation]: 2.614e-05, [1] [Cycle 1]: 2.003e-05, [1] [cse]: 1.405e-05 [auto_monad_reorder]: 2.172e-05 [get_jit_bprop_graph]: 1.45001e-06 [rewriter_after_jit_bprop_graph]: 3.44001e-06 [opt_after_jit_grad]: 0.00047952 [symbol_engine_optimizer]: 7.539e-05, [1] [Cycle 1]: 6.959e-05, [6] [build]: 4.36002e-06 [elim_shapecalc]: 8.22e-06 [elim_not_effective]: 1.27e-05 [opt_reshape]: 6.88998e-06 [fold_const_symbol]: 9.27001e-06 [renormalize]: 7.2e-07 [validate]: 6.655e-05 [backend_pass]: 6.69999e-07 [task_emit]: 23.734 [execute]: 1.085e-05 Sums bootstrap : 0.000791s : 0.00% type_inference : 0.075848s : 0.32% event_method : 0.000151s : 0.00% auto_monad : 0.000183s : 0.00% graph_reusing : 0.000008s : 0.00% pre_auto_parallel : 0.000023s : 0.00% py_interpret_to_execute : 0.000038s : 0.00% rewriter_before_opt_a : 0.000167s : 0.00% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000196s : 0.00% jit_opt_a.loop_unroll : 0.000054s : 0.00% jit_opt_a.a_1 : 0.000937s : 0.00% jit_opt_a.with_stream_mark : 0.000040s : 0.00% jit_opt_a.recompute_prepare : 0.000014s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000014s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000017s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% jit_opt_a.parameter_eliminate : 0.000003s : 0.00% jit_opt_a.specialize_transform : 0.000014s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000013s : 0.00% jit_opt_a.accelerated_algorithm : 0.000013s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000018s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000012s : 0.00% jit_opt_a.merge_forward : 0.000007s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000044s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000019s : 0.00% jit_opt_a.meta_fg_expand : 0.000005s : 0.00% jit_opt_a.replace_old_param : 0.000021s : 0.00% jit_opt_a.inline_without_move : 0.000012s : 0.00% jit_opt_a.renormalize : 0.001319s : 0.01% jit_opt_a.add_forward_monad_depend : 0.000032s : 0.00% jit_opt_a.auto_monad_grad : 0.000002s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000019s : 0.00% jit_opt_a.cse : 0.000075s : 0.00% jit_opt_a.replace_applicator : 0.000022s : 0.00% py_interpret_to_execute_after_opt_a : 0.000012s : 0.00% rewriter_after_opt_a : 0.000085s : 0.00% convert_after_rewriter : 0.000005s : 0.00% order_py_execute_after_rewriter : 0.000005s : 0.00% mutable_eliminate : 0.000654s : 0.00% jit_opt_b.frontend_op_eliminate : 0.000018s : 0.00% jit_opt_b.inline_after_opt_a : 0.000017s : 0.00% cconv : 0.000026s : 0.00% loop_unroll : 0.000418s : 0.00% jit_opt_after_cconv.c_1 : 0.000025s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000026s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000053s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000007s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000007s : 0.00% remove_dup_value : 0.000008s : 0.00% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000022s : 0.00% add_recomputation : 0.000068s : 0.00% cse_after_recomputation.cse : 0.000014s : 0.00% auto_monad_reorder : 0.000022s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000480s : 0.00% symbol_engine_optimizer.build : 0.000004s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000008s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000067s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 23.733983s : 99.65% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000309 33 0.60% : 0.000002s : 2: substitution.elim_not_effective 0.45% : 0.000001s : 2: substitution.fold_const_symbol 5.02% : 0.000016s : 4: substitution.graph_param_transform 70.21% : 0.000217s : 8: substitution.inline 1.03% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.68% : 0.000014s : 4: substitution.remove_not_recompute_node 1.65% : 0.000005s : 4: substitution.replace_old_param 7.04% : 0.000022s : 1: substitution.switch_simplify 9.32% : 0.000029s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.075745 2 97.37% : 0.073750s : 1: type_inference.infer 2.63% : 0.001994s : 1: type_inference.specialize ------[replace.] 0.000113 13 54.85% : 0.000062s : 8: replace.inline 21.82% : 0.000025s : 1: replace.switch_simplify 23.32% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000260 13 81.69% : 0.000213s : 8: match.inline 8.01% : 0.000021s : 1: match.switch_simplify 10.30% : 0.000027s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000171 1126 1.25% : 0.000002s : 17: predicate.accumulaten_eliminater 0.98% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 1.25% : 0.000002s : 17: predicate.addn_check_dump 1.30% : 0.000002s : 17: predicate.addn_zero_filter 2.18% : 0.000004s : 17: predicate.arithmetic_simplify 1.28% : 0.000002s : 17: predicate.cast_eliminate 0.35% : 0.000001s : 4: predicate.check_bprop_eliminate 1.26% : 0.000002s : 17: predicate.compare_switch_simplify 1.22% : 0.000002s : 17: predicate.depend_value_elim 1.26% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.35% : 0.000002s : 17: predicate.dict_get_item_eliminator 1.35% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.59% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.34% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.21% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.20% : 0.000002s : 17: predicate.environ_get_depend_swap 1.25% : 0.000002s : 17: predicate.environ_get_eliminate 1.26% : 0.000002s : 17: predicate.environ_get_set_eliminate 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.85% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 4.86% : 0.000008s : 37: predicate.inline 0.73% : 0.000001s : 8: predicate.inline_without_move 0.43% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.91% : 0.000002s : 8: predicate.less_batch_normalization 1.77% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 2.07% : 0.000004s : 25: predicate.load_eliminater 0.98% : 0.000002s : 4: predicate.loop_unroll_after_grad 4.31% : 0.000007s : 52: predicate.loop_unroll_before_grad 1.95% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 1.33% : 0.000002s : 17: predicate.merge_addn 1.25% : 0.000002s : 17: predicate.minmaximum_grad 1.02% : 0.000002s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 2.34% : 0.000004s : 25: predicate.partial_eliminate 1.39% : 0.000002s : 17: predicate.print_const_string_wrapper 1.77% : 0.000003s : 17: predicate.reduce_eliminate 1.87% : 0.000003s : 21: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 8: predicate.remove_not_recompute_node 2.05% : 0.000003s : 29: predicate.replace_applicator 0.57% : 0.000001s : 8: predicate.replace_old_param 0.22% : 0.000000s : 4: predicate.reset_defer_inline 1.28% : 0.000002s : 17: predicate.reshape_eliminate 1.52% : 0.000003s : 17: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 4: predicate.row_tensor_eliminate 1.32% : 0.000002s : 17: predicate.same_eliminate 0.47% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.02% : 0.000002s : 8: predicate.special_op_eliminate 0.75% : 0.000001s : 8: predicate.specialize_transform 1.54% : 0.000003s : 17: predicate.split_environ_get_set_with_tuple_value 1.36% : 0.000002s : 17: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.87% : 0.000005s : 29: predicate.switch_defer_inline 2.53% : 0.000004s : 29: predicate.switch_layer_defer_inline 8.55% : 0.000015s : 87: predicate.switch_simplify 1.26% : 0.000002s : 17: predicate.tile_eliminate 1.25% : 0.000002s : 17: predicate.transpose_eliminate 1.55% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.40% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000005s : 29: predicate.tuple_list_get_item_eliminator 1.73% : 0.000003s : 17: predicate.tuple_list_set_item_eliminator 1.83% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 2.02% : 0.000003s : 25: predicate.updatestate_pure_node_eliminater 2.88% : 0.000005s : 33: predicate.updatestate_useless_node_eliminater 1.78% : 0.000003s : 17: predicate.value_based_eliminate 0.26% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001338 19 56.77% : 0.000759s : 9: func_graph_cloner_run.FuncGraphClonerGraph 43.23% : 0.000578s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 23.828152 76 0.00% : 0.000071s : 1: add_recomputation 0.00% : 0.000191s : 1: auto_monad 0.00% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: backend_pass 0.00% : 0.000827s : 1: bootstrap 0.00% : 0.000029s : 1: cconv 0.00% : 0.000008s : 1: convert_after_rewriter 0.00% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000025s : 1: environ_conv 0.00% : 0.000157s : 1: event_method 0.00% : 0.000017s : 1: execute 0.00% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.05% : 0.011577s : 1: jit_opt_a 0.00% : 0.000192s : 1: jit_opt_after_cconv 0.00% : 0.000057s : 1: jit_opt_b 0.00% : 0.000425s : 1: loop_unroll 0.00% : 0.000664s : 1: mutable_eliminate 0.01% : 0.001328s : 26: opt.transform.jit_opt_a 0.00% : 0.000089s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000029s : 4: opt.transform.jit_opt_b 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.00% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000034s : 4: opt.transform.symbol_engine_opt 0.00% : 0.000487s : 1: opt_after_jit_grad 0.00% : 0.000007s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000025s : 1: pre_auto_parallel 0.00% : 0.000041s : 1: py_interpret_to_execute 0.00% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000010s : 1: remove_dup_value 0.00% : 0.000609s : 1: renormalize.infer 0.00% : 0.000702s : 1: renormalize.specialize 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000088s : 1: rewriter_after_opt_a 0.00% : 0.000171s : 1: rewriter_before_opt_a 0.00% : 0.000078s : 1: symbol_engine_optimizer 99.61% : 23.734115s : 1: task_emit 0.32% : 0.075872s : 1: type_inference 0.00% : 0.000077s : 1: validate TotalTime = 23.8892, [33] [bootstrap]: 0.00092468 [type_inference]: 0.0813289 [event_method]: 0.00014922 [auto_monad]: 0.00018368 [graph_reusing]: 7.8e-06 [pre_auto_parallel]: 1.889e-05 [py_interpret_to_execute]: 4.005e-05 [rewriter_before_opt_a]: 0.00016891 [expand_dump_flag]: 3.2e-06 [jit_opt_a]: 0.0115742, [2] [Cycle 1]: 0.00291207, [27] [switch_simplify]: 0.00018826 [loop_unroll]: 4.725e-05 [a_1]: 0.00081575 [with_stream_mark]: 2.065e-05 [recompute_prepare]: 9.34998e-06 [updatestate_depend_eliminate]: 1.715e-05 [updatestate_assign_eliminate]: 1.381e-05 [updatestate_loads_eliminate]: 3.27002e-06 [parameter_eliminate]: 1.91e-06 [specialize_transform]: 7.97e-06 [updatestate_useless_node_eliminater]: 6.46e-06 [accelerated_algorithm]: 6.68e-06 [meta_shard_fg_expand]: 1.419e-05 [get_grad_eliminate_]: 7.16999e-06 [merge_forward]: 4.25999e-06 [cell_reuse_recompute_pass]: 4.22998e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.656e-05 [j_node_and_user_rematch]: 1.022e-05 [meta_fg_expand]: 2.48e-06 [replace_old_param]: 1.079e-05 [inline_without_move]: 6.53003e-06 [renormalize]: 0.00131912 [add_forward_monad_depend]: 2.763e-05 [auto_monad_grad]: 9.79984e-07 [auto_monad_eliminator]: 1.64e-05 [cse]: 6.235e-05 [replace_applicator]: 1.614e-05 [Cycle 2]: 0.00037271, [27] [switch_simplify]: 7.07002e-06 [loop_unroll]: 6.04999e-06 [a_1]: 0.00012031 [with_stream_mark]: 1.047e-05 [recompute_prepare]: 6.00002e-06 [updatestate_depend_eliminate]: 3.04999e-06 [updatestate_assign_eliminate]: 2.91999e-06 [updatestate_loads_eliminate]: 2.39999e-06 [parameter_eliminate]: 1.52999e-06 [specialize_transform]: 6.14001e-06 [updatestate_useless_node_eliminater]: 6.10002e-06 [accelerated_algorithm]: 6.23e-06 [meta_shard_fg_expand]: 1.91e-06 [get_grad_eliminate_]: 5.79e-06 [merge_forward]: 3.18e-06 [cell_reuse_recompute_pass]: 1.42999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.555e-05 [j_node_and_user_rematch]: 9.12001e-06 [meta_fg_expand]: 1.93002e-06 [replace_old_param]: 1.009e-05 [inline_without_move]: 5.94999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.05001e-06 [auto_monad_grad]: 5.79981e-07 [auto_monad_eliminator]: 5.79999e-06 [cse]: 1.29e-05 [replace_applicator]: 5.87999e-06 [py_interpret_to_execute_after_opt_a]: 1.232e-05 [rewriter_after_opt_a]: 8.157e-05 [convert_after_rewriter]: 9.22001e-06 [order_py_execute_after_rewriter]: 4.44002e-06 [mutable_eliminate]: 0.00065359 [jit_opt_b]: 5.515e-05, [1] [Cycle 1]: 4.753e-05, [2] [frontend_op_eliminate]: 1.856e-05 [inline_after_opt_a]: 1.822e-05 [cconv]: 2.573e-05 [loop_unroll]: 0.00041724 [jit_opt_after_cconv]: 0.00018034, [1] [Cycle 1]: 0.00017338, [11] [c_1]: 2.546e-05 [parameter_eliminate]: 2.21e-06 [updatestate_depend_eliminate]: 6.34001e-06 [updatestate_assign_eliminate]: 2.87002e-06 [updatestate_loads_eliminate]: 2.51e-06 [cse]: 2.581e-05 [call_graph_tuple_transform]: 4.401e-05 [tuple_list_get_item_eliminator]: 6.75002e-06 [none_parameter_eliminate]: 8.99978e-07 [renormalize]: 2.80008e-07 [switch_simplify]: 6.68e-06 [remove_dup_value]: 1.681e-05 [partial_unused_args_eliminate]: 2.31e-06 [environ_conv]: 2.282e-05 [add_recomputation]: 6.741e-05 [cse_after_recomputation]: 2.581e-05, [1] [Cycle 1]: 1.998e-05, [1] [cse]: 1.404e-05 [auto_monad_reorder]: 2.209e-05 [get_jit_bprop_graph]: 1.25999e-06 [rewriter_after_jit_bprop_graph]: 2.91999e-06 [opt_after_jit_grad]: 0.00047943 [symbol_engine_optimizer]: 7.512e-05, [1] [Cycle 1]: 6.953e-05, [6] [build]: 4.28999e-06 [elim_shapecalc]: 8.18999e-06 [elim_not_effective]: 1.358e-05 [opt_reshape]: 6.47001e-06 [fold_const_symbol]: 9.91e-06 [renormalize]: 3.30008e-07 [validate]: 7.916e-05 [backend_pass]: 6.90023e-07 [task_emit]: 23.7905 [execute]: 9.95002e-06 Sums bootstrap : 0.000925s : 0.00% type_inference : 0.081329s : 0.34% event_method : 0.000149s : 0.00% auto_monad : 0.000184s : 0.00% graph_reusing : 0.000008s : 0.00% pre_auto_parallel : 0.000019s : 0.00% py_interpret_to_execute : 0.000040s : 0.00% rewriter_before_opt_a : 0.000169s : 0.00% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000195s : 0.00% jit_opt_a.loop_unroll : 0.000053s : 0.00% jit_opt_a.a_1 : 0.000936s : 0.00% jit_opt_a.with_stream_mark : 0.000031s : 0.00% jit_opt_a.recompute_prepare : 0.000015s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000020s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000017s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% jit_opt_a.parameter_eliminate : 0.000003s : 0.00% jit_opt_a.specialize_transform : 0.000014s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000013s : 0.00% jit_opt_a.accelerated_algorithm : 0.000013s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000016s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000013s : 0.00% jit_opt_a.merge_forward : 0.000007s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000042s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000019s : 0.00% jit_opt_a.meta_fg_expand : 0.000004s : 0.00% jit_opt_a.replace_old_param : 0.000021s : 0.00% jit_opt_a.inline_without_move : 0.000012s : 0.00% jit_opt_a.renormalize : 0.001319s : 0.01% jit_opt_a.add_forward_monad_depend : 0.000029s : 0.00% jit_opt_a.auto_monad_grad : 0.000002s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000022s : 0.00% jit_opt_a.cse : 0.000075s : 0.00% jit_opt_a.replace_applicator : 0.000022s : 0.00% py_interpret_to_execute_after_opt_a : 0.000012s : 0.00% rewriter_after_opt_a : 0.000082s : 0.00% convert_after_rewriter : 0.000009s : 0.00% order_py_execute_after_rewriter : 0.000004s : 0.00% mutable_eliminate : 0.000654s : 0.00% jit_opt_b.frontend_op_eliminate : 0.000019s : 0.00% jit_opt_b.inline_after_opt_a : 0.000018s : 0.00% cconv : 0.000026s : 0.00% loop_unroll : 0.000417s : 0.00% jit_opt_after_cconv.c_1 : 0.000025s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000026s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000044s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000007s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000007s : 0.00% remove_dup_value : 0.000017s : 0.00% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000023s : 0.00% add_recomputation : 0.000067s : 0.00% cse_after_recomputation.cse : 0.000014s : 0.00% auto_monad_reorder : 0.000022s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000479s : 0.00% symbol_engine_optimizer.build : 0.000004s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000008s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000079s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 23.790466s : 99.63% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000325 33 0.59% : 0.000002s : 2: substitution.elim_not_effective 0.40% : 0.000001s : 2: substitution.fold_const_symbol 8.90% : 0.000029s : 4: substitution.graph_param_transform 66.60% : 0.000217s : 8: substitution.inline 0.87% : 0.000003s : 4: substitution.j_node_and_user_rematch 5.35% : 0.000017s : 4: substitution.remove_not_recompute_node 1.35% : 0.000004s : 4: substitution.replace_old_param 7.00% : 0.000023s : 1: substitution.switch_simplify 8.91% : 0.000029s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.081237 2 97.53% : 0.079231s : 1: type_inference.infer 2.47% : 0.002007s : 1: type_inference.specialize ------[replace.] 0.000112 13 55.92% : 0.000063s : 8: replace.inline 21.48% : 0.000024s : 1: replace.switch_simplify 22.60% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000262 13 81.22% : 0.000212s : 8: match.inline 8.40% : 0.000022s : 1: match.switch_simplify 10.37% : 0.000027s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000171 1126 1.33% : 0.000002s : 17: predicate.accumulaten_eliminater 0.82% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 1.21% : 0.000002s : 17: predicate.addn_check_dump 1.39% : 0.000002s : 17: predicate.addn_zero_filter 2.37% : 0.000004s : 17: predicate.arithmetic_simplify 1.29% : 0.000002s : 17: predicate.cast_eliminate 0.40% : 0.000001s : 4: predicate.check_bprop_eliminate 1.19% : 0.000002s : 17: predicate.compare_switch_simplify 1.28% : 0.000002s : 17: predicate.depend_value_elim 1.25% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.37% : 0.000002s : 17: predicate.dict_get_item_eliminator 1.29% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.53% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.42% : 0.000001s : 4: predicate.elim_not_effective 0.40% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.29% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.19% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.20% : 0.000002s : 17: predicate.environ_get_depend_swap 1.28% : 0.000002s : 17: predicate.environ_get_eliminate 1.21% : 0.000002s : 17: predicate.environ_get_set_eliminate 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.75% : 0.000001s : 8: predicate.get_grad_eliminate 0.23% : 0.000000s : 4: predicate.graph_param_transform 4.91% : 0.000008s : 37: predicate.inline 0.74% : 0.000001s : 8: predicate.inline_without_move 0.44% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.88% : 0.000002s : 8: predicate.less_batch_normalization 1.83% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 2.08% : 0.000004s : 25: predicate.load_eliminater 0.93% : 0.000002s : 4: predicate.loop_unroll_after_grad 4.31% : 0.000007s : 52: predicate.loop_unroll_before_grad 1.79% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 1.25% : 0.000002s : 17: predicate.merge_addn 1.24% : 0.000002s : 17: predicate.minmaximum_grad 1.09% : 0.000002s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 2.29% : 0.000004s : 25: predicate.partial_eliminate 1.21% : 0.000002s : 17: predicate.print_const_string_wrapper 1.88% : 0.000003s : 17: predicate.reduce_eliminate 1.64% : 0.000003s : 21: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 2.01% : 0.000003s : 29: predicate.replace_applicator 0.44% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000000s : 4: predicate.reset_defer_inline 1.34% : 0.000002s : 17: predicate.reshape_eliminate 1.30% : 0.000002s : 17: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 4: predicate.row_tensor_eliminate 1.24% : 0.000002s : 17: predicate.same_eliminate 0.51% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.85% : 0.000001s : 8: predicate.special_op_eliminate 0.78% : 0.000001s : 8: predicate.specialize_transform 1.56% : 0.000003s : 17: predicate.split_environ_get_set_with_tuple_value 1.37% : 0.000002s : 17: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.84% : 0.000005s : 29: predicate.switch_defer_inline 2.53% : 0.000004s : 29: predicate.switch_layer_defer_inline 8.78% : 0.000015s : 87: predicate.switch_simplify 1.29% : 0.000002s : 17: predicate.tile_eliminate 1.33% : 0.000002s : 17: predicate.transpose_eliminate 1.57% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000003s : 17: predicate.tuple_list_get_item_depend_reorder 3.52% : 0.000006s : 29: predicate.tuple_list_get_item_eliminator 1.65% : 0.000003s : 17: predicate.tuple_list_set_item_eliminator 1.74% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 1.93% : 0.000003s : 25: predicate.updatestate_pure_node_eliminater 2.83% : 0.000005s : 33: predicate.updatestate_useless_node_eliminater 1.98% : 0.000003s : 17: predicate.value_based_eliminate 0.28% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001414 19 59.52% : 0.000841s : 9: func_graph_cloner_run.FuncGraphClonerGraph 40.48% : 0.000572s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 23.890235 76 0.00% : 0.000071s : 1: add_recomputation 0.00% : 0.000191s : 1: auto_monad 0.00% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: backend_pass 0.00% : 0.000966s : 1: bootstrap 0.00% : 0.000028s : 1: cconv 0.00% : 0.000012s : 1: convert_after_rewriter 0.00% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000026s : 1: environ_conv 0.00% : 0.000157s : 1: event_method 0.00% : 0.000015s : 1: execute 0.00% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.05% : 0.011577s : 1: jit_opt_a 0.00% : 0.000183s : 1: jit_opt_after_cconv 0.00% : 0.000058s : 1: jit_opt_b 0.00% : 0.000425s : 1: loop_unroll 0.00% : 0.000664s : 1: mutable_eliminate 0.01% : 0.001331s : 26: opt.transform.jit_opt_a 0.00% : 0.000079s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000031s : 4: opt.transform.jit_opt_b 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.00% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000035s : 4: opt.transform.symbol_engine_opt 0.00% : 0.000487s : 1: opt_after_jit_grad 0.00% : 0.000006s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000021s : 1: pre_auto_parallel 0.00% : 0.000043s : 1: py_interpret_to_execute 0.00% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000019s : 1: remove_dup_value 0.00% : 0.000609s : 1: renormalize.infer 0.00% : 0.000703s : 1: renormalize.specialize 0.00% : 0.000005s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000085s : 1: rewriter_after_opt_a 0.00% : 0.000172s : 1: rewriter_before_opt_a 0.00% : 0.000077s : 1: symbol_engine_optimizer 99.58% : 23.790572s : 1: task_emit 0.34% : 0.081353s : 1: type_inference 0.00% : 0.000090s : 1: validate TotalTime = 25.2403, [33] [bootstrap]: 0.00079145 [type_inference]: 0.0758482 [event_method]: 0.0001494 [auto_monad]: 0.00018352 [graph_reusing]: 7.85998e-06 [pre_auto_parallel]: 1.491e-05 [py_interpret_to_execute]: 4.393e-05 [rewriter_before_opt_a]: 0.00016903 [expand_dump_flag]: 5.23002e-06 [jit_opt_a]: 0.0115723, [2] [Cycle 1]: 0.0029129, [27] [switch_simplify]: 0.00018817 [loop_unroll]: 4.742e-05 [a_1]: 0.00081726 [with_stream_mark]: 2.076e-05 [recompute_prepare]: 9.42001e-06 [updatestate_depend_eliminate]: 1.713e-05 [updatestate_assign_eliminate]: 1.361e-05 [updatestate_loads_eliminate]: 5.40001e-06 [parameter_eliminate]: 8.2e-07 [specialize_transform]: 7.5e-06 [updatestate_useless_node_eliminater]: 6.47001e-06 [accelerated_algorithm]: 6.39999e-06 [meta_shard_fg_expand]: 1.477e-05 [get_grad_eliminate_]: 6.51e-06 [merge_forward]: 3.93001e-06 [cell_reuse_recompute_pass]: 1.12e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.888e-05 [j_node_and_user_rematch]: 1.049e-05 [meta_fg_expand]: 2.91e-06 [replace_old_param]: 1.184e-05 [inline_without_move]: 6.49999e-06 [renormalize]: 0.00131902 [add_forward_monad_depend]: 1.759e-05 [auto_monad_grad]: 3.35e-06 [auto_monad_eliminator]: 2.44e-05 [cse]: 6.243e-05 [replace_applicator]: 1.62e-05 [Cycle 2]: 0.0003728, [27] [switch_simplify]: 7.1e-06 [loop_unroll]: 6.41998e-06 [a_1]: 0.00011986 [with_stream_mark]: 1.086e-05 [recompute_prepare]: 6.04001e-06 [updatestate_depend_eliminate]: 3.2e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 2.46e-06 [parameter_eliminate]: 1.51002e-06 [specialize_transform]: 6.50002e-06 [updatestate_useless_node_eliminater]: 6.24001e-06 [accelerated_algorithm]: 6.49001e-06 [meta_shard_fg_expand]: 1.69998e-06 [get_grad_eliminate_]: 5.91e-06 [merge_forward]: 3.38999e-06 [cell_reuse_recompute_pass]: 1.53002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.546e-05 [j_node_and_user_rematch]: 9.01002e-06 [meta_fg_expand]: 2.12999e-06 [replace_old_param]: 9.67999e-06 [inline_without_move]: 6.01e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.02e-06 [auto_monad_grad]: 6.00005e-07 [auto_monad_eliminator]: 5.53002e-06 [cse]: 1.283e-05 [replace_applicator]: 6.19999e-06 [py_interpret_to_execute_after_opt_a]: 1.225e-05 [rewriter_after_opt_a]: 8.189e-05 [convert_after_rewriter]: 1.001e-05 [order_py_execute_after_rewriter]: 4.05e-06 [mutable_eliminate]: 0.00065312 [jit_opt_b]: 5.408e-05, [1] [Cycle 1]: 4.71e-05, [2] [frontend_op_eliminate]: 1.869e-05 [inline_after_opt_a]: 1.758e-05 [cconv]: 2.622e-05 [loop_unroll]: 0.00041729 [jit_opt_after_cconv]: 0.00018392, [1] [Cycle 1]: 0.00017673, [11] [c_1]: 2.518e-05 [parameter_eliminate]: 3.00998e-06 [updatestate_depend_eliminate]: 5.98002e-06 [updatestate_assign_eliminate]: 2.83e-06 [updatestate_loads_eliminate]: 2.41e-06 [cse]: 2.581e-05 [call_graph_tuple_transform]: 2.083e-05 [tuple_list_get_item_eliminator]: 6.16e-06 [none_parameter_eliminate]: 1.36998e-06 [renormalize]: 3.50003e-07 [switch_simplify]: 7.35e-06 [remove_dup_value]: 1.319e-05 [partial_unused_args_eliminate]: 2.38002e-06 [environ_conv]: 2.267e-05 [add_recomputation]: 7.078e-05 [cse_after_recomputation]: 2.335e-05, [1] [Cycle 1]: 1.814e-05, [1] [cse]: 1.221e-05 [auto_monad_reorder]: 2.106e-05 [get_jit_bprop_graph]: 1.40999e-06 [rewriter_after_jit_bprop_graph]: 2.88003e-06 [opt_after_jit_grad]: 0.00047963 [symbol_engine_optimizer]: 7.624e-05, [1] [Cycle 1]: 7.058e-05, [6] [build]: 4.25999e-06 [elim_shapecalc]: 8.21002e-06 [elim_not_effective]: 1.339e-05 [opt_reshape]: 6.80998e-06 [fold_const_symbol]: 9.72999e-06 [renormalize]: 3.09985e-07 [validate]: 5.984e-05 [backend_pass]: 8.59989e-07 [task_emit]: 25.148 [execute]: 9.69e-06 Sums bootstrap : 0.000791s : 0.00% type_inference : 0.075848s : 0.30% event_method : 0.000149s : 0.00% auto_monad : 0.000184s : 0.00% graph_reusing : 0.000008s : 0.00% pre_auto_parallel : 0.000015s : 0.00% py_interpret_to_execute : 0.000044s : 0.00% rewriter_before_opt_a : 0.000169s : 0.00% expand_dump_flag : 0.000005s : 0.00% jit_opt_a.switch_simplify : 0.000195s : 0.00% jit_opt_a.loop_unroll : 0.000054s : 0.00% jit_opt_a.a_1 : 0.000937s : 0.00% jit_opt_a.with_stream_mark : 0.000032s : 0.00% jit_opt_a.recompute_prepare : 0.000015s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000020s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000017s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% jit_opt_a.parameter_eliminate : 0.000002s : 0.00% jit_opt_a.specialize_transform : 0.000014s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000013s : 0.00% jit_opt_a.accelerated_algorithm : 0.000013s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000016s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000012s : 0.00% jit_opt_a.merge_forward : 0.000007s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000044s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000020s : 0.00% jit_opt_a.meta_fg_expand : 0.000005s : 0.00% jit_opt_a.replace_old_param : 0.000022s : 0.00% jit_opt_a.inline_without_move : 0.000013s : 0.00% jit_opt_a.renormalize : 0.001319s : 0.01% jit_opt_a.add_forward_monad_depend : 0.000019s : 0.00% jit_opt_a.auto_monad_grad : 0.000004s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000030s : 0.00% jit_opt_a.cse : 0.000075s : 0.00% jit_opt_a.replace_applicator : 0.000022s : 0.00% py_interpret_to_execute_after_opt_a : 0.000012s : 0.00% rewriter_after_opt_a : 0.000082s : 0.00% convert_after_rewriter : 0.000010s : 0.00% order_py_execute_after_rewriter : 0.000004s : 0.00% mutable_eliminate : 0.000653s : 0.00% jit_opt_b.frontend_op_eliminate : 0.000019s : 0.00% jit_opt_b.inline_after_opt_a : 0.000018s : 0.00% cconv : 0.000026s : 0.00% loop_unroll : 0.000417s : 0.00% jit_opt_after_cconv.c_1 : 0.000025s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.cse : 0.000026s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000021s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000006s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000007s : 0.00% remove_dup_value : 0.000013s : 0.00% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000023s : 0.00% add_recomputation : 0.000071s : 0.00% cse_after_recomputation.cse : 0.000012s : 0.00% auto_monad_reorder : 0.000021s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000480s : 0.00% symbol_engine_optimizer.build : 0.000004s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000008s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000060s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 25.147970s : 99.67% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000298 33 0.65% : 0.000002s : 2: substitution.elim_not_effective 0.33% : 0.000001s : 2: substitution.fold_const_symbol 2.06% : 0.000006s : 4: substitution.graph_param_transform 73.53% : 0.000219s : 8: substitution.inline 1.07% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.83% : 0.000014s : 4: substitution.remove_not_recompute_node 1.94% : 0.000006s : 4: substitution.replace_old_param 7.29% : 0.000022s : 1: substitution.switch_simplify 8.31% : 0.000025s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.075732 2 97.04% : 0.073489s : 1: type_inference.infer 2.96% : 0.002243s : 1: type_inference.specialize ------[replace.] 0.000114 13 55.33% : 0.000063s : 8: replace.inline 21.80% : 0.000025s : 1: replace.switch_simplify 22.87% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000258 13 83.15% : 0.000215s : 8: match.inline 8.09% : 0.000021s : 1: match.switch_simplify 8.76% : 0.000023s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000172 1126 1.46% : 0.000003s : 17: predicate.accumulaten_eliminater 0.77% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 1.24% : 0.000002s : 17: predicate.addn_check_dump 1.29% : 0.000002s : 17: predicate.addn_zero_filter 2.36% : 0.000004s : 17: predicate.arithmetic_simplify 1.37% : 0.000002s : 17: predicate.cast_eliminate 0.40% : 0.000001s : 4: predicate.check_bprop_eliminate 1.18% : 0.000002s : 17: predicate.compare_switch_simplify 1.30% : 0.000002s : 17: predicate.depend_value_elim 1.25% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.32% : 0.000002s : 17: predicate.dict_get_item_eliminator 1.30% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.66% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.30% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.27% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.26% : 0.000002s : 17: predicate.environ_get_depend_swap 1.27% : 0.000002s : 17: predicate.environ_get_eliminate 1.32% : 0.000002s : 17: predicate.environ_get_set_eliminate 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.93% : 0.000002s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 5.18% : 0.000009s : 37: predicate.inline 0.80% : 0.000001s : 8: predicate.inline_without_move 0.50% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.90% : 0.000002s : 8: predicate.less_batch_normalization 1.92% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 2.04% : 0.000004s : 25: predicate.load_eliminater 0.87% : 0.000001s : 4: predicate.loop_unroll_after_grad 4.41% : 0.000008s : 52: predicate.loop_unroll_before_grad 1.77% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 1.20% : 0.000002s : 17: predicate.merge_addn 1.29% : 0.000002s : 17: predicate.minmaximum_grad 1.13% : 0.000002s : 4: predicate.mutable_eliminate 0.41% : 0.000001s : 4: predicate.opt_reshape 2.24% : 0.000004s : 25: predicate.partial_eliminate 1.48% : 0.000003s : 17: predicate.print_const_string_wrapper 1.78% : 0.000003s : 17: predicate.reduce_eliminate 1.65% : 0.000003s : 21: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 2.01% : 0.000003s : 29: predicate.replace_applicator 0.59% : 0.000001s : 8: predicate.replace_old_param 0.23% : 0.000000s : 4: predicate.reset_defer_inline 1.48% : 0.000003s : 17: predicate.reshape_eliminate 1.32% : 0.000002s : 17: predicate.row_tensor_add_zeros_like 0.50% : 0.000001s : 4: predicate.row_tensor_eliminate 1.30% : 0.000002s : 17: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.74% : 0.000001s : 8: predicate.special_op_eliminate 0.76% : 0.000001s : 8: predicate.specialize_transform 1.54% : 0.000003s : 17: predicate.split_environ_get_set_with_tuple_value 1.44% : 0.000002s : 17: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.91% : 0.000005s : 29: predicate.switch_defer_inline 2.53% : 0.000004s : 29: predicate.switch_layer_defer_inline 7.86% : 0.000014s : 87: predicate.switch_simplify 1.46% : 0.000003s : 17: predicate.tile_eliminate 1.43% : 0.000002s : 17: predicate.transpose_eliminate 1.55% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 2.99% : 0.000005s : 29: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 17: predicate.tuple_list_set_item_eliminator 2.01% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 1.92% : 0.000003s : 25: predicate.updatestate_pure_node_eliminater 2.76% : 0.000005s : 33: predicate.updatestate_useless_node_eliminater 1.83% : 0.000003s : 17: predicate.value_based_eliminate 0.26% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001363 19 55.04% : 0.000750s : 9: func_graph_cloner_run.FuncGraphClonerGraph 44.96% : 0.000613s : 10: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 25.242038 76 0.00% : 0.000074s : 1: add_recomputation 0.00% : 0.000191s : 1: auto_monad 0.00% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: backend_pass 0.00% : 0.000826s : 1: bootstrap 0.00% : 0.000029s : 1: cconv 0.00% : 0.000012s : 1: convert_after_rewriter 0.00% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000025s : 1: environ_conv 0.00% : 0.000157s : 1: event_method 0.00% : 0.000016s : 1: execute 0.00% : 0.000007s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.05% : 0.011576s : 1: jit_opt_a 0.00% : 0.000187s : 1: jit_opt_after_cconv 0.00% : 0.000057s : 1: jit_opt_b 0.00% : 0.000424s : 1: loop_unroll 0.00% : 0.000664s : 1: mutable_eliminate 0.01% : 0.001331s : 26: opt.transform.jit_opt_a 0.00% : 0.000056s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000030s : 4: opt.transform.jit_opt_b 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.00% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000035s : 4: opt.transform.symbol_engine_opt 0.00% : 0.000487s : 1: opt_after_jit_grad 0.00% : 0.000006s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000017s : 1: pre_auto_parallel 0.00% : 0.000047s : 1: py_interpret_to_execute 0.00% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000016s : 1: remove_dup_value 0.00% : 0.000609s : 1: renormalize.infer 0.00% : 0.000703s : 1: renormalize.specialize 0.00% : 0.000005s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000085s : 1: rewriter_after_opt_a 0.00% : 0.000173s : 1: rewriter_before_opt_a 0.00% : 0.000079s : 1: symbol_engine_optimizer 99.63% : 25.148031s : 1: task_emit 0.30% : 0.075872s : 1: type_inference 0.00% : 0.000075s : 1: validate ... TotalTime = 10.2674, [33] [bootstrap]: 0.00072682 [type_inference]: 0.0768391 [event_method]: 0.00033522 [auto_monad]: 0.00017872 [graph_reusing]: 1.118e-05 [pre_auto_parallel]: 5.15999e-06 [py_interpret_to_execute]: 5.955e-05 [rewriter_before_opt_a]: 0.0002095 [expand_dump_flag]: 5.07999e-06 [jit_opt_a]: 0.0247359, [3] [Cycle 1]: 0.0166279, [27] [switch_simplify]: 0.00020018 [loop_unroll]: 8.338e-05 [a_1]: 0.00163946 [with_stream_mark]: 2.563e-05 [recompute_prepare]: 2.37e-05 [updatestate_depend_eliminate]: 1.051e-05 [updatestate_assign_eliminate]: 8.42998e-06 [updatestate_loads_eliminate]: 7.91001e-06 [parameter_eliminate]: 2.53e-06 [specialize_transform]: 1.757e-05 [updatestate_useless_node_eliminater]: 1.62e-05 [accelerated_algorithm]: 5.818e-05 [meta_shard_fg_expand]: 5.35001e-06 [get_grad_eliminate_]: 1.737e-05 [merge_forward]: 1.008e-05 [cell_reuse_recompute_pass]: 9.20001e-07 [cell_reuse_handle_not_recompute_node_pass]: 3.221e-05 [j_node_and_user_rematch]: 2.832e-05 [meta_fg_expand]: 0.00188726 [replace_old_param]: 7.238e-05 [inline_without_move]: 6.408e-05 [renormalize]: 0.011661 [add_forward_monad_depend]: 3.145e-05 [auto_monad_grad]: 6.28e-06 [auto_monad_eliminator]: 6.374e-05 [cse]: 0.00034351 [replace_applicator]: 8.083e-05 [Cycle 2]: 0.00312205, [27] [switch_simplify]: 4.82e-05 [loop_unroll]: 4.611e-05 [a_1]: 0.00140098 [with_stream_mark]: 1.376e-05 [recompute_prepare]: 1.089e-05 [updatestate_depend_eliminate]: 5.20999e-06 [updatestate_assign_eliminate]: 4.10998e-06 [updatestate_loads_eliminate]: 3.67002e-06 [parameter_eliminate]: 9.79984e-07 [specialize_transform]: 9.72999e-06 [updatestate_useless_node_eliminater]: 8.67998e-06 [accelerated_algorithm]: 1.163e-05 [meta_shard_fg_expand]: 1.95001e-06 [get_grad_eliminate_]: 8.35999e-06 [merge_forward]: 4.11001e-06 [cell_reuse_recompute_pass]: 8.49977e-07 [cell_reuse_handle_not_recompute_node_pass]: 1.704e-05 [j_node_and_user_rematch]: 1.411e-05 [meta_fg_expand]: 0.0002465 [replace_old_param]: 1.901e-05 [inline_without_move]: 9.53002e-06 [renormalize]: 0.00093585 [add_forward_monad_depend]: 4.41002e-06 [auto_monad_grad]: 1.24e-06 [auto_monad_eliminator]: 1.396e-05 [cse]: 9.98e-05 [replace_applicator]: 1.65e-05 [Cycle 3]: 0.00049634, [27] [switch_simplify]: 1.016e-05 [loop_unroll]: 8.99e-06 [a_1]: 0.00018151 [with_stream_mark]: 1.133e-05 [recompute_prepare]: 8.61002e-06 [updatestate_depend_eliminate]: 4.65001e-06 [updatestate_assign_eliminate]: 3.91999e-06 [updatestate_loads_eliminate]: 3.53999e-06 [parameter_eliminate]: 1.09e-06 [specialize_transform]: 8.69998e-06 [updatestate_useless_node_eliminater]: 8.48999e-06 [accelerated_algorithm]: 1.442e-05 [meta_shard_fg_expand]: 1.71002e-06 [get_grad_eliminate_]: 8.29998e-06 [merge_forward]: 4.3e-06 [cell_reuse_recompute_pass]: 1.42999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.68e-05 [j_node_and_user_rematch]: 1.287e-05 [meta_fg_expand]: 2.75002e-06 [replace_old_param]: 1.141e-05 [inline_without_move]: 8.03999e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.45001e-06 [auto_monad_grad]: 1.00999e-06 [auto_monad_eliminator]: 9.19e-06 [cse]: 2.593e-05 [replace_applicator]: 8.60999e-06 [py_interpret_to_execute_after_opt_a]: 1.254e-05 [rewriter_after_opt_a]: 0.00014965 [convert_after_rewriter]: 1.014e-05 [order_py_execute_after_rewriter]: 6.54001e-06 [mutable_eliminate]: 0.00048989 [jit_opt_b]: 7.379e-05, [1] [Cycle 1]: 6.735e-05, [2] [frontend_op_eliminate]: 2.869e-05 [inline_after_opt_a]: 2.712e-05 [cconv]: 2.081e-05 [loop_unroll]: 0.00042757 [jit_opt_after_cconv]: 0.00019502, [1] [Cycle 1]: 0.00018894, [11] [c_1]: 3.715e-05 [parameter_eliminate]: 2.39001e-06 [updatestate_depend_eliminate]: 7.18e-06 [updatestate_assign_eliminate]: 4.28999e-06 [updatestate_loads_eliminate]: 3.87002e-06 [cse]: 3.507e-05 [call_graph_tuple_transform]: 2.669e-05 [tuple_list_get_item_eliminator]: 9.29e-06 [none_parameter_eliminate]: 1.51998e-06 [renormalize]: 3.59985e-07 [switch_simplify]: 9.47999e-06 [remove_dup_value]: 3.317e-05 [partial_unused_args_eliminate]: 2.44001e-06 [environ_conv]: 9.61003e-06 [add_recomputation]: 5.701e-05 [cse_after_recomputation]: 3.263e-05, [1] [Cycle 1]: 2.683e-05, [1] [cse]: 2.028e-05 [auto_monad_reorder]: 2.031e-05 [get_jit_bprop_graph]: 1.43002e-06 [rewriter_after_jit_bprop_graph]: 5.30999e-06 [opt_after_jit_grad]: 0.00047916 [symbol_engine_optimizer]: 0.00012302, [1] [Cycle 1]: 0.00011706, [6] [build]: 3.333e-05 [elim_shapecalc]: 1.25e-05 [elim_not_effective]: 1.877e-05 [opt_reshape]: 9.42999e-06 [fold_const_symbol]: 1.386e-05 [renormalize]: 4.39992e-07 [validate]: 4.454e-05 [backend_pass]: 9.70002e-07 [task_emit]: 10.1619 [execute]: 9.08002e-06 Sums bootstrap : 0.000727s : 0.01% type_inference : 0.076839s : 0.75% event_method : 0.000335s : 0.00% auto_monad : 0.000179s : 0.00% graph_reusing : 0.000011s : 0.00% pre_auto_parallel : 0.000005s : 0.00% py_interpret_to_execute : 0.000060s : 0.00% rewriter_before_opt_a : 0.000209s : 0.00% expand_dump_flag : 0.000005s : 0.00% jit_opt_a.switch_simplify : 0.000259s : 0.00% jit_opt_a.loop_unroll : 0.000138s : 0.00% jit_opt_a.a_1 : 0.003222s : 0.03% jit_opt_a.with_stream_mark : 0.000051s : 0.00% jit_opt_a.recompute_prepare : 0.000043s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000020s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000015s : 0.00% jit_opt_a.parameter_eliminate : 0.000005s : 0.00% jit_opt_a.specialize_transform : 0.000036s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000033s : 0.00% jit_opt_a.accelerated_algorithm : 0.000084s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000009s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000034s : 0.00% jit_opt_a.merge_forward : 0.000018s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000066s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000055s : 0.00% jit_opt_a.meta_fg_expand : 0.002137s : 0.02% jit_opt_a.replace_old_param : 0.000103s : 0.00% jit_opt_a.inline_without_move : 0.000082s : 0.00% jit_opt_a.renormalize : 0.012597s : 0.12% jit_opt_a.add_forward_monad_depend : 0.000037s : 0.00% jit_opt_a.auto_monad_grad : 0.000009s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000087s : 0.00% jit_opt_a.cse : 0.000469s : 0.00% jit_opt_a.replace_applicator : 0.000106s : 0.00% py_interpret_to_execute_after_opt_a : 0.000013s : 0.00% rewriter_after_opt_a : 0.000150s : 0.00% convert_after_rewriter : 0.000010s : 0.00% order_py_execute_after_rewriter : 0.000007s : 0.00% mutable_eliminate : 0.000490s : 0.00% jit_opt_b.frontend_op_eliminate : 0.000029s : 0.00% jit_opt_b.inline_after_opt_a : 0.000027s : 0.00% cconv : 0.000021s : 0.00% loop_unroll : 0.000428s : 0.00% jit_opt_after_cconv.c_1 : 0.000037s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.cse : 0.000035s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000027s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000009s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000009s : 0.00% remove_dup_value : 0.000033s : 0.00% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000010s : 0.00% add_recomputation : 0.000057s : 0.00% cse_after_recomputation.cse : 0.000020s : 0.00% auto_monad_reorder : 0.000020s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000479s : 0.00% symbol_engine_optimizer.build : 0.000033s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000045s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 10.161855s : 99.02% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000836 174 0.29% : 0.000002s : 4: substitution.elim_not_effective 0.23% : 0.000002s : 4: substitution.fold_const_symbol 0.83% : 0.000007s : 6: substitution.graph_param_transform 66.25% : 0.000554s : 23: substitution.inline 1.98% : 0.000017s : 2: substitution.inline_without_move 1.19% : 0.000010s : 18: substitution.j_node_and_user_rematch 4.06% : 0.000034s : 3: substitution.less_batch_normalization 1.54% : 0.000013s : 11: substitution.minmaximum_grad 2.49% : 0.000021s : 10: substitution.partial_eliminate 1.62% : 0.000014s : 18: substitution.remove_not_recompute_node 2.77% : 0.000023s : 9: substitution.replace_applicator 1.30% : 0.000011s : 16: substitution.replace_old_param 0.33% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.55% : 0.000013s : 4: substitution.switch_simplify 3.45% : 0.000029s : 11: substitution.tuple_list_convert_item_index_to_positive 3.42% : 0.000029s : 11: substitution.tuple_list_get_item_depend_reorder 6.69% : 0.000056s : 23: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.076729 2 95.40% : 0.073197s : 1: type_inference.infer 4.60% : 0.003533s : 1: type_inference.specialize ------[replace.] 0.000312 39 57.18% : 0.000178s : 23: replace.inline 15.63% : 0.000049s : 4: replace.switch_simplify 27.18% : 0.000085s : 12: replace.tuple_list_get_item_eliminator ------[match.] 0.000581 39 93.15% : 0.000541s : 23: match.inline 1.82% : 0.000011s : 4: match.switch_simplify 5.03% : 0.000029s : 12: match.tuple_list_get_item_eliminator ------[predicate.] 0.000562 3976 1.45% : 0.000008s : 66: predicate.accumulaten_eliminater 0.38% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 1.40% : 0.000008s : 66: predicate.addn_check_dump 1.63% : 0.000009s : 66: predicate.addn_zero_filter 2.27% : 0.000013s : 66: predicate.arithmetic_simplify 1.53% : 0.000009s : 66: predicate.cast_eliminate 0.19% : 0.000001s : 6: predicate.check_bprop_eliminate 1.39% : 0.000008s : 66: predicate.compare_switch_simplify 1.48% : 0.000008s : 66: predicate.depend_value_elim 1.41% : 0.000008s : 66: predicate.dict_get_item_const_eliminator 1.50% : 0.000008s : 66: predicate.dict_get_item_eliminator 1.42% : 0.000008s : 66: predicate.dict_set_item_eliminator 0.26% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.12% : 0.000001s : 6: predicate.elim_not_effective 0.21% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.43% : 0.000008s : 66: predicate.environ_add_const_eliminate 1.43% : 0.000008s : 66: predicate.environ_get_add_eliminate 1.43% : 0.000008s : 66: predicate.environ_get_depend_swap 1.54% : 0.000009s : 66: predicate.environ_get_eliminate 1.41% : 0.000008s : 66: predicate.environ_get_set_eliminate 0.09% : 0.000001s : 6: predicate.fold_const_symbol 0.74% : 0.000004s : 27: predicate.get_grad_eliminate 0.12% : 0.000001s : 6: predicate.graph_param_transform 4.41% : 0.000025s : 113: predicate.inline 1.63% : 0.000009s : 56: predicate.inline_without_move 0.37% : 0.000002s : 27: predicate.j_node_and_user_rematch 0.84% : 0.000005s : 27: predicate.less_batch_normalization 1.82% : 0.000010s : 78: predicate.list_to_tuple_eliminator_ 2.08% : 0.000012s : 84: predicate.load_eliminater 0.39% : 0.000002s : 6: predicate.loop_unroll_after_grad 3.67% : 0.000021s : 147: predicate.loop_unroll_before_grad 1.73% : 0.000010s : 72: predicate.make_slice_get_slice_eliminator 1.43% : 0.000008s : 66: predicate.merge_addn 1.57% : 0.000009s : 66: predicate.minmaximum_grad 0.47% : 0.000003s : 6: predicate.mutable_eliminate 0.18% : 0.000001s : 6: predicate.opt_reshape 2.44% : 0.000014s : 84: predicate.partial_eliminate 1.48% : 0.000008s : 66: predicate.print_const_string_wrapper 1.96% : 0.000011s : 66: predicate.reduce_eliminate 1.82% : 0.000010s : 78: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000002s : 27: predicate.remove_not_recompute_node 2.45% : 0.000014s : 137: predicate.replace_applicator 0.85% : 0.000005s : 56: predicate.replace_old_param 0.11% : 0.000001s : 6: predicate.reset_defer_inline 1.52% : 0.000009s : 66: predicate.reshape_eliminate 1.44% : 0.000008s : 66: predicate.row_tensor_add_zeros_like 0.23% : 0.000001s : 6: predicate.row_tensor_eliminate 1.49% : 0.000008s : 66: predicate.same_eliminate 0.44% : 0.000002s : 27: predicate.set_cell_output_no_recompute 0.40% : 0.000002s : 12: predicate.special_op_eliminate 0.80% : 0.000004s : 27: predicate.specialize_transform 1.74% : 0.000010s : 66: predicate.split_environ_get_set_with_tuple_value 1.60% : 0.000009s : 66: predicate.stack_unstack_eliminate 0.16% : 0.000001s : 6: predicate.switch_call_monad_eliminater 3.02% : 0.000017s : 101: predicate.switch_defer_inline 2.64% : 0.000015s : 101: predicate.switch_layer_defer_inline 7.07% : 0.000040s : 262: predicate.switch_simplify 1.58% : 0.000009s : 66: predicate.tile_eliminate 1.51% : 0.000008s : 66: predicate.transpose_eliminate 1.93% : 0.000011s : 66: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000010s : 66: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000018s : 90: predicate.tuple_list_get_item_eliminator 1.83% : 0.000010s : 66: predicate.tuple_list_set_item_eliminator 1.85% : 0.000010s : 78: predicate.tuple_to_list_eliminator_ 1.93% : 0.000011s : 84: predicate.updatestate_pure_node_eliminater 2.75% : 0.000015s : 111: predicate.updatestate_useless_node_eliminater 1.93% : 0.000011s : 66: predicate.value_based_eliminate 0.15% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.20% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003560 50 63.87% : 0.002274s : 23: func_graph_cloner_run.FuncGraphClonerGraph 36.13% : 0.001286s : 27: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 10.284376 91 0.00% : 0.000060s : 1: add_recomputation 0.00% : 0.000185s : 1: auto_monad 0.00% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: backend_pass 0.01% : 0.000744s : 1: bootstrap 0.00% : 0.000024s : 1: cconv 0.00% : 0.000013s : 1: convert_after_rewriter 0.00% : 0.000035s : 1: cse_after_recomputation 0.00% : 0.000012s : 1: environ_conv 0.00% : 0.000344s : 1: event_method 0.00% : 0.000014s : 1: execute 0.00% : 0.000007s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.24% : 0.024739s : 1: jit_opt_a 0.00% : 0.000198s : 1: jit_opt_after_cconv 0.00% : 0.000077s : 1: jit_opt_b 0.00% : 0.000435s : 1: loop_unroll 0.00% : 0.000497s : 1: mutable_eliminate 0.04% : 0.004215s : 39: opt.transform.jit_opt_a 0.00% : 0.000079s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000049s : 4: opt.transform.jit_opt_b 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000018s : 1: opt.transform.mutable_eliminate 0.00% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000051s : 4: opt.transform.symbol_engine_opt 0.00% : 0.000487s : 1: opt_after_jit_grad 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000007s : 1: pre_auto_parallel 0.00% : 0.000063s : 1: py_interpret_to_execute 0.00% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000036s : 1: remove_dup_value 0.10% : 0.010347s : 2: renormalize.infer 0.02% : 0.002234s : 2: renormalize.specialize 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000153s : 1: rewriter_after_opt_a 0.00% : 0.000213s : 1: rewriter_before_opt_a 0.00% : 0.000126s : 1: symbol_engine_optimizer 98.81% : 10.161876s : 1: task_emit 0.75% : 0.076852s : 1: type_inference 0.00% : 0.000059s : 1: validate TotalTime = 10.5517, [33] [bootstrap]: 0.00052108 [type_inference]: 0.0730422 [event_method]: 0.0003017 [auto_monad]: 0.0001645 [graph_reusing]: 1.006e-05 [pre_auto_parallel]: 3.79002e-06 [py_interpret_to_execute]: 5.982e-05 [rewriter_before_opt_a]: 0.00020357 [expand_dump_flag]: 3.93999e-06 [jit_opt_a]: 0.0244651, [3] [Cycle 1]: 0.0166156, [27] [switch_simplify]: 0.00019395 [loop_unroll]: 8.326e-05 [a_1]: 0.00159677 [with_stream_mark]: 2.369e-05 [recompute_prepare]: 2.311e-05 [updatestate_depend_eliminate]: 9.59e-06 [updatestate_assign_eliminate]: 7.97e-06 [updatestate_loads_eliminate]: 7.61001e-06 [parameter_eliminate]: 1.83997e-06 [specialize_transform]: 1.763e-05 [updatestate_useless_node_eliminater]: 1.623e-05 [accelerated_algorithm]: 5.52e-05 [meta_shard_fg_expand]: 5.22999e-06 [get_grad_eliminate_]: 1.694e-05 [merge_forward]: 1.711e-05 [cell_reuse_recompute_pass]: 1.04998e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.149e-05 [j_node_and_user_rematch]: 2.88e-05 [meta_fg_expand]: 0.00187957 [replace_old_param]: 7.096e-05 [inline_without_move]: 6.527e-05 [renormalize]: 0.0117221 [add_forward_monad_depend]: 2.811e-05 [auto_monad_grad]: 5.49e-06 [auto_monad_eliminator]: 6.135e-05 [cse]: 0.00033642 [replace_applicator]: 8.288e-05 [Cycle 2]: 0.00311253, [27] [switch_simplify]: 4.899e-05 [loop_unroll]: 4.585e-05 [a_1]: 0.00140255 [with_stream_mark]: 1.313e-05 [recompute_prepare]: 1.073e-05 [updatestate_depend_eliminate]: 5.07999e-06 [updatestate_assign_eliminate]: 4.35999e-06 [updatestate_loads_eliminate]: 3.86999e-06 [parameter_eliminate]: 1.09e-06 [specialize_transform]: 9.11002e-06 [updatestate_useless_node_eliminater]: 8.87e-06 [accelerated_algorithm]: 1.195e-05 [meta_shard_fg_expand]: 2.09e-06 [get_grad_eliminate_]: 8.46002e-06 [merge_forward]: 4.30999e-06 [cell_reuse_recompute_pass]: 8.89995e-07 [cell_reuse_handle_not_recompute_node_pass]: 1.703e-05 [j_node_and_user_rematch]: 1.436e-05 [meta_fg_expand]: 0.00024127 [replace_old_param]: 2.033e-05 [inline_without_move]: 9.95002e-06 [renormalize]: 0.00089539 [add_forward_monad_depend]: 4.47e-06 [auto_monad_grad]: 1.24998e-06 [auto_monad_eliminator]: 1.409e-05 [cse]: 0.00010215 [replace_applicator]: 1.637e-05 [Cycle 3]: 0.00052974, [27] [switch_simplify]: 9.64e-06 [loop_unroll]: 8.69e-06 [a_1]: 0.00020653 [with_stream_mark]: 1.206e-05 [recompute_prepare]: 8.92e-06 [updatestate_depend_eliminate]: 4.55001e-06 [updatestate_assign_eliminate]: 3.93001e-06 [updatestate_loads_eliminate]: 3.68e-06 [parameter_eliminate]: 9.89996e-07 [specialize_transform]: 8.67e-06 [updatestate_useless_node_eliminater]: 8.62e-06 [accelerated_algorithm]: 1.44e-05 [meta_shard_fg_expand]: 1.74998e-06 [get_grad_eliminate_]: 8.25e-06 [merge_forward]: 4.18999e-06 [cell_reuse_recompute_pass]: 1.43002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.618e-05 [j_node_and_user_rematch]: 1.34e-05 [meta_fg_expand]: 2.88e-06 [replace_old_param]: 1.16e-05 [inline_without_move]: 8.29002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.47001e-06 [auto_monad_grad]: 1.04003e-06 [auto_monad_eliminator]: 9.95002e-06 [cse]: 3.353e-05 [replace_applicator]: 9.49999e-06 [py_interpret_to_execute_after_opt_a]: 1.283e-05 [rewriter_after_opt_a]: 0.00014837 [convert_after_rewriter]: 9.89001e-06 [order_py_execute_after_rewriter]: 6.81001e-06 [mutable_eliminate]: 0.00050587 [jit_opt_b]: 7.357e-05, [1] [Cycle 1]: 6.684e-05, [2] [frontend_op_eliminate]: 2.812e-05 [inline_after_opt_a]: 2.691e-05 [cconv]: 1.929e-05 [loop_unroll]: 0.00042952 [jit_opt_after_cconv]: 0.00019501, [1] [Cycle 1]: 0.00018884, [11] [c_1]: 3.67e-05 [parameter_eliminate]: 2.31e-06 [updatestate_depend_eliminate]: 7.23e-06 [updatestate_assign_eliminate]: 4.07998e-06 [updatestate_loads_eliminate]: 3.78999e-06 [cse]: 3.61e-05 [call_graph_tuple_transform]: 2.676e-05 [tuple_list_get_item_eliminator]: 9.19e-06 [none_parameter_eliminate]: 1.19998e-06 [renormalize]: 4.30009e-07 [switch_simplify]: 9.25999e-06 [remove_dup_value]: 3.219e-05 [partial_unused_args_eliminate]: 2.20002e-06 [environ_conv]: 9.01002e-06 [add_recomputation]: 5.265e-05 [cse_after_recomputation]: 3.306e-05, [1] [Cycle 1]: 2.744e-05, [1] [cse]: 2.131e-05 [auto_monad_reorder]: 1.794e-05 [get_jit_bprop_graph]: 2.05002e-06 [rewriter_after_jit_bprop_graph]: 4.67e-06 [opt_after_jit_grad]: 0.00048483 [symbol_engine_optimizer]: 0.00012221, [1] [Cycle 1]: 0.0001163, [6] [build]: 3.072e-05 [elim_shapecalc]: 1.322e-05 [elim_not_effective]: 1.853e-05 [opt_reshape]: 1.026e-05 [fold_const_symbol]: 1.419e-05 [renormalize]: 4.60015e-07 [validate]: 4.557e-05 [backend_pass]: 7.59988e-07 [task_emit]: 10.4505 [execute]: 9.07999e-06 Sums bootstrap : 0.000521s : 0.00% type_inference : 0.073042s : 0.69% event_method : 0.000302s : 0.00% auto_monad : 0.000165s : 0.00% graph_reusing : 0.000010s : 0.00% pre_auto_parallel : 0.000004s : 0.00% py_interpret_to_execute : 0.000060s : 0.00% rewriter_before_opt_a : 0.000204s : 0.00% expand_dump_flag : 0.000004s : 0.00% jit_opt_a.switch_simplify : 0.000253s : 0.00% jit_opt_a.loop_unroll : 0.000138s : 0.00% jit_opt_a.a_1 : 0.003206s : 0.03% jit_opt_a.with_stream_mark : 0.000049s : 0.00% jit_opt_a.recompute_prepare : 0.000043s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000019s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000015s : 0.00% jit_opt_a.parameter_eliminate : 0.000004s : 0.00% jit_opt_a.specialize_transform : 0.000035s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000034s : 0.00% jit_opt_a.accelerated_algorithm : 0.000082s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000009s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000034s : 0.00% jit_opt_a.merge_forward : 0.000026s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000065s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000057s : 0.00% jit_opt_a.meta_fg_expand : 0.002124s : 0.02% jit_opt_a.replace_old_param : 0.000103s : 0.00% jit_opt_a.inline_without_move : 0.000084s : 0.00% jit_opt_a.renormalize : 0.012618s : 0.12% jit_opt_a.add_forward_monad_depend : 0.000034s : 0.00% jit_opt_a.auto_monad_grad : 0.000008s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000085s : 0.00% jit_opt_a.cse : 0.000472s : 0.00% jit_opt_a.replace_applicator : 0.000109s : 0.00% py_interpret_to_execute_after_opt_a : 0.000013s : 0.00% rewriter_after_opt_a : 0.000148s : 0.00% convert_after_rewriter : 0.000010s : 0.00% order_py_execute_after_rewriter : 0.000007s : 0.00% mutable_eliminate : 0.000506s : 0.00% jit_opt_b.frontend_op_eliminate : 0.000028s : 0.00% jit_opt_b.inline_after_opt_a : 0.000027s : 0.00% cconv : 0.000019s : 0.00% loop_unroll : 0.000430s : 0.00% jit_opt_after_cconv.c_1 : 0.000037s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.cse : 0.000036s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000027s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000009s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000009s : 0.00% remove_dup_value : 0.000032s : 0.00% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000009s : 0.00% add_recomputation : 0.000053s : 0.00% cse_after_recomputation.cse : 0.000021s : 0.00% auto_monad_reorder : 0.000018s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000485s : 0.00% symbol_engine_optimizer.build : 0.000031s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000046s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 10.450471s : 99.09% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000795 174 0.32% : 0.000003s : 4: substitution.elim_not_effective 0.25% : 0.000002s : 4: substitution.fold_const_symbol 0.82% : 0.000007s : 6: substitution.graph_param_transform 67.20% : 0.000534s : 23: substitution.inline 2.15% : 0.000017s : 2: substitution.inline_without_move 1.22% : 0.000010s : 18: substitution.j_node_and_user_rematch 4.07% : 0.000032s : 3: substitution.less_batch_normalization 1.62% : 0.000013s : 11: substitution.minmaximum_grad 2.36% : 0.000019s : 10: substitution.partial_eliminate 1.66% : 0.000013s : 18: substitution.remove_not_recompute_node 2.92% : 0.000023s : 9: substitution.replace_applicator 1.35% : 0.000011s : 16: substitution.replace_old_param 0.34% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.48% : 0.000012s : 4: substitution.switch_simplify 3.34% : 0.000027s : 11: substitution.tuple_list_convert_item_index_to_positive 2.34% : 0.000019s : 11: substitution.tuple_list_get_item_depend_reorder 6.53% : 0.000052s : 23: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.072940 2 95.31% : 0.069516s : 1: type_inference.infer 4.69% : 0.003424s : 1: type_inference.specialize ------[replace.] 0.000311 39 56.10% : 0.000175s : 23: replace.inline 15.80% : 0.000049s : 4: replace.switch_simplify 28.10% : 0.000088s : 12: replace.tuple_list_get_item_eliminator ------[match.] 0.000556 39 93.75% : 0.000521s : 23: match.inline 1.70% : 0.000009s : 4: match.switch_simplify 4.55% : 0.000025s : 12: match.tuple_list_get_item_eliminator ------[predicate.] 0.000581 3976 1.43% : 0.000008s : 66: predicate.accumulaten_eliminater 0.36% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 1.39% : 0.000008s : 66: predicate.addn_check_dump 1.44% : 0.000008s : 66: predicate.addn_zero_filter 2.04% : 0.000012s : 66: predicate.arithmetic_simplify 1.44% : 0.000008s : 66: predicate.cast_eliminate 0.16% : 0.000001s : 6: predicate.check_bprop_eliminate 1.38% : 0.000008s : 66: predicate.compare_switch_simplify 1.44% : 0.000008s : 66: predicate.depend_value_elim 1.41% : 0.000008s : 66: predicate.dict_get_item_const_eliminator 1.47% : 0.000009s : 66: predicate.dict_get_item_eliminator 1.51% : 0.000009s : 66: predicate.dict_set_item_eliminator 0.24% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.12% : 0.000001s : 6: predicate.elim_not_effective 0.17% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.40% : 0.000008s : 66: predicate.environ_add_const_eliminate 1.38% : 0.000008s : 66: predicate.environ_get_add_eliminate 1.35% : 0.000008s : 66: predicate.environ_get_depend_swap 1.45% : 0.000008s : 66: predicate.environ_get_eliminate 1.36% : 0.000008s : 66: predicate.environ_get_set_eliminate 0.09% : 0.000001s : 6: predicate.fold_const_symbol 0.70% : 0.000004s : 27: predicate.get_grad_eliminate 0.11% : 0.000001s : 6: predicate.graph_param_transform 4.14% : 0.000024s : 113: predicate.inline 1.56% : 0.000009s : 56: predicate.inline_without_move 0.34% : 0.000002s : 27: predicate.j_node_and_user_rematch 0.83% : 0.000005s : 27: predicate.less_batch_normalization 1.73% : 0.000010s : 78: predicate.list_to_tuple_eliminator_ 1.93% : 0.000011s : 84: predicate.load_eliminater 0.42% : 0.000002s : 6: predicate.loop_unroll_after_grad 3.54% : 0.000021s : 147: predicate.loop_unroll_before_grad 1.64% : 0.000010s : 72: predicate.make_slice_get_slice_eliminator 1.36% : 0.000008s : 66: predicate.merge_addn 1.43% : 0.000008s : 66: predicate.minmaximum_grad 0.43% : 0.000002s : 6: predicate.mutable_eliminate 0.18% : 0.000001s : 6: predicate.opt_reshape 2.34% : 0.000014s : 84: predicate.partial_eliminate 1.40% : 0.000008s : 66: predicate.print_const_string_wrapper 1.92% : 0.000011s : 66: predicate.reduce_eliminate 1.73% : 0.000010s : 78: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000002s : 27: predicate.remove_not_recompute_node 2.38% : 0.000014s : 137: predicate.replace_applicator 0.84% : 0.000005s : 56: predicate.replace_old_param 0.10% : 0.000001s : 6: predicate.reset_defer_inline 5.71% : 0.000033s : 66: predicate.reshape_eliminate 1.40% : 0.000008s : 66: predicate.row_tensor_add_zeros_like 0.22% : 0.000001s : 6: predicate.row_tensor_eliminate 1.45% : 0.000008s : 66: predicate.same_eliminate 0.43% : 0.000002s : 27: predicate.set_cell_output_no_recompute 0.40% : 0.000002s : 12: predicate.special_op_eliminate 0.76% : 0.000004s : 27: predicate.specialize_transform 1.68% : 0.000010s : 66: predicate.split_environ_get_set_with_tuple_value 1.44% : 0.000008s : 66: predicate.stack_unstack_eliminate 0.15% : 0.000001s : 6: predicate.switch_call_monad_eliminater 2.87% : 0.000017s : 101: predicate.switch_defer_inline 2.57% : 0.000015s : 101: predicate.switch_layer_defer_inline 6.78% : 0.000039s : 262: predicate.switch_simplify 1.48% : 0.000009s : 66: predicate.tile_eliminate 1.43% : 0.000008s : 66: predicate.transpose_eliminate 1.82% : 0.000011s : 66: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000010s : 66: predicate.tuple_list_get_item_depend_reorder 2.84% : 0.000016s : 90: predicate.tuple_list_get_item_eliminator 1.77% : 0.000010s : 66: predicate.tuple_list_set_item_eliminator 1.93% : 0.000011s : 78: predicate.tuple_to_list_eliminator_ 1.77% : 0.000010s : 84: predicate.updatestate_pure_node_eliminater 2.73% : 0.000016s : 111: predicate.updatestate_useless_node_eliminater 1.88% : 0.000011s : 66: predicate.value_based_eliminate 0.14% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.21% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003276 50 61.27% : 0.002007s : 23: func_graph_cloner_run.FuncGraphClonerGraph 38.73% : 0.001269s : 27: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 10.568669 91 0.00% : 0.000056s : 1: add_recomputation 0.00% : 0.000172s : 1: auto_monad 0.00% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: backend_pass 0.01% : 0.000535s : 1: bootstrap 0.00% : 0.000022s : 1: cconv 0.00% : 0.000012s : 1: convert_after_rewriter 0.00% : 0.000035s : 1: cse_after_recomputation 0.00% : 0.000011s : 1: environ_conv 0.00% : 0.000309s : 1: event_method 0.00% : 0.000015s : 1: execute 0.00% : 0.000006s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.23% : 0.024468s : 1: jit_opt_a 0.00% : 0.000198s : 1: jit_opt_after_cconv 0.00% : 0.000076s : 1: jit_opt_b 0.00% : 0.000437s : 1: loop_unroll 0.00% : 0.000514s : 1: mutable_eliminate 0.04% : 0.004193s : 39: opt.transform.jit_opt_a 0.00% : 0.000078s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000048s : 4: opt.transform.jit_opt_b 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000019s : 1: opt.transform.mutable_eliminate 0.00% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000052s : 4: opt.transform.symbol_engine_opt 0.00% : 0.000492s : 1: opt_after_jit_grad 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pre_auto_parallel 0.00% : 0.000063s : 1: py_interpret_to_execute 0.00% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000035s : 1: remove_dup_value 0.10% : 0.010385s : 2: renormalize.infer 0.02% : 0.002216s : 2: renormalize.specialize 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000152s : 1: rewriter_after_opt_a 0.00% : 0.000207s : 1: rewriter_before_opt_a 0.00% : 0.000125s : 1: symbol_engine_optimizer 98.88% : 10.450492s : 1: task_emit 0.69% : 0.073055s : 1: type_inference 0.00% : 0.000060s : 1: validate TotalTime = 10.2565, [33] [bootstrap]: 0.00075579 [type_inference]: 0.0760477 [event_method]: 0.00034916 [auto_monad]: 0.00017773 [graph_reusing]: 1.191e-05 [pre_auto_parallel]: 4.27e-06 [py_interpret_to_execute]: 6.121e-05 [rewriter_before_opt_a]: 0.00020883 [expand_dump_flag]: 4.82e-06 [jit_opt_a]: 0.0241996, [3] [Cycle 1]: 0.0164516, [27] [switch_simplify]: 0.00019677 [loop_unroll]: 8.224e-05 [a_1]: 0.00161375 [with_stream_mark]: 2.519e-05 [recompute_prepare]: 2.338e-05 [updatestate_depend_eliminate]: 1.027e-05 [updatestate_assign_eliminate]: 9.12001e-06 [updatestate_loads_eliminate]: 8.18999e-06 [parameter_eliminate]: 2.57001e-06 [specialize_transform]: 1.79e-05 [updatestate_useless_node_eliminater]: 1.624e-05 [accelerated_algorithm]: 5.513e-05 [meta_shard_fg_expand]: 5.12e-06 [get_grad_eliminate_]: 1.701e-05 [merge_forward]: 1.044e-05 [cell_reuse_recompute_pass]: 1.15999e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.315e-05 [j_node_and_user_rematch]: 2.842e-05 [meta_fg_expand]: 0.0018748 [replace_old_param]: 7.205e-05 [inline_without_move]: 6.52e-05 [renormalize]: 0.0115323 [add_forward_monad_depend]: 3.416e-05 [auto_monad_grad]: 6.12999e-06 [auto_monad_eliminator]: 6.147e-05 [cse]: 0.00033903 [replace_applicator]: 7.916e-05 [Cycle 2]: 0.00303619, [27] [switch_simplify]: 4.829e-05 [loop_unroll]: 4.617e-05 [a_1]: 0.00139791 [with_stream_mark]: 1.228e-05 [recompute_prepare]: 1.04e-05 [updatestate_depend_eliminate]: 5.12e-06 [updatestate_assign_eliminate]: 4.1e-06 [updatestate_loads_eliminate]: 3.63999e-06 [parameter_eliminate]: 1.20001e-06 [specialize_transform]: 9.84001e-06 [updatestate_useless_node_eliminater]: 9.15999e-06 [accelerated_algorithm]: 1.188e-05 [meta_shard_fg_expand]: 2.16e-06 [get_grad_eliminate_]: 8.49002e-06 [merge_forward]: 4.04002e-06 [cell_reuse_recompute_pass]: 9.5999e-07 [cell_reuse_handle_not_recompute_node_pass]: 1.7e-05 [j_node_and_user_rematch]: 1.367e-05 [meta_fg_expand]: 0.00020517 [replace_old_param]: 1.755e-05 [inline_without_move]: 9.59999e-06 [renormalize]: 0.00089925 [add_forward_monad_depend]: 4.35e-06 [auto_monad_grad]: 1.25999e-06 [auto_monad_eliminator]: 1.425e-05 [cse]: 9.678e-05 [replace_applicator]: 1.653e-05 [Cycle 3]: 0.00049183, [27] [switch_simplify]: 9.94001e-06 [loop_unroll]: 8.85999e-06 [a_1]: 0.00017837 [with_stream_mark]: 1.075e-05 [recompute_prepare]: 8.91997e-06 [updatestate_depend_eliminate]: 4.75001e-06 [updatestate_assign_eliminate]: 3.93001e-06 [updatestate_loads_eliminate]: 3.63e-06 [parameter_eliminate]: 8.89995e-07 [specialize_transform]: 8.74e-06 [updatestate_useless_node_eliminater]: 8.73001e-06 [accelerated_algorithm]: 1.468e-05 [meta_shard_fg_expand]: 1.74e-06 [get_grad_eliminate_]: 8.62e-06 [merge_forward]: 4.32e-06 [cell_reuse_recompute_pass]: 1.37e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.692e-05 [j_node_and_user_rematch]: 1.324e-05 [meta_fg_expand]: 2.87002e-06 [replace_old_param]: 1.166e-05 [inline_without_move]: 8.85001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.40999e-06 [auto_monad_grad]: 9.20001e-07 [auto_monad_eliminator]: 9.69e-06 [cse]: 2.517e-05 [replace_applicator]: 8.66002e-06 [py_interpret_to_execute_after_opt_a]: 1.135e-05 [rewriter_after_opt_a]: 0.00014583 [convert_after_rewriter]: 1.033e-05 [order_py_execute_after_rewriter]: 6.92002e-06 [mutable_eliminate]: 0.00048293 [jit_opt_b]: 7.31e-05, [1] [Cycle 1]: 6.661e-05, [2] [frontend_op_eliminate]: 2.786e-05 [inline_after_opt_a]: 2.666e-05 [cconv]: 2.105e-05 [loop_unroll]: 0.000435 [jit_opt_after_cconv]: 0.00019636, [1] [Cycle 1]: 0.00018997, [11] [c_1]: 3.754e-05 [parameter_eliminate]: 2.44001e-06 [updatestate_depend_eliminate]: 7.38e-06 [updatestate_assign_eliminate]: 4.06001e-06 [updatestate_loads_eliminate]: 3.82002e-06 [cse]: 3.536e-05 [call_graph_tuple_transform]: 2.678e-05 [tuple_list_get_item_eliminator]: 9.55001e-06 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 3.19997e-07 [switch_simplify]: 9.54e-06 [remove_dup_value]: 3.616e-05 [partial_unused_args_eliminate]: 2.29999e-06 [environ_conv]: 9.71e-06 [add_recomputation]: 5.775e-05 [cse_after_recomputation]: 3.176e-05, [1] [Cycle 1]: 2.623e-05, [1] [cse]: 2.025e-05 [auto_monad_reorder]: 2.163e-05 [get_jit_bprop_graph]: 1.44e-06 [rewriter_after_jit_bprop_graph]: 4.91997e-06 [opt_after_jit_grad]: 0.00048415 [symbol_engine_optimizer]: 0.00012627, [1] [Cycle 1]: 0.00011988, [6] [build]: 3.339e-05 [elim_shapecalc]: 1.29e-05 [elim_not_effective]: 1.978e-05 [opt_reshape]: 9.54e-06 [fold_const_symbol]: 1.486e-05 [renormalize]: 4.00003e-07 [validate]: 6.717e-05 [backend_pass]: 9.5999e-07 [task_emit]: 10.1523 [execute]: 9.02e-06 Sums bootstrap : 0.000756s : 0.01% type_inference : 0.076048s : 0.74% event_method : 0.000349s : 0.00% auto_monad : 0.000178s : 0.00% graph_reusing : 0.000012s : 0.00% pre_auto_parallel : 0.000004s : 0.00% py_interpret_to_execute : 0.000061s : 0.00% rewriter_before_opt_a : 0.000209s : 0.00% expand_dump_flag : 0.000005s : 0.00% jit_opt_a.switch_simplify : 0.000255s : 0.00% jit_opt_a.loop_unroll : 0.000137s : 0.00% jit_opt_a.a_1 : 0.003190s : 0.03% jit_opt_a.with_stream_mark : 0.000048s : 0.00% jit_opt_a.recompute_prepare : 0.000043s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000020s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000017s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000015s : 0.00% jit_opt_a.parameter_eliminate : 0.000005s : 0.00% jit_opt_a.specialize_transform : 0.000036s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000034s : 0.00% jit_opt_a.accelerated_algorithm : 0.000082s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000009s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000034s : 0.00% jit_opt_a.merge_forward : 0.000019s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000067s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000055s : 0.00% jit_opt_a.meta_fg_expand : 0.002083s : 0.02% jit_opt_a.replace_old_param : 0.000101s : 0.00% jit_opt_a.inline_without_move : 0.000084s : 0.00% jit_opt_a.renormalize : 0.012432s : 0.12% jit_opt_a.add_forward_monad_depend : 0.000040s : 0.00% jit_opt_a.auto_monad_grad : 0.000008s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000085s : 0.00% jit_opt_a.cse : 0.000461s : 0.00% jit_opt_a.replace_applicator : 0.000104s : 0.00% py_interpret_to_execute_after_opt_a : 0.000011s : 0.00% rewriter_after_opt_a : 0.000146s : 0.00% convert_after_rewriter : 0.000010s : 0.00% order_py_execute_after_rewriter : 0.000007s : 0.00% mutable_eliminate : 0.000483s : 0.00% jit_opt_b.frontend_op_eliminate : 0.000028s : 0.00% jit_opt_b.inline_after_opt_a : 0.000027s : 0.00% cconv : 0.000021s : 0.00% loop_unroll : 0.000435s : 0.00% jit_opt_after_cconv.c_1 : 0.000038s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.cse : 0.000035s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000027s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000010s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000010s : 0.00% remove_dup_value : 0.000036s : 0.00% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000010s : 0.00% add_recomputation : 0.000058s : 0.00% cse_after_recomputation.cse : 0.000020s : 0.00% auto_monad_reorder : 0.000022s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000484s : 0.00% symbol_engine_optimizer.build : 0.000033s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000067s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 10.152267s : 99.03% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000816 174 0.36% : 0.000003s : 4: substitution.elim_not_effective 0.27% : 0.000002s : 4: substitution.fold_const_symbol 0.84% : 0.000007s : 6: substitution.graph_param_transform 65.83% : 0.000537s : 23: substitution.inline 2.11% : 0.000017s : 2: substitution.inline_without_move 1.23% : 0.000010s : 18: substitution.j_node_and_user_rematch 4.16% : 0.000034s : 3: substitution.less_batch_normalization 1.52% : 0.000012s : 11: substitution.minmaximum_grad 3.42% : 0.000028s : 10: substitution.partial_eliminate 1.70% : 0.000014s : 18: substitution.remove_not_recompute_node 2.87% : 0.000023s : 9: substitution.replace_applicator 1.33% : 0.000011s : 16: substitution.replace_old_param 0.37% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.67% : 0.000014s : 4: substitution.switch_simplify 3.40% : 0.000028s : 11: substitution.tuple_list_convert_item_index_to_positive 2.20% : 0.000018s : 11: substitution.tuple_list_get_item_depend_reorder 6.72% : 0.000055s : 23: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.075940 2 95.63% : 0.072623s : 1: type_inference.infer 4.37% : 0.003317s : 1: type_inference.specialize ------[replace.] 0.000304 39 57.15% : 0.000174s : 23: replace.inline 15.21% : 0.000046s : 4: replace.switch_simplify 27.65% : 0.000084s : 12: replace.tuple_list_get_item_eliminator ------[match.] 0.000564 39 92.92% : 0.000524s : 23: match.inline 2.00% : 0.000011s : 4: match.switch_simplify 5.08% : 0.000029s : 12: match.tuple_list_get_item_eliminator ------[predicate.] 0.000555 3976 1.51% : 0.000008s : 66: predicate.accumulaten_eliminater 0.36% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 1.41% : 0.000008s : 66: predicate.addn_check_dump 1.46% : 0.000008s : 66: predicate.addn_zero_filter 2.22% : 0.000012s : 66: predicate.arithmetic_simplify 1.48% : 0.000008s : 66: predicate.cast_eliminate 0.16% : 0.000001s : 6: predicate.check_bprop_eliminate 1.44% : 0.000008s : 66: predicate.compare_switch_simplify 1.49% : 0.000008s : 66: predicate.depend_value_elim 1.42% : 0.000008s : 66: predicate.dict_get_item_const_eliminator 1.53% : 0.000008s : 66: predicate.dict_get_item_eliminator 1.48% : 0.000008s : 66: predicate.dict_set_item_eliminator 0.26% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.12% : 0.000001s : 6: predicate.elim_not_effective 0.20% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.48% : 0.000008s : 66: predicate.environ_add_const_eliminate 1.42% : 0.000008s : 66: predicate.environ_get_add_eliminate 1.47% : 0.000008s : 66: predicate.environ_get_depend_swap 1.45% : 0.000008s : 66: predicate.environ_get_eliminate 1.46% : 0.000008s : 66: predicate.environ_get_set_eliminate 0.10% : 0.000001s : 6: predicate.fold_const_symbol 0.77% : 0.000004s : 27: predicate.get_grad_eliminate 0.09% : 0.000001s : 6: predicate.graph_param_transform 4.37% : 0.000024s : 113: predicate.inline 1.70% : 0.000009s : 56: predicate.inline_without_move 0.36% : 0.000002s : 27: predicate.j_node_and_user_rematch 0.90% : 0.000005s : 27: predicate.less_batch_normalization 1.84% : 0.000010s : 78: predicate.list_to_tuple_eliminator_ 2.00% : 0.000011s : 84: predicate.load_eliminater 0.44% : 0.000002s : 6: predicate.loop_unroll_after_grad 3.69% : 0.000021s : 147: predicate.loop_unroll_before_grad 1.70% : 0.000009s : 72: predicate.make_slice_get_slice_eliminator 1.46% : 0.000008s : 66: predicate.merge_addn 1.48% : 0.000008s : 66: predicate.minmaximum_grad 0.47% : 0.000003s : 6: predicate.mutable_eliminate 0.18% : 0.000001s : 6: predicate.opt_reshape 2.42% : 0.000013s : 84: predicate.partial_eliminate 1.45% : 0.000008s : 66: predicate.print_const_string_wrapper 1.93% : 0.000011s : 66: predicate.reduce_eliminate 1.81% : 0.000010s : 78: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000002s : 27: predicate.remove_not_recompute_node 2.44% : 0.000014s : 137: predicate.replace_applicator 0.91% : 0.000005s : 56: predicate.replace_old_param 0.11% : 0.000001s : 6: predicate.reset_defer_inline 1.54% : 0.000009s : 66: predicate.reshape_eliminate 1.48% : 0.000008s : 66: predicate.row_tensor_add_zeros_like 0.22% : 0.000001s : 6: predicate.row_tensor_eliminate 1.47% : 0.000008s : 66: predicate.same_eliminate 0.46% : 0.000003s : 27: predicate.set_cell_output_no_recompute 0.38% : 0.000002s : 12: predicate.special_op_eliminate 0.79% : 0.000004s : 27: predicate.specialize_transform 1.76% : 0.000010s : 66: predicate.split_environ_get_set_with_tuple_value 1.50% : 0.000008s : 66: predicate.stack_unstack_eliminate 0.16% : 0.000001s : 6: predicate.switch_call_monad_eliminater 2.97% : 0.000016s : 101: predicate.switch_defer_inline 2.76% : 0.000015s : 101: predicate.switch_layer_defer_inline 7.10% : 0.000039s : 262: predicate.switch_simplify 1.54% : 0.000009s : 66: predicate.tile_eliminate 1.50% : 0.000008s : 66: predicate.transpose_eliminate 1.89% : 0.000010s : 66: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000010s : 66: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000017s : 90: predicate.tuple_list_get_item_eliminator 1.92% : 0.000011s : 66: predicate.tuple_list_set_item_eliminator 1.84% : 0.000010s : 78: predicate.tuple_to_list_eliminator_ 1.92% : 0.000011s : 84: predicate.updatestate_pure_node_eliminater 2.85% : 0.000016s : 111: predicate.updatestate_useless_node_eliminater 1.86% : 0.000010s : 66: predicate.value_based_eliminate 0.15% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.23% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003271 50 63.73% : 0.002085s : 23: func_graph_cloner_run.FuncGraphClonerGraph 36.27% : 0.001186s : 27: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 10.273335 91 0.00% : 0.000061s : 1: add_recomputation 0.00% : 0.000185s : 1: auto_monad 0.00% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: backend_pass 0.01% : 0.000769s : 1: bootstrap 0.00% : 0.000024s : 1: cconv 0.00% : 0.000013s : 1: convert_after_rewriter 0.00% : 0.000034s : 1: cse_after_recomputation 0.00% : 0.000012s : 1: environ_conv 0.00% : 0.000359s : 1: event_method 0.00% : 0.000015s : 1: execute 0.00% : 0.000007s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.24% : 0.024202s : 1: jit_opt_a 0.00% : 0.000199s : 1: jit_opt_after_cconv 0.00% : 0.000076s : 1: jit_opt_b 0.00% : 0.000443s : 1: loop_unroll 0.00% : 0.000491s : 1: mutable_eliminate 0.04% : 0.004176s : 39: opt.transform.jit_opt_a 0.00% : 0.000079s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000047s : 4: opt.transform.jit_opt_b 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000019s : 1: opt.transform.mutable_eliminate 0.00% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000053s : 4: opt.transform.symbol_engine_opt 0.00% : 0.000492s : 1: opt_after_jit_grad 0.00% : 0.000009s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pre_auto_parallel 0.00% : 0.000064s : 1: py_interpret_to_execute 0.00% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000039s : 1: remove_dup_value 0.10% : 0.010307s : 2: renormalize.infer 0.02% : 0.002110s : 2: renormalize.specialize 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000149s : 1: rewriter_after_opt_a 0.00% : 0.000212s : 1: rewriter_before_opt_a 0.00% : 0.000129s : 1: symbol_engine_optimizer 98.82% : 10.152292s : 1: task_emit 0.74% : 0.076060s : 1: type_inference 0.00% : 0.000082s : 1: validate group_cases_15 have all been run, results of sub cases are below: case: ('pynative', ) {} pass. case: (1,) {} pass. case: ('KBK', 'none') {} pass. case: (0,) {} pass. case: ('pynative', ) {} pass. case: ('graph', 'none') {} pass. case: ('graph', 'mean') {} pass. case: ('graph', 'sum') {} pass. ops group_cases_16 with 8 cases start to running, all cases are below: case: (, 'pynative', ) case: (, 'pynative', ) case: (, 'kbk', ) case: (, 'kbk', ) case: (, 'kbk', ) case: (, 'kbk', ) case: (, 'ge', ) case: (, 'ge', ) ops group_cases_16 total running memory: 32M, memory threshold: 51200M TotalTime = 4.6038, [24] [bootstrap]: 0.00090273 [type_inference]: 0.0651419 [event_method]: 9.259e-05 [auto_monad]: 0.00025445 [graph_reusing]: 1.268e-05 [inline]: 2.86e-06 [add_attr]: 0.0081592, [1] [add_attr_with_inline]: 0.00814185, [1] [Cycle 1]: 0.00015714, [2] [tag_attr]: 4.602e-05 [meta_addattr_fg_expand]: 1.612e-05 [parallel-infer-symbol]: 3.86999e-06 [pre_auto_parallel]: 6.542e-05 [insert-virtual-dataset]: 2.59001e-06 [parallel-infer-symbol-second]: 6.69999e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.00723209, [53] [py_interpret_to_execute]: 6.43998e-06 [rewriter_before_opt_a]: 0.00029366 [opt_a]: 0.00436258, [2] [Cycle 1]: 0.00363475, [45] [expand_dump_flag]: 4.99e-06 [switch_simplify]: 0.00013796 [loop_unroll]: 3.954e-05 [a_1]: 0.00082821 [with_stream_mark]: 2.181e-05 [recompute_prepare]: 9.82001e-06 [updatestate_depend_eliminate]: 1.387e-05 [updatestate_assign_eliminate]: 1.278e-05 [updatestate_loads_eliminate]: 4.29002e-06 [parameter_eliminate]: 2.16e-06 [a_2]: 0.000103 [accelerated_algorithm]: 7.6e-06 [shard]: 2.09999e-06 [meta_shard_fg_expand]: 2.41e-06 [shard_inline]: 6.94001e-06 [merge_send_recv]: 4.264e-05 [auto_parallel]: 9.51e-06 [parallel]: 8.265e-05 [flash_sp]: 3.423e-05 [merge_comm]: 5.24e-06 [allreduce_fusion]: 1.143e-05 [matmul_add_comm_reduction]: 1.853e-05 [allreduce_slice_to_reducescatter]: 8.17e-06 [virtual_shard_identity]: 1.126e-05 [virtual_dataset]: 7.53e-06 [get_grad_eliminate_]: 7.01001e-06 [virtual_output]: 6.86999e-06 [merge_forward]: 4.82e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 1.894e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.436e-05 [merge_recompute_call_nodes]: 1.50999e-06 [before_grad]: 1.207e-05 [set_forward_comm_id_for_comm_node_pass]: 1.277e-05 [meta_fg_expand]: 3.95e-06 [flash_sp_send_recv_attached]: 2.79999e-06 [receive_attached]: 1.842e-05 [after_resolve]: 1.278e-05 [a_after_grad]: 1.063e-05 [renormalize]: 0.00163691 [add_forward_monad_depend]: 7.19001e-06 [auto_monad_grad]: 3.03e-06 [auto_monad_eliminator]: 3.28e-05 [cse]: 8.061e-05 [a_3]: 5.439e-05 [Cycle 2]: 0.00071441, [45] [expand_dump_flag]: 2.01e-06 [switch_simplify]: 7.9e-06 [loop_unroll]: 7.33e-06 [a_1]: 0.00015847 [with_stream_mark]: 1.455e-05 [recompute_prepare]: 6.99001e-06 [updatestate_depend_eliminate]: 4.15e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.42002e-06 [parameter_eliminate]: 1.20999e-06 [a_2]: 8.555e-05 [accelerated_algorithm]: 6.61999e-06 [shard]: 1.34e-06 [meta_shard_fg_expand]: 2.02999e-06 [shard_inline]: 6.51999e-06 [merge_send_recv]: 6.70998e-06 [auto_parallel]: 8.01001e-06 [parallel]: 6.88998e-06 [flash_sp]: 3.65e-06 [merge_comm]: 3.95e-06 [allreduce_fusion]: 4.21001e-06 [matmul_add_comm_reduction]: 7.97e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 7.48e-06 [virtual_dataset]: 6.23e-06 [get_grad_eliminate_]: 6.07001e-06 [virtual_output]: 5.80002e-06 [merge_forward]: 3.96001e-06 [cell_reuse_recompute_pass]: 1.91998e-06 [offload_activation]: 9.35001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.177e-05 [merge_recompute_call_nodes]: 1.06002e-06 [before_grad]: 1.069e-05 [set_forward_comm_id_for_comm_node_pass]: 4.09997e-06 [meta_fg_expand]: 2.96001e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.46002e-06 [after_resolve]: 1.04e-05 [a_after_grad]: 9.87001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.49998e-06 [auto_monad_grad]: 1.10001e-06 [auto_monad_eliminator]: 1.021e-05 [cse]: 2.23e-05 [a_3]: 3.967e-05 [py_interpret_to_execute_after_opt_a]: 7.55e-06 [slice_cell_reuse_recomputed_activation]: 2.47001e-06 [rewriter_after_opt_a]: 4.01e-05 [convert_after_rewriter]: 1.59e-06 [order_py_execute_after_rewriter]: 1.16002e-06 [mutable_eliminate]: 0.00068591 [opt_b]: 0.00029205, [1] [Cycle 1]: 0.0002837, [7] [b_1]: 0.00016049 [b_2]: 4.043e-05 [updatestate_depend_eliminate]: 7.33e-06 [updatestate_assign_eliminate]: 3.71999e-06 [updatestate_loads_eliminate]: 3.41001e-06 [renormalize]: 6.00005e-07 [cse]: 3.08e-05 [optimize_parallel_all_gather_comm]: 3.169e-05 [overlap_param_gather]: 1.158e-05 [cconv]: 2.675e-05 [loop_unroll]: 0.00048805 [opt_after_cconv]: 0.00011293, [1] [Cycle 1]: 0.00010562, [7] [c_1]: 3.066e-05 [parameter_eliminate]: 3.44001e-06 [updatestate_depend_eliminate]: 6.11e-06 [updatestate_assign_eliminate]: 3.41999e-06 [updatestate_loads_eliminate]: 3.23e-06 [cse]: 2.602e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 3.838e-05 [tuple_transform]: 9.707e-05, [1] [Cycle 1]: 9.229e-05, [4] [d_1]: 6.332e-05 [none_parameter_eliminate]: 1.89999e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 7.35e-06 [partial_unused_args_eliminate]: 1.98997e-06 [add_recomputation]: 7.253e-05 [cse_after_recomputation]: 2.773e-05, [1] [Cycle 1]: 2.27e-05, [1] [cse]: 1.706e-05 [environ_conv]: 3.267e-05 [swap_dp_allreduce_reducescatter]: 2.341e-05 [bias_add_comm_swap]: 1.077e-05 [label_micro_interleaved_index]: 1.292e-05 [label_fine_grained_interleaved_index]: 2.76999e-06 [merge_cast_opt]: 1.60999e-06 [slice_recompute_activation]: 2.54999e-06 [micro_interleaved_order_control]: 2.86e-06 [assign_add_opt]: 1.44e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 8.70999e-06 [full_micro_interleaved_order_control]: 1.021e-05 [reorder_send_recv_between_fp_bp]: 2.74999e-06 [comm_op_add_attrs]: 1.04003e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 8.48999e-06 [overlap_opt_shard_in_pipeline]: 2.196e-05 [overlap_opt_shard_grad_in_pipeline]: 1.86e-06 [control_data_broadcast_order]: 1.614e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 4.89e-06 [overlap_recompute_and_grad_model_parallel]: 1.262e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.25999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.41002e-06 [overlap_recompute_comm]: 2.16e-06 [overlap_grad_ring_attention]: 1.919e-05 [overlap_grad_flash_sp]: 5.299e-05 [begin_end_overlap_inline]: 5.29981e-07 [split_matmul_comm_elemetwise]: 9.64e-06 [split_layernorm_comm]: 1.81e-06 [handle_group_info]: 1.00001e-06 [symbol_engine_optimizer]: 0.00011665, [1] [Cycle 1]: 0.00011196, [6] [build]: 3.261e-05 [elim_shapecalc]: 1.429e-05 [elim_not_effective]: 1.51e-05 [opt_reshape]: 8.32e-06 [fold_const_symbol]: 1.147e-05 [renormalize]: 1.59984e-07 [detach_backward]: 2.14e-06 [pipeline_parallel_scheduler]: 1.60001e-06 [auto_monad_reorder]: 2.721e-05 [get_jit_bprop_graph]: 1.97001e-06 [rewriter_after_jit_bprop_graph]: 4.19002e-06 [opt_after_jit_grad]: 0.00054764 [validate]: 7.225e-05 [backend_pass]: 1.07e-06 [task_emit]: 4.52053 [execute]: 1.595e-05 Sums bootstrap : 0.000903s : 0.02% type_inference : 0.065142s : 1.42% event_method : 0.000093s : 0.00% auto_monad : 0.000254s : 0.01% graph_reusing : 0.000013s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000046s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000016s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000065s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000294s : 0.01% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000146s : 0.00% optimize.opt_a.loop_unroll : 0.000047s : 0.00% optimize.opt_a.a_1 : 0.000987s : 0.02% optimize.opt_a.with_stream_mark : 0.000036s : 0.00% optimize.opt_a.recompute_prepare : 0.000017s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000018s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000189s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000049s : 0.00% optimize.opt_a.auto_parallel : 0.000018s : 0.00% optimize.opt_a.parallel : 0.000090s : 0.00% optimize.opt_a.flash_sp : 0.000038s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000016s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.00% optimize.opt_a.virtual_dataset : 0.000014s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000028s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000017s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000020s : 0.00% optimize.opt_a.after_resolve : 0.000023s : 0.00% optimize.opt_a.a_after_grad : 0.000021s : 0.00% optimize.opt_a.renormalize : 0.001637s : 0.04% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000043s : 0.00% optimize.opt_a.cse : 0.000103s : 0.00% optimize.opt_a.a_3 : 0.000094s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000040s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000686s : 0.01% optimize.opt_b.b_1 : 0.000160s : 0.00% optimize.opt_b.b_2 : 0.000040s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000032s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000027s : 0.00% optimize.loop_unroll : 0.000488s : 0.01% optimize.opt_after_cconv.c_1 : 0.000031s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000026s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000038s : 0.00% optimize.tuple_transform.d_1 : 0.000063s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000073s : 0.00% optimize.cse_after_recomputation.cse : 0.000017s : 0.00% optimize.environ_conv : 0.000033s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000023s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000022s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.00% optimize.overlap_grad_flash_sp : 0.000053s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000033s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000027s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000548s : 0.01% validate : 0.000072s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 4.520529s : 98.40% execute : 0.000016s : 0.00% Time group info: ------[substitution.] 0.000373 62 0.59% : 0.000002s : 3: substitution.elim_not_effective 2.35% : 0.000009s : 3: substitution.float_tuple_getitem_switch 0.49% : 0.000002s : 3: substitution.fold_const_symbol 1.77% : 0.000007s : 4: substitution.graph_param_transform 57.11% : 0.000213s : 8: substitution.inline 1.39% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.32% : 0.000009s : 2: substitution.minmaximum_grad 1.64% : 0.000006s : 6: substitution.remove_not_recompute_node 1.20% : 0.000004s : 2: substitution.replace_old_param 4.50% : 0.000017s : 1: substitution.switch_simplify 4.55% : 0.000017s : 4: substitution.tuple_list_convert_item_index_to_positive 4.20% : 0.000016s : 4: substitution.tuple_list_get_item_const_eliminator 3.29% : 0.000012s : 4: substitution.tuple_list_get_item_depend_reorder 11.68% : 0.000044s : 8: substitution.tuple_list_get_item_eliminator 2.92% : 0.000011s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.065003 2 96.98% : 0.063042s : 1: type_inference.infer 3.02% : 0.001961s : 1: type_inference.specialize ------[replace.] 0.000105 11 58.72% : 0.000062s : 8: replace.inline 22.75% : 0.000024s : 1: replace.switch_simplify 18.53% : 0.000019s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000236 11 88.11% : 0.000208s : 8: match.inline 6.74% : 0.000016s : 1: match.switch_simplify 5.15% : 0.000012s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000242 1438 1.00% : 0.000002s : 16: predicate.accumulaten_eliminater 0.66% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.45% : 0.000001s : 8: predicate.addn_check_dump 1.03% : 0.000002s : 16: predicate.addn_zero_filter 0.90% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.07% : 0.000005s : 24: predicate.arithmetic_simplify 1.02% : 0.000002s : 16: predicate.cast_eliminate 0.68% : 0.000002s : 8: predicate.check_bprop_eliminate 0.46% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.46% : 0.000001s : 8: predicate.depend_value_elim 0.98% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.13% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 16: predicate.dict_set_item_eliminator 1.07% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 4: predicate.elim_not_effective 0.51% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_depend_swap 1.68% : 0.000004s : 28: predicate.environ_get_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.44% : 0.000003s : 26: predicate.exchange_switch_depend_value 2.46% : 0.000006s : 26: predicate.float_depend_g_call 0.45% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.28% : 0.000001s : 4: predicate.graph_param_transform 0.49% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 5.92% : 0.000014s : 66: predicate.inline 0.63% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.40% : 0.000006s : 42: predicate.load_eliminater 1.16% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.76% : 0.000007s : 46: predicate.loop_unroll_before_grad 1.59% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.49% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.00% : 0.000002s : 16: predicate.minmaximum_grad 1.34% : 0.000003s : 4: predicate.mutable_eliminate 0.42% : 0.000001s : 4: predicate.opt_reshape 0.29% : 0.000001s : 4: predicate.parallel_virtual_node 2.21% : 0.000005s : 26: predicate.partial_defer_inline 1.32% : 0.000003s : 22: predicate.partial_eliminate 0.94% : 0.000002s : 16: predicate.print_const_string_wrapper 0.46% : 0.000001s : 8: predicate.reduce_all_const_elim 1.40% : 0.000003s : 16: predicate.reduce_eliminate 2.42% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 8: predicate.remove_not_recompute_node 1.20% : 0.000003s : 26: predicate.replace_applicator 0.35% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 1.16% : 0.000003s : 16: predicate.reshape_eliminate 0.63% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.91% : 0.000002s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 8: predicate.shard_identity_eliminate 0.62% : 0.000002s : 8: predicate.special_op_eliminate 0.65% : 0.000002s : 8: predicate.specialize_transform 0.65% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.61% : 0.000004s : 26: predicate.switch_defer_inline 2.08% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.84% : 0.000014s : 86: predicate.switch_simplify 0.98% : 0.000002s : 16: predicate.tile_eliminate 1.03% : 0.000003s : 16: predicate.transpose_eliminate 1.67% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.57% : 0.000009s : 34: predicate.tuple_list_get_item_eliminator 1.53% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.67% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.22% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 3.00% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 4: predicate.value_based_eliminate 0.71% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001655 23 59.69% : 0.000988s : 11: func_graph_cloner_run.FuncGraphClonerGraph 40.31% : 0.000667s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 4.622509 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.18% : 0.008165s : 1: add_attr 0.18% : 0.008146s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000077s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000265s : 1: auto_monad 0.00% : 0.000058s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.02% : 0.000955s : 1: bootstrap 0.00% : 0.000030s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000020s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000037s : 1: environ_conv 0.00% : 0.000102s : 1: event_method 0.00% : 0.000070s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000017s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.01% : 0.000496s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.02% : 0.000696s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000020s : 1: opt.transform.mutable_eliminate 0.03% : 0.001565s : 78: opt.transform.opt_a 0.00% : 0.000029s : 1: opt.transform.opt_after_cconv 0.00% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000171s : 28: opt.transform.opt_b 0.00% : 0.000069s : 2: opt.transform.opt_trans_graph 0.00% : 0.000044s : 4: opt.transform.symbol_engine_opt 0.09% : 0.004366s : 1: opt_a 0.00% : 0.000116s : 1: opt_after_cconv 0.01% : 0.000557s : 1: opt_after_jit_grad 0.01% : 0.000296s : 1: opt_b 0.16% : 0.007237s : 1: optimize 0.00% : 0.000036s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000057s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000022s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000026s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000070s : 1: pre_auto_parallel 0.00% : 0.000011s : 1: py_interpret_to_execute 0.00% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000011s : 1: remove_cast_before_assign_add 0.00% : 0.000042s : 1: remove_dup_value 0.02% : 0.000921s : 1: renormalize.infer 0.02% : 0.000705s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000044s : 1: rewriter_after_opt_a 0.01% : 0.000300s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000027s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000120s : 1: symbol_engine_optimizer 97.80% : 4.520777s : 1: task_emit 0.00% : 0.000100s : 1: tuple_transform 1.41% : 0.065171s : 1: type_inference 0.00% : 0.000108s : 1: validate TotalTime = 4.81006, [24] [bootstrap]: 0.00100712 [type_inference]: 0.0617751 [event_method]: 8.169e-05 [auto_monad]: 0.00048861 [graph_reusing]: 1.202e-05 [inline]: 2.24999e-06 [add_attr]: 0.00785165, [1] [add_attr_with_inline]: 0.00779182, [1] [Cycle 1]: 0.00014491, [2] [tag_attr]: 4.185e-05 [meta_addattr_fg_expand]: 2.04e-05 [parallel-infer-symbol]: 3.25002e-06 [pre_auto_parallel]: 6.552e-05 [insert-virtual-dataset]: 2.31e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.14999e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.00637386, [53] [py_interpret_to_execute]: 3.23e-06 [rewriter_before_opt_a]: 0.00026194 [opt_a]: 0.00378943, [2] [Cycle 1]: 0.00311785, [45] [expand_dump_flag]: 3.23e-06 [switch_simplify]: 0.00013394 [loop_unroll]: 3.833e-05 [a_1]: 0.00078613 [with_stream_mark]: 8.69e-06 [recompute_prepare]: 8.52e-06 [updatestate_depend_eliminate]: 9.12999e-06 [updatestate_assign_eliminate]: 1.405e-05 [updatestate_loads_eliminate]: 3.69002e-06 [parameter_eliminate]: 1.98997e-06 [a_2]: 9.539e-05 [accelerated_algorithm]: 7.25e-06 [shard]: 1.435e-05 [meta_shard_fg_expand]: 1.86003e-06 [shard_inline]: 6.88e-06 [merge_send_recv]: 4.264e-05 [auto_parallel]: 6.43e-06 [parallel]: 9.586e-05 [flash_sp]: 4.425e-05 [merge_comm]: 4.43999e-06 [allreduce_fusion]: 1.642e-05 [matmul_add_comm_reduction]: 2.314e-05 [allreduce_slice_to_reducescatter]: 1.388e-05 [virtual_shard_identity]: 8.56002e-06 [virtual_dataset]: 6.84001e-06 [get_grad_eliminate_]: 6.44001e-06 [virtual_output]: 6.65998e-06 [merge_forward]: 4.2e-06 [cell_reuse_recompute_pass]: 1.12999e-06 [offload_activation]: 2.268e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.567e-05 [merge_recompute_call_nodes]: 5.50004e-07 [before_grad]: 1.031e-05 [set_forward_comm_id_for_comm_node_pass]: 1.596e-05 [meta_fg_expand]: 3.85998e-06 [flash_sp_send_recv_attached]: 4.28999e-06 [receive_attached]: 2.345e-05 [after_resolve]: 9.66e-06 [a_after_grad]: 9.87001e-06 [renormalize]: 0.00117857 [add_forward_monad_depend]: 5.01002e-06 [auto_monad_grad]: 1.66e-06 [auto_monad_eliminator]: 3.121e-05 [cse]: 8.788e-05 [a_3]: 4.809e-05 [Cycle 2]: 0.00066216, [45] [expand_dump_flag]: 8.30012e-07 [switch_simplify]: 7.53e-06 [loop_unroll]: 6.69001e-06 [a_1]: 0.00014809 [with_stream_mark]: 1.149e-05 [recompute_prepare]: 6.66999e-06 [updatestate_depend_eliminate]: 3.54002e-06 [updatestate_assign_eliminate]: 3.12002e-06 [updatestate_loads_eliminate]: 3.33e-06 [parameter_eliminate]: 9.30013e-07 [a_2]: 8.247e-05 [accelerated_algorithm]: 6.44999e-06 [shard]: 9.89996e-07 [meta_shard_fg_expand]: 1.47001e-06 [shard_inline]: 6.66999e-06 [merge_send_recv]: 5.35001e-06 [auto_parallel]: 5.63002e-06 [parallel]: 4.28999e-06 [flash_sp]: 6.99001e-06 [merge_comm]: 3.91999e-06 [allreduce_fusion]: 3.51999e-06 [matmul_add_comm_reduction]: 6.25002e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 6.95998e-06 [virtual_dataset]: 6.04001e-06 [get_grad_eliminate_]: 6.06e-06 [virtual_output]: 5.76e-06 [merge_forward]: 3.5e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 6.44999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.129e-05 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 9.76e-06 [set_forward_comm_id_for_comm_node_pass]: 4.19002e-06 [meta_fg_expand]: 2.44999e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 9.20001e-07 [after_resolve]: 8.84998e-06 [a_after_grad]: 8.84998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.10001e-06 [auto_monad_grad]: 8.29983e-07 [auto_monad_eliminator]: 7.73001e-06 [cse]: 1.8e-05 [a_3]: 3.796e-05 [py_interpret_to_execute_after_opt_a]: 4.41002e-06 [slice_cell_reuse_recomputed_activation]: 7.93001e-06 [rewriter_after_opt_a]: 4.946e-05 [convert_after_rewriter]: 1.25001e-06 [order_py_execute_after_rewriter]: 9.50007e-07 [mutable_eliminate]: 0.00048023 [opt_b]: 0.00025527, [1] [Cycle 1]: 0.00024968, [7] [b_1]: 0.00014952 [b_2]: 8.25999e-06 [updatestate_depend_eliminate]: 5.99e-06 [updatestate_assign_eliminate]: 3.19001e-06 [updatestate_loads_eliminate]: 3.00002e-06 [renormalize]: 4.50003e-07 [cse]: 4.544e-05 [optimize_parallel_all_gather_comm]: 3.434e-05 [overlap_param_gather]: 1.277e-05 [cconv]: 3.124e-05 [loop_unroll]: 0.00041205 [opt_after_cconv]: 0.00010383, [1] [Cycle 1]: 9.874e-05, [7] [c_1]: 3.011e-05 [parameter_eliminate]: 1.93997e-06 [updatestate_depend_eliminate]: 5.97999e-06 [updatestate_assign_eliminate]: 3.39001e-06 [updatestate_loads_eliminate]: 3.08e-06 [cse]: 2.31e-05 [renormalize]: 3.09985e-07 [remove_dup_value]: 2.859e-05 [tuple_transform]: 9.009e-05, [1] [Cycle 1]: 8.575e-05, [4] [d_1]: 5.897e-05 [none_parameter_eliminate]: 1.54e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 7.21001e-06 [partial_unused_args_eliminate]: 1.91003e-06 [add_recomputation]: 6.81e-05 [cse_after_recomputation]: 2.539e-05, [1] [Cycle 1]: 2.083e-05, [1] [cse]: 1.558e-05 [environ_conv]: 1.743e-05 [swap_dp_allreduce_reducescatter]: 3.275e-05 [bias_add_comm_swap]: 1.867e-05 [label_micro_interleaved_index]: 1.679e-05 [label_fine_grained_interleaved_index]: 2.66e-06 [merge_cast_opt]: 1.55001e-06 [slice_recompute_activation]: 2.27001e-06 [micro_interleaved_order_control]: 2.21e-06 [assign_add_opt]: 1.46998e-06 [ForceFp32Comm]: 8.39995e-07 [remove_cast_before_assign_add]: 1.776e-05 [full_micro_interleaved_order_control]: 1.037e-05 [reorder_send_recv_between_fp_bp]: 2.86999e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 1.29e-06 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.267e-05 [overlap_opt_shard_in_pipeline]: 1.548e-05 [overlap_opt_shard_grad_in_pipeline]: 4.01001e-06 [control_data_broadcast_order]: 1.257e-05 [grouped_pairwise_exchange_alltoall]: 3.75998e-06 [offloading_packed_experts]: 3.58e-06 [overlap_recompute_and_grad_model_parallel]: 1.781e-05 [overlap_grad_matmul_and_grad_allreduce]: 7.10017e-07 [overlap_recompute_allgather_and_fa_grad]: 6.80011e-07 [overlap_recompute_comm]: 2.17999e-06 [overlap_grad_ring_attention]: 2.923e-05 [overlap_grad_flash_sp]: 6.683e-05 [begin_end_overlap_inline]: 3.39991e-07 [split_matmul_comm_elemetwise]: 8.50999e-06 [split_layernorm_comm]: 3.63e-06 [handle_group_info]: 3.47002e-06 [symbol_engine_optimizer]: 0.00010501, [1] [Cycle 1]: 0.00010083, [6] [build]: 2.887e-05 [elim_shapecalc]: 1.149e-05 [elim_not_effective]: 1.38e-05 [opt_reshape]: 7.37002e-06 [fold_const_symbol]: 1.121e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.72999e-06 [pipeline_parallel_scheduler]: 1.37e-06 [auto_monad_reorder]: 3.009e-05 [get_jit_bprop_graph]: 1.09998e-06 [rewriter_after_jit_bprop_graph]: 3.09999e-06 [opt_after_jit_grad]: 0.0004562 [validate]: 0.00012041 [backend_pass]: 8.59989e-07 [task_emit]: 4.73133 [execute]: 1.374e-05 Sums bootstrap : 0.001007s : 0.02% type_inference : 0.061775s : 1.29% event_method : 0.000082s : 0.00% auto_monad : 0.000489s : 0.01% graph_reusing : 0.000012s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000042s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000020s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000066s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000003s : 0.00% optimize.rewriter_before_opt_a : 0.000262s : 0.01% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000141s : 0.00% optimize.opt_a.loop_unroll : 0.000045s : 0.00% optimize.opt_a.a_1 : 0.000934s : 0.02% optimize.opt_a.with_stream_mark : 0.000020s : 0.00% optimize.opt_a.recompute_prepare : 0.000015s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000013s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000017s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000178s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.00% optimize.opt_a.shard : 0.000015s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.00% optimize.opt_a.merge_send_recv : 0.000048s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000100s : 0.00% optimize.opt_a.flash_sp : 0.000051s : 0.00% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000020s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000029s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000014s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.00% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000029s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000001s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000020s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000024s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.00% optimize.opt_a.a_after_grad : 0.000019s : 0.00% optimize.opt_a.renormalize : 0.001179s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.00% optimize.opt_a.cse : 0.000106s : 0.00% optimize.opt_a.a_3 : 0.000086s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000008s : 0.00% optimize.rewriter_after_opt_a : 0.000049s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000480s : 0.01% optimize.opt_b.b_1 : 0.000150s : 0.00% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000045s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000034s : 0.00% optimize.overlap_param_gather : 0.000013s : 0.00% optimize.cconv : 0.000031s : 0.00% optimize.loop_unroll : 0.000412s : 0.01% optimize.opt_after_cconv.c_1 : 0.000030s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000023s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000029s : 0.00% optimize.tuple_transform.d_1 : 0.000059s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000068s : 0.00% optimize.cse_after_recomputation.cse : 0.000016s : 0.00% optimize.environ_conv : 0.000017s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000033s : 0.00% optimize.bias_add_comm_swap : 0.000019s : 0.00% optimize.label_micro_interleaved_index : 0.000017s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000018s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000013s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000015s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000004s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000004s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000018s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000029s : 0.00% optimize.overlap_grad_flash_sp : 0.000067s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000009s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000003s : 0.00% optimize.symbol_engine_optimizer.build : 0.000029s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000030s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000456s : 0.01% validate : 0.000120s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 4.731329s : 98.55% execute : 0.000014s : 0.00% Time group info: ------[substitution.] 0.000343 62 0.61% : 0.000002s : 3: substitution.elim_not_effective 1.63% : 0.000006s : 3: substitution.float_tuple_getitem_switch 0.61% : 0.000002s : 3: substitution.fold_const_symbol 1.63% : 0.000006s : 4: substitution.graph_param_transform 51.21% : 0.000175s : 8: substitution.inline 1.00% : 0.000003s : 6: substitution.j_node_and_user_rematch 9.15% : 0.000031s : 2: substitution.minmaximum_grad 2.51% : 0.000009s : 6: substitution.remove_not_recompute_node 0.93% : 0.000003s : 2: substitution.replace_old_param 6.12% : 0.000021s : 1: substitution.switch_simplify 4.13% : 0.000014s : 4: substitution.tuple_list_convert_item_index_to_positive 5.41% : 0.000019s : 4: substitution.tuple_list_get_item_const_eliminator 2.81% : 0.000010s : 4: substitution.tuple_list_get_item_depend_reorder 9.66% : 0.000033s : 8: substitution.tuple_list_get_item_eliminator 2.60% : 0.000009s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.061664 2 97.22% : 0.059950s : 1: type_inference.infer 2.78% : 0.001714s : 1: type_inference.specialize ------[replace.] 0.000085 11 62.98% : 0.000053s : 8: replace.inline 17.90% : 0.000015s : 1: replace.switch_simplify 19.12% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000200 11 85.62% : 0.000171s : 8: match.inline 10.09% : 0.000020s : 1: match.switch_simplify 4.29% : 0.000009s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000224 1438 1.00% : 0.000002s : 16: predicate.accumulaten_eliminater 0.85% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 1.01% : 0.000002s : 16: predicate.addn_zero_filter 1.01% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.16% : 0.000005s : 24: predicate.arithmetic_simplify 0.99% : 0.000002s : 16: predicate.cast_eliminate 0.53% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.46% : 0.000001s : 8: predicate.depend_value_elim 1.01% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.10% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.77% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.22% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_depend_swap 1.71% : 0.000004s : 28: predicate.environ_get_eliminate 1.19% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.60% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.40% : 0.000005s : 26: predicate.float_depend_g_call 0.48% : 0.000001s : 8: predicate.float_environ_get_switch 0.86% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.59% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.03% : 0.000013s : 66: predicate.inline 0.75% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.52% : 0.000006s : 42: predicate.load_eliminater 0.80% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.88% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 8: predicate.merge_addn 0.49% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.93% : 0.000002s : 16: predicate.minmaximum_grad 0.95% : 0.000002s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 2.09% : 0.000005s : 26: predicate.partial_defer_inline 1.44% : 0.000003s : 22: predicate.partial_eliminate 0.95% : 0.000002s : 16: predicate.print_const_string_wrapper 0.47% : 0.000001s : 8: predicate.reduce_all_const_elim 1.37% : 0.000003s : 16: predicate.reduce_eliminate 2.59% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 8: predicate.remove_not_recompute_node 1.20% : 0.000003s : 26: predicate.replace_applicator 0.34% : 0.000001s : 8: predicate.replace_old_param 0.18% : 0.000000s : 4: predicate.reset_defer_inline 1.00% : 0.000002s : 16: predicate.reshape_eliminate 0.52% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000002s : 8: predicate.same_eliminate 0.37% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.69% : 0.000002s : 8: predicate.shard_identity_eliminate 0.72% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000001s : 8: predicate.specialize_transform 0.71% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.76% : 0.000004s : 26: predicate.switch_defer_inline 2.19% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.92% : 0.000013s : 86: predicate.switch_simplify 1.00% : 0.000002s : 16: predicate.tile_eliminate 1.00% : 0.000002s : 16: predicate.transpose_eliminate 1.78% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.81% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000003s : 24: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000007s : 34: predicate.tuple_list_get_item_eliminator 1.69% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.68% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.41% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 3.06% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 4: predicate.value_based_eliminate 0.59% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001421 23 59.20% : 0.000841s : 11: func_graph_cloner_run.FuncGraphClonerGraph 40.80% : 0.000580s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 4.826989 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.16% : 0.007855s : 1: add_attr 0.16% : 0.007795s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000072s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000498s : 1: auto_monad 0.00% : 0.000034s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000022s : 1: bias_add_comm_swap 0.02% : 0.001060s : 1: bootstrap 0.00% : 0.000035s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000021s : 1: environ_conv 0.00% : 0.000089s : 1: event_method 0.00% : 0.000033s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000016s : 1: graph_reusing 0.00% : 0.000007s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000016s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000020s : 1: label_micro_interleaved_index 0.01% : 0.000420s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.01% : 0.000488s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000015s : 1: opt.transform.mutable_eliminate 0.03% : 0.001482s : 78: opt.transform.opt_a 0.00% : 0.000029s : 1: opt.transform.opt_after_cconv 0.00% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000130s : 28: opt.transform.opt_b 0.00% : 0.000064s : 2: opt.transform.opt_trans_graph 0.00% : 0.000041s : 4: opt.transform.symbol_engine_opt 0.08% : 0.003793s : 1: opt_a 0.00% : 0.000107s : 1: opt_after_cconv 0.01% : 0.000465s : 1: opt_after_jit_grad 0.01% : 0.000259s : 1: opt_b 0.13% : 0.006378s : 1: optimize 0.00% : 0.000038s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000071s : 1: overlap_grad_flash_sp 0.00% : 0.000003s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000032s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000019s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000016s : 1: overlap_param_gather 0.00% : 0.000003s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000021s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000070s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000021s : 1: remove_cast_before_assign_add 0.00% : 0.000032s : 1: remove_dup_value 0.01% : 0.000635s : 1: renormalize.infer 0.01% : 0.000536s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000054s : 1: rewriter_after_opt_a 0.01% : 0.000268s : 1: rewriter_before_opt_a 0.00% : 0.000011s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000011s : 1: split_matmul_comm_elemetwise 0.00% : 0.000036s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000108s : 1: symbol_engine_optimizer 98.02% : 4.731470s : 1: task_emit 0.00% : 0.000093s : 1: tuple_transform 1.28% : 0.061794s : 1: type_inference 0.00% : 0.000144s : 1: validate TotalTime = 4.9155, [24] [bootstrap]: 0.00100728 [type_inference]: 0.061775 [event_method]: 8.164e-05 [auto_monad]: 0.0004883 [graph_reusing]: 1.208e-05 [inline]: 1.97999e-06 [add_attr]: 0.00785081, [1] [add_attr_with_inline]: 0.00778976, [1] [Cycle 1]: 0.00013462, [2] [tag_attr]: 4.205e-05 [meta_addattr_fg_expand]: 1.937e-05 [parallel-infer-symbol]: 3.73999e-06 [pre_auto_parallel]: 6.551e-05 [insert-virtual-dataset]: 2.66999e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 2.02001e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.00637377, [53] [py_interpret_to_execute]: 4.93001e-06 [rewriter_before_opt_a]: 0.00026504 [opt_a]: 0.00378758, [2] [Cycle 1]: 0.00311704, [45] [expand_dump_flag]: 2.96001e-06 [switch_simplify]: 0.00013403 [loop_unroll]: 3.83e-05 [a_1]: 0.00077274 [with_stream_mark]: 2.306e-05 [recompute_prepare]: 8.22e-06 [updatestate_depend_eliminate]: 7.35e-06 [updatestate_assign_eliminate]: 1.346e-05 [updatestate_loads_eliminate]: 3.63e-06 [parameter_eliminate]: 2.02999e-06 [a_2]: 9.543e-05 [accelerated_algorithm]: 6.83998e-06 [shard]: 1.79e-06 [meta_shard_fg_expand]: 2.27001e-06 [shard_inline]: 6.76999e-06 [merge_send_recv]: 5.59e-05 [auto_parallel]: 6.29999e-06 [parallel]: 9.625e-05 [flash_sp]: 4.445e-05 [merge_comm]: 4.70001e-06 [allreduce_fusion]: 1.676e-05 [matmul_add_comm_reduction]: 2.271e-05 [allreduce_slice_to_reducescatter]: 1.399e-05 [virtual_shard_identity]: 8.75999e-06 [virtual_dataset]: 7.1e-06 [get_grad_eliminate_]: 6.53e-06 [virtual_output]: 6.43e-06 [merge_forward]: 6.20002e-06 [cell_reuse_recompute_pass]: 8.09989e-07 [offload_activation]: 2.047e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.312e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.159e-05 [set_forward_comm_id_for_comm_node_pass]: 1.628e-05 [meta_fg_expand]: 3.46001e-06 [flash_sp_send_recv_attached]: 4.88001e-06 [receive_attached]: 2.253e-05 [after_resolve]: 9.99999e-06 [a_after_grad]: 1.031e-05 [renormalize]: 0.00117792 [add_forward_monad_depend]: 5.07e-06 [auto_monad_grad]: 1.92001e-06 [auto_monad_eliminator]: 3.112e-05 [cse]: 8.781e-05 [a_3]: 4.86e-05 [Cycle 2]: 0.00066204, [45] [expand_dump_flag]: 1.02e-06 [switch_simplify]: 7.45e-06 [loop_unroll]: 6.64999e-06 [a_1]: 0.00014801 [with_stream_mark]: 1.13e-05 [recompute_prepare]: 6.52001e-06 [updatestate_depend_eliminate]: 4.09002e-06 [updatestate_assign_eliminate]: 2.94999e-06 [updatestate_loads_eliminate]: 3.28e-06 [parameter_eliminate]: 1.05001e-06 [a_2]: 8.251e-05 [accelerated_algorithm]: 6.52001e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 1.39998e-06 [shard_inline]: 6.54001e-06 [merge_send_recv]: 5.24e-06 [auto_parallel]: 5.82001e-06 [parallel]: 4.35e-06 [flash_sp]: 1.215e-05 [merge_comm]: 3.84002e-06 [allreduce_fusion]: 3.34001e-06 [matmul_add_comm_reduction]: 5.45001e-06 [allreduce_slice_to_reducescatter]: 3.10014e-07 [virtual_shard_identity]: 7.13e-06 [virtual_dataset]: 6.16e-06 [get_grad_eliminate_]: 5.92001e-06 [virtual_output]: 5.70001e-06 [merge_forward]: 3.09001e-06 [cell_reuse_recompute_pass]: 9.99979e-07 [offload_activation]: 5.63002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.127e-05 [merge_recompute_call_nodes]: 6.39993e-07 [before_grad]: 9.79999e-06 [set_forward_comm_id_for_comm_node_pass]: 4.22e-06 [meta_fg_expand]: 2.37001e-06 [flash_sp_send_recv_attached]: 6.80011e-07 [receive_attached]: 7.89994e-07 [after_resolve]: 8.45001e-06 [a_after_grad]: 8.90999e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 9.89996e-07 [auto_monad_grad]: 7.09988e-07 [auto_monad_eliminator]: 6.87002e-06 [cse]: 1.593e-05 [a_3]: 3.842e-05 [py_interpret_to_execute_after_opt_a]: 4.41002e-06 [slice_cell_reuse_recomputed_activation]: 2.17999e-06 [rewriter_after_opt_a]: 3.605e-05 [convert_after_rewriter]: 1.92001e-06 [order_py_execute_after_rewriter]: 1.40999e-06 [mutable_eliminate]: 0.00050053 [opt_b]: 0.00025493, [1] [Cycle 1]: 0.00024921, [7] [b_1]: 0.00014925 [b_2]: 2.983e-05 [updatestate_depend_eliminate]: 5.10999e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.19001e-06 [renormalize]: 2.80008e-07 [cse]: 2.365e-05 [optimize_parallel_all_gather_comm]: 3.065e-05 [overlap_param_gather]: 1.703e-05 [cconv]: 2.273e-05 [loop_unroll]: 0.00042056 [opt_after_cconv]: 0.00010386, [1] [Cycle 1]: 9.88e-05, [7] [c_1]: 2.931e-05 [parameter_eliminate]: 2.39999e-06 [updatestate_depend_eliminate]: 5.91e-06 [updatestate_assign_eliminate]: 3.13998e-06 [updatestate_loads_eliminate]: 3.21001e-06 [cse]: 2.274e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 2.841e-05 [tuple_transform]: 8.989e-05, [1] [Cycle 1]: 8.546e-05, [4] [d_1]: 5.891e-05 [none_parameter_eliminate]: 1.27e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 7.18e-06 [partial_unused_args_eliminate]: 2.02999e-06 [add_recomputation]: 6.83e-05 [cse_after_recomputation]: 2.502e-05, [1] [Cycle 1]: 2.065e-05, [1] [cse]: 1.575e-05 [environ_conv]: 2.142e-05 [swap_dp_allreduce_reducescatter]: 2.848e-05 [bias_add_comm_swap]: 1.684e-05 [label_micro_interleaved_index]: 2.128e-05 [label_fine_grained_interleaved_index]: 5.22e-06 [merge_cast_opt]: 4.60015e-07 [slice_recompute_activation]: 3.36001e-06 [micro_interleaved_order_control]: 1.01002e-06 [assign_add_opt]: 4.65999e-06 [ForceFp32Comm]: 3.59985e-07 [remove_cast_before_assign_add]: 7.63999e-06 [full_micro_interleaved_order_control]: 1.416e-05 [reorder_send_recv_between_fp_bp]: 2.88e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.38002e-06 [interleave_parallel_branches]: 1.341e-05 [overlap_opt_shard_in_pipeline]: 1.555e-05 [overlap_opt_shard_grad_in_pipeline]: 1.84e-06 [control_data_broadcast_order]: 1.435e-05 [grouped_pairwise_exchange_alltoall]: 1.29998e-06 [offloading_packed_experts]: 4.47e-06 [overlap_recompute_and_grad_model_parallel]: 1.758e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 5.12999e-06 [overlap_grad_ring_attention]: 2.647e-05 [overlap_grad_flash_sp]: 6.759e-05 [begin_end_overlap_inline]: 3.69997e-07 [split_matmul_comm_elemetwise]: 9.75002e-06 [split_layernorm_comm]: 5.8001e-07 [handle_group_info]: 3.93001e-06 [symbol_engine_optimizer]: 0.00010549, [1] [Cycle 1]: 0.00010104, [6] [build]: 3.082e-05 [elim_shapecalc]: 1.124e-05 [elim_not_effective]: 1.265e-05 [opt_reshape]: 7.38e-06 [fold_const_symbol]: 1.093e-05 [renormalize]: 2.19996e-07 [detach_backward]: 1.61002e-06 [pipeline_parallel_scheduler]: 1.40001e-06 [auto_monad_reorder]: 2.902e-05 [get_jit_bprop_graph]: 1.04e-06 [rewriter_after_jit_bprop_graph]: 3.23998e-06 [opt_after_jit_grad]: 0.00045645 [validate]: 0.00012049 [backend_pass]: 6.59988e-07 [task_emit]: 4.83686 [execute]: 1.105e-05 Sums bootstrap : 0.001007s : 0.02% type_inference : 0.061775s : 1.26% event_method : 0.000082s : 0.00% auto_monad : 0.000488s : 0.01% graph_reusing : 0.000012s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000042s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000019s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000066s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000265s : 0.01% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000141s : 0.00% optimize.opt_a.loop_unroll : 0.000045s : 0.00% optimize.opt_a.a_1 : 0.000921s : 0.02% optimize.opt_a.with_stream_mark : 0.000034s : 0.00% optimize.opt_a.recompute_prepare : 0.000015s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000178s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000061s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000101s : 0.00% optimize.opt_a.flash_sp : 0.000057s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000020s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000028s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000014s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.00% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000020s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000023s : 0.00% optimize.opt_a.after_resolve : 0.000018s : 0.00% optimize.opt_a.a_after_grad : 0.000019s : 0.00% optimize.opt_a.renormalize : 0.001178s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.00% optimize.opt_a.cse : 0.000104s : 0.00% optimize.opt_a.a_3 : 0.000087s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000036s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000501s : 0.01% optimize.opt_b.b_1 : 0.000149s : 0.00% optimize.opt_b.b_2 : 0.000030s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.00% optimize.overlap_param_gather : 0.000017s : 0.00% optimize.cconv : 0.000023s : 0.00% optimize.loop_unroll : 0.000421s : 0.01% optimize.opt_after_cconv.c_1 : 0.000029s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000023s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000028s : 0.00% optimize.tuple_transform.d_1 : 0.000059s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000068s : 0.00% optimize.cse_after_recomputation.cse : 0.000016s : 0.00% optimize.environ_conv : 0.000021s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000028s : 0.00% optimize.bias_add_comm_swap : 0.000017s : 0.00% optimize.label_micro_interleaved_index : 0.000021s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000000s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000001s : 0.00% optimize.assign_add_opt : 0.000005s : 0.00% optimize.ForceFp32Comm : 0.000000s : 0.00% optimize.remove_cast_before_assign_add : 0.000008s : 0.00% optimize.full_micro_interleaved_order_control : 0.000014s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000013s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000016s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000018s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000026s : 0.00% optimize.overlap_grad_flash_sp : 0.000068s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000031s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000029s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000456s : 0.01% validate : 0.000120s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 4.836864s : 98.58% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000324 62 0.47% : 0.000002s : 3: substitution.elim_not_effective 1.89% : 0.000006s : 3: substitution.float_tuple_getitem_switch 0.61% : 0.000002s : 3: substitution.fold_const_symbol 1.74% : 0.000006s : 4: substitution.graph_param_transform 55.22% : 0.000179s : 8: substitution.inline 1.17% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.99% : 0.000010s : 2: substitution.minmaximum_grad 1.69% : 0.000005s : 6: substitution.remove_not_recompute_node 0.77% : 0.000003s : 2: substitution.replace_old_param 6.73% : 0.000022s : 1: substitution.switch_simplify 3.78% : 0.000012s : 4: substitution.tuple_list_convert_item_index_to_positive 5.02% : 0.000016s : 4: substitution.tuple_list_get_item_const_eliminator 3.62% : 0.000012s : 4: substitution.tuple_list_get_item_depend_reorder 11.51% : 0.000037s : 8: substitution.tuple_list_get_item_eliminator 2.78% : 0.000009s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.061664 2 97.22% : 0.059950s : 1: type_inference.infer 2.78% : 0.001714s : 1: type_inference.specialize ------[replace.] 0.000084 11 63.52% : 0.000053s : 8: replace.inline 17.16% : 0.000014s : 1: replace.switch_simplify 19.32% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000212 11 82.50% : 0.000175s : 8: match.inline 9.93% : 0.000021s : 1: match.switch_simplify 7.58% : 0.000016s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000225 1438 0.98% : 0.000002s : 16: predicate.accumulaten_eliminater 0.72% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 1.13% : 0.000003s : 16: predicate.addn_zero_filter 0.89% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.26% : 0.000005s : 24: predicate.arithmetic_simplify 1.02% : 0.000002s : 16: predicate.cast_eliminate 0.53% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.46% : 0.000001s : 8: predicate.depend_value_elim 1.00% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.22% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.99% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.82% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.20% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.20% : 0.000003s : 20: predicate.environ_get_depend_swap 1.69% : 0.000004s : 28: predicate.environ_get_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.58% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.14% : 0.000005s : 26: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.85% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 6.01% : 0.000014s : 66: predicate.inline 0.71% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 8: predicate.less_batch_normalization 1.76% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.48% : 0.000006s : 42: predicate.load_eliminater 0.87% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.84% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.74% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.48% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.92% : 0.000002s : 16: predicate.minmaximum_grad 0.96% : 0.000002s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 2.04% : 0.000005s : 26: predicate.partial_defer_inline 1.42% : 0.000003s : 22: predicate.partial_eliminate 1.04% : 0.000002s : 16: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.41% : 0.000003s : 16: predicate.reduce_eliminate 2.51% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 8: predicate.remove_not_recompute_node 1.16% : 0.000003s : 26: predicate.replace_applicator 0.34% : 0.000001s : 8: predicate.replace_old_param 0.19% : 0.000000s : 4: predicate.reset_defer_inline 1.02% : 0.000002s : 16: predicate.reshape_eliminate 0.55% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.69% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 8: predicate.shard_identity_eliminate 0.84% : 0.000002s : 8: predicate.special_op_eliminate 0.68% : 0.000002s : 8: predicate.specialize_transform 0.66% : 0.000001s : 8: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.72% : 0.000004s : 26: predicate.switch_defer_inline 2.19% : 0.000005s : 34: predicate.switch_layer_defer_inline 6.00% : 0.000014s : 86: predicate.switch_simplify 0.95% : 0.000002s : 16: predicate.tile_eliminate 0.94% : 0.000002s : 16: predicate.transpose_eliminate 1.84% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.88% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.22% : 0.000007s : 34: predicate.tuple_list_get_item_eliminator 1.58% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.72% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.37% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 3.00% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001403 23 58.58% : 0.000822s : 11: func_graph_cloner_run.FuncGraphClonerGraph 41.42% : 0.000581s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 4.932460 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.16% : 0.007855s : 1: add_attr 0.16% : 0.007793s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000072s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.01% : 0.000498s : 1: auto_monad 0.00% : 0.000033s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000020s : 1: bias_add_comm_swap 0.02% : 0.001059s : 1: bootstrap 0.00% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000025s : 1: environ_conv 0.00% : 0.000089s : 1: event_method 0.00% : 0.000028s : 1: execute 0.00% : 0.000017s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000016s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000016s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000024s : 1: label_micro_interleaved_index 0.01% : 0.000428s : 1: loop_unroll 0.00% : 0.000003s : 1: merge_cast_opt 0.00% : 0.000004s : 1: micro_interleaved_order_control 0.01% : 0.000508s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000015s : 1: opt.transform.mutable_eliminate 0.03% : 0.001469s : 78: opt.transform.opt_a 0.00% : 0.000028s : 1: opt.transform.opt_after_cconv 0.00% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000152s : 28: opt.transform.opt_b 0.00% : 0.000064s : 2: opt.transform.opt_trans_graph 0.00% : 0.000039s : 4: opt.transform.symbol_engine_opt 0.08% : 0.003791s : 1: opt_a 0.00% : 0.000107s : 1: opt_after_cconv 0.01% : 0.000465s : 1: opt_after_jit_grad 0.01% : 0.000258s : 1: opt_b 0.13% : 0.006378s : 1: optimize 0.00% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000072s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000029s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000019s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000020s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000020s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000070s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000010s : 1: remove_cast_before_assign_add 0.00% : 0.000032s : 1: remove_dup_value 0.01% : 0.000635s : 1: renormalize.infer 0.01% : 0.000536s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000040s : 1: rewriter_after_opt_a 0.01% : 0.000271s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000003s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000032s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000108s : 1: symbol_engine_optimizer 98.06% : 4.836935s : 1: task_emit 0.00% : 0.000093s : 1: tuple_transform 1.25% : 0.061794s : 1: type_inference 0.00% : 0.000150s : 1: validate TotalTime = 5.396, [24] [bootstrap]: 0.00100909 [type_inference]: 0.0622262 [event_method]: 8.123e-05 [auto_monad]: 0.00016093 [graph_reusing]: 9.17999e-06 [inline]: 1.74e-06 [add_attr]: 0.00773707, [1] [add_attr_with_inline]: 0.00772655, [1] [Cycle 1]: 0.00014349, [2] [tag_attr]: 4.207e-05 [meta_addattr_fg_expand]: 1.836e-05 [parallel-infer-symbol]: 3.13e-06 [pre_auto_parallel]: 6.544e-05 [insert-virtual-dataset]: 2.26e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 1.63002e-06 [pipeline_split]: 1.49998e-06 [optimize]: 0.00637549, [53] [py_interpret_to_execute]: 4.91002e-06 [rewriter_before_opt_a]: 0.00026233 [opt_a]: 0.00380782, [2] [Cycle 1]: 0.0031186, [45] [expand_dump_flag]: 3.70998e-06 [switch_simplify]: 0.0001348 [loop_unroll]: 3.846e-05 [a_1]: 0.0007693 [with_stream_mark]: 1.533e-05 [recompute_prepare]: 8.69e-06 [updatestate_depend_eliminate]: 1.393e-05 [updatestate_assign_eliminate]: 1.483e-05 [updatestate_loads_eliminate]: 3.55e-06 [parameter_eliminate]: 1.91003e-06 [a_2]: 9.934e-05 [accelerated_algorithm]: 7.06999e-06 [shard]: 1.03001e-06 [meta_shard_fg_expand]: 1.91998e-06 [shard_inline]: 6.81999e-06 [merge_send_recv]: 5.228e-05 [auto_parallel]: 6.76e-06 [parallel]: 9.737e-05 [flash_sp]: 4.258e-05 [merge_comm]: 4.83001e-06 [allreduce_fusion]: 1.599e-05 [matmul_add_comm_reduction]: 2.318e-05 [allreduce_slice_to_reducescatter]: 1.364e-05 [virtual_shard_identity]: 8.92e-06 [virtual_dataset]: 6.69001e-06 [get_grad_eliminate_]: 6.38e-06 [virtual_output]: 6.43e-06 [merge_forward]: 4.67998e-06 [cell_reuse_recompute_pass]: 9.10019e-07 [offload_activation]: 2.266e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.294e-05 [merge_recompute_call_nodes]: 1.57001e-06 [before_grad]: 1.106e-05 [set_forward_comm_id_for_comm_node_pass]: 1.703e-05 [meta_fg_expand]: 3.62998e-06 [flash_sp_send_recv_attached]: 2.41998e-06 [receive_attached]: 2.525e-05 [after_resolve]: 1.002e-05 [a_after_grad]: 9.79999e-06 [renormalize]: 0.00118469 [add_forward_monad_depend]: 4.71002e-06 [auto_monad_grad]: 1.32e-06 [auto_monad_eliminator]: 2.388e-05 [cse]: 8.822e-05 [a_3]: 4.855e-05 [Cycle 2]: 0.00067997, [45] [expand_dump_flag]: 1.29e-06 [switch_simplify]: 7.94002e-06 [loop_unroll]: 6.75998e-06 [a_1]: 0.00014797 [with_stream_mark]: 1.076e-05 [recompute_prepare]: 7.08e-06 [updatestate_depend_eliminate]: 3.98999e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 3.12002e-06 [parameter_eliminate]: 9.50007e-07 [a_2]: 8.573e-05 [accelerated_algorithm]: 6.61e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 1.73002e-06 [shard_inline]: 6.76999e-06 [merge_send_recv]: 5.51e-06 [auto_parallel]: 5.69e-06 [parallel]: 4.23001e-06 [flash_sp]: 1.12e-05 [merge_comm]: 3.95e-06 [allreduce_fusion]: 3.58e-06 [matmul_add_comm_reduction]: 6.51999e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 7.46999e-06 [virtual_dataset]: 6.34001e-06 [get_grad_eliminate_]: 6.19001e-06 [virtual_output]: 5.71998e-06 [merge_forward]: 3.53999e-06 [cell_reuse_recompute_pass]: 1.39998e-06 [offload_activation]: 6.62002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.156e-05 [merge_recompute_call_nodes]: 6.59988e-07 [before_grad]: 9.74e-06 [set_forward_comm_id_for_comm_node_pass]: 4.49998e-06 [meta_fg_expand]: 2.51e-06 [flash_sp_send_recv_attached]: 7.89994e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 8.91002e-06 [a_after_grad]: 9.13002e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.49e-06 [auto_monad_grad]: 1.12999e-06 [auto_monad_eliminator]: 8.14997e-06 [cse]: 1.986e-05 [a_3]: 3.746e-05 [py_interpret_to_execute_after_opt_a]: 4.21001e-06 [slice_cell_reuse_recomputed_activation]: 5.94999e-06 [rewriter_after_opt_a]: 3.894e-05 [convert_after_rewriter]: 1.06002e-06 [order_py_execute_after_rewriter]: 9.70002e-07 [mutable_eliminate]: 0.00047697 [opt_b]: 0.0002631, [1] [Cycle 1]: 0.00025744, [7] [b_1]: 0.00015272 [b_2]: 3.054e-05 [updatestate_depend_eliminate]: 6.40997e-06 [updatestate_assign_eliminate]: 3.37997e-06 [updatestate_loads_eliminate]: 3.3e-06 [renormalize]: 3.80009e-07 [cse]: 2.531e-05 [optimize_parallel_all_gather_comm]: 2.035e-05 [overlap_param_gather]: 1.727e-05 [cconv]: 2.246e-05 [loop_unroll]: 0.00043982 [opt_after_cconv]: 0.0001061, [1] [Cycle 1]: 0.00010065, [7] [c_1]: 2.913e-05 [parameter_eliminate]: 2.59001e-06 [updatestate_depend_eliminate]: 5.97999e-06 [updatestate_assign_eliminate]: 3.39001e-06 [updatestate_loads_eliminate]: 2.91e-06 [cse]: 2.437e-05 [renormalize]: 3.29979e-07 [remove_dup_value]: 2.254e-05 [tuple_transform]: 8.97e-05, [1] [Cycle 1]: 8.545e-05, [4] [d_1]: 5.825e-05 [none_parameter_eliminate]: 1.22999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.47998e-06 [partial_unused_args_eliminate]: 1.00001e-06 [add_recomputation]: 5.281e-05 [cse_after_recomputation]: 2.655e-05, [1] [Cycle 1]: 2.215e-05, [1] [cse]: 1.701e-05 [environ_conv]: 1.639e-05 [swap_dp_allreduce_reducescatter]: 3.213e-05 [bias_add_comm_swap]: 1.786e-05 [label_micro_interleaved_index]: 1.919e-05 [label_fine_grained_interleaved_index]: 5.47001e-06 [merge_cast_opt]: 6.19999e-07 [slice_recompute_activation]: 8.49977e-07 [micro_interleaved_order_control]: 1.45999e-06 [assign_add_opt]: 1.35001e-06 [ForceFp32Comm]: 8.29983e-07 [remove_cast_before_assign_add]: 1.467e-05 [full_micro_interleaved_order_control]: 1.386e-05 [reorder_send_recv_between_fp_bp]: 2.59999e-06 [comm_op_add_attrs]: 4.77e-06 [add_comm_op_reuse_tag]: 4.10015e-07 [interleave_split_concat_branches]: 7.29982e-07 [interleave_parallel_branches]: 9.61998e-06 [overlap_opt_shard_in_pipeline]: 1.628e-05 [overlap_opt_shard_grad_in_pipeline]: 2.07001e-06 [control_data_broadcast_order]: 1.441e-05 [grouped_pairwise_exchange_alltoall]: 1.60999e-06 [offloading_packed_experts]: 4.70999e-06 [overlap_recompute_and_grad_model_parallel]: 1.756e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 4.83001e-06 [overlap_recompute_comm]: 1.08001e-06 [overlap_grad_ring_attention]: 2.743e-05 [overlap_grad_flash_sp]: 6.156e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 1.115e-05 [split_layernorm_comm]: 1.95001e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 0.00011318, [1] [Cycle 1]: 0.00010863, [6] [build]: 3.441e-05 [elim_shapecalc]: 1.25e-05 [elim_not_effective]: 1.445e-05 [opt_reshape]: 7.59002e-06 [fold_const_symbol]: 1.122e-05 [renormalize]: 2.09984e-07 [detach_backward]: 1.17e-06 [pipeline_parallel_scheduler]: 9.50007e-07 [auto_monad_reorder]: 2.902e-05 [get_jit_bprop_graph]: 1.14998e-06 [rewriter_after_jit_bprop_graph]: 3.13e-06 [opt_after_jit_grad]: 0.00047504 [validate]: 0.00010075 [backend_pass]: 7.89994e-07 [task_emit]: 5.31731 [execute]: 1.04e-05 Sums bootstrap : 0.001009s : 0.02% type_inference : 0.062226s : 1.16% event_method : 0.000081s : 0.00% auto_monad : 0.000161s : 0.00% graph_reusing : 0.000009s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000042s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000018s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000065s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000262s : 0.00% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000143s : 0.00% optimize.opt_a.loop_unroll : 0.000045s : 0.00% optimize.opt_a.a_1 : 0.000917s : 0.02% optimize.opt_a.with_stream_mark : 0.000026s : 0.00% optimize.opt_a.recompute_prepare : 0.000016s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000018s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000018s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000185s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.00% optimize.opt_a.shard : 0.000002s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.00% optimize.opt_a.merge_send_recv : 0.000058s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000102s : 0.00% optimize.opt_a.flash_sp : 0.000054s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000020s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000030s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000014s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.00% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000029s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000022s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000026s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.00% optimize.opt_a.a_after_grad : 0.000019s : 0.00% optimize.opt_a.renormalize : 0.001185s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.00% optimize.opt_a.cse : 0.000108s : 0.00% optimize.opt_a.a_3 : 0.000086s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000039s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000477s : 0.01% optimize.opt_b.b_1 : 0.000153s : 0.00% optimize.opt_b.b_2 : 0.000031s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000025s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.00% optimize.overlap_param_gather : 0.000017s : 0.00% optimize.cconv : 0.000022s : 0.00% optimize.loop_unroll : 0.000440s : 0.01% optimize.opt_after_cconv.c_1 : 0.000029s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000023s : 0.00% optimize.tuple_transform.d_1 : 0.000058s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000053s : 0.00% optimize.cse_after_recomputation.cse : 0.000017s : 0.00% optimize.environ_conv : 0.000016s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000032s : 0.00% optimize.bias_add_comm_swap : 0.000018s : 0.00% optimize.label_micro_interleaved_index : 0.000019s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000001s : 0.00% optimize.micro_interleaved_order_control : 0.000001s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000015s : 0.00% optimize.full_micro_interleaved_order_control : 0.000014s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000005s : 0.00% optimize.add_comm_op_reuse_tag : 0.000000s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000010s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000016s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000018s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000005s : 0.00% optimize.overlap_recompute_comm : 0.000001s : 0.00% optimize.overlap_grad_ring_attention : 0.000027s : 0.00% optimize.overlap_grad_flash_sp : 0.000062s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000034s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000029s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000475s : 0.01% validate : 0.000101s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 5.317312s : 98.71% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000319 62 0.57% : 0.000002s : 3: substitution.elim_not_effective 2.52% : 0.000008s : 3: substitution.float_tuple_getitem_switch 0.43% : 0.000001s : 3: substitution.fold_const_symbol 1.30% : 0.000004s : 4: substitution.graph_param_transform 54.58% : 0.000174s : 8: substitution.inline 1.20% : 0.000004s : 6: substitution.j_node_and_user_rematch 2.97% : 0.000009s : 2: substitution.minmaximum_grad 1.64% : 0.000005s : 6: substitution.remove_not_recompute_node 0.92% : 0.000003s : 2: substitution.replace_old_param 6.60% : 0.000021s : 1: substitution.switch_simplify 4.53% : 0.000014s : 4: substitution.tuple_list_convert_item_index_to_positive 4.02% : 0.000013s : 4: substitution.tuple_list_get_item_const_eliminator 3.10% : 0.000010s : 4: substitution.tuple_list_get_item_depend_reorder 12.65% : 0.000040s : 8: substitution.tuple_list_get_item_eliminator 2.98% : 0.000009s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.062160 2 97.23% : 0.060437s : 1: type_inference.infer 2.77% : 0.001723s : 1: type_inference.specialize ------[replace.] 0.000088 11 62.61% : 0.000055s : 8: replace.inline 19.23% : 0.000017s : 1: replace.switch_simplify 18.17% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000207 11 82.10% : 0.000170s : 8: match.inline 9.75% : 0.000020s : 1: match.switch_simplify 8.15% : 0.000017s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000226 1438 1.16% : 0.000003s : 16: predicate.accumulaten_eliminater 0.80% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 1.08% : 0.000002s : 16: predicate.addn_zero_filter 0.95% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.20% : 0.000005s : 24: predicate.arithmetic_simplify 1.01% : 0.000002s : 16: predicate.cast_eliminate 0.60% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.47% : 0.000001s : 8: predicate.depend_value_elim 1.04% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.16% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.98% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.83% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_depend_swap 1.79% : 0.000004s : 28: predicate.environ_get_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.60% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.15% : 0.000005s : 26: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000001s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 5.96% : 0.000013s : 66: predicate.inline 0.71% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 8: predicate.less_batch_normalization 1.82% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.47% : 0.000006s : 42: predicate.load_eliminater 0.89% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.76% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 8: predicate.merge_addn 0.51% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.95% : 0.000002s : 16: predicate.minmaximum_grad 1.04% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 1.96% : 0.000004s : 26: predicate.partial_defer_inline 1.43% : 0.000003s : 22: predicate.partial_eliminate 0.92% : 0.000002s : 16: predicate.print_const_string_wrapper 0.53% : 0.000001s : 8: predicate.reduce_all_const_elim 1.29% : 0.000003s : 16: predicate.reduce_eliminate 2.50% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000001s : 8: predicate.remove_not_recompute_node 1.25% : 0.000003s : 26: predicate.replace_applicator 0.33% : 0.000001s : 8: predicate.replace_old_param 0.17% : 0.000000s : 4: predicate.reset_defer_inline 1.07% : 0.000002s : 16: predicate.reshape_eliminate 0.53% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.69% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.74% : 0.000002s : 8: predicate.shard_identity_eliminate 0.60% : 0.000001s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 0.69% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.67% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.78% : 0.000004s : 26: predicate.switch_defer_inline 2.19% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.99% : 0.000014s : 86: predicate.switch_simplify 0.98% : 0.000002s : 16: predicate.tile_eliminate 0.98% : 0.000002s : 16: predicate.transpose_eliminate 1.82% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000007s : 34: predicate.tuple_list_get_item_eliminator 1.83% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.78% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.34% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 3.06% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 4: predicate.value_based_eliminate 0.66% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.55% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001408 23 58.63% : 0.000826s : 11: func_graph_cloner_run.FuncGraphClonerGraph 41.37% : 0.000583s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 5.412893 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.14% : 0.007741s : 1: add_attr 0.14% : 0.007730s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000057s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000170s : 1: auto_monad 0.00% : 0.000033s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000021s : 1: bias_add_comm_swap 0.02% : 0.001062s : 1: bootstrap 0.00% : 0.000026s : 1: cconv 0.00% : 0.000008s : 1: comm_op_add_attrs 0.00% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000029s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000004s : 1: detach_backward 0.00% : 0.000020s : 1: environ_conv 0.00% : 0.000089s : 1: event_method 0.00% : 0.000038s : 1: execute 0.00% : 0.000017s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000013s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000012s : 1: interleave_parallel_branches 0.00% : 0.000003s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000022s : 1: label_micro_interleaved_index 0.01% : 0.000448s : 1: loop_unroll 0.00% : 0.000003s : 1: merge_cast_opt 0.00% : 0.000004s : 1: micro_interleaved_order_control 0.01% : 0.000485s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000016s : 1: opt.transform.mutable_eliminate 0.03% : 0.001473s : 78: opt.transform.opt_a 0.00% : 0.000028s : 1: opt.transform.opt_after_cconv 0.00% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000156s : 28: opt.transform.opt_b 0.00% : 0.000064s : 2: opt.transform.opt_trans_graph 0.00% : 0.000042s : 4: opt.transform.symbol_engine_opt 0.07% : 0.003811s : 1: opt_a 0.00% : 0.000110s : 1: opt_after_cconv 0.01% : 0.000484s : 1: opt_after_jit_grad 0.00% : 0.000266s : 1: opt_b 0.12% : 0.006380s : 1: optimize 0.00% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000065s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000030s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000020s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000021s : 1: overlap_param_gather 0.00% : 0.000007s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000020s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000004s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000070s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000017s : 1: remove_cast_before_assign_add 0.00% : 0.000027s : 1: remove_dup_value 0.01% : 0.000635s : 1: renormalize.infer 0.01% : 0.000542s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000043s : 1: rewriter_after_opt_a 0.00% : 0.000268s : 1: rewriter_before_opt_a 0.00% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000003s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000014s : 1: split_matmul_comm_elemetwise 0.00% : 0.000035s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000116s : 1: symbol_engine_optimizer 98.24% : 5.317395s : 1: task_emit 0.00% : 0.000093s : 1: tuple_transform 1.15% : 0.062240s : 1: type_inference 0.00% : 0.000131s : 1: validate TotalTime = 0.0624158, [24] [bootstrap]: 0.00076973 [type_inference]: 0.0395718 [event_method]: 0.00010216 [auto_monad]: 0.00019331 [graph_reusing]: 1.231e-05 [inline]: 3.4e-06 [add_attr]: 0.0054541, [1] [add_attr_with_inline]: 0.0054355, [1] [Cycle 1]: 0.00012678, [2] [tag_attr]: 3.531e-05 [meta_addattr_fg_expand]: 8.22e-06 [parallel-infer-symbol]: 3.75e-06 [pre_auto_parallel]: 5.601e-05 [insert-virtual-dataset]: 2.70002e-06 [parallel-infer-symbol-second]: 9.09989e-07 [dataset_repeat_opt]: 1.73002e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.00710645, [53] [py_interpret_to_execute]: 1.001e-05 [rewriter_before_opt_a]: 0.00028804 [opt_a]: 0.00432024, [2] [Cycle 1]: 0.00357301, [45] [expand_dump_flag]: 5.02e-06 [switch_simplify]: 0.00010488 [loop_unroll]: 4.028e-05 [a_1]: 0.00088065 [with_stream_mark]: 2.642e-05 [recompute_prepare]: 1.438e-05 [updatestate_depend_eliminate]: 5.84e-06 [updatestate_assign_eliminate]: 4.63999e-06 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 1.74e-06 [a_2]: 0.00010667 [accelerated_algorithm]: 8.33001e-06 [shard]: 1.87999e-06 [meta_shard_fg_expand]: 3.58999e-06 [shard_inline]: 7.43e-06 [merge_send_recv]: 9.51998e-06 [auto_parallel]: 1.023e-05 [parallel]: 0.0001067 [flash_sp]: 1.01e-05 [merge_comm]: 6.34999e-06 [allreduce_fusion]: 4.05e-06 [matmul_add_comm_reduction]: 1.209e-05 [allreduce_slice_to_reducescatter]: 1.23002e-06 [virtual_shard_identity]: 1.106e-05 [virtual_dataset]: 7.6e-06 [get_grad_eliminate_]: 6.94001e-06 [virtual_output]: 7.19001e-06 [merge_forward]: 4.83001e-06 [cell_reuse_recompute_pass]: 1.60999e-06 [offload_activation]: 1.215e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.556e-05 [merge_recompute_call_nodes]: 1.58002e-06 [before_grad]: 1.348e-05 [set_forward_comm_id_for_comm_node_pass]: 4.39002e-06 [meta_fg_expand]: 3.8e-06 [flash_sp_send_recv_attached]: 2.76e-06 [receive_attached]: 2.48e-06 [after_resolve]: 1.121e-05 [a_after_grad]: 1.055e-05 [renormalize]: 0.00166701 [add_forward_monad_depend]: 8.24002e-06 [auto_monad_grad]: 2.30002e-06 [auto_monad_eliminator]: 2.07e-05 [cse]: 4.156e-05 [a_3]: 5.747e-05 [Cycle 2]: 0.00073355, [45] [expand_dump_flag]: 1.87999e-06 [switch_simplify]: 8.77999e-06 [loop_unroll]: 7.78999e-06 [a_1]: 0.00016465 [with_stream_mark]: 1.749e-05 [recompute_prepare]: 6.91999e-06 [updatestate_depend_eliminate]: 4.12e-06 [updatestate_assign_eliminate]: 3.59002e-06 [updatestate_loads_eliminate]: 3.65998e-06 [parameter_eliminate]: 1.72999e-06 [a_2]: 8.841e-05 [accelerated_algorithm]: 6.91999e-06 [shard]: 1.91e-06 [meta_shard_fg_expand]: 2.08002e-06 [shard_inline]: 6.91999e-06 [merge_send_recv]: 7.29001e-06 [auto_parallel]: 8.41002e-06 [parallel]: 6.23002e-06 [flash_sp]: 3.62002e-06 [merge_comm]: 4.24002e-06 [allreduce_fusion]: 3.86999e-06 [matmul_add_comm_reduction]: 8.05999e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 7.44002e-06 [virtual_dataset]: 6.58998e-06 [get_grad_eliminate_]: 6.27001e-06 [virtual_output]: 6.33998e-06 [merge_forward]: 4.23999e-06 [cell_reuse_recompute_pass]: 1.99e-06 [offload_activation]: 9.44998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.186e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.117e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48999e-06 [meta_fg_expand]: 2.49001e-06 [flash_sp_send_recv_attached]: 1.40999e-06 [receive_attached]: 2.04e-06 [after_resolve]: 1.052e-05 [a_after_grad]: 9.54e-06 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.20999e-06 [auto_monad_grad]: 1.37e-06 [auto_monad_eliminator]: 8.62998e-06 [cse]: 1.995e-05 [a_3]: 3.89e-05 [py_interpret_to_execute_after_opt_a]: 6.59999e-06 [slice_cell_reuse_recomputed_activation]: 2.04e-06 [rewriter_after_opt_a]: 6.21e-05 [convert_after_rewriter]: 1.41002e-06 [order_py_execute_after_rewriter]: 1.31998e-06 [mutable_eliminate]: 0.00074453 [opt_b]: 0.0003116, [1] [Cycle 1]: 0.00030256, [7] [b_1]: 0.0001987 [b_2]: 9.71e-06 [updatestate_depend_eliminate]: 1.054e-05 [updatestate_assign_eliminate]: 3.46999e-06 [updatestate_loads_eliminate]: 3.35e-06 [renormalize]: 1.00999e-06 [cse]: 3.36e-05 [optimize_parallel_all_gather_comm]: 2.068e-05 [overlap_param_gather]: 2.56e-06 [cconv]: 3.483e-05 [loop_unroll]: 0.00049017 [opt_after_cconv]: 0.00011254, [1] [Cycle 1]: 0.00010592, [7] [c_1]: 3.058e-05 [parameter_eliminate]: 4.59002e-06 [updatestate_depend_eliminate]: 6.90998e-06 [updatestate_assign_eliminate]: 3.34001e-06 [updatestate_loads_eliminate]: 3.13e-06 [cse]: 2.372e-05 [renormalize]: 4.40021e-07 [remove_dup_value]: 2.018e-05 [tuple_transform]: 9.95e-05, [1] [Cycle 1]: 9.404e-05, [4] [d_1]: 6.269e-05 [none_parameter_eliminate]: 2.36e-06 [renormalize]: 3.00002e-07 [switch_simplify]: 7.71001e-06 [partial_unused_args_eliminate]: 2.01e-06 [add_recomputation]: 5.565e-05 [cse_after_recomputation]: 2.574e-05, [1] [Cycle 1]: 2.114e-05, [1] [cse]: 1.573e-05 [environ_conv]: 1.149e-05 [swap_dp_allreduce_reducescatter]: 6.39001e-06 [bias_add_comm_swap]: 2.68e-06 [label_micro_interleaved_index]: 5.14e-06 [label_fine_grained_interleaved_index]: 2.72001e-06 [merge_cast_opt]: 1.43002e-06 [slice_recompute_activation]: 2.27001e-06 [micro_interleaved_order_control]: 2.34001e-06 [assign_add_opt]: 1.33002e-06 [ForceFp32Comm]: 9.00007e-07 [remove_cast_before_assign_add]: 1.06002e-06 [full_micro_interleaved_order_control]: 2.34999e-06 [reorder_send_recv_between_fp_bp]: 2.67001e-06 [comm_op_add_attrs]: 1.17e-06 [add_comm_op_reuse_tag]: 9.10019e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.38002e-06 [overlap_opt_shard_in_pipeline]: 3.826e-05 [overlap_opt_shard_grad_in_pipeline]: 1.99e-06 [control_data_broadcast_order]: 1.579e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 5.47001e-06 [overlap_recompute_and_grad_model_parallel]: 6.22001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.44e-06 [overlap_recompute_allgather_and_fa_grad]: 1.57999e-06 [overlap_recompute_comm]: 2.29999e-06 [overlap_grad_ring_attention]: 5.36002e-06 [overlap_grad_flash_sp]: 2.385e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.14e-06 [split_layernorm_comm]: 1.95001e-06 [handle_group_info]: 1.12e-06 [symbol_engine_optimizer]: 9.54e-05, [1] [Cycle 1]: 9.041e-05, [6] [build]: 1.433e-05 [elim_shapecalc]: 1.13e-05 [elim_not_effective]: 1.651e-05 [opt_reshape]: 7.53e-06 [fold_const_symbol]: 1.245e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.34001e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 2.228e-05 [get_jit_bprop_graph]: 2.19001e-06 [rewriter_after_jit_bprop_graph]: 6.16998e-06 [opt_after_jit_grad]: 0.00051194 [validate]: 8.132e-05 [backend_pass]: 1.27e-06 [task_emit]: 0.00818617 [execute]: 9.52999e-06 Sums bootstrap : 0.000770s : 1.38% type_inference : 0.039572s : 70.92% event_method : 0.000102s : 0.18% auto_monad : 0.000193s : 0.35% graph_reusing : 0.000012s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000035s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000056s : 0.10% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000010s : 0.02% optimize.rewriter_before_opt_a : 0.000288s : 0.52% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000114s : 0.20% optimize.opt_a.loop_unroll : 0.000048s : 0.09% optimize.opt_a.a_1 : 0.001045s : 1.87% optimize.opt_a.with_stream_mark : 0.000044s : 0.08% optimize.opt_a.recompute_prepare : 0.000021s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000195s : 0.35% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.03% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.03% optimize.opt_a.merge_send_recv : 0.000017s : 0.03% optimize.opt_a.auto_parallel : 0.000019s : 0.03% optimize.opt_a.parallel : 0.000113s : 0.20% optimize.opt_a.flash_sp : 0.000014s : 0.02% optimize.opt_a.merge_comm : 0.000011s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.03% optimize.opt_a.virtual_dataset : 0.000014s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000014s : 0.02% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000022s : 0.04% optimize.opt_a.a_after_grad : 0.000020s : 0.04% optimize.opt_a.renormalize : 0.001667s : 2.99% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.05% optimize.opt_a.cse : 0.000062s : 0.11% optimize.opt_a.a_3 : 0.000096s : 0.17% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000062s : 0.11% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000745s : 1.33% optimize.opt_b.b_1 : 0.000199s : 0.36% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000034s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.04% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000035s : 0.06% optimize.loop_unroll : 0.000490s : 0.88% optimize.opt_after_cconv.c_1 : 0.000031s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000020s : 0.04% optimize.tuple_transform.d_1 : 0.000063s : 0.11% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000056s : 0.10% optimize.cse_after_recomputation.cse : 0.000016s : 0.03% optimize.environ_conv : 0.000011s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000038s : 0.07% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.04% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000014s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000022s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000512s : 0.92% validate : 0.000081s : 0.15% backend_pass : 0.000001s : 0.00% task_emit : 0.008186s : 14.67% execute : 0.000010s : 0.02% Time group info: ------[substitution.] 0.000400 62 0.60% : 0.000002s : 3: substitution.elim_not_effective 2.22% : 0.000009s : 3: substitution.float_tuple_getitem_switch 0.67% : 0.000003s : 3: substitution.fold_const_symbol 1.49% : 0.000006s : 4: substitution.graph_param_transform 63.01% : 0.000252s : 8: substitution.inline 1.51% : 0.000006s : 6: substitution.j_node_and_user_rematch 1.44% : 0.000006s : 2: substitution.minmaximum_grad 1.49% : 0.000006s : 6: substitution.remove_not_recompute_node 1.27% : 0.000005s : 2: substitution.replace_old_param 2.27% : 0.000009s : 1: substitution.switch_simplify 5.03% : 0.000020s : 4: substitution.tuple_list_convert_item_index_to_positive 2.04% : 0.000008s : 4: substitution.tuple_list_get_item_const_eliminator 3.39% : 0.000014s : 4: substitution.tuple_list_get_item_depend_reorder 10.54% : 0.000042s : 8: substitution.tuple_list_get_item_eliminator 3.02% : 0.000012s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.039468 2 94.04% : 0.037116s : 1: type_inference.infer 5.96% : 0.002352s : 1: type_inference.specialize ------[replace.] 0.000120 11 59.99% : 0.000072s : 8: replace.inline 21.80% : 0.000026s : 1: replace.switch_simplify 18.21% : 0.000022s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000259 11 95.19% : 0.000247s : 8: match.inline 3.18% : 0.000008s : 1: match.switch_simplify 1.63% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000246 1438 0.87% : 0.000002s : 16: predicate.accumulaten_eliminater 0.80% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 0.98% : 0.000002s : 16: predicate.addn_zero_filter 0.85% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.14% : 0.000005s : 24: predicate.arithmetic_simplify 0.96% : 0.000002s : 16: predicate.cast_eliminate 0.50% : 0.000001s : 8: predicate.check_bprop_eliminate 0.45% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.46% : 0.000001s : 8: predicate.depend_value_elim 0.96% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 16: predicate.dict_set_item_eliminator 1.09% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.38% : 0.000001s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.32% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_depend_swap 1.55% : 0.000004s : 28: predicate.environ_get_eliminate 1.09% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.50% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.65% : 0.000007s : 26: predicate.float_depend_g_call 0.43% : 0.000001s : 8: predicate.float_environ_get_switch 0.74% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.38% : 0.000016s : 66: predicate.inline 0.66% : 0.000002s : 8: predicate.inline_without_move 0.24% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.91% : 0.000002s : 8: predicate.less_batch_normalization 1.65% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.44% : 0.000006s : 42: predicate.load_eliminater 1.26% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.96% : 0.000007s : 46: predicate.loop_unroll_before_grad 1.54% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.48% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 16: predicate.minmaximum_grad 1.18% : 0.000003s : 4: predicate.mutable_eliminate 0.37% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 2.35% : 0.000006s : 26: predicate.partial_defer_inline 1.33% : 0.000003s : 22: predicate.partial_eliminate 0.91% : 0.000002s : 16: predicate.print_const_string_wrapper 0.48% : 0.000001s : 8: predicate.reduce_all_const_elim 1.29% : 0.000003s : 16: predicate.reduce_eliminate 2.29% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000001s : 8: predicate.remove_not_recompute_node 1.16% : 0.000003s : 26: predicate.replace_applicator 0.50% : 0.000001s : 8: predicate.replace_old_param 0.27% : 0.000001s : 4: predicate.reset_defer_inline 1.13% : 0.000003s : 16: predicate.reshape_eliminate 0.52% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.30% : 0.000001s : 4: predicate.row_tensor_eliminate 0.70% : 0.000002s : 8: predicate.same_eliminate 0.41% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 8: predicate.shard_identity_eliminate 0.65% : 0.000002s : 8: predicate.special_op_eliminate 0.66% : 0.000002s : 8: predicate.specialize_transform 0.91% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.63% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.25% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.61% : 0.000004s : 26: predicate.switch_defer_inline 2.02% : 0.000005s : 34: predicate.switch_layer_defer_inline 6.35% : 0.000016s : 86: predicate.switch_simplify 0.96% : 0.000002s : 16: predicate.tile_eliminate 0.88% : 0.000002s : 16: predicate.transpose_eliminate 1.70% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.63% : 0.000009s : 34: predicate.tuple_list_get_item_eliminator 1.57% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.07% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.64% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.17% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 2.83% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000002s : 8: predicate.virtual_output_eliminate 0.32% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.51% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001992 23 60.51% : 0.001206s : 11: func_graph_cloner_run.FuncGraphClonerGraph 39.49% : 0.000787s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.078480 196 0.00% : 0.000004s : 1: ForceFp32Comm 6.96% : 0.005462s : 1: add_attr 6.93% : 0.005440s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000060s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.26% : 0.000204s : 1: auto_monad 0.03% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.04% : 0.000814s : 1: bootstrap 0.05% : 0.000039s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.04% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000015s : 1: environ_conv 0.14% : 0.000113s : 1: event_method 0.02% : 0.000016s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000017s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.64% : 0.000499s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.96% : 0.000757s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000021s : 1: opt.transform.mutable_eliminate 2.05% : 0.001610s : 78: opt.transform.opt_a 0.04% : 0.000029s : 1: opt.transform.opt_after_cconv 0.04% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000150s : 28: opt.transform.opt_b 0.09% : 0.000068s : 2: opt.transform.opt_trans_graph 0.06% : 0.000044s : 4: opt.transform.symbol_engine_opt 5.51% : 0.004324s : 1: opt_a 0.15% : 0.000116s : 1: opt_after_cconv 0.66% : 0.000521s : 1: opt_after_jit_grad 0.40% : 0.000316s : 1: opt_b 9.06% : 0.007113s : 1: optimize 0.03% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.05% : 0.000042s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000061s : 1: pre_auto_parallel 0.02% : 0.000014s : 1: py_interpret_to_execute 0.01% : 0.000010s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000024s : 1: remove_dup_value 1.21% : 0.000947s : 1: renormalize.infer 0.90% : 0.000709s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000067s : 1: rewriter_after_opt_a 0.38% : 0.000296s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000099s : 1: symbol_engine_optimizer 10.46% : 0.008209s : 1: task_emit 0.13% : 0.000103s : 1: tuple_transform 50.46% : 0.039604s : 1: type_inference 0.17% : 0.000135s : 1: validate TotalTime = 0.609262, [24] [bootstrap]: 0.00080042 [type_inference]: 0.0820864 [event_method]: 0.00023966 [auto_monad]: 0.00029157 [graph_reusing]: 1.908e-05 [inline]: 2.61999e-06 [add_attr]: 0.00523239, [1] [add_attr_with_inline]: 0.00521675, [1] [Cycle 1]: 0.00011988, [2] [tag_attr]: 6.105e-05 [meta_addattr_fg_expand]: 1.287e-05 [parallel-infer-symbol]: 4.06001e-06 [pre_auto_parallel]: 8.22e-05 [insert-virtual-dataset]: 3.00002e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 1.89999e-06 [pipeline_split]: 1.77999e-06 [optimize]: 0.0766252, [53] [py_interpret_to_execute]: 9.62999e-06 [rewriter_before_opt_a]: 0.00043556 [opt_a]: 0.0727861, [3] [Cycle 1]: 0.0647495, [45] [expand_dump_flag]: 5.35999e-06 [switch_simplify]: 0.00019171 [loop_unroll]: 7.438e-05 [a_1]: 0.00167575 [with_stream_mark]: 3.695e-05 [recompute_prepare]: 2.658e-05 [updatestate_depend_eliminate]: 1.02e-05 [updatestate_assign_eliminate]: 8.19998e-06 [updatestate_loads_eliminate]: 7.37002e-06 [parameter_eliminate]: 3.08998e-06 [a_2]: 0.0002268 [accelerated_algorithm]: 1.592e-05 [shard]: 1.59998e-06 [meta_shard_fg_expand]: 4.94998e-06 [shard_inline]: 1.566e-05 [merge_send_recv]: 1.86e-05 [auto_parallel]: 1.587e-05 [parallel]: 4.923e-05 [flash_sp]: 1.646e-05 [merge_comm]: 1.079e-05 [allreduce_fusion]: 1.004e-05 [matmul_add_comm_reduction]: 3.454e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 2.136e-05 [virtual_dataset]: 1.479e-05 [get_grad_eliminate_]: 1.382e-05 [virtual_output]: 1.455e-05 [merge_forward]: 1.072e-05 [cell_reuse_recompute_pass]: 1.84e-06 [offload_activation]: 1.926e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.2e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 2.778e-05 [set_forward_comm_id_for_comm_node_pass]: 9.99001e-06 [meta_fg_expand]: 0.0224663 [flash_sp_send_recv_attached]: 4.92e-06 [receive_attached]: 2.62001e-06 [after_resolve]: 0.00010444 [a_after_grad]: 0.00013835 [renormalize]: 0.0369153 [add_forward_monad_depend]: 2.99e-05 [auto_monad_grad]: 1.595e-05 [auto_monad_eliminator]: 0.00013667 [cse]: 0.00037013 [a_3]: 0.00152031 [Cycle 2]: 0.00658298, [45] [expand_dump_flag]: 4.35999e-06 [switch_simplify]: 9.433e-05 [loop_unroll]: 8.633e-05 [a_1]: 0.00196821 [with_stream_mark]: 4.131e-05 [recompute_prepare]: 2.314e-05 [updatestate_depend_eliminate]: 1.132e-05 [updatestate_assign_eliminate]: 8.54e-06 [updatestate_loads_eliminate]: 7.9e-06 [parameter_eliminate]: 2.98998e-06 [a_2]: 0.00021334 [accelerated_algorithm]: 3.983e-05 [shard]: 2.04e-06 [meta_shard_fg_expand]: 5.84e-06 [shard_inline]: 1.524e-05 [merge_send_recv]: 1.39e-05 [auto_parallel]: 1.554e-05 [parallel]: 1.053e-05 [flash_sp]: 4.43999e-06 [merge_comm]: 9.52999e-06 [allreduce_fusion]: 8.26002e-06 [matmul_add_comm_reduction]: 1.707e-05 [allreduce_slice_to_reducescatter]: 7.80012e-07 [virtual_shard_identity]: 1.725e-05 [virtual_dataset]: 1.367e-05 [get_grad_eliminate_]: 1.324e-05 [virtual_output]: 1.377e-05 [merge_forward]: 1.143e-05 [cell_reuse_recompute_pass]: 1.99e-06 [offload_activation]: 1.741e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.918e-05 [merge_recompute_call_nodes]: 1.59998e-06 [before_grad]: 2.475e-05 [set_forward_comm_id_for_comm_node_pass]: 9.77999e-06 [meta_fg_expand]: 0.00028291 [flash_sp_send_recv_attached]: 2.81999e-06 [receive_attached]: 2.60002e-06 [after_resolve]: 2.804e-05 [a_after_grad]: 2.272e-05 [renormalize]: 0.00270338 [add_forward_monad_depend]: 1.08e-05 [auto_monad_grad]: 3.09999e-06 [auto_monad_eliminator]: 3.488e-05 [cse]: 0.0002397 [a_3]: 0.00011874 [Cycle 3]: 0.00143057, [45] [expand_dump_flag]: 2.93e-06 [switch_simplify]: 1.781e-05 [loop_unroll]: 1.417e-05 [a_1]: 0.00038625 [with_stream_mark]: 2.675e-05 [recompute_prepare]: 1.556e-05 [updatestate_depend_eliminate]: 9.37999e-06 [updatestate_assign_eliminate]: 7.9e-06 [updatestate_loads_eliminate]: 7.82e-06 [parameter_eliminate]: 2.15002e-06 [a_2]: 0.0002031 [accelerated_algorithm]: 2.081e-05 [shard]: 2.32001e-06 [meta_shard_fg_expand]: 4.1e-06 [shard_inline]: 1.347e-05 [merge_send_recv]: 1.411e-05 [auto_parallel]: 1.584e-05 [parallel]: 9.56998e-06 [flash_sp]: 1.38002e-06 [merge_comm]: 8.49002e-06 [allreduce_fusion]: 8.94e-06 [matmul_add_comm_reduction]: 1.63e-05 [allreduce_slice_to_reducescatter]: 9.79984e-07 [virtual_shard_identity]: 1.562e-05 [virtual_dataset]: 1.365e-05 [get_grad_eliminate_]: 1.29e-05 [virtual_output]: 1.295e-05 [merge_forward]: 8.94e-06 [cell_reuse_recompute_pass]: 2.68e-06 [offload_activation]: 1.66e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.714e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 2.393e-05 [set_forward_comm_id_for_comm_node_pass]: 8.82999e-06 [meta_fg_expand]: 6.09001e-06 [flash_sp_send_recv_attached]: 2.19001e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 1.968e-05 [a_after_grad]: 2.153e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.34001e-06 [auto_monad_grad]: 2.46998e-06 [auto_monad_eliminator]: 2.378e-05 [cse]: 7.018e-05 [a_3]: 9.245e-05 [py_interpret_to_execute_after_opt_a]: 1.178e-05 [slice_cell_reuse_recomputed_activation]: 2.37001e-06 [rewriter_after_opt_a]: 5.731e-05 [convert_after_rewriter]: 1.39e-06 [order_py_execute_after_rewriter]: 1.29998e-06 [mutable_eliminate]: 0.00081505 [opt_b]: 0.00056076, [1] [Cycle 1]: 0.00055228, [7] [b_1]: 0.00037499 [b_2]: 1.661e-05 [updatestate_depend_eliminate]: 1.695e-05 [updatestate_assign_eliminate]: 7.82998e-06 [updatestate_loads_eliminate]: 7.41999e-06 [renormalize]: 1.04e-06 [cse]: 8.279e-05 [optimize_parallel_all_gather_comm]: 3.291e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 4.034e-05 [loop_unroll]: 0.00058733 [opt_after_cconv]: 0.00024181, [1] [Cycle 1]: 0.0002339, [7] [c_1]: 9.205e-05 [parameter_eliminate]: 5.54998e-06 [updatestate_depend_eliminate]: 1.348e-05 [updatestate_assign_eliminate]: 7.21999e-06 [updatestate_loads_eliminate]: 7.05998e-06 [cse]: 6.859e-05 [renormalize]: 6.80011e-07 [remove_dup_value]: 0.0001274 [tuple_transform]: 0.00018282, [1] [Cycle 1]: 0.00017686, [4] [d_1]: 0.00013504 [none_parameter_eliminate]: 2.53e-06 [renormalize]: 2.89991e-07 [switch_simplify]: 1.563e-05 [partial_unused_args_eliminate]: 2.16e-06 [add_recomputation]: 9.253e-05 [cse_after_recomputation]: 5.505e-05, [1] [Cycle 1]: 4.991e-05, [1] [cse]: 4.31e-05 [environ_conv]: 1.475e-05 [swap_dp_allreduce_reducescatter]: 1.209e-05 [bias_add_comm_swap]: 3.33e-06 [label_micro_interleaved_index]: 6.02999e-06 [label_fine_grained_interleaved_index]: 3.45e-06 [merge_cast_opt]: 1.58002e-06 [slice_recompute_activation]: 2.37001e-06 [micro_interleaved_order_control]: 2.11e-06 [assign_add_opt]: 1.54e-06 [ForceFp32Comm]: 8.89995e-07 [remove_cast_before_assign_add]: 1.34e-06 [full_micro_interleaved_order_control]: 2.17001e-06 [reorder_send_recv_between_fp_bp]: 2.61e-06 [comm_op_add_attrs]: 1.40001e-06 [add_comm_op_reuse_tag]: 1.21997e-06 [interleave_split_concat_branches]: 1.32e-06 [interleave_parallel_branches]: 1.18001e-06 [overlap_opt_shard_in_pipeline]: 1.49998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.97999e-06 [control_data_broadcast_order]: 2.833e-05 [grouped_pairwise_exchange_alltoall]: 1.49998e-06 [offloading_packed_experts]: 7.87998e-06 [overlap_recompute_and_grad_model_parallel]: 8.99998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.42999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.41998e-06 [overlap_recompute_comm]: 2.46e-06 [overlap_grad_ring_attention]: 7.42998e-06 [overlap_grad_flash_sp]: 3.793e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 2.49999e-06 [split_layernorm_comm]: 1.56998e-06 [handle_group_info]: 1.08001e-06 [symbol_engine_optimizer]: 0.00014269, [1] [Cycle 1]: 0.00013637, [6] [build]: 1.47e-05 [elim_shapecalc]: 2.267e-05 [elim_not_effective]: 2.735e-05 [opt_reshape]: 1.449e-05 [fold_const_symbol]: 2.532e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.17001e-06 [pipeline_parallel_scheduler]: 1.73002e-06 [auto_monad_reorder]: 3.318e-05 [get_jit_bprop_graph]: 2.30002e-06 [rewriter_after_jit_bprop_graph]: 6.27001e-06 [opt_after_jit_grad]: 0.00067368 [validate]: 8.839e-05 [backend_pass]: 1.15001e-06 [task_emit]: 0.442706 [execute]: 1.094e-05 Sums bootstrap : 0.000800s : 0.13% type_inference : 0.082086s : 13.63% event_method : 0.000240s : 0.04% auto_monad : 0.000292s : 0.05% graph_reusing : 0.000019s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000061s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000082s : 0.01% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000010s : 0.00% optimize.rewriter_before_opt_a : 0.000436s : 0.07% optimize.opt_a.expand_dump_flag : 0.000013s : 0.00% optimize.opt_a.switch_simplify : 0.000304s : 0.05% optimize.opt_a.loop_unroll : 0.000175s : 0.03% optimize.opt_a.a_1 : 0.004030s : 0.67% optimize.opt_a.with_stream_mark : 0.000105s : 0.02% optimize.opt_a.recompute_prepare : 0.000065s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000031s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000025s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000023s : 0.00% optimize.opt_a.parameter_eliminate : 0.000008s : 0.00% optimize.opt_a.a_2 : 0.000643s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000077s : 0.01% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000015s : 0.00% optimize.opt_a.shard_inline : 0.000044s : 0.01% optimize.opt_a.merge_send_recv : 0.000047s : 0.01% optimize.opt_a.auto_parallel : 0.000047s : 0.01% optimize.opt_a.parallel : 0.000069s : 0.01% optimize.opt_a.flash_sp : 0.000022s : 0.00% optimize.opt_a.merge_comm : 0.000029s : 0.00% optimize.opt_a.allreduce_fusion : 0.000027s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000068s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000054s : 0.01% optimize.opt_a.virtual_dataset : 0.000042s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000040s : 0.01% optimize.opt_a.virtual_output : 0.000041s : 0.01% optimize.opt_a.merge_forward : 0.000031s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% optimize.opt_a.offload_activation : 0.000053s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000088s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000076s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000029s : 0.00% optimize.opt_a.meta_fg_expand : 0.022755s : 3.78% optimize.opt_a.flash_sp_send_recv_attached : 0.000010s : 0.00% optimize.opt_a.receive_attached : 0.000008s : 0.00% optimize.opt_a.after_resolve : 0.000152s : 0.03% optimize.opt_a.a_after_grad : 0.000183s : 0.03% optimize.opt_a.renormalize : 0.039619s : 6.58% optimize.opt_a.add_forward_monad_depend : 0.000043s : 0.01% optimize.opt_a.auto_monad_grad : 0.000022s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000195s : 0.03% optimize.opt_a.cse : 0.000680s : 0.11% optimize.opt_a.a_3 : 0.001732s : 0.29% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000057s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000815s : 0.14% optimize.opt_b.b_1 : 0.000375s : 0.06% optimize.opt_b.b_2 : 0.000017s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000017s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000083s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000033s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000040s : 0.01% optimize.loop_unroll : 0.000587s : 0.10% optimize.opt_after_cconv.c_1 : 0.000092s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000013s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.cse : 0.000069s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000127s : 0.02% optimize.tuple_transform.d_1 : 0.000135s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000016s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000093s : 0.02% optimize.cse_after_recomputation.cse : 0.000043s : 0.01% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000012s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000028s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000008s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000038s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000015s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000023s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000027s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000014s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000025s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000033s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000674s : 0.11% validate : 0.000088s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 0.442706s : 73.51% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.002081 315 0.18% : 0.000004s : 8: substitution.elim_not_effective 0.41% : 0.000009s : 12: substitution.float_depend_g_call 0.83% : 0.000017s : 9: substitution.float_tuple_getitem_switch 0.18% : 0.000004s : 8: substitution.fold_const_symbol 33.62% : 0.000700s : 5: substitution.getattr_setattr_resolve 0.49% : 0.000010s : 10: substitution.graph_param_transform 0.14% : 0.000003s : 2: substitution.incorporate_call 0.11% : 0.000002s : 2: substitution.incorporate_call_switch 37.84% : 0.000788s : 24: substitution.inline 1.46% : 0.000030s : 3: substitution.inline_without_move 0.74% : 0.000015s : 25: substitution.j_node_and_user_rematch 1.01% : 0.000021s : 4: substitution.less_batch_normalization 1.09% : 0.000023s : 13: substitution.minmaximum_grad 1.21% : 0.000025s : 12: substitution.partial_eliminate 0.90% : 0.000019s : 25: substitution.remove_not_recompute_node 4.96% : 0.000103s : 32: substitution.replace_applicator 0.81% : 0.000017s : 14: substitution.replace_old_param 0.14% : 0.000003s : 1: substitution.set_cell_output_no_recompute 0.65% : 0.000013s : 4: substitution.switch_simplify 0.73% : 0.000015s : 2: substitution.transpose_eliminate 2.85% : 0.000059s : 17: substitution.tuple_list_convert_item_index_to_positive 1.16% : 0.000024s : 17: substitution.tuple_list_get_item_const_eliminator 1.63% : 0.000034s : 17: substitution.tuple_list_get_item_depend_reorder 5.22% : 0.000109s : 32: substitution.tuple_list_get_item_eliminator 1.66% : 0.000035s : 17: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.081918 2 95.00% : 0.077826s : 1: type_inference.infer 5.00% : 0.004093s : 1: type_inference.specialize ------[replace.] 0.000613 45 12.32% : 0.000075s : 4: replace.getattr_setattr_resolve 50.26% : 0.000308s : 24: replace.inline 14.75% : 0.000090s : 5: replace.replace_applicator 9.60% : 0.000059s : 4: replace.switch_simplify 13.07% : 0.000080s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.001501 45 43.29% : 0.000650s : 4: match.getattr_setattr_resolve 51.48% : 0.000773s : 24: match.inline 2.57% : 0.000039s : 5: match.replace_applicator 0.73% : 0.000011s : 4: match.switch_simplify 1.92% : 0.000029s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.001069 7110 0.84% : 0.000009s : 68: predicate.accumulaten_eliminater 0.40% : 0.000004s : 10: predicate.ad_related_special_op_eliminate 0.39% : 0.000004s : 32: predicate.addn_check_dump 0.87% : 0.000009s : 68: predicate.addn_zero_filter 0.80% : 0.000009s : 68: predicate.adjust_all_reduce_mul_add 1.76% : 0.000019s : 100: predicate.arithmetic_simplify 0.87% : 0.000009s : 68: predicate.cast_eliminate 2.71% : 0.000029s : 215: predicate.check_bprop_eliminate 0.40% : 0.000004s : 32: predicate.compare_switch_simplify 0.08% : 0.000001s : 10: predicate.const_output_eliminate 0.40% : 0.000004s : 32: predicate.depend_value_elim 0.92% : 0.000010s : 68: predicate.dict_get_item_const_eliminator 1.01% : 0.000011s : 68: predicate.dict_get_item_eliminator 0.92% : 0.000010s : 68: predicate.dict_set_item_eliminator 0.44% : 0.000005s : 20: predicate.dumpgradient_eliminate 0.08% : 0.000001s : 10: predicate.elim_not_effective 0.18% : 0.000002s : 10: predicate.elim_shapecalc_of_broadcastargs 0.94% : 0.000010s : 78: predicate.environ_add_const_eliminate 0.95% : 0.000010s : 78: predicate.environ_get_add_eliminate 0.95% : 0.000010s : 78: predicate.environ_get_depend_swap 1.35% : 0.000014s : 110: predicate.environ_get_eliminate 0.95% : 0.000010s : 78: predicate.environ_get_set_eliminate 1.28% : 0.000014s : 100: predicate.exchange_switch_depend_value 1.94% : 0.000021s : 100: predicate.float_depend_g_call 0.42% : 0.000004s : 32: predicate.float_environ_get_switch 0.59% : 0.000006s : 42: predicate.float_tuple_getitem_switch 0.07% : 0.000001s : 10: predicate.fold_const_symbol 0.48% : 0.000005s : 32: predicate.get_grad_eliminate 0.75% : 0.000008s : 31: predicate.getattr_setattr_resolve 0.08% : 0.000001s : 10: predicate.graph_param_transform 0.44% : 0.000005s : 32: predicate.incorporate_call 0.38% : 0.000004s : 32: predicate.incorporate_call_switch 4.51% : 0.000048s : 252: predicate.inline 1.48% : 0.000016s : 82: predicate.inline_without_move 0.22% : 0.000002s : 32: predicate.j_node_and_user_rematch 0.63% : 0.000007s : 32: predicate.less_batch_normalization 1.33% : 0.000014s : 96: predicate.list_to_tuple_eliminator_ 2.03% : 0.000022s : 164: predicate.load_eliminater 0.45% : 0.000005s : 10: predicate.loop_unroll_after_grad 2.40% : 0.000026s : 182: predicate.loop_unroll_before_grad 1.14% : 0.000012s : 88: predicate.make_slice_get_slice_eliminator 0.42% : 0.000005s : 32: predicate.merge_addn 2.51% : 0.000027s : 198: predicate.micro_step_allgather_replace 2.65% : 0.000028s : 198: predicate.mini_step_allgather_replace 0.87% : 0.000009s : 68: predicate.minmaximum_grad 0.48% : 0.000005s : 10: predicate.mutable_eliminate 0.14% : 0.000002s : 10: predicate.opt_reshape 0.19% : 0.000002s : 10: predicate.parallel_virtual_node 1.88% : 0.000020s : 100: predicate.partial_defer_inline 1.19% : 0.000013s : 86: predicate.partial_eliminate 0.86% : 0.000009s : 68: predicate.print_const_string_wrapper 0.42% : 0.000004s : 32: predicate.reduce_all_const_elim 1.12% : 0.000012s : 68: predicate.reduce_eliminate 2.01% : 0.000021s : 164: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000003s : 32: predicate.remove_not_recompute_node 2.34% : 0.000025s : 284: predicate.replace_applicator 0.69% : 0.000007s : 82: predicate.replace_old_param 0.09% : 0.000001s : 10: predicate.reset_defer_inline 0.90% : 0.000010s : 68: predicate.reshape_eliminate 2.68% : 0.000029s : 198: predicate.row_tensor_add_zeros_like 0.17% : 0.000002s : 10: predicate.row_tensor_eliminate 3.01% : 0.000032s : 215: predicate.same_eliminate 0.28% : 0.000003s : 32: predicate.set_cell_output_no_recompute 0.47% : 0.000005s : 32: predicate.shard_identity_eliminate 0.31% : 0.000003s : 20: predicate.special_op_eliminate 0.50% : 0.000005s : 32: predicate.specialize_transform 2.73% : 0.000029s : 198: predicate.split_environ_get_set_with_tuple_value 1.34% : 0.000014s : 82: predicate.stack_unstack_eliminate 2.18% : 0.000023s : 10: predicate.switch_call_monad_eliminater 1.42% : 0.000015s : 100: predicate.switch_defer_inline 4.29% : 0.000046s : 315: predicate.switch_layer_defer_inline 4.84% : 0.000052s : 332: predicate.switch_simplify 0.87% : 0.000009s : 68: predicate.tile_eliminate 0.89% : 0.000010s : 68: predicate.transpose_eliminate 1.23% : 0.000013s : 88: predicate.tuple_list_convert_item_index_to_positive 1.30% : 0.000014s : 88: predicate.tuple_list_get_item_const_eliminator 1.20% : 0.000013s : 88: predicate.tuple_list_get_item_depend_reorder 2.41% : 0.000026s : 128: predicate.tuple_list_get_item_eliminator 1.22% : 0.000013s : 88: predicate.tuple_list_get_set_item_eliminator 1.78% : 0.000019s : 120: predicate.tuple_list_set_item_eliminator 1.25% : 0.000013s : 96: predicate.tuple_to_list_eliminator_ 1.88% : 0.000020s : 164: predicate.updatestate_pure_node_eliminater 2.38% : 0.000025s : 196: predicate.updatestate_useless_node_eliminater 0.16% : 0.000002s : 10: predicate.value_based_eliminate 0.52% : 0.000006s : 32: predicate.virtual_dataset_eliminate 0.48% : 0.000005s : 32: predicate.virtual_output_eliminate 0.15% : 0.000002s : 10: predicate.virtual_view_grad_eliminate 0.18% : 0.000002s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.006898 75 67.82% : 0.004678s : 36: func_graph_cloner_run.FuncGraphClonerGraph 32.18% : 0.002220s : 39: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.739773 247 0.00% : 0.000004s : 1: ForceFp32Comm 0.71% : 0.005239s : 1: add_attr 0.71% : 0.005221s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.01% : 0.000098s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000305s : 1: auto_monad 0.01% : 0.000039s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.11% : 0.000846s : 1: bootstrap 0.01% : 0.000044s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000032s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.01% : 0.000058s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000019s : 1: environ_conv 0.03% : 0.000256s : 1: event_method 0.00% : 0.000018s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000024s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.08% : 0.000599s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.11% : 0.000829s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.00% : 0.000030s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000035s : 1: opt.transform.mutable_eliminate 1.02% : 0.007544s : 125: opt.transform.opt_a 0.01% : 0.000090s : 1: opt.transform.opt_after_cconv 0.01% : 0.000060s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000354s : 28: opt.transform.opt_b 0.11% : 0.000851s : 2: opt.transform.opt_resolve 0.02% : 0.000148s : 2: opt.transform.opt_trans_graph 0.01% : 0.000086s : 4: opt.transform.symbol_engine_opt 9.84% : 0.072790s : 1: opt_a 0.03% : 0.000246s : 1: opt_after_cconv 0.09% : 0.000687s : 1: opt_after_jit_grad 0.08% : 0.000565s : 1: opt_b 10.36% : 0.076631s : 1: optimize 0.01% : 0.000037s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000042s : 1: overlap_grad_flash_sp 0.00% : 0.000006s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000088s : 1: pre_auto_parallel 0.00% : 0.000014s : 1: py_interpret_to_execute 0.00% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000134s : 1: remove_dup_value 4.56% : 0.033769s : 2: renormalize.infer 0.79% : 0.005818s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000063s : 1: rewriter_after_opt_a 0.06% : 0.000444s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000016s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000146s : 1: symbol_engine_optimizer 59.85% : 0.442729s : 1: task_emit 0.03% : 0.000186s : 1: tuple_transform 11.10% : 0.082119s : 1: type_inference 0.02% : 0.000156s : 1: validate TotalTime = 0.0493717, [24] [bootstrap]: 0.00045133 [type_inference]: 0.0283452 [event_method]: 2.708e-05 [auto_monad]: 8.868e-05 [graph_reusing]: 6.41e-06 [inline]: 2.81e-06 [add_attr]: 0.00392724, [1] [add_attr_with_inline]: 0.0039157, [1] [Cycle 1]: 7.968e-05, [2] [tag_attr]: 3.017e-05 [meta_addattr_fg_expand]: 6.28e-06 [parallel-infer-symbol]: 3.53e-06 [pre_auto_parallel]: 4.61e-05 [insert-virtual-dataset]: 2.93998e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 2.00002e-06 [pipeline_split]: 1.69998e-06 [optimize]: 0.00599041, [53] [py_interpret_to_execute]: 7.03e-06 [rewriter_before_opt_a]: 0.00028239 [opt_a]: 0.00349699, [2] [Cycle 1]: 0.00287494, [45] [expand_dump_flag]: 3.23998e-06 [switch_simplify]: 9.108e-05 [loop_unroll]: 3.199e-05 [a_1]: 0.00065102 [with_stream_mark]: 2.216e-05 [recompute_prepare]: 8.84e-06 [updatestate_depend_eliminate]: 4.33999e-06 [updatestate_assign_eliminate]: 3.73999e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 7.122e-05 [accelerated_algorithm]: 6.19001e-06 [shard]: 1.79998e-06 [meta_shard_fg_expand]: 2.43e-06 [shard_inline]: 5.61e-06 [merge_send_recv]: 8.13999e-06 [auto_parallel]: 8.55001e-06 [parallel]: 1.895e-05 [flash_sp]: 1.101e-05 [merge_comm]: 3.73001e-06 [allreduce_fusion]: 3.39001e-06 [matmul_add_comm_reduction]: 9.32001e-06 [allreduce_slice_to_reducescatter]: 8.89995e-07 [virtual_shard_identity]: 7.73001e-06 [virtual_dataset]: 6.38998e-06 [get_grad_eliminate_]: 5.27999e-06 [virtual_output]: 5.34998e-06 [merge_forward]: 3.70998e-06 [cell_reuse_recompute_pass]: 1.14e-06 [offload_activation]: 1.002e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.324e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 1.112e-05 [set_forward_comm_id_for_comm_node_pass]: 3.57997e-06 [meta_fg_expand]: 2.98e-06 [flash_sp_send_recv_attached]: 2.76e-06 [receive_attached]: 2.45002e-06 [after_resolve]: 1.006e-05 [a_after_grad]: 9.17999e-06 [renormalize]: 0.00142583 [add_forward_monad_depend]: 8.08999e-06 [auto_monad_grad]: 2.68e-06 [auto_monad_eliminator]: 1.951e-05 [cse]: 3.872e-05 [a_3]: 5.203e-05 [Cycle 2]: 0.00060739, [45] [expand_dump_flag]: 2.81999e-06 [switch_simplify]: 8.66002e-06 [loop_unroll]: 5.91e-06 [a_1]: 9.837e-05 [with_stream_mark]: 1.864e-05 [recompute_prepare]: 5.74e-06 [updatestate_depend_eliminate]: 3.9e-06 [updatestate_assign_eliminate]: 2.81999e-06 [updatestate_loads_eliminate]: 2.94001e-06 [parameter_eliminate]: 2.47001e-06 [a_2]: 6.13e-05 [accelerated_algorithm]: 6.57002e-06 [shard]: 1.74e-06 [meta_shard_fg_expand]: 1.97999e-06 [shard_inline]: 5.17999e-06 [merge_send_recv]: 7.47998e-06 [auto_parallel]: 8.18999e-06 [parallel]: 7.41001e-06 [flash_sp]: 3.4e-06 [merge_comm]: 3.21999e-06 [allreduce_fusion]: 3.41001e-06 [matmul_add_comm_reduction]: 8.45001e-06 [allreduce_slice_to_reducescatter]: 1.02998e-06 [virtual_shard_identity]: 6.28e-06 [virtual_dataset]: 4.95999e-06 [get_grad_eliminate_]: 5.10001e-06 [virtual_output]: 5.94e-06 [merge_forward]: 3.9e-06 [cell_reuse_recompute_pass]: 1.97999e-06 [offload_activation]: 9.47001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.537e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 8.69998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.51001e-06 [meta_fg_expand]: 2.07001e-06 [flash_sp_send_recv_attached]: 1.23002e-06 [receive_attached]: 1.81998e-06 [after_resolve]: 8.80001e-06 [a_after_grad]: 7.66001e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.42999e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 6.76e-06 [cse]: 1.578e-05 [a_3]: 3.082e-05 [py_interpret_to_execute_after_opt_a]: 6.09001e-06 [slice_cell_reuse_recomputed_activation]: 1.80001e-06 [rewriter_after_opt_a]: 2.064e-05 [convert_after_rewriter]: 1.17e-06 [order_py_execute_after_rewriter]: 1.09998e-06 [mutable_eliminate]: 0.00073955 [opt_b]: 0.00019638, [1] [Cycle 1]: 0.00018895, [7] [b_1]: 0.00010768 [b_2]: 6.54001e-06 [updatestate_depend_eliminate]: 7.89002e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.24999e-06 [renormalize]: 1.04e-06 [cse]: 2.559e-05 [optimize_parallel_all_gather_comm]: 1.936e-05 [overlap_param_gather]: 2.24001e-06 [cconv]: 3.563e-05 [loop_unroll]: 0.00046715 [opt_after_cconv]: 0.00010296, [1] [Cycle 1]: 9.677e-05, [7] [c_1]: 2.656e-05 [parameter_eliminate]: 5.12999e-06 [updatestate_depend_eliminate]: 6.58e-06 [updatestate_assign_eliminate]: 2.48002e-06 [updatestate_loads_eliminate]: 2.32999e-06 [cse]: 1.946e-05 [renormalize]: 7.89994e-07 [remove_dup_value]: 1.686e-05 [tuple_transform]: 7.116e-05, [1] [Cycle 1]: 6.687e-05, [4] [d_1]: 3.874e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.26e-06 [partial_unused_args_eliminate]: 1.69e-06 [add_recomputation]: 5.501e-05 [cse_after_recomputation]: 2.365e-05, [1] [Cycle 1]: 1.891e-05, [1] [cse]: 1.332e-05 [environ_conv]: 1.037e-05 [swap_dp_allreduce_reducescatter]: 4.97e-06 [bias_add_comm_swap]: 2.73e-06 [label_micro_interleaved_index]: 4.35e-06 [label_fine_grained_interleaved_index]: 2.49001e-06 [merge_cast_opt]: 1.50999e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.14999e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.09e-06 [full_micro_interleaved_order_control]: 2.36e-06 [reorder_send_recv_between_fp_bp]: 2.96999e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.10999e-06 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.10999e-06 [overlap_opt_shard_in_pipeline]: 1.21002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.17999e-06 [control_data_broadcast_order]: 1.299e-05 [grouped_pairwise_exchange_alltoall]: 1.48002e-06 [offloading_packed_experts]: 4.28999e-06 [overlap_recompute_and_grad_model_parallel]: 4.72e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.27e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 1.99e-06 [overlap_grad_ring_attention]: 4.4e-06 [overlap_grad_flash_sp]: 2.032e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.12999e-06 [split_layernorm_comm]: 1.84e-06 [handle_group_info]: 1.33002e-06 [symbol_engine_optimizer]: 8.642e-05, [1] [Cycle 1]: 8.19e-05, [6] [build]: 1.363e-05 [elim_shapecalc]: 1.038e-05 [elim_not_effective]: 1.244e-05 [opt_reshape]: 6.16e-06 [fold_const_symbol]: 9.14e-06 [renormalize]: 1.79978e-07 [detach_backward]: 2.02001e-06 [pipeline_parallel_scheduler]: 1.52999e-06 [auto_monad_reorder]: 1.978e-05 [get_jit_bprop_graph]: 2.31998e-06 [rewriter_after_jit_bprop_graph]: 5.70001e-06 [opt_after_jit_grad]: 0.00055708 [validate]: 5.51e-05 [backend_pass]: 8.90024e-07 [task_emit]: 0.00955947 [execute]: 1.009e-05 Sums bootstrap : 0.000451s : 1.02% type_inference : 0.028345s : 63.91% event_method : 0.000027s : 0.06% auto_monad : 0.000089s : 0.20% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000046s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.02% optimize.rewriter_before_opt_a : 0.000282s : 0.64% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000100s : 0.22% optimize.opt_a.loop_unroll : 0.000038s : 0.09% optimize.opt_a.a_1 : 0.000749s : 1.69% optimize.opt_a.with_stream_mark : 0.000041s : 0.09% optimize.opt_a.recompute_prepare : 0.000015s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000133s : 0.30% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.03% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.02% optimize.opt_a.merge_send_recv : 0.000016s : 0.04% optimize.opt_a.auto_parallel : 0.000017s : 0.04% optimize.opt_a.parallel : 0.000026s : 0.06% optimize.opt_a.flash_sp : 0.000014s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.03% optimize.opt_a.virtual_dataset : 0.000011s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000010s : 0.02% optimize.opt_a.virtual_output : 0.000011s : 0.03% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000019s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000019s : 0.04% optimize.opt_a.a_after_grad : 0.000017s : 0.04% optimize.opt_a.renormalize : 0.001426s : 3.21% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.06% optimize.opt_a.cse : 0.000055s : 0.12% optimize.opt_a.a_3 : 0.000083s : 0.19% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000021s : 0.05% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000740s : 1.67% optimize.opt_b.b_1 : 0.000108s : 0.24% optimize.opt_b.b_2 : 0.000007s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000036s : 0.08% optimize.loop_unroll : 0.000467s : 1.05% optimize.opt_after_cconv.c_1 : 0.000027s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000019s : 0.04% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.04% optimize.tuple_transform.d_1 : 0.000039s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000055s : 0.12% optimize.cse_after_recomputation.cse : 0.000013s : 0.03% optimize.environ_conv : 0.000010s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.05% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000014s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.04% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000557s : 1.26% validate : 0.000055s : 0.12% backend_pass : 0.000001s : 0.00% task_emit : 0.009559s : 21.55% execute : 0.000010s : 0.02% Time group info: ------[substitution.] 0.000233 26 0.90% : 0.000002s : 2: substitution.elim_not_effective 0.55% : 0.000001s : 2: substitution.fold_const_symbol 2.34% : 0.000005s : 3: substitution.graph_param_transform 80.44% : 0.000188s : 6: substitution.inline 1.80% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.50% : 0.000006s : 4: substitution.remove_not_recompute_node 1.84% : 0.000004s : 2: substitution.replace_old_param 3.79% : 0.000009s : 1: substitution.switch_simplify 5.83% : 0.000014s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.028258 2 94.73% : 0.026769s : 1: type_inference.infer 5.27% : 0.001489s : 1: type_inference.specialize ------[replace.] 0.000100 9 54.44% : 0.000055s : 6: replace.inline 25.99% : 0.000026s : 1: replace.switch_simplify 19.57% : 0.000020s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000204 9 90.09% : 0.000184s : 6: match.inline 3.84% : 0.000008s : 1: match.switch_simplify 6.07% : 0.000012s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000189 1092 0.84% : 0.000002s : 12: predicate.accumulaten_eliminater 1.19% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 6: predicate.addn_check_dump 1.00% : 0.000002s : 12: predicate.addn_zero_filter 0.91% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.22% : 0.000004s : 18: predicate.arithmetic_simplify 1.00% : 0.000002s : 12: predicate.cast_eliminate 0.51% : 0.000001s : 6: predicate.check_bprop_eliminate 0.44% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.47% : 0.000001s : 6: predicate.depend_value_elim 1.00% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.18% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.99% : 0.000002s : 12: predicate.dict_set_item_eliminator 1.01% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 3: predicate.elim_not_effective 0.48% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 15: predicate.environ_get_add_eliminate 0.99% : 0.000002s : 15: predicate.environ_get_depend_swap 1.47% : 0.000003s : 21: predicate.environ_get_eliminate 1.09% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.59% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.54% : 0.000005s : 20: predicate.float_depend_g_call 0.45% : 0.000001s : 6: predicate.float_environ_get_switch 0.70% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 3: predicate.fold_const_symbol 0.57% : 0.000001s : 6: predicate.get_grad_eliminate 0.25% : 0.000000s : 3: predicate.graph_param_transform 0.46% : 0.000001s : 6: predicate.incorporate_call 0.42% : 0.000001s : 6: predicate.incorporate_call_switch 5.79% : 0.000011s : 50: predicate.inline 0.75% : 0.000001s : 6: predicate.inline_without_move 0.25% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.83% : 0.000002s : 6: predicate.less_batch_normalization 1.54% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.42% : 0.000005s : 32: predicate.load_eliminater 1.24% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.68% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.61% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 6: predicate.merge_addn 0.53% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 12: predicate.minmaximum_grad 1.76% : 0.000003s : 3: predicate.mutable_eliminate 0.37% : 0.000001s : 3: predicate.opt_reshape 0.48% : 0.000001s : 3: predicate.parallel_virtual_node 2.25% : 0.000004s : 20: predicate.partial_defer_inline 1.31% : 0.000002s : 17: predicate.partial_eliminate 1.12% : 0.000002s : 12: predicate.print_const_string_wrapper 0.52% : 0.000001s : 6: predicate.reduce_all_const_elim 1.36% : 0.000003s : 12: predicate.reduce_eliminate 2.39% : 0.000005s : 32: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 6: predicate.remove_not_recompute_node 1.31% : 0.000002s : 20: predicate.replace_applicator 0.58% : 0.000001s : 6: predicate.replace_old_param 0.34% : 0.000001s : 3: predicate.reset_defer_inline 0.96% : 0.000002s : 12: predicate.reshape_eliminate 0.58% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 3: predicate.row_tensor_eliminate 0.85% : 0.000002s : 6: predicate.same_eliminate 0.34% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.58% : 0.000001s : 6: predicate.shard_identity_eliminate 0.68% : 0.000001s : 6: predicate.special_op_eliminate 0.58% : 0.000001s : 6: predicate.specialize_transform 0.97% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.32% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.64% : 0.000003s : 20: predicate.switch_defer_inline 2.03% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.94% : 0.000011s : 68: predicate.switch_simplify 1.19% : 0.000002s : 12: predicate.tile_eliminate 0.96% : 0.000002s : 12: predicate.transpose_eliminate 1.67% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.54% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.07% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.32% : 0.000002s : 18: predicate.tuple_list_get_set_item_eliminator 2.05% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.54% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.31% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 3.02% : 0.000006s : 38: predicate.updatestate_useless_node_eliminater 0.60% : 0.000001s : 3: predicate.value_based_eliminate 0.73% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.55% : 0.000001s : 6: predicate.virtual_output_eliminate 0.21% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001261 16 57.42% : 0.000724s : 8: func_graph_cloner_run.FuncGraphClonerGraph 42.58% : 0.000537s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.062038 196 0.01% : 0.000004s : 1: ForceFp32Comm 6.34% : 0.003935s : 1: add_attr 6.32% : 0.003920s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000060s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.15% : 0.000094s : 1: auto_monad 0.04% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.78% : 0.000483s : 1: bootstrap 0.06% : 0.000039s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.04% : 0.000027s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000014s : 1: environ_conv 0.05% : 0.000034s : 1: event_method 0.03% : 0.000018s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.77% : 0.000477s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.21% : 0.000751s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000018s : 1: opt.transform.mutable_eliminate 1.91% : 0.001183s : 78: opt.transform.opt_a 0.04% : 0.000025s : 1: opt.transform.opt_after_cconv 0.04% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000085s : 28: opt.transform.opt_b 0.07% : 0.000043s : 2: opt.transform.opt_trans_graph 0.06% : 0.000034s : 4: opt.transform.symbol_engine_opt 5.64% : 0.003500s : 1: opt_a 0.17% : 0.000106s : 1: opt_after_cconv 0.92% : 0.000568s : 1: opt_after_jit_grad 0.35% : 0.000219s : 1: opt_b 9.67% : 0.005996s : 1: optimize 0.04% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000051s : 1: pre_auto_parallel 0.02% : 0.000010s : 1: py_interpret_to_execute 0.02% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000020s : 1: remove_dup_value 1.33% : 0.000824s : 1: renormalize.infer 0.95% : 0.000592s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000024s : 1: rewriter_after_opt_a 0.47% : 0.000290s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000089s : 1: symbol_engine_optimizer 15.45% : 0.009582s : 1: task_emit 0.12% : 0.000074s : 1: tuple_transform 45.74% : 0.028374s : 1: type_inference 0.16% : 0.000099s : 1: validate TotalTime = 0.0551711, [24] [bootstrap]: 0.00090323 [type_inference]: 0.0315998 [event_method]: 2.639e-05 [auto_monad]: 0.00010185 [graph_reusing]: 6.71e-06 [inline]: 2.92002e-06 [add_attr]: 0.00493762, [1] [add_attr_with_inline]: 0.00490237, [1] [Cycle 1]: 8.42e-05, [2] [tag_attr]: 3.074e-05 [meta_addattr_fg_expand]: 6.78e-06 [parallel-infer-symbol]: 3.48999e-06 [pre_auto_parallel]: 4.808e-05 [insert-virtual-dataset]: 2.84001e-06 [parallel-infer-symbol-second]: 6.59988e-07 [dataset_repeat_opt]: 2.03997e-06 [pipeline_split]: 1.58002e-06 [optimize]: 0.00646533, [53] [py_interpret_to_execute]: 1.193e-05 [rewriter_before_opt_a]: 0.00029569 [opt_a]: 0.0037399, [2] [Cycle 1]: 0.00304597, [45] [expand_dump_flag]: 4.06001e-06 [switch_simplify]: 9.097e-05 [loop_unroll]: 3.632e-05 [a_1]: 0.00066318 [with_stream_mark]: 5.038e-05 [recompute_prepare]: 1.082e-05 [updatestate_depend_eliminate]: 5.10001e-06 [updatestate_assign_eliminate]: 3.80998e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 2.26998e-06 [a_2]: 7.496e-05 [accelerated_algorithm]: 6.79999e-06 [shard]: 1.77999e-06 [meta_shard_fg_expand]: 2.76e-06 [shard_inline]: 6.34001e-06 [merge_send_recv]: 8.86002e-06 [auto_parallel]: 9.16002e-06 [parallel]: 1.999e-05 [flash_sp]: 1.067e-05 [merge_comm]: 4.00998e-06 [allreduce_fusion]: 3.38e-06 [matmul_add_comm_reduction]: 1.007e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 8.79e-06 [virtual_dataset]: 6.18998e-06 [get_grad_eliminate_]: 5.95002e-06 [virtual_output]: 6.00002e-06 [merge_forward]: 4.13001e-06 [cell_reuse_recompute_pass]: 1.71002e-06 [offload_activation]: 1.079e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.438e-05 [merge_recompute_call_nodes]: 1.94999e-06 [before_grad]: 1.135e-05 [set_forward_comm_id_for_comm_node_pass]: 3.47997e-06 [meta_fg_expand]: 3.26999e-06 [flash_sp_send_recv_attached]: 2.89001e-06 [receive_attached]: 2.41998e-06 [after_resolve]: 1.059e-05 [a_after_grad]: 9.34e-06 [renormalize]: 0.00149769 [add_forward_monad_depend]: 7.40998e-06 [auto_monad_grad]: 2.66e-06 [auto_monad_eliminator]: 1.911e-05 [cse]: 4.15e-05 [a_3]: 6.873e-05 [Cycle 2]: 0.00068007, [45] [expand_dump_flag]: 2.88998e-06 [switch_simplify]: 9.19e-06 [loop_unroll]: 7.71999e-06 [a_1]: 0.00011537 [with_stream_mark]: 1.806e-05 [recompute_prepare]: 6.18002e-06 [updatestate_depend_eliminate]: 3.67002e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 3.31999e-06 [parameter_eliminate]: 2.49001e-06 [a_2]: 6.848e-05 [accelerated_algorithm]: 6.52001e-06 [shard]: 1.96e-06 [meta_shard_fg_expand]: 1.76998e-06 [shard_inline]: 5.85002e-06 [merge_send_recv]: 7.96001e-06 [auto_parallel]: 9.64e-06 [parallel]: 8.28001e-06 [flash_sp]: 3.96001e-06 [merge_comm]: 3.75e-06 [allreduce_fusion]: 3.26001e-06 [matmul_add_comm_reduction]: 8.72e-06 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 7.3e-06 [virtual_dataset]: 6.14001e-06 [get_grad_eliminate_]: 5.92001e-06 [virtual_output]: 6.06e-06 [merge_forward]: 3.91999e-06 [cell_reuse_recompute_pass]: 2.80002e-06 [offload_activation]: 9.08002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.693e-05 [merge_recompute_call_nodes]: 1.61998e-06 [before_grad]: 1.058e-05 [set_forward_comm_id_for_comm_node_pass]: 3.81999e-06 [meta_fg_expand]: 2.80997e-06 [flash_sp_send_recv_attached]: 1.60999e-06 [receive_attached]: 2.11e-06 [after_resolve]: 1.009e-05 [a_after_grad]: 8.40001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.65001e-06 [auto_monad_grad]: 1.31002e-06 [auto_monad_eliminator]: 8.17e-06 [cse]: 2.153e-05 [a_3]: 3.145e-05 [py_interpret_to_execute_after_opt_a]: 8.44002e-06 [slice_cell_reuse_recomputed_activation]: 2.20002e-06 [rewriter_after_opt_a]: 2.131e-05 [convert_after_rewriter]: 1.52001e-06 [order_py_execute_after_rewriter]: 1.51002e-06 [mutable_eliminate]: 0.00081459 [opt_b]: 0.00021739, [1] [Cycle 1]: 0.00020936, [7] [b_1]: 0.00011749 [b_2]: 6.91001e-06 [updatestate_depend_eliminate]: 8.57998e-06 [updatestate_assign_eliminate]: 2.69999e-06 [updatestate_loads_eliminate]: 3.12002e-06 [renormalize]: 8.70001e-07 [cse]: 3.186e-05 [optimize_parallel_all_gather_comm]: 1.936e-05 [overlap_param_gather]: 2.48002e-06 [cconv]: 3.294e-05 [loop_unroll]: 0.00048326 [opt_after_cconv]: 0.00013713, [1] [Cycle 1]: 0.00013048, [7] [c_1]: 2.461e-05 [parameter_eliminate]: 4.75999e-06 [updatestate_depend_eliminate]: 6.33998e-06 [updatestate_assign_eliminate]: 2.81999e-06 [updatestate_loads_eliminate]: 2.98e-06 [cse]: 2.472e-05 [renormalize]: 4.60015e-07 [remove_dup_value]: 4.436e-05 [tuple_transform]: 7.683e-05, [1] [Cycle 1]: 7.062e-05, [4] [d_1]: 4.113e-05 [none_parameter_eliminate]: 1.90001e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 7.05e-06 [partial_unused_args_eliminate]: 2.32001e-06 [add_recomputation]: 5.513e-05 [cse_after_recomputation]: 2.569e-05, [1] [Cycle 1]: 2.002e-05, [1] [cse]: 1.427e-05 [environ_conv]: 1.168e-05 [swap_dp_allreduce_reducescatter]: 5.66e-06 [bias_add_comm_swap]: 2.81e-06 [label_micro_interleaved_index]: 5.57001e-06 [label_fine_grained_interleaved_index]: 2.65997e-06 [merge_cast_opt]: 1.55001e-06 [slice_recompute_activation]: 2.13002e-06 [micro_interleaved_order_control]: 2.12001e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.20999e-06 [full_micro_interleaved_order_control]: 2.20002e-06 [reorder_send_recv_between_fp_bp]: 3.08e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 1.63002e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.10999e-06 [overlap_opt_shard_in_pipeline]: 1.22999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.82999e-06 [control_data_broadcast_order]: 1.337e-05 [grouped_pairwise_exchange_alltoall]: 1.44e-06 [offloading_packed_experts]: 4.21001e-06 [overlap_recompute_and_grad_model_parallel]: 5.50001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.31002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.43e-06 [overlap_grad_ring_attention]: 4.26001e-06 [overlap_grad_flash_sp]: 2.215e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.22001e-06 [split_layernorm_comm]: 1.55001e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 8.698e-05, [1] [Cycle 1]: 8.246e-05, [6] [build]: 1.294e-05 [elim_shapecalc]: 9.79999e-06 [elim_not_effective]: 1.25e-05 [opt_reshape]: 6.94999e-06 [fold_const_symbol]: 9.22999e-06 [renormalize]: 1.39989e-07 [detach_backward]: 2.13002e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 1.81e-05 [get_jit_bprop_graph]: 1.98002e-06 [rewriter_after_jit_bprop_graph]: 7.3e-06 [opt_after_jit_grad]: 0.00055828 [validate]: 9.26e-05 [backend_pass]: 9.50007e-07 [task_emit]: 0.0100426 [execute]: 1.014e-05 Sums bootstrap : 0.000903s : 1.84% type_inference : 0.031600s : 64.53% event_method : 0.000026s : 0.05% auto_monad : 0.000102s : 0.21% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000048s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000012s : 0.02% optimize.rewriter_before_opt_a : 0.000296s : 0.60% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000100s : 0.20% optimize.opt_a.loop_unroll : 0.000044s : 0.09% optimize.opt_a.a_1 : 0.000779s : 1.59% optimize.opt_a.with_stream_mark : 0.000068s : 0.14% optimize.opt_a.recompute_prepare : 0.000017s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000143s : 0.29% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.03% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000012s : 0.02% optimize.opt_a.merge_send_recv : 0.000017s : 0.03% optimize.opt_a.auto_parallel : 0.000019s : 0.04% optimize.opt_a.parallel : 0.000028s : 0.06% optimize.opt_a.flash_sp : 0.000015s : 0.03% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.03% optimize.opt_a.virtual_dataset : 0.000012s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.02% optimize.opt_a.virtual_output : 0.000012s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000021s : 0.04% optimize.opt_a.a_after_grad : 0.000018s : 0.04% optimize.opt_a.renormalize : 0.001498s : 3.06% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.06% optimize.opt_a.cse : 0.000063s : 0.13% optimize.opt_a.a_3 : 0.000100s : 0.20% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000021s : 0.04% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000815s : 1.66% optimize.opt_b.b_1 : 0.000117s : 0.24% optimize.opt_b.b_2 : 0.000007s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000032s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000033s : 0.07% optimize.loop_unroll : 0.000483s : 0.99% optimize.opt_after_cconv.c_1 : 0.000025s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000025s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000044s : 0.09% optimize.tuple_transform.d_1 : 0.000041s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000055s : 0.11% optimize.cse_after_recomputation.cse : 0.000014s : 0.03% optimize.environ_conv : 0.000012s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000002s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000022s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000013s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000558s : 1.14% validate : 0.000093s : 0.19% backend_pass : 0.000001s : 0.00% task_emit : 0.010043s : 20.51% execute : 0.000010s : 0.02% Time group info: ------[substitution.] 0.000236 26 0.86% : 0.000002s : 2: substitution.elim_not_effective 0.65% : 0.000002s : 2: substitution.fold_const_symbol 2.64% : 0.000006s : 3: substitution.graph_param_transform 80.46% : 0.000190s : 6: substitution.inline 1.90% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.31% : 0.000005s : 4: substitution.remove_not_recompute_node 2.23% : 0.000005s : 2: substitution.replace_old_param 3.23% : 0.000008s : 1: substitution.switch_simplify 5.73% : 0.000014s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.031497 2 95.41% : 0.030051s : 1: type_inference.infer 4.59% : 0.001445s : 1: type_inference.specialize ------[replace.] 0.000099 9 54.82% : 0.000054s : 6: replace.inline 26.25% : 0.000026s : 1: replace.switch_simplify 18.94% : 0.000019s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000206 9 90.63% : 0.000186s : 6: match.inline 3.31% : 0.000007s : 1: match.switch_simplify 6.05% : 0.000012s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000202 1092 0.94% : 0.000002s : 12: predicate.accumulaten_eliminater 0.70% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 6: predicate.addn_check_dump 1.08% : 0.000002s : 12: predicate.addn_zero_filter 0.86% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.38% : 0.000005s : 18: predicate.arithmetic_simplify 0.92% : 0.000002s : 12: predicate.cast_eliminate 0.53% : 0.000001s : 6: predicate.check_bprop_eliminate 0.47% : 0.000001s : 6: predicate.compare_switch_simplify 0.14% : 0.000000s : 3: predicate.const_output_eliminate 0.43% : 0.000001s : 6: predicate.depend_value_elim 1.04% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.21% : 0.000002s : 12: predicate.dict_get_item_eliminator 1.00% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 3: predicate.elim_not_effective 0.42% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 15: predicate.environ_add_const_eliminate 1.19% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 15: predicate.environ_get_depend_swap 1.54% : 0.000003s : 21: predicate.environ_get_eliminate 1.11% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.59% : 0.000005s : 20: predicate.float_depend_g_call 0.67% : 0.000001s : 6: predicate.float_environ_get_switch 0.68% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 3: predicate.fold_const_symbol 0.59% : 0.000001s : 6: predicate.get_grad_eliminate 0.15% : 0.000000s : 3: predicate.graph_param_transform 0.47% : 0.000001s : 6: predicate.incorporate_call 0.39% : 0.000001s : 6: predicate.incorporate_call_switch 6.00% : 0.000012s : 50: predicate.inline 0.53% : 0.000001s : 6: predicate.inline_without_move 0.29% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 6: predicate.less_batch_normalization 1.62% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.27% : 0.000005s : 32: predicate.load_eliminater 1.12% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.79% : 0.000006s : 37: predicate.loop_unroll_before_grad 1.90% : 0.000004s : 18: predicate.make_slice_get_slice_eliminator 0.48% : 0.000001s : 6: predicate.merge_addn 0.52% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.66% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 12: predicate.minmaximum_grad 1.67% : 0.000003s : 3: predicate.mutable_eliminate 0.30% : 0.000001s : 3: predicate.opt_reshape 0.32% : 0.000001s : 3: predicate.parallel_virtual_node 2.20% : 0.000004s : 20: predicate.partial_defer_inline 1.26% : 0.000003s : 17: predicate.partial_eliminate 1.11% : 0.000002s : 12: predicate.print_const_string_wrapper 0.68% : 0.000001s : 6: predicate.reduce_all_const_elim 1.31% : 0.000003s : 12: predicate.reduce_eliminate 2.55% : 0.000005s : 32: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000001s : 6: predicate.remove_not_recompute_node 1.09% : 0.000002s : 20: predicate.replace_applicator 0.59% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000001s : 3: predicate.reset_defer_inline 1.07% : 0.000002s : 12: predicate.reshape_eliminate 0.64% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 3: predicate.row_tensor_eliminate 0.85% : 0.000002s : 6: predicate.same_eliminate 0.35% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 6: predicate.shard_identity_eliminate 0.61% : 0.000001s : 6: predicate.special_op_eliminate 0.58% : 0.000001s : 6: predicate.specialize_transform 1.09% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.63% : 0.000003s : 20: predicate.switch_defer_inline 2.01% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.84% : 0.000012s : 68: predicate.switch_simplify 1.07% : 0.000002s : 12: predicate.tile_eliminate 0.88% : 0.000002s : 12: predicate.transpose_eliminate 1.39% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.45% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.62% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000007s : 26: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.08% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.59% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.27% : 0.000005s : 32: predicate.updatestate_pure_node_eliminater 2.85% : 0.000006s : 38: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 3: predicate.value_based_eliminate 0.64% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 6: predicate.virtual_output_eliminate 0.26% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001532 16 63.79% : 0.000977s : 8: func_graph_cloner_run.FuncGraphClonerGraph 36.21% : 0.000555s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.069412 196 0.01% : 0.000004s : 1: ForceFp32Comm 7.12% : 0.004945s : 1: add_attr 7.07% : 0.004907s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000059s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.16% : 0.000108s : 1: auto_monad 0.03% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.44% : 0.001003s : 1: bootstrap 0.05% : 0.000036s : 1: cconv 0.01% : 0.000005s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.01% : 0.000005s : 1: convert_after_rewriter 0.04% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000015s : 1: environ_conv 0.05% : 0.000032s : 1: event_method 0.03% : 0.000017s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000007s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.71% : 0.000493s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.19% : 0.000828s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000021s : 1: opt.transform.mutable_eliminate 1.79% : 0.001244s : 78: opt.transform.opt_a 0.03% : 0.000024s : 1: opt.transform.opt_after_cconv 0.04% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.13% : 0.000092s : 28: opt.transform.opt_b 0.07% : 0.000046s : 2: opt.transform.opt_trans_graph 0.05% : 0.000035s : 4: opt.transform.symbol_engine_opt 5.39% : 0.003744s : 1: opt_a 0.20% : 0.000141s : 1: opt_after_cconv 0.82% : 0.000569s : 1: opt_after_jit_grad 0.32% : 0.000221s : 1: opt_b 9.32% : 0.006471s : 1: optimize 0.03% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000027s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000003s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.08% : 0.000053s : 1: pre_auto_parallel 0.02% : 0.000016s : 1: py_interpret_to_execute 0.02% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000049s : 1: remove_dup_value 1.25% : 0.000868s : 1: renormalize.infer 0.89% : 0.000616s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000025s : 1: rewriter_after_opt_a 0.44% : 0.000304s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000090s : 1: symbol_engine_optimizer 14.50% : 0.010064s : 1: task_emit 0.12% : 0.000081s : 1: tuple_transform 45.56% : 0.031626s : 1: type_inference 0.20% : 0.000141s : 1: validate TotalTime = 0.0575, [24] [bootstrap]: 0.00055887 [type_inference]: 0.034573 [event_method]: 0.00011985 [auto_monad]: 0.0001957 [graph_reusing]: 1.334e-05 [inline]: 2.89001e-06 [add_attr]: 0.00422742, [1] [add_attr_with_inline]: 0.00421284, [1] [Cycle 1]: 9.823e-05, [2] [tag_attr]: 3.981e-05 [meta_addattr_fg_expand]: 8.43999e-06 [parallel-infer-symbol]: 3.71999e-06 [pre_auto_parallel]: 5.514e-05 [insert-virtual-dataset]: 2.57001e-06 [parallel-infer-symbol-second]: 6.80011e-07 [dataset_repeat_opt]: 1.97999e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.00772218, [53] [py_interpret_to_execute]: 1.134e-05 [rewriter_before_opt_a]: 0.00029155 [opt_a]: 0.00465531, [2] [Cycle 1]: 0.00379348, [45] [expand_dump_flag]: 4.07998e-06 [switch_simplify]: 0.00010281 [loop_unroll]: 4.005e-05 [a_1]: 0.00090016 [with_stream_mark]: 2.696e-05 [recompute_prepare]: 1.439e-05 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 4.62e-06 [updatestate_loads_eliminate]: 4.18001e-06 [parameter_eliminate]: 2.76e-06 [a_2]: 0.00010891 [accelerated_algorithm]: 9.41e-06 [shard]: 2.07999e-06 [meta_shard_fg_expand]: 3.21001e-06 [shard_inline]: 8.43001e-06 [merge_send_recv]: 1.05e-05 [auto_parallel]: 1.13e-05 [parallel]: 1.893e-05 [flash_sp]: 1.34e-05 [merge_comm]: 5.05001e-06 [allreduce_fusion]: 4.19002e-06 [matmul_add_comm_reduction]: 1.122e-05 [allreduce_slice_to_reducescatter]: 6.29982e-07 [virtual_shard_identity]: 1.25e-05 [virtual_dataset]: 7.66001e-06 [get_grad_eliminate_]: 6.96999e-06 [virtual_output]: 7.19001e-06 [merge_forward]: 5.10999e-06 [cell_reuse_recompute_pass]: 1.65001e-06 [offload_activation]: 1.238e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.877e-05 [merge_recompute_call_nodes]: 2.19001e-06 [before_grad]: 1.308e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94e-06 [meta_fg_expand]: 4.35e-06 [flash_sp_send_recv_attached]: 2.54999e-06 [receive_attached]: 2.21e-06 [after_resolve]: 1.3e-05 [a_after_grad]: 1.211e-05 [renormalize]: 0.00188924 [add_forward_monad_depend]: 1.319e-05 [auto_monad_grad]: 3.2e-06 [auto_monad_eliminator]: 2.58e-05 [cse]: 4.767e-05 [a_3]: 6.704e-05 [Cycle 2]: 0.00084724, [45] [expand_dump_flag]: 2.33998e-06 [switch_simplify]: 1.132e-05 [loop_unroll]: 1.062e-05 [a_1]: 0.00018229 [with_stream_mark]: 2.3e-05 [recompute_prepare]: 8.54e-06 [updatestate_depend_eliminate]: 7.14001e-06 [updatestate_assign_eliminate]: 4.52e-06 [updatestate_loads_eliminate]: 4.37e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 9.633e-05 [accelerated_algorithm]: 8.87e-06 [shard]: 2.24001e-06 [meta_shard_fg_expand]: 2.86999e-06 [shard_inline]: 7e-06 [merge_send_recv]: 1.063e-05 [auto_parallel]: 1.18e-05 [parallel]: 9.77001e-06 [flash_sp]: 4.45999e-06 [merge_comm]: 4.15e-06 [allreduce_fusion]: 4.74998e-06 [matmul_add_comm_reduction]: 1.053e-05 [allreduce_slice_to_reducescatter]: 9.70002e-07 [virtual_shard_identity]: 8.2e-06 [virtual_dataset]: 7.38e-06 [get_grad_eliminate_]: 7.3e-06 [virtual_output]: 7.16999e-06 [merge_forward]: 5.71e-06 [cell_reuse_recompute_pass]: 2.96999e-06 [offload_activation]: 1.141e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.719e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.272e-05 [set_forward_comm_id_for_comm_node_pass]: 5.40001e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 1.52999e-06 [receive_attached]: 1.94e-06 [after_resolve]: 1.234e-05 [a_after_grad]: 1.02e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.41998e-06 [auto_monad_grad]: 1.74e-06 [auto_monad_eliminator]: 1.082e-05 [cse]: 2.672e-05 [a_3]: 4.076e-05 [py_interpret_to_execute_after_opt_a]: 1.082e-05 [slice_cell_reuse_recomputed_activation]: 2.16e-06 [rewriter_after_opt_a]: 2.851e-05 [convert_after_rewriter]: 1.24e-06 [order_py_execute_after_rewriter]: 1.22e-06 [mutable_eliminate]: 0.00090717 [opt_b]: 0.00030652, [1] [Cycle 1]: 0.00029848, [7] [b_1]: 0.00018736 [b_2]: 1.05e-05 [updatestate_depend_eliminate]: 1.171e-05 [updatestate_assign_eliminate]: 3.99002e-06 [updatestate_loads_eliminate]: 4.52998e-06 [renormalize]: 8.59989e-07 [cse]: 4.18e-05 [optimize_parallel_all_gather_comm]: 2.207e-05 [overlap_param_gather]: 2.09e-06 [cconv]: 3.595e-05 [loop_unroll]: 0.00053457 [opt_after_cconv]: 0.00012455, [1] [Cycle 1]: 0.00011708, [7] [c_1]: 3.156e-05 [parameter_eliminate]: 5.56e-06 [updatestate_depend_eliminate]: 7.93001e-06 [updatestate_assign_eliminate]: 3.07002e-06 [updatestate_loads_eliminate]: 2.93003e-06 [cse]: 3.083e-05 [renormalize]: 8.90024e-07 [remove_dup_value]: 5.06e-05 [tuple_transform]: 0.00010339, [1] [Cycle 1]: 9.845e-05, [4] [d_1]: 6.682e-05 [none_parameter_eliminate]: 2.58e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 8.08001e-06 [partial_unused_args_eliminate]: 2.22999e-06 [add_recomputation]: 6.071e-05 [cse_after_recomputation]: 2.832e-05, [1] [Cycle 1]: 2.385e-05, [1] [cse]: 1.758e-05 [environ_conv]: 1.17e-05 [swap_dp_allreduce_reducescatter]: 6.44001e-06 [bias_add_comm_swap]: 3.2e-06 [label_micro_interleaved_index]: 6.34999e-06 [label_fine_grained_interleaved_index]: 2.59001e-06 [merge_cast_opt]: 2.07999e-06 [slice_recompute_activation]: 1.96e-06 [micro_interleaved_order_control]: 2.91999e-06 [assign_add_opt]: 1.29998e-06 [ForceFp32Comm]: 8.10018e-07 [remove_cast_before_assign_add]: 1.34e-06 [full_micro_interleaved_order_control]: 2.64001e-06 [reorder_send_recv_between_fp_bp]: 2.69999e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 1.05001e-06 [interleave_split_concat_branches]: 1.52001e-06 [interleave_parallel_branches]: 1.08001e-06 [overlap_opt_shard_in_pipeline]: 1.29e-06 [overlap_opt_shard_grad_in_pipeline]: 1.97999e-06 [control_data_broadcast_order]: 2.089e-05 [grouped_pairwise_exchange_alltoall]: 1.86e-06 [offloading_packed_experts]: 5.79e-06 [overlap_recompute_and_grad_model_parallel]: 5.66998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.47999e-06 [overlap_recompute_comm]: 2.29001e-06 [overlap_grad_ring_attention]: 5.37999e-06 [overlap_grad_flash_sp]: 2.439e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.26998e-06 [split_layernorm_comm]: 1.67001e-06 [handle_group_info]: 1.20999e-06 [symbol_engine_optimizer]: 0.00010199, [1] [Cycle 1]: 9.588e-05, [6] [build]: 1.416e-05 [elim_shapecalc]: 1.308e-05 [elim_not_effective]: 1.602e-05 [opt_reshape]: 8.92999e-06 [fold_const_symbol]: 1.26e-05 [renormalize]: 2.00002e-07 [detach_backward]: 2.44999e-06 [pipeline_parallel_scheduler]: 1.50999e-06 [auto_monad_reorder]: 2.278e-05 [get_jit_bprop_graph]: 2.54999e-06 [rewriter_after_jit_bprop_graph]: 7.4e-06 [opt_after_jit_grad]: 0.00066246 [validate]: 5.576e-05 [backend_pass]: 9.09989e-07 [task_emit]: 0.00896405 [execute]: 1.077e-05 Sums bootstrap : 0.000559s : 1.07% type_inference : 0.034573s : 66.46% event_method : 0.000120s : 0.23% auto_monad : 0.000196s : 0.38% graph_reusing : 0.000013s : 0.03% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000040s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000055s : 0.11% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000011s : 0.02% optimize.rewriter_before_opt_a : 0.000292s : 0.56% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000114s : 0.22% optimize.opt_a.loop_unroll : 0.000051s : 0.10% optimize.opt_a.a_1 : 0.001082s : 2.08% optimize.opt_a.with_stream_mark : 0.000050s : 0.10% optimize.opt_a.recompute_prepare : 0.000023s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000013s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000205s : 0.39% optimize.opt_a.accelerated_algorithm : 0.000018s : 0.04% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.03% optimize.opt_a.merge_send_recv : 0.000021s : 0.04% optimize.opt_a.auto_parallel : 0.000023s : 0.04% optimize.opt_a.parallel : 0.000029s : 0.06% optimize.opt_a.flash_sp : 0.000018s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.04% optimize.opt_a.virtual_dataset : 0.000015s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.03% optimize.opt_a.virtual_output : 0.000014s : 0.03% optimize.opt_a.merge_forward : 0.000011s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.05% optimize.opt_a.a_after_grad : 0.000022s : 0.04% optimize.opt_a.renormalize : 0.001889s : 3.63% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000037s : 0.07% optimize.opt_a.cse : 0.000074s : 0.14% optimize.opt_a.a_3 : 0.000108s : 0.21% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000029s : 0.05% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000907s : 1.74% optimize.opt_b.b_1 : 0.000187s : 0.36% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000012s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000042s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000036s : 0.07% optimize.loop_unroll : 0.000535s : 1.03% optimize.opt_after_cconv.c_1 : 0.000032s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000031s : 0.06% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000051s : 0.10% optimize.tuple_transform.d_1 : 0.000067s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000061s : 0.12% optimize.cse_after_recomputation.cse : 0.000018s : 0.03% optimize.environ_conv : 0.000012s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000021s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.05% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000014s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.04% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000662s : 1.27% validate : 0.000056s : 0.11% backend_pass : 0.000001s : 0.00% task_emit : 0.008964s : 17.23% execute : 0.000011s : 0.02% Time group info: ------[substitution.] 0.000387 62 0.58% : 0.000002s : 3: substitution.elim_not_effective 2.42% : 0.000009s : 3: substitution.float_tuple_getitem_switch 0.47% : 0.000002s : 3: substitution.fold_const_symbol 1.69% : 0.000007s : 4: substitution.graph_param_transform 60.21% : 0.000233s : 8: substitution.inline 1.56% : 0.000006s : 6: substitution.j_node_and_user_rematch 1.87% : 0.000007s : 2: substitution.minmaximum_grad 1.86% : 0.000007s : 6: substitution.remove_not_recompute_node 1.61% : 0.000006s : 2: substitution.replace_old_param 2.39% : 0.000009s : 1: substitution.switch_simplify 5.37% : 0.000021s : 4: substitution.tuple_list_convert_item_index_to_positive 2.25% : 0.000009s : 4: substitution.tuple_list_get_item_const_eliminator 3.45% : 0.000013s : 4: substitution.tuple_list_get_item_depend_reorder 10.88% : 0.000042s : 8: substitution.tuple_list_get_item_eliminator 3.40% : 0.000013s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.034474 2 93.75% : 0.032321s : 1: type_inference.infer 6.25% : 0.002153s : 1: type_inference.specialize ------[replace.] 0.000116 11 58.46% : 0.000068s : 8: replace.inline 23.73% : 0.000028s : 1: replace.switch_simplify 17.80% : 0.000021s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000241 11 94.83% : 0.000228s : 8: match.inline 3.42% : 0.000008s : 1: match.switch_simplify 1.75% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000272 1438 1.23% : 0.000003s : 16: predicate.accumulaten_eliminater 0.95% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.42% : 0.000001s : 8: predicate.addn_check_dump 0.93% : 0.000003s : 16: predicate.addn_zero_filter 0.80% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.37% : 0.000006s : 24: predicate.arithmetic_simplify 1.06% : 0.000003s : 16: predicate.cast_eliminate 0.50% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.14% : 0.000000s : 4: predicate.const_output_eliminate 0.51% : 0.000001s : 8: predicate.depend_value_elim 0.93% : 0.000003s : 16: predicate.dict_get_item_const_eliminator 1.41% : 0.000004s : 16: predicate.dict_get_item_eliminator 1.03% : 0.000003s : 16: predicate.dict_set_item_eliminator 1.06% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 4: predicate.elim_not_effective 0.40% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_depend_swap 1.73% : 0.000005s : 28: predicate.environ_get_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.45% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.35% : 0.000006s : 26: predicate.float_depend_g_call 0.65% : 0.000002s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000002s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.45% : 0.000001s : 8: predicate.incorporate_call 0.36% : 0.000001s : 8: predicate.incorporate_call_switch 5.67% : 0.000015s : 66: predicate.inline 0.69% : 0.000002s : 8: predicate.inline_without_move 0.22% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.07% : 0.000003s : 8: predicate.less_batch_normalization 1.66% : 0.000005s : 26: predicate.list_to_tuple_eliminator_ 2.39% : 0.000007s : 42: predicate.load_eliminater 0.90% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.56% : 0.000007s : 46: predicate.loop_unroll_before_grad 1.81% : 0.000005s : 24: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.44% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.45% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.97% : 0.000003s : 16: predicate.minmaximum_grad 1.74% : 0.000005s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 2.08% : 0.000006s : 26: predicate.partial_defer_inline 1.21% : 0.000003s : 22: predicate.partial_eliminate 0.96% : 0.000003s : 16: predicate.print_const_string_wrapper 0.61% : 0.000002s : 8: predicate.reduce_all_const_elim 1.28% : 0.000003s : 16: predicate.reduce_eliminate 2.47% : 0.000007s : 42: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000001s : 8: predicate.remove_not_recompute_node 1.09% : 0.000003s : 26: predicate.replace_applicator 0.46% : 0.000001s : 8: predicate.replace_old_param 0.15% : 0.000000s : 4: predicate.reset_defer_inline 1.04% : 0.000003s : 16: predicate.reshape_eliminate 0.56% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 4: predicate.row_tensor_eliminate 0.78% : 0.000002s : 8: predicate.same_eliminate 0.36% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 8: predicate.shard_identity_eliminate 0.59% : 0.000002s : 8: predicate.special_op_eliminate 0.58% : 0.000002s : 8: predicate.specialize_transform 0.97% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.25% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.61% : 0.000004s : 26: predicate.switch_defer_inline 1.96% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.96% : 0.000016s : 86: predicate.switch_simplify 1.01% : 0.000003s : 16: predicate.tile_eliminate 1.07% : 0.000003s : 16: predicate.transpose_eliminate 1.66% : 0.000005s : 24: predicate.tuple_list_convert_item_index_to_positive 1.75% : 0.000005s : 24: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.73% : 0.000010s : 34: predicate.tuple_list_get_item_eliminator 1.57% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.07% : 0.000006s : 32: predicate.tuple_list_set_item_eliminator 1.71% : 0.000005s : 26: predicate.tuple_to_list_eliminator_ 2.12% : 0.000006s : 42: predicate.updatestate_pure_node_eliminater 2.84% : 0.000008s : 50: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.71% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.56% : 0.000002s : 8: predicate.virtual_output_eliminate 0.21% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.50% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001780 23 56.19% : 0.001000s : 11: func_graph_cloner_run.FuncGraphClonerGraph 43.81% : 0.000780s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.073234 196 0.01% : 0.000005s : 1: ForceFp32Comm 5.78% : 0.004236s : 1: add_attr 5.76% : 0.004217s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.09% : 0.000065s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.28% : 0.000206s : 1: auto_monad 0.04% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.81% : 0.000593s : 1: bootstrap 0.05% : 0.000039s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000025s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.04% : 0.000031s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000016s : 1: environ_conv 0.18% : 0.000131s : 1: event_method 0.02% : 0.000018s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.02% : 0.000018s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000005s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.74% : 0.000544s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.26% : 0.000923s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.03% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000029s : 1: opt.transform.mutable_eliminate 2.30% : 0.001682s : 78: opt.transform.opt_a 0.04% : 0.000030s : 1: opt.transform.opt_after_cconv 0.05% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.22% : 0.000162s : 28: opt.transform.opt_b 0.10% : 0.000072s : 2: opt.transform.opt_trans_graph 0.06% : 0.000047s : 4: opt.transform.symbol_engine_opt 6.36% : 0.004659s : 1: opt_a 0.17% : 0.000128s : 1: opt_after_cconv 0.92% : 0.000675s : 1: opt_after_jit_grad 0.42% : 0.000311s : 1: opt_b 10.55% : 0.007729s : 1: optimize 0.04% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000009s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000060s : 1: pre_auto_parallel 0.02% : 0.000015s : 1: py_interpret_to_execute 0.02% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000055s : 1: remove_dup_value 1.40% : 0.001023s : 1: renormalize.infer 1.17% : 0.000853s : 1: renormalize.specialize 0.01% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000032s : 1: rewriter_after_opt_a 0.41% : 0.000299s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000105s : 1: symbol_engine_optimizer 12.27% : 0.008989s : 1: task_emit 0.15% : 0.000106s : 1: tuple_transform 47.25% : 0.034604s : 1: type_inference 0.14% : 0.000104s : 1: validate TotalTime = 0.0605116, [24] [bootstrap]: 0.00054204 [type_inference]: 0.038327 [event_method]: 0.00012176 [auto_monad]: 0.00019425 [graph_reusing]: 1.264e-05 [inline]: 2.59999e-06 [add_attr]: 0.00407371, [1] [add_attr_with_inline]: 0.00406138, [1] [Cycle 1]: 9.307e-05, [2] [tag_attr]: 3.897e-05 [meta_addattr_fg_expand]: 7.7e-06 [parallel-infer-symbol]: 4.22e-06 [pre_auto_parallel]: 5.312e-05 [insert-virtual-dataset]: 2.79001e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00754064, [53] [py_interpret_to_execute]: 9.52001e-06 [rewriter_before_opt_a]: 0.00028351 [opt_a]: 0.004561, [2] [Cycle 1]: 0.00371692, [45] [expand_dump_flag]: 3.72002e-06 [switch_simplify]: 9.796e-05 [loop_unroll]: 3.927e-05 [a_1]: 0.00084264 [with_stream_mark]: 2.701e-05 [recompute_prepare]: 1.322e-05 [updatestate_depend_eliminate]: 5.52001e-06 [updatestate_assign_eliminate]: 4.2e-06 [updatestate_loads_eliminate]: 3.58e-06 [parameter_eliminate]: 2.07001e-06 [a_2]: 0.00010379 [accelerated_algorithm]: 7.9e-06 [shard]: 2.31998e-06 [meta_shard_fg_expand]: 2.43e-06 [shard_inline]: 8.18001e-06 [merge_send_recv]: 1.06e-05 [auto_parallel]: 1.187e-05 [parallel]: 2.165e-05 [flash_sp]: 1.042e-05 [merge_comm]: 4.86002e-06 [allreduce_fusion]: 4.90999e-06 [matmul_add_comm_reduction]: 1.13e-05 [allreduce_slice_to_reducescatter]: 9.90025e-07 [virtual_shard_identity]: 1.075e-05 [virtual_dataset]: 7.63001e-06 [get_grad_eliminate_]: 7.13e-06 [virtual_output]: 7.43999e-06 [merge_forward]: 5.49e-06 [cell_reuse_recompute_pass]: 2.01e-06 [offload_activation]: 1.157e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.963e-05 [merge_recompute_call_nodes]: 1.67001e-06 [before_grad]: 1.374e-05 [set_forward_comm_id_for_comm_node_pass]: 6.34999e-06 [meta_fg_expand]: 4.44002e-06 [flash_sp_send_recv_attached]: 4.1e-06 [receive_attached]: 2.47001e-06 [after_resolve]: 1.685e-05 [a_after_grad]: 1.076e-05 [renormalize]: 0.00185888 [add_forward_monad_depend]: 1.017e-05 [auto_monad_grad]: 2.96001e-06 [auto_monad_eliminator]: 2.476e-05 [cse]: 4.599e-05 [a_3]: 6.626e-05 [Cycle 2]: 0.00082824, [45] [expand_dump_flag]: 1.92999e-06 [switch_simplify]: 9.42999e-06 [loop_unroll]: 7.42002e-06 [a_1]: 0.00017551 [with_stream_mark]: 2.197e-05 [recompute_prepare]: 8.38001e-06 [updatestate_depend_eliminate]: 4.55999e-06 [updatestate_assign_eliminate]: 3.98001e-06 [updatestate_loads_eliminate]: 3.91999e-06 [parameter_eliminate]: 2.39001e-06 [a_2]: 9.17e-05 [accelerated_algorithm]: 8.48999e-06 [shard]: 2.08002e-06 [meta_shard_fg_expand]: 2.29001e-06 [shard_inline]: 6.86001e-06 [merge_send_recv]: 9.86998e-06 [auto_parallel]: 1.141e-05 [parallel]: 8.15e-06 [flash_sp]: 3.86999e-06 [merge_comm]: 4.67998e-06 [allreduce_fusion]: 4.08999e-06 [matmul_add_comm_reduction]: 1.066e-05 [allreduce_slice_to_reducescatter]: 5.39992e-07 [virtual_shard_identity]: 9.60001e-06 [virtual_dataset]: 7.00002e-06 [get_grad_eliminate_]: 6.47001e-06 [virtual_output]: 6.48e-06 [merge_forward]: 5.42999e-06 [cell_reuse_recompute_pass]: 2.93998e-06 [offload_activation]: 1.109e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.585e-05 [merge_recompute_call_nodes]: 1.67999e-06 [before_grad]: 1.196e-05 [set_forward_comm_id_for_comm_node_pass]: 7.53e-06 [meta_fg_expand]: 3.21001e-06 [flash_sp_send_recv_attached]: 1.86e-06 [receive_attached]: 2.11e-06 [after_resolve]: 1.209e-05 [a_after_grad]: 1.135e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.88e-06 [auto_monad_grad]: 2.20002e-06 [auto_monad_eliminator]: 1.29e-05 [cse]: 2.922e-05 [a_3]: 4.266e-05 [py_interpret_to_execute_after_opt_a]: 9.31998e-06 [slice_cell_reuse_recomputed_activation]: 1.94e-06 [rewriter_after_opt_a]: 2.99e-05 [convert_after_rewriter]: 1.24998e-06 [order_py_execute_after_rewriter]: 1.10999e-06 [mutable_eliminate]: 0.00080405 [opt_b]: 0.00032212, [1] [Cycle 1]: 0.00031238, [7] [b_1]: 0.00019656 [b_2]: 1.03e-05 [updatestate_depend_eliminate]: 1.093e-05 [updatestate_assign_eliminate]: 4.12998e-06 [updatestate_loads_eliminate]: 4.12e-06 [renormalize]: 9.89996e-07 [cse]: 4.377e-05 [optimize_parallel_all_gather_comm]: 2.303e-05 [overlap_param_gather]: 1.97999e-06 [cconv]: 3.805e-05 [loop_unroll]: 0.00056457 [opt_after_cconv]: 0.00013162, [1] [Cycle 1]: 0.00012363, [7] [c_1]: 3.307e-05 [parameter_eliminate]: 5.25999e-06 [updatestate_depend_eliminate]: 8.67e-06 [updatestate_assign_eliminate]: 3.3e-06 [updatestate_loads_eliminate]: 3.39001e-06 [cse]: 3.421e-05 [renormalize]: 7.80012e-07 [remove_dup_value]: 1.985e-05 [tuple_transform]: 0.00010554, [1] [Cycle 1]: 0.00010053, [4] [d_1]: 7.267e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 7.55e-06 [partial_unused_args_eliminate]: 2.49999e-06 [add_recomputation]: 6.418e-05 [cse_after_recomputation]: 2.883e-05, [1] [Cycle 1]: 2.388e-05, [1] [cse]: 1.801e-05 [environ_conv]: 1.085e-05 [swap_dp_allreduce_reducescatter]: 6.48e-06 [bias_add_comm_swap]: 3.52002e-06 [label_micro_interleaved_index]: 5.26998e-06 [label_fine_grained_interleaved_index]: 2.71999e-06 [merge_cast_opt]: 1.60999e-06 [slice_recompute_activation]: 2.14999e-06 [micro_interleaved_order_control]: 2.27001e-06 [assign_add_opt]: 1.38002e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 1.07e-06 [full_micro_interleaved_order_control]: 2.37999e-06 [reorder_send_recv_between_fp_bp]: 2.64001e-06 [comm_op_add_attrs]: 1.32e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.55001e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.55001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02999e-06 [control_data_broadcast_order]: 2.002e-05 [grouped_pairwise_exchange_alltoall]: 1.91e-06 [offloading_packed_experts]: 5.35999e-06 [overlap_recompute_and_grad_model_parallel]: 5.76e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39998e-06 [overlap_recompute_comm]: 2.12999e-06 [overlap_grad_ring_attention]: 4.65001e-06 [overlap_grad_flash_sp]: 2.716e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.39001e-06 [split_layernorm_comm]: 1.90001e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 0.00010906, [1] [Cycle 1]: 0.00010245, [6] [build]: 1.552e-05 [elim_shapecalc]: 1.687e-05 [elim_not_effective]: 1.657e-05 [opt_reshape]: 8.02e-06 [fold_const_symbol]: 1.239e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.63e-06 [pipeline_parallel_scheduler]: 2.00002e-06 [auto_monad_reorder]: 2.456e-05 [get_jit_bprop_graph]: 2.09999e-06 [rewriter_after_jit_bprop_graph]: 7.51001e-06 [opt_after_jit_grad]: 0.00065209 [validate]: 5.684e-05 [backend_pass]: 1.03001e-06 [task_emit]: 0.00857085 [execute]: 9.56e-06 Sums bootstrap : 0.000542s : 0.98% type_inference : 0.038327s : 69.50% event_method : 0.000122s : 0.22% auto_monad : 0.000194s : 0.35% graph_reusing : 0.000013s : 0.02% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000039s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000053s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000010s : 0.02% optimize.rewriter_before_opt_a : 0.000284s : 0.51% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000107s : 0.19% optimize.opt_a.loop_unroll : 0.000047s : 0.08% optimize.opt_a.a_1 : 0.001018s : 1.85% optimize.opt_a.with_stream_mark : 0.000049s : 0.09% optimize.opt_a.recompute_prepare : 0.000022s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000195s : 0.35% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.03% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.03% optimize.opt_a.merge_send_recv : 0.000020s : 0.04% optimize.opt_a.auto_parallel : 0.000023s : 0.04% optimize.opt_a.parallel : 0.000030s : 0.05% optimize.opt_a.flash_sp : 0.000014s : 0.03% optimize.opt_a.merge_comm : 0.000010s : 0.02% optimize.opt_a.allreduce_fusion : 0.000009s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.04% optimize.opt_a.virtual_dataset : 0.000015s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.02% optimize.opt_a.virtual_output : 0.000014s : 0.03% optimize.opt_a.merge_forward : 0.000011s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000035s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000026s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000014s : 0.03% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000029s : 0.05% optimize.opt_a.a_after_grad : 0.000022s : 0.04% optimize.opt_a.renormalize : 0.001859s : 3.37% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.02% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.07% optimize.opt_a.cse : 0.000075s : 0.14% optimize.opt_a.a_3 : 0.000109s : 0.20% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000030s : 0.05% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000804s : 1.46% optimize.opt_b.b_1 : 0.000197s : 0.36% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000044s : 0.08% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000038s : 0.07% optimize.loop_unroll : 0.000565s : 1.02% optimize.opt_after_cconv.c_1 : 0.000033s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000034s : 0.06% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000020s : 0.04% optimize.tuple_transform.d_1 : 0.000073s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000064s : 0.12% optimize.cse_after_recomputation.cse : 0.000018s : 0.03% optimize.environ_conv : 0.000011s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000020s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000027s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000016s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000025s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.01% opt_after_jit_grad : 0.000652s : 1.18% validate : 0.000057s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.008571s : 15.54% execute : 0.000010s : 0.02% Time group info: ------[substitution.] 0.000379 62 0.74% : 0.000003s : 3: substitution.elim_not_effective 2.96% : 0.000011s : 3: substitution.float_tuple_getitem_switch 0.60% : 0.000002s : 3: substitution.fold_const_symbol 1.81% : 0.000007s : 4: substitution.graph_param_transform 58.03% : 0.000220s : 8: substitution.inline 1.50% : 0.000006s : 6: substitution.j_node_and_user_rematch 1.86% : 0.000007s : 2: substitution.minmaximum_grad 1.88% : 0.000007s : 6: substitution.remove_not_recompute_node 1.73% : 0.000007s : 2: substitution.replace_old_param 2.18% : 0.000008s : 1: substitution.switch_simplify 5.39% : 0.000020s : 4: substitution.tuple_list_convert_item_index_to_positive 2.23% : 0.000008s : 4: substitution.tuple_list_get_item_const_eliminator 3.92% : 0.000015s : 4: substitution.tuple_list_get_item_depend_reorder 11.54% : 0.000044s : 8: substitution.tuple_list_get_item_eliminator 3.62% : 0.000014s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.038217 2 93.60% : 0.035770s : 1: type_inference.infer 6.40% : 0.002447s : 1: type_inference.specialize ------[replace.] 0.000112 11 59.65% : 0.000067s : 8: replace.inline 23.64% : 0.000026s : 1: replace.switch_simplify 16.72% : 0.000019s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000228 11 94.70% : 0.000215s : 8: match.inline 3.13% : 0.000007s : 1: match.switch_simplify 2.17% : 0.000005s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000258 1438 1.08% : 0.000003s : 16: predicate.accumulaten_eliminater 1.23% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.45% : 0.000001s : 8: predicate.addn_check_dump 0.89% : 0.000002s : 16: predicate.addn_zero_filter 0.82% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.13% : 0.000005s : 24: predicate.arithmetic_simplify 0.96% : 0.000002s : 16: predicate.cast_eliminate 0.66% : 0.000002s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.46% : 0.000001s : 8: predicate.depend_value_elim 0.94% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.15% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 16: predicate.dict_set_item_eliminator 1.10% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.28% : 0.000001s : 4: predicate.elim_not_effective 0.75% : 0.000002s : 4: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.00% : 0.000003s : 20: predicate.environ_get_depend_swap 1.52% : 0.000004s : 28: predicate.environ_get_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.54% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.40% : 0.000006s : 26: predicate.float_depend_g_call 0.43% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.56% : 0.000001s : 8: predicate.get_grad_eliminate 0.16% : 0.000000s : 4: predicate.graph_param_transform 0.47% : 0.000001s : 8: predicate.incorporate_call 0.39% : 0.000001s : 8: predicate.incorporate_call_switch 5.96% : 0.000015s : 66: predicate.inline 0.68% : 0.000002s : 8: predicate.inline_without_move 0.24% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 8: predicate.less_batch_normalization 2.03% : 0.000005s : 26: predicate.list_to_tuple_eliminator_ 2.20% : 0.000006s : 42: predicate.load_eliminater 1.22% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.61% : 0.000007s : 46: predicate.loop_unroll_before_grad 1.83% : 0.000005s : 24: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.45% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 16: predicate.minmaximum_grad 1.92% : 0.000005s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 2.28% : 0.000006s : 26: predicate.partial_defer_inline 1.27% : 0.000003s : 22: predicate.partial_eliminate 0.98% : 0.000003s : 16: predicate.print_const_string_wrapper 0.53% : 0.000001s : 8: predicate.reduce_all_const_elim 1.30% : 0.000003s : 16: predicate.reduce_eliminate 2.30% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 8: predicate.remove_not_recompute_node 1.17% : 0.000003s : 26: predicate.replace_applicator 0.46% : 0.000001s : 8: predicate.replace_old_param 0.15% : 0.000000s : 4: predicate.reset_defer_inline 1.06% : 0.000003s : 16: predicate.reshape_eliminate 0.45% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 4: predicate.row_tensor_eliminate 0.89% : 0.000002s : 8: predicate.same_eliminate 0.34% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.97% : 0.000003s : 8: predicate.shard_identity_eliminate 0.72% : 0.000002s : 8: predicate.special_op_eliminate 0.55% : 0.000001s : 8: predicate.specialize_transform 1.01% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.25% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.53% : 0.000004s : 26: predicate.switch_defer_inline 1.97% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.49% : 0.000014s : 86: predicate.switch_simplify 0.94% : 0.000002s : 16: predicate.tile_eliminate 0.92% : 0.000002s : 16: predicate.transpose_eliminate 1.59% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.81% : 0.000005s : 24: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.77% : 0.000010s : 34: predicate.tuple_list_get_item_eliminator 1.63% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.11% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.52% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.14% : 0.000006s : 42: predicate.updatestate_pure_node_eliminater 2.72% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.29% : 0.000001s : 4: predicate.value_based_eliminate 0.71% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 8: predicate.virtual_output_eliminate 0.20% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002004 23 58.48% : 0.001172s : 11: func_graph_cloner_run.FuncGraphClonerGraph 41.52% : 0.000832s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.075879 196 0.00% : 0.000003s : 1: ForceFp32Comm 5.38% : 0.004082s : 1: add_attr 5.36% : 0.004066s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.09% : 0.000069s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.27% : 0.000205s : 1: auto_monad 0.04% : 0.000029s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.77% : 0.000584s : 1: bootstrap 0.05% : 0.000042s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000024s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.04% : 0.000033s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000014s : 1: environ_conv 0.18% : 0.000134s : 1: event_method 0.02% : 0.000016s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.02% : 0.000018s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000005s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.07% : 0.000050s : 1: interleave_parallel_branches 0.01% : 0.000005s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.76% : 0.000576s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.08% : 0.000821s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.03% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000031s : 1: opt.transform.mutable_eliminate 2.11% : 0.001599s : 78: opt.transform.opt_a 0.04% : 0.000032s : 1: opt.transform.opt_after_cconv 0.05% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.23% : 0.000172s : 28: opt.transform.opt_b 0.10% : 0.000078s : 2: opt.transform.opt_trans_graph 0.06% : 0.000049s : 4: opt.transform.symbol_engine_opt 6.02% : 0.004565s : 1: opt_a 0.18% : 0.000135s : 1: opt_after_cconv 0.88% : 0.000667s : 1: opt_after_jit_grad 0.43% : 0.000327s : 1: opt_b 9.95% : 0.007547s : 1: optimize 0.04% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000005s : 1: order_py_execute_after_rewriter 0.04% : 0.000031s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.08% : 0.000057s : 1: pre_auto_parallel 0.02% : 0.000013s : 1: py_interpret_to_execute 0.02% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000024s : 1: remove_dup_value 1.35% : 0.001026s : 1: renormalize.infer 1.08% : 0.000818s : 1: renormalize.specialize 0.01% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000035s : 1: rewriter_after_opt_a 0.38% : 0.000292s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000112s : 1: symbol_engine_optimizer 11.32% : 0.008592s : 1: task_emit 0.14% : 0.000109s : 1: tuple_transform 50.55% : 0.038359s : 1: type_inference 0.14% : 0.000106s : 1: validate TotalTime = 0.395593, [24] [bootstrap]: 0.00091376 [type_inference]: 0.146729 [event_method]: 9.306e-05 [auto_monad]: 0.00017513 [graph_reusing]: 1.25e-05 [inline]: 2.36998e-06 [add_attr]: 0.0772143, [1] [add_attr_with_inline]: 0.0771982, [1] [Cycle 1]: 9.527e-05, [2] [tag_attr]: 3.458e-05 [meta_addattr_fg_expand]: 7.95e-06 [parallel-infer-symbol]: 3.66999e-06 [pre_auto_parallel]: 5.236e-05 [insert-virtual-dataset]: 2.54001e-06 [parallel-infer-symbol-second]: 6.99976e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.84e-06 [optimize]: 0.0066328, [53] [py_interpret_to_execute]: 5.86e-06 [rewriter_before_opt_a]: 0.00027006 [opt_a]: 0.00409901, [2] [Cycle 1]: 0.00338008, [45] [expand_dump_flag]: 3.78001e-06 [switch_simplify]: 0.00010004 [loop_unroll]: 3.927e-05 [a_1]: 0.00078655 [with_stream_mark]: 2.029e-05 [recompute_prepare]: 9.67999e-06 [updatestate_depend_eliminate]: 4.80001e-06 [updatestate_assign_eliminate]: 4.43001e-06 [updatestate_loads_eliminate]: 3.81001e-06 [parameter_eliminate]: 1.91998e-06 [a_2]: 9.714e-05 [accelerated_algorithm]: 7.4e-06 [shard]: 2.06e-06 [meta_shard_fg_expand]: 2.22999e-06 [shard_inline]: 6.89999e-06 [merge_send_recv]: 9.66e-06 [auto_parallel]: 8.00999e-06 [parallel]: 4.449e-05 [flash_sp]: 9.42999e-06 [merge_comm]: 4.52e-06 [allreduce_fusion]: 4.45e-06 [matmul_add_comm_reduction]: 1.042e-05 [allreduce_slice_to_reducescatter]: 6.30011e-07 [virtual_shard_identity]: 9.11002e-06 [virtual_dataset]: 7.05e-06 [get_grad_eliminate_]: 6.59999e-06 [virtual_output]: 6.86999e-06 [merge_forward]: 4.61002e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 1.167e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.333e-05 [merge_recompute_call_nodes]: 1.91e-06 [before_grad]: 1.219e-05 [set_forward_comm_id_for_comm_node_pass]: 4.18001e-06 [meta_fg_expand]: 4.04002e-06 [flash_sp_send_recv_attached]: 2.40002e-06 [receive_attached]: 2.45002e-06 [after_resolve]: 1.052e-05 [a_after_grad]: 1.027e-05 [renormalize]: 0.00171298 [add_forward_monad_depend]: 6.39001e-06 [auto_monad_grad]: 2.44999e-06 [auto_monad_eliminator]: 2.039e-05 [cse]: 3.986e-05 [a_3]: 5.473e-05 [Cycle 2]: 0.00070708, [45] [expand_dump_flag]: 2.43e-06 [switch_simplify]: 8.35001e-06 [loop_unroll]: 6.94999e-06 [a_1]: 0.00015937 [with_stream_mark]: 1.498e-05 [recompute_prepare]: 7.43e-06 [updatestate_depend_eliminate]: 4.15999e-06 [updatestate_assign_eliminate]: 3.26999e-06 [updatestate_loads_eliminate]: 3.46001e-06 [parameter_eliminate]: 1.08001e-06 [a_2]: 8.543e-05 [accelerated_algorithm]: 6.68e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 1.97001e-06 [shard_inline]: 7.08e-06 [merge_send_recv]: 6.32001e-06 [auto_parallel]: 7.67002e-06 [parallel]: 6.21e-06 [flash_sp]: 3.61999e-06 [merge_comm]: 4e-06 [allreduce_fusion]: 4.14002e-06 [matmul_add_comm_reduction]: 7.3e-06 [allreduce_slice_to_reducescatter]: 4.30009e-07 [virtual_shard_identity]: 6.95002e-06 [virtual_dataset]: 6.23e-06 [get_grad_eliminate_]: 6.13998e-06 [virtual_output]: 6.02001e-06 [merge_forward]: 4.02e-06 [cell_reuse_recompute_pass]: 1.88002e-06 [offload_activation]: 8.89e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.213e-05 [merge_recompute_call_nodes]: 6.99976e-07 [before_grad]: 9.90002e-06 [set_forward_comm_id_for_comm_node_pass]: 4.60001e-06 [meta_fg_expand]: 2.82002e-06 [flash_sp_send_recv_attached]: 1.12e-06 [receive_attached]: 9.49978e-07 [after_resolve]: 9.69e-06 [a_after_grad]: 9.56e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.29003e-06 [auto_monad_grad]: 8.2e-07 [auto_monad_eliminator]: 8.49998e-06 [cse]: 1.959e-05 [a_3]: 4.008e-05 [py_interpret_to_execute_after_opt_a]: 5.98998e-06 [slice_cell_reuse_recomputed_activation]: 2.48e-06 [rewriter_after_opt_a]: 2.519e-05 [convert_after_rewriter]: 1.22999e-06 [order_py_execute_after_rewriter]: 1.09998e-06 [mutable_eliminate]: 0.00067239 [opt_b]: 0.00028685, [1] [Cycle 1]: 0.00027987, [7] [b_1]: 0.00018969 [b_2]: 9.76e-06 [updatestate_depend_eliminate]: 7.75e-06 [updatestate_assign_eliminate]: 3.19001e-06 [updatestate_loads_eliminate]: 3.23e-06 [renormalize]: 7.79983e-07 [cse]: 2.69e-05 [optimize_parallel_all_gather_comm]: 1.897e-05 [overlap_param_gather]: 2.05002e-06 [cconv]: 2.886e-05 [loop_unroll]: 0.00047821 [opt_after_cconv]: 0.00011012, [1] [Cycle 1]: 0.0001044, [7] [c_1]: 3.155e-05 [parameter_eliminate]: 2.72001e-06 [updatestate_depend_eliminate]: 6.12001e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.11999e-06 [cse]: 2.346e-05 [renormalize]: 8.50006e-07 [remove_dup_value]: 1.895e-05 [tuple_transform]: 9.577e-05, [1] [Cycle 1]: 9.114e-05, [4] [d_1]: 6.216e-05 [none_parameter_eliminate]: 1.66002e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 7.69002e-06 [partial_unused_args_eliminate]: 1.77999e-06 [add_recomputation]: 5.22e-05 [cse_after_recomputation]: 2.687e-05, [1] [Cycle 1]: 2.225e-05, [1] [cse]: 1.665e-05 [environ_conv]: 9.41e-06 [swap_dp_allreduce_reducescatter]: 6.26e-06 [bias_add_comm_swap]: 2.66e-06 [label_micro_interleaved_index]: 4.80001e-06 [label_fine_grained_interleaved_index]: 2.95998e-06 [merge_cast_opt]: 1.72001e-06 [slice_recompute_activation]: 2.29001e-06 [micro_interleaved_order_control]: 2.53e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 8.39995e-07 [remove_cast_before_assign_add]: 1.13001e-06 [full_micro_interleaved_order_control]: 2.67001e-06 [reorder_send_recv_between_fp_bp]: 3.09999e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.47001e-06 [interleave_parallel_branches]: 1.21002e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 1.99999e-06 [control_data_broadcast_order]: 1.443e-05 [grouped_pairwise_exchange_alltoall]: 1.54e-06 [offloading_packed_experts]: 4.72e-06 [overlap_recompute_and_grad_model_parallel]: 5.12e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.32001e-06 [overlap_grad_ring_attention]: 4.52e-06 [overlap_grad_flash_sp]: 2.247e-05 [begin_end_overlap_inline]: 7.30011e-07 [split_matmul_comm_elemetwise]: 2.01e-06 [split_layernorm_comm]: 2.29999e-06 [handle_group_info]: 1.14e-06 [symbol_engine_optimizer]: 8.647e-05, [1] [Cycle 1]: 8.191e-05, [6] [build]: 1.161e-05 [elim_shapecalc]: 1.057e-05 [elim_not_effective]: 1.392e-05 [opt_reshape]: 7.61999e-06 [fold_const_symbol]: 1.134e-05 [renormalize]: 1.50001e-07 [detach_backward]: 2.06e-06 [pipeline_parallel_scheduler]: 1.62001e-06 [auto_monad_reorder]: 2.175e-05 [get_jit_bprop_graph]: 2.21998e-06 [rewriter_after_jit_bprop_graph]: 4.48999e-06 [opt_after_jit_grad]: 0.00053602 [validate]: 5.201e-05 [backend_pass]: 9.5999e-07 [task_emit]: 0.162816 [execute]: 1.116e-05 Sums bootstrap : 0.000914s : 0.29% type_inference : 0.146729s : 46.25% event_method : 0.000093s : 0.03% auto_monad : 0.000175s : 0.06% graph_reusing : 0.000013s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000035s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000052s : 0.02% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000270s : 0.09% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000108s : 0.03% optimize.opt_a.loop_unroll : 0.000046s : 0.01% optimize.opt_a.a_1 : 0.000946s : 0.30% optimize.opt_a.with_stream_mark : 0.000035s : 0.01% optimize.opt_a.recompute_prepare : 0.000017s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000183s : 0.06% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.00% optimize.opt_a.merge_send_recv : 0.000016s : 0.01% optimize.opt_a.auto_parallel : 0.000016s : 0.00% optimize.opt_a.parallel : 0.000051s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000009s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000020s : 0.01% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.001713s : 0.54% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.01% optimize.opt_a.cse : 0.000059s : 0.02% optimize.opt_a.a_3 : 0.000095s : 0.03% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000025s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000672s : 0.21% optimize.opt_b.b_1 : 0.000190s : 0.06% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000029s : 0.01% optimize.loop_unroll : 0.000478s : 0.15% optimize.opt_after_cconv.c_1 : 0.000032s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000023s : 0.01% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.01% optimize.tuple_transform.d_1 : 0.000062s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000052s : 0.02% optimize.cse_after_recomputation.cse : 0.000017s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000022s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000536s : 0.17% validate : 0.000052s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.162816s : 51.32% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000325 62 0.70% : 0.000002s : 3: substitution.elim_not_effective 2.28% : 0.000007s : 3: substitution.float_tuple_getitem_switch 0.55% : 0.000002s : 3: substitution.fold_const_symbol 1.85% : 0.000006s : 4: substitution.graph_param_transform 60.94% : 0.000198s : 8: substitution.inline 1.49% : 0.000005s : 6: substitution.j_node_and_user_rematch 1.60% : 0.000005s : 2: substitution.minmaximum_grad 1.88% : 0.000006s : 6: substitution.remove_not_recompute_node 1.19% : 0.000004s : 2: substitution.replace_old_param 2.46% : 0.000008s : 1: substitution.switch_simplify 5.22% : 0.000017s : 4: substitution.tuple_list_convert_item_index_to_positive 2.38% : 0.000008s : 4: substitution.tuple_list_get_item_const_eliminator 3.33% : 0.000011s : 4: substitution.tuple_list_get_item_depend_reorder 10.38% : 0.000034s : 8: substitution.tuple_list_get_item_eliminator 3.75% : 0.000012s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.146644 2 98.74% : 0.144796s : 1: type_inference.infer 1.26% : 0.001847s : 1: type_inference.specialize ------[replace.] 0.000101 11 59.96% : 0.000061s : 8: replace.inline 23.56% : 0.000024s : 1: replace.switch_simplify 16.47% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000205 11 94.51% : 0.000194s : 8: match.inline 3.44% : 0.000007s : 1: match.switch_simplify 2.05% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000231 1438 0.95% : 0.000002s : 16: predicate.accumulaten_eliminater 0.81% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 8: predicate.addn_check_dump 1.05% : 0.000002s : 16: predicate.addn_zero_filter 0.91% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.24% : 0.000005s : 24: predicate.arithmetic_simplify 0.99% : 0.000002s : 16: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.46% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.47% : 0.000001s : 8: predicate.depend_value_elim 1.01% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.21% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.29% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_depend_swap 1.77% : 0.000004s : 28: predicate.environ_get_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.58% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.48% : 0.000006s : 26: predicate.float_depend_g_call 0.46% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.16% : 0.000000s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 6.13% : 0.000014s : 66: predicate.inline 0.67% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 8: predicate.less_batch_normalization 1.72% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.59% : 0.000006s : 42: predicate.load_eliminater 0.79% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.85% : 0.000007s : 46: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 16: predicate.minmaximum_grad 1.09% : 0.000003s : 4: predicate.mutable_eliminate 0.30% : 0.000001s : 4: predicate.opt_reshape 0.33% : 0.000001s : 4: predicate.parallel_virtual_node 1.95% : 0.000005s : 26: predicate.partial_defer_inline 1.42% : 0.000003s : 22: predicate.partial_eliminate 0.96% : 0.000002s : 16: predicate.print_const_string_wrapper 0.50% : 0.000001s : 8: predicate.reduce_all_const_elim 1.31% : 0.000003s : 16: predicate.reduce_eliminate 2.37% : 0.000005s : 42: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000001s : 8: predicate.remove_not_recompute_node 1.24% : 0.000003s : 26: predicate.replace_applicator 0.35% : 0.000001s : 8: predicate.replace_old_param 0.22% : 0.000001s : 4: predicate.reset_defer_inline 1.00% : 0.000002s : 16: predicate.reshape_eliminate 0.54% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000002s : 8: predicate.same_eliminate 0.37% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.72% : 0.000002s : 8: predicate.shard_identity_eliminate 0.65% : 0.000002s : 8: predicate.special_op_eliminate 0.64% : 0.000001s : 8: predicate.specialize_transform 0.79% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.66% : 0.000004s : 26: predicate.switch_defer_inline 2.18% : 0.000005s : 34: predicate.switch_layer_defer_inline 6.05% : 0.000014s : 86: predicate.switch_simplify 0.96% : 0.000002s : 16: predicate.tile_eliminate 0.96% : 0.000002s : 16: predicate.transpose_eliminate 1.83% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.64% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.31% : 0.000008s : 34: predicate.tuple_list_get_item_eliminator 1.73% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.75% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.33% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 3.05% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001817 23 62.59% : 0.001137s : 11: func_graph_cloner_run.FuncGraphClonerGraph 37.41% : 0.000680s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.482872 196 0.00% : 0.000003s : 1: ForceFp32Comm 15.99% : 0.077221s : 1: add_attr 15.99% : 0.077203s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000056s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000184s : 1: auto_monad 0.01% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.20% : 0.000979s : 1: bootstrap 0.01% : 0.000033s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000013s : 1: environ_conv 0.02% : 0.000102s : 1: event_method 0.00% : 0.000018s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000017s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.10% : 0.000487s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.14% : 0.000682s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000017s : 1: opt.transform.mutable_eliminate 0.31% : 0.001479s : 78: opt.transform.opt_a 0.01% : 0.000030s : 1: opt.transform.opt_after_cconv 0.01% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000170s : 28: opt.transform.opt_b 0.01% : 0.000068s : 2: opt.transform.opt_trans_graph 0.01% : 0.000040s : 4: opt.transform.symbol_engine_opt 0.85% : 0.004103s : 1: opt_a 0.02% : 0.000114s : 1: opt_after_cconv 0.11% : 0.000546s : 1: opt_after_jit_grad 0.06% : 0.000291s : 1: opt_b 1.37% : 0.006638s : 1: optimize 0.00% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.01% : 0.000056s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000023s : 1: remove_dup_value 0.21% : 0.001014s : 1: renormalize.infer 0.14% : 0.000688s : 1: renormalize.specialize 0.00% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000028s : 1: rewriter_after_opt_a 0.06% : 0.000276s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000089s : 1: symbol_engine_optimizer 33.72% : 0.162841s : 1: task_emit 0.02% : 0.000099s : 1: tuple_transform 30.39% : 0.146753s : 1: type_inference 0.02% : 0.000096s : 1: validate TotalTime = 0.0736843, [24] [bootstrap]: 0.00072346 [type_inference]: 0.0466934 [event_method]: 0.00018283 [auto_monad]: 0.00025736 [graph_reusing]: 1.225e-05 [inline]: 3.55e-06 [add_attr]: 0.00569377, [1] [add_attr_with_inline]: 0.00565799, [1] [Cycle 1]: 0.00011116, [2] [tag_attr]: 4.585e-05 [meta_addattr_fg_expand]: 8.87e-06 [parallel-infer-symbol]: 3.67998e-06 [pre_auto_parallel]: 6.056e-05 [insert-virtual-dataset]: 3.21001e-06 [parallel-infer-symbol-second]: 6.39993e-07 [dataset_repeat_opt]: 2.36998e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.00846292, [53] [py_interpret_to_execute]: 1.24e-05 [rewriter_before_opt_a]: 0.00030333 [opt_a]: 0.00538473, [2] [Cycle 1]: 0.00447666, [45] [expand_dump_flag]: 4.70999e-06 [switch_simplify]: 0.00010721 [loop_unroll]: 4.64e-05 [a_1]: 0.00098886 [with_stream_mark]: 6.328e-05 [recompute_prepare]: 1.581e-05 [updatestate_depend_eliminate]: 6.53e-06 [updatestate_assign_eliminate]: 4.68001e-06 [updatestate_loads_eliminate]: 4.05998e-06 [parameter_eliminate]: 2.96999e-06 [a_2]: 0.00011927 [accelerated_algorithm]: 9.46e-06 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 3.6e-06 [shard_inline]: 7.51001e-06 [merge_send_recv]: 1.052e-05 [auto_parallel]: 1.095e-05 [parallel]: 2.066e-05 [flash_sp]: 1.119e-05 [merge_comm]: 4.80999e-06 [allreduce_fusion]: 4.37003e-06 [matmul_add_comm_reduction]: 1.142e-05 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 1.087e-05 [virtual_dataset]: 8.22e-06 [get_grad_eliminate_]: 8.79e-06 [virtual_output]: 8.08999e-06 [merge_forward]: 5.91e-06 [cell_reuse_recompute_pass]: 2.25002e-06 [offload_activation]: 1.275e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.914e-05 [merge_recompute_call_nodes]: 1.59998e-06 [before_grad]: 1.342e-05 [set_forward_comm_id_for_comm_node_pass]: 5.15999e-06 [meta_fg_expand]: 4.06001e-06 [flash_sp_send_recv_attached]: 2.43e-06 [receive_attached]: 2.88e-06 [after_resolve]: 1.831e-05 [a_after_grad]: 1.393e-05 [renormalize]: 0.00239255 [add_forward_monad_depend]: 1.134e-05 [auto_monad_grad]: 2.77002e-06 [auto_monad_eliminator]: 2.724e-05 [cse]: 4.434e-05 [a_3]: 7.136e-05 [Cycle 2]: 0.00089177, [45] [expand_dump_flag]: 2.44001e-06 [switch_simplify]: 1.043e-05 [loop_unroll]: 7.99002e-06 [a_1]: 0.00019574 [with_stream_mark]: 6.547e-05 [recompute_prepare]: 9.36002e-06 [updatestate_depend_eliminate]: 5.82999e-06 [updatestate_assign_eliminate]: 3.71001e-06 [updatestate_loads_eliminate]: 4.42998e-06 [parameter_eliminate]: 2.41e-06 [a_2]: 9.735e-05 [accelerated_algorithm]: 7.39002e-06 [shard]: 2.25002e-06 [meta_shard_fg_expand]: 2.39999e-06 [shard_inline]: 7.55998e-06 [merge_send_recv]: 9.45001e-06 [auto_parallel]: 1.038e-05 [parallel]: 9.33002e-06 [flash_sp]: 3.59002e-06 [merge_comm]: 4.07998e-06 [allreduce_fusion]: 3.9e-06 [matmul_add_comm_reduction]: 1.162e-05 [allreduce_slice_to_reducescatter]: 7.40023e-07 [virtual_shard_identity]: 8.47e-06 [virtual_dataset]: 7.46001e-06 [get_grad_eliminate_]: 7.4e-06 [virtual_output]: 7.76001e-06 [merge_forward]: 5.51e-06 [cell_reuse_recompute_pass]: 2.94999e-06 [offload_activation]: 1.165e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.369e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.224e-05 [set_forward_comm_id_for_comm_node_pass]: 6.09999e-06 [meta_fg_expand]: 3.67002e-06 [flash_sp_send_recv_attached]: 1.54e-06 [receive_attached]: 2.46e-06 [after_resolve]: 1.385e-05 [a_after_grad]: 1.071e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.61998e-06 [auto_monad_grad]: 1.64e-06 [auto_monad_eliminator]: 1.09e-05 [cse]: 2.691e-05 [a_3]: 4.229e-05 [py_interpret_to_execute_after_opt_a]: 1.083e-05 [slice_cell_reuse_recomputed_activation]: 2.63e-06 [rewriter_after_opt_a]: 3.143e-05 [convert_after_rewriter]: 1.28002e-06 [order_py_execute_after_rewriter]: 1.15001e-06 [mutable_eliminate]: 0.00083429 [opt_b]: 0.00030648, [1] [Cycle 1]: 0.00029706, [7] [b_1]: 0.00018719 [b_2]: 9.34998e-06 [updatestate_depend_eliminate]: 1.127e-05 [updatestate_assign_eliminate]: 3.39001e-06 [updatestate_loads_eliminate]: 4.48001e-06 [renormalize]: 3.7998e-07 [cse]: 3.922e-05 [optimize_parallel_all_gather_comm]: 2.093e-05 [overlap_param_gather]: 2.32999e-06 [cconv]: 3.853e-05 [loop_unroll]: 0.00056418 [opt_after_cconv]: 0.00013581, [1] [Cycle 1]: 0.00012783, [7] [c_1]: 4.049e-05 [parameter_eliminate]: 6.00002e-06 [updatestate_depend_eliminate]: 8.69e-06 [updatestate_assign_eliminate]: 3.51999e-06 [updatestate_loads_eliminate]: 4.3e-06 [cse]: 2.973e-05 [renormalize]: 5.09986e-07 [remove_dup_value]: 2.202e-05 [tuple_transform]: 0.00011487, [1] [Cycle 1]: 0.00010993, [4] [d_1]: 7.777e-05 [none_parameter_eliminate]: 2.21998e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 1.018e-05 [partial_unused_args_eliminate]: 2.12999e-06 [add_recomputation]: 0.000111 [cse_after_recomputation]: 3.281e-05, [1] [Cycle 1]: 2.653e-05, [1] [cse]: 2.023e-05 [environ_conv]: 1.256e-05 [swap_dp_allreduce_reducescatter]: 8.24002e-06 [bias_add_comm_swap]: 4.37998e-06 [label_micro_interleaved_index]: 7.13e-06 [label_fine_grained_interleaved_index]: 2.56998e-06 [merge_cast_opt]: 1.45001e-06 [slice_recompute_activation]: 2.59001e-06 [micro_interleaved_order_control]: 2.91999e-06 [assign_add_opt]: 1.27999e-06 [ForceFp32Comm]: 1.04e-06 [remove_cast_before_assign_add]: 1.17e-06 [full_micro_interleaved_order_control]: 2.26e-06 [reorder_send_recv_between_fp_bp]: 2.71999e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 1.19003e-06 [interleave_split_concat_branches]: 1.52999e-06 [interleave_parallel_branches]: 1.09998e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 1.76e-06 [control_data_broadcast_order]: 1.672e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 7.33e-06 [overlap_recompute_and_grad_model_parallel]: 5.67001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.32e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.27999e-06 [overlap_grad_ring_attention]: 4.82e-06 [overlap_grad_flash_sp]: 2.55e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.13998e-06 [split_layernorm_comm]: 2.11e-06 [handle_group_info]: 1.12999e-06 [symbol_engine_optimizer]: 0.00011385, [1] [Cycle 1]: 0.00010889, [6] [build]: 1.552e-05 [elim_shapecalc]: 1.609e-05 [elim_not_effective]: 1.765e-05 [opt_reshape]: 1.113e-05 [fold_const_symbol]: 1.39e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.41998e-06 [pipeline_parallel_scheduler]: 1.42e-06 [auto_monad_reorder]: 2.142e-05 [get_jit_bprop_graph]: 2.09e-06 [rewriter_after_jit_bprop_graph]: 7.41001e-06 [opt_after_jit_grad]: 0.00073933 [validate]: 9.702e-05 [backend_pass]: 8.59989e-07 [task_emit]: 0.0103269 [execute]: 1.034e-05 Sums bootstrap : 0.000723s : 1.09% type_inference : 0.046693s : 70.05% event_method : 0.000183s : 0.27% auto_monad : 0.000257s : 0.39% graph_reusing : 0.000012s : 0.02% inline : 0.000004s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000046s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000061s : 0.09% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000012s : 0.02% optimize.rewriter_before_opt_a : 0.000303s : 0.46% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000118s : 0.18% optimize.opt_a.loop_unroll : 0.000054s : 0.08% optimize.opt_a.a_1 : 0.001185s : 1.78% optimize.opt_a.with_stream_mark : 0.000129s : 0.19% optimize.opt_a.recompute_prepare : 0.000025s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000217s : 0.32% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.03% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.02% optimize.opt_a.merge_send_recv : 0.000020s : 0.03% optimize.opt_a.auto_parallel : 0.000021s : 0.03% optimize.opt_a.parallel : 0.000030s : 0.04% optimize.opt_a.flash_sp : 0.000015s : 0.02% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.03% optimize.opt_a.virtual_dataset : 0.000016s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.02% optimize.opt_a.virtual_output : 0.000016s : 0.02% optimize.opt_a.merge_forward : 0.000011s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000024s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000026s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.02% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000032s : 0.05% optimize.opt_a.a_after_grad : 0.000025s : 0.04% optimize.opt_a.renormalize : 0.002393s : 3.59% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000038s : 0.06% optimize.opt_a.cse : 0.000071s : 0.11% optimize.opt_a.a_3 : 0.000114s : 0.17% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000031s : 0.05% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000834s : 1.25% optimize.opt_b.b_1 : 0.000187s : 0.28% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000039s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000039s : 0.06% optimize.loop_unroll : 0.000564s : 0.85% optimize.opt_after_cconv.c_1 : 0.000040s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000030s : 0.04% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000022s : 0.03% optimize.tuple_transform.d_1 : 0.000078s : 0.12% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000111s : 0.17% optimize.cse_after_recomputation.cse : 0.000020s : 0.03% optimize.environ_conv : 0.000013s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000007s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.04% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000016s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000021s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000739s : 1.11% validate : 0.000097s : 0.15% backend_pass : 0.000001s : 0.00% task_emit : 0.010327s : 15.49% execute : 0.000010s : 0.02% Time group info: ------[substitution.] 0.000455 62 0.60% : 0.000003s : 3: substitution.elim_not_effective 2.13% : 0.000010s : 3: substitution.float_tuple_getitem_switch 0.62% : 0.000003s : 3: substitution.fold_const_symbol 1.44% : 0.000007s : 4: substitution.graph_param_transform 62.20% : 0.000283s : 8: substitution.inline 1.24% : 0.000006s : 6: substitution.j_node_and_user_rematch 1.56% : 0.000007s : 2: substitution.minmaximum_grad 1.88% : 0.000009s : 6: substitution.remove_not_recompute_node 1.63% : 0.000007s : 2: substitution.replace_old_param 1.83% : 0.000008s : 1: substitution.switch_simplify 5.56% : 0.000025s : 4: substitution.tuple_list_convert_item_index_to_positive 2.19% : 0.000010s : 4: substitution.tuple_list_get_item_const_eliminator 3.21% : 0.000015s : 4: substitution.tuple_list_get_item_depend_reorder 10.57% : 0.000048s : 8: substitution.tuple_list_get_item_eliminator 3.34% : 0.000015s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.046535 2 93.98% : 0.043733s : 1: type_inference.infer 6.02% : 0.002802s : 1: type_inference.specialize ------[replace.] 0.000125 11 57.73% : 0.000072s : 8: replace.inline 22.31% : 0.000028s : 1: replace.switch_simplify 19.97% : 0.000025s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000291 11 95.67% : 0.000278s : 8: match.inline 2.61% : 0.000008s : 1: match.switch_simplify 1.72% : 0.000005s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000268 1438 0.92% : 0.000002s : 16: predicate.accumulaten_eliminater 1.20% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000003s : 16: predicate.addn_zero_filter 0.88% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.45% : 0.000007s : 24: predicate.arithmetic_simplify 1.09% : 0.000003s : 16: predicate.cast_eliminate 0.52% : 0.000001s : 8: predicate.check_bprop_eliminate 0.45% : 0.000001s : 8: predicate.compare_switch_simplify 0.13% : 0.000000s : 4: predicate.const_output_eliminate 0.45% : 0.000001s : 8: predicate.depend_value_elim 0.92% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.04% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.95% : 0.000003s : 8: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.30% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.29% : 0.000003s : 20: predicate.environ_get_depend_swap 1.61% : 0.000004s : 28: predicate.environ_get_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.39% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.34% : 0.000006s : 26: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.65% : 0.000002s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.47% : 0.000001s : 8: predicate.incorporate_call 0.37% : 0.000001s : 8: predicate.incorporate_call_switch 5.49% : 0.000015s : 66: predicate.inline 0.75% : 0.000002s : 8: predicate.inline_without_move 0.23% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 8: predicate.less_batch_normalization 1.62% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.28% : 0.000006s : 42: predicate.load_eliminater 0.98% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.61% : 0.000007s : 46: predicate.loop_unroll_before_grad 1.76% : 0.000005s : 24: predicate.make_slice_get_slice_eliminator 0.57% : 0.000002s : 8: predicate.merge_addn 0.67% : 0.000002s : 8: predicate.micro_step_allgather_replace 0.45% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.92% : 0.000002s : 16: predicate.minmaximum_grad 1.67% : 0.000004s : 4: predicate.mutable_eliminate 0.39% : 0.000001s : 4: predicate.opt_reshape 0.48% : 0.000001s : 4: predicate.parallel_virtual_node 1.98% : 0.000005s : 26: predicate.partial_defer_inline 1.22% : 0.000003s : 22: predicate.partial_eliminate 0.99% : 0.000003s : 16: predicate.print_const_string_wrapper 0.51% : 0.000001s : 8: predicate.reduce_all_const_elim 1.46% : 0.000004s : 16: predicate.reduce_eliminate 2.47% : 0.000007s : 42: predicate.redundant_stop_gradient_eliminater 0.27% : 0.000001s : 8: predicate.remove_not_recompute_node 1.19% : 0.000003s : 26: predicate.replace_applicator 0.38% : 0.000001s : 8: predicate.replace_old_param 0.17% : 0.000000s : 4: predicate.reset_defer_inline 1.01% : 0.000003s : 16: predicate.reshape_eliminate 0.51% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 4: predicate.row_tensor_eliminate 0.65% : 0.000002s : 8: predicate.same_eliminate 0.35% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 8: predicate.shard_identity_eliminate 0.62% : 0.000002s : 8: predicate.special_op_eliminate 0.56% : 0.000002s : 8: predicate.specialize_transform 0.85% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.61% : 0.000004s : 26: predicate.switch_defer_inline 2.05% : 0.000006s : 34: predicate.switch_layer_defer_inline 5.52% : 0.000015s : 86: predicate.switch_simplify 0.95% : 0.000003s : 16: predicate.tile_eliminate 0.92% : 0.000002s : 16: predicate.transpose_eliminate 1.84% : 0.000005s : 24: predicate.tuple_list_convert_item_index_to_positive 1.91% : 0.000005s : 24: predicate.tuple_list_get_item_const_eliminator 1.63% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 4.01% : 0.000011s : 34: predicate.tuple_list_get_item_eliminator 1.63% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.45% : 0.000007s : 32: predicate.tuple_list_set_item_eliminator 1.71% : 0.000005s : 26: predicate.tuple_to_list_eliminator_ 2.04% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 2.73% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 4: predicate.value_based_eliminate 0.63% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.85% : 0.000002s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003134 23 60.15% : 0.001885s : 11: func_graph_cloner_run.FuncGraphClonerGraph 39.85% : 0.001249s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.092305 196 0.00% : 0.000004s : 1: ForceFp32Comm 6.18% : 0.005702s : 1: add_attr 6.14% : 0.005664s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.13% : 0.000117s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.29% : 0.000270s : 1: auto_monad 0.03% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.89% : 0.000821s : 1: bootstrap 0.05% : 0.000042s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.04% : 0.000036s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.01% : 0.000007s : 1: detach_backward 0.02% : 0.000016s : 1: environ_conv 0.21% : 0.000197s : 1: event_method 0.02% : 0.000019s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000017s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.62% : 0.000575s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000007s : 1: micro_interleaved_order_control 0.92% : 0.000850s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000029s : 1: opt.transform.mutable_eliminate 1.97% : 0.001815s : 78: opt.transform.opt_a 0.04% : 0.000039s : 1: opt.transform.opt_after_cconv 0.04% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000162s : 28: opt.transform.opt_b 0.09% : 0.000085s : 2: opt.transform.opt_trans_graph 0.06% : 0.000054s : 4: opt.transform.symbol_engine_opt 5.84% : 0.005390s : 1: opt_a 0.15% : 0.000140s : 1: opt_after_cconv 0.82% : 0.000754s : 1: opt_after_jit_grad 0.34% : 0.000310s : 1: opt_b 9.18% : 0.008470s : 1: optimize 0.03% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.03% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000007s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.07% : 0.000065s : 1: pre_auto_parallel 0.02% : 0.000019s : 1: py_interpret_to_execute 0.02% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.03% : 0.000026s : 1: remove_dup_value 1.31% : 0.001206s : 1: renormalize.infer 1.27% : 0.001171s : 1: renormalize.specialize 0.01% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000036s : 1: rewriter_after_opt_a 0.34% : 0.000311s : 1: rewriter_before_opt_a 0.01% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000117s : 1: symbol_engine_optimizer 11.22% : 0.010353s : 1: task_emit 0.13% : 0.000118s : 1: tuple_transform 50.62% : 0.046726s : 1: type_inference 0.16% : 0.000150s : 1: validate TotalTime = 0.0551219, [24] [bootstrap]: 0.00065728 [type_inference]: 0.0301111 [event_method]: 2.833e-05 [auto_monad]: 0.00012798 [graph_reusing]: 7.04001e-06 [inline]: 2.99999e-06 [add_attr]: 0.00427136, [1] [add_attr_with_inline]: 0.00425729, [1] [Cycle 1]: 9.3e-05, [2] [tag_attr]: 3.193e-05 [meta_addattr_fg_expand]: 6.82002e-06 [parallel-infer-symbol]: 4.53999e-06 [pre_auto_parallel]: 5.216e-05 [insert-virtual-dataset]: 2.52001e-06 [parallel-infer-symbol-second]: 9.70002e-07 [dataset_repeat_opt]: 1.88002e-06 [pipeline_split]: 1.87001e-06 [optimize]: 0.00704443, [53] [py_interpret_to_execute]: 1.283e-05 [rewriter_before_opt_a]: 0.00030841 [opt_a]: 0.00411356, [2] [Cycle 1]: 0.00336565, [45] [expand_dump_flag]: 3.86999e-06 [switch_simplify]: 0.00010088 [loop_unroll]: 3.462e-05 [a_1]: 0.00071403 [with_stream_mark]: 2.664e-05 [recompute_prepare]: 1.445e-05 [updatestate_depend_eliminate]: 4.57e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 2.99999e-06 [parameter_eliminate]: 2.48e-06 [a_2]: 7.594e-05 [accelerated_algorithm]: 8e-06 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 2.22001e-06 [shard_inline]: 6.04999e-06 [merge_send_recv]: 9.25999e-06 [auto_parallel]: 1.124e-05 [parallel]: 2.876e-05 [flash_sp]: 1.257e-05 [merge_comm]: 4.13999e-06 [allreduce_fusion]: 3.18e-06 [matmul_add_comm_reduction]: 1.114e-05 [allreduce_slice_to_reducescatter]: 1.36998e-06 [virtual_shard_identity]: 1.437e-05 [virtual_dataset]: 6.68e-06 [get_grad_eliminate_]: 6.63998e-06 [virtual_output]: 6.60002e-06 [merge_forward]: 5.25999e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 1.011e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.923e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 1.294e-05 [set_forward_comm_id_for_comm_node_pass]: 3.93999e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 3.39001e-06 [receive_attached]: 2.46998e-06 [after_resolve]: 1.281e-05 [a_after_grad]: 9.52999e-06 [renormalize]: 0.00168063 [add_forward_monad_depend]: 9.94001e-06 [auto_monad_grad]: 2.69001e-06 [auto_monad_eliminator]: 4.886e-05 [cse]: 4.092e-05 [a_3]: 5.658e-05 [Cycle 2]: 0.00073341, [45] [expand_dump_flag]: 2.89999e-06 [switch_simplify]: 9.33002e-06 [loop_unroll]: 7.59002e-06 [a_1]: 0.00011708 [with_stream_mark]: 2.243e-05 [recompute_prepare]: 6.30002e-06 [updatestate_depend_eliminate]: 4.28999e-06 [updatestate_assign_eliminate]: 4.14002e-06 [updatestate_loads_eliminate]: 2.91999e-06 [parameter_eliminate]: 2.11998e-06 [a_2]: 6.801e-05 [accelerated_algorithm]: 6.91001e-06 [shard]: 2.51998e-06 [meta_shard_fg_expand]: 2.55002e-06 [shard_inline]: 6.36e-06 [merge_send_recv]: 9.49e-06 [auto_parallel]: 1.038e-05 [parallel]: 9.57001e-06 [flash_sp]: 4.05998e-06 [merge_comm]: 3.65998e-06 [allreduce_fusion]: 3.46999e-06 [matmul_add_comm_reduction]: 9.56998e-06 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 9.39e-06 [virtual_dataset]: 6.44999e-06 [get_grad_eliminate_]: 6.23998e-06 [virtual_output]: 6.32001e-06 [merge_forward]: 4.90999e-06 [cell_reuse_recompute_pass]: 3.05002e-06 [offload_activation]: 1.056e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.755e-05 [merge_recompute_call_nodes]: 2.06e-06 [before_grad]: 1.249e-05 [set_forward_comm_id_for_comm_node_pass]: 4.58999e-06 [meta_fg_expand]: 2.53e-06 [flash_sp_send_recv_attached]: 1.80001e-06 [receive_attached]: 2.09e-06 [after_resolve]: 1.221e-05 [a_after_grad]: 8.50001e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 2.70002e-06 [auto_monad_grad]: 2.30002e-06 [auto_monad_eliminator]: 1.316e-05 [cse]: 2.621e-05 [a_3]: 3.765e-05 [py_interpret_to_execute_after_opt_a]: 1.176e-05 [slice_cell_reuse_recomputed_activation]: 2.07001e-06 [rewriter_after_opt_a]: 2.443e-05 [convert_after_rewriter]: 1.47001e-06 [order_py_execute_after_rewriter]: 1.17e-06 [mutable_eliminate]: 0.00088508 [opt_b]: 0.00023045, [1] [Cycle 1]: 0.00022152, [7] [b_1]: 0.00011937 [b_2]: 8.84e-06 [updatestate_depend_eliminate]: 1.107e-05 [updatestate_assign_eliminate]: 3.04001e-06 [updatestate_loads_eliminate]: 3.25e-06 [renormalize]: 8.59989e-07 [cse]: 3.629e-05 [optimize_parallel_all_gather_comm]: 2.19e-05 [overlap_param_gather]: 1.89e-06 [cconv]: 3.815e-05 [loop_unroll]: 0.00055232 [opt_after_cconv]: 0.00011926, [1] [Cycle 1]: 0.00011053, [7] [c_1]: 2.749e-05 [parameter_eliminate]: 5.56e-06 [updatestate_depend_eliminate]: 8.29002e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.58e-06 [cse]: 2.797e-05 [renormalize]: 5.39992e-07 [remove_dup_value]: 1.748e-05 [tuple_transform]: 7.941e-05, [1] [Cycle 1]: 7.374e-05, [4] [d_1]: 4.441e-05 [none_parameter_eliminate]: 1.62001e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 6.28e-06 [partial_unused_args_eliminate]: 2.07999e-06 [add_recomputation]: 5.605e-05 [cse_after_recomputation]: 2.692e-05, [1] [Cycle 1]: 2.238e-05, [1] [cse]: 1.617e-05 [environ_conv]: 1.087e-05 [swap_dp_allreduce_reducescatter]: 6.19001e-06 [bias_add_comm_swap]: 3.71999e-06 [label_micro_interleaved_index]: 5.95002e-06 [label_fine_grained_interleaved_index]: 2.73e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 2.11e-06 [micro_interleaved_order_control]: 2.81999e-06 [assign_add_opt]: 1.24998e-06 [ForceFp32Comm]: 1.25001e-06 [remove_cast_before_assign_add]: 1.09e-06 [full_micro_interleaved_order_control]: 2.31e-06 [reorder_send_recv_between_fp_bp]: 2.61999e-06 [comm_op_add_attrs]: 1.25999e-06 [add_comm_op_reuse_tag]: 1.05999e-06 [interleave_split_concat_branches]: 1.96e-06 [interleave_parallel_branches]: 1.32999e-06 [overlap_opt_shard_in_pipeline]: 1.27999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.73002e-06 [control_data_broadcast_order]: 1.708e-05 [grouped_pairwise_exchange_alltoall]: 1.63002e-06 [offloading_packed_experts]: 4.58001e-06 [overlap_recompute_and_grad_model_parallel]: 5.52001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.32999e-06 [overlap_grad_ring_attention]: 4.08999e-06 [overlap_grad_flash_sp]: 2.324e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.93998e-06 [split_layernorm_comm]: 1.59998e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 0.00010369, [1] [Cycle 1]: 9.803e-05, [6] [build]: 1.403e-05 [elim_shapecalc]: 1.398e-05 [elim_not_effective]: 1.487e-05 [opt_reshape]: 7.97e-06 [fold_const_symbol]: 1.066e-05 [renormalize]: 1.90019e-07 [detach_backward]: 2.93e-06 [pipeline_parallel_scheduler]: 1.87001e-06 [auto_monad_reorder]: 2.064e-05 [get_jit_bprop_graph]: 2.42001e-06 [rewriter_after_jit_bprop_graph]: 7.06001e-06 [opt_after_jit_grad]: 0.0007359 [validate]: 6.678e-05 [backend_pass]: 1.12e-06 [task_emit]: 0.0116572 [execute]: 9.51e-06 Sums bootstrap : 0.000657s : 1.33% type_inference : 0.030111s : 60.78% event_method : 0.000028s : 0.06% auto_monad : 0.000128s : 0.26% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000032s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000005s : 0.01% pre_auto_parallel : 0.000052s : 0.11% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000013s : 0.03% optimize.rewriter_before_opt_a : 0.000308s : 0.62% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000110s : 0.22% optimize.opt_a.loop_unroll : 0.000042s : 0.09% optimize.opt_a.a_1 : 0.000831s : 1.68% optimize.opt_a.with_stream_mark : 0.000049s : 0.10% optimize.opt_a.recompute_prepare : 0.000021s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000144s : 0.29% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.03% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000012s : 0.03% optimize.opt_a.merge_send_recv : 0.000019s : 0.04% optimize.opt_a.auto_parallel : 0.000022s : 0.04% optimize.opt_a.parallel : 0.000038s : 0.08% optimize.opt_a.flash_sp : 0.000017s : 0.03% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.05% optimize.opt_a.virtual_dataset : 0.000013s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.03% optimize.opt_a.virtual_output : 0.000013s : 0.03% optimize.opt_a.merge_forward : 0.000010s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000021s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000025s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.05% optimize.opt_a.a_after_grad : 0.000018s : 0.04% optimize.opt_a.renormalize : 0.001681s : 3.39% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000062s : 0.13% optimize.opt_a.cse : 0.000067s : 0.14% optimize.opt_a.a_3 : 0.000094s : 0.19% optimize.py_interpret_to_execute_after_opt_a : 0.000012s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000024s : 0.05% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000885s : 1.79% optimize.opt_b.b_1 : 0.000119s : 0.24% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000036s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000038s : 0.08% optimize.loop_unroll : 0.000552s : 1.11% optimize.opt_after_cconv.c_1 : 0.000027s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000028s : 0.06% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.04% optimize.tuple_transform.d_1 : 0.000044s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000056s : 0.11% optimize.cse_after_recomputation.cse : 0.000016s : 0.03% optimize.environ_conv : 0.000011s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000023s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000014s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000736s : 1.49% validate : 0.000067s : 0.13% backend_pass : 0.000001s : 0.00% task_emit : 0.011657s : 23.53% execute : 0.000010s : 0.02% Time group info: ------[substitution.] 0.000265 26 0.70% : 0.000002s : 2: substitution.elim_not_effective 0.74% : 0.000002s : 2: substitution.fold_const_symbol 2.48% : 0.000007s : 3: substitution.graph_param_transform 80.03% : 0.000212s : 6: substitution.inline 2.08% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.48% : 0.000007s : 4: substitution.remove_not_recompute_node 2.54% : 0.000007s : 2: substitution.replace_old_param 3.99% : 0.000011s : 1: substitution.switch_simplify 4.96% : 0.000013s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.030019 2 94.54% : 0.028380s : 1: type_inference.infer 5.46% : 0.001639s : 1: type_inference.specialize ------[replace.] 0.000116 9 54.92% : 0.000063s : 6: replace.inline 24.17% : 0.000028s : 1: replace.switch_simplify 20.91% : 0.000024s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000230 9 90.69% : 0.000208s : 6: match.inline 4.19% : 0.000010s : 1: match.switch_simplify 5.11% : 0.000012s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000212 1092 0.88% : 0.000002s : 12: predicate.accumulaten_eliminater 1.30% : 0.000003s : 3: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 6: predicate.addn_check_dump 1.08% : 0.000002s : 12: predicate.addn_zero_filter 0.96% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.19% : 0.000005s : 18: predicate.arithmetic_simplify 1.22% : 0.000003s : 12: predicate.cast_eliminate 0.55% : 0.000001s : 6: predicate.check_bprop_eliminate 0.51% : 0.000001s : 6: predicate.compare_switch_simplify 0.12% : 0.000000s : 3: predicate.const_output_eliminate 0.52% : 0.000001s : 6: predicate.depend_value_elim 0.96% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.40% : 0.000003s : 12: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 12: predicate.dict_set_item_eliminator 1.14% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 3: predicate.elim_not_effective 0.54% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 15: predicate.environ_add_const_eliminate 0.94% : 0.000002s : 15: predicate.environ_get_add_eliminate 0.96% : 0.000002s : 15: predicate.environ_get_depend_swap 1.65% : 0.000003s : 21: predicate.environ_get_eliminate 0.98% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.36% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.95% : 0.000006s : 20: predicate.float_depend_g_call 0.47% : 0.000001s : 6: predicate.float_environ_get_switch 0.66% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 3: predicate.fold_const_symbol 0.86% : 0.000002s : 6: predicate.get_grad_eliminate 0.18% : 0.000000s : 3: predicate.graph_param_transform 0.40% : 0.000001s : 6: predicate.incorporate_call 0.36% : 0.000001s : 6: predicate.incorporate_call_switch 5.95% : 0.000013s : 50: predicate.inline 0.53% : 0.000001s : 6: predicate.inline_without_move 0.25% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.14% : 0.000002s : 6: predicate.less_batch_normalization 1.70% : 0.000004s : 20: predicate.list_to_tuple_eliminator_ 2.22% : 0.000005s : 32: predicate.load_eliminater 1.59% : 0.000003s : 3: predicate.loop_unroll_after_grad 2.74% : 0.000006s : 37: predicate.loop_unroll_before_grad 1.65% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.46% : 0.000001s : 6: predicate.merge_addn 0.44% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.43% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 12: predicate.minmaximum_grad 1.58% : 0.000003s : 3: predicate.mutable_eliminate 0.39% : 0.000001s : 3: predicate.opt_reshape 0.33% : 0.000001s : 3: predicate.parallel_virtual_node 2.22% : 0.000005s : 20: predicate.partial_defer_inline 1.22% : 0.000003s : 17: predicate.partial_eliminate 0.96% : 0.000002s : 12: predicate.print_const_string_wrapper 0.52% : 0.000001s : 6: predicate.reduce_all_const_elim 1.20% : 0.000003s : 12: predicate.reduce_eliminate 2.47% : 0.000005s : 32: predicate.redundant_stop_gradient_eliminater 0.47% : 0.000001s : 6: predicate.remove_not_recompute_node 1.19% : 0.000003s : 20: predicate.replace_applicator 0.36% : 0.000001s : 6: predicate.replace_old_param 0.18% : 0.000000s : 3: predicate.reset_defer_inline 1.22% : 0.000003s : 12: predicate.reshape_eliminate 0.45% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 3: predicate.row_tensor_eliminate 0.86% : 0.000002s : 6: predicate.same_eliminate 0.30% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.89% : 0.000002s : 6: predicate.shard_identity_eliminate 0.70% : 0.000001s : 6: predicate.special_op_eliminate 0.69% : 0.000001s : 6: predicate.specialize_transform 1.27% : 0.000003s : 6: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 20: predicate.switch_defer_inline 2.01% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.55% : 0.000012s : 68: predicate.switch_simplify 0.95% : 0.000002s : 12: predicate.tile_eliminate 0.92% : 0.000002s : 12: predicate.transpose_eliminate 1.66% : 0.000004s : 18: predicate.tuple_list_convert_item_index_to_positive 1.44% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.30% : 0.000007s : 26: predicate.tuple_list_get_item_eliminator 1.51% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.03% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.53% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.05% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.64% : 0.000006s : 38: predicate.updatestate_useless_node_eliminater 0.52% : 0.000001s : 3: predicate.value_based_eliminate 0.82% : 0.000002s : 6: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 6: predicate.virtual_output_eliminate 0.18% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.49% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001469 16 56.58% : 0.000831s : 8: func_graph_cloner_run.FuncGraphClonerGraph 43.42% : 0.000638s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.069612 196 0.01% : 0.000005s : 1: ForceFp32Comm 6.15% : 0.004279s : 1: add_attr 6.12% : 0.004262s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000062s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.19% : 0.000135s : 1: auto_monad 0.04% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000007s : 1: backend_pass 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.00% : 0.000697s : 1: bootstrap 0.06% : 0.000042s : 1: cconv 0.01% : 0.000005s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000005s : 1: convert_after_rewriter 0.04% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000014s : 1: environ_conv 0.05% : 0.000036s : 1: event_method 0.02% : 0.000017s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000005s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.81% : 0.000563s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.29% : 0.000901s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.03% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000027s : 1: opt.transform.mutable_eliminate 1.91% : 0.001326s : 78: opt.transform.opt_a 0.04% : 0.000026s : 1: opt.transform.opt_after_cconv 0.05% : 0.000036s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000095s : 28: opt.transform.opt_b 0.07% : 0.000048s : 2: opt.transform.opt_trans_graph 0.06% : 0.000042s : 4: opt.transform.symbol_engine_opt 5.92% : 0.004118s : 1: opt_a 0.18% : 0.000123s : 1: opt_after_cconv 1.08% : 0.000755s : 1: opt_after_jit_grad 0.34% : 0.000235s : 1: opt_b 10.13% : 0.007051s : 1: optimize 0.04% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000028s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000009s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000057s : 1: pre_auto_parallel 0.02% : 0.000016s : 1: py_interpret_to_execute 0.02% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000022s : 1: remove_dup_value 1.38% : 0.000960s : 1: renormalize.infer 1.01% : 0.000705s : 1: renormalize.specialize 0.08% : 0.000055s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000030s : 1: rewriter_after_opt_a 0.46% : 0.000319s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000107s : 1: symbol_engine_optimizer 16.78% : 0.011683s : 1: task_emit 0.12% : 0.000084s : 1: tuple_transform 43.30% : 0.030140s : 1: type_inference 0.17% : 0.000121s : 1: validate TotalTime = 0.0539141, [24] [bootstrap]: 0.00049796 [type_inference]: 0.0309256 [event_method]: 2.826e-05 [auto_monad]: 8.919e-05 [graph_reusing]: 7.35e-06 [inline]: 3.06999e-06 [add_attr]: 0.00427634, [1] [add_attr_with_inline]: 0.00426457, [1] [Cycle 1]: 9.322e-05, [2] [tag_attr]: 3.152e-05 [meta_addattr_fg_expand]: 6.16998e-06 [parallel-infer-symbol]: 4.22998e-06 [pre_auto_parallel]: 4.904e-05 [insert-virtual-dataset]: 2.83e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.00687668, [53] [py_interpret_to_execute]: 1.077e-05 [rewriter_before_opt_a]: 0.00029839 [opt_a]: 0.00409218, [2] [Cycle 1]: 0.00342812, [45] [expand_dump_flag]: 3.66001e-06 [switch_simplify]: 9.426e-05 [loop_unroll]: 3.348e-05 [a_1]: 0.00076334 [with_stream_mark]: 2.449e-05 [recompute_prepare]: 9.76e-06 [updatestate_depend_eliminate]: 4.46002e-06 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 3.38999e-06 [parameter_eliminate]: 2.29001e-06 [a_2]: 8.328e-05 [accelerated_algorithm]: 7.92e-06 [shard]: 1.97001e-06 [meta_shard_fg_expand]: 2.69001e-06 [shard_inline]: 6.98e-06 [merge_send_recv]: 9.07001e-06 [auto_parallel]: 1.009e-05 [parallel]: 2.182e-05 [flash_sp]: 1.071e-05 [merge_comm]: 4.13999e-06 [allreduce_fusion]: 4.03999e-06 [matmul_add_comm_reduction]: 1.101e-05 [allreduce_slice_to_reducescatter]: 1.49998e-06 [virtual_shard_identity]: 1.175e-05 [virtual_dataset]: 7.85998e-06 [get_grad_eliminate_]: 6.63e-06 [virtual_output]: 6.51e-06 [merge_forward]: 4.93001e-06 [cell_reuse_recompute_pass]: 2.22001e-06 [offload_activation]: 1.163e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.718e-05 [merge_recompute_call_nodes]: 1.64998e-06 [before_grad]: 1.382e-05 [set_forward_comm_id_for_comm_node_pass]: 5.19e-06 [meta_fg_expand]: 4e-06 [flash_sp_send_recv_attached]: 2.90998e-06 [receive_attached]: 2.76e-06 [after_resolve]: 1.438e-05 [a_after_grad]: 9.34998e-06 [renormalize]: 0.00172714 [add_forward_monad_depend]: 1.088e-05 [auto_monad_grad]: 2.81e-06 [auto_monad_eliminator]: 2.333e-05 [cse]: 4.243e-05 [a_3]: 5.913e-05 [Cycle 2]: 0.00065036, [45] [expand_dump_flag]: 1.98997e-06 [switch_simplify]: 9.24e-06 [loop_unroll]: 6.98e-06 [a_1]: 0.00010808 [with_stream_mark]: 2.023e-05 [recompute_prepare]: 5.77001e-06 [updatestate_depend_eliminate]: 3.93001e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.12002e-06 [parameter_eliminate]: 1.72999e-06 [a_2]: 6.326e-05 [accelerated_algorithm]: 5.56e-06 [shard]: 2.64999e-06 [meta_shard_fg_expand]: 2.83e-06 [shard_inline]: 5.59e-06 [merge_send_recv]: 8.42e-06 [auto_parallel]: 8.60001e-06 [parallel]: 8.42e-06 [flash_sp]: 3.66001e-06 [merge_comm]: 3.15998e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 9.04e-06 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 6.88e-06 [virtual_dataset]: 6.06e-06 [get_grad_eliminate_]: 5.09e-06 [virtual_output]: 5.25999e-06 [merge_forward]: 3.8e-06 [cell_reuse_recompute_pass]: 2.39999e-06 [offload_activation]: 1.019e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.706e-05 [merge_recompute_call_nodes]: 1.86e-06 [before_grad]: 9.20001e-06 [set_forward_comm_id_for_comm_node_pass]: 4.08999e-06 [meta_fg_expand]: 2.80002e-06 [flash_sp_send_recv_attached]: 1.97001e-06 [receive_attached]: 2.11998e-06 [after_resolve]: 1.043e-05 [a_after_grad]: 7.91001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.40999e-06 [auto_monad_grad]: 1.76003e-06 [auto_monad_eliminator]: 8.00999e-06 [cse]: 1.781e-05 [a_3]: 3.124e-05 [py_interpret_to_execute_after_opt_a]: 9.63002e-06 [slice_cell_reuse_recomputed_activation]: 2.29001e-06 [rewriter_after_opt_a]: 2.306e-05 [convert_after_rewriter]: 1.20999e-06 [order_py_execute_after_rewriter]: 1.86998e-06 [mutable_eliminate]: 0.00082167 [opt_b]: 0.00021117, [1] [Cycle 1]: 0.00020299, [7] [b_1]: 0.00011345 [b_2]: 6.54999e-06 [updatestate_depend_eliminate]: 9.10001e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.39999e-06 [renormalize]: 1.10999e-06 [cse]: 3.028e-05 [optimize_parallel_all_gather_comm]: 1.989e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 3.424e-05 [loop_unroll]: 0.00051223 [opt_after_cconv]: 0.00010561, [1] [Cycle 1]: 9.94e-05, [7] [c_1]: 2.644e-05 [parameter_eliminate]: 5.14e-06 [updatestate_depend_eliminate]: 5.56e-06 [updatestate_assign_eliminate]: 2.45002e-06 [updatestate_loads_eliminate]: 2.38998e-06 [cse]: 2.298e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 4.795e-05 [tuple_transform]: 7.971e-05, [1] [Cycle 1]: 7.344e-05, [4] [d_1]: 4.434e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 6.46e-06 [partial_unused_args_eliminate]: 2.16e-06 [add_recomputation]: 5.376e-05 [cse_after_recomputation]: 2.464e-05, [1] [Cycle 1]: 2.009e-05, [1] [cse]: 1.42e-05 [environ_conv]: 1.25e-05 [swap_dp_allreduce_reducescatter]: 5.46e-06 [bias_add_comm_swap]: 3.28e-06 [label_micro_interleaved_index]: 6.19001e-06 [label_fine_grained_interleaved_index]: 2.43e-06 [merge_cast_opt]: 1.81e-06 [slice_recompute_activation]: 2.42001e-06 [micro_interleaved_order_control]: 2.22999e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 8.09989e-07 [remove_cast_before_assign_add]: 1.28002e-06 [full_micro_interleaved_order_control]: 2.43e-06 [reorder_send_recv_between_fp_bp]: 2.46998e-06 [comm_op_add_attrs]: 1.68002e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 2.26e-06 [interleave_parallel_branches]: 1.44e-06 [overlap_opt_shard_in_pipeline]: 1.31002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02001e-06 [control_data_broadcast_order]: 1.531e-05 [grouped_pairwise_exchange_alltoall]: 1.75001e-06 [offloading_packed_experts]: 4.04002e-06 [overlap_recompute_and_grad_model_parallel]: 5.35001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.66e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.19001e-06 [overlap_grad_ring_attention]: 4.18999e-06 [overlap_grad_flash_sp]: 2.544e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.49999e-06 [split_layernorm_comm]: 1.60999e-06 [handle_group_info]: 1.15001e-06 [symbol_engine_optimizer]: 9.839e-05, [1] [Cycle 1]: 9.292e-05, [6] [build]: 1.458e-05 [elim_shapecalc]: 1.16e-05 [elim_not_effective]: 1.531e-05 [opt_reshape]: 7.59002e-06 [fold_const_symbol]: 9.91e-06 [renormalize]: 1.19995e-07 [detach_backward]: 2.77002e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 1.882e-05 [get_jit_bprop_graph]: 2.35002e-06 [rewriter_after_jit_bprop_graph]: 7.88001e-06 [opt_after_jit_grad]: 0.00068499 [validate]: 6.189e-05 [backend_pass]: 9.10019e-07 [task_emit]: 0.0100732 [execute]: 1.086e-05 Sums bootstrap : 0.000498s : 1.03% type_inference : 0.030926s : 63.91% event_method : 0.000028s : 0.06% auto_monad : 0.000089s : 0.18% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000032s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000049s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000011s : 0.02% optimize.rewriter_before_opt_a : 0.000298s : 0.62% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000104s : 0.21% optimize.opt_a.loop_unroll : 0.000040s : 0.08% optimize.opt_a.a_1 : 0.000871s : 1.80% optimize.opt_a.with_stream_mark : 0.000045s : 0.09% optimize.opt_a.recompute_prepare : 0.000016s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000147s : 0.30% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.03% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.03% optimize.opt_a.merge_send_recv : 0.000017s : 0.04% optimize.opt_a.auto_parallel : 0.000019s : 0.04% optimize.opt_a.parallel : 0.000030s : 0.06% optimize.opt_a.flash_sp : 0.000014s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.04% optimize.opt_a.virtual_dataset : 0.000014s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.02% optimize.opt_a.virtual_output : 0.000012s : 0.02% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000025s : 0.05% optimize.opt_a.a_after_grad : 0.000017s : 0.04% optimize.opt_a.renormalize : 0.001727s : 3.57% optimize.opt_a.add_forward_monad_depend : 0.000012s : 0.03% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.06% optimize.opt_a.cse : 0.000060s : 0.12% optimize.opt_a.a_3 : 0.000090s : 0.19% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000023s : 0.05% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000822s : 1.70% optimize.opt_b.b_1 : 0.000113s : 0.23% optimize.opt_b.b_2 : 0.000007s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.07% optimize.loop_unroll : 0.000512s : 1.06% optimize.opt_after_cconv.c_1 : 0.000026s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000023s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000048s : 0.10% optimize.tuple_transform.d_1 : 0.000044s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000054s : 0.11% optimize.cse_after_recomputation.cse : 0.000014s : 0.03% optimize.environ_conv : 0.000013s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.01% optimize.comm_op_add_attrs : 0.000002s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000015s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000008s : 0.02% opt_after_jit_grad : 0.000685s : 1.42% validate : 0.000062s : 0.13% backend_pass : 0.000001s : 0.00% task_emit : 0.010073s : 20.82% execute : 0.000011s : 0.02% Time group info: ------[substitution.] 0.000250 26 0.82% : 0.000002s : 2: substitution.elim_not_effective 0.63% : 0.000002s : 2: substitution.fold_const_symbol 2.53% : 0.000006s : 3: substitution.graph_param_transform 80.55% : 0.000201s : 6: substitution.inline 2.02% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.54% : 0.000006s : 4: substitution.remove_not_recompute_node 2.69% : 0.000007s : 2: substitution.replace_old_param 3.13% : 0.000008s : 1: substitution.switch_simplify 5.09% : 0.000013s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.030832 2 94.69% : 0.029195s : 1: type_inference.infer 5.31% : 0.001637s : 1: type_inference.specialize ------[replace.] 0.000106 9 54.78% : 0.000058s : 6: replace.inline 25.64% : 0.000027s : 1: replace.switch_simplify 19.58% : 0.000021s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000215 9 91.50% : 0.000197s : 6: match.inline 3.19% : 0.000007s : 1: match.switch_simplify 5.31% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000209 1092 1.01% : 0.000002s : 12: predicate.accumulaten_eliminater 1.56% : 0.000003s : 3: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 6: predicate.addn_check_dump 1.41% : 0.000003s : 12: predicate.addn_zero_filter 0.70% : 0.000001s : 12: predicate.adjust_all_reduce_mul_add 2.38% : 0.000005s : 18: predicate.arithmetic_simplify 1.00% : 0.000002s : 12: predicate.cast_eliminate 0.51% : 0.000001s : 6: predicate.check_bprop_eliminate 0.47% : 0.000001s : 6: predicate.compare_switch_simplify 0.13% : 0.000000s : 3: predicate.const_output_eliminate 0.56% : 0.000001s : 6: predicate.depend_value_elim 0.98% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.20% : 0.000003s : 12: predicate.dict_get_item_eliminator 0.99% : 0.000002s : 12: predicate.dict_set_item_eliminator 1.23% : 0.000003s : 6: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 3: predicate.elim_not_effective 0.68% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 15: predicate.environ_get_depend_swap 1.54% : 0.000003s : 21: predicate.environ_get_eliminate 1.30% : 0.000003s : 15: predicate.environ_get_set_eliminate 1.39% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.40% : 0.000005s : 20: predicate.float_depend_g_call 0.45% : 0.000001s : 6: predicate.float_environ_get_switch 0.71% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 3: predicate.fold_const_symbol 0.70% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000001s : 3: predicate.graph_param_transform 0.43% : 0.000001s : 6: predicate.incorporate_call 0.41% : 0.000001s : 6: predicate.incorporate_call_switch 5.73% : 0.000012s : 50: predicate.inline 0.55% : 0.000001s : 6: predicate.inline_without_move 0.23% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.00% : 0.000002s : 6: predicate.less_batch_normalization 1.63% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.27% : 0.000005s : 32: predicate.load_eliminater 1.05% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.82% : 0.000006s : 37: predicate.loop_unroll_before_grad 1.65% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 6: predicate.merge_addn 0.45% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.48% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 12: predicate.minmaximum_grad 1.50% : 0.000003s : 3: predicate.mutable_eliminate 0.30% : 0.000001s : 3: predicate.opt_reshape 0.32% : 0.000001s : 3: predicate.parallel_virtual_node 2.00% : 0.000004s : 20: predicate.partial_defer_inline 1.17% : 0.000002s : 17: predicate.partial_eliminate 1.01% : 0.000002s : 12: predicate.print_const_string_wrapper 0.69% : 0.000001s : 6: predicate.reduce_all_const_elim 1.20% : 0.000002s : 12: predicate.reduce_eliminate 2.34% : 0.000005s : 32: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000001s : 6: predicate.remove_not_recompute_node 1.28% : 0.000003s : 20: predicate.replace_applicator 0.51% : 0.000001s : 6: predicate.replace_old_param 0.22% : 0.000000s : 3: predicate.reset_defer_inline 1.04% : 0.000002s : 12: predicate.reshape_eliminate 0.48% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.29% : 0.000001s : 3: predicate.row_tensor_eliminate 0.76% : 0.000002s : 6: predicate.same_eliminate 0.41% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.79% : 0.000002s : 6: predicate.shard_identity_eliminate 0.67% : 0.000001s : 6: predicate.special_op_eliminate 0.72% : 0.000001s : 6: predicate.specialize_transform 0.92% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.46% : 0.000003s : 20: predicate.switch_defer_inline 1.97% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.39% : 0.000011s : 68: predicate.switch_simplify 0.94% : 0.000002s : 12: predicate.tile_eliminate 1.07% : 0.000002s : 12: predicate.transpose_eliminate 1.60% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.71% : 0.000004s : 18: predicate.tuple_list_get_item_depend_reorder 2.89% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.54% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.69% : 0.000006s : 24: predicate.tuple_list_set_item_eliminator 1.66% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.57% : 0.000005s : 32: predicate.updatestate_pure_node_eliminater 2.55% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 3: predicate.value_based_eliminate 0.93% : 0.000002s : 6: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 6: predicate.virtual_output_eliminate 0.18% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001461 16 54.30% : 0.000793s : 8: func_graph_cloner_run.FuncGraphClonerGraph 45.70% : 0.000667s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.068291 196 0.01% : 0.000004s : 1: ForceFp32Comm 6.27% : 0.004283s : 1: add_attr 6.25% : 0.004269s : 1: add_attr_with_inline 0.08% : 0.000056s : 1: add_comm_op_reuse_tag 0.08% : 0.000058s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.14% : 0.000095s : 1: auto_monad 0.03% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.78% : 0.000534s : 1: bootstrap 0.06% : 0.000038s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000019s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.04% : 0.000027s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000007s : 1: detach_backward 0.02% : 0.000016s : 1: environ_conv 0.05% : 0.000034s : 1: event_method 0.03% : 0.000019s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000005s : 1: interleave_parallel_branches 0.01% : 0.000006s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.77% : 0.000523s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.22% : 0.000835s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000020s : 1: opt.transform.mutable_eliminate 1.97% : 0.001345s : 78: opt.transform.opt_a 0.04% : 0.000025s : 1: opt.transform.opt_after_cconv 0.05% : 0.000034s : 1: opt.transform.opt_after_jit_grad 0.13% : 0.000088s : 28: opt.transform.opt_b 0.07% : 0.000047s : 2: opt.transform.opt_trans_graph 0.06% : 0.000040s : 4: opt.transform.symbol_engine_opt 6.00% : 0.004097s : 1: opt_a 0.16% : 0.000110s : 1: opt_after_cconv 1.03% : 0.000701s : 1: opt_after_jit_grad 0.32% : 0.000215s : 1: opt_b 10.08% : 0.006883s : 1: optimize 0.03% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000005s : 1: order_py_execute_after_rewriter 0.04% : 0.000030s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000006s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000054s : 1: pre_auto_parallel 0.02% : 0.000015s : 1: py_interpret_to_execute 0.02% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.08% : 0.000052s : 1: remove_dup_value 1.45% : 0.000988s : 1: renormalize.infer 1.06% : 0.000723s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000027s : 1: rewriter_after_opt_a 0.45% : 0.000306s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000102s : 1: symbol_engine_optimizer 14.79% : 0.010097s : 1: task_emit 0.12% : 0.000083s : 1: tuple_transform 45.33% : 0.030957s : 1: type_inference 0.16% : 0.000111s : 1: validate TotalTime = 0.137928, [24] [bootstrap]: 0.00069576 [type_inference]: 0.0604364 [event_method]: 0.00038005 [auto_monad]: 0.00029966 [graph_reusing]: 1.813e-05 [inline]: 3.28998e-06 [add_attr]: 0.00399463, [1] [add_attr_with_inline]: 0.00397881, [1] [Cycle 1]: 0.00012758, [2] [tag_attr]: 6.33e-05 [meta_addattr_fg_expand]: 1.368e-05 [parallel-infer-symbol]: 3.33998e-06 [pre_auto_parallel]: 8.505e-05 [insert-virtual-dataset]: 2.36998e-06 [parallel-infer-symbol-second]: 1.83002e-06 [dataset_repeat_opt]: 1.82999e-06 [pipeline_split]: 1.52001e-06 [optimize]: 0.0564334, [53] [py_interpret_to_execute]: 1.004e-05 [rewriter_before_opt_a]: 0.00044504 [opt_a]: 0.0527314, [3] [Cycle 1]: 0.0436902, [45] [expand_dump_flag]: 5.89e-06 [switch_simplify]: 0.00020485 [loop_unroll]: 8.699e-05 [a_1]: 0.00209617 [with_stream_mark]: 4.398e-05 [recompute_prepare]: 3.311e-05 [updatestate_depend_eliminate]: 1.126e-05 [updatestate_assign_eliminate]: 9.72001e-06 [updatestate_loads_eliminate]: 7.85e-06 [parameter_eliminate]: 3.97002e-06 [a_2]: 0.00025262 [accelerated_algorithm]: 1.845e-05 [shard]: 1.91998e-06 [meta_shard_fg_expand]: 8.69e-06 [shard_inline]: 1.815e-05 [merge_send_recv]: 2.026e-05 [auto_parallel]: 1.737e-05 [parallel]: 2.343e-05 [flash_sp]: 1.482e-05 [merge_comm]: 1.075e-05 [allreduce_fusion]: 9.07999e-06 [matmul_add_comm_reduction]: 3.567e-05 [allreduce_slice_to_reducescatter]: 7.29982e-07 [virtual_shard_identity]: 2.049e-05 [virtual_dataset]: 1.731e-05 [get_grad_eliminate_]: 1.661e-05 [virtual_output]: 1.605e-05 [merge_forward]: 1.065e-05 [cell_reuse_recompute_pass]: 2.26e-06 [offload_activation]: 1.978e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.482e-05 [merge_recompute_call_nodes]: 2.01e-06 [before_grad]: 3.083e-05 [set_forward_comm_id_for_comm_node_pass]: 1.069e-05 [meta_fg_expand]: 0.00650561 [flash_sp_send_recv_attached]: 5.74999e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 0.00010869 [a_after_grad]: 0.00014131 [renormalize]: 0.0312162 [add_forward_monad_depend]: 2.905e-05 [auto_monad_grad]: 2.221e-05 [auto_monad_eliminator]: 0.00014513 [cse]: 0.00038096 [a_3]: 0.00154209 [Cycle 2]: 0.00756735, [45] [expand_dump_flag]: 4.58999e-06 [switch_simplify]: 9.774e-05 [loop_unroll]: 8.763e-05 [a_1]: 0.00212839 [with_stream_mark]: 4.312e-05 [recompute_prepare]: 2.226e-05 [updatestate_depend_eliminate]: 1.136e-05 [updatestate_assign_eliminate]: 7.63001e-06 [updatestate_loads_eliminate]: 7.63001e-06 [parameter_eliminate]: 2.98e-06 [a_2]: 0.00021214 [accelerated_algorithm]: 3.783e-05 [shard]: 2.58e-06 [meta_shard_fg_expand]: 6.72002e-06 [shard_inline]: 1.666e-05 [merge_send_recv]: 1.439e-05 [auto_parallel]: 1.533e-05 [parallel]: 1.117e-05 [flash_sp]: 4.53999e-06 [merge_comm]: 9.61e-06 [allreduce_fusion]: 8.26002e-06 [matmul_add_comm_reduction]: 1.594e-05 [allreduce_slice_to_reducescatter]: 1.22999e-06 [virtual_shard_identity]: 1.543e-05 [virtual_dataset]: 1.351e-05 [get_grad_eliminate_]: 1.293e-05 [virtual_output]: 1.304e-05 [merge_forward]: 8.69e-06 [cell_reuse_recompute_pass]: 2.09e-06 [offload_activation]: 1.724e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.671e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 2.562e-05 [set_forward_comm_id_for_comm_node_pass]: 9.48002e-06 [meta_fg_expand]: 0.00035613 [flash_sp_send_recv_attached]: 1.97999e-06 [receive_attached]: 3.01999e-06 [after_resolve]: 2.826e-05 [a_after_grad]: 2.209e-05 [renormalize]: 0.00351066 [add_forward_monad_depend]: 1.135e-05 [auto_monad_grad]: 3.31999e-06 [auto_monad_eliminator]: 3.856e-05 [cse]: 0.00019005 [a_3]: 0.00012387 [Cycle 3]: 0.00144889, [45] [expand_dump_flag]: 3.37002e-06 [switch_simplify]: 1.939e-05 [loop_unroll]: 1.416e-05 [a_1]: 0.00039764 [with_stream_mark]: 2.578e-05 [recompute_prepare]: 1.66e-05 [updatestate_depend_eliminate]: 1.127e-05 [updatestate_assign_eliminate]: 8.08001e-06 [updatestate_loads_eliminate]: 8.13999e-06 [parameter_eliminate]: 2.34001e-06 [a_2]: 0.00020466 [accelerated_algorithm]: 2.056e-05 [shard]: 2.27999e-06 [meta_shard_fg_expand]: 5.35999e-06 [shard_inline]: 1.379e-05 [merge_send_recv]: 1.576e-05 [auto_parallel]: 1.788e-05 [parallel]: 8.93002e-06 [flash_sp]: 1.50999e-06 [merge_comm]: 8.33999e-06 [allreduce_fusion]: 8.16002e-06 [matmul_add_comm_reduction]: 1.673e-05 [allreduce_slice_to_reducescatter]: 7.79983e-07 [virtual_shard_identity]: 1.664e-05 [virtual_dataset]: 1.329e-05 [get_grad_eliminate_]: 1.311e-05 [virtual_output]: 1.424e-05 [merge_forward]: 9.27001e-06 [cell_reuse_recompute_pass]: 3.35998e-06 [offload_activation]: 1.692e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.873e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 2.397e-05 [set_forward_comm_id_for_comm_node_pass]: 9.32999e-06 [meta_fg_expand]: 6.64999e-06 [flash_sp_send_recv_attached]: 1.78002e-06 [receive_attached]: 2.29999e-06 [after_resolve]: 2.055e-05 [a_after_grad]: 2.121e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 2.51998e-06 [auto_monad_grad]: 2.68998e-06 [auto_monad_eliminator]: 2.212e-05 [cse]: 6.642e-05 [a_3]: 9.295e-05 [py_interpret_to_execute_after_opt_a]: 1.147e-05 [slice_cell_reuse_recomputed_activation]: 1.92001e-06 [rewriter_after_opt_a]: 8.326e-05 [convert_after_rewriter]: 1.41998e-06 [order_py_execute_after_rewriter]: 1.22999e-06 [mutable_eliminate]: 0.00082511 [opt_b]: 0.00054212, [1] [Cycle 1]: 0.0005333, [7] [b_1]: 0.000364 [b_2]: 1.788e-05 [updatestate_depend_eliminate]: 1.504e-05 [updatestate_assign_eliminate]: 7.71999e-06 [updatestate_loads_eliminate]: 7.73001e-06 [renormalize]: 1.05001e-06 [cse]: 7.796e-05 [optimize_parallel_all_gather_comm]: 3.118e-05 [overlap_param_gather]: 2.29001e-06 [cconv]: 3.986e-05 [loop_unroll]: 0.00052788 [opt_after_cconv]: 0.00019752, [1] [Cycle 1]: 0.00019067, [7] [c_1]: 6.82e-05 [parameter_eliminate]: 4.59002e-06 [updatestate_depend_eliminate]: 1.176e-05 [updatestate_assign_eliminate]: 7.28999e-06 [updatestate_loads_eliminate]: 6.81999e-06 [cse]: 5.558e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 0.00012449 [tuple_transform]: 0.00017119, [1] [Cycle 1]: 0.000166, [4] [d_1]: 0.00012759 [none_parameter_eliminate]: 1.98002e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 1.497e-05 [partial_unused_args_eliminate]: 1.69e-06 [add_recomputation]: 8.295e-05 [cse_after_recomputation]: 5.131e-05, [1] [Cycle 1]: 4.539e-05, [1] [cse]: 3.86e-05 [environ_conv]: 1.5e-05 [swap_dp_allreduce_reducescatter]: 1.296e-05 [bias_add_comm_swap]: 2.74001e-06 [label_micro_interleaved_index]: 5.19e-06 [label_fine_grained_interleaved_index]: 2.46e-06 [merge_cast_opt]: 1.32e-06 [slice_recompute_activation]: 1.86e-06 [micro_interleaved_order_control]: 2.12001e-06 [assign_add_opt]: 1.57001e-06 [ForceFp32Comm]: 9.5999e-07 [remove_cast_before_assign_add]: 1.13001e-06 [full_micro_interleaved_order_control]: 2.22001e-06 [reorder_send_recv_between_fp_bp]: 2.84001e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.20001e-06 [interleave_parallel_branches]: 1.11002e-06 [overlap_opt_shard_in_pipeline]: 1.60001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.16e-06 [control_data_broadcast_order]: 2.56e-05 [grouped_pairwise_exchange_alltoall]: 1.72001e-06 [offloading_packed_experts]: 7.26999e-06 [overlap_recompute_and_grad_model_parallel]: 7.23999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.09e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.17001e-06 [overlap_grad_ring_attention]: 7.08998e-06 [overlap_grad_flash_sp]: 3.631e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.41e-06 [split_layernorm_comm]: 1.84e-06 [handle_group_info]: 1.14e-06 [symbol_engine_optimizer]: 0.00013661, [1] [Cycle 1]: 0.00013109, [6] [build]: 1.46e-05 [elim_shapecalc]: 2.105e-05 [elim_not_effective]: 2.734e-05 [opt_reshape]: 1.45e-05 [fold_const_symbol]: 2.286e-05 [renormalize]: 1.79978e-07 [detach_backward]: 1.99e-06 [pipeline_parallel_scheduler]: 1.64e-06 [auto_monad_reorder]: 3.13e-05 [get_jit_bprop_graph]: 3.06001e-06 [rewriter_after_jit_bprop_graph]: 7.13998e-06 [opt_after_jit_grad]: 0.00057582 [validate]: 8.123e-05 [backend_pass]: 1.00001e-06 [task_emit]: 0.0145114 [execute]: 9.93002e-06 Sums bootstrap : 0.000696s : 0.53% type_inference : 0.060436s : 45.73% event_method : 0.000380s : 0.29% auto_monad : 0.000300s : 0.23% graph_reusing : 0.000018s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000063s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000085s : 0.06% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000002s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000010s : 0.01% optimize.rewriter_before_opt_a : 0.000445s : 0.34% optimize.opt_a.expand_dump_flag : 0.000014s : 0.01% optimize.opt_a.switch_simplify : 0.000322s : 0.24% optimize.opt_a.loop_unroll : 0.000189s : 0.14% optimize.opt_a.a_1 : 0.004622s : 3.50% optimize.opt_a.with_stream_mark : 0.000113s : 0.09% optimize.opt_a.recompute_prepare : 0.000072s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000034s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000025s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000024s : 0.02% optimize.opt_a.parameter_eliminate : 0.000009s : 0.01% optimize.opt_a.a_2 : 0.000669s : 0.51% optimize.opt_a.accelerated_algorithm : 0.000077s : 0.06% optimize.opt_a.shard : 0.000007s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000021s : 0.02% optimize.opt_a.shard_inline : 0.000049s : 0.04% optimize.opt_a.merge_send_recv : 0.000050s : 0.04% optimize.opt_a.auto_parallel : 0.000051s : 0.04% optimize.opt_a.parallel : 0.000044s : 0.03% optimize.opt_a.flash_sp : 0.000021s : 0.02% optimize.opt_a.merge_comm : 0.000029s : 0.02% optimize.opt_a.allreduce_fusion : 0.000026s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000068s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000053s : 0.04% optimize.opt_a.virtual_dataset : 0.000044s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000043s : 0.03% optimize.opt_a.virtual_output : 0.000043s : 0.03% optimize.opt_a.merge_forward : 0.000029s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000008s : 0.01% optimize.opt_a.offload_activation : 0.000054s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000090s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000080s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000029s : 0.02% optimize.opt_a.meta_fg_expand : 0.006868s : 5.20% optimize.opt_a.flash_sp_send_recv_attached : 0.000010s : 0.01% optimize.opt_a.receive_attached : 0.000008s : 0.01% optimize.opt_a.after_resolve : 0.000158s : 0.12% optimize.opt_a.a_after_grad : 0.000185s : 0.14% optimize.opt_a.renormalize : 0.034727s : 26.28% optimize.opt_a.add_forward_monad_depend : 0.000043s : 0.03% optimize.opt_a.auto_monad_grad : 0.000028s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000206s : 0.16% optimize.opt_a.cse : 0.000637s : 0.48% optimize.opt_a.a_3 : 0.001759s : 1.33% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000083s : 0.06% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000825s : 0.62% optimize.opt_b.b_1 : 0.000364s : 0.28% optimize.opt_b.b_2 : 0.000018s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000015s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000078s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000040s : 0.03% optimize.loop_unroll : 0.000528s : 0.40% optimize.opt_after_cconv.c_1 : 0.000068s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.cse : 0.000056s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000124s : 0.09% optimize.tuple_transform.d_1 : 0.000128s : 0.10% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000015s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000083s : 0.06% optimize.cse_after_recomputation.cse : 0.000039s : 0.03% optimize.environ_conv : 0.000015s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000013s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000026s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000036s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000021s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000027s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000014s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000023s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000031s : 0.02% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000576s : 0.44% validate : 0.000081s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.014511s : 10.98% execute : 0.000010s : 0.01% Time group info: ------[substitution.] 0.002237 315 0.17% : 0.000004s : 8: substitution.elim_not_effective 0.48% : 0.000011s : 12: substitution.float_depend_g_call 0.82% : 0.000018s : 9: substitution.float_tuple_getitem_switch 0.19% : 0.000004s : 8: substitution.fold_const_symbol 28.04% : 0.000627s : 5: substitution.getattr_setattr_resolve 0.44% : 0.000010s : 10: substitution.graph_param_transform 0.13% : 0.000003s : 2: substitution.incorporate_call 0.09% : 0.000002s : 2: substitution.incorporate_call_switch 43.77% : 0.000979s : 24: substitution.inline 1.40% : 0.000031s : 3: substitution.inline_without_move 0.72% : 0.000016s : 25: substitution.j_node_and_user_rematch 0.88% : 0.000020s : 4: substitution.less_batch_normalization 1.11% : 0.000025s : 13: substitution.minmaximum_grad 0.51% : 0.000011s : 12: substitution.partial_eliminate 0.89% : 0.000020s : 25: substitution.remove_not_recompute_node 5.02% : 0.000112s : 32: substitution.replace_applicator 0.72% : 0.000016s : 14: substitution.replace_old_param 0.15% : 0.000003s : 1: substitution.set_cell_output_no_recompute 0.60% : 0.000013s : 4: substitution.switch_simplify 0.67% : 0.000015s : 2: substitution.transpose_eliminate 2.90% : 0.000065s : 17: substitution.tuple_list_convert_item_index_to_positive 1.27% : 0.000028s : 17: substitution.tuple_list_get_item_const_eliminator 1.73% : 0.000039s : 17: substitution.tuple_list_get_item_depend_reorder 5.56% : 0.000124s : 32: substitution.tuple_list_get_item_eliminator 1.75% : 0.000039s : 17: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.060269 2 92.34% : 0.055651s : 1: type_inference.infer 7.66% : 0.004619s : 1: type_inference.specialize ------[replace.] 0.000690 45 10.58% : 0.000073s : 4: replace.getattr_setattr_resolve 51.67% : 0.000356s : 24: replace.inline 14.59% : 0.000101s : 5: replace.replace_applicator 9.54% : 0.000066s : 4: replace.switch_simplify 13.63% : 0.000094s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.001624 45 35.61% : 0.000578s : 4: match.getattr_setattr_resolve 59.23% : 0.000962s : 24: match.inline 2.58% : 0.000042s : 5: match.replace_applicator 0.68% : 0.000011s : 4: match.switch_simplify 1.90% : 0.000031s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.001086 7110 0.84% : 0.000009s : 68: predicate.accumulaten_eliminater 0.30% : 0.000003s : 10: predicate.ad_related_special_op_eliminate 0.42% : 0.000005s : 32: predicate.addn_check_dump 0.85% : 0.000009s : 68: predicate.addn_zero_filter 0.86% : 0.000009s : 68: predicate.adjust_all_reduce_mul_add 1.84% : 0.000020s : 100: predicate.arithmetic_simplify 0.86% : 0.000009s : 68: predicate.cast_eliminate 2.65% : 0.000029s : 215: predicate.check_bprop_eliminate 0.42% : 0.000005s : 32: predicate.compare_switch_simplify 0.08% : 0.000001s : 10: predicate.const_output_eliminate 0.41% : 0.000004s : 32: predicate.depend_value_elim 0.90% : 0.000010s : 68: predicate.dict_get_item_const_eliminator 1.02% : 0.000011s : 68: predicate.dict_get_item_eliminator 0.85% : 0.000009s : 68: predicate.dict_set_item_eliminator 0.34% : 0.000004s : 20: predicate.dumpgradient_eliminate 0.11% : 0.000001s : 10: predicate.elim_not_effective 0.20% : 0.000002s : 10: predicate.elim_shapecalc_of_broadcastargs 0.98% : 0.000011s : 78: predicate.environ_add_const_eliminate 0.91% : 0.000010s : 78: predicate.environ_get_add_eliminate 0.94% : 0.000010s : 78: predicate.environ_get_depend_swap 1.36% : 0.000015s : 110: predicate.environ_get_eliminate 0.92% : 0.000010s : 78: predicate.environ_get_set_eliminate 1.24% : 0.000013s : 100: predicate.exchange_switch_depend_value 1.98% : 0.000021s : 100: predicate.float_depend_g_call 0.40% : 0.000004s : 32: predicate.float_environ_get_switch 0.60% : 0.000007s : 42: predicate.float_tuple_getitem_switch 0.08% : 0.000001s : 10: predicate.fold_const_symbol 0.47% : 0.000005s : 32: predicate.get_grad_eliminate 0.76% : 0.000008s : 31: predicate.getattr_setattr_resolve 0.10% : 0.000001s : 10: predicate.graph_param_transform 0.41% : 0.000004s : 32: predicate.incorporate_call 0.37% : 0.000004s : 32: predicate.incorporate_call_switch 6.51% : 0.000071s : 252: predicate.inline 1.41% : 0.000015s : 82: predicate.inline_without_move 0.23% : 0.000002s : 32: predicate.j_node_and_user_rematch 0.71% : 0.000008s : 32: predicate.less_batch_normalization 1.24% : 0.000014s : 96: predicate.list_to_tuple_eliminator_ 1.99% : 0.000022s : 164: predicate.load_eliminater 0.35% : 0.000004s : 10: predicate.loop_unroll_after_grad 2.51% : 0.000027s : 182: predicate.loop_unroll_before_grad 1.17% : 0.000013s : 88: predicate.make_slice_get_slice_eliminator 0.43% : 0.000005s : 32: predicate.merge_addn 2.53% : 0.000028s : 198: predicate.micro_step_allgather_replace 2.50% : 0.000027s : 198: predicate.mini_step_allgather_replace 0.87% : 0.000009s : 68: predicate.minmaximum_grad 0.54% : 0.000006s : 10: predicate.mutable_eliminate 0.15% : 0.000002s : 10: predicate.opt_reshape 0.19% : 0.000002s : 10: predicate.parallel_virtual_node 2.01% : 0.000022s : 100: predicate.partial_defer_inline 1.21% : 0.000013s : 86: predicate.partial_eliminate 0.91% : 0.000010s : 68: predicate.print_const_string_wrapper 0.50% : 0.000005s : 32: predicate.reduce_all_const_elim 1.10% : 0.000012s : 68: predicate.reduce_eliminate 1.94% : 0.000021s : 164: predicate.redundant_stop_gradient_eliminater 0.26% : 0.000003s : 32: predicate.remove_not_recompute_node 2.37% : 0.000026s : 284: predicate.replace_applicator 0.71% : 0.000008s : 82: predicate.replace_old_param 0.09% : 0.000001s : 10: predicate.reset_defer_inline 0.88% : 0.000010s : 68: predicate.reshape_eliminate 2.77% : 0.000030s : 198: predicate.row_tensor_add_zeros_like 0.17% : 0.000002s : 10: predicate.row_tensor_eliminate 2.92% : 0.000032s : 215: predicate.same_eliminate 0.26% : 0.000003s : 32: predicate.set_cell_output_no_recompute 0.50% : 0.000005s : 32: predicate.shard_identity_eliminate 0.35% : 0.000004s : 20: predicate.special_op_eliminate 0.49% : 0.000005s : 32: predicate.specialize_transform 2.71% : 0.000029s : 198: predicate.split_environ_get_set_with_tuple_value 1.36% : 0.000015s : 82: predicate.stack_unstack_eliminate 0.14% : 0.000002s : 10: predicate.switch_call_monad_eliminater 1.45% : 0.000016s : 100: predicate.switch_defer_inline 3.98% : 0.000043s : 315: predicate.switch_layer_defer_inline 4.84% : 0.000053s : 332: predicate.switch_simplify 0.88% : 0.000010s : 68: predicate.tile_eliminate 0.83% : 0.000009s : 68: predicate.transpose_eliminate 1.29% : 0.000014s : 88: predicate.tuple_list_convert_item_index_to_positive 1.33% : 0.000014s : 88: predicate.tuple_list_get_item_const_eliminator 1.20% : 0.000013s : 88: predicate.tuple_list_get_item_depend_reorder 2.54% : 0.000028s : 128: predicate.tuple_list_get_item_eliminator 1.24% : 0.000013s : 88: predicate.tuple_list_get_set_item_eliminator 1.89% : 0.000020s : 120: predicate.tuple_list_set_item_eliminator 1.30% : 0.000014s : 96: predicate.tuple_to_list_eliminator_ 1.93% : 0.000021s : 164: predicate.updatestate_pure_node_eliminater 2.39% : 0.000026s : 196: predicate.updatestate_useless_node_eliminater 0.15% : 0.000002s : 10: predicate.value_based_eliminate 0.50% : 0.000005s : 32: predicate.virtual_dataset_eliminate 0.45% : 0.000005s : 32: predicate.virtual_output_eliminate 0.14% : 0.000001s : 10: predicate.virtual_view_grad_eliminate 0.18% : 0.000002s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.007315 75 62.50% : 0.004572s : 36: func_graph_cloner_run.FuncGraphClonerGraph 37.50% : 0.002743s : 39: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.242716 247 0.00% : 0.000004s : 1: ForceFp32Comm 1.65% : 0.004002s : 1: add_attr 1.64% : 0.003986s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000087s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.13% : 0.000313s : 1: auto_monad 0.01% : 0.000036s : 1: auto_monad_reorder 0.02% : 0.000051s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.30% : 0.000732s : 1: bootstrap 0.02% : 0.000044s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000029s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000054s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000018s : 1: environ_conv 0.16% : 0.000396s : 1: event_method 0.01% : 0.000018s : 1: execute 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000023s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.22% : 0.000537s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.35% : 0.000840s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000027s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000036s : 1: opt.transform.mutable_eliminate 3.40% : 0.008242s : 125: opt.transform.opt_a 0.03% : 0.000066s : 1: opt.transform.opt_after_cconv 0.02% : 0.000055s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000347s : 28: opt.transform.opt_b 0.32% : 0.000779s : 2: opt.transform.opt_resolve 0.06% : 0.000140s : 2: opt.transform.opt_trans_graph 0.03% : 0.000082s : 4: opt.transform.symbol_engine_opt 21.73% : 0.052736s : 1: opt_a 0.08% : 0.000201s : 1: opt_after_cconv 0.24% : 0.000587s : 1: opt_after_jit_grad 0.23% : 0.000548s : 1: opt_b 23.25% : 0.056440s : 1: optimize 0.01% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000040s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000006s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.04% : 0.000090s : 1: pre_auto_parallel 0.01% : 0.000014s : 1: py_interpret_to_execute 0.01% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000130s : 1: remove_dup_value 11.41% : 0.027689s : 2: renormalize.infer 2.89% : 0.007005s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000090s : 1: rewriter_after_opt_a 0.19% : 0.000454s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000004s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000016s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000139s : 1: symbol_engine_optimizer 5.99% : 0.014534s : 1: task_emit 0.07% : 0.000174s : 1: tuple_transform 24.91% : 0.060470s : 1: type_inference 0.05% : 0.000133s : 1: validate TotalTime = 0.0534915, [24] [bootstrap]: 0.00067713 [type_inference]: 0.030535 [event_method]: 2.525e-05 [auto_monad]: 8.575e-05 [graph_reusing]: 6.68e-06 [inline]: 2.66e-06 [add_attr]: 0.00395624, [1] [add_attr_with_inline]: 0.00394365, [1] [Cycle 1]: 7.956e-05, [2] [tag_attr]: 2.873e-05 [meta_addattr_fg_expand]: 7.09001e-06 [parallel-infer-symbol]: 3.48e-06 [pre_auto_parallel]: 4.458e-05 [insert-virtual-dataset]: 2.74999e-06 [parallel-infer-symbol-second]: 6.79982e-07 [dataset_repeat_opt]: 1.66002e-06 [pipeline_split]: 1.47001e-06 [optimize]: 0.0064543, [53] [py_interpret_to_execute]: 6.55997e-06 [rewriter_before_opt_a]: 0.00028065 [opt_a]: 0.0039022, [2] [Cycle 1]: 0.00326374, [45] [expand_dump_flag]: 3.45e-06 [switch_simplify]: 9.13e-05 [loop_unroll]: 3.932e-05 [a_1]: 0.00068956 [with_stream_mark]: 2.055e-05 [recompute_prepare]: 9.86003e-06 [updatestate_depend_eliminate]: 4.23999e-06 [updatestate_assign_eliminate]: 4.00998e-06 [updatestate_loads_eliminate]: 3.33e-06 [parameter_eliminate]: 2.36e-06 [a_2]: 7.601e-05 [accelerated_algorithm]: 6.98e-06 [shard]: 2.01e-06 [meta_shard_fg_expand]: 2.09999e-06 [shard_inline]: 6.89001e-06 [merge_send_recv]: 9.18002e-06 [auto_parallel]: 7.15003e-06 [parallel]: 2.423e-05 [flash_sp]: 9.49999e-06 [merge_comm]: 4.3e-06 [allreduce_fusion]: 3.28998e-06 [matmul_add_comm_reduction]: 9.66e-06 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 1.004e-05 [virtual_dataset]: 6.11e-06 [get_grad_eliminate_]: 6.84001e-06 [virtual_output]: 5.84999e-06 [merge_forward]: 3.67002e-06 [cell_reuse_recompute_pass]: 1.34e-06 [offload_activation]: 1.014e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.389e-05 [merge_recompute_call_nodes]: 1.97999e-06 [before_grad]: 1.197e-05 [set_forward_comm_id_for_comm_node_pass]: 3.51999e-06 [meta_fg_expand]: 3.05998e-06 [flash_sp_send_recv_attached]: 2.79001e-06 [receive_attached]: 2.44999e-06 [after_resolve]: 1.049e-05 [a_after_grad]: 9.81e-06 [renormalize]: 0.00174549 [add_forward_monad_depend]: 8.57998e-06 [auto_monad_grad]: 2.49001e-06 [auto_monad_eliminator]: 1.745e-05 [cse]: 3.719e-05 [a_3]: 5.224e-05 [Cycle 2]: 0.00062607, [45] [expand_dump_flag]: 1.94999e-06 [switch_simplify]: 8.80999e-06 [loop_unroll]: 6.49001e-06 [a_1]: 0.0001091 [with_stream_mark]: 1.629e-05 [recompute_prepare]: 5.71e-06 [updatestate_depend_eliminate]: 3.04001e-06 [updatestate_assign_eliminate]: 2.83003e-06 [updatestate_loads_eliminate]: 2.48e-06 [parameter_eliminate]: 1.71998e-06 [a_2]: 6.461e-05 [accelerated_algorithm]: 6.07001e-06 [shard]: 1.78002e-06 [meta_shard_fg_expand]: 1.97999e-06 [shard_inline]: 5.38002e-06 [merge_send_recv]: 7.05998e-06 [auto_parallel]: 8.23999e-06 [parallel]: 7.65e-06 [flash_sp]: 4.12e-06 [merge_comm]: 3.25e-06 [allreduce_fusion]: 3.16001e-06 [matmul_add_comm_reduction]: 8.17998e-06 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 6.91999e-06 [virtual_dataset]: 5.47001e-06 [get_grad_eliminate_]: 5.24e-06 [virtual_output]: 5.32001e-06 [merge_forward]: 3.55e-06 [cell_reuse_recompute_pass]: 2.76e-06 [offload_activation]: 8.18001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.522e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 9.22999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.43e-06 [meta_fg_expand]: 2.74999e-06 [flash_sp_send_recv_attached]: 1.37e-06 [receive_attached]: 1.81e-06 [after_resolve]: 9.71e-06 [a_after_grad]: 8.65999e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.42e-06 [auto_monad_grad]: 1.30001e-06 [auto_monad_eliminator]: 6.69001e-06 [cse]: 1.779e-05 [a_3]: 3.132e-05 [py_interpret_to_execute_after_opt_a]: 7.61001e-06 [slice_cell_reuse_recomputed_activation]: 1.86998e-06 [rewriter_after_opt_a]: 2.131e-05 [convert_after_rewriter]: 2.31e-06 [order_py_execute_after_rewriter]: 1.13001e-06 [mutable_eliminate]: 0.00074374 [opt_b]: 0.00020905, [1] [Cycle 1]: 0.00020069, [7] [b_1]: 0.00011861 [b_2]: 7.78999e-06 [updatestate_depend_eliminate]: 6.86001e-06 [updatestate_assign_eliminate]: 2.79999e-06 [updatestate_loads_eliminate]: 2.52001e-06 [renormalize]: 9.80013e-07 [cse]: 2.553e-05 [optimize_parallel_all_gather_comm]: 1.889e-05 [overlap_param_gather]: 2.96999e-06 [cconv]: 3.271e-05 [loop_unroll]: 0.0004992 [opt_after_cconv]: 0.0001115, [1] [Cycle 1]: 0.00010393, [7] [c_1]: 2.794e-05 [parameter_eliminate]: 4.43001e-06 [updatestate_depend_eliminate]: 6.91001e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.44999e-06 [cse]: 2.456e-05 [renormalize]: 6.50005e-07 [remove_dup_value]: 1.714e-05 [tuple_transform]: 7.761e-05, [1] [Cycle 1]: 7.246e-05, [4] [d_1]: 4.483e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 6.13998e-06 [partial_unused_args_eliminate]: 1.63002e-06 [add_recomputation]: 5.496e-05 [cse_after_recomputation]: 2.537e-05, [1] [Cycle 1]: 2.028e-05, [1] [cse]: 1.425e-05 [environ_conv]: 1.21e-05 [swap_dp_allreduce_reducescatter]: 4.98001e-06 [bias_add_comm_swap]: 2.61e-06 [label_micro_interleaved_index]: 5.20999e-06 [label_fine_grained_interleaved_index]: 2.48e-06 [merge_cast_opt]: 1.47999e-06 [slice_recompute_activation]: 2.17001e-06 [micro_interleaved_order_control]: 2.17001e-06 [assign_add_opt]: 1.33002e-06 [ForceFp32Comm]: 1.04e-06 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.48e-06 [reorder_send_recv_between_fp_bp]: 2.97002e-06 [comm_op_add_attrs]: 9.80013e-07 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.09998e-06 [overlap_opt_shard_in_pipeline]: 1.29e-06 [overlap_opt_shard_grad_in_pipeline]: 1.59e-06 [control_data_broadcast_order]: 1.298e-05 [grouped_pairwise_exchange_alltoall]: 1.50001e-06 [offloading_packed_experts]: 4.69998e-06 [overlap_recompute_and_grad_model_parallel]: 4.62998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.37e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 4.30999e-06 [overlap_grad_flash_sp]: 2.151e-05 [begin_end_overlap_inline]: 6.40022e-07 [split_matmul_comm_elemetwise]: 2.78003e-06 [split_layernorm_comm]: 1.81998e-06 [handle_group_info]: 1.12999e-06 [symbol_engine_optimizer]: 8.707e-05, [1] [Cycle 1]: 8.282e-05, [6] [build]: 1.272e-05 [elim_shapecalc]: 1.081e-05 [elim_not_effective]: 1.329e-05 [opt_reshape]: 6.39999e-06 [fold_const_symbol]: 1.04e-05 [renormalize]: 1.29978e-07 [detach_backward]: 2.47001e-06 [pipeline_parallel_scheduler]: 1.63002e-06 [auto_monad_reorder]: 1.946e-05 [get_jit_bprop_graph]: 2.04e-06 [rewriter_after_jit_bprop_graph]: 5.05999e-06 [opt_after_jit_grad]: 0.00060063 [validate]: 5.914e-05 [backend_pass]: 9.29984e-07 [task_emit]: 0.0107302 [execute]: 9.57001e-06 Sums bootstrap : 0.000677s : 1.40% type_inference : 0.030535s : 63.04% event_method : 0.000025s : 0.05% auto_monad : 0.000086s : 0.18% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000045s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.01% optimize.rewriter_before_opt_a : 0.000281s : 0.58% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000100s : 0.21% optimize.opt_a.loop_unroll : 0.000046s : 0.09% optimize.opt_a.a_1 : 0.000799s : 1.65% optimize.opt_a.with_stream_mark : 0.000037s : 0.08% optimize.opt_a.recompute_prepare : 0.000016s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000141s : 0.29% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.03% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000012s : 0.03% optimize.opt_a.merge_send_recv : 0.000016s : 0.03% optimize.opt_a.auto_parallel : 0.000015s : 0.03% optimize.opt_a.parallel : 0.000032s : 0.07% optimize.opt_a.flash_sp : 0.000014s : 0.03% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.04% optimize.opt_a.virtual_dataset : 0.000012s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.02% optimize.opt_a.virtual_output : 0.000011s : 0.02% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000021s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000020s : 0.04% optimize.opt_a.a_after_grad : 0.000018s : 0.04% optimize.opt_a.renormalize : 0.001746s : 3.60% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.05% optimize.opt_a.cse : 0.000055s : 0.11% optimize.opt_a.a_3 : 0.000084s : 0.17% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000021s : 0.04% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000744s : 1.54% optimize.opt_b.b_1 : 0.000119s : 0.24% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000026s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.04% optimize.overlap_param_gather : 0.000003s : 0.01% optimize.cconv : 0.000033s : 0.07% optimize.loop_unroll : 0.000499s : 1.03% optimize.opt_after_cconv.c_1 : 0.000028s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000025s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.04% optimize.tuple_transform.d_1 : 0.000045s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000055s : 0.11% optimize.cse_after_recomputation.cse : 0.000014s : 0.03% optimize.environ_conv : 0.000012s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000022s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000013s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000601s : 1.24% validate : 0.000059s : 0.12% backend_pass : 0.000001s : 0.00% task_emit : 0.010730s : 22.15% execute : 0.000010s : 0.02% Time group info: ------[substitution.] 0.000237 26 1.34% : 0.000003s : 2: substitution.elim_not_effective 0.52% : 0.000001s : 2: substitution.fold_const_symbol 2.99% : 0.000007s : 3: substitution.graph_param_transform 80.77% : 0.000192s : 6: substitution.inline 1.96% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.35% : 0.000006s : 4: substitution.remove_not_recompute_node 1.77% : 0.000004s : 2: substitution.replace_old_param 2.96% : 0.000007s : 1: substitution.switch_simplify 5.33% : 0.000013s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.030453 2 94.63% : 0.028819s : 1: type_inference.infer 5.37% : 0.001635s : 1: type_inference.specialize ------[replace.] 0.000100 9 55.04% : 0.000055s : 6: replace.inline 26.70% : 0.000027s : 1: replace.switch_simplify 18.27% : 0.000018s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000205 9 91.52% : 0.000188s : 6: match.inline 3.05% : 0.000006s : 1: match.switch_simplify 5.43% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000195 1092 1.08% : 0.000002s : 12: predicate.accumulaten_eliminater 1.08% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 6: predicate.addn_check_dump 0.86% : 0.000002s : 12: predicate.addn_zero_filter 0.81% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.53% : 0.000005s : 18: predicate.arithmetic_simplify 0.96% : 0.000002s : 12: predicate.cast_eliminate 0.53% : 0.000001s : 6: predicate.check_bprop_eliminate 0.53% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.52% : 0.000001s : 6: predicate.depend_value_elim 0.91% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.17% : 0.000002s : 12: predicate.dict_get_item_eliminator 1.10% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 3: predicate.elim_not_effective 0.38% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 15: predicate.environ_get_depend_swap 1.58% : 0.000003s : 21: predicate.environ_get_eliminate 0.97% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.46% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.38% : 0.000005s : 20: predicate.float_depend_g_call 0.47% : 0.000001s : 6: predicate.float_environ_get_switch 0.67% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 3: predicate.fold_const_symbol 0.60% : 0.000001s : 6: predicate.get_grad_eliminate 0.25% : 0.000000s : 3: predicate.graph_param_transform 0.46% : 0.000001s : 6: predicate.incorporate_call 0.40% : 0.000001s : 6: predicate.incorporate_call_switch 6.13% : 0.000012s : 50: predicate.inline 0.88% : 0.000002s : 6: predicate.inline_without_move 0.26% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.05% : 0.000002s : 6: predicate.less_batch_normalization 1.73% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.25% : 0.000004s : 32: predicate.load_eliminater 0.88% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.75% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.50% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 6: predicate.merge_addn 0.61% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.42% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 12: predicate.minmaximum_grad 1.25% : 0.000002s : 3: predicate.mutable_eliminate 0.31% : 0.000001s : 3: predicate.opt_reshape 0.55% : 0.000001s : 3: predicate.parallel_virtual_node 2.09% : 0.000004s : 20: predicate.partial_defer_inline 1.28% : 0.000003s : 17: predicate.partial_eliminate 1.01% : 0.000002s : 12: predicate.print_const_string_wrapper 0.46% : 0.000001s : 6: predicate.reduce_all_const_elim 1.67% : 0.000003s : 12: predicate.reduce_eliminate 2.15% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 6: predicate.remove_not_recompute_node 1.42% : 0.000003s : 20: predicate.replace_applicator 0.57% : 0.000001s : 6: predicate.replace_old_param 0.33% : 0.000001s : 3: predicate.reset_defer_inline 1.01% : 0.000002s : 12: predicate.reshape_eliminate 0.57% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 3: predicate.row_tensor_eliminate 0.65% : 0.000001s : 6: predicate.same_eliminate 0.34% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.83% : 0.000002s : 6: predicate.shard_identity_eliminate 0.88% : 0.000002s : 6: predicate.special_op_eliminate 0.57% : 0.000001s : 6: predicate.specialize_transform 1.01% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.67% : 0.000003s : 20: predicate.switch_defer_inline 2.00% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.73% : 0.000011s : 68: predicate.switch_simplify 1.05% : 0.000002s : 12: predicate.tile_eliminate 0.83% : 0.000002s : 12: predicate.transpose_eliminate 1.60% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.45% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.73% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 2.06% : 0.000004s : 20: predicate.tuple_to_list_eliminator_ 2.21% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.87% : 0.000006s : 38: predicate.updatestate_useless_node_eliminater 0.28% : 0.000001s : 3: predicate.value_based_eliminate 0.60% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 6: predicate.virtual_output_eliminate 0.22% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001544 16 56.55% : 0.000873s : 8: func_graph_cloner_run.FuncGraphClonerGraph 43.45% : 0.000671s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.067062 196 0.01% : 0.000004s : 1: ForceFp32Comm 5.91% : 0.003962s : 1: add_attr 5.89% : 0.003948s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.09% : 0.000059s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.14% : 0.000091s : 1: auto_monad 0.03% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000005s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.06% : 0.000711s : 1: bootstrap 0.05% : 0.000036s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.01% : 0.000005s : 1: convert_after_rewriter 0.04% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000015s : 1: environ_conv 0.05% : 0.000032s : 1: event_method 0.02% : 0.000016s : 1: execute 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000005s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.76% : 0.000510s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.12% : 0.000754s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000019s : 1: opt.transform.mutable_eliminate 1.88% : 0.001258s : 78: opt.transform.opt_a 0.04% : 0.000027s : 1: opt.transform.opt_after_cconv 0.04% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000095s : 28: opt.transform.opt_b 0.07% : 0.000049s : 2: opt.transform.opt_trans_graph 0.06% : 0.000037s : 4: opt.transform.symbol_engine_opt 5.82% : 0.003906s : 1: opt_a 0.17% : 0.000115s : 1: opt_after_cconv 0.91% : 0.000613s : 1: opt_after_jit_grad 0.32% : 0.000213s : 1: opt_b 9.63% : 0.006461s : 1: optimize 0.03% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000025s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000007s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000004s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.07% : 0.000049s : 1: pre_auto_parallel 0.01% : 0.000010s : 1: py_interpret_to_execute 0.02% : 0.000010s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000022s : 1: remove_dup_value 1.36% : 0.000914s : 1: renormalize.infer 1.22% : 0.000820s : 1: renormalize.specialize 0.01% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000025s : 1: rewriter_after_opt_a 0.43% : 0.000288s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.13% : 0.000090s : 1: symbol_engine_optimizer 16.04% : 0.010754s : 1: task_emit 0.12% : 0.000080s : 1: tuple_transform 45.57% : 0.030561s : 1: type_inference 0.15% : 0.000103s : 1: validate TotalTime = 0.0461296, [24] [bootstrap]: 0.00049301 [type_inference]: 0.0269668 [event_method]: 2.536e-05 [auto_monad]: 8.699e-05 [graph_reusing]: 6.98e-06 [inline]: 2.74001e-06 [add_attr]: 0.00359033, [1] [add_attr_with_inline]: 0.00358116, [1] [Cycle 1]: 6.92e-05, [2] [tag_attr]: 2.705e-05 [meta_addattr_fg_expand]: 6.86001e-06 [parallel-infer-symbol]: 4.07003e-06 [pre_auto_parallel]: 3.872e-05 [insert-virtual-dataset]: 2.58e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.29001e-06 [pipeline_split]: 1.86998e-06 [optimize]: 0.00540112, [53] [py_interpret_to_execute]: 5.34e-06 [rewriter_before_opt_a]: 0.00025085 [opt_a]: 0.00307874, [2] [Cycle 1]: 0.00250145, [45] [expand_dump_flag]: 3.55998e-06 [switch_simplify]: 8.041e-05 [loop_unroll]: 3.235e-05 [a_1]: 0.00061451 [with_stream_mark]: 1.588e-05 [recompute_prepare]: 7.91001e-06 [updatestate_depend_eliminate]: 4.50999e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 6.893e-05 [accelerated_algorithm]: 6.41998e-06 [shard]: 1.74e-06 [meta_shard_fg_expand]: 2.17999e-06 [shard_inline]: 6.00002e-06 [merge_send_recv]: 7.91001e-06 [auto_parallel]: 5.66e-06 [parallel]: 1.815e-05 [flash_sp]: 7.06999e-06 [merge_comm]: 3.48999e-06 [allreduce_fusion]: 3.32002e-06 [matmul_add_comm_reduction]: 1.014e-05 [allreduce_slice_to_reducescatter]: 9.50007e-07 [virtual_shard_identity]: 7.98001e-06 [virtual_dataset]: 5.76998e-06 [get_grad_eliminate_]: 5.44998e-06 [virtual_output]: 5.67001e-06 [merge_forward]: 4.3e-06 [cell_reuse_recompute_pass]: 1.02998e-06 [offload_activation]: 9.26002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.109e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 9.79e-06 [set_forward_comm_id_for_comm_node_pass]: 3.26999e-06 [meta_fg_expand]: 2.98e-06 [flash_sp_send_recv_attached]: 2.49001e-06 [receive_attached]: 2.78998e-06 [after_resolve]: 9.74e-06 [a_after_grad]: 8.48999e-06 [renormalize]: 0.00116051 [add_forward_monad_depend]: 5.84e-06 [auto_monad_grad]: 2.06e-06 [auto_monad_eliminator]: 1.648e-05 [cse]: 3.425e-05 [a_3]: 4.346e-05 [Cycle 2]: 0.00056681, [45] [expand_dump_flag]: 1.77999e-06 [switch_simplify]: 6.81001e-06 [loop_unroll]: 6.16e-06 [a_1]: 9.721e-05 [with_stream_mark]: 1.312e-05 [recompute_prepare]: 5.27999e-06 [updatestate_depend_eliminate]: 3.09001e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.31e-06 [parameter_eliminate]: 1.13001e-06 [a_2]: 6.113e-05 [accelerated_algorithm]: 5.39998e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 1.71002e-06 [shard_inline]: 4.93001e-06 [merge_send_recv]: 4.51002e-06 [auto_parallel]: 5.49e-06 [parallel]: 5.29e-06 [flash_sp]: 3.08e-06 [merge_comm]: 3.00002e-06 [allreduce_fusion]: 2.93e-06 [matmul_add_comm_reduction]: 5.19e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 6.40002e-06 [virtual_dataset]: 5.12999e-06 [get_grad_eliminate_]: 5.19e-06 [virtual_output]: 4.95001e-06 [merge_forward]: 2.74001e-06 [cell_reuse_recompute_pass]: 1.89e-06 [offload_activation]: 6.23e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.224e-05 [merge_recompute_call_nodes]: 8.09989e-07 [before_grad]: 8.66002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.31999e-06 [meta_fg_expand]: 2.25002e-06 [flash_sp_send_recv_attached]: 9.29984e-07 [receive_attached]: 1.00999e-06 [after_resolve]: 8.48999e-06 [a_after_grad]: 7.21999e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.20001e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 6.59001e-06 [cse]: 1.581e-05 [a_3]: 3.07e-05 [py_interpret_to_execute_after_opt_a]: 4.60001e-06 [slice_cell_reuse_recomputed_activation]: 2.07999e-06 [rewriter_after_opt_a]: 1.764e-05 [convert_after_rewriter]: 1.20999e-06 [order_py_execute_after_rewriter]: 1.17999e-06 [mutable_eliminate]: 0.00062286 [opt_b]: 0.00021726, [1] [Cycle 1]: 0.00021023, [7] [b_1]: 0.00011014 [b_2]: 2.954e-05 [updatestate_depend_eliminate]: 6.34001e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.57001e-06 [renormalize]: 6.29982e-07 [cse]: 2.279e-05 [optimize_parallel_all_gather_comm]: 1.701e-05 [overlap_param_gather]: 1.94e-06 [cconv]: 2.684e-05 [loop_unroll]: 0.00047967 [opt_after_cconv]: 0.00010514, [1] [Cycle 1]: 9.907e-05, [7] [c_1]: 2.71e-05 [parameter_eliminate]: 3.7e-06 [updatestate_depend_eliminate]: 6.29999e-06 [updatestate_assign_eliminate]: 2.89001e-06 [updatestate_loads_eliminate]: 2.52001e-06 [cse]: 2.092e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 1.68e-05 [tuple_transform]: 6.776e-05, [1] [Cycle 1]: 6.364e-05, [4] [d_1]: 3.723e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 6.09001e-06 [partial_unused_args_eliminate]: 2.15002e-06 [add_recomputation]: 4.907e-05 [cse_after_recomputation]: 2.315e-05, [1] [Cycle 1]: 1.886e-05, [1] [cse]: 1.338e-05 [environ_conv]: 9.33002e-06 [swap_dp_allreduce_reducescatter]: 5.01002e-06 [bias_add_comm_swap]: 2.74999e-06 [label_micro_interleaved_index]: 4.50001e-06 [label_fine_grained_interleaved_index]: 2.69999e-06 [merge_cast_opt]: 1.35999e-06 [slice_recompute_activation]: 2.01998e-06 [micro_interleaved_order_control]: 2.43e-06 [assign_add_opt]: 1.19998e-06 [ForceFp32Comm]: 8.00006e-07 [remove_cast_before_assign_add]: 1.30999e-06 [full_micro_interleaved_order_control]: 2.49999e-06 [reorder_send_recv_between_fp_bp]: 2.78e-06 [comm_op_add_attrs]: 1.27999e-06 [add_comm_op_reuse_tag]: 1.29e-06 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.15999e-06 [overlap_opt_shard_in_pipeline]: 1.42999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.73e-06 [control_data_broadcast_order]: 1.292e-05 [grouped_pairwise_exchange_alltoall]: 1.55001e-06 [offloading_packed_experts]: 4.57998e-06 [overlap_recompute_and_grad_model_parallel]: 5.46e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.27e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.23998e-06 [overlap_grad_ring_attention]: 4.23999e-06 [overlap_grad_flash_sp]: 1.989e-05 [begin_end_overlap_inline]: 9.00007e-07 [split_matmul_comm_elemetwise]: 2.41e-06 [split_layernorm_comm]: 2.13002e-06 [handle_group_info]: 1.00001e-06 [symbol_engine_optimizer]: 7.94e-05, [1] [Cycle 1]: 7.517e-05, [6] [build]: 1.137e-05 [elim_shapecalc]: 8.66002e-06 [elim_not_effective]: 1.146e-05 [opt_reshape]: 6.32001e-06 [fold_const_symbol]: 9.17001e-06 [renormalize]: 1.60013e-07 [detach_backward]: 2.21e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 1.834e-05 [get_jit_bprop_graph]: 1.71e-06 [rewriter_after_jit_bprop_graph]: 4.25999e-06 [opt_after_jit_grad]: 0.00052001 [validate]: 4.695e-05 [backend_pass]: 1.27e-06 [task_emit]: 0.0086669 [execute]: 9.22001e-06 Sums bootstrap : 0.000493s : 1.19% type_inference : 0.026967s : 64.93% event_method : 0.000025s : 0.06% auto_monad : 0.000087s : 0.21% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000039s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000251s : 0.60% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000087s : 0.21% optimize.opt_a.loop_unroll : 0.000039s : 0.09% optimize.opt_a.a_1 : 0.000712s : 1.71% optimize.opt_a.with_stream_mark : 0.000029s : 0.07% optimize.opt_a.recompute_prepare : 0.000013s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000130s : 0.31% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.03% optimize.opt_a.merge_send_recv : 0.000012s : 0.03% optimize.opt_a.auto_parallel : 0.000011s : 0.03% optimize.opt_a.parallel : 0.000023s : 0.06% optimize.opt_a.flash_sp : 0.000010s : 0.02% optimize.opt_a.merge_comm : 0.000006s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.03% optimize.opt_a.virtual_dataset : 0.000011s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.03% optimize.opt_a.virtual_output : 0.000011s : 0.03% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000015s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000018s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000018s : 0.04% optimize.opt_a.a_after_grad : 0.000016s : 0.04% optimize.opt_a.renormalize : 0.001161s : 2.79% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.06% optimize.opt_a.cse : 0.000050s : 0.12% optimize.opt_a.a_3 : 0.000074s : 0.18% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000018s : 0.04% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000623s : 1.50% optimize.opt_b.b_1 : 0.000110s : 0.27% optimize.opt_b.b_2 : 0.000030s : 0.07% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000027s : 0.06% optimize.loop_unroll : 0.000480s : 1.16% optimize.opt_after_cconv.c_1 : 0.000027s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.04% optimize.tuple_transform.d_1 : 0.000037s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000049s : 0.12% optimize.cse_after_recomputation.cse : 0.000013s : 0.03% optimize.environ_conv : 0.000009s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000011s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000520s : 1.25% validate : 0.000047s : 0.11% backend_pass : 0.000001s : 0.00% task_emit : 0.008667s : 20.87% execute : 0.000009s : 0.02% Time group info: ------[substitution.] 0.000204 26 1.02% : 0.000002s : 2: substitution.elim_not_effective 0.88% : 0.000002s : 2: substitution.fold_const_symbol 2.64% : 0.000005s : 3: substitution.graph_param_transform 80.10% : 0.000164s : 6: substitution.inline 1.81% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.39% : 0.000005s : 4: substitution.remove_not_recompute_node 1.60% : 0.000003s : 2: substitution.replace_old_param 3.81% : 0.000008s : 1: substitution.switch_simplify 5.75% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.026891 2 94.67% : 0.025459s : 1: type_inference.infer 5.33% : 0.001432s : 1: type_inference.specialize ------[replace.] 0.000088 9 57.91% : 0.000051s : 6: replace.inline 23.23% : 0.000020s : 1: replace.switch_simplify 18.86% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000178 9 90.12% : 0.000160s : 6: match.inline 3.88% : 0.000007s : 1: match.switch_simplify 6.00% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000185 1092 1.01% : 0.000002s : 12: predicate.accumulaten_eliminater 0.85% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 6: predicate.addn_check_dump 0.96% : 0.000002s : 12: predicate.addn_zero_filter 0.93% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.39% : 0.000004s : 18: predicate.arithmetic_simplify 1.10% : 0.000002s : 12: predicate.cast_eliminate 0.50% : 0.000001s : 6: predicate.check_bprop_eliminate 0.49% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.44% : 0.000001s : 6: predicate.depend_value_elim 1.01% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.12% : 0.000002s : 12: predicate.dict_get_item_eliminator 1.01% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 3: predicate.elim_not_effective 0.45% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.16% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.16% : 0.000002s : 15: predicate.environ_get_depend_swap 1.50% : 0.000003s : 21: predicate.environ_get_eliminate 1.12% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.58% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.45% : 0.000005s : 20: predicate.float_depend_g_call 0.46% : 0.000001s : 6: predicate.float_environ_get_switch 0.68% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 3: predicate.fold_const_symbol 0.65% : 0.000001s : 6: predicate.get_grad_eliminate 0.28% : 0.000001s : 3: predicate.graph_param_transform 0.46% : 0.000001s : 6: predicate.incorporate_call 0.43% : 0.000001s : 6: predicate.incorporate_call_switch 5.96% : 0.000011s : 50: predicate.inline 0.58% : 0.000001s : 6: predicate.inline_without_move 0.28% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.94% : 0.000002s : 6: predicate.less_batch_normalization 1.75% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.49% : 0.000005s : 32: predicate.load_eliminater 1.02% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.88% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.63% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 6: predicate.merge_addn 0.55% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.74% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 12: predicate.minmaximum_grad 1.19% : 0.000002s : 3: predicate.mutable_eliminate 0.40% : 0.000001s : 3: predicate.opt_reshape 0.36% : 0.000001s : 3: predicate.parallel_virtual_node 1.94% : 0.000004s : 20: predicate.partial_defer_inline 1.33% : 0.000002s : 17: predicate.partial_eliminate 1.01% : 0.000002s : 12: predicate.print_const_string_wrapper 0.52% : 0.000001s : 6: predicate.reduce_all_const_elim 1.20% : 0.000002s : 12: predicate.reduce_eliminate 2.35% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000001s : 6: predicate.remove_not_recompute_node 1.21% : 0.000002s : 20: predicate.replace_applicator 0.46% : 0.000001s : 6: predicate.replace_old_param 0.23% : 0.000000s : 3: predicate.reset_defer_inline 1.01% : 0.000002s : 12: predicate.reshape_eliminate 0.56% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 3: predicate.row_tensor_eliminate 1.01% : 0.000002s : 6: predicate.same_eliminate 0.37% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.71% : 0.000001s : 6: predicate.shard_identity_eliminate 0.63% : 0.000001s : 6: predicate.special_op_eliminate 0.57% : 0.000001s : 6: predicate.specialize_transform 0.74% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.40% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.63% : 0.000003s : 20: predicate.switch_defer_inline 2.10% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.95% : 0.000011s : 68: predicate.switch_simplify 1.05% : 0.000002s : 12: predicate.tile_eliminate 0.97% : 0.000002s : 12: predicate.transpose_eliminate 1.67% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.71% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.37% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.83% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.31% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.97% : 0.000006s : 38: predicate.updatestate_useless_node_eliminater 0.41% : 0.000001s : 3: predicate.value_based_eliminate 0.64% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 6: predicate.virtual_output_eliminate 0.26% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001115 16 55.81% : 0.000622s : 8: func_graph_cloner_run.FuncGraphClonerGraph 44.19% : 0.000493s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.057567 196 0.01% : 0.000003s : 1: ForceFp32Comm 6.25% : 0.003596s : 1: add_attr 6.23% : 0.003585s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000053s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.16% : 0.000093s : 1: auto_monad 0.04% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000005s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.92% : 0.000527s : 1: bootstrap 0.05% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.05% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.06% : 0.000032s : 1: event_method 0.03% : 0.000016s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.85% : 0.000489s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.10% : 0.000633s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000016s : 1: opt.transform.mutable_eliminate 1.95% : 0.001120s : 78: opt.transform.opt_a 0.05% : 0.000026s : 1: opt.transform.opt_after_cconv 0.04% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000111s : 28: opt.transform.opt_b 0.07% : 0.000041s : 2: opt.transform.opt_trans_graph 0.06% : 0.000032s : 4: opt.transform.symbol_engine_opt 5.35% : 0.003082s : 1: opt_a 0.19% : 0.000109s : 1: opt_after_cconv 0.92% : 0.000530s : 1: opt_after_jit_grad 0.38% : 0.000221s : 1: opt_b 9.39% : 0.005406s : 1: optimize 0.04% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000043s : 1: pre_auto_parallel 0.02% : 0.000009s : 1: py_interpret_to_execute 0.01% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000020s : 1: remove_dup_value 1.14% : 0.000658s : 1: renormalize.infer 0.86% : 0.000494s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000021s : 1: rewriter_after_opt_a 0.45% : 0.000257s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000082s : 1: symbol_engine_optimizer 15.08% : 0.008684s : 1: task_emit 0.12% : 0.000071s : 1: tuple_transform 46.89% : 0.026991s : 1: type_inference 0.14% : 0.000081s : 1: validate TotalTime = 0.171158, [24] [bootstrap]: 0.00044854 [type_inference]: 0.0539485 [event_method]: 0.00017231 [auto_monad]: 0.00017471 [graph_reusing]: 1.086e-05 [inline]: 3.20002e-06 [add_attr]: 0.00395138, [1] [add_attr_with_inline]: 0.00393961, [1] [Cycle 1]: 0.00010695, [2] [tag_attr]: 5.3e-05 [meta_addattr_fg_expand]: 1.184e-05 [parallel-infer-symbol]: 3.33e-06 [pre_auto_parallel]: 7.502e-05 [insert-virtual-dataset]: 2.44001e-06 [parallel-infer-symbol-second]: 8.29983e-07 [dataset_repeat_opt]: 2.10002e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.0536958, [53] [py_interpret_to_execute]: 7.7e-06 [rewriter_before_opt_a]: 0.00042005 [opt_a]: 0.0404993, [3] [Cycle 1]: 0.0221656, [45] [expand_dump_flag]: 5.21998e-06 [switch_simplify]: 0.0001759 [loop_unroll]: 7.929e-05 [a_1]: 0.00178254 [with_stream_mark]: 3.325e-05 [recompute_prepare]: 2.63e-05 [updatestate_depend_eliminate]: 9.29e-06 [updatestate_assign_eliminate]: 7.51999e-06 [updatestate_loads_eliminate]: 8.89e-06 [parameter_eliminate]: 3.21001e-06 [a_2]: 0.00023516 [accelerated_algorithm]: 1.643e-05 [shard]: 1.87001e-06 [meta_shard_fg_expand]: 6.68e-06 [shard_inline]: 1.771e-05 [merge_send_recv]: 1.919e-05 [auto_parallel]: 1.419e-05 [parallel]: 2.04e-05 [flash_sp]: 1.275e-05 [merge_comm]: 1.077e-05 [allreduce_fusion]: 8.84998e-06 [matmul_add_comm_reduction]: 3.215e-05 [allreduce_slice_to_reducescatter]: 1.35001e-06 [virtual_shard_identity]: 1.768e-05 [virtual_dataset]: 1.6e-05 [get_grad_eliminate_]: 1.663e-05 [virtual_output]: 1.547e-05 [merge_forward]: 9.39e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.949e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.279e-05 [merge_recompute_call_nodes]: 1.96e-06 [before_grad]: 2.779e-05 [set_forward_comm_id_for_comm_node_pass]: 9.44e-06 [meta_fg_expand]: 0.00212873 [flash_sp_send_recv_attached]: 4.17e-06 [receive_attached]: 2.83e-06 [after_resolve]: 7.755e-05 [a_after_grad]: 0.00010395 [renormalize]: 0.0153788 [add_forward_monad_depend]: 1.458e-05 [auto_monad_grad]: 8.17e-06 [auto_monad_eliminator]: 0.00016141 [cse]: 0.00046364 [a_3]: 0.00075653 [Cycle 2]: 0.0122607, [45] [expand_dump_flag]: 3.48999e-06 [switch_simplify]: 0.00010104 [loop_unroll]: 9.567e-05 [a_1]: 0.00329503 [with_stream_mark]: 8.776e-05 [recompute_prepare]: 7.264e-05 [updatestate_depend_eliminate]: 4.357e-05 [updatestate_assign_eliminate]: 3.88e-05 [updatestate_loads_eliminate]: 3.668e-05 [parameter_eliminate]: 3.13e-06 [a_2]: 0.00102103 [accelerated_algorithm]: 0.00014067 [shard]: 2.44999e-06 [meta_shard_fg_expand]: 2.533e-05 [shard_inline]: 6.669e-05 [merge_send_recv]: 4.873e-05 [auto_parallel]: 4.541e-05 [parallel]: 1.067e-05 [flash_sp]: 4.16001e-06 [merge_comm]: 4.219e-05 [allreduce_fusion]: 5.32e-05 [matmul_add_comm_reduction]: 5.334e-05 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 6.767e-05 [virtual_dataset]: 6.261e-05 [get_grad_eliminate_]: 6.281e-05 [virtual_output]: 6.246e-05 [merge_forward]: 4.003e-05 [cell_reuse_recompute_pass]: 2.96999e-06 [offload_activation]: 5.866e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00011894 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 0.00010757 [set_forward_comm_id_for_comm_node_pass]: 4.603e-05 [meta_fg_expand]: 0.00023116 [flash_sp_send_recv_attached]: 2.36998e-06 [receive_attached]: 2.48998e-06 [after_resolve]: 7.512e-05 [a_after_grad]: 0.00010521 [renormalize]: 0.00469451 [add_forward_monad_depend]: 1.197e-05 [auto_monad_grad]: 2.73998e-06 [auto_monad_eliminator]: 0.0001107 [cse]: 0.00027798 [a_3]: 0.00048626 [Cycle 3]: 0.00605139, [45] [expand_dump_flag]: 2.66e-06 [switch_simplify]: 6.934e-05 [loop_unroll]: 6.359e-05 [a_1]: 0.00196448 [with_stream_mark]: 6.209e-05 [recompute_prepare]: 6.647e-05 [updatestate_depend_eliminate]: 4.41e-05 [updatestate_assign_eliminate]: 3.874e-05 [updatestate_loads_eliminate]: 3.856e-05 [parameter_eliminate]: 3.06001e-06 [a_2]: 0.00108276 [accelerated_algorithm]: 0.0001028 [shard]: 2.49999e-06 [meta_shard_fg_expand]: 1.955e-05 [shard_inline]: 6.838e-05 [merge_send_recv]: 8.319e-05 [auto_parallel]: 5.097e-05 [parallel]: 8.76997e-06 [flash_sp]: 1.95001e-06 [merge_comm]: 4.66e-05 [allreduce_fusion]: 4.393e-05 [matmul_add_comm_reduction]: 5.628e-05 [allreduce_slice_to_reducescatter]: 9.80013e-07 [virtual_shard_identity]: 7.904e-05 [virtual_dataset]: 7.409e-05 [get_grad_eliminate_]: 7.303e-05 [virtual_output]: 7.322e-05 [merge_forward]: 5.72e-05 [cell_reuse_recompute_pass]: 2.98e-06 [offload_activation]: 5.758e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00014185 [merge_recompute_call_nodes]: 1.71002e-06 [before_grad]: 0.00012928 [set_forward_comm_id_for_comm_node_pass]: 6.183e-05 [meta_fg_expand]: 3.343e-05 [flash_sp_send_recv_attached]: 1.99e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 8.013e-05 [a_after_grad]: 0.00013829 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 3.93999e-06 [auto_monad_grad]: 2.73e-06 [auto_monad_eliminator]: 9.479e-05 [cse]: 0.00023985 [a_3]: 0.00050219 [py_interpret_to_execute_after_opt_a]: 1.133e-05 [slice_cell_reuse_recomputed_activation]: 2.76999e-06 [rewriter_after_opt_a]: 0.00020653 [convert_after_rewriter]: 1.59e-06 [order_py_execute_after_rewriter]: 1.11002e-06 [mutable_eliminate]: 0.00085933 [opt_b]: 0.00695709, [2] [Cycle 1]: 0.00451476, [7] [b_1]: 0.00367656 [b_2]: 8.354e-05 [updatestate_depend_eliminate]: 0.00032432 [updatestate_assign_eliminate]: 4.722e-05 [updatestate_loads_eliminate]: 4.058e-05 [renormalize]: 1.23002e-06 [cse]: 0.00026135 [Cycle 2]: 0.00242509, [7] [b_1]: 0.00180882 [b_2]: 7.996e-05 [updatestate_depend_eliminate]: 0.00015256 [updatestate_assign_eliminate]: 4.221e-05 [updatestate_loads_eliminate]: 3.879e-05 [renormalize]: 5.00004e-08 [cse]: 0.00023762 [optimize_parallel_all_gather_comm]: 9.652e-05 [overlap_param_gather]: 2.59001e-06 [cconv]: 5.941e-05 [loop_unroll]: 0.00086108 [opt_after_cconv]: 0.00084558, [1] [Cycle 1]: 0.00083702, [7] [c_1]: 0.00039296 [parameter_eliminate]: 6.81001e-06 [updatestate_depend_eliminate]: 8.172e-05 [updatestate_assign_eliminate]: 4.017e-05 [updatestate_loads_eliminate]: 3.964e-05 [cse]: 0.00022934 [renormalize]: 1.17e-06 [remove_dup_value]: 0.00051941 [tuple_transform]: 0.00069426, [1] [Cycle 1]: 0.00068665, [4] [d_1]: 0.00058545 [none_parameter_eliminate]: 3.26001e-06 [renormalize]: 4.50003e-07 [switch_simplify]: 6.571e-05 [partial_unused_args_eliminate]: 2.02999e-06 [add_recomputation]: 0.00027958 [cse_after_recomputation]: 0.00021115, [1] [Cycle 1]: 0.0002036, [1] [cse]: 0.00019342 [environ_conv]: 3.695e-05 [swap_dp_allreduce_reducescatter]: 4.475e-05 [bias_add_comm_swap]: 3.83999e-06 [label_micro_interleaved_index]: 7.50998e-06 [label_fine_grained_interleaved_index]: 3.34001e-06 [merge_cast_opt]: 1.79998e-06 [slice_recompute_activation]: 2.05002e-06 [micro_interleaved_order_control]: 2.83998e-06 [assign_add_opt]: 1.60001e-06 [ForceFp32Comm]: 9.39996e-07 [remove_cast_before_assign_add]: 1.07e-06 [full_micro_interleaved_order_control]: 2.31e-06 [reorder_send_recv_between_fp_bp]: 2.69999e-06 [comm_op_add_attrs]: 1.01002e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.11002e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 1.84e-06 [control_data_broadcast_order]: 0.0001037 [grouped_pairwise_exchange_alltoall]: 1.54998e-06 [offloading_packed_experts]: 2.519e-05 [overlap_recompute_and_grad_model_parallel]: 2.662e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.20002e-06 [overlap_grad_ring_attention]: 2.469e-05 [overlap_grad_flash_sp]: 0.00013164 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.54999e-06 [split_layernorm_comm]: 1.89e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 0.00043285, [1] [Cycle 1]: 0.00042625, [6] [build]: 3.16e-05 [elim_shapecalc]: 8.173e-05 [elim_not_effective]: 0.00010892 [opt_reshape]: 6.461e-05 [fold_const_symbol]: 9.926e-05 [renormalize]: 4.39992e-07 [detach_backward]: 2.64001e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 0.00010778 [get_jit_bprop_graph]: 2.11998e-06 [rewriter_after_jit_bprop_graph]: 7.09001e-06 [opt_after_jit_grad]: 0.00096221 [validate]: 0.00017344 [backend_pass]: 1.26997e-06 [task_emit]: 0.0570248 [execute]: 1.029e-05 Sums bootstrap : 0.000449s : 0.27% type_inference : 0.053949s : 32.64% event_method : 0.000172s : 0.10% auto_monad : 0.000175s : 0.11% graph_reusing : 0.000011s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000053s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000075s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.00% optimize.rewriter_before_opt_a : 0.000420s : 0.25% optimize.opt_a.expand_dump_flag : 0.000011s : 0.01% optimize.opt_a.switch_simplify : 0.000346s : 0.21% optimize.opt_a.loop_unroll : 0.000239s : 0.14% optimize.opt_a.a_1 : 0.007042s : 4.26% optimize.opt_a.with_stream_mark : 0.000183s : 0.11% optimize.opt_a.recompute_prepare : 0.000165s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000097s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000085s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000084s : 0.05% optimize.opt_a.parameter_eliminate : 0.000009s : 0.01% optimize.opt_a.a_2 : 0.002339s : 1.42% optimize.opt_a.accelerated_algorithm : 0.000260s : 0.16% optimize.opt_a.shard : 0.000007s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000052s : 0.03% optimize.opt_a.shard_inline : 0.000153s : 0.09% optimize.opt_a.merge_send_recv : 0.000151s : 0.09% optimize.opt_a.auto_parallel : 0.000111s : 0.07% optimize.opt_a.parallel : 0.000040s : 0.02% optimize.opt_a.flash_sp : 0.000019s : 0.01% optimize.opt_a.merge_comm : 0.000100s : 0.06% optimize.opt_a.allreduce_fusion : 0.000106s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000142s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000164s : 0.10% optimize.opt_a.virtual_dataset : 0.000153s : 0.09% optimize.opt_a.get_grad_eliminate_ : 0.000152s : 0.09% optimize.opt_a.virtual_output : 0.000151s : 0.09% optimize.opt_a.merge_forward : 0.000107s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.00% optimize.opt_a.offload_activation : 0.000136s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000294s : 0.18% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000265s : 0.16% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000117s : 0.07% optimize.opt_a.meta_fg_expand : 0.002393s : 1.45% optimize.opt_a.flash_sp_send_recv_attached : 0.000009s : 0.01% optimize.opt_a.receive_attached : 0.000008s : 0.00% optimize.opt_a.after_resolve : 0.000233s : 0.14% optimize.opt_a.a_after_grad : 0.000347s : 0.21% optimize.opt_a.renormalize : 0.020073s : 12.14% optimize.opt_a.add_forward_monad_depend : 0.000030s : 0.02% optimize.opt_a.auto_monad_grad : 0.000014s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000367s : 0.22% optimize.opt_a.cse : 0.000981s : 0.59% optimize.opt_a.a_3 : 0.001745s : 1.06% optimize.py_interpret_to_execute_after_opt_a : 0.000011s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000207s : 0.12% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000859s : 0.52% optimize.opt_b.b_1 : 0.005485s : 3.32% optimize.opt_b.b_2 : 0.000163s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000477s : 0.29% optimize.opt_b.updatestate_assign_eliminate : 0.000089s : 0.05% optimize.opt_b.updatestate_loads_eliminate : 0.000079s : 0.05% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000499s : 0.30% optimize.optimize_parallel_all_gather_comm : 0.000097s : 0.06% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000059s : 0.04% optimize.loop_unroll : 0.000861s : 0.52% optimize.opt_after_cconv.c_1 : 0.000393s : 0.24% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000082s : 0.05% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000040s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000040s : 0.02% optimize.opt_after_cconv.cse : 0.000229s : 0.14% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000519s : 0.31% optimize.tuple_transform.d_1 : 0.000585s : 0.35% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000066s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000280s : 0.17% optimize.cse_after_recomputation.cse : 0.000193s : 0.12% optimize.environ_conv : 0.000037s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000045s : 0.03% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000104s : 0.06% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000025s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000027s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000025s : 0.01% optimize.overlap_grad_flash_sp : 0.000132s : 0.08% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000032s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000082s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000109s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000065s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000099s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000108s : 0.07% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000962s : 0.58% validate : 0.000173s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.057025s : 34.50% execute : 0.000010s : 0.01% Time group info: ------[substitution.] 0.001848 670 5.95% : 0.000110s : 36: substitution.arithmetic_simplify 0.84% : 0.000015s : 46: substitution.elim_not_effective 0.54% : 0.000010s : 11: substitution.float_depend_g_call 1.40% : 0.000026s : 17: substitution.float_tuple_getitem_switch 0.77% : 0.000014s : 46: substitution.fold_const_symbol 2.01% : 0.000037s : 51: substitution.graph_param_transform 0.16% : 0.000003s : 2: substitution.incorporate_call 0.11% : 0.000002s : 2: substitution.incorporate_call_switch 45.36% : 0.000838s : 21: substitution.inline 1.17% : 0.000022s : 2: substitution.inline_without_move 2.31% : 0.000043s : 102: substitution.j_node_and_user_rematch 3.57% : 0.000066s : 30: substitution.less_batch_normalization 1.24% : 0.000023s : 13: substitution.minmaximum_grad 0.55% : 0.000010s : 11: substitution.partial_eliminate 3.28% : 0.000061s : 102: substitution.remove_not_recompute_node 1.72% : 0.000032s : 9: substitution.replace_applicator 0.76% : 0.000014s : 11: substitution.replace_old_param 0.19% : 0.000004s : 1: substitution.set_cell_output_no_recompute 0.79% : 0.000015s : 4: substitution.switch_simplify 1.64% : 0.000030s : 12: substitution.transpose_eliminate 4.88% : 0.000090s : 25: substitution.tuple_list_convert_item_index_to_positive 2.54% : 0.000047s : 25: substitution.tuple_list_get_item_const_eliminator 3.42% : 0.000063s : 25: substitution.tuple_list_get_item_depend_reorder 7.58% : 0.000140s : 40: substitution.tuple_list_get_item_eliminator 3.43% : 0.000063s : 25: substitution.tuple_list_get_set_item_eliminator 3.81% : 0.000070s : 1: substitution.zero_like_fill_zero ------[type_inference.] 0.053807 2 92.95% : 0.050012s : 1: type_inference.infer 7.05% : 0.003795s : 1: type_inference.specialize ------[replace.] 0.000356 33 53.09% : 0.000189s : 21: replace.inline 15.86% : 0.000057s : 4: replace.switch_simplify 20.94% : 0.000075s : 7: replace.tuple_list_get_item_eliminator 10.11% : 0.000036s : 1: replace.zero_like_fill_zero ------[match.] 0.000929 33 88.72% : 0.000825s : 21: match.inline 1.22% : 0.000011s : 4: match.switch_simplify 2.66% : 0.000025s : 7: match.tuple_list_get_item_eliminator 7.40% : 0.000069s : 1: match.zero_like_fill_zero ------[predicate.] 0.002850 20376 0.72% : 0.000021s : 165: predicate.accumulaten_eliminater 0.45% : 0.000013s : 51: predicate.ad_related_special_op_eliminate 0.61% : 0.000017s : 132: predicate.addn_check_dump 0.74% : 0.000021s : 165: predicate.addn_zero_filter 0.71% : 0.000020s : 165: predicate.adjust_all_reduce_mul_add 1.83% : 0.000052s : 297: predicate.arithmetic_simplify 0.75% : 0.000021s : 165: predicate.cast_eliminate 1.05% : 0.000030s : 217: predicate.check_bprop_eliminate 0.62% : 0.000018s : 132: predicate.compare_switch_simplify 0.50% : 0.000014s : 177: predicate.const_output_eliminate 0.62% : 0.000018s : 132: predicate.depend_value_elim 0.78% : 0.000022s : 165: predicate.dict_get_item_const_eliminator 0.84% : 0.000024s : 165: predicate.dict_get_item_eliminator 1.63% : 0.000046s : 165: predicate.dict_set_item_eliminator 1.18% : 0.000034s : 228: predicate.dumpgradient_eliminate 0.13% : 0.000004s : 51: predicate.elim_not_effective 0.30% : 0.000009s : 51: predicate.elim_shapecalc_of_broadcastargs 1.57% : 0.000045s : 342: predicate.environ_add_const_eliminate 1.59% : 0.000045s : 342: predicate.environ_get_add_eliminate 1.56% : 0.000045s : 342: predicate.environ_get_depend_swap 2.17% : 0.000062s : 474: predicate.environ_get_eliminate 1.55% : 0.000044s : 342: predicate.environ_get_set_eliminate 0.89% : 0.000025s : 193: predicate.exchange_switch_depend_value 1.21% : 0.000034s : 193: predicate.float_depend_g_call 0.63% : 0.000018s : 132: predicate.float_environ_get_switch 1.52% : 0.000043s : 309: predicate.float_tuple_getitem_switch 0.13% : 0.000004s : 51: predicate.fold_const_symbol 0.67% : 0.000019s : 132: predicate.get_grad_eliminate 0.16% : 0.000004s : 51: predicate.graph_param_transform 0.61% : 0.000017s : 132: predicate.incorporate_call 0.61% : 0.000018s : 132: predicate.incorporate_call_switch 5.14% : 0.000147s : 858: predicate.inline 1.05% : 0.000030s : 159: predicate.inline_without_move 0.33% : 0.000009s : 132: predicate.j_node_and_user_rematch 0.80% : 0.000023s : 132: predicate.less_batch_normalization 1.95% : 0.000056s : 400: predicate.list_to_tuple_eliminator_ 2.54% : 0.000072s : 573: predicate.load_eliminater 0.54% : 0.000015s : 59: predicate.loop_unroll_after_grad 1.16% : 0.000033s : 233: predicate.loop_unroll_before_grad 1.91% : 0.000055s : 401: predicate.make_slice_get_slice_eliminator 0.62% : 0.000018s : 132: predicate.merge_addn 1.03% : 0.000029s : 217: predicate.micro_step_allgather_replace 1.03% : 0.000029s : 217: predicate.mini_step_allgather_replace 0.72% : 0.000021s : 165: predicate.minmaximum_grad 0.56% : 0.000016s : 60: predicate.mutable_eliminate 0.27% : 0.000008s : 51: predicate.opt_reshape 0.90% : 0.000026s : 177: predicate.parallel_virtual_node 1.29% : 0.000037s : 193: predicate.partial_defer_inline 1.19% : 0.000034s : 231: predicate.partial_eliminate 0.71% : 0.000020s : 165: predicate.print_const_string_wrapper 0.62% : 0.000018s : 132: predicate.reduce_all_const_elim 0.90% : 0.000026s : 165: predicate.reduce_eliminate 2.58% : 0.000074s : 573: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000009s : 132: predicate.remove_not_recompute_node 1.25% : 0.000036s : 389: predicate.replace_applicator 0.43% : 0.000012s : 159: predicate.replace_old_param 0.49% : 0.000014s : 177: predicate.reset_defer_inline 0.73% : 0.000021s : 165: predicate.reshape_eliminate 1.07% : 0.000030s : 217: predicate.row_tensor_add_zeros_like 0.65% : 0.000019s : 118: predicate.row_tensor_eliminate 1.20% : 0.000034s : 217: predicate.same_eliminate 0.38% : 0.000011s : 132: predicate.set_cell_output_no_recompute 0.71% : 0.000020s : 132: predicate.shard_identity_eliminate 1.16% : 0.000033s : 228: predicate.special_op_eliminate 0.69% : 0.000020s : 132: predicate.specialize_transform 1.11% : 0.000032s : 217: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000025s : 159: predicate.stack_unstack_eliminate 0.31% : 0.000009s : 59: predicate.switch_call_monad_eliminater 0.95% : 0.000027s : 193: predicate.switch_defer_inline 1.95% : 0.000055s : 410: predicate.switch_layer_defer_inline 3.12% : 0.000089s : 617: predicate.switch_simplify 0.77% : 0.000022s : 165: predicate.tile_eliminate 0.77% : 0.000022s : 165: predicate.transpose_eliminate 1.95% : 0.000056s : 393: predicate.tuple_list_convert_item_index_to_positive 2.00% : 0.000057s : 393: predicate.tuple_list_get_item_const_eliminator 1.95% : 0.000055s : 393: predicate.tuple_list_get_item_depend_reorder 2.96% : 0.000084s : 532: predicate.tuple_list_get_item_eliminator 1.98% : 0.000057s : 393: predicate.tuple_list_get_set_item_eliminator 2.66% : 0.000076s : 525: predicate.tuple_list_set_item_eliminator 1.90% : 0.000054s : 400: predicate.tuple_to_list_eliminator_ 2.57% : 0.000073s : 573: predicate.updatestate_pure_node_eliminater 3.30% : 0.000094s : 705: predicate.updatestate_useless_node_eliminater 0.94% : 0.000027s : 177: predicate.value_based_eliminate 0.70% : 0.000020s : 132: predicate.virtual_dataset_eliminate 0.67% : 0.000019s : 132: predicate.virtual_output_eliminate 0.27% : 0.000008s : 51: predicate.virtual_view_grad_eliminate 0.98% : 0.000028s : 179: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005459 51 60.95% : 0.003327s : 26: func_graph_cloner_run.FuncGraphClonerGraph 39.05% : 0.002132s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.269896 292 0.00% : 0.000004s : 1: ForceFp32Comm 1.47% : 0.003957s : 1: add_attr 1.46% : 0.003943s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.11% : 0.000285s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.07% : 0.000184s : 1: auto_monad 0.04% : 0.000114s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000005s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.18% : 0.000480s : 1: bootstrap 0.02% : 0.000064s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000109s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.08% : 0.000215s : 1: cse_after_recomputation 0.00% : 0.000007s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.02% : 0.000041s : 1: environ_conv 0.07% : 0.000186s : 1: event_method 0.01% : 0.000018s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000015s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.32% : 0.000872s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.32% : 0.000871s : 1: mutable_eliminate 0.01% : 0.000028s : 1: offloading_packed_experts 0.04% : 0.000101s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000094s : 1: opt.transform.mutable_eliminate 5.13% : 0.013852s : 117: opt.transform.opt_a 0.14% : 0.000391s : 1: opt.transform.opt_after_cconv 0.08% : 0.000210s : 1: opt.transform.opt_after_jit_grad 2.05% : 0.005524s : 83: opt.transform.opt_b 0.24% : 0.000646s : 2: opt.transform.opt_trans_graph 0.13% : 0.000350s : 4: opt.transform.symbol_engine_opt 15.01% : 0.040504s : 1: opt_a 0.32% : 0.000851s : 1: opt_after_cconv 0.36% : 0.000975s : 1: opt_after_jit_grad 2.58% : 0.006963s : 1: opt_b 19.90% : 0.053702s : 1: optimize 0.04% : 0.000102s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.05% : 0.000136s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000028s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000030s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000016s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.03% : 0.000080s : 1: pre_auto_parallel 0.00% : 0.000012s : 1: py_interpret_to_execute 0.01% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.20% : 0.000530s : 1: remove_dup_value 5.19% : 0.013999s : 2: renormalize.infer 2.24% : 0.006048s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000012s : 1: rewriter_after_jit_bprop_graph 0.08% : 0.000211s : 1: rewriter_after_opt_a 0.16% : 0.000428s : 1: rewriter_before_opt_a 0.00% : 0.000006s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000048s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000436s : 1: symbol_engine_optimizer 21.14% : 0.057048s : 1: task_emit 0.26% : 0.000698s : 1: tuple_transform 20.00% : 0.053974s : 1: type_inference 0.11% : 0.000296s : 1: validate TotalTime = 0.0460529, [24] [bootstrap]: 0.00051522 [type_inference]: 0.0268219 [event_method]: 2.3e-05 [auto_monad]: 7.999e-05 [graph_reusing]: 6.44999e-06 [inline]: 2.49001e-06 [add_attr]: 0.00346856, [1] [add_attr_with_inline]: 0.00345875, [1] [Cycle 1]: 6.509e-05, [2] [tag_attr]: 2.341e-05 [meta_addattr_fg_expand]: 6.71e-06 [parallel-infer-symbol]: 3.94002e-06 [pre_auto_parallel]: 3.773e-05 [insert-virtual-dataset]: 3.12002e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 2.00002e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00545239, [53] [py_interpret_to_execute]: 5.14e-06 [rewriter_before_opt_a]: 0.00025273 [opt_a]: 0.00323867, [2] [Cycle 1]: 0.00263791, [45] [expand_dump_flag]: 4.15e-06 [switch_simplify]: 8.025e-05 [loop_unroll]: 3.306e-05 [a_1]: 0.00060789 [with_stream_mark]: 1.581e-05 [recompute_prepare]: 7.71001e-06 [updatestate_depend_eliminate]: 4.2e-06 [updatestate_assign_eliminate]: 3.38999e-06 [updatestate_loads_eliminate]: 2.84001e-06 [parameter_eliminate]: 1.81e-06 [a_2]: 7.021e-05 [accelerated_algorithm]: 5.84e-06 [shard]: 1.48002e-06 [meta_shard_fg_expand]: 1.79e-06 [shard_inline]: 5.51998e-06 [merge_send_recv]: 7.38e-06 [auto_parallel]: 5.59998e-06 [parallel]: 6.954e-05 [flash_sp]: 7.85e-06 [merge_comm]: 4.62e-06 [allreduce_fusion]: 3.30003e-06 [matmul_add_comm_reduction]: 9.05001e-06 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 8.70001e-06 [virtual_dataset]: 6.36998e-06 [get_grad_eliminate_]: 5.87999e-06 [virtual_output]: 5.72001e-06 [merge_forward]: 3.86999e-06 [cell_reuse_recompute_pass]: 1.14998e-06 [offload_activation]: 9.02e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.192e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 9.31e-06 [set_forward_comm_id_for_comm_node_pass]: 3.35e-06 [meta_fg_expand]: 2.86e-06 [flash_sp_send_recv_attached]: 2.37999e-06 [receive_attached]: 2.16e-06 [after_resolve]: 8.80001e-06 [a_after_grad]: 8.28999e-06 [renormalize]: 0.00124914 [add_forward_monad_depend]: 5.44998e-06 [auto_monad_grad]: 2.41e-06 [auto_monad_eliminator]: 1.59e-05 [cse]: 3.608e-05 [a_3]: 4.449e-05 [Cycle 2]: 0.00059033, [45] [expand_dump_flag]: 1.64e-06 [switch_simplify]: 7.3e-06 [loop_unroll]: 6.54001e-06 [a_1]: 0.00010031 [with_stream_mark]: 1.103e-05 [recompute_prepare]: 5.47999e-06 [updatestate_depend_eliminate]: 2.81999e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.30002e-06 [parameter_eliminate]: 9.60019e-07 [a_2]: 6.586e-05 [accelerated_algorithm]: 5.59998e-06 [shard]: 1.18001e-06 [meta_shard_fg_expand]: 1.53002e-06 [shard_inline]: 5.30999e-06 [merge_send_recv]: 4.83001e-06 [auto_parallel]: 5.57999e-06 [parallel]: 4.60999e-06 [flash_sp]: 3.04001e-06 [merge_comm]: 3.32997e-06 [allreduce_fusion]: 2.95002e-06 [matmul_add_comm_reduction]: 5.49e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 7.18998e-06 [virtual_dataset]: 5.39e-06 [get_grad_eliminate_]: 5.07e-06 [virtual_output]: 5.62001e-06 [merge_forward]: 3.26001e-06 [cell_reuse_recompute_pass]: 1.43002e-06 [offload_activation]: 6.15002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.374e-05 [merge_recompute_call_nodes]: 9.5999e-07 [before_grad]: 8.56002e-06 [set_forward_comm_id_for_comm_node_pass]: 2.91e-06 [meta_fg_expand]: 2.13002e-06 [flash_sp_send_recv_attached]: 9.39996e-07 [receive_attached]: 1.05001e-06 [after_resolve]: 7.45998e-06 [a_after_grad]: 7.48999e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.59e-06 [auto_monad_grad]: 1.14e-06 [auto_monad_eliminator]: 7.38e-06 [cse]: 1.637e-05 [a_3]: 3.272e-05 [py_interpret_to_execute_after_opt_a]: 4.92e-06 [slice_cell_reuse_recomputed_activation]: 2.63e-06 [rewriter_after_opt_a]: 1.897e-05 [convert_after_rewriter]: 1.65001e-06 [order_py_execute_after_rewriter]: 1.15001e-06 [mutable_eliminate]: 0.00057944 [opt_b]: 0.00018411, [1] [Cycle 1]: 0.00017727, [7] [b_1]: 0.00010416 [b_2]: 7.19001e-06 [updatestate_depend_eliminate]: 5.32001e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.24001e-06 [renormalize]: 5.3001e-07 [cse]: 2.166e-05 [optimize_parallel_all_gather_comm]: 1.554e-05 [overlap_param_gather]: 2.46e-06 [cconv]: 2.489e-05 [loop_unroll]: 0.00041846 [opt_after_cconv]: 0.00012381, [1] [Cycle 1]: 0.00011792, [7] [c_1]: 2.459e-05 [parameter_eliminate]: 2.22999e-06 [updatestate_depend_eliminate]: 5.06002e-06 [updatestate_assign_eliminate]: 3.22002e-06 [updatestate_loads_eliminate]: 2.61e-06 [cse]: 2.012e-05 [renormalize]: 6.80011e-07 [remove_dup_value]: 3.261e-05 [tuple_transform]: 7.155e-05, [1] [Cycle 1]: 6.616e-05, [4] [d_1]: 3.756e-05 [none_parameter_eliminate]: 1.82001e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 6.54999e-06 [partial_unused_args_eliminate]: 1.71e-06 [add_recomputation]: 4.612e-05 [cse_after_recomputation]: 2.34e-05, [1] [Cycle 1]: 1.856e-05, [1] [cse]: 1.255e-05 [environ_conv]: 8.13999e-06 [swap_dp_allreduce_reducescatter]: 5.52001e-06 [bias_add_comm_swap]: 2.68e-06 [label_micro_interleaved_index]: 3.94997e-06 [label_fine_grained_interleaved_index]: 3.18e-06 [merge_cast_opt]: 1.29e-06 [slice_recompute_activation]: 2.19001e-06 [micro_interleaved_order_control]: 2.44999e-06 [assign_add_opt]: 1.14998e-06 [ForceFp32Comm]: 7.10017e-07 [remove_cast_before_assign_add]: 1.07e-06 [full_micro_interleaved_order_control]: 2.56998e-06 [reorder_send_recv_between_fp_bp]: 2.42001e-06 [comm_op_add_attrs]: 9.50007e-07 [add_comm_op_reuse_tag]: 1.01002e-06 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.14e-06 [overlap_opt_shard_in_pipeline]: 1.25001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.93997e-06 [control_data_broadcast_order]: 1.184e-05 [grouped_pairwise_exchange_alltoall]: 1.43002e-06 [offloading_packed_experts]: 3.93001e-06 [overlap_recompute_and_grad_model_parallel]: 4.78001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.34e-06 [overlap_recompute_allgather_and_fa_grad]: 1.67001e-06 [overlap_recompute_comm]: 2.41998e-06 [overlap_grad_ring_attention]: 4.13001e-06 [overlap_grad_flash_sp]: 1.954e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.40002e-06 [split_layernorm_comm]: 1.66e-06 [handle_group_info]: 1.31002e-06 [symbol_engine_optimizer]: 8.097e-05, [1] [Cycle 1]: 7.632e-05, [6] [build]: 9.99001e-06 [elim_shapecalc]: 8.77e-06 [elim_not_effective]: 1.252e-05 [opt_reshape]: 6.53e-06 [fold_const_symbol]: 9.52999e-06 [renormalize]: 1.80007e-07 [detach_backward]: 1.87001e-06 [pipeline_parallel_scheduler]: 1.40999e-06 [auto_monad_reorder]: 1.639e-05 [get_jit_bprop_graph]: 1.57001e-06 [rewriter_after_jit_bprop_graph]: 3.75e-06 [opt_after_jit_grad]: 0.00047155 [validate]: 4.424e-05 [backend_pass]: 9.29984e-07 [task_emit]: 0.00882806 [execute]: 8e-06 Sums bootstrap : 0.000515s : 1.24% type_inference : 0.026822s : 64.61% event_method : 0.000023s : 0.06% auto_monad : 0.000080s : 0.19% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000023s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000038s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000253s : 0.61% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000088s : 0.21% optimize.opt_a.loop_unroll : 0.000040s : 0.10% optimize.opt_a.a_1 : 0.000708s : 1.71% optimize.opt_a.with_stream_mark : 0.000027s : 0.06% optimize.opt_a.recompute_prepare : 0.000013s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000136s : 0.33% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.03% optimize.opt_a.merge_send_recv : 0.000012s : 0.03% optimize.opt_a.auto_parallel : 0.000011s : 0.03% optimize.opt_a.parallel : 0.000074s : 0.18% optimize.opt_a.flash_sp : 0.000011s : 0.03% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.04% optimize.opt_a.virtual_dataset : 0.000012s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.03% optimize.opt_a.virtual_output : 0.000011s : 0.03% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000015s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000018s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000006s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000016s : 0.04% optimize.opt_a.a_after_grad : 0.000016s : 0.04% optimize.opt_a.renormalize : 0.001249s : 3.01% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.06% optimize.opt_a.cse : 0.000052s : 0.13% optimize.opt_a.a_3 : 0.000077s : 0.19% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000019s : 0.05% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000579s : 1.40% optimize.opt_b.b_1 : 0.000104s : 0.25% optimize.opt_b.b_2 : 0.000007s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000022s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000025s : 0.06% optimize.loop_unroll : 0.000418s : 1.01% optimize.opt_after_cconv.c_1 : 0.000025s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.05% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000033s : 0.08% optimize.tuple_transform.d_1 : 0.000038s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000046s : 0.11% optimize.cse_after_recomputation.cse : 0.000013s : 0.03% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.05% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000010s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000016s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000472s : 1.14% validate : 0.000044s : 0.11% backend_pass : 0.000001s : 0.00% task_emit : 0.008828s : 21.27% execute : 0.000008s : 0.02% Time group info: ------[substitution.] 0.000200 26 1.10% : 0.000002s : 2: substitution.elim_not_effective 0.75% : 0.000002s : 2: substitution.fold_const_symbol 3.02% : 0.000006s : 3: substitution.graph_param_transform 80.86% : 0.000162s : 6: substitution.inline 1.53% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.68% : 0.000005s : 4: substitution.remove_not_recompute_node 1.34% : 0.000003s : 2: substitution.replace_old_param 3.53% : 0.000007s : 1: substitution.switch_simplify 5.20% : 0.000010s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.026751 2 94.67% : 0.025325s : 1: type_inference.infer 5.33% : 0.001426s : 1: type_inference.specialize ------[replace.] 0.000090 9 58.02% : 0.000052s : 6: replace.inline 22.70% : 0.000020s : 1: replace.switch_simplify 19.28% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 9 91.01% : 0.000158s : 6: match.inline 3.62% : 0.000006s : 1: match.switch_simplify 5.37% : 0.000009s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000180 1092 0.97% : 0.000002s : 12: predicate.accumulaten_eliminater 0.73% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 6: predicate.addn_check_dump 0.91% : 0.000002s : 12: predicate.addn_zero_filter 0.86% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.39% : 0.000004s : 18: predicate.arithmetic_simplify 1.00% : 0.000002s : 12: predicate.cast_eliminate 0.55% : 0.000001s : 6: predicate.check_bprop_eliminate 0.62% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.44% : 0.000001s : 6: predicate.depend_value_elim 1.00% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.21% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.96% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 3: predicate.elim_not_effective 0.40% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.07% : 0.000002s : 15: predicate.environ_get_depend_swap 1.88% : 0.000003s : 21: predicate.environ_get_eliminate 1.06% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.59% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.43% : 0.000004s : 20: predicate.float_depend_g_call 0.46% : 0.000001s : 6: predicate.float_environ_get_switch 0.72% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 3: predicate.fold_const_symbol 0.57% : 0.000001s : 6: predicate.get_grad_eliminate 0.18% : 0.000000s : 3: predicate.graph_param_transform 0.51% : 0.000001s : 6: predicate.incorporate_call 0.45% : 0.000001s : 6: predicate.incorporate_call_switch 5.81% : 0.000010s : 50: predicate.inline 0.57% : 0.000001s : 6: predicate.inline_without_move 0.27% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.76% : 0.000001s : 6: predicate.less_batch_normalization 1.65% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.38% : 0.000004s : 32: predicate.load_eliminater 0.91% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.90% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.56% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 6: predicate.merge_addn 0.51% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.94% : 0.000002s : 12: predicate.minmaximum_grad 1.02% : 0.000002s : 3: predicate.mutable_eliminate 0.41% : 0.000001s : 3: predicate.opt_reshape 0.37% : 0.000001s : 3: predicate.parallel_virtual_node 2.05% : 0.000004s : 20: predicate.partial_defer_inline 1.41% : 0.000003s : 17: predicate.partial_eliminate 1.07% : 0.000002s : 12: predicate.print_const_string_wrapper 0.55% : 0.000001s : 6: predicate.reduce_all_const_elim 1.29% : 0.000002s : 12: predicate.reduce_eliminate 2.65% : 0.000005s : 32: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 6: predicate.remove_not_recompute_node 1.24% : 0.000002s : 20: predicate.replace_applicator 0.38% : 0.000001s : 6: predicate.replace_old_param 0.25% : 0.000000s : 3: predicate.reset_defer_inline 1.03% : 0.000002s : 12: predicate.reshape_eliminate 0.72% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.62% : 0.000001s : 3: predicate.row_tensor_eliminate 0.68% : 0.000001s : 6: predicate.same_eliminate 0.37% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.80% : 0.000001s : 6: predicate.shard_identity_eliminate 0.67% : 0.000001s : 6: predicate.special_op_eliminate 0.58% : 0.000001s : 6: predicate.specialize_transform 0.81% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.70% : 0.000003s : 20: predicate.switch_defer_inline 2.18% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.14% : 0.000011s : 68: predicate.switch_simplify 1.02% : 0.000002s : 12: predicate.tile_eliminate 0.86% : 0.000002s : 12: predicate.transpose_eliminate 1.57% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 2.97% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.41% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.84% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.24% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 3.07% : 0.000006s : 38: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 3: predicate.value_based_eliminate 0.79% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.72% : 0.000001s : 6: predicate.virtual_output_eliminate 0.22% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001117 16 53.66% : 0.000600s : 8: func_graph_cloner_run.FuncGraphClonerGraph 46.34% : 0.000518s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.057486 196 0.01% : 0.000004s : 1: ForceFp32Comm 6.04% : 0.003474s : 1: add_attr 6.02% : 0.003463s : 1: add_attr_with_inline 0.01% : 0.000005s : 1: add_comm_op_reuse_tag 0.09% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.15% : 0.000085s : 1: auto_monad 0.04% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.00% : 0.000576s : 1: bootstrap 0.05% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000015s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.05% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.05% : 0.000030s : 1: event_method 0.03% : 0.000014s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.74% : 0.000427s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.02% : 0.000588s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000012s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000014s : 1: opt.transform.mutable_eliminate 1.96% : 0.001126s : 78: opt.transform.opt_a 0.04% : 0.000023s : 1: opt.transform.opt_after_cconv 0.04% : 0.000021s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000085s : 28: opt.transform.opt_b 0.07% : 0.000041s : 2: opt.transform.opt_trans_graph 0.06% : 0.000034s : 4: opt.transform.symbol_engine_opt 5.64% : 0.003242s : 1: opt_a 0.22% : 0.000127s : 1: opt_after_cconv 0.84% : 0.000481s : 1: opt_after_jit_grad 0.33% : 0.000187s : 1: opt_b 9.49% : 0.005457s : 1: optimize 0.03% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000042s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000037s : 1: remove_dup_value 1.28% : 0.000733s : 1: renormalize.infer 0.88% : 0.000508s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000022s : 1: rewriter_after_opt_a 0.45% : 0.000258s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000084s : 1: symbol_engine_optimizer 15.38% : 0.008843s : 1: task_emit 0.13% : 0.000074s : 1: tuple_transform 46.70% : 0.026843s : 1: type_inference 0.14% : 0.000080s : 1: validate TotalTime = 0.0541898, [24] [bootstrap]: 0.00062176 [type_inference]: 0.0340096 [event_method]: 0.00011498 [auto_monad]: 0.00018393 [graph_reusing]: 1.203e-05 [inline]: 3.06999e-06 [add_attr]: 0.00379887, [1] [add_attr_with_inline]: 0.00378768, [1] [Cycle 1]: 7.816e-05, [2] [tag_attr]: 3.28e-05 [meta_addattr_fg_expand]: 8.02998e-06 [parallel-infer-symbol]: 3.93999e-06 [pre_auto_parallel]: 4.508e-05 [insert-virtual-dataset]: 2.78e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 2.04e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00673382, [53] [py_interpret_to_execute]: 6.68e-06 [rewriter_before_opt_a]: 0.00024785 [opt_a]: 0.00412129, [2] [Cycle 1]: 0.00339073, [45] [expand_dump_flag]: 4.41002e-06 [switch_simplify]: 8.903e-05 [loop_unroll]: 3.996e-05 [a_1]: 0.00080657 [with_stream_mark]: 1.946e-05 [recompute_prepare]: 1.028e-05 [updatestate_depend_eliminate]: 5.35999e-06 [updatestate_assign_eliminate]: 4.43999e-06 [updatestate_loads_eliminate]: 4.55999e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 9.777e-05 [accelerated_algorithm]: 8.2e-06 [shard]: 1.98002e-06 [meta_shard_fg_expand]: 2.74999e-06 [shard_inline]: 6.96001e-06 [merge_send_recv]: 1.019e-05 [auto_parallel]: 8.66002e-06 [parallel]: 1.904e-05 [flash_sp]: 9.54e-06 [merge_comm]: 4.55001e-06 [allreduce_fusion]: 4.32e-06 [matmul_add_comm_reduction]: 1.018e-05 [allreduce_slice_to_reducescatter]: 1.52001e-06 [virtual_shard_identity]: 8.20999e-06 [virtual_dataset]: 7.71001e-06 [get_grad_eliminate_]: 6.78e-06 [virtual_output]: 7.2e-06 [merge_forward]: 4.69998e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.148e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.353e-05 [merge_recompute_call_nodes]: 1.46998e-06 [before_grad]: 1.168e-05 [set_forward_comm_id_for_comm_node_pass]: 4.22998e-06 [meta_fg_expand]: 4.05e-06 [flash_sp_send_recv_attached]: 2.61e-06 [receive_attached]: 2.63e-06 [after_resolve]: 1.07e-05 [a_after_grad]: 1.041e-05 [renormalize]: 0.00172653 [add_forward_monad_depend]: 6.34001e-06 [auto_monad_grad]: 2.48002e-06 [auto_monad_eliminator]: 2.051e-05 [cse]: 4.327e-05 [a_3]: 5.644e-05 [Cycle 2]: 0.0007187, [45] [expand_dump_flag]: 2.16e-06 [switch_simplify]: 9.00001e-06 [loop_unroll]: 7.25e-06 [a_1]: 0.00016143 [with_stream_mark]: 1.546e-05 [recompute_prepare]: 7.08e-06 [updatestate_depend_eliminate]: 4.28999e-06 [updatestate_assign_eliminate]: 3.33998e-06 [updatestate_loads_eliminate]: 3.53e-06 [parameter_eliminate]: 1.03001e-06 [a_2]: 8.69e-05 [accelerated_algorithm]: 6.97002e-06 [shard]: 1.27999e-06 [meta_shard_fg_expand]: 1.77001e-06 [shard_inline]: 6.50997e-06 [merge_send_recv]: 6.61999e-06 [auto_parallel]: 7.48e-06 [parallel]: 5.63002e-06 [flash_sp]: 3.63e-06 [merge_comm]: 4.08999e-06 [allreduce_fusion]: 3.93999e-06 [matmul_add_comm_reduction]: 7.17002e-06 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 7.30998e-06 [virtual_dataset]: 6.79999e-06 [get_grad_eliminate_]: 6.66999e-06 [virtual_output]: 6.29999e-06 [merge_forward]: 4.05e-06 [cell_reuse_recompute_pass]: 2.19001e-06 [offload_activation]: 8.33999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.246e-05 [merge_recompute_call_nodes]: 1.06997e-06 [before_grad]: 1.059e-05 [set_forward_comm_id_for_comm_node_pass]: 4.57998e-06 [meta_fg_expand]: 3.04999e-06 [flash_sp_send_recv_attached]: 1.02998e-06 [receive_attached]: 1.50001e-06 [after_resolve]: 9.72001e-06 [a_after_grad]: 9.69999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.50999e-06 [auto_monad_grad]: 1.07998e-06 [auto_monad_eliminator]: 8.2e-06 [cse]: 2.134e-05 [a_3]: 3.937e-05 [py_interpret_to_execute_after_opt_a]: 6.12999e-06 [slice_cell_reuse_recomputed_activation]: 2.01e-06 [rewriter_after_opt_a]: 2.484e-05 [convert_after_rewriter]: 1.20001e-06 [order_py_execute_after_rewriter]: 1.93997e-06 [mutable_eliminate]: 0.0006629 [opt_b]: 0.00025604, [1] [Cycle 1]: 0.0002492, [7] [b_1]: 0.00016041 [b_2]: 8.84e-06 [updatestate_depend_eliminate]: 6.99001e-06 [updatestate_assign_eliminate]: 3.29001e-06 [updatestate_loads_eliminate]: 3.11999e-06 [renormalize]: 6.30011e-07 [cse]: 2.887e-05 [optimize_parallel_all_gather_comm]: 1.931e-05 [overlap_param_gather]: 2.48e-06 [cconv]: 2.641e-05 [loop_unroll]: 0.00053615 [opt_after_cconv]: 0.00011679, [1] [Cycle 1]: 0.00011064, [7] [c_1]: 3.187e-05 [parameter_eliminate]: 3.64002e-06 [updatestate_depend_eliminate]: 7e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 2.91e-06 [cse]: 2.762e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 4.028e-05 [tuple_transform]: 0.00010024, [1] [Cycle 1]: 9.559e-05, [4] [d_1]: 6.532e-05 [none_parameter_eliminate]: 2.02999e-06 [renormalize]: 2.40019e-07 [switch_simplify]: 8.02e-06 [partial_unused_args_eliminate]: 1.81e-06 [add_recomputation]: 5.253e-05 [cse_after_recomputation]: 2.68e-05, [1] [Cycle 1]: 2.186e-05, [1] [cse]: 1.639e-05 [environ_conv]: 1.103e-05 [swap_dp_allreduce_reducescatter]: 6.58e-06 [bias_add_comm_swap]: 2.36e-06 [label_micro_interleaved_index]: 4.47e-06 [label_fine_grained_interleaved_index]: 2.79001e-06 [merge_cast_opt]: 1.24998e-06 [slice_recompute_activation]: 2.42001e-06 [micro_interleaved_order_control]: 2.49001e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 8.49977e-07 [remove_cast_before_assign_add]: 1.28002e-06 [full_micro_interleaved_order_control]: 2.43e-06 [reorder_send_recv_between_fp_bp]: 2.86e-06 [comm_op_add_attrs]: 1.25001e-06 [add_comm_op_reuse_tag]: 1.46998e-06 [interleave_split_concat_branches]: 1.19998e-06 [interleave_parallel_branches]: 1.27e-06 [overlap_opt_shard_in_pipeline]: 1.19998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86003e-06 [control_data_broadcast_order]: 1.763e-05 [grouped_pairwise_exchange_alltoall]: 1.57999e-06 [offloading_packed_experts]: 4.87e-06 [overlap_recompute_and_grad_model_parallel]: 6.07999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.46998e-06 [overlap_recompute_comm]: 2.07999e-06 [overlap_grad_ring_attention]: 5.47999e-06 [overlap_grad_flash_sp]: 2.322e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.46998e-06 [split_layernorm_comm]: 1.82001e-06 [handle_group_info]: 1.14003e-06 [symbol_engine_optimizer]: 9.036e-05, [1] [Cycle 1]: 8.556e-05, [6] [build]: 1.107e-05 [elim_shapecalc]: 1.098e-05 [elim_not_effective]: 1.457e-05 [opt_reshape]: 7.93999e-06 [fold_const_symbol]: 1.147e-05 [renormalize]: 2.29978e-07 [detach_backward]: 2.27999e-06 [pipeline_parallel_scheduler]: 1.60001e-06 [auto_monad_reorder]: 2.085e-05 [get_jit_bprop_graph]: 1.86003e-06 [rewriter_after_jit_bprop_graph]: 4.74e-06 [opt_after_jit_grad]: 0.00057722 [validate]: 4.826e-05 [backend_pass]: 1.07e-06 [task_emit]: 0.00772269 [execute]: 1.043e-05 Sums bootstrap : 0.000622s : 1.26% type_inference : 0.034010s : 69.01% event_method : 0.000115s : 0.23% auto_monad : 0.000184s : 0.37% graph_reusing : 0.000012s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000033s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000045s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.01% optimize.rewriter_before_opt_a : 0.000248s : 0.50% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000098s : 0.20% optimize.opt_a.loop_unroll : 0.000047s : 0.10% optimize.opt_a.a_1 : 0.000968s : 1.96% optimize.opt_a.with_stream_mark : 0.000035s : 0.07% optimize.opt_a.recompute_prepare : 0.000017s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000185s : 0.37% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.03% optimize.opt_a.merge_send_recv : 0.000017s : 0.03% optimize.opt_a.auto_parallel : 0.000016s : 0.03% optimize.opt_a.parallel : 0.000025s : 0.05% optimize.opt_a.flash_sp : 0.000013s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.03% optimize.opt_a.virtual_dataset : 0.000015s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.03% optimize.opt_a.virtual_output : 0.000013s : 0.03% optimize.opt_a.merge_forward : 0.000009s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000020s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000020s : 0.04% optimize.opt_a.a_after_grad : 0.000020s : 0.04% optimize.opt_a.renormalize : 0.001727s : 3.50% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.02% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.06% optimize.opt_a.cse : 0.000065s : 0.13% optimize.opt_a.a_3 : 0.000096s : 0.19% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000025s : 0.05% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000663s : 1.35% optimize.opt_b.b_1 : 0.000160s : 0.33% optimize.opt_b.b_2 : 0.000009s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000026s : 0.05% optimize.loop_unroll : 0.000536s : 1.09% optimize.opt_after_cconv.c_1 : 0.000032s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000028s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000040s : 0.08% optimize.tuple_transform.d_1 : 0.000065s : 0.13% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000053s : 0.11% optimize.cse_after_recomputation.cse : 0.000016s : 0.03% optimize.environ_conv : 0.000011s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000023s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000577s : 1.17% validate : 0.000048s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.007723s : 15.67% execute : 0.000010s : 0.02% Time group info: ------[substitution.] 0.000330 62 0.72% : 0.000002s : 3: substitution.elim_not_effective 2.16% : 0.000007s : 3: substitution.float_tuple_getitem_switch 0.55% : 0.000002s : 3: substitution.fold_const_symbol 2.02% : 0.000007s : 4: substitution.graph_param_transform 61.70% : 0.000204s : 8: substitution.inline 1.45% : 0.000005s : 6: substitution.j_node_and_user_rematch 1.44% : 0.000005s : 2: substitution.minmaximum_grad 1.76% : 0.000006s : 6: substitution.remove_not_recompute_node 1.06% : 0.000003s : 2: substitution.replace_old_param 2.44% : 0.000008s : 1: substitution.switch_simplify 5.14% : 0.000017s : 4: substitution.tuple_list_convert_item_index_to_positive 2.10% : 0.000007s : 4: substitution.tuple_list_get_item_const_eliminator 3.46% : 0.000011s : 4: substitution.tuple_list_get_item_depend_reorder 10.71% : 0.000035s : 8: substitution.tuple_list_get_item_eliminator 3.29% : 0.000011s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.033918 2 94.01% : 0.031885s : 1: type_inference.infer 5.99% : 0.002033s : 1: type_inference.specialize ------[replace.] 0.000097 11 62.33% : 0.000061s : 8: replace.inline 19.64% : 0.000019s : 1: replace.switch_simplify 18.03% : 0.000018s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000210 11 94.75% : 0.000199s : 8: match.inline 3.36% : 0.000007s : 1: match.switch_simplify 1.89% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000244 1438 1.03% : 0.000003s : 16: predicate.accumulaten_eliminater 0.99% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 1.19% : 0.000003s : 16: predicate.addn_zero_filter 0.96% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.18% : 0.000005s : 24: predicate.arithmetic_simplify 1.10% : 0.000003s : 16: predicate.cast_eliminate 0.71% : 0.000002s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.14% : 0.000000s : 4: predicate.const_output_eliminate 0.47% : 0.000001s : 8: predicate.depend_value_elim 1.00% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.16% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.99% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.27% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_depend_swap 1.70% : 0.000004s : 28: predicate.environ_get_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.47% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.50% : 0.000006s : 26: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.81% : 0.000002s : 8: predicate.get_grad_eliminate 0.25% : 0.000001s : 4: predicate.graph_param_transform 0.50% : 0.000001s : 8: predicate.incorporate_call 0.39% : 0.000001s : 8: predicate.incorporate_call_switch 5.62% : 0.000014s : 66: predicate.inline 0.67% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 8: predicate.less_batch_normalization 1.82% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.50% : 0.000006s : 42: predicate.load_eliminater 0.98% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.76% : 0.000007s : 46: predicate.loop_unroll_before_grad 1.75% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.47% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.45% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.93% : 0.000002s : 16: predicate.minmaximum_grad 0.98% : 0.000002s : 4: predicate.mutable_eliminate 0.29% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 1.97% : 0.000005s : 26: predicate.partial_defer_inline 1.33% : 0.000003s : 22: predicate.partial_eliminate 0.96% : 0.000002s : 16: predicate.print_const_string_wrapper 0.63% : 0.000002s : 8: predicate.reduce_all_const_elim 1.39% : 0.000003s : 16: predicate.reduce_eliminate 2.60% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.26% : 0.000001s : 8: predicate.remove_not_recompute_node 1.20% : 0.000003s : 26: predicate.replace_applicator 0.31% : 0.000001s : 8: predicate.replace_old_param 0.17% : 0.000000s : 4: predicate.reset_defer_inline 1.15% : 0.000003s : 16: predicate.reshape_eliminate 0.54% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.53% : 0.000001s : 4: predicate.row_tensor_eliminate 0.79% : 0.000002s : 8: predicate.same_eliminate 0.32% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 8: predicate.shard_identity_eliminate 0.59% : 0.000001s : 8: predicate.special_op_eliminate 0.58% : 0.000001s : 8: predicate.specialize_transform 0.80% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.64% : 0.000004s : 26: predicate.switch_defer_inline 2.08% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.76% : 0.000014s : 86: predicate.switch_simplify 1.09% : 0.000003s : 16: predicate.tile_eliminate 1.00% : 0.000002s : 16: predicate.transpose_eliminate 1.69% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.80% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000008s : 34: predicate.tuple_list_get_item_eliminator 1.62% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.63% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.23% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 2.79% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.78% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.84% : 0.000002s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001696 23 57.19% : 0.000970s : 11: func_graph_cloner_run.FuncGraphClonerGraph 42.81% : 0.000726s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.068184 196 0.07% : 0.000046s : 1: ForceFp32Comm 5.58% : 0.003805s : 1: add_attr 5.56% : 0.003792s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000056s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.28% : 0.000194s : 1: auto_monad 0.04% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.97% : 0.000661s : 1: bootstrap 0.04% : 0.000030s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000021s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.04% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000015s : 1: environ_conv 0.18% : 0.000125s : 1: event_method 0.02% : 0.000017s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000016s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.80% : 0.000546s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.99% : 0.000673s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000017s : 1: opt.transform.mutable_eliminate 2.19% : 0.001496s : 78: opt.transform.opt_a 0.04% : 0.000031s : 1: opt.transform.opt_after_cconv 0.04% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000140s : 28: opt.transform.opt_b 0.10% : 0.000070s : 2: opt.transform.opt_trans_graph 0.06% : 0.000042s : 4: opt.transform.symbol_engine_opt 6.05% : 0.004125s : 1: opt_a 0.18% : 0.000121s : 1: opt_after_cconv 0.86% : 0.000587s : 1: opt_after_jit_grad 0.38% : 0.000259s : 1: opt_b 9.88% : 0.006739s : 1: optimize 0.03% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000005s : 1: order_py_execute_after_rewriter 0.04% : 0.000026s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000049s : 1: pre_auto_parallel 0.01% : 0.000010s : 1: py_interpret_to_execute 0.01% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.07% : 0.000044s : 1: remove_dup_value 1.42% : 0.000969s : 1: renormalize.infer 1.10% : 0.000748s : 1: renormalize.specialize 0.01% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000028s : 1: rewriter_after_opt_a 0.37% : 0.000254s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000093s : 1: symbol_engine_optimizer 11.35% : 0.007741s : 1: task_emit 0.15% : 0.000103s : 1: tuple_transform 49.92% : 0.034036s : 1: type_inference 0.13% : 0.000090s : 1: validate TotalTime = 0.0493063, [24] [bootstrap]: 0.00045321 [type_inference]: 0.0305568 [event_method]: 0.00010126 [auto_monad]: 0.00017276 [graph_reusing]: 1.257e-05 [inline]: 2.17999e-06 [add_attr]: 0.00338482, [1] [add_attr_with_inline]: 0.00337524, [1] [Cycle 1]: 7.154e-05, [2] [tag_attr]: 2.975e-05 [meta_addattr_fg_expand]: 7.98001e-06 [parallel-infer-symbol]: 4.03001e-06 [pre_auto_parallel]: 4.355e-05 [insert-virtual-dataset]: 2.51e-06 [parallel-infer-symbol-second]: 7.90023e-07 [dataset_repeat_opt]: 1.64e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00627611, [53] [py_interpret_to_execute]: 4.97e-06 [rewriter_before_opt_a]: 0.00023696 [opt_a]: 0.00386734, [2] [Cycle 1]: 0.00314567, [45] [expand_dump_flag]: 3.88001e-06 [switch_simplify]: 8.471e-05 [loop_unroll]: 3.916e-05 [a_1]: 0.00077998 [with_stream_mark]: 1.656e-05 [recompute_prepare]: 8.78001e-06 [updatestate_depend_eliminate]: 4.87998e-06 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 4.27e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 9.65e-05 [accelerated_algorithm]: 7.51001e-06 [shard]: 1.80001e-06 [meta_shard_fg_expand]: 2.52001e-06 [shard_inline]: 7.01001e-06 [merge_send_recv]: 9.99999e-06 [auto_parallel]: 8.15999e-06 [parallel]: 1.933e-05 [flash_sp]: 8.17e-06 [merge_comm]: 4.62998e-06 [allreduce_fusion]: 4.1e-06 [matmul_add_comm_reduction]: 1.054e-05 [allreduce_slice_to_reducescatter]: 1.01002e-06 [virtual_shard_identity]: 8.62e-06 [virtual_dataset]: 7.01001e-06 [get_grad_eliminate_]: 6.71e-06 [virtual_output]: 6.68e-06 [merge_forward]: 4.77998e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 1.024e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.343e-05 [merge_recompute_call_nodes]: 1.72001e-06 [before_grad]: 1.212e-05 [set_forward_comm_id_for_comm_node_pass]: 4.45e-06 [meta_fg_expand]: 3.75e-06 [flash_sp_send_recv_attached]: 2.74999e-06 [receive_attached]: 2.69001e-06 [after_resolve]: 1e-05 [a_after_grad]: 1.018e-05 [renormalize]: 0.00154461 [add_forward_monad_depend]: 5.61e-06 [auto_monad_grad]: 2.51998e-06 [auto_monad_eliminator]: 1.908e-05 [cse]: 3.837e-05 [a_3]: 5.486e-05 [Cycle 2]: 0.00071067, [45] [expand_dump_flag]: 2.01e-06 [switch_simplify]: 9.10001e-06 [loop_unroll]: 7.19001e-06 [a_1]: 0.0001606 [with_stream_mark]: 1.523e-05 [recompute_prepare]: 6.91999e-06 [updatestate_depend_eliminate]: 4.30999e-06 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 3.41999e-06 [parameter_eliminate]: 1.07e-06 [a_2]: 8.636e-05 [accelerated_algorithm]: 6.81999e-06 [shard]: 1.23002e-06 [meta_shard_fg_expand]: 1.81e-06 [shard_inline]: 6.68e-06 [merge_send_recv]: 6.21998e-06 [auto_parallel]: 7.71001e-06 [parallel]: 5.86e-06 [flash_sp]: 3.55e-06 [merge_comm]: 4.48999e-06 [allreduce_fusion]: 3.83001e-06 [matmul_add_comm_reduction]: 7.81001e-06 [allreduce_slice_to_reducescatter]: 5.00004e-07 [virtual_shard_identity]: 7.45998e-06 [virtual_dataset]: 6.29999e-06 [get_grad_eliminate_]: 6.03998e-06 [virtual_output]: 6.49001e-06 [merge_forward]: 3.63999e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 7.63999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.208e-05 [merge_recompute_call_nodes]: 7.50006e-07 [before_grad]: 1.084e-05 [set_forward_comm_id_for_comm_node_pass]: 4.50001e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 1.05001e-06 [receive_attached]: 1.40999e-06 [after_resolve]: 9.27001e-06 [a_after_grad]: 9.17001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.07e-06 [auto_monad_grad]: 1.24998e-06 [auto_monad_eliminator]: 7.88999e-06 [cse]: 2.018e-05 [a_3]: 3.94e-05 [py_interpret_to_execute_after_opt_a]: 5.84999e-06 [slice_cell_reuse_recomputed_activation]: 2.17001e-06 [rewriter_after_opt_a]: 2.402e-05 [convert_after_rewriter]: 1.15001e-06 [order_py_execute_after_rewriter]: 1.34e-06 [mutable_eliminate]: 0.00057732 [opt_b]: 0.00025017, [1] [Cycle 1]: 0.00024369, [7] [b_1]: 0.00016108 [b_2]: 9.72999e-06 [updatestate_depend_eliminate]: 6.27001e-06 [updatestate_assign_eliminate]: 3.48e-06 [updatestate_loads_eliminate]: 3.04001e-06 [renormalize]: 3.4002e-07 [cse]: 2.435e-05 [optimize_parallel_all_gather_comm]: 1.807e-05 [overlap_param_gather]: 2.51e-06 [cconv]: 2.534e-05 [loop_unroll]: 0.00045792 [opt_after_cconv]: 0.00010911, [1] [Cycle 1]: 0.0001038, [7] [c_1]: 3.125e-05 [parameter_eliminate]: 2.50997e-06 [updatestate_depend_eliminate]: 6.50002e-06 [updatestate_assign_eliminate]: 3.36001e-06 [updatestate_loads_eliminate]: 3.04999e-06 [cse]: 2.355e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.868e-05 [tuple_transform]: 9.326e-05, [1] [Cycle 1]: 8.883e-05, [4] [d_1]: 6.164e-05 [none_parameter_eliminate]: 1.72999e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 7.25998e-06 [partial_unused_args_eliminate]: 2.13002e-06 [add_recomputation]: 5.111e-05 [cse_after_recomputation]: 2.648e-05, [1] [Cycle 1]: 2.237e-05, [1] [cse]: 1.699e-05 [environ_conv]: 1.011e-05 [swap_dp_allreduce_reducescatter]: 6.17999e-06 [bias_add_comm_swap]: 2.74999e-06 [label_micro_interleaved_index]: 4.15e-06 [label_fine_grained_interleaved_index]: 2.96001e-06 [merge_cast_opt]: 1.44e-06 [slice_recompute_activation]: 2.14999e-06 [micro_interleaved_order_control]: 2.73998e-06 [assign_add_opt]: 1.25001e-06 [ForceFp32Comm]: 1.10999e-06 [remove_cast_before_assign_add]: 1.41998e-06 [full_micro_interleaved_order_control]: 2.59999e-06 [reorder_send_recv_between_fp_bp]: 2.76e-06 [comm_op_add_attrs]: 1.09e-06 [add_comm_op_reuse_tag]: 1.24e-06 [interleave_split_concat_branches]: 1.24e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.54e-06 [overlap_opt_shard_grad_in_pipeline]: 1.76e-06 [control_data_broadcast_order]: 1.693e-05 [grouped_pairwise_exchange_alltoall]: 1.89e-06 [offloading_packed_experts]: 4.64998e-06 [overlap_recompute_and_grad_model_parallel]: 5.42999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.44999e-06 [overlap_grad_ring_attention]: 4.80001e-06 [overlap_grad_flash_sp]: 2.105e-05 [begin_end_overlap_inline]: 4.59986e-07 [split_matmul_comm_elemetwise]: 2.30002e-06 [split_layernorm_comm]: 1.77001e-06 [handle_group_info]: 1.11002e-06 [symbol_engine_optimizer]: 0.00012941, [1] [Cycle 1]: 0.00012478, [6] [build]: 1.133e-05 [elim_shapecalc]: 4.727e-05 [elim_not_effective]: 1.669e-05 [opt_reshape]: 7.8e-06 [fold_const_symbol]: 1.148e-05 [renormalize]: 2.09984e-07 [detach_backward]: 2.25002e-06 [pipeline_parallel_scheduler]: 1.54998e-06 [auto_monad_reorder]: 2.023e-05 [get_jit_bprop_graph]: 1.64e-06 [rewriter_after_jit_bprop_graph]: 4.12e-06 [opt_after_jit_grad]: 0.00052797 [validate]: 4.587e-05 [backend_pass]: 9.09989e-07 [task_emit]: 0.00744346 [execute]: 9.15999e-06 Sums bootstrap : 0.000453s : 1.01% type_inference : 0.030557s : 68.10% event_method : 0.000101s : 0.23% auto_monad : 0.000173s : 0.39% graph_reusing : 0.000013s : 0.03% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000044s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000237s : 0.53% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000094s : 0.21% optimize.opt_a.loop_unroll : 0.000046s : 0.10% optimize.opt_a.a_1 : 0.000941s : 2.10% optimize.opt_a.with_stream_mark : 0.000032s : 0.07% optimize.opt_a.recompute_prepare : 0.000016s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000183s : 0.41% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.03% optimize.opt_a.merge_send_recv : 0.000016s : 0.04% optimize.opt_a.auto_parallel : 0.000016s : 0.04% optimize.opt_a.parallel : 0.000025s : 0.06% optimize.opt_a.flash_sp : 0.000012s : 0.03% optimize.opt_a.merge_comm : 0.000009s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.04% optimize.opt_a.virtual_dataset : 0.000013s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.03% optimize.opt_a.virtual_output : 0.000013s : 0.03% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000018s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000023s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000019s : 0.04% optimize.opt_a.a_after_grad : 0.000019s : 0.04% optimize.opt_a.renormalize : 0.001545s : 3.44% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.06% optimize.opt_a.cse : 0.000059s : 0.13% optimize.opt_a.a_3 : 0.000094s : 0.21% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000024s : 0.05% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000577s : 1.29% optimize.opt_b.b_1 : 0.000161s : 0.36% optimize.opt_b.b_2 : 0.000010s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.04% optimize.overlap_param_gather : 0.000003s : 0.01% optimize.cconv : 0.000025s : 0.06% optimize.loop_unroll : 0.000458s : 1.02% optimize.opt_after_cconv.c_1 : 0.000031s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.04% optimize.tuple_transform.d_1 : 0.000062s : 0.14% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000051s : 0.11% optimize.cse_after_recomputation.cse : 0.000017s : 0.04% optimize.environ_conv : 0.000010s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000021s : 0.05% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000047s : 0.11% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000528s : 1.18% validate : 0.000046s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.007443s : 16.59% execute : 0.000009s : 0.02% Time group info: ------[substitution.] 0.000303 62 0.81% : 0.000002s : 3: substitution.elim_not_effective 2.13% : 0.000006s : 3: substitution.float_tuple_getitem_switch 0.60% : 0.000002s : 3: substitution.fold_const_symbol 1.90% : 0.000006s : 4: substitution.graph_param_transform 61.57% : 0.000186s : 8: substitution.inline 1.52% : 0.000005s : 6: substitution.j_node_and_user_rematch 1.57% : 0.000005s : 2: substitution.minmaximum_grad 1.89% : 0.000006s : 6: substitution.remove_not_recompute_node 0.95% : 0.000003s : 2: substitution.replace_old_param 2.34% : 0.000007s : 1: substitution.switch_simplify 5.20% : 0.000016s : 4: substitution.tuple_list_convert_item_index_to_positive 2.40% : 0.000007s : 4: substitution.tuple_list_get_item_const_eliminator 3.57% : 0.000011s : 4: substitution.tuple_list_get_item_depend_reorder 10.00% : 0.000030s : 8: substitution.tuple_list_get_item_eliminator 3.56% : 0.000011s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.030433 2 93.75% : 0.028529s : 1: type_inference.infer 6.25% : 0.001903s : 1: type_inference.specialize ------[replace.] 0.000096 11 63.37% : 0.000061s : 8: replace.inline 18.95% : 0.000018s : 1: replace.switch_simplify 17.68% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000192 11 94.71% : 0.000182s : 8: match.inline 3.28% : 0.000006s : 1: match.switch_simplify 2.01% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000237 1438 1.11% : 0.000003s : 16: predicate.accumulaten_eliminater 0.90% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 0.96% : 0.000002s : 16: predicate.addn_zero_filter 0.86% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.22% : 0.000005s : 24: predicate.arithmetic_simplify 1.18% : 0.000003s : 16: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.46% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.51% : 0.000001s : 8: predicate.depend_value_elim 1.01% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 16: predicate.dict_get_item_eliminator 1.01% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.24% : 0.000001s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.24% : 0.000003s : 20: predicate.environ_get_depend_swap 1.89% : 0.000004s : 28: predicate.environ_get_eliminate 1.18% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.54% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.57% : 0.000006s : 26: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.85% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.27% : 0.000001s : 4: predicate.graph_param_transform 0.50% : 0.000001s : 8: predicate.incorporate_call 0.41% : 0.000001s : 8: predicate.incorporate_call_switch 5.77% : 0.000014s : 66: predicate.inline 0.66% : 0.000002s : 8: predicate.inline_without_move 0.25% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.77% : 0.000002s : 8: predicate.less_batch_normalization 1.81% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.58% : 0.000006s : 42: predicate.load_eliminater 0.88% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.77% : 0.000007s : 46: predicate.loop_unroll_before_grad 1.71% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.54% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 16: predicate.minmaximum_grad 0.88% : 0.000002s : 4: predicate.mutable_eliminate 0.38% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 2.15% : 0.000005s : 26: predicate.partial_defer_inline 1.37% : 0.000003s : 22: predicate.partial_eliminate 0.96% : 0.000002s : 16: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.40% : 0.000003s : 16: predicate.reduce_eliminate 2.52% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000001s : 8: predicate.remove_not_recompute_node 1.20% : 0.000003s : 26: predicate.replace_applicator 0.35% : 0.000001s : 8: predicate.replace_old_param 0.19% : 0.000000s : 4: predicate.reset_defer_inline 1.06% : 0.000003s : 16: predicate.reshape_eliminate 0.56% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 4: predicate.row_tensor_eliminate 0.66% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.75% : 0.000002s : 8: predicate.shard_identity_eliminate 0.62% : 0.000001s : 8: predicate.special_op_eliminate 0.57% : 0.000001s : 8: predicate.specialize_transform 0.71% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.66% : 0.000004s : 26: predicate.switch_defer_inline 2.15% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.91% : 0.000014s : 86: predicate.switch_simplify 0.98% : 0.000002s : 16: predicate.tile_eliminate 0.99% : 0.000002s : 16: predicate.transpose_eliminate 1.65% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000008s : 34: predicate.tuple_list_get_item_eliminator 1.60% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.75% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.31% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 2.97% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001539 23 56.35% : 0.000867s : 11: func_graph_cloner_run.FuncGraphClonerGraph 43.65% : 0.000672s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.062247 196 0.01% : 0.000004s : 1: ForceFp32Comm 5.45% : 0.003391s : 1: add_attr 5.43% : 0.003379s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000055s : 1: add_recomputation 0.06% : 0.000037s : 1: assign_add_opt 0.29% : 0.000182s : 1: auto_monad 0.04% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.78% : 0.000485s : 1: bootstrap 0.05% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000020s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.05% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000013s : 1: environ_conv 0.18% : 0.000111s : 1: event_method 0.02% : 0.000014s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000017s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.75% : 0.000467s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.94% : 0.000586s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.03% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.34% : 0.001455s : 78: opt.transform.opt_a 0.05% : 0.000030s : 1: opt.transform.opt_after_cconv 0.04% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.23% : 0.000141s : 28: opt.transform.opt_b 0.11% : 0.000067s : 2: opt.transform.opt_trans_graph 0.13% : 0.000079s : 4: opt.transform.symbol_engine_opt 6.22% : 0.003871s : 1: opt_a 0.18% : 0.000113s : 1: opt_after_cconv 0.86% : 0.000537s : 1: opt_after_jit_grad 0.41% : 0.000254s : 1: opt_b 10.09% : 0.006282s : 1: optimize 0.03% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000048s : 1: pre_auto_parallel 0.01% : 0.000009s : 1: py_interpret_to_execute 0.01% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000023s : 1: remove_dup_value 1.38% : 0.000860s : 1: renormalize.infer 1.08% : 0.000675s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000027s : 1: rewriter_after_opt_a 0.39% : 0.000243s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.21% : 0.000132s : 1: symbol_engine_optimizer 11.98% : 0.007459s : 1: task_emit 0.16% : 0.000097s : 1: tuple_transform 49.13% : 0.030580s : 1: type_inference 0.13% : 0.000081s : 1: validate TotalTime = 0.0442524, [24] [bootstrap]: 0.00043381 [type_inference]: 0.0256103 [event_method]: 2.267e-05 [auto_monad]: 8.277e-05 [graph_reusing]: 6.86001e-06 [inline]: 2.74999e-06 [add_attr]: 0.00353518, [1] [add_attr_with_inline]: 0.00352689, [1] [Cycle 1]: 6.333e-05, [2] [tag_attr]: 2.359e-05 [meta_addattr_fg_expand]: 6.49001e-06 [parallel-infer-symbol]: 3.61999e-06 [pre_auto_parallel]: 3.59e-05 [insert-virtual-dataset]: 2.88e-06 [parallel-infer-symbol-second]: 7.29982e-07 [dataset_repeat_opt]: 1.72999e-06 [pipeline_split]: 1.52001e-06 [optimize]: 0.00527729, [53] [py_interpret_to_execute]: 5.21002e-06 [rewriter_before_opt_a]: 0.00024445 [opt_a]: 0.00309916, [2] [Cycle 1]: 0.00252993, [45] [expand_dump_flag]: 3.51999e-06 [switch_simplify]: 8.032e-05 [loop_unroll]: 6.511e-05 [a_1]: 0.00060713 [with_stream_mark]: 1.518e-05 [recompute_prepare]: 7.88001e-06 [updatestate_depend_eliminate]: 3.98999e-06 [updatestate_assign_eliminate]: 3.4e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 6.96e-05 [accelerated_algorithm]: 6.25002e-06 [shard]: 1.75001e-06 [meta_shard_fg_expand]: 2.11998e-06 [shard_inline]: 5.77001e-06 [merge_send_recv]: 8.04997e-06 [auto_parallel]: 6.48998e-06 [parallel]: 1.802e-05 [flash_sp]: 7.97e-06 [merge_comm]: 3.68e-06 [allreduce_fusion]: 3.34001e-06 [matmul_add_comm_reduction]: 9.12001e-06 [allreduce_slice_to_reducescatter]: 7.80012e-07 [virtual_shard_identity]: 7.36999e-06 [virtual_dataset]: 5.82001e-06 [get_grad_eliminate_]: 5.51e-06 [virtual_output]: 5.66e-06 [merge_forward]: 3.73001e-06 [cell_reuse_recompute_pass]: 1.12e-06 [offload_activation]: 9.55001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.221e-05 [merge_recompute_call_nodes]: 1.82999e-06 [before_grad]: 9.66e-06 [set_forward_comm_id_for_comm_node_pass]: 3.75e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 2.40002e-06 [receive_attached]: 2.11e-06 [after_resolve]: 9.05001e-06 [a_after_grad]: 8.18999e-06 [renormalize]: 0.00117257 [add_forward_monad_depend]: 5.40001e-06 [auto_monad_grad]: 2.04e-06 [auto_monad_eliminator]: 1.583e-05 [cse]: 3.413e-05 [a_3]: 4.261e-05 [Cycle 2]: 0.00055949, [45] [expand_dump_flag]: 1.60001e-06 [switch_simplify]: 6.89001e-06 [loop_unroll]: 5.71e-06 [a_1]: 9.779e-05 [with_stream_mark]: 1.127e-05 [recompute_prepare]: 5.61e-06 [updatestate_depend_eliminate]: 3.33e-06 [updatestate_assign_eliminate]: 2.32999e-06 [updatestate_loads_eliminate]: 2.22001e-06 [parameter_eliminate]: 9.70002e-07 [a_2]: 6.216e-05 [accelerated_algorithm]: 5.30999e-06 [shard]: 1.02e-06 [meta_shard_fg_expand]: 1.43002e-06 [shard_inline]: 5.21998e-06 [merge_send_recv]: 4.59002e-06 [auto_parallel]: 5.19e-06 [parallel]: 4.67e-06 [flash_sp]: 3.13e-06 [merge_comm]: 2.93e-06 [allreduce_fusion]: 2.74999e-06 [matmul_add_comm_reduction]: 5.13002e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 6.37001e-06 [virtual_dataset]: 5.27001e-06 [get_grad_eliminate_]: 5.03002e-06 [virtual_output]: 4.97e-06 [merge_forward]: 2.98998e-06 [cell_reuse_recompute_pass]: 1.36002e-06 [offload_activation]: 6.26e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.16e-05 [merge_recompute_call_nodes]: 7.7e-07 [before_grad]: 8.26002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.16001e-06 [meta_fg_expand]: 2.01e-06 [flash_sp_send_recv_attached]: 7.80012e-07 [receive_attached]: 1.09003e-06 [after_resolve]: 8.50999e-06 [a_after_grad]: 7.52998e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.17e-06 [auto_monad_grad]: 8.79983e-07 [auto_monad_eliminator]: 6.31e-06 [cse]: 1.447e-05 [a_3]: 3.069e-05 [py_interpret_to_execute_after_opt_a]: 4.3e-06 [slice_cell_reuse_recomputed_activation]: 1.99999e-06 [rewriter_after_opt_a]: 1.697e-05 [convert_after_rewriter]: 1.23002e-06 [order_py_execute_after_rewriter]: 1.50999e-06 [mutable_eliminate]: 0.00053564 [opt_b]: 0.000183, [1] [Cycle 1]: 0.00017713, [7] [b_1]: 0.00010569 [b_2]: 6.56999e-06 [updatestate_depend_eliminate]: 5.46e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.26998e-06 [renormalize]: 6.29982e-07 [cse]: 1.989e-05 [optimize_parallel_all_gather_comm]: 1.605e-05 [overlap_param_gather]: 2.22999e-06 [cconv]: 2.528e-05 [loop_unroll]: 0.00047603 [opt_after_cconv]: 9.501e-05, [1] [Cycle 1]: 8.954e-05, [7] [c_1]: 2.47e-05 [parameter_eliminate]: 2.56e-06 [updatestate_depend_eliminate]: 5.16002e-06 [updatestate_assign_eliminate]: 2.62001e-06 [updatestate_loads_eliminate]: 2.27001e-06 [cse]: 1.931e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.546e-05 [tuple_transform]: 6.669e-05, [1] [Cycle 1]: 6.237e-05, [4] [d_1]: 3.602e-05 [none_parameter_eliminate]: 1.66002e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 5.99e-06 [partial_unused_args_eliminate]: 2.01e-06 [add_recomputation]: 4.504e-05 [cse_after_recomputation]: 2.267e-05, [1] [Cycle 1]: 1.85e-05, [1] [cse]: 1.29e-05 [environ_conv]: 7.83999e-06 [swap_dp_allreduce_reducescatter]: 5.41002e-06 [bias_add_comm_swap]: 2.73e-06 [label_micro_interleaved_index]: 4.72e-06 [label_fine_grained_interleaved_index]: 2.58e-06 [merge_cast_opt]: 1.36002e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.25002e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 7.30011e-07 [remove_cast_before_assign_add]: 1.09e-06 [full_micro_interleaved_order_control]: 2.22001e-06 [reorder_send_recv_between_fp_bp]: 2.71999e-06 [comm_op_add_attrs]: 1.39998e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.20999e-06 [interleave_parallel_branches]: 1.10001e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 1.72001e-06 [control_data_broadcast_order]: 1.23e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 3.58999e-06 [overlap_recompute_and_grad_model_parallel]: 4.30999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.03002e-06 [overlap_grad_ring_attention]: 4.03999e-06 [overlap_grad_flash_sp]: 1.921e-05 [begin_end_overlap_inline]: 5.8001e-07 [split_matmul_comm_elemetwise]: 2.74001e-06 [split_layernorm_comm]: 2.13002e-06 [handle_group_info]: 1.05001e-06 [symbol_engine_optimizer]: 8.012e-05, [1] [Cycle 1]: 7.574e-05, [6] [build]: 1.059e-05 [elim_shapecalc]: 8.96002e-06 [elim_not_effective]: 1.241e-05 [opt_reshape]: 6.18002e-06 [fold_const_symbol]: 9.25999e-06 [renormalize]: 2.19996e-07 [detach_backward]: 1.92999e-06 [pipeline_parallel_scheduler]: 1.55999e-06 [auto_monad_reorder]: 1.886e-05 [get_jit_bprop_graph]: 1.24e-06 [rewriter_after_jit_bprop_graph]: 3.5e-06 [opt_after_jit_grad]: 0.00048083 [validate]: 4.472e-05 [backend_pass]: 9.39996e-07 [task_emit]: 0.00845672 [execute]: 7.69002e-06 Sums bootstrap : 0.000434s : 1.09% type_inference : 0.025610s : 64.48% event_method : 0.000023s : 0.06% auto_monad : 0.000083s : 0.21% graph_reusing : 0.000007s : 0.02% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000036s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000244s : 0.62% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000087s : 0.22% optimize.opt_a.loop_unroll : 0.000071s : 0.18% optimize.opt_a.a_1 : 0.000705s : 1.77% optimize.opt_a.with_stream_mark : 0.000026s : 0.07% optimize.opt_a.recompute_prepare : 0.000013s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000132s : 0.33% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.03% optimize.opt_a.merge_send_recv : 0.000013s : 0.03% optimize.opt_a.auto_parallel : 0.000012s : 0.03% optimize.opt_a.parallel : 0.000023s : 0.06% optimize.opt_a.flash_sp : 0.000011s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.03% optimize.opt_a.virtual_dataset : 0.000011s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.03% optimize.opt_a.virtual_output : 0.000011s : 0.03% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000018s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000018s : 0.04% optimize.opt_a.a_after_grad : 0.000016s : 0.04% optimize.opt_a.renormalize : 0.001173s : 2.95% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.06% optimize.opt_a.cse : 0.000049s : 0.12% optimize.opt_a.a_3 : 0.000073s : 0.18% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000017s : 0.04% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000536s : 1.35% optimize.opt_b.b_1 : 0.000106s : 0.27% optimize.opt_b.b_2 : 0.000007s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000025s : 0.06% optimize.loop_unroll : 0.000476s : 1.20% optimize.opt_after_cconv.c_1 : 0.000025s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000019s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.04% optimize.tuple_transform.d_1 : 0.000036s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000045s : 0.11% optimize.cse_after_recomputation.cse : 0.000013s : 0.03% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000019s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000481s : 1.21% validate : 0.000045s : 0.11% backend_pass : 0.000001s : 0.00% task_emit : 0.008457s : 21.29% execute : 0.000008s : 0.02% Time group info: ------[substitution.] 0.000198 26 1.06% : 0.000002s : 2: substitution.elim_not_effective 0.86% : 0.000002s : 2: substitution.fold_const_symbol 2.65% : 0.000005s : 3: substitution.graph_param_transform 80.29% : 0.000159s : 6: substitution.inline 1.91% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.61% : 0.000005s : 4: substitution.remove_not_recompute_node 1.59% : 0.000003s : 2: substitution.replace_old_param 3.29% : 0.000007s : 1: substitution.switch_simplify 5.73% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.025537 2 94.54% : 0.024142s : 1: type_inference.infer 5.46% : 0.001394s : 1: type_inference.specialize ------[replace.] 0.000086 9 58.84% : 0.000051s : 6: replace.inline 22.60% : 0.000020s : 1: replace.switch_simplify 18.56% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000171 9 90.70% : 0.000155s : 6: match.inline 3.31% : 0.000006s : 1: match.switch_simplify 5.99% : 0.000010s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000176 1092 1.02% : 0.000002s : 12: predicate.accumulaten_eliminater 0.91% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 6: predicate.addn_check_dump 0.94% : 0.000002s : 12: predicate.addn_zero_filter 0.85% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.25% : 0.000004s : 18: predicate.arithmetic_simplify 1.00% : 0.000002s : 12: predicate.cast_eliminate 0.59% : 0.000001s : 6: predicate.check_bprop_eliminate 0.52% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.54% : 0.000001s : 6: predicate.depend_value_elim 1.06% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.18% : 0.000002s : 12: predicate.dict_get_item_eliminator 1.02% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.89% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 3: predicate.elim_not_effective 0.40% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.10% : 0.000002s : 15: predicate.environ_get_depend_swap 1.67% : 0.000003s : 21: predicate.environ_get_eliminate 1.06% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.58% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.79% : 0.000005s : 20: predicate.float_depend_g_call 0.47% : 0.000001s : 6: predicate.float_environ_get_switch 0.65% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.61% : 0.000001s : 6: predicate.get_grad_eliminate 0.21% : 0.000000s : 3: predicate.graph_param_transform 0.50% : 0.000001s : 6: predicate.incorporate_call 0.43% : 0.000001s : 6: predicate.incorporate_call_switch 6.01% : 0.000011s : 50: predicate.inline 0.60% : 0.000001s : 6: predicate.inline_without_move 0.30% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.73% : 0.000001s : 6: predicate.less_batch_normalization 1.65% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.47% : 0.000004s : 32: predicate.load_eliminater 0.98% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.99% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.85% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 6: predicate.merge_addn 0.50% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.64% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.85% : 0.000001s : 12: predicate.minmaximum_grad 1.08% : 0.000002s : 3: predicate.mutable_eliminate 0.38% : 0.000001s : 3: predicate.opt_reshape 0.35% : 0.000001s : 3: predicate.parallel_virtual_node 2.06% : 0.000004s : 20: predicate.partial_defer_inline 1.39% : 0.000002s : 17: predicate.partial_eliminate 0.97% : 0.000002s : 12: predicate.print_const_string_wrapper 0.51% : 0.000001s : 6: predicate.reduce_all_const_elim 1.48% : 0.000003s : 12: predicate.reduce_eliminate 2.48% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 6: predicate.remove_not_recompute_node 1.23% : 0.000002s : 20: predicate.replace_applicator 0.50% : 0.000001s : 6: predicate.replace_old_param 0.27% : 0.000000s : 3: predicate.reset_defer_inline 1.08% : 0.000002s : 12: predicate.reshape_eliminate 0.53% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.44% : 0.000001s : 3: predicate.row_tensor_eliminate 0.63% : 0.000001s : 6: predicate.same_eliminate 0.36% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.72% : 0.000001s : 6: predicate.shard_identity_eliminate 0.64% : 0.000001s : 6: predicate.special_op_eliminate 0.57% : 0.000001s : 6: predicate.specialize_transform 0.73% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.26% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.73% : 0.000003s : 20: predicate.switch_defer_inline 2.07% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.11% : 0.000011s : 68: predicate.switch_simplify 1.00% : 0.000002s : 12: predicate.tile_eliminate 1.00% : 0.000002s : 12: predicate.transpose_eliminate 1.59% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.63% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.60% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.39% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.93% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 3: predicate.value_based_eliminate 0.55% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.65% : 0.000001s : 6: predicate.virtual_output_eliminate 0.24% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001104 16 54.03% : 0.000596s : 8: func_graph_cloner_run.FuncGraphClonerGraph 45.97% : 0.000507s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.055520 196 0.01% : 0.000003s : 1: ForceFp32Comm 6.38% : 0.003540s : 1: add_attr 6.36% : 0.003530s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000049s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.16% : 0.000088s : 1: auto_monad 0.04% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.05% : 0.000027s : 1: bias_add_comm_swap 0.84% : 0.000464s : 1: bootstrap 0.05% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.05% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.05% : 0.000029s : 1: event_method 0.02% : 0.000013s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.87% : 0.000484s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.98% : 0.000544s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000014s : 1: opt.transform.mutable_eliminate 2.06% : 0.001143s : 78: opt.transform.opt_a 0.04% : 0.000023s : 1: opt.transform.opt_after_cconv 0.04% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000084s : 28: opt.transform.opt_b 0.07% : 0.000040s : 2: opt.transform.opt_trans_graph 0.06% : 0.000033s : 4: opt.transform.symbol_engine_opt 5.59% : 0.003103s : 1: opt_a 0.18% : 0.000099s : 1: opt_after_cconv 0.88% : 0.000490s : 1: opt_after_jit_grad 0.34% : 0.000186s : 1: opt_b 9.51% : 0.005282s : 1: optimize 0.04% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000040s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000019s : 1: remove_dup_value 1.19% : 0.000660s : 1: renormalize.infer 0.91% : 0.000504s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000020s : 1: rewriter_after_opt_a 0.45% : 0.000250s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000083s : 1: symbol_engine_optimizer 15.26% : 0.008471s : 1: task_emit 0.12% : 0.000069s : 1: tuple_transform 46.17% : 0.025634s : 1: type_inference 0.14% : 0.000076s : 1: validate TotalTime = 0.0425382, [24] [bootstrap]: 0.00047122 [type_inference]: 0.0248165 [event_method]: 2.233e-05 [auto_monad]: 8.034e-05 [graph_reusing]: 6.63003e-06 [inline]: 2.24001e-06 [add_attr]: 0.00325344, [1] [add_attr_with_inline]: 0.00324525, [1] [Cycle 1]: 5.78e-05, [2] [tag_attr]: 2.206e-05 [meta_addattr_fg_expand]: 7.01001e-06 [parallel-infer-symbol]: 4.05e-06 [pre_auto_parallel]: 3.463e-05 [insert-virtual-dataset]: 2.54999e-06 [parallel-infer-symbol-second]: 6.59988e-07 [dataset_repeat_opt]: 1.71002e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.00494706, [53] [py_interpret_to_execute]: 4.26001e-06 [rewriter_before_opt_a]: 0.00023753 [opt_a]: 0.00284751, [2] [Cycle 1]: 0.00228592, [45] [expand_dump_flag]: 3.29001e-06 [switch_simplify]: 7.667e-05 [loop_unroll]: 3.214e-05 [a_1]: 0.00058522 [with_stream_mark]: 1.495e-05 [recompute_prepare]: 7.2e-06 [updatestate_depend_eliminate]: 3.57997e-06 [updatestate_assign_eliminate]: 3.46001e-06 [updatestate_loads_eliminate]: 3.04001e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 7.062e-05 [accelerated_algorithm]: 6.12999e-06 [shard]: 2.09e-06 [meta_shard_fg_expand]: 1.99e-06 [shard_inline]: 5.59998e-06 [merge_send_recv]: 8.40999e-06 [auto_parallel]: 5.95002e-06 [parallel]: 1.795e-05 [flash_sp]: 8.03001e-06 [merge_comm]: 3.66999e-06 [allreduce_fusion]: 3.29001e-06 [matmul_add_comm_reduction]: 8.38999e-06 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 7.58001e-06 [virtual_dataset]: 5.97999e-06 [get_grad_eliminate_]: 5.35999e-06 [virtual_output]: 5.59998e-06 [merge_forward]: 3.89002e-06 [cell_reuse_recompute_pass]: 1.09998e-06 [offload_activation]: 8.89e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.117e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 9.31e-06 [set_forward_comm_id_for_comm_node_pass]: 3.38e-06 [meta_fg_expand]: 2.73998e-06 [flash_sp_send_recv_attached]: 2.76e-06 [receive_attached]: 2.62001e-06 [after_resolve]: 8.85001e-06 [a_after_grad]: 8.05e-06 [renormalize]: 0.0009914 [add_forward_monad_depend]: 5.34e-06 [auto_monad_grad]: 1.92001e-06 [auto_monad_eliminator]: 1.511e-05 [cse]: 3.677e-05 [a_3]: 4.137e-05 [Cycle 2]: 0.00055204, [45] [expand_dump_flag]: 1.19998e-06 [switch_simplify]: 6.79999e-06 [loop_unroll]: 5.62999e-06 [a_1]: 9.625e-05 [with_stream_mark]: 1.211e-05 [recompute_prepare]: 5.35001e-06 [updatestate_depend_eliminate]: 2.89999e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.22999e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 6.005e-05 [accelerated_algorithm]: 5.27999e-06 [shard]: 1.14e-06 [meta_shard_fg_expand]: 1.40001e-06 [shard_inline]: 5.24e-06 [merge_send_recv]: 4.33001e-06 [auto_parallel]: 5.09e-06 [parallel]: 4.87998e-06 [flash_sp]: 2.99001e-06 [merge_comm]: 2.91e-06 [allreduce_fusion]: 2.66e-06 [matmul_add_comm_reduction]: 5.10999e-06 [allreduce_slice_to_reducescatter]: 3.70026e-07 [virtual_shard_identity]: 6.28e-06 [virtual_dataset]: 5.19e-06 [get_grad_eliminate_]: 5.01002e-06 [virtual_output]: 5.04e-06 [merge_forward]: 2.66999e-06 [cell_reuse_recompute_pass]: 1.43002e-06 [offload_activation]: 5.66003e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.094e-05 [merge_recompute_call_nodes]: 7.40023e-07 [before_grad]: 8.05e-06 [set_forward_comm_id_for_comm_node_pass]: 2.91e-06 [meta_fg_expand]: 1.88002e-06 [flash_sp_send_recv_attached]: 7.59988e-07 [receive_attached]: 1.06002e-06 [after_resolve]: 8.24002e-06 [a_after_grad]: 7.73001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.15001e-06 [auto_monad_grad]: 7.2e-07 [auto_monad_eliminator]: 6.16e-06 [cse]: 1.456e-05 [a_3]: 3.065e-05 [py_interpret_to_execute_after_opt_a]: 4.63999e-06 [slice_cell_reuse_recomputed_activation]: 1.87001e-06 [rewriter_after_opt_a]: 1.71e-05 [convert_after_rewriter]: 1.29e-06 [order_py_execute_after_rewriter]: 1.12e-06 [mutable_eliminate]: 0.00050533 [opt_b]: 0.0001828, [1] [Cycle 1]: 0.00017672, [7] [b_1]: 0.00010516 [b_2]: 7.32002e-06 [updatestate_depend_eliminate]: 5.45001e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.32999e-06 [renormalize]: 4.30009e-07 [cse]: 2.036e-05 [optimize_parallel_all_gather_comm]: 1.639e-05 [overlap_param_gather]: 2.29001e-06 [cconv]: 2.514e-05 [loop_unroll]: 0.00042714 [opt_after_cconv]: 9.472e-05, [1] [Cycle 1]: 8.887e-05, [7] [c_1]: 2.462e-05 [parameter_eliminate]: 2.39001e-06 [updatestate_depend_eliminate]: 5.00999e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.22999e-06 [cse]: 1.96e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 2.938e-05 [tuple_transform]: 6.599e-05, [1] [Cycle 1]: 6.14e-05, [4] [d_1]: 3.569e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 1.30007e-07 [switch_simplify]: 6.17001e-06 [partial_unused_args_eliminate]: 1.88002e-06 [add_recomputation]: 4.406e-05 [cse_after_recomputation]: 2.245e-05, [1] [Cycle 1]: 1.809e-05, [1] [cse]: 1.285e-05 [environ_conv]: 8.42e-06 [swap_dp_allreduce_reducescatter]: 5.74999e-06 [bias_add_comm_swap]: 2.98998e-06 [label_micro_interleaved_index]: 4.22998e-06 [label_fine_grained_interleaved_index]: 2.53e-06 [merge_cast_opt]: 1.47001e-06 [slice_recompute_activation]: 2.19001e-06 [micro_interleaved_order_control]: 2.26e-06 [assign_add_opt]: 1.54e-06 [ForceFp32Comm]: 8.10018e-07 [remove_cast_before_assign_add]: 1.07998e-06 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 2.53998e-06 [comm_op_add_attrs]: 1.08001e-06 [add_comm_op_reuse_tag]: 9.99979e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.14e-06 [overlap_opt_shard_in_pipeline]: 1.66002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.69e-06 [control_data_broadcast_order]: 1.278e-05 [grouped_pairwise_exchange_alltoall]: 1.80001e-06 [offloading_packed_experts]: 3.44001e-06 [overlap_recompute_and_grad_model_parallel]: 4.33999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.02001e-06 [overlap_grad_ring_attention]: 3.87002e-06 [overlap_grad_flash_sp]: 1.701e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 1.95001e-06 [split_layernorm_comm]: 1.66e-06 [handle_group_info]: 1.03001e-06 [symbol_engine_optimizer]: 8.012e-05, [1] [Cycle 1]: 7.565e-05, [6] [build]: 1.015e-05 [elim_shapecalc]: 9.03002e-06 [elim_not_effective]: 1.225e-05 [opt_reshape]: 6.60002e-06 [fold_const_symbol]: 9.71e-06 [renormalize]: 2.19996e-07 [detach_backward]: 1.59e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 1.576e-05 [get_jit_bprop_graph]: 9.5999e-07 [rewriter_after_jit_bprop_graph]: 3.54002e-06 [opt_after_jit_grad]: 0.00046638 [validate]: 4.194e-05 [backend_pass]: 1.02e-06 [task_emit]: 0.00814991 [execute]: 8.21002e-06 Sums bootstrap : 0.000471s : 1.23% type_inference : 0.024817s : 64.78% event_method : 0.000022s : 0.06% auto_monad : 0.000080s : 0.21% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000035s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000238s : 0.62% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000083s : 0.22% optimize.opt_a.loop_unroll : 0.000038s : 0.10% optimize.opt_a.a_1 : 0.000681s : 1.78% optimize.opt_a.with_stream_mark : 0.000027s : 0.07% optimize.opt_a.recompute_prepare : 0.000013s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000131s : 0.34% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.03% optimize.opt_a.merge_send_recv : 0.000013s : 0.03% optimize.opt_a.auto_parallel : 0.000011s : 0.03% optimize.opt_a.parallel : 0.000023s : 0.06% optimize.opt_a.flash_sp : 0.000011s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000013s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.04% optimize.opt_a.virtual_dataset : 0.000011s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000010s : 0.03% optimize.opt_a.virtual_output : 0.000011s : 0.03% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000015s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000022s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000017s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000006s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000017s : 0.04% optimize.opt_a.a_after_grad : 0.000016s : 0.04% optimize.opt_a.renormalize : 0.000991s : 2.59% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.06% optimize.opt_a.cse : 0.000051s : 0.13% optimize.opt_a.a_3 : 0.000072s : 0.19% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000017s : 0.04% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000505s : 1.32% optimize.opt_b.b_1 : 0.000105s : 0.27% optimize.opt_b.b_2 : 0.000007s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000025s : 0.07% optimize.loop_unroll : 0.000427s : 1.12% optimize.opt_after_cconv.c_1 : 0.000025s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000029s : 0.08% optimize.tuple_transform.d_1 : 0.000036s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000044s : 0.12% optimize.cse_after_recomputation.cse : 0.000013s : 0.03% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000017s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000010s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000016s : 0.04% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000466s : 1.22% validate : 0.000042s : 0.11% backend_pass : 0.000001s : 0.00% task_emit : 0.008150s : 21.28% execute : 0.000008s : 0.02% Time group info: ------[substitution.] 0.000182 26 1.10% : 0.000002s : 2: substitution.elim_not_effective 0.96% : 0.000002s : 2: substitution.fold_const_symbol 2.90% : 0.000005s : 3: substitution.graph_param_transform 79.22% : 0.000144s : 6: substitution.inline 1.86% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.43% : 0.000004s : 4: substitution.remove_not_recompute_node 1.69% : 0.000003s : 2: substitution.replace_old_param 4.03% : 0.000007s : 1: substitution.switch_simplify 5.82% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.024753 2 94.74% : 0.023451s : 1: type_inference.infer 5.26% : 0.001302s : 1: type_inference.specialize ------[replace.] 0.000080 9 59.11% : 0.000047s : 6: replace.inline 21.32% : 0.000017s : 1: replace.switch_simplify 19.57% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000156 9 89.90% : 0.000140s : 6: match.inline 4.13% : 0.000006s : 1: match.switch_simplify 5.97% : 0.000009s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000176 1092 1.00% : 0.000002s : 12: predicate.accumulaten_eliminater 0.84% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 6: predicate.addn_check_dump 0.96% : 0.000002s : 12: predicate.addn_zero_filter 0.93% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.20% : 0.000004s : 18: predicate.arithmetic_simplify 0.97% : 0.000002s : 12: predicate.cast_eliminate 0.58% : 0.000001s : 6: predicate.check_bprop_eliminate 0.48% : 0.000001s : 6: predicate.compare_switch_simplify 0.18% : 0.000000s : 3: predicate.const_output_eliminate 0.46% : 0.000001s : 6: predicate.depend_value_elim 0.98% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.20% : 0.000002s : 12: predicate.dict_get_item_eliminator 1.06% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.97% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 3: predicate.elim_not_effective 0.42% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.16% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.22% : 0.000002s : 15: predicate.environ_get_depend_swap 1.65% : 0.000003s : 21: predicate.environ_get_eliminate 1.17% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.58% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.46% : 0.000004s : 20: predicate.float_depend_g_call 0.46% : 0.000001s : 6: predicate.float_environ_get_switch 0.72% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.25% : 0.000000s : 3: predicate.fold_const_symbol 0.59% : 0.000001s : 6: predicate.get_grad_eliminate 0.18% : 0.000000s : 3: predicate.graph_param_transform 0.51% : 0.000001s : 6: predicate.incorporate_call 0.44% : 0.000001s : 6: predicate.incorporate_call_switch 5.79% : 0.000010s : 50: predicate.inline 0.62% : 0.000001s : 6: predicate.inline_without_move 0.28% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.74% : 0.000001s : 6: predicate.less_batch_normalization 1.75% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.54% : 0.000004s : 32: predicate.load_eliminater 1.13% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.86% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.61% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 6: predicate.merge_addn 0.48% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.47% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 12: predicate.minmaximum_grad 1.37% : 0.000002s : 3: predicate.mutable_eliminate 0.30% : 0.000001s : 3: predicate.opt_reshape 0.35% : 0.000001s : 3: predicate.parallel_virtual_node 2.07% : 0.000004s : 20: predicate.partial_defer_inline 1.42% : 0.000002s : 17: predicate.partial_eliminate 0.97% : 0.000002s : 12: predicate.print_const_string_wrapper 0.56% : 0.000001s : 6: predicate.reduce_all_const_elim 1.34% : 0.000002s : 12: predicate.reduce_eliminate 2.48% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 6: predicate.remove_not_recompute_node 1.22% : 0.000002s : 20: predicate.replace_applicator 0.39% : 0.000001s : 6: predicate.replace_old_param 0.24% : 0.000000s : 3: predicate.reset_defer_inline 1.02% : 0.000002s : 12: predicate.reshape_eliminate 0.56% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 3: predicate.row_tensor_eliminate 0.87% : 0.000002s : 6: predicate.same_eliminate 0.35% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.65% : 0.000001s : 6: predicate.shard_identity_eliminate 0.71% : 0.000001s : 6: predicate.special_op_eliminate 0.60% : 0.000001s : 6: predicate.specialize_transform 0.72% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.27% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.67% : 0.000003s : 20: predicate.switch_defer_inline 2.33% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.15% : 0.000011s : 68: predicate.switch_simplify 1.02% : 0.000002s : 12: predicate.tile_eliminate 0.98% : 0.000002s : 12: predicate.transpose_eliminate 1.75% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.48% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.79% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.32% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.97% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.29% : 0.000001s : 3: predicate.value_based_eliminate 0.58% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 6: predicate.virtual_output_eliminate 0.23% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001029 16 55.65% : 0.000573s : 8: func_graph_cloner_run.FuncGraphClonerGraph 44.35% : 0.000456s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.052931 196 0.01% : 0.000004s : 1: ForceFp32Comm 6.15% : 0.003258s : 1: add_attr 6.14% : 0.003248s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000048s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.16% : 0.000085s : 1: auto_monad 0.04% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.95% : 0.000502s : 1: bootstrap 0.05% : 0.000029s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.05% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000004s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.05% : 0.000028s : 1: event_method 0.03% : 0.000013s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000005s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.82% : 0.000436s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.97% : 0.000514s : 1: mutable_eliminate 0.01% : 0.000006s : 1: offloading_packed_experts 0.03% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000014s : 1: opt.transform.mutable_eliminate 2.04% : 0.001078s : 78: opt.transform.opt_a 0.04% : 0.000023s : 1: opt.transform.opt_after_cconv 0.04% : 0.000021s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000085s : 28: opt.transform.opt_b 0.08% : 0.000040s : 2: opt.transform.opt_trans_graph 0.06% : 0.000034s : 4: opt.transform.symbol_engine_opt 5.39% : 0.002851s : 1: opt_a 0.18% : 0.000098s : 1: opt_after_cconv 0.90% : 0.000475s : 1: opt_after_jit_grad 0.35% : 0.000186s : 1: opt_b 9.35% : 0.004952s : 1: optimize 0.04% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000020s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000039s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000033s : 1: remove_dup_value 1.01% : 0.000535s : 1: renormalize.infer 0.85% : 0.000448s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000020s : 1: rewriter_after_opt_a 0.46% : 0.000243s : 1: rewriter_before_opt_a 0.01% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000083s : 1: symbol_engine_optimizer 15.42% : 0.008162s : 1: task_emit 0.13% : 0.000069s : 1: tuple_transform 46.92% : 0.024833s : 1: type_inference 0.14% : 0.000072s : 1: validate TotalTime = 0.0448374, [24] [bootstrap]: 0.00052571 [type_inference]: 0.0267296 [event_method]: 2.111e-05 [auto_monad]: 7.984e-05 [graph_reusing]: 6.39001e-06 [inline]: 1.81e-06 [add_attr]: 0.00333269, [1] [add_attr_with_inline]: 0.00332362, [1] [Cycle 1]: 5.326e-05, [2] [tag_attr]: 2.029e-05 [meta_addattr_fg_expand]: 6.91999e-06 [parallel-infer-symbol]: 2.99001e-06 [pre_auto_parallel]: 3.402e-05 [insert-virtual-dataset]: 2.19001e-06 [parallel-infer-symbol-second]: 6.40022e-07 [dataset_repeat_opt]: 2.11e-06 [pipeline_split]: 1.73997e-06 [optimize]: 0.00499946, [53] [py_interpret_to_execute]: 4.27e-06 [rewriter_before_opt_a]: 0.00022812 [opt_a]: 0.00296021, [2] [Cycle 1]: 0.0023857, [45] [expand_dump_flag]: 3.46001e-06 [switch_simplify]: 7.433e-05 [loop_unroll]: 3.26e-05 [a_1]: 0.00058844 [with_stream_mark]: 1.578e-05 [recompute_prepare]: 7.63999e-06 [updatestate_depend_eliminate]: 3.96001e-06 [updatestate_assign_eliminate]: 3.59002e-06 [updatestate_loads_eliminate]: 2.92002e-06 [parameter_eliminate]: 2.29999e-06 [a_2]: 7.011e-05 [accelerated_algorithm]: 6.06e-06 [shard]: 1.66998e-06 [meta_shard_fg_expand]: 1.99e-06 [shard_inline]: 6.13002e-06 [merge_send_recv]: 8.22998e-06 [auto_parallel]: 5.73002e-06 [parallel]: 1.906e-05 [flash_sp]: 7.26999e-06 [merge_comm]: 3.7e-06 [allreduce_fusion]: 3.16001e-06 [matmul_add_comm_reduction]: 8.36002e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 6.94001e-06 [virtual_dataset]: 5.74999e-06 [get_grad_eliminate_]: 5.67999e-06 [virtual_output]: 5.39e-06 [merge_forward]: 3.56999e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 8.76002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.103e-05 [merge_recompute_call_nodes]: 1.38002e-06 [before_grad]: 9.44e-06 [set_forward_comm_id_for_comm_node_pass]: 3.28e-06 [meta_fg_expand]: 2.76e-06 [flash_sp_send_recv_attached]: 2.27999e-06 [receive_attached]: 2.02999e-06 [after_resolve]: 8.72998e-06 [a_after_grad]: 7.93001e-06 [renormalize]: 0.00108804 [add_forward_monad_depend]: 5.40001e-06 [auto_monad_grad]: 2.17999e-06 [auto_monad_eliminator]: 1.599e-05 [cse]: 3.41e-05 [a_3]: 4.299e-05 [Cycle 2]: 0.0005658, [45] [expand_dump_flag]: 1.05999e-06 [switch_simplify]: 7.53e-06 [loop_unroll]: 6.14999e-06 [a_1]: 9.837e-05 [with_stream_mark]: 1.08e-05 [recompute_prepare]: 5.82999e-06 [updatestate_depend_eliminate]: 2.99001e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 2.24001e-06 [parameter_eliminate]: 9.89996e-07 [a_2]: 6.251e-05 [accelerated_algorithm]: 5.57001e-06 [shard]: 1.05001e-06 [meta_shard_fg_expand]: 1.32e-06 [shard_inline]: 5.22e-06 [merge_send_recv]: 4.32e-06 [auto_parallel]: 5.19998e-06 [parallel]: 4.31002e-06 [flash_sp]: 3.04999e-06 [merge_comm]: 3.31999e-06 [allreduce_fusion]: 2.74999e-06 [matmul_add_comm_reduction]: 4.90001e-06 [allreduce_slice_to_reducescatter]: 3.09985e-07 [virtual_shard_identity]: 6.26e-06 [virtual_dataset]: 5.40999e-06 [get_grad_eliminate_]: 5.59e-06 [virtual_output]: 4.97999e-06 [merge_forward]: 2.81e-06 [cell_reuse_recompute_pass]: 1.34998e-06 [offload_activation]: 6.08002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.175e-05 [merge_recompute_call_nodes]: 7.29982e-07 [before_grad]: 8.45999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.13e-06 [meta_fg_expand]: 1.76e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 1.06002e-06 [after_resolve]: 7.77e-06 [a_after_grad]: 7.74002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.34e-06 [auto_monad_grad]: 9.5999e-07 [auto_monad_eliminator]: 6.48003e-06 [cse]: 1.575e-05 [a_3]: 3.103e-05 [py_interpret_to_execute_after_opt_a]: 4.27998e-06 [slice_cell_reuse_recomputed_activation]: 2.01e-06 [rewriter_after_opt_a]: 1.663e-05 [convert_after_rewriter]: 1.20001e-06 [order_py_execute_after_rewriter]: 1.10999e-06 [mutable_eliminate]: 0.00045499 [opt_b]: 0.00018665, [1] [Cycle 1]: 0.00018059, [7] [b_1]: 0.00010774 [b_2]: 7.28e-06 [updatestate_depend_eliminate]: 5.24998e-06 [updatestate_assign_eliminate]: 3.18998e-06 [updatestate_loads_eliminate]: 2.29001e-06 [renormalize]: 4.00003e-07 [cse]: 2.022e-05 [optimize_parallel_all_gather_comm]: 1.598e-05 [overlap_param_gather]: 1.97001e-06 [cconv]: 2.162e-05 [loop_unroll]: 0.00041452 [opt_after_cconv]: 9.619e-05, [1] [Cycle 1]: 9.1e-05, [7] [c_1]: 2.57e-05 [parameter_eliminate]: 2.27001e-06 [updatestate_depend_eliminate]: 5.07e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.21e-06 [cse]: 1.981e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 2.571e-05 [tuple_transform]: 6.515e-05, [1] [Cycle 1]: 6.098e-05, [4] [d_1]: 3.564e-05 [none_parameter_eliminate]: 1.68002e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 6.58e-06 [partial_unused_args_eliminate]: 1.88002e-06 [add_recomputation]: 4.576e-05 [cse_after_recomputation]: 2.114e-05, [1] [Cycle 1]: 1.655e-05, [1] [cse]: 1.129e-05 [environ_conv]: 7.8e-06 [swap_dp_allreduce_reducescatter]: 5.46e-06 [bias_add_comm_swap]: 2.76999e-06 [label_micro_interleaved_index]: 3.88999e-06 [label_fine_grained_interleaved_index]: 2.56998e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 2.07999e-06 [micro_interleaved_order_control]: 2.03997e-06 [assign_add_opt]: 1.14998e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 2.49001e-06 [reorder_send_recv_between_fp_bp]: 2.83998e-06 [comm_op_add_attrs]: 1.12999e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.07e-06 [interleave_parallel_branches]: 1.01002e-06 [overlap_opt_shard_in_pipeline]: 1.37e-06 [overlap_opt_shard_grad_in_pipeline]: 1.91e-06 [control_data_broadcast_order]: 1.238e-05 [grouped_pairwise_exchange_alltoall]: 1.74998e-06 [offloading_packed_experts]: 3.48e-06 [overlap_recompute_and_grad_model_parallel]: 4.15999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.09e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.07999e-06 [overlap_grad_ring_attention]: 4.05e-06 [overlap_grad_flash_sp]: 1.635e-05 [begin_end_overlap_inline]: 6.69999e-07 [split_matmul_comm_elemetwise]: 2.06998e-06 [split_layernorm_comm]: 1.84e-06 [handle_group_info]: 1.20999e-06 [symbol_engine_optimizer]: 7.78e-05, [1] [Cycle 1]: 7.368e-05, [6] [build]: 9.52999e-06 [elim_shapecalc]: 9.24e-06 [elim_not_effective]: 1.209e-05 [opt_reshape]: 5.96e-06 [fold_const_symbol]: 8.79003e-06 [renormalize]: 2.00002e-07 [detach_backward]: 1.97999e-06 [pipeline_parallel_scheduler]: 1.38002e-06 [auto_monad_reorder]: 1.618e-05 [get_jit_bprop_graph]: 9.99979e-07 [rewriter_after_jit_bprop_graph]: 3.27002e-06 [opt_after_jit_grad]: 0.00049452 [validate]: 4.03e-05 [backend_pass]: 1.14e-06 [task_emit]: 0.00832936 [execute]: 6.73003e-06 Sums bootstrap : 0.000526s : 1.30% type_inference : 0.026730s : 66.00% event_method : 0.000021s : 0.05% auto_monad : 0.000080s : 0.20% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000034s : 0.08% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000228s : 0.56% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000082s : 0.20% optimize.opt_a.loop_unroll : 0.000039s : 0.10% optimize.opt_a.a_1 : 0.000687s : 1.70% optimize.opt_a.with_stream_mark : 0.000027s : 0.07% optimize.opt_a.recompute_prepare : 0.000013s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000133s : 0.33% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.03% optimize.opt_a.merge_send_recv : 0.000013s : 0.03% optimize.opt_a.auto_parallel : 0.000011s : 0.03% optimize.opt_a.parallel : 0.000023s : 0.06% optimize.opt_a.flash_sp : 0.000010s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000013s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.03% optimize.opt_a.virtual_dataset : 0.000011s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.03% optimize.opt_a.virtual_output : 0.000010s : 0.03% optimize.opt_a.merge_forward : 0.000006s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000015s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000018s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000006s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000016s : 0.04% optimize.opt_a.a_after_grad : 0.000016s : 0.04% optimize.opt_a.renormalize : 0.001088s : 2.69% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.06% optimize.opt_a.cse : 0.000050s : 0.12% optimize.opt_a.a_3 : 0.000074s : 0.18% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000017s : 0.04% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000455s : 1.12% optimize.opt_b.b_1 : 0.000108s : 0.27% optimize.opt_b.b_2 : 0.000007s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000022s : 0.05% optimize.loop_unroll : 0.000415s : 1.02% optimize.opt_after_cconv.c_1 : 0.000026s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000026s : 0.06% optimize.tuple_transform.d_1 : 0.000036s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000046s : 0.11% optimize.cse_after_recomputation.cse : 0.000011s : 0.03% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000016s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000010s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000016s : 0.04% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000495s : 1.22% validate : 0.000040s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.008329s : 20.57% execute : 0.000007s : 0.02% Time group info: ------[substitution.] 0.000179 26 1.06% : 0.000002s : 2: substitution.elim_not_effective 0.95% : 0.000002s : 2: substitution.fold_const_symbol 2.86% : 0.000005s : 3: substitution.graph_param_transform 79.60% : 0.000143s : 6: substitution.inline 1.81% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.33% : 0.000004s : 4: substitution.remove_not_recompute_node 1.54% : 0.000003s : 2: substitution.replace_old_param 4.06% : 0.000007s : 1: substitution.switch_simplify 5.80% : 0.000010s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.026669 2 95.36% : 0.025430s : 1: type_inference.infer 4.64% : 0.001238s : 1: type_inference.specialize ------[replace.] 0.000079 9 60.14% : 0.000048s : 6: replace.inline 20.32% : 0.000016s : 1: replace.switch_simplify 19.54% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 9 89.85% : 0.000140s : 6: match.inline 4.21% : 0.000007s : 1: match.switch_simplify 5.94% : 0.000009s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000174 1092 0.94% : 0.000002s : 12: predicate.accumulaten_eliminater 0.84% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 6: predicate.addn_check_dump 1.16% : 0.000002s : 12: predicate.addn_zero_filter 0.88% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.18% : 0.000004s : 18: predicate.arithmetic_simplify 1.03% : 0.000002s : 12: predicate.cast_eliminate 0.55% : 0.000001s : 6: predicate.check_bprop_eliminate 0.51% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.47% : 0.000001s : 6: predicate.depend_value_elim 1.06% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.11% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.32% : 0.000001s : 3: predicate.elim_not_effective 0.35% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.18% : 0.000002s : 15: predicate.environ_get_depend_swap 1.66% : 0.000003s : 21: predicate.environ_get_eliminate 1.25% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.66% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.75% : 0.000005s : 20: predicate.float_depend_g_call 0.44% : 0.000001s : 6: predicate.float_environ_get_switch 0.70% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 3: predicate.fold_const_symbol 0.62% : 0.000001s : 6: predicate.get_grad_eliminate 0.20% : 0.000000s : 3: predicate.graph_param_transform 0.51% : 0.000001s : 6: predicate.incorporate_call 0.44% : 0.000001s : 6: predicate.incorporate_call_switch 5.94% : 0.000010s : 50: predicate.inline 0.67% : 0.000001s : 6: predicate.inline_without_move 0.26% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.71% : 0.000001s : 6: predicate.less_batch_normalization 1.71% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.47% : 0.000004s : 32: predicate.load_eliminater 0.94% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.92% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.65% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 6: predicate.merge_addn 0.50% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 12: predicate.minmaximum_grad 1.00% : 0.000002s : 3: predicate.mutable_eliminate 0.32% : 0.000001s : 3: predicate.opt_reshape 0.32% : 0.000001s : 3: predicate.parallel_virtual_node 2.06% : 0.000004s : 20: predicate.partial_defer_inline 1.53% : 0.000003s : 17: predicate.partial_eliminate 0.97% : 0.000002s : 12: predicate.print_const_string_wrapper 0.57% : 0.000001s : 6: predicate.reduce_all_const_elim 1.41% : 0.000002s : 12: predicate.reduce_eliminate 2.34% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000001s : 6: predicate.remove_not_recompute_node 1.23% : 0.000002s : 20: predicate.replace_applicator 0.51% : 0.000001s : 6: predicate.replace_old_param 0.26% : 0.000000s : 3: predicate.reset_defer_inline 1.03% : 0.000002s : 12: predicate.reshape_eliminate 0.53% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 3: predicate.row_tensor_eliminate 0.66% : 0.000001s : 6: predicate.same_eliminate 0.37% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.74% : 0.000001s : 6: predicate.shard_identity_eliminate 0.68% : 0.000001s : 6: predicate.special_op_eliminate 0.64% : 0.000001s : 6: predicate.specialize_transform 0.75% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.71% : 0.000003s : 20: predicate.switch_defer_inline 2.44% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.08% : 0.000011s : 68: predicate.switch_simplify 0.96% : 0.000002s : 12: predicate.tile_eliminate 1.03% : 0.000002s : 12: predicate.transpose_eliminate 1.53% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.76% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 2.96% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.50% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.60% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.34% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.96% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 3: predicate.value_based_eliminate 0.65% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.56% : 0.000001s : 6: predicate.virtual_output_eliminate 0.22% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001153 16 58.68% : 0.000676s : 8: func_graph_cloner_run.FuncGraphClonerGraph 41.32% : 0.000476s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.055494 196 0.01% : 0.000003s : 1: ForceFp32Comm 6.01% : 0.003337s : 1: add_attr 6.00% : 0.003327s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.15% : 0.000086s : 1: auto_monad 0.04% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.01% : 0.000559s : 1: bootstrap 0.05% : 0.000025s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.04% : 0.000024s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.05% : 0.000026s : 1: event_method 0.02% : 0.000011s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000005s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.76% : 0.000423s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.83% : 0.000463s : 1: mutable_eliminate 0.01% : 0.000006s : 1: offloading_packed_experts 0.02% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000013s : 1: opt.transform.mutable_eliminate 1.97% : 0.001091s : 78: opt.transform.opt_a 0.04% : 0.000024s : 1: opt.transform.opt_after_cconv 0.04% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000087s : 28: opt.transform.opt_b 0.07% : 0.000040s : 2: opt.transform.opt_trans_graph 0.06% : 0.000033s : 4: opt.transform.symbol_engine_opt 5.34% : 0.002963s : 1: opt_a 0.18% : 0.000099s : 1: opt_after_cconv 0.91% : 0.000504s : 1: opt_after_jit_grad 0.34% : 0.000190s : 1: opt_b 9.02% : 0.005004s : 1: optimize 0.03% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000020s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.08% : 0.000045s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.07% : 0.000038s : 1: pre_auto_parallel 0.02% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000029s : 1: remove_dup_value 0.98% : 0.000545s : 1: renormalize.infer 0.96% : 0.000535s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000020s : 1: rewriter_after_opt_a 0.42% : 0.000234s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000081s : 1: symbol_engine_optimizer 15.03% : 0.008340s : 1: task_emit 0.12% : 0.000068s : 1: tuple_transform 48.19% : 0.026744s : 1: type_inference 0.13% : 0.000072s : 1: validate TotalTime = 0.153972, [24] [bootstrap]: 0.00045881 [type_inference]: 0.0472231 [event_method]: 0.0002902 [auto_monad]: 0.00015499 [graph_reusing]: 1.032e-05 [inline]: 1.91e-06 [add_attr]: 0.00319164, [1] [add_attr_with_inline]: 0.00318077, [1] [Cycle 1]: 8.26e-05, [2] [tag_attr]: 4.09e-05 [meta_addattr_fg_expand]: 1.235e-05 [parallel-infer-symbol]: 2.71e-06 [pre_auto_parallel]: 5.816e-05 [insert-virtual-dataset]: 2.47001e-06 [parallel-infer-symbol-second]: 6.19999e-07 [dataset_repeat_opt]: 2.09e-06 [pipeline_split]: 1.51002e-06 [optimize]: 0.0539146, [53] [py_interpret_to_execute]: 4.31002e-06 [rewriter_before_opt_a]: 0.00036165 [opt_a]: 0.0414375, [3] [Cycle 1]: 0.0182079, [45] [expand_dump_flag]: 5.02e-06 [switch_simplify]: 0.00015572 [loop_unroll]: 6.585e-05 [a_1]: 0.00140497 [with_stream_mark]: 2.547e-05 [recompute_prepare]: 2.067e-05 [updatestate_depend_eliminate]: 8.34998e-06 [updatestate_assign_eliminate]: 6.76999e-06 [updatestate_loads_eliminate]: 7.55e-06 [parameter_eliminate]: 2.63e-06 [a_2]: 0.0002085 [accelerated_algorithm]: 1.44e-05 [shard]: 1.42e-06 [meta_shard_fg_expand]: 4.12e-06 [shard_inline]: 1.418e-05 [merge_send_recv]: 1.474e-05 [auto_parallel]: 9.92999e-06 [parallel]: 1.632e-05 [flash_sp]: 8.84998e-06 [merge_comm]: 8.75999e-06 [allreduce_fusion]: 7.82998e-06 [matmul_add_comm_reduction]: 2.413e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 1.505e-05 [virtual_dataset]: 1.337e-05 [get_grad_eliminate_]: 1.395e-05 [virtual_output]: 1.334e-05 [merge_forward]: 8.23001e-06 [cell_reuse_recompute_pass]: 1.08001e-06 [offload_activation]: 1.543e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.491e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 2.328e-05 [set_forward_comm_id_for_comm_node_pass]: 8.37998e-06 [meta_fg_expand]: 0.00155321 [flash_sp_send_recv_attached]: 3.84002e-06 [receive_attached]: 2.64001e-06 [after_resolve]: 6.508e-05 [a_after_grad]: 8.612e-05 [renormalize]: 0.0125634 [add_forward_monad_depend]: 1.096e-05 [auto_monad_grad]: 6.10002e-06 [auto_monad_eliminator]: 0.00015277 [cse]: 0.0004786 [a_3]: 0.00081902 [Cycle 2]: 0.0167406, [45] [expand_dump_flag]: 2.14e-06 [switch_simplify]: 0.00011169 [loop_unroll]: 0.00010718 [a_1]: 0.00350358 [with_stream_mark]: 6.777e-05 [recompute_prepare]: 7.863e-05 [updatestate_depend_eliminate]: 4.61e-05 [updatestate_assign_eliminate]: 4.241e-05 [updatestate_loads_eliminate]: 4.131e-05 [parameter_eliminate]: 1.24e-06 [a_2]: 0.0051816 [accelerated_algorithm]: 0.00020412 [shard]: 1.72001e-06 [meta_shard_fg_expand]: 3.379e-05 [shard_inline]: 8.046e-05 [merge_send_recv]: 5.824e-05 [auto_parallel]: 5.02e-05 [parallel]: 6.87002e-06 [flash_sp]: 4.00998e-06 [merge_comm]: 4.939e-05 [allreduce_fusion]: 4.673e-05 [matmul_add_comm_reduction]: 6.131e-05 [allreduce_slice_to_reducescatter]: 7.30011e-07 [virtual_shard_identity]: 7.832e-05 [virtual_dataset]: 7.496e-05 [get_grad_eliminate_]: 7.504e-05 [virtual_output]: 7.71e-05 [merge_forward]: 4.347e-05 [cell_reuse_recompute_pass]: 2.38998e-06 [offload_activation]: 6.031e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.0001335 [merge_recompute_call_nodes]: 1.16002e-06 [before_grad]: 0.00012541 [set_forward_comm_id_for_comm_node_pass]: 5.057e-05 [meta_fg_expand]: 0.0001967 [flash_sp_send_recv_attached]: 1.39e-06 [receive_attached]: 1.60999e-06 [after_resolve]: 8.94e-05 [a_after_grad]: 0.00012512 [renormalize]: 0.00448536 [add_forward_monad_depend]: 5.30001e-06 [auto_monad_grad]: 1.69e-06 [auto_monad_eliminator]: 0.00011182 [cse]: 0.00032551 [a_3]: 0.00058664 [Cycle 3]: 0.0064733, [45] [expand_dump_flag]: 1.54e-06 [switch_simplify]: 7.865e-05 [loop_unroll]: 7.731e-05 [a_1]: 0.00226374 [with_stream_mark]: 5.528e-05 [recompute_prepare]: 7.648e-05 [updatestate_depend_eliminate]: 4.703e-05 [updatestate_assign_eliminate]: 4.397e-05 [updatestate_loads_eliminate]: 4.448e-05 [parameter_eliminate]: 1.24e-06 [a_2]: 0.00122203 [accelerated_algorithm]: 0.00010368 [shard]: 1.12e-06 [meta_shard_fg_expand]: 2.139e-05 [shard_inline]: 8.063e-05 [merge_send_recv]: 5.464e-05 [auto_parallel]: 4.985e-05 [parallel]: 4.47e-06 [flash_sp]: 1.09e-06 [merge_comm]: 4.963e-05 [allreduce_fusion]: 4.779e-05 [matmul_add_comm_reduction]: 5.489e-05 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 7.803e-05 [virtual_dataset]: 7.564e-05 [get_grad_eliminate_]: 7.803e-05 [virtual_output]: 7.55e-05 [merge_forward]: 4.445e-05 [cell_reuse_recompute_pass]: 1.55001e-06 [offload_activation]: 5.648e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00013292 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 0.00012708 [set_forward_comm_id_for_comm_node_pass]: 5.234e-05 [meta_fg_expand]: 3.839e-05 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 9.79984e-07 [after_resolve]: 7.912e-05 [a_after_grad]: 0.00012566 [renormalize]: 1.10012e-07 [add_forward_monad_depend]: 1.87001e-06 [auto_monad_grad]: 1.24998e-06 [auto_monad_eliminator]: 8.095e-05 [cse]: 0.00025726 [a_3]: 0.00055239 [py_interpret_to_execute_after_opt_a]: 5.15999e-06 [slice_cell_reuse_recomputed_activation]: 2.19001e-06 [rewriter_after_opt_a]: 0.00022362 [convert_after_rewriter]: 1.34e-06 [order_py_execute_after_rewriter]: 1.14e-06 [mutable_eliminate]: 0.00081226 [opt_b]: 0.00675768, [2] [Cycle 1]: 0.00438043, [7] [b_1]: 0.00381806 [b_2]: 8.048e-05 [updatestate_depend_eliminate]: 6.006e-05 [updatestate_assign_eliminate]: 4.434e-05 [updatestate_loads_eliminate]: 4.483e-05 [renormalize]: 4.90021e-07 [cse]: 0.00028275 [Cycle 2]: 0.00236602, [7] [b_1]: 0.0018649 [b_2]: 7.758e-05 [updatestate_depend_eliminate]: 5.197e-05 [updatestate_assign_eliminate]: 4.337e-05 [updatestate_loads_eliminate]: 4.431e-05 [renormalize]: 1.00001e-07 [cse]: 0.00024377 [optimize_parallel_all_gather_comm]: 9.461e-05 [overlap_param_gather]: 2.28998e-06 [cconv]: 4.012e-05 [loop_unroll]: 0.00057244 [opt_after_cconv]: 0.00089755, [1] [Cycle 1]: 0.00089097, [7] [c_1]: 0.00040565 [parameter_eliminate]: 2.44001e-06 [updatestate_depend_eliminate]: 6.16e-05 [updatestate_assign_eliminate]: 4.637e-05 [updatestate_loads_eliminate]: 4.454e-05 [cse]: 0.000286 [renormalize]: 4.49974e-07 [remove_dup_value]: 0.00051483 [tuple_transform]: 0.00060883, [1] [Cycle 1]: 0.00060287, [4] [d_1]: 0.00050856 [none_parameter_eliminate]: 2.53003e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 6.691e-05 [partial_unused_args_eliminate]: 1.87001e-06 [add_recomputation]: 0.00025854 [cse_after_recomputation]: 0.00016838, [1] [Cycle 1]: 0.00016233, [1] [cse]: 0.00015426 [environ_conv]: 2.817e-05 [swap_dp_allreduce_reducescatter]: 4.872e-05 [bias_add_comm_swap]: 2.84999e-06 [label_micro_interleaved_index]: 4.06001e-06 [label_fine_grained_interleaved_index]: 2.47001e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 2.14e-06 [micro_interleaved_order_control]: 2.26998e-06 [assign_add_opt]: 1.17999e-06 [ForceFp32Comm]: 7.2e-07 [remove_cast_before_assign_add]: 1.10001e-06 [full_micro_interleaved_order_control]: 2.26998e-06 [reorder_send_recv_between_fp_bp]: 2.56998e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 1.07e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.34e-06 [overlap_opt_shard_grad_in_pipeline]: 1.68002e-06 [control_data_broadcast_order]: 0.0001087 [grouped_pairwise_exchange_alltoall]: 1.42999e-06 [offloading_packed_experts]: 2.71e-05 [overlap_recompute_and_grad_model_parallel]: 2.698e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49998e-06 [overlap_recompute_comm]: 2.30002e-06 [overlap_grad_ring_attention]: 2.586e-05 [overlap_grad_flash_sp]: 0.00013952 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 1.86e-06 [split_layernorm_comm]: 1.92999e-06 [handle_group_info]: 1.20001e-06 [symbol_engine_optimizer]: 0.00045002, [1] [Cycle 1]: 0.00044319, [6] [build]: 1.884e-05 [elim_shapecalc]: 8.086e-05 [elim_not_effective]: 0.00011813 [opt_reshape]: 7.3e-05 [fold_const_symbol]: 0.0001177 [renormalize]: 2.80008e-07 [detach_backward]: 1.84998e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 0.00010559 [get_jit_bprop_graph]: 1.42e-06 [rewriter_after_jit_bprop_graph]: 4.57e-06 [opt_after_jit_grad]: 0.00071378 [validate]: 0.00016187 [backend_pass]: 1.27e-06 [task_emit]: 0.0473636 [execute]: 7.93001e-06 Sums bootstrap : 0.000459s : 0.31% type_inference : 0.047223s : 31.65% event_method : 0.000290s : 0.19% auto_monad : 0.000155s : 0.10% graph_reusing : 0.000010s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000041s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000058s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000362s : 0.24% optimize.opt_a.expand_dump_flag : 0.000009s : 0.01% optimize.opt_a.switch_simplify : 0.000346s : 0.23% optimize.opt_a.loop_unroll : 0.000250s : 0.17% optimize.opt_a.a_1 : 0.007172s : 4.81% optimize.opt_a.with_stream_mark : 0.000149s : 0.10% optimize.opt_a.recompute_prepare : 0.000176s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000101s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000093s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000093s : 0.06% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.006612s : 4.43% optimize.opt_a.accelerated_algorithm : 0.000322s : 0.22% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000059s : 0.04% optimize.opt_a.shard_inline : 0.000175s : 0.12% optimize.opt_a.merge_send_recv : 0.000128s : 0.09% optimize.opt_a.auto_parallel : 0.000110s : 0.07% optimize.opt_a.parallel : 0.000028s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000108s : 0.07% optimize.opt_a.allreduce_fusion : 0.000102s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000140s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000171s : 0.11% optimize.opt_a.virtual_dataset : 0.000164s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000167s : 0.11% optimize.opt_a.virtual_output : 0.000166s : 0.11% optimize.opt_a.merge_forward : 0.000096s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000132s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000291s : 0.20% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000276s : 0.18% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000111s : 0.07% optimize.opt_a.meta_fg_expand : 0.001788s : 1.20% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000234s : 0.16% optimize.opt_a.a_after_grad : 0.000337s : 0.23% optimize.opt_a.renormalize : 0.017049s : 11.43% optimize.opt_a.add_forward_monad_depend : 0.000018s : 0.01% optimize.opt_a.auto_monad_grad : 0.000009s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000346s : 0.23% optimize.opt_a.cse : 0.001061s : 0.71% optimize.opt_a.a_3 : 0.001958s : 1.31% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000224s : 0.15% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000812s : 0.54% optimize.opt_b.b_1 : 0.005683s : 3.81% optimize.opt_b.b_2 : 0.000158s : 0.11% optimize.opt_b.updatestate_depend_eliminate : 0.000112s : 0.08% optimize.opt_b.updatestate_assign_eliminate : 0.000088s : 0.06% optimize.opt_b.updatestate_loads_eliminate : 0.000089s : 0.06% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000527s : 0.35% optimize.optimize_parallel_all_gather_comm : 0.000095s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000040s : 0.03% optimize.loop_unroll : 0.000572s : 0.38% optimize.opt_after_cconv.c_1 : 0.000406s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000062s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000046s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000045s : 0.03% optimize.opt_after_cconv.cse : 0.000286s : 0.19% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000515s : 0.35% optimize.tuple_transform.d_1 : 0.000509s : 0.34% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000067s : 0.04% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000259s : 0.17% optimize.cse_after_recomputation.cse : 0.000154s : 0.10% optimize.environ_conv : 0.000028s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000049s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000109s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000027s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000027s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000026s : 0.02% optimize.overlap_grad_flash_sp : 0.000140s : 0.09% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000019s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000081s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000118s : 0.08% optimize.symbol_engine_optimizer.opt_reshape : 0.000073s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000118s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000106s : 0.07% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000714s : 0.48% validate : 0.000162s : 0.11% backend_pass : 0.000001s : 0.00% task_emit : 0.047364s : 31.74% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.001385 720 6.71% : 0.000093s : 36: substitution.arithmetic_simplify 1.22% : 0.000017s : 52: substitution.elim_not_effective 0.57% : 0.000008s : 11: substitution.float_depend_g_call 1.22% : 0.000017s : 17: substitution.float_tuple_getitem_switch 1.21% : 0.000017s : 52: substitution.fold_const_symbol 2.86% : 0.000040s : 59: substitution.graph_param_transform 0.17% : 0.000002s : 2: substitution.incorporate_call 0.15% : 0.000002s : 2: substitution.incorporate_call_switch 42.79% : 0.000592s : 21: substitution.inline 1.20% : 0.000017s : 2: substitution.inline_without_move 3.12% : 0.000043s : 114: substitution.j_node_and_user_rematch 5.95% : 0.000082s : 34: substitution.less_batch_normalization 1.33% : 0.000018s : 13: substitution.minmaximum_grad 0.60% : 0.000008s : 11: substitution.partial_eliminate 4.51% : 0.000062s : 114: substitution.remove_not_recompute_node 1.52% : 0.000021s : 9: substitution.replace_applicator 0.71% : 0.000010s : 11: substitution.replace_old_param 0.20% : 0.000003s : 1: substitution.set_cell_output_no_recompute 0.94% : 0.000013s : 4: substitution.switch_simplify 1.76% : 0.000024s : 14: substitution.transpose_eliminate 4.57% : 0.000063s : 25: substitution.tuple_list_convert_item_index_to_positive 2.19% : 0.000030s : 25: substitution.tuple_list_get_item_const_eliminator 2.91% : 0.000040s : 25: substitution.tuple_list_get_item_depend_reorder 6.86% : 0.000095s : 40: substitution.tuple_list_get_item_eliminator 2.81% : 0.000039s : 25: substitution.tuple_list_get_set_item_eliminator 1.93% : 0.000027s : 1: substitution.zero_like_fill_zero ------[type_inference.] 0.047128 2 93.84% : 0.044227s : 1: type_inference.infer 6.16% : 0.002901s : 1: type_inference.specialize ------[replace.] 0.000295 33 52.74% : 0.000156s : 21: replace.inline 14.58% : 0.000043s : 4: replace.switch_simplify 27.13% : 0.000080s : 7: replace.tuple_list_get_item_eliminator 5.54% : 0.000016s : 1: replace.zero_like_fill_zero ------[match.] 0.000636 33 91.17% : 0.000580s : 21: match.inline 1.62% : 0.000010s : 4: match.switch_simplify 3.18% : 0.000020s : 7: match.tuple_list_get_item_eliminator 4.03% : 0.000026s : 1: match.zero_like_fill_zero ------[predicate.] 0.003169 24043 0.71% : 0.000023s : 191: predicate.accumulaten_eliminater 0.48% : 0.000015s : 59: predicate.ad_related_special_op_eliminate 0.64% : 0.000020s : 158: predicate.addn_check_dump 0.72% : 0.000023s : 191: predicate.addn_zero_filter 0.71% : 0.000022s : 191: predicate.adjust_all_reduce_mul_add 1.80% : 0.000057s : 349: predicate.arithmetic_simplify 0.73% : 0.000023s : 191: predicate.cast_eliminate 1.06% : 0.000034s : 256: predicate.check_bprop_eliminate 0.63% : 0.000020s : 158: predicate.compare_switch_simplify 0.51% : 0.000016s : 216: predicate.const_output_eliminate 0.65% : 0.000020s : 158: predicate.depend_value_elim 0.80% : 0.000025s : 191: predicate.dict_get_item_const_eliminator 0.86% : 0.000027s : 191: predicate.dict_get_item_eliminator 0.72% : 0.000023s : 191: predicate.dict_set_item_eliminator 1.17% : 0.000037s : 275: predicate.dumpgradient_eliminate 0.14% : 0.000004s : 59: predicate.elim_not_effective 0.28% : 0.000009s : 59: predicate.elim_shapecalc_of_broadcastargs 1.59% : 0.000050s : 407: predicate.environ_add_const_eliminate 1.57% : 0.000050s : 407: predicate.environ_get_add_eliminate 1.58% : 0.000050s : 407: predicate.environ_get_depend_swap 2.24% : 0.000071s : 565: predicate.environ_get_eliminate 1.57% : 0.000050s : 407: predicate.environ_get_set_eliminate 0.85% : 0.000027s : 219: predicate.exchange_switch_depend_value 1.13% : 0.000036s : 219: predicate.float_depend_g_call 0.63% : 0.000020s : 158: predicate.float_environ_get_switch 1.54% : 0.000049s : 374: predicate.float_tuple_getitem_switch 0.14% : 0.000004s : 59: predicate.fold_const_symbol 0.69% : 0.000022s : 158: predicate.get_grad_eliminate 0.15% : 0.000005s : 59: predicate.graph_param_transform 0.64% : 0.000020s : 158: predicate.incorporate_call 0.61% : 0.000019s : 158: predicate.incorporate_call_switch 5.52% : 0.000175s : 1014: predicate.inline 1.00% : 0.000032s : 185: predicate.inline_without_move 0.35% : 0.000011s : 158: predicate.j_node_and_user_rematch 0.79% : 0.000025s : 158: predicate.less_batch_normalization 1.96% : 0.000062s : 473: predicate.list_to_tuple_eliminator_ 2.59% : 0.000082s : 677: predicate.load_eliminater 0.61% : 0.000019s : 72: predicate.loop_unroll_after_grad 1.12% : 0.000035s : 259: predicate.loop_unroll_before_grad 1.91% : 0.000060s : 479: predicate.make_slice_get_slice_eliminator 0.64% : 0.000020s : 158: predicate.merge_addn 1.04% : 0.000033s : 256: predicate.micro_step_allgather_replace 1.05% : 0.000033s : 256: predicate.mini_step_allgather_replace 0.72% : 0.000023s : 191: predicate.minmaximum_grad 0.63% : 0.000020s : 73: predicate.mutable_eliminate 0.27% : 0.000008s : 59: predicate.opt_reshape 0.93% : 0.000030s : 216: predicate.parallel_virtual_node 1.19% : 0.000038s : 219: predicate.partial_defer_inline 1.20% : 0.000038s : 270: predicate.partial_eliminate 0.71% : 0.000023s : 191: predicate.print_const_string_wrapper 0.64% : 0.000020s : 158: predicate.reduce_all_const_elim 0.89% : 0.000028s : 191: predicate.reduce_eliminate 2.61% : 0.000083s : 677: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000011s : 158: predicate.remove_not_recompute_node 1.30% : 0.000041s : 454: predicate.replace_applicator 0.44% : 0.000014s : 185: predicate.replace_old_param 0.53% : 0.000017s : 216: predicate.reset_defer_inline 0.73% : 0.000023s : 191: predicate.reshape_eliminate 1.07% : 0.000034s : 256: predicate.row_tensor_add_zeros_like 0.66% : 0.000021s : 144: predicate.row_tensor_eliminate 1.28% : 0.000041s : 256: predicate.same_eliminate 0.41% : 0.000013s : 158: predicate.set_cell_output_no_recompute 0.70% : 0.000022s : 158: predicate.shard_identity_eliminate 1.18% : 0.000037s : 275: predicate.special_op_eliminate 0.71% : 0.000022s : 158: predicate.specialize_transform 1.10% : 0.000035s : 256: predicate.split_environ_get_set_with_tuple_value 0.89% : 0.000028s : 185: predicate.stack_unstack_eliminate 0.31% : 0.000010s : 72: predicate.switch_call_monad_eliminater 0.93% : 0.000029s : 219: predicate.switch_defer_inline 1.95% : 0.000062s : 475: predicate.switch_layer_defer_inline 3.08% : 0.000098s : 703: predicate.switch_simplify 0.72% : 0.000023s : 191: predicate.tile_eliminate 0.76% : 0.000024s : 191: predicate.transpose_eliminate 1.94% : 0.000061s : 466: predicate.tuple_list_convert_item_index_to_positive 2.00% : 0.000063s : 466: predicate.tuple_list_get_item_const_eliminator 1.99% : 0.000063s : 466: predicate.tuple_list_get_item_depend_reorder 2.87% : 0.000091s : 631: predicate.tuple_list_get_item_eliminator 1.98% : 0.000063s : 466: predicate.tuple_list_get_set_item_eliminator 2.73% : 0.000087s : 624: predicate.tuple_list_set_item_eliminator 1.89% : 0.000060s : 473: predicate.tuple_to_list_eliminator_ 2.57% : 0.000081s : 677: predicate.updatestate_pure_node_eliminater 3.41% : 0.000108s : 835: predicate.updatestate_useless_node_eliminater 0.91% : 0.000029s : 216: predicate.value_based_eliminate 0.70% : 0.000022s : 158: predicate.virtual_dataset_eliminate 0.70% : 0.000022s : 158: predicate.virtual_output_eliminate 0.26% : 0.000008s : 59: predicate.virtual_view_grad_eliminate 1.06% : 0.000034s : 218: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004516 51 62.36% : 0.002816s : 26: func_graph_cloner_run.FuncGraphClonerGraph 37.64% : 0.001700s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.254196 292 0.00% : 0.000003s : 1: ForceFp32Comm 1.26% : 0.003196s : 1: add_attr 1.25% : 0.003184s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000264s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.06% : 0.000164s : 1: auto_monad 0.04% : 0.000110s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.19% : 0.000488s : 1: bootstrap 0.02% : 0.000044s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000113s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.07% : 0.000172s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000032s : 1: environ_conv 0.12% : 0.000302s : 1: event_method 0.01% : 0.000013s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000015s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.23% : 0.000581s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.32% : 0.000822s : 1: mutable_eliminate 0.01% : 0.000030s : 1: offloading_packed_experts 0.04% : 0.000112s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000114s : 1: opt.transform.mutable_eliminate 7.34% : 0.018648s : 117: opt.transform.opt_a 0.16% : 0.000404s : 1: opt.transform.opt_after_cconv 0.08% : 0.000204s : 1: opt.transform.opt_after_jit_grad 2.26% : 0.005747s : 83: opt.transform.opt_b 0.22% : 0.000571s : 2: opt.transform.opt_trans_graph 0.15% : 0.000385s : 4: opt.transform.symbol_engine_opt 16.30% : 0.041441s : 1: opt_a 0.35% : 0.000902s : 1: opt_after_cconv 0.28% : 0.000724s : 1: opt_after_jit_grad 2.66% : 0.006762s : 1: opt_b 21.21% : 0.053919s : 1: optimize 0.04% : 0.000099s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.06% : 0.000143s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000029s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000030s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000003s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.02% : 0.000062s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.21% : 0.000524s : 1: remove_dup_value 4.55% : 0.011568s : 2: renormalize.infer 2.15% : 0.005464s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000228s : 1: rewriter_after_opt_a 0.14% : 0.000368s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000052s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000453s : 1: symbol_engine_optimizer 18.64% : 0.047376s : 1: task_emit 0.24% : 0.000613s : 1: tuple_transform 18.58% : 0.047238s : 1: type_inference 0.11% : 0.000270s : 1: validate TotalTime = 0.0465229, [24] [bootstrap]: 0.00045387 [type_inference]: 0.0293396 [event_method]: 0.00012032 [auto_monad]: 0.00016634 [graph_reusing]: 1.186e-05 [inline]: 1.85001e-06 [add_attr]: 0.00307331, [1] [add_attr_with_inline]: 0.00306565, [1] [Cycle 1]: 6.115e-05, [2] [tag_attr]: 2.564e-05 [meta_addattr_fg_expand]: 7.36999e-06 [parallel-infer-symbol]: 3.17997e-06 [pre_auto_parallel]: 3.841e-05 [insert-virtual-dataset]: 2.78e-06 [parallel-infer-symbol-second]: 6.79982e-07 [dataset_repeat_opt]: 1.86e-06 [pipeline_split]: 2.18998e-06 [optimize]: 0.00564222, [53] [py_interpret_to_execute]: 4.1e-06 [rewriter_before_opt_a]: 0.00024262 [opt_a]: 0.00345215, [2] [Cycle 1]: 0.0027635, [45] [expand_dump_flag]: 3.86001e-06 [switch_simplify]: 8.249e-05 [loop_unroll]: 3.942e-05 [a_1]: 0.00076383 [with_stream_mark]: 1.542e-05 [recompute_prepare]: 8.90001e-06 [updatestate_depend_eliminate]: 4.3e-06 [updatestate_assign_eliminate]: 3.78999e-06 [updatestate_loads_eliminate]: 3.86001e-06 [parameter_eliminate]: 1.84998e-06 [a_2]: 9.702e-05 [accelerated_algorithm]: 7.88001e-06 [shard]: 1.58002e-06 [meta_shard_fg_expand]: 2.32001e-06 [shard_inline]: 7.11999e-06 [merge_send_recv]: 8.97e-06 [auto_parallel]: 6.20002e-06 [parallel]: 1.591e-05 [flash_sp]: 6.91999e-06 [merge_comm]: 4.55999e-06 [allreduce_fusion]: 3.91001e-06 [matmul_add_comm_reduction]: 9.50001e-06 [allreduce_slice_to_reducescatter]: 5.99975e-07 [virtual_shard_identity]: 8.59e-06 [virtual_dataset]: 7.09001e-06 [get_grad_eliminate_]: 7.21001e-06 [virtual_output]: 7.13998e-06 [merge_forward]: 4.87998e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 9.94999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.365e-05 [merge_recompute_call_nodes]: 1.51998e-06 [before_grad]: 1.151e-05 [set_forward_comm_id_for_comm_node_pass]: 4.42998e-06 [meta_fg_expand]: 3.63e-06 [flash_sp_send_recv_attached]: 2.94001e-06 [receive_attached]: 2.21998e-06 [after_resolve]: 1.096e-05 [a_after_grad]: 1.022e-05 [renormalize]: 0.00120614 [add_forward_monad_depend]: 5.04e-06 [auto_monad_grad]: 1.74e-06 [auto_monad_eliminator]: 1.657e-05 [cse]: 3.865e-05 [a_3]: 5.033e-05 [Cycle 2]: 0.0006794, [45] [expand_dump_flag]: 1.14e-06 [switch_simplify]: 8.40999e-06 [loop_unroll]: 7.33e-06 [a_1]: 0.00015546 [with_stream_mark]: 1.123e-05 [recompute_prepare]: 6.91999e-06 [updatestate_depend_eliminate]: 3.71001e-06 [updatestate_assign_eliminate]: 3.2e-06 [updatestate_loads_eliminate]: 3.31001e-06 [parameter_eliminate]: 1.04998e-06 [a_2]: 8.499e-05 [accelerated_algorithm]: 6.47001e-06 [shard]: 9.20001e-07 [meta_shard_fg_expand]: 1.66e-06 [shard_inline]: 6.45002e-06 [merge_send_recv]: 5.34e-06 [auto_parallel]: 5.95002e-06 [parallel]: 4.18999e-06 [flash_sp]: 2.96999e-06 [merge_comm]: 3.92002e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 6.04001e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 6.91001e-06 [virtual_dataset]: 6.34999e-06 [get_grad_eliminate_]: 6.26e-06 [virtual_output]: 5.86e-06 [merge_forward]: 3.16999e-06 [cell_reuse_recompute_pass]: 1.21002e-06 [offload_activation]: 6.35002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.161e-05 [merge_recompute_call_nodes]: 6.80011e-07 [before_grad]: 1.008e-05 [set_forward_comm_id_for_comm_node_pass]: 4.31002e-06 [meta_fg_expand]: 2.51998e-06 [flash_sp_send_recv_attached]: 7.89994e-07 [receive_attached]: 9.80013e-07 [after_resolve]: 9.07001e-06 [a_after_grad]: 9.18997e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.27e-06 [auto_monad_grad]: 9.20001e-07 [auto_monad_eliminator]: 7.92e-06 [cse]: 2.003e-05 [a_3]: 3.833e-05 [py_interpret_to_execute_after_opt_a]: 3.76999e-06 [slice_cell_reuse_recomputed_activation]: 1.88002e-06 [rewriter_after_opt_a]: 2.19e-05 [convert_after_rewriter]: 1.24998e-06 [order_py_execute_after_rewriter]: 1.15999e-06 [mutable_eliminate]: 0.00049214 [opt_b]: 0.00024633, [1] [Cycle 1]: 0.00024064, [7] [b_1]: 0.00016109 [b_2]: 8.2e-06 [updatestate_depend_eliminate]: 6.07999e-06 [updatestate_assign_eliminate]: 3.28e-06 [updatestate_loads_eliminate]: 3.00998e-06 [renormalize]: 4.19997e-07 [cse]: 2.428e-05 [optimize_parallel_all_gather_comm]: 1.625e-05 [overlap_param_gather]: 2.41998e-06 [cconv]: 2.211e-05 [loop_unroll]: 0.00042821 [opt_after_cconv]: 0.00010861, [1] [Cycle 1]: 0.00010326, [7] [c_1]: 3.262e-05 [parameter_eliminate]: 2.36e-06 [updatestate_depend_eliminate]: 5.74e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 3.51001e-06 [cse]: 2.32e-05 [renormalize]: 4.60015e-07 [remove_dup_value]: 2.837e-05 [tuple_transform]: 9.467e-05, [1] [Cycle 1]: 9.043e-05, [4] [d_1]: 6.173e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 7.72002e-06 [partial_unused_args_eliminate]: 2.27001e-06 [add_recomputation]: 4.65e-05 [cse_after_recomputation]: 2.552e-05, [1] [Cycle 1]: 2.141e-05, [1] [cse]: 1.631e-05 [environ_conv]: 7.48999e-06 [swap_dp_allreduce_reducescatter]: 6.17999e-06 [bias_add_comm_swap]: 2.71999e-06 [label_micro_interleaved_index]: 3.8e-06 [label_fine_grained_interleaved_index]: 2.84001e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 1.97999e-06 [micro_interleaved_order_control]: 2.02999e-06 [assign_add_opt]: 1.42999e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.04e-06 [full_micro_interleaved_order_control]: 2.28998e-06 [reorder_send_recv_between_fp_bp]: 2.53e-06 [comm_op_add_attrs]: 1.04003e-06 [add_comm_op_reuse_tag]: 9.60019e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.09998e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94999e-06 [control_data_broadcast_order]: 1.363e-05 [grouped_pairwise_exchange_alltoall]: 1.68002e-06 [offloading_packed_experts]: 4.42e-06 [overlap_recompute_and_grad_model_parallel]: 4.98001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.09e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29998e-06 [overlap_recompute_comm]: 2.09e-06 [overlap_grad_ring_attention]: 5.04e-06 [overlap_grad_flash_sp]: 1.874e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.04e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 1.01002e-06 [symbol_engine_optimizer]: 8.569e-05, [1] [Cycle 1]: 8.094e-05, [6] [build]: 1.021e-05 [elim_shapecalc]: 1.067e-05 [elim_not_effective]: 1.393e-05 [opt_reshape]: 7.68999e-06 [fold_const_symbol]: 1.052e-05 [renormalize]: 1.79978e-07 [detach_backward]: 1.52001e-06 [pipeline_parallel_scheduler]: 1.29e-06 [auto_monad_reorder]: 1.913e-05 [get_jit_bprop_graph]: 9.29984e-07 [rewriter_after_jit_bprop_graph]: 3.23e-06 [opt_after_jit_grad]: 0.00046105 [validate]: 3.933e-05 [backend_pass]: 1.03001e-06 [task_emit]: 0.00692294 [execute]: 7.28e-06 Sums bootstrap : 0.000454s : 1.07% type_inference : 0.029340s : 69.06% event_method : 0.000120s : 0.28% auto_monad : 0.000166s : 0.39% graph_reusing : 0.000012s : 0.03% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000038s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000243s : 0.57% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000091s : 0.21% optimize.opt_a.loop_unroll : 0.000047s : 0.11% optimize.opt_a.a_1 : 0.000919s : 2.16% optimize.opt_a.with_stream_mark : 0.000027s : 0.06% optimize.opt_a.recompute_prepare : 0.000016s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000182s : 0.43% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.03% optimize.opt_a.merge_send_recv : 0.000014s : 0.03% optimize.opt_a.auto_parallel : 0.000012s : 0.03% optimize.opt_a.parallel : 0.000020s : 0.05% optimize.opt_a.flash_sp : 0.000010s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.04% optimize.opt_a.virtual_dataset : 0.000013s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.03% optimize.opt_a.virtual_output : 0.000013s : 0.03% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000022s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000020s : 0.05% optimize.opt_a.a_after_grad : 0.000019s : 0.05% optimize.opt_a.renormalize : 0.001206s : 2.84% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.06% optimize.opt_a.cse : 0.000059s : 0.14% optimize.opt_a.a_3 : 0.000089s : 0.21% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000022s : 0.05% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000492s : 1.16% optimize.opt_b.b_1 : 0.000161s : 0.38% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000022s : 0.05% optimize.loop_unroll : 0.000428s : 1.01% optimize.opt_after_cconv.c_1 : 0.000033s : 0.08% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.cse : 0.000023s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000028s : 0.07% optimize.tuple_transform.d_1 : 0.000062s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000047s : 0.11% optimize.cse_after_recomputation.cse : 0.000016s : 0.04% optimize.environ_conv : 0.000007s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000019s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000010s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000019s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000461s : 1.09% validate : 0.000039s : 0.09% backend_pass : 0.000001s : 0.00% task_emit : 0.006923s : 16.30% execute : 0.000007s : 0.02% Time group info: ------[substitution.] 0.000282 62 0.68% : 0.000002s : 3: substitution.elim_not_effective 2.11% : 0.000006s : 3: substitution.float_tuple_getitem_switch 0.57% : 0.000002s : 3: substitution.fold_const_symbol 2.06% : 0.000006s : 4: substitution.graph_param_transform 61.54% : 0.000173s : 8: substitution.inline 1.33% : 0.000004s : 6: substitution.j_node_and_user_rematch 1.49% : 0.000004s : 2: substitution.minmaximum_grad 2.12% : 0.000006s : 6: substitution.remove_not_recompute_node 1.09% : 0.000003s : 2: substitution.replace_old_param 2.64% : 0.000007s : 1: substitution.switch_simplify 5.34% : 0.000015s : 4: substitution.tuple_list_convert_item_index_to_positive 2.38% : 0.000007s : 4: substitution.tuple_list_get_item_const_eliminator 3.40% : 0.000010s : 4: substitution.tuple_list_get_item_depend_reorder 10.04% : 0.000028s : 8: substitution.tuple_list_get_item_eliminator 3.20% : 0.000009s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.029273 2 94.17% : 0.027568s : 1: type_inference.infer 5.83% : 0.001706s : 1: type_inference.specialize ------[replace.] 0.000087 11 64.71% : 0.000056s : 8: replace.inline 16.71% : 0.000014s : 1: replace.switch_simplify 18.58% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000179 11 94.47% : 0.000169s : 8: match.inline 3.43% : 0.000006s : 1: match.switch_simplify 2.10% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000228 1438 0.99% : 0.000002s : 16: predicate.accumulaten_eliminater 0.95% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 1.00% : 0.000002s : 16: predicate.addn_zero_filter 0.91% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.11% : 0.000005s : 24: predicate.arithmetic_simplify 0.97% : 0.000002s : 16: predicate.cast_eliminate 0.51% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.49% : 0.000001s : 8: predicate.depend_value_elim 1.00% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.23% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.28% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.23% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.18% : 0.000003s : 20: predicate.environ_get_depend_swap 1.80% : 0.000004s : 28: predicate.environ_get_eliminate 1.18% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.62% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.45% : 0.000006s : 26: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.58% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 5.88% : 0.000013s : 66: predicate.inline 0.67% : 0.000002s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.86% : 0.000002s : 8: predicate.less_batch_normalization 1.59% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.61% : 0.000006s : 42: predicate.load_eliminater 0.89% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.82% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.61% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.47% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.47% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.97% : 0.000002s : 16: predicate.minmaximum_grad 0.95% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 2.03% : 0.000005s : 26: predicate.partial_defer_inline 1.46% : 0.000003s : 22: predicate.partial_eliminate 1.02% : 0.000002s : 16: predicate.print_const_string_wrapper 0.48% : 0.000001s : 8: predicate.reduce_all_const_elim 1.39% : 0.000003s : 16: predicate.reduce_eliminate 2.50% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000001s : 8: predicate.remove_not_recompute_node 1.18% : 0.000003s : 26: predicate.replace_applicator 0.43% : 0.000001s : 8: predicate.replace_old_param 0.19% : 0.000000s : 4: predicate.reset_defer_inline 1.03% : 0.000002s : 16: predicate.reshape_eliminate 0.63% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.66% : 0.000001s : 8: predicate.same_eliminate 0.36% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 8: predicate.shard_identity_eliminate 0.76% : 0.000002s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 0.68% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.73% : 0.000004s : 26: predicate.switch_defer_inline 2.16% : 0.000005s : 34: predicate.switch_layer_defer_inline 6.03% : 0.000014s : 86: predicate.switch_simplify 1.00% : 0.000002s : 16: predicate.tile_eliminate 0.96% : 0.000002s : 16: predicate.transpose_eliminate 1.71% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.79% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.63% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.17% : 0.000007s : 34: predicate.tuple_list_get_item_eliminator 1.68% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.62% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.33% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 3.01% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.27% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001421 23 55.09% : 0.000783s : 11: func_graph_cloner_run.FuncGraphClonerGraph 44.91% : 0.000638s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.058130 196 0.01% : 0.000004s : 1: ForceFp32Comm 5.29% : 0.003078s : 1: add_attr 5.28% : 0.003069s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.09% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.30% : 0.000176s : 1: auto_monad 0.04% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.83% : 0.000484s : 1: bootstrap 0.04% : 0.000026s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000017s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.05% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.22% : 0.000129s : 1: event_method 0.02% : 0.000012s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.03% : 0.000016s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.75% : 0.000436s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.86% : 0.000500s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000017s : 1: opt.transform.mutable_eliminate 2.47% : 0.001433s : 78: opt.transform.opt_a 0.05% : 0.000031s : 1: opt.transform.opt_after_cconv 0.05% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.24% : 0.000141s : 28: opt.transform.opt_b 0.12% : 0.000067s : 2: opt.transform.opt_trans_graph 0.07% : 0.000039s : 4: opt.transform.symbol_engine_opt 5.94% : 0.003455s : 1: opt_a 0.19% : 0.000112s : 1: opt_after_cconv 0.81% : 0.000470s : 1: opt_after_jit_grad 0.43% : 0.000250s : 1: opt_b 9.71% : 0.005646s : 1: optimize 0.03% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000043s : 1: pre_auto_parallel 0.01% : 0.000007s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000032s : 1: remove_dup_value 1.04% : 0.000602s : 1: renormalize.infer 1.02% : 0.000596s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000025s : 1: rewriter_after_opt_a 0.43% : 0.000249s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000088s : 1: symbol_engine_optimizer 11.93% : 0.006934s : 1: task_emit 0.17% : 0.000097s : 1: tuple_transform 50.50% : 0.029354s : 1: type_inference 0.12% : 0.000070s : 1: validate TotalTime = 0.120207, [24] [bootstrap]: 0.00048464 [type_inference]: 0.0469173 [event_method]: 0.00027916 [auto_monad]: 0.00024609 [graph_reusing]: 1.81e-05 [inline]: 2.14e-06 [add_attr]: 0.00313737, [1] [add_attr_with_inline]: 0.0031291, [1] [Cycle 1]: 8.977e-05, [2] [tag_attr]: 4.557e-05 [meta_addattr_fg_expand]: 1.341e-05 [parallel-infer-symbol]: 2.93e-06 [pre_auto_parallel]: 6.291e-05 [insert-virtual-dataset]: 2.33998e-06 [parallel-infer-symbol-second]: 1.33002e-06 [dataset_repeat_opt]: 1.60999e-06 [pipeline_split]: 1.51002e-06 [optimize]: 0.0406067, [53] [py_interpret_to_execute]: 4.90999e-06 [rewriter_before_opt_a]: 0.00036046 [opt_a]: 0.0376577, [3] [Cycle 1]: 0.0314153, [45] [expand_dump_flag]: 5.15999e-06 [switch_simplify]: 0.00016476 [loop_unroll]: 7.27e-05 [a_1]: 0.00160811 [with_stream_mark]: 2.36e-05 [recompute_prepare]: 2.2e-05 [updatestate_depend_eliminate]: 8.84e-06 [updatestate_assign_eliminate]: 8.23001e-06 [updatestate_loads_eliminate]: 7.55998e-06 [parameter_eliminate]: 2.66999e-06 [a_2]: 0.00022437 [accelerated_algorithm]: 1.532e-05 [shard]: 1.66e-06 [meta_shard_fg_expand]: 4.90999e-06 [shard_inline]: 1.479e-05 [merge_send_recv]: 1.572e-05 [auto_parallel]: 1.074e-05 [parallel]: 1.732e-05 [flash_sp]: 9.87999e-06 [merge_comm]: 9.64999e-06 [allreduce_fusion]: 9.12001e-06 [matmul_add_comm_reduction]: 2.444e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 1.718e-05 [virtual_dataset]: 1.526e-05 [get_grad_eliminate_]: 1.61e-05 [virtual_output]: 1.486e-05 [merge_forward]: 9.25001e-06 [cell_reuse_recompute_pass]: 1.22999e-06 [offload_activation]: 1.664e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.976e-05 [merge_recompute_call_nodes]: 1.66e-06 [before_grad]: 2.747e-05 [set_forward_comm_id_for_comm_node_pass]: 9.59e-06 [meta_fg_expand]: 0.00454378 [flash_sp_send_recv_attached]: 3.99002e-06 [receive_attached]: 2.79999e-06 [after_resolve]: 9.141e-05 [a_after_grad]: 0.00012875 [renormalize]: 0.0220485 [add_forward_monad_depend]: 1.554e-05 [auto_monad_grad]: 1.308e-05 [auto_monad_eliminator]: 0.00011122 [cse]: 0.00031578 [a_3]: 0.00140433 [Cycle 2]: 0.0049521, [45] [expand_dump_flag]: 2.38002e-06 [switch_simplify]: 9.139e-05 [loop_unroll]: 8.311e-05 [a_1]: 0.00174778 [with_stream_mark]: 1.823e-05 [recompute_prepare]: 1.624e-05 [updatestate_depend_eliminate]: 8.28001e-06 [updatestate_assign_eliminate]: 7.15e-06 [updatestate_loads_eliminate]: 6.63e-06 [parameter_eliminate]: 1.07e-06 [a_2]: 0.00020253 [accelerated_algorithm]: 1.406e-05 [shard]: 1.02998e-06 [meta_shard_fg_expand]: 3.53e-06 [shard_inline]: 1.399e-05 [merge_send_recv]: 1.034e-05 [auto_parallel]: 1.098e-05 [parallel]: 4.46002e-06 [flash_sp]: 3.08e-06 [merge_comm]: 8.42998e-06 [allreduce_fusion]: 7.78999e-06 [matmul_add_comm_reduction]: 1.119e-05 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 1.432e-05 [virtual_dataset]: 1.323e-05 [get_grad_eliminate_]: 1.288e-05 [virtual_output]: 1.303e-05 [merge_forward]: 7.21001e-06 [cell_reuse_recompute_pass]: 1.12e-06 [offload_activation]: 1.187e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.438e-05 [merge_recompute_call_nodes]: 8.29983e-07 [before_grad]: 2.321e-05 [set_forward_comm_id_for_comm_node_pass]: 8.50999e-06 [meta_fg_expand]: 0.00018464 [flash_sp_send_recv_attached]: 1.17e-06 [receive_attached]: 1.39e-06 [after_resolve]: 2.191e-05 [a_after_grad]: 2.105e-05 [renormalize]: 0.00175861 [add_forward_monad_depend]: 4.32e-06 [auto_monad_grad]: 1.40001e-06 [auto_monad_eliminator]: 2.25e-05 [cse]: 0.00017295 [a_3]: 0.00010291 [Cycle 3]: 0.00127585, [45] [expand_dump_flag]: 1.29998e-06 [switch_simplify]: 1.575e-05 [loop_unroll]: 1.367e-05 [a_1]: 0.00036091 [with_stream_mark]: 2.947e-05 [recompute_prepare]: 1.589e-05 [updatestate_depend_eliminate]: 7.99002e-06 [updatestate_assign_eliminate]: 6.96999e-06 [updatestate_loads_eliminate]: 7.16001e-06 [parameter_eliminate]: 1.06002e-06 [a_2]: 0.00019891 [accelerated_algorithm]: 1.388e-05 [shard]: 1.06997e-06 [meta_shard_fg_expand]: 3.26001e-06 [shard_inline]: 1.322e-05 [merge_send_recv]: 9.91e-06 [auto_parallel]: 1.048e-05 [parallel]: 3.91999e-06 [flash_sp]: 9.79984e-07 [merge_comm]: 8.27e-06 [allreduce_fusion]: 8e-06 [matmul_add_comm_reduction]: 1.124e-05 [allreduce_slice_to_reducescatter]: 3.00002e-07 [virtual_shard_identity]: 1.421e-05 [virtual_dataset]: 1.299e-05 [get_grad_eliminate_]: 1.287e-05 [virtual_output]: 1.298e-05 [merge_forward]: 7.18998e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 1.123e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.45e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 2.201e-05 [set_forward_comm_id_for_comm_node_pass]: 8.27998e-06 [meta_fg_expand]: 5.20001e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 9.50007e-07 [after_resolve]: 1.582e-05 [a_after_grad]: 1.981e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.37999e-06 [auto_monad_grad]: 1.01997e-06 [auto_monad_eliminator]: 1.617e-05 [cse]: 5e-05 [a_3]: 8.969e-05 [py_interpret_to_execute_after_opt_a]: 4.25e-06 [slice_cell_reuse_recomputed_activation]: 2.51998e-06 [rewriter_after_opt_a]: 4.482e-05 [convert_after_rewriter]: 1.27e-06 [order_py_execute_after_rewriter]: 1.09e-06 [mutable_eliminate]: 0.00050429 [opt_b]: 0.00047599, [1] [Cycle 1]: 0.0004699, [7] [b_1]: 0.000339 [b_2]: 1.527e-05 [updatestate_depend_eliminate]: 1.038e-05 [updatestate_assign_eliminate]: 7e-06 [updatestate_loads_eliminate]: 6.89999e-06 [renormalize]: 3.9002e-07 [cse]: 5.384e-05 [optimize_parallel_all_gather_comm]: 2.484e-05 [overlap_param_gather]: 1.88002e-06 [cconv]: 2.074e-05 [loop_unroll]: 0.00045265 [opt_after_cconv]: 0.00018463, [1] [Cycle 1]: 0.00017873, [7] [c_1]: 6.617e-05 [parameter_eliminate]: 2.29001e-06 [updatestate_depend_eliminate]: 1.096e-05 [updatestate_assign_eliminate]: 6.99001e-06 [updatestate_loads_eliminate]: 6.76e-06 [cse]: 5.186e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 7.964e-05 [tuple_transform]: 0.00016298, [1] [Cycle 1]: 0.00015787, [4] [d_1]: 0.00012171 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 1.472e-05 [partial_unused_args_eliminate]: 2.11e-06 [add_recomputation]: 6.633e-05 [cse_after_recomputation]: 4.856e-05, [1] [Cycle 1]: 4.38e-05, [1] [cse]: 3.803e-05 [environ_conv]: 1.076e-05 [swap_dp_allreduce_reducescatter]: 1.042e-05 [bias_add_comm_swap]: 2.45002e-06 [label_micro_interleaved_index]: 3.89002e-06 [label_fine_grained_interleaved_index]: 2.41e-06 [merge_cast_opt]: 1.35001e-06 [slice_recompute_activation]: 1.95001e-06 [micro_interleaved_order_control]: 2.17001e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 7.30011e-07 [remove_cast_before_assign_add]: 1.06002e-06 [full_micro_interleaved_order_control]: 2.09e-06 [reorder_send_recv_between_fp_bp]: 2.73e-06 [comm_op_add_attrs]: 1.07998e-06 [add_comm_op_reuse_tag]: 9.30013e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.59e-06 [overlap_opt_shard_grad_in_pipeline]: 1.99e-06 [control_data_broadcast_order]: 2.897e-05 [grouped_pairwise_exchange_alltoall]: 1.45999e-06 [offloading_packed_experts]: 7.48999e-06 [overlap_recompute_and_grad_model_parallel]: 8.72e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29e-06 [overlap_recompute_comm]: 2.46e-06 [overlap_grad_ring_attention]: 7.53e-06 [overlap_grad_flash_sp]: 3.638e-05 [begin_end_overlap_inline]: 7.00005e-07 [split_matmul_comm_elemetwise]: 2.06998e-06 [split_layernorm_comm]: 1.62001e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 0.0001268, [1] [Cycle 1]: 0.00012117, [6] [build]: 9.32001e-06 [elim_shapecalc]: 1.822e-05 [elim_not_effective]: 2.63e-05 [opt_reshape]: 1.477e-05 [fold_const_symbol]: 2.307e-05 [renormalize]: 2.19996e-07 [detach_backward]: 1.71e-06 [pipeline_parallel_scheduler]: 1.40001e-06 [auto_monad_reorder]: 3.435e-05 [get_jit_bprop_graph]: 1.20999e-06 [rewriter_after_jit_bprop_graph]: 3.59002e-06 [opt_after_jit_grad]: 0.00053482 [validate]: 6.337e-05 [backend_pass]: 1.07e-06 [task_emit]: 0.0275651 [execute]: 7.65998e-06 Sums bootstrap : 0.000485s : 0.42% type_inference : 0.046917s : 40.53% event_method : 0.000279s : 0.24% auto_monad : 0.000246s : 0.21% graph_reusing : 0.000018s : 0.02% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000046s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000063s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000360s : 0.31% optimize.opt_a.expand_dump_flag : 0.000009s : 0.01% optimize.opt_a.switch_simplify : 0.000272s : 0.23% optimize.opt_a.loop_unroll : 0.000169s : 0.15% optimize.opt_a.a_1 : 0.003717s : 3.21% optimize.opt_a.with_stream_mark : 0.000071s : 0.06% optimize.opt_a.recompute_prepare : 0.000054s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000025s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000022s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000021s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000626s : 0.54% optimize.opt_a.accelerated_algorithm : 0.000043s : 0.04% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000012s : 0.01% optimize.opt_a.shard_inline : 0.000042s : 0.04% optimize.opt_a.merge_send_recv : 0.000036s : 0.03% optimize.opt_a.auto_parallel : 0.000032s : 0.03% optimize.opt_a.parallel : 0.000026s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000026s : 0.02% optimize.opt_a.allreduce_fusion : 0.000025s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000047s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000046s : 0.04% optimize.opt_a.virtual_dataset : 0.000041s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000042s : 0.04% optimize.opt_a.virtual_output : 0.000041s : 0.04% optimize.opt_a.merge_forward : 0.000024s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000040s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000079s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000073s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000026s : 0.02% optimize.opt_a.meta_fg_expand : 0.004734s : 4.09% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000129s : 0.11% optimize.opt_a.a_after_grad : 0.000170s : 0.15% optimize.opt_a.renormalize : 0.023807s : 20.57% optimize.opt_a.add_forward_monad_depend : 0.000021s : 0.02% optimize.opt_a.auto_monad_grad : 0.000016s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000150s : 0.13% optimize.opt_a.cse : 0.000539s : 0.47% optimize.opt_a.a_3 : 0.001597s : 1.38% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000045s : 0.04% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000504s : 0.44% optimize.opt_b.b_1 : 0.000339s : 0.29% optimize.opt_b.b_2 : 0.000015s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000054s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000021s : 0.02% optimize.loop_unroll : 0.000453s : 0.39% optimize.opt_after_cconv.c_1 : 0.000066s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.cse : 0.000052s : 0.04% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000080s : 0.07% optimize.tuple_transform.d_1 : 0.000122s : 0.11% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000015s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000066s : 0.06% optimize.cse_after_recomputation.cse : 0.000038s : 0.03% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000029s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.01% optimize.overlap_grad_flash_sp : 0.000036s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000026s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000015s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000023s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000034s : 0.03% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000535s : 0.46% validate : 0.000063s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.027565s : 23.81% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.001536 311 0.24% : 0.000004s : 8: substitution.elim_not_effective 0.55% : 0.000008s : 12: substitution.float_depend_g_call 0.71% : 0.000011s : 9: substitution.float_tuple_getitem_switch 0.21% : 0.000003s : 8: substitution.fold_const_symbol 33.24% : 0.000511s : 5: substitution.getattr_setattr_resolve 0.60% : 0.000009s : 10: substitution.graph_param_transform 0.18% : 0.000003s : 2: substitution.incorporate_call 0.11% : 0.000002s : 2: substitution.incorporate_call_switch 38.67% : 0.000594s : 24: substitution.inline 1.36% : 0.000021s : 3: substitution.inline_without_move 0.86% : 0.000013s : 25: substitution.j_node_and_user_rematch 1.20% : 0.000018s : 13: substitution.minmaximum_grad 0.59% : 0.000009s : 12: substitution.partial_eliminate 1.09% : 0.000017s : 25: substitution.remove_not_recompute_node 4.90% : 0.000075s : 32: substitution.replace_applicator 0.65% : 0.000010s : 14: substitution.replace_old_param 0.17% : 0.000003s : 1: substitution.set_cell_output_no_recompute 0.79% : 0.000012s : 4: substitution.switch_simplify 0.52% : 0.000008s : 2: substitution.transpose_eliminate 2.91% : 0.000045s : 17: substitution.tuple_list_convert_item_index_to_positive 1.41% : 0.000022s : 17: substitution.tuple_list_get_item_const_eliminator 1.86% : 0.000028s : 17: substitution.tuple_list_get_item_depend_reorder 5.30% : 0.000081s : 32: substitution.tuple_list_get_item_eliminator 1.87% : 0.000029s : 17: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.046811 2 93.11% : 0.043586s : 1: type_inference.infer 6.89% : 0.003224s : 1: type_inference.specialize ------[replace.] 0.000506 45 11.20% : 0.000057s : 4: replace.getattr_setattr_resolve 53.23% : 0.000269s : 24: replace.inline 14.08% : 0.000071s : 5: replace.replace_applicator 8.17% : 0.000041s : 4: replace.switch_simplify 13.31% : 0.000067s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.001107 45 42.30% : 0.000468s : 4: match.getattr_setattr_resolve 52.49% : 0.000581s : 24: match.inline 2.32% : 0.000026s : 5: match.replace_applicator 0.90% : 0.000010s : 4: match.switch_simplify 1.99% : 0.000022s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.001000 7110 0.90% : 0.000009s : 68: predicate.accumulaten_eliminater 0.36% : 0.000004s : 10: predicate.ad_related_special_op_eliminate 0.42% : 0.000004s : 32: predicate.addn_check_dump 0.86% : 0.000009s : 68: predicate.addn_zero_filter 0.84% : 0.000008s : 68: predicate.adjust_all_reduce_mul_add 1.86% : 0.000019s : 100: predicate.arithmetic_simplify 0.89% : 0.000009s : 68: predicate.cast_eliminate 2.85% : 0.000028s : 215: predicate.check_bprop_eliminate 0.43% : 0.000004s : 32: predicate.compare_switch_simplify 0.08% : 0.000001s : 10: predicate.const_output_eliminate 0.46% : 0.000005s : 32: predicate.depend_value_elim 0.94% : 0.000009s : 68: predicate.dict_get_item_const_eliminator 1.02% : 0.000010s : 68: predicate.dict_get_item_eliminator 0.89% : 0.000009s : 68: predicate.dict_set_item_eliminator 0.35% : 0.000004s : 20: predicate.dumpgradient_eliminate 0.09% : 0.000001s : 10: predicate.elim_not_effective 0.16% : 0.000002s : 10: predicate.elim_shapecalc_of_broadcastargs 1.01% : 0.000010s : 78: predicate.environ_add_const_eliminate 0.99% : 0.000010s : 78: predicate.environ_get_add_eliminate 0.98% : 0.000010s : 78: predicate.environ_get_depend_swap 1.44% : 0.000014s : 110: predicate.environ_get_eliminate 0.97% : 0.000010s : 78: predicate.environ_get_set_eliminate 1.35% : 0.000014s : 100: predicate.exchange_switch_depend_value 1.83% : 0.000018s : 100: predicate.float_depend_g_call 0.43% : 0.000004s : 32: predicate.float_environ_get_switch 0.62% : 0.000006s : 42: predicate.float_tuple_getitem_switch 0.08% : 0.000001s : 10: predicate.fold_const_symbol 0.48% : 0.000005s : 32: predicate.get_grad_eliminate 0.64% : 0.000006s : 31: predicate.getattr_setattr_resolve 0.09% : 0.000001s : 10: predicate.graph_param_transform 0.44% : 0.000004s : 32: predicate.incorporate_call 0.40% : 0.000004s : 32: predicate.incorporate_call_switch 4.50% : 0.000045s : 252: predicate.inline 1.50% : 0.000015s : 82: predicate.inline_without_move 0.23% : 0.000002s : 32: predicate.j_node_and_user_rematch 0.55% : 0.000006s : 32: predicate.less_batch_normalization 1.30% : 0.000013s : 96: predicate.list_to_tuple_eliminator_ 2.05% : 0.000021s : 164: predicate.load_eliminater 0.38% : 0.000004s : 10: predicate.loop_unroll_after_grad 2.57% : 0.000026s : 182: predicate.loop_unroll_before_grad 1.15% : 0.000012s : 88: predicate.make_slice_get_slice_eliminator 0.46% : 0.000005s : 32: predicate.merge_addn 2.63% : 0.000026s : 198: predicate.micro_step_allgather_replace 2.63% : 0.000026s : 198: predicate.mini_step_allgather_replace 0.90% : 0.000009s : 68: predicate.minmaximum_grad 0.37% : 0.000004s : 10: predicate.mutable_eliminate 0.16% : 0.000002s : 10: predicate.opt_reshape 0.17% : 0.000002s : 10: predicate.parallel_virtual_node 1.85% : 0.000018s : 100: predicate.partial_defer_inline 1.29% : 0.000013s : 86: predicate.partial_eliminate 0.91% : 0.000009s : 68: predicate.print_const_string_wrapper 0.45% : 0.000005s : 32: predicate.reduce_all_const_elim 1.23% : 0.000012s : 68: predicate.reduce_eliminate 2.07% : 0.000021s : 164: predicate.redundant_stop_gradient_eliminater 0.26% : 0.000003s : 32: predicate.remove_not_recompute_node 2.37% : 0.000024s : 284: predicate.replace_applicator 0.67% : 0.000007s : 82: predicate.replace_old_param 0.09% : 0.000001s : 10: predicate.reset_defer_inline 0.91% : 0.000009s : 68: predicate.reshape_eliminate 2.73% : 0.000027s : 198: predicate.row_tensor_add_zeros_like 0.17% : 0.000002s : 10: predicate.row_tensor_eliminate 3.08% : 0.000031s : 215: predicate.same_eliminate 0.29% : 0.000003s : 32: predicate.set_cell_output_no_recompute 0.52% : 0.000005s : 32: predicate.shard_identity_eliminate 0.31% : 0.000003s : 20: predicate.special_op_eliminate 0.51% : 0.000005s : 32: predicate.specialize_transform 2.72% : 0.000027s : 198: predicate.split_environ_get_set_with_tuple_value 1.29% : 0.000013s : 82: predicate.stack_unstack_eliminate 0.15% : 0.000002s : 10: predicate.switch_call_monad_eliminater 1.49% : 0.000015s : 100: predicate.switch_defer_inline 4.28% : 0.000043s : 315: predicate.switch_layer_defer_inline 4.97% : 0.000050s : 332: predicate.switch_simplify 0.88% : 0.000009s : 68: predicate.tile_eliminate 0.88% : 0.000009s : 68: predicate.transpose_eliminate 1.38% : 0.000014s : 88: predicate.tuple_list_convert_item_index_to_positive 1.34% : 0.000013s : 88: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000013s : 88: predicate.tuple_list_get_item_depend_reorder 2.27% : 0.000023s : 128: predicate.tuple_list_get_item_eliminator 1.30% : 0.000013s : 88: predicate.tuple_list_get_set_item_eliminator 1.86% : 0.000019s : 120: predicate.tuple_list_set_item_eliminator 1.30% : 0.000013s : 96: predicate.tuple_to_list_eliminator_ 2.05% : 0.000021s : 164: predicate.updatestate_pure_node_eliminater 2.56% : 0.000026s : 196: predicate.updatestate_useless_node_eliminater 0.16% : 0.000002s : 10: predicate.value_based_eliminate 0.49% : 0.000005s : 32: predicate.virtual_dataset_eliminate 0.49% : 0.000005s : 32: predicate.virtual_output_eliminate 0.15% : 0.000001s : 10: predicate.virtual_view_grad_eliminate 0.19% : 0.000002s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005053 75 63.42% : 0.003204s : 36: func_graph_cloner_run.FuncGraphClonerGraph 36.58% : 0.001848s : 39: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.195971 247 0.00% : 0.000003s : 1: ForceFp32Comm 1.60% : 0.003142s : 1: add_attr 1.60% : 0.003133s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000070s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.13% : 0.000258s : 1: auto_monad 0.02% : 0.000039s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.26% : 0.000515s : 1: bootstrap 0.01% : 0.000024s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000032s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000052s : 1: cse_after_recomputation 0.00% : 0.000004s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.15% : 0.000291s : 1: event_method 0.01% : 0.000013s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000022s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.23% : 0.000460s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.26% : 0.000513s : 1: mutable_eliminate 0.01% : 0.000011s : 1: offloading_packed_experts 0.01% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000025s : 1: opt.transform.mutable_eliminate 3.57% : 0.006989s : 125: opt.transform.opt_a 0.03% : 0.000065s : 1: opt.transform.opt_after_cconv 0.03% : 0.000050s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000324s : 28: opt.transform.opt_b 0.32% : 0.000626s : 2: opt.transform.opt_resolve 0.07% : 0.000134s : 2: opt.transform.opt_trans_graph 0.04% : 0.000079s : 4: opt.transform.symbol_engine_opt 19.22% : 0.037661s : 1: opt_a 0.10% : 0.000188s : 1: opt_after_cconv 0.28% : 0.000545s : 1: opt_after_jit_grad 0.24% : 0.000480s : 1: opt_b 20.72% : 0.040611s : 1: optimize 0.01% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000040s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000005s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.03% : 0.000067s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000085s : 1: remove_dup_value 9.83% : 0.019260s : 2: renormalize.infer 2.31% : 0.004531s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000048s : 1: rewriter_after_opt_a 0.19% : 0.000367s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000130s : 1: symbol_engine_optimizer 14.07% : 0.027578s : 1: task_emit 0.08% : 0.000167s : 1: tuple_transform 23.95% : 0.046932s : 1: type_inference 0.05% : 0.000103s : 1: validate TotalTime = 0.0479392, [24] [bootstrap]: 0.00052091 [type_inference]: 0.0305402 [event_method]: 0.00011802 [auto_monad]: 0.00016884 [graph_reusing]: 1.233e-05 [inline]: 1.83002e-06 [add_attr]: 0.0031156, [1] [add_attr_with_inline]: 0.00310802, [1] [Cycle 1]: 6.037e-05, [2] [tag_attr]: 2.547e-05 [meta_addattr_fg_expand]: 7.6e-06 [parallel-infer-symbol]: 2.66e-06 [pre_auto_parallel]: 3.924e-05 [insert-virtual-dataset]: 2.72001e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 1.67001e-06 [pipeline_split]: 1.97999e-06 [optimize]: 0.00569753, [53] [py_interpret_to_execute]: 4.45e-06 [rewriter_before_opt_a]: 0.0002208 [opt_a]: 0.00349802, [2] [Cycle 1]: 0.00280199, [45] [expand_dump_flag]: 3.63999e-06 [switch_simplify]: 8.07e-05 [loop_unroll]: 3.937e-05 [a_1]: 0.00078204 [with_stream_mark]: 1.446e-05 [recompute_prepare]: 9.34998e-06 [updatestate_depend_eliminate]: 4.47998e-06 [updatestate_assign_eliminate]: 4.04002e-06 [updatestate_loads_eliminate]: 3.79002e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 9.516e-05 [accelerated_algorithm]: 7.6e-06 [shard]: 1.67999e-06 [meta_shard_fg_expand]: 2.41e-06 [shard_inline]: 6.64001e-06 [merge_send_recv]: 9.00001e-06 [auto_parallel]: 6.36e-06 [parallel]: 1.716e-05 [flash_sp]: 7.2e-06 [merge_comm]: 4.10998e-06 [allreduce_fusion]: 4.05998e-06 [matmul_add_comm_reduction]: 9.49e-06 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 8.12e-06 [virtual_dataset]: 7.03e-06 [get_grad_eliminate_]: 7.44002e-06 [virtual_output]: 6.86999e-06 [merge_forward]: 4.87e-06 [cell_reuse_recompute_pass]: 1.24003e-06 [offload_activation]: 1.008e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.41e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.185e-05 [set_forward_comm_id_for_comm_node_pass]: 4.18999e-06 [meta_fg_expand]: 3.75e-06 [flash_sp_send_recv_attached]: 2.41e-06 [receive_attached]: 2.28002e-06 [after_resolve]: 1.028e-05 [a_after_grad]: 1.043e-05 [renormalize]: 0.00123139 [add_forward_monad_depend]: 5.20999e-06 [auto_monad_grad]: 1.78002e-06 [auto_monad_eliminator]: 1.863e-05 [cse]: 3.462e-05 [a_3]: 5.062e-05 [Cycle 2]: 0.00068663, [45] [expand_dump_flag]: 1.08001e-06 [switch_simplify]: 8.52e-06 [loop_unroll]: 7.08e-06 [a_1]: 0.00015389 [with_stream_mark]: 1.164e-05 [recompute_prepare]: 7.22997e-06 [updatestate_depend_eliminate]: 3.62002e-06 [updatestate_assign_eliminate]: 3.14001e-06 [updatestate_loads_eliminate]: 3.19001e-06 [parameter_eliminate]: 9.99979e-07 [a_2]: 8.682e-05 [accelerated_algorithm]: 6.43e-06 [shard]: 1.02e-06 [meta_shard_fg_expand]: 1.53002e-06 [shard_inline]: 6.28998e-06 [merge_send_recv]: 5.22e-06 [auto_parallel]: 6.34001e-06 [parallel]: 3.78999e-06 [flash_sp]: 3.05998e-06 [merge_comm]: 4.13001e-06 [allreduce_fusion]: 4.14002e-06 [matmul_add_comm_reduction]: 5.87999e-06 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 7.3e-06 [virtual_dataset]: 6.49001e-06 [get_grad_eliminate_]: 6.14999e-06 [virtual_output]: 5.92999e-06 [merge_forward]: 3.52002e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 6.86001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.226e-05 [merge_recompute_call_nodes]: 7.29982e-07 [before_grad]: 1.008e-05 [set_forward_comm_id_for_comm_node_pass]: 4.05e-06 [meta_fg_expand]: 2.74001e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 9.39996e-07 [after_resolve]: 8.72e-06 [a_after_grad]: 1.019e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.45999e-06 [auto_monad_grad]: 8.50006e-07 [auto_monad_eliminator]: 8.29998e-06 [cse]: 1.881e-05 [a_3]: 3.907e-05 [py_interpret_to_execute_after_opt_a]: 4.05e-06 [slice_cell_reuse_recomputed_activation]: 2.07999e-06 [rewriter_after_opt_a]: 2.202e-05 [convert_after_rewriter]: 1.18001e-06 [order_py_execute_after_rewriter]: 1.09998e-06 [mutable_eliminate]: 0.00046066 [opt_b]: 0.00024146, [1] [Cycle 1]: 0.00023564, [7] [b_1]: 0.00015729 [b_2]: 8.10999e-06 [updatestate_depend_eliminate]: 6.09999e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 3.04999e-06 [renormalize]: 3.20026e-07 [cse]: 2.289e-05 [optimize_parallel_all_gather_comm]: 1.702e-05 [overlap_param_gather]: 2.26998e-06 [cconv]: 2.212e-05 [loop_unroll]: 0.00042782 [opt_after_cconv]: 0.00012141, [1] [Cycle 1]: 0.00011598, [7] [c_1]: 3.157e-05 [parameter_eliminate]: 2.55002e-06 [updatestate_depend_eliminate]: 6.53e-06 [updatestate_assign_eliminate]: 3.06001e-06 [updatestate_loads_eliminate]: 3.48e-06 [cse]: 2.362e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 1.83e-05 [tuple_transform]: 9.54e-05, [1] [Cycle 1]: 9.085e-05, [4] [d_1]: 6.365e-05 [none_parameter_eliminate]: 1.90001e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 7.5e-06 [partial_unused_args_eliminate]: 1.79998e-06 [add_recomputation]: 5.214e-05 [cse_after_recomputation]: 2.602e-05, [1] [Cycle 1]: 2.207e-05, [1] [cse]: 1.676e-05 [environ_conv]: 8.54e-06 [swap_dp_allreduce_reducescatter]: 6.66999e-06 [bias_add_comm_swap]: 2.10002e-06 [label_micro_interleaved_index]: 4.01001e-06 [label_fine_grained_interleaved_index]: 2.54999e-06 [merge_cast_opt]: 1.69e-06 [slice_recompute_activation]: 1.89e-06 [micro_interleaved_order_control]: 2.02001e-06 [assign_add_opt]: 1.28002e-06 [ForceFp32Comm]: 7.30011e-07 [remove_cast_before_assign_add]: 1.39e-06 [full_micro_interleaved_order_control]: 2.04999e-06 [reorder_send_recv_between_fp_bp]: 2.88e-06 [comm_op_add_attrs]: 1.08001e-06 [add_comm_op_reuse_tag]: 9.5999e-07 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.18001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04e-06 [control_data_broadcast_order]: 1.404e-05 [grouped_pairwise_exchange_alltoall]: 1.45001e-06 [offloading_packed_experts]: 5.07999e-06 [overlap_recompute_and_grad_model_parallel]: 4.90999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.05999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.27e-06 [overlap_recompute_comm]: 2.16e-06 [overlap_grad_ring_attention]: 4.42998e-06 [overlap_grad_flash_sp]: 2e-05 [begin_end_overlap_inline]: 4.90021e-07 [split_matmul_comm_elemetwise]: 2.07999e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 1.29e-06 [symbol_engine_optimizer]: 9.824e-05, [1] [Cycle 1]: 9.392e-05, [6] [build]: 9.13002e-06 [elim_shapecalc]: 1.138e-05 [elim_not_effective]: 1.41e-05 [opt_reshape]: 7.75e-06 [fold_const_symbol]: 2.284e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.71002e-06 [pipeline_parallel_scheduler]: 1.42999e-06 [auto_monad_reorder]: 2.101e-05 [get_jit_bprop_graph]: 1.05999e-06 [rewriter_after_jit_bprop_graph]: 3.7e-06 [opt_after_jit_grad]: 0.00047277 [validate]: 3.928e-05 [backend_pass]: 1.07e-06 [task_emit]: 0.00695467 [execute]: 6.68998e-06 Sums bootstrap : 0.000521s : 1.19% type_inference : 0.030540s : 69.74% event_method : 0.000118s : 0.27% auto_monad : 0.000169s : 0.39% graph_reusing : 0.000012s : 0.03% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000039s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000221s : 0.50% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000089s : 0.20% optimize.opt_a.loop_unroll : 0.000046s : 0.11% optimize.opt_a.a_1 : 0.000936s : 2.14% optimize.opt_a.with_stream_mark : 0.000026s : 0.06% optimize.opt_a.recompute_prepare : 0.000017s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000182s : 0.42% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.03% optimize.opt_a.merge_send_recv : 0.000014s : 0.03% optimize.opt_a.auto_parallel : 0.000013s : 0.03% optimize.opt_a.parallel : 0.000021s : 0.05% optimize.opt_a.flash_sp : 0.000010s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.02% optimize.opt_a.allreduce_fusion : 0.000008s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.04% optimize.opt_a.virtual_dataset : 0.000014s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.03% optimize.opt_a.virtual_output : 0.000013s : 0.03% optimize.opt_a.merge_forward : 0.000008s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.01% optimize.opt_a.offload_activation : 0.000017s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.02% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000019s : 0.04% optimize.opt_a.a_after_grad : 0.000021s : 0.05% optimize.opt_a.renormalize : 0.001231s : 2.81% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.06% optimize.opt_a.cse : 0.000053s : 0.12% optimize.opt_a.a_3 : 0.000090s : 0.20% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000022s : 0.05% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000461s : 1.05% optimize.opt_b.b_1 : 0.000157s : 0.36% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000022s : 0.05% optimize.loop_unroll : 0.000428s : 0.98% optimize.opt_after_cconv.c_1 : 0.000032s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000024s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.04% optimize.tuple_transform.d_1 : 0.000064s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000052s : 0.12% optimize.cse_after_recomputation.cse : 0.000017s : 0.04% optimize.environ_conv : 0.000009s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.02% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.05% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000023s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000021s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000473s : 1.08% validate : 0.000039s : 0.09% backend_pass : 0.000001s : 0.00% task_emit : 0.006955s : 15.88% execute : 0.000007s : 0.02% Time group info: ------[substitution.] 0.000281 62 0.67% : 0.000002s : 3: substitution.elim_not_effective 2.26% : 0.000006s : 3: substitution.float_tuple_getitem_switch 0.54% : 0.000002s : 3: substitution.fold_const_symbol 2.04% : 0.000006s : 4: substitution.graph_param_transform 60.91% : 0.000171s : 8: substitution.inline 1.39% : 0.000004s : 6: substitution.j_node_and_user_rematch 1.90% : 0.000005s : 2: substitution.minmaximum_grad 2.19% : 0.000006s : 6: substitution.remove_not_recompute_node 1.16% : 0.000003s : 2: substitution.replace_old_param 2.78% : 0.000008s : 1: substitution.switch_simplify 5.05% : 0.000014s : 4: substitution.tuple_list_convert_item_index_to_positive 2.29% : 0.000006s : 4: substitution.tuple_list_get_item_const_eliminator 3.29% : 0.000009s : 4: substitution.tuple_list_get_item_depend_reorder 10.26% : 0.000029s : 8: substitution.tuple_list_get_item_eliminator 3.27% : 0.000009s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.030476 2 94.47% : 0.028791s : 1: type_inference.infer 5.53% : 0.001685s : 1: type_inference.specialize ------[replace.] 0.000106 11 52.48% : 0.000056s : 8: replace.inline 13.51% : 0.000014s : 1: replace.switch_simplify 34.02% : 0.000036s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000177 11 93.97% : 0.000167s : 8: match.inline 3.57% : 0.000006s : 1: match.switch_simplify 2.46% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000229 1438 0.97% : 0.000002s : 16: predicate.accumulaten_eliminater 0.82% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 1.05% : 0.000002s : 16: predicate.addn_zero_filter 0.90% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.37% : 0.000005s : 24: predicate.arithmetic_simplify 1.04% : 0.000002s : 16: predicate.cast_eliminate 0.55% : 0.000001s : 8: predicate.check_bprop_eliminate 0.47% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.46% : 0.000001s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.16% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.78% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000001s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.12% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.10% : 0.000003s : 20: predicate.environ_get_depend_swap 1.63% : 0.000004s : 28: predicate.environ_get_eliminate 1.21% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.59% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.30% : 0.000005s : 26: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.24% : 0.000001s : 4: predicate.graph_param_transform 0.55% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 5.82% : 0.000013s : 66: predicate.inline 0.73% : 0.000002s : 8: predicate.inline_without_move 0.25% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 8: predicate.less_batch_normalization 1.72% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.50% : 0.000006s : 42: predicate.load_eliminater 0.99% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.79% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.97% : 0.000002s : 16: predicate.minmaximum_grad 1.06% : 0.000002s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 2.18% : 0.000005s : 26: predicate.partial_defer_inline 1.43% : 0.000003s : 22: predicate.partial_eliminate 1.00% : 0.000002s : 16: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.42% : 0.000003s : 16: predicate.reduce_eliminate 2.42% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000001s : 8: predicate.remove_not_recompute_node 1.28% : 0.000003s : 26: predicate.replace_applicator 0.45% : 0.000001s : 8: predicate.replace_old_param 0.17% : 0.000000s : 4: predicate.reset_defer_inline 1.04% : 0.000002s : 16: predicate.reshape_eliminate 0.53% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.65% : 0.000001s : 8: predicate.same_eliminate 0.36% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.63% : 0.000001s : 8: predicate.shard_identity_eliminate 0.62% : 0.000001s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 0.71% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.75% : 0.000004s : 26: predicate.switch_defer_inline 2.18% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.98% : 0.000014s : 86: predicate.switch_simplify 1.02% : 0.000002s : 16: predicate.tile_eliminate 0.99% : 0.000002s : 16: predicate.transpose_eliminate 1.81% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.83% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000007s : 34: predicate.tuple_list_get_item_eliminator 1.56% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.46% : 0.000006s : 32: predicate.tuple_list_set_item_eliminator 1.63% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.36% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 3.05% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001537 23 59.20% : 0.000910s : 11: func_graph_cloner_run.FuncGraphClonerGraph 40.80% : 0.000627s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.059656 196 0.01% : 0.000003s : 1: ForceFp32Comm 5.23% : 0.003120s : 1: add_attr 5.22% : 0.003111s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.09% : 0.000056s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.30% : 0.000178s : 1: auto_monad 0.04% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.93% : 0.000556s : 1: bootstrap 0.04% : 0.000026s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000018s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.05% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000004s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000012s : 1: environ_conv 0.21% : 0.000128s : 1: event_method 0.02% : 0.000011s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000017s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.73% : 0.000436s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.79% : 0.000469s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.03% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000016s : 1: opt.transform.mutable_eliminate 2.43% : 0.001448s : 78: opt.transform.opt_a 0.05% : 0.000030s : 1: opt.transform.opt_after_cconv 0.05% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.23% : 0.000138s : 28: opt.transform.opt_b 0.12% : 0.000069s : 2: opt.transform.opt_trans_graph 0.09% : 0.000052s : 4: opt.transform.symbol_engine_opt 5.87% : 0.003501s : 1: opt_a 0.21% : 0.000125s : 1: opt_after_cconv 0.81% : 0.000482s : 1: opt_after_jit_grad 0.41% : 0.000245s : 1: opt_b 9.56% : 0.005702s : 1: optimize 0.03% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000024s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000043s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.04% : 0.000022s : 1: remove_dup_value 0.97% : 0.000581s : 1: renormalize.infer 1.08% : 0.000642s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000025s : 1: rewriter_after_opt_a 0.38% : 0.000227s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000004s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000101s : 1: symbol_engine_optimizer 11.68% : 0.006966s : 1: task_emit 0.16% : 0.000098s : 1: tuple_transform 51.22% : 0.030554s : 1: type_inference 0.12% : 0.000070s : 1: validate TotalTime = 0.105651, [24] [bootstrap]: 0.00044923 [type_inference]: 0.0500434 [event_method]: 0.0002599 [auto_monad]: 0.00022917 [graph_reusing]: 1.73e-05 [inline]: 2.30002e-06 [add_attr]: 0.00308743, [1] [add_attr_with_inline]: 0.00307933, [1] [Cycle 1]: 8.493e-05, [2] [tag_attr]: 4.385e-05 [meta_addattr_fg_expand]: 1.204e-05 [parallel-infer-symbol]: 3.08e-06 [pre_auto_parallel]: 6.194e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 1.22e-06 [dataset_repeat_opt]: 1.69e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.0393995, [53] [py_interpret_to_execute]: 4.95001e-06 [rewriter_before_opt_a]: 0.00035463 [opt_a]: 0.03644, [3] [Cycle 1]: 0.0303703, [45] [expand_dump_flag]: 4.60001e-06 [switch_simplify]: 0.0001554 [loop_unroll]: 7.026e-05 [a_1]: 0.00151028 [with_stream_mark]: 2.289e-05 [recompute_prepare]: 2.066e-05 [updatestate_depend_eliminate]: 8.27e-06 [updatestate_assign_eliminate]: 8.50999e-06 [updatestate_loads_eliminate]: 7.31999e-06 [parameter_eliminate]: 2.69001e-06 [a_2]: 0.00022016 [accelerated_algorithm]: 1.558e-05 [shard]: 1.62001e-06 [meta_shard_fg_expand]: 4.25999e-06 [shard_inline]: 1.471e-05 [merge_send_recv]: 1.521e-05 [auto_parallel]: 1.08e-05 [parallel]: 1.696e-05 [flash_sp]: 9.49e-06 [merge_comm]: 9.34e-06 [allreduce_fusion]: 8.36002e-06 [matmul_add_comm_reduction]: 2.4e-05 [allreduce_slice_to_reducescatter]: 9.50007e-07 [virtual_shard_identity]: 1.553e-05 [virtual_dataset]: 1.354e-05 [get_grad_eliminate_]: 1.399e-05 [virtual_output]: 1.345e-05 [merge_forward]: 8.74003e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 1.613e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.707e-05 [merge_recompute_call_nodes]: 1.61998e-06 [before_grad]: 2.532e-05 [set_forward_comm_id_for_comm_node_pass]: 9.03002e-06 [meta_fg_expand]: 0.00456315 [flash_sp_send_recv_attached]: 4.62e-06 [receive_attached]: 2.24001e-06 [after_resolve]: 8.85e-05 [a_after_grad]: 0.00012703 [renormalize]: 0.0211964 [add_forward_monad_depend]: 1.424e-05 [auto_monad_grad]: 1.257e-05 [auto_monad_eliminator]: 0.00010396 [cse]: 0.00029767 [a_3]: 0.00136644 [Cycle 2]: 0.00480747, [45] [expand_dump_flag]: 2.48e-06 [switch_simplify]: 8.498e-05 [loop_unroll]: 8.203e-05 [a_1]: 0.0017137 [with_stream_mark]: 1.904e-05 [recompute_prepare]: 1.545e-05 [updatestate_depend_eliminate]: 7.99002e-06 [updatestate_assign_eliminate]: 7.14001e-06 [updatestate_loads_eliminate]: 6.42001e-06 [parameter_eliminate]: 1.20001e-06 [a_2]: 0.00019564 [accelerated_algorithm]: 1.406e-05 [shard]: 1.09e-06 [meta_shard_fg_expand]: 2.99999e-06 [shard_inline]: 1.286e-05 [merge_send_recv]: 1.046e-05 [auto_parallel]: 1.096e-05 [parallel]: 4.32e-06 [flash_sp]: 3.16001e-06 [merge_comm]: 8.43001e-06 [allreduce_fusion]: 7.81001e-06 [matmul_add_comm_reduction]: 1.068e-05 [allreduce_slice_to_reducescatter]: 7.2e-07 [virtual_shard_identity]: 1.37e-05 [virtual_dataset]: 1.354e-05 [get_grad_eliminate_]: 1.258e-05 [virtual_output]: 1.289e-05 [merge_forward]: 6.83e-06 [cell_reuse_recompute_pass]: 9.80013e-07 [offload_activation]: 1.127e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.427e-05 [merge_recompute_call_nodes]: 8.00006e-07 [before_grad]: 2.173e-05 [set_forward_comm_id_for_comm_node_pass]: 8.65999e-06 [meta_fg_expand]: 0.00016602 [flash_sp_send_recv_attached]: 1.12e-06 [receive_attached]: 1.34e-06 [after_resolve]: 2.018e-05 [a_after_grad]: 2.057e-05 [renormalize]: 0.00165613 [add_forward_monad_depend]: 4.27998e-06 [auto_monad_grad]: 1.28002e-06 [auto_monad_eliminator]: 2.168e-05 [cse]: 0.00016711 [a_3]: 0.00010273 [Cycle 3]: 0.00124795, [45] [expand_dump_flag]: 1.14e-06 [switch_simplify]: 1.635e-05 [loop_unroll]: 1.392e-05 [a_1]: 0.00035697 [with_stream_mark]: 1.447e-05 [recompute_prepare]: 1.37e-05 [updatestate_depend_eliminate]: 8.60001e-06 [updatestate_assign_eliminate]: 7.21999e-06 [updatestate_loads_eliminate]: 6.99001e-06 [parameter_eliminate]: 1.05001e-06 [a_2]: 0.00019613 [accelerated_algorithm]: 1.364e-05 [shard]: 1.32999e-06 [meta_shard_fg_expand]: 2.76e-06 [shard_inline]: 1.358e-05 [merge_send_recv]: 9.83002e-06 [auto_parallel]: 1.133e-05 [parallel]: 4.44002e-06 [flash_sp]: 1.05001e-06 [merge_comm]: 8.13001e-06 [allreduce_fusion]: 7.84002e-06 [matmul_add_comm_reduction]: 1.092e-05 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 1.392e-05 [virtual_dataset]: 1.299e-05 [get_grad_eliminate_]: 1.253e-05 [virtual_output]: 1.34e-05 [merge_forward]: 7.05e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.181e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.528e-05 [merge_recompute_call_nodes]: 6.50005e-07 [before_grad]: 2.16e-05 [set_forward_comm_id_for_comm_node_pass]: 8.22e-06 [meta_fg_expand]: 5.34e-06 [flash_sp_send_recv_attached]: 8.59989e-07 [receive_attached]: 1.07e-06 [after_resolve]: 1.563e-05 [a_after_grad]: 2.019e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.37e-06 [auto_monad_grad]: 1.09e-06 [auto_monad_eliminator]: 1.564e-05 [cse]: 4.814e-05 [a_3]: 8.901e-05 [py_interpret_to_execute_after_opt_a]: 4.03001e-06 [slice_cell_reuse_recomputed_activation]: 2.37999e-06 [rewriter_after_opt_a]: 4.584e-05 [convert_after_rewriter]: 1.22e-06 [order_py_execute_after_rewriter]: 1.17e-06 [mutable_eliminate]: 0.00051066 [opt_b]: 0.00047065, [1] [Cycle 1]: 0.00046429, [7] [b_1]: 0.0003356 [b_2]: 1.55e-05 [updatestate_depend_eliminate]: 1.013e-05 [updatestate_assign_eliminate]: 7.08998e-06 [updatestate_loads_eliminate]: 7.05002e-06 [renormalize]: 4.19997e-07 [cse]: 5.258e-05 [optimize_parallel_all_gather_comm]: 2.473e-05 [overlap_param_gather]: 2.07999e-06 [cconv]: 2.072e-05 [loop_unroll]: 0.00045078 [opt_after_cconv]: 0.00018253, [1] [Cycle 1]: 0.00017723, [7] [c_1]: 6.566e-05 [parameter_eliminate]: 2.24001e-06 [updatestate_depend_eliminate]: 1.03e-05 [updatestate_assign_eliminate]: 7.00998e-06 [updatestate_loads_eliminate]: 7.01999e-06 [cse]: 5.089e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 7.914e-05 [tuple_transform]: 0.00015963, [1] [Cycle 1]: 0.00015499, [4] [d_1]: 0.00011905 [none_parameter_eliminate]: 2.10002e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 1.395e-05 [partial_unused_args_eliminate]: 1.92999e-06 [add_recomputation]: 6.652e-05 [cse_after_recomputation]: 4.838e-05, [1] [Cycle 1]: 4.343e-05, [1] [cse]: 3.778e-05 [environ_conv]: 1.05e-05 [swap_dp_allreduce_reducescatter]: 1.117e-05 [bias_add_comm_swap]: 2.21e-06 [label_micro_interleaved_index]: 4.35e-06 [label_fine_grained_interleaved_index]: 2.68998e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 1.92999e-06 [micro_interleaved_order_control]: 2.29999e-06 [assign_add_opt]: 1.14998e-06 [ForceFp32Comm]: 7.60017e-07 [remove_cast_before_assign_add]: 1.22e-06 [full_micro_interleaved_order_control]: 2.22999e-06 [reorder_send_recv_between_fp_bp]: 2.59001e-06 [comm_op_add_attrs]: 1.25001e-06 [add_comm_op_reuse_tag]: 9.19972e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 9.79984e-07 [overlap_opt_shard_in_pipeline]: 1.12999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.62001e-06 [control_data_broadcast_order]: 2.81e-05 [grouped_pairwise_exchange_alltoall]: 1.45999e-06 [offloading_packed_experts]: 7.01001e-06 [overlap_recompute_and_grad_model_parallel]: 7.93001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.32999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.46998e-06 [overlap_grad_ring_attention]: 7.26999e-06 [overlap_grad_flash_sp]: 3.423e-05 [begin_end_overlap_inline]: 4.90021e-07 [split_matmul_comm_elemetwise]: 2.05002e-06 [split_layernorm_comm]: 1.75001e-06 [handle_group_info]: 1.10001e-06 [symbol_engine_optimizer]: 0.00012952, [1] [Cycle 1]: 0.000124, [6] [build]: 1.102e-05 [elim_shapecalc]: 1.982e-05 [elim_not_effective]: 2.702e-05 [opt_reshape]: 1.479e-05 [fold_const_symbol]: 2.198e-05 [renormalize]: 2.80008e-07 [detach_backward]: 1.84e-06 [pipeline_parallel_scheduler]: 1.40001e-06 [auto_monad_reorder]: 3.334e-05 [get_jit_bprop_graph]: 1.22999e-06 [rewriter_after_jit_bprop_graph]: 3.7e-06 [opt_after_jit_grad]: 0.00050703 [validate]: 6.146e-05 [backend_pass]: 9.70002e-07 [task_emit]: 0.0112503 [execute]: 7.19001e-06 Sums bootstrap : 0.000449s : 0.44% type_inference : 0.050043s : 49.45% event_method : 0.000260s : 0.26% auto_monad : 0.000229s : 0.23% graph_reusing : 0.000017s : 0.02% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000044s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000062s : 0.06% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000355s : 0.35% optimize.opt_a.expand_dump_flag : 0.000008s : 0.01% optimize.opt_a.switch_simplify : 0.000257s : 0.25% optimize.opt_a.loop_unroll : 0.000166s : 0.16% optimize.opt_a.a_1 : 0.003581s : 3.54% optimize.opt_a.with_stream_mark : 0.000056s : 0.06% optimize.opt_a.recompute_prepare : 0.000050s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000025s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000023s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000021s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000612s : 0.60% optimize.opt_a.accelerated_algorithm : 0.000043s : 0.04% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000010s : 0.01% optimize.opt_a.shard_inline : 0.000041s : 0.04% optimize.opt_a.merge_send_recv : 0.000036s : 0.04% optimize.opt_a.auto_parallel : 0.000033s : 0.03% optimize.opt_a.parallel : 0.000026s : 0.03% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000026s : 0.03% optimize.opt_a.allreduce_fusion : 0.000024s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000046s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000043s : 0.04% optimize.opt_a.virtual_dataset : 0.000040s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000039s : 0.04% optimize.opt_a.virtual_output : 0.000040s : 0.04% optimize.opt_a.merge_forward : 0.000023s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000039s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000077s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000069s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000026s : 0.03% optimize.opt_a.meta_fg_expand : 0.004735s : 4.68% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000124s : 0.12% optimize.opt_a.a_after_grad : 0.000168s : 0.17% optimize.opt_a.renormalize : 0.022853s : 22.58% optimize.opt_a.add_forward_monad_depend : 0.000020s : 0.02% optimize.opt_a.auto_monad_grad : 0.000015s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000141s : 0.14% optimize.opt_a.cse : 0.000513s : 0.51% optimize.opt_a.a_3 : 0.001558s : 1.54% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000046s : 0.05% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000511s : 0.50% optimize.opt_b.b_1 : 0.000336s : 0.33% optimize.opt_b.b_2 : 0.000016s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000053s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000021s : 0.02% optimize.loop_unroll : 0.000451s : 0.45% optimize.opt_after_cconv.c_1 : 0.000066s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.cse : 0.000051s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000079s : 0.08% optimize.tuple_transform.d_1 : 0.000119s : 0.12% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000014s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000067s : 0.07% optimize.cse_after_recomputation.cse : 0.000038s : 0.04% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000028s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000034s : 0.03% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000020s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000027s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000015s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000022s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000033s : 0.03% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000507s : 0.50% validate : 0.000061s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.011250s : 11.12% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.001486 311 0.25% : 0.000004s : 8: substitution.elim_not_effective 0.52% : 0.000008s : 12: substitution.float_depend_g_call 0.71% : 0.000011s : 9: substitution.float_tuple_getitem_switch 0.21% : 0.000003s : 8: substitution.fold_const_symbol 34.06% : 0.000506s : 5: substitution.getattr_setattr_resolve 0.61% : 0.000009s : 10: substitution.graph_param_transform 0.19% : 0.000003s : 2: substitution.incorporate_call 0.14% : 0.000002s : 2: substitution.incorporate_call_switch 37.25% : 0.000553s : 24: substitution.inline 1.40% : 0.000021s : 3: substitution.inline_without_move 0.86% : 0.000013s : 25: substitution.j_node_and_user_rematch 1.19% : 0.000018s : 13: substitution.minmaximum_grad 0.60% : 0.000009s : 12: substitution.partial_eliminate 1.18% : 0.000017s : 25: substitution.remove_not_recompute_node 5.06% : 0.000075s : 32: substitution.replace_applicator 0.70% : 0.000010s : 14: substitution.replace_old_param 0.16% : 0.000002s : 1: substitution.set_cell_output_no_recompute 0.73% : 0.000011s : 4: substitution.switch_simplify 0.55% : 0.000008s : 2: substitution.transpose_eliminate 2.96% : 0.000044s : 17: substitution.tuple_list_convert_item_index_to_positive 1.48% : 0.000022s : 17: substitution.tuple_list_get_item_const_eliminator 1.95% : 0.000029s : 17: substitution.tuple_list_get_item_depend_reorder 5.34% : 0.000079s : 32: substitution.tuple_list_get_item_eliminator 1.91% : 0.000028s : 17: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.049938 2 94.14% : 0.047010s : 1: type_inference.infer 5.86% : 0.002928s : 1: type_inference.specialize ------[replace.] 0.000457 45 11.78% : 0.000054s : 4: replace.getattr_setattr_resolve 51.03% : 0.000233s : 24: replace.inline 14.55% : 0.000067s : 5: replace.replace_applicator 8.70% : 0.000040s : 4: replace.switch_simplify 13.94% : 0.000064s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.001059 45 43.69% : 0.000463s : 4: match.getattr_setattr_resolve 51.01% : 0.000540s : 24: match.inline 2.42% : 0.000026s : 5: match.replace_applicator 0.82% : 0.000009s : 4: match.switch_simplify 2.05% : 0.000022s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.000989 7110 0.95% : 0.000009s : 68: predicate.accumulaten_eliminater 0.31% : 0.000003s : 10: predicate.ad_related_special_op_eliminate 0.42% : 0.000004s : 32: predicate.addn_check_dump 0.90% : 0.000009s : 68: predicate.addn_zero_filter 0.86% : 0.000009s : 68: predicate.adjust_all_reduce_mul_add 1.74% : 0.000017s : 100: predicate.arithmetic_simplify 0.96% : 0.000009s : 68: predicate.cast_eliminate 2.79% : 0.000028s : 215: predicate.check_bprop_eliminate 0.43% : 0.000004s : 32: predicate.compare_switch_simplify 0.08% : 0.000001s : 10: predicate.const_output_eliminate 0.42% : 0.000004s : 32: predicate.depend_value_elim 0.93% : 0.000009s : 68: predicate.dict_get_item_const_eliminator 1.05% : 0.000010s : 68: predicate.dict_get_item_eliminator 0.91% : 0.000009s : 68: predicate.dict_set_item_eliminator 0.38% : 0.000004s : 20: predicate.dumpgradient_eliminate 0.10% : 0.000001s : 10: predicate.elim_not_effective 0.22% : 0.000002s : 10: predicate.elim_shapecalc_of_broadcastargs 1.03% : 0.000010s : 78: predicate.environ_add_const_eliminate 0.99% : 0.000010s : 78: predicate.environ_get_add_eliminate 1.00% : 0.000010s : 78: predicate.environ_get_depend_swap 1.45% : 0.000014s : 110: predicate.environ_get_eliminate 0.98% : 0.000010s : 78: predicate.environ_get_set_eliminate 1.35% : 0.000013s : 100: predicate.exchange_switch_depend_value 1.86% : 0.000018s : 100: predicate.float_depend_g_call 0.45% : 0.000004s : 32: predicate.float_environ_get_switch 0.62% : 0.000006s : 42: predicate.float_tuple_getitem_switch 0.08% : 0.000001s : 10: predicate.fold_const_symbol 0.48% : 0.000005s : 32: predicate.get_grad_eliminate 0.67% : 0.000007s : 31: predicate.getattr_setattr_resolve 0.09% : 0.000001s : 10: predicate.graph_param_transform 0.44% : 0.000004s : 32: predicate.incorporate_call 0.41% : 0.000004s : 32: predicate.incorporate_call_switch 4.61% : 0.000046s : 252: predicate.inline 1.47% : 0.000015s : 82: predicate.inline_without_move 0.24% : 0.000002s : 32: predicate.j_node_and_user_rematch 0.54% : 0.000005s : 32: predicate.less_batch_normalization 1.32% : 0.000013s : 96: predicate.list_to_tuple_eliminator_ 2.18% : 0.000022s : 164: predicate.load_eliminater 0.35% : 0.000004s : 10: predicate.loop_unroll_after_grad 2.46% : 0.000024s : 182: predicate.loop_unroll_before_grad 1.24% : 0.000012s : 88: predicate.make_slice_get_slice_eliminator 0.46% : 0.000005s : 32: predicate.merge_addn 2.58% : 0.000025s : 198: predicate.micro_step_allgather_replace 2.57% : 0.000025s : 198: predicate.mini_step_allgather_replace 0.94% : 0.000009s : 68: predicate.minmaximum_grad 0.38% : 0.000004s : 10: predicate.mutable_eliminate 0.17% : 0.000002s : 10: predicate.opt_reshape 0.19% : 0.000002s : 10: predicate.parallel_virtual_node 1.76% : 0.000017s : 100: predicate.partial_defer_inline 1.32% : 0.000013s : 86: predicate.partial_eliminate 0.87% : 0.000009s : 68: predicate.print_const_string_wrapper 0.46% : 0.000005s : 32: predicate.reduce_all_const_elim 1.18% : 0.000012s : 68: predicate.reduce_eliminate 2.17% : 0.000021s : 164: predicate.redundant_stop_gradient_eliminater 0.25% : 0.000002s : 32: predicate.remove_not_recompute_node 2.42% : 0.000024s : 284: predicate.replace_applicator 0.66% : 0.000007s : 82: predicate.replace_old_param 0.09% : 0.000001s : 10: predicate.reset_defer_inline 0.96% : 0.000009s : 68: predicate.reshape_eliminate 2.59% : 0.000026s : 198: predicate.row_tensor_add_zeros_like 0.17% : 0.000002s : 10: predicate.row_tensor_eliminate 2.93% : 0.000029s : 215: predicate.same_eliminate 0.29% : 0.000003s : 32: predicate.set_cell_output_no_recompute 0.51% : 0.000005s : 32: predicate.shard_identity_eliminate 0.31% : 0.000003s : 20: predicate.special_op_eliminate 0.50% : 0.000005s : 32: predicate.specialize_transform 2.64% : 0.000026s : 198: predicate.split_environ_get_set_with_tuple_value 1.30% : 0.000013s : 82: predicate.stack_unstack_eliminate 0.16% : 0.000002s : 10: predicate.switch_call_monad_eliminater 1.49% : 0.000015s : 100: predicate.switch_defer_inline 4.23% : 0.000042s : 315: predicate.switch_layer_defer_inline 4.83% : 0.000048s : 332: predicate.switch_simplify 0.94% : 0.000009s : 68: predicate.tile_eliminate 0.90% : 0.000009s : 68: predicate.transpose_eliminate 1.32% : 0.000013s : 88: predicate.tuple_list_convert_item_index_to_positive 1.39% : 0.000014s : 88: predicate.tuple_list_get_item_const_eliminator 1.29% : 0.000013s : 88: predicate.tuple_list_get_item_depend_reorder 2.48% : 0.000024s : 128: predicate.tuple_list_get_item_eliminator 1.35% : 0.000013s : 88: predicate.tuple_list_get_set_item_eliminator 1.76% : 0.000017s : 120: predicate.tuple_list_set_item_eliminator 1.28% : 0.000013s : 96: predicate.tuple_to_list_eliminator_ 2.08% : 0.000021s : 164: predicate.updatestate_pure_node_eliminater 2.62% : 0.000026s : 196: predicate.updatestate_useless_node_eliminater 0.15% : 0.000001s : 10: predicate.value_based_eliminate 0.48% : 0.000005s : 32: predicate.virtual_dataset_eliminate 0.48% : 0.000005s : 32: predicate.virtual_output_eliminate 0.15% : 0.000001s : 10: predicate.virtual_view_grad_eliminate 0.18% : 0.000002s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004539 75 65.39% : 0.002968s : 36: func_graph_cloner_run.FuncGraphClonerGraph 34.61% : 0.001571s : 39: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.178929 247 0.00% : 0.000003s : 1: ForceFp32Comm 1.73% : 0.003092s : 1: add_attr 1.72% : 0.003083s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000071s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.13% : 0.000240s : 1: auto_monad 0.02% : 0.000037s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.27% : 0.000478s : 1: bootstrap 0.01% : 0.000024s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000031s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000051s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.15% : 0.000271s : 1: event_method 0.01% : 0.000012s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000022s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.26% : 0.000459s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.29% : 0.000519s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000024s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000025s : 1: opt.transform.mutable_eliminate 3.78% : 0.006757s : 125: opt.transform.opt_a 0.04% : 0.000064s : 1: opt.transform.opt_after_cconv 0.03% : 0.000048s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000321s : 28: opt.transform.opt_b 0.35% : 0.000617s : 2: opt.transform.opt_resolve 0.07% : 0.000131s : 2: opt.transform.opt_trans_graph 0.04% : 0.000079s : 4: opt.transform.symbol_engine_opt 20.37% : 0.036443s : 1: opt_a 0.10% : 0.000186s : 1: opt_after_cconv 0.29% : 0.000517s : 1: opt_after_jit_grad 0.27% : 0.000475s : 1: opt_b 22.02% : 0.039404s : 1: optimize 0.02% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000037s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000005s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.04% : 0.000067s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000084s : 1: remove_dup_value 10.48% : 0.018744s : 2: renormalize.infer 2.29% : 0.004092s : 2: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000049s : 1: rewriter_after_opt_a 0.20% : 0.000361s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000004s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000132s : 1: symbol_engine_optimizer 6.29% : 0.011262s : 1: task_emit 0.09% : 0.000162s : 1: tuple_transform 27.98% : 0.050059s : 1: type_inference 0.06% : 0.000099s : 1: validate TotalTime = 0.0420051, [24] [bootstrap]: 0.00045287 [type_inference]: 0.0247549 [event_method]: 2.114e-05 [auto_monad]: 7.941e-05 [graph_reusing]: 6.51e-06 [inline]: 2.27001e-06 [add_attr]: 0.00312826, [1] [add_attr_with_inline]: 0.00312024, [1] [Cycle 1]: 5.483e-05, [2] [tag_attr]: 2.12e-05 [meta_addattr_fg_expand]: 6.45997e-06 [parallel-infer-symbol]: 3.55e-06 [pre_auto_parallel]: 3.357e-05 [insert-virtual-dataset]: 2.43e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.76003e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.0048375, [53] [py_interpret_to_execute]: 4.27e-06 [rewriter_before_opt_a]: 0.00023207 [opt_a]: 0.00282053, [2] [Cycle 1]: 0.00225593, [45] [expand_dump_flag]: 3.68e-06 [switch_simplify]: 7.398e-05 [loop_unroll]: 3.194e-05 [a_1]: 0.00059344 [with_stream_mark]: 1.337e-05 [recompute_prepare]: 7.25e-06 [updatestate_depend_eliminate]: 3.65998e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 3.28e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 7.093e-05 [accelerated_algorithm]: 5.83002e-06 [shard]: 1.64e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 5.51e-06 [merge_send_recv]: 7.87998e-06 [auto_parallel]: 5.45001e-06 [parallel]: 1.68e-05 [flash_sp]: 6.69999e-06 [merge_comm]: 3.9e-06 [allreduce_fusion]: 3.15002e-06 [matmul_add_comm_reduction]: 8.93002e-06 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 7.24001e-06 [virtual_dataset]: 6.02999e-06 [get_grad_eliminate_]: 5.49e-06 [virtual_output]: 6.39001e-06 [merge_forward]: 3.85998e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 9.22001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.155e-05 [merge_recompute_call_nodes]: 1.37e-06 [before_grad]: 9.25999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.45e-06 [meta_fg_expand]: 2.69999e-06 [flash_sp_send_recv_attached]: 2.24001e-06 [receive_attached]: 2.78e-06 [after_resolve]: 8.94e-06 [a_after_grad]: 8.40001e-06 [renormalize]: 0.00096337 [add_forward_monad_depend]: 5.37001e-06 [auto_monad_grad]: 1.81003e-06 [auto_monad_eliminator]: 1.443e-05 [cse]: 3.22e-05 [a_3]: 4.242e-05 [Cycle 2]: 0.0005554, [45] [expand_dump_flag]: 1.00001e-06 [switch_simplify]: 6.81001e-06 [loop_unroll]: 5.82999e-06 [a_1]: 9.816e-05 [with_stream_mark]: 1.073e-05 [recompute_prepare]: 5.71e-06 [updatestate_depend_eliminate]: 2.78998e-06 [updatestate_assign_eliminate]: 2.31e-06 [updatestate_loads_eliminate]: 2.22999e-06 [parameter_eliminate]: 1.02e-06 [a_2]: 6.189e-05 [accelerated_algorithm]: 5.40999e-06 [shard]: 9.80013e-07 [meta_shard_fg_expand]: 1.42e-06 [shard_inline]: 5.24998e-06 [merge_send_recv]: 4.52e-06 [auto_parallel]: 5.19998e-06 [parallel]: 3.96001e-06 [flash_sp]: 2.98e-06 [merge_comm]: 2.83998e-06 [allreduce_fusion]: 2.78e-06 [matmul_add_comm_reduction]: 4.82e-06 [allreduce_slice_to_reducescatter]: 3.10014e-07 [virtual_shard_identity]: 6.23e-06 [virtual_dataset]: 5.30999e-06 [get_grad_eliminate_]: 5.07e-06 [virtual_output]: 4.99998e-06 [merge_forward]: 2.69999e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 5.99e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.163e-05 [merge_recompute_call_nodes]: 6.69999e-07 [before_grad]: 8.36002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.3e-06 [meta_fg_expand]: 1.90001e-06 [flash_sp_send_recv_attached]: 7.7e-07 [receive_attached]: 1.06002e-06 [after_resolve]: 7.51001e-06 [a_after_grad]: 7.81001e-06 [renormalize]: 1.10012e-07 [add_forward_monad_depend]: 1.06002e-06 [auto_monad_grad]: 7.80012e-07 [auto_monad_eliminator]: 6.12001e-06 [cse]: 1.42e-05 [a_3]: 3.086e-05 [py_interpret_to_execute_after_opt_a]: 4.43999e-06 [slice_cell_reuse_recomputed_activation]: 2.07999e-06 [rewriter_after_opt_a]: 1.641e-05 [convert_after_rewriter]: 1.54e-06 [order_py_execute_after_rewriter]: 1.20999e-06 [mutable_eliminate]: 0.00045851 [opt_b]: 0.00018502, [1] [Cycle 1]: 0.00017929, [7] [b_1]: 0.00010834 [b_2]: 7.47002e-06 [updatestate_depend_eliminate]: 5.37001e-06 [updatestate_assign_eliminate]: 2.87002e-06 [updatestate_loads_eliminate]: 2.30002e-06 [renormalize]: 3.50003e-07 [cse]: 1.941e-05 [optimize_parallel_all_gather_comm]: 1.478e-05 [overlap_param_gather]: 2.07001e-06 [cconv]: 2.217e-05 [loop_unroll]: 0.00042198 [opt_after_cconv]: 9.322e-05, [1] [Cycle 1]: 8.784e-05, [7] [c_1]: 2.515e-05 [parameter_eliminate]: 2.32001e-06 [updatestate_depend_eliminate]: 5.19e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.32999e-06 [cse]: 1.808e-05 [renormalize]: 3.00002e-07 [remove_dup_value]: 1.529e-05 [tuple_transform]: 6.512e-05, [1] [Cycle 1]: 6.075e-05, [4] [d_1]: 3.549e-05 [none_parameter_eliminate]: 1.52999e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 5.98998e-06 [partial_unused_args_eliminate]: 1.77999e-06 [add_recomputation]: 4.379e-05 [cse_after_recomputation]: 2.231e-05, [1] [Cycle 1]: 1.792e-05, [1] [cse]: 1.288e-05 [environ_conv]: 7.7e-06 [swap_dp_allreduce_reducescatter]: 5.32001e-06 [bias_add_comm_swap]: 2.39001e-06 [label_micro_interleaved_index]: 3.87998e-06 [label_fine_grained_interleaved_index]: 2.51998e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 1.99e-06 [micro_interleaved_order_control]: 2.04e-06 [assign_add_opt]: 1.50999e-06 [ForceFp32Comm]: 7.09988e-07 [remove_cast_before_assign_add]: 1.07998e-06 [full_micro_interleaved_order_control]: 2.19999e-06 [reorder_send_recv_between_fp_bp]: 2.93e-06 [comm_op_add_attrs]: 9.70002e-07 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.17999e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.22e-06 [overlap_opt_shard_grad_in_pipeline]: 1.91e-06 [control_data_broadcast_order]: 1.127e-05 [grouped_pairwise_exchange_alltoall]: 1.42e-06 [offloading_packed_experts]: 4.4e-06 [overlap_recompute_and_grad_model_parallel]: 4.87998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29998e-06 [overlap_recompute_comm]: 2.07999e-06 [overlap_grad_ring_attention]: 3.73001e-06 [overlap_grad_flash_sp]: 1.629e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 2.27001e-06 [split_layernorm_comm]: 2.01e-06 [handle_group_info]: 1.49998e-06 [symbol_engine_optimizer]: 7.984e-05, [1] [Cycle 1]: 7.559e-05, [6] [build]: 8.97999e-06 [elim_shapecalc]: 9.06998e-06 [elim_not_effective]: 1.263e-05 [opt_reshape]: 6.56999e-06 [fold_const_symbol]: 9.59999e-06 [renormalize]: 1.69995e-07 [detach_backward]: 1.51998e-06 [pipeline_parallel_scheduler]: 1.47999e-06 [auto_monad_reorder]: 1.775e-05 [get_jit_bprop_graph]: 9.90025e-07 [rewriter_after_jit_bprop_graph]: 3.91001e-06 [opt_after_jit_grad]: 0.00045893 [validate]: 3.715e-05 [backend_pass]: 9.00007e-07 [task_emit]: 0.00795991 [execute]: 6.58998e-06 Sums bootstrap : 0.000453s : 1.19% type_inference : 0.024755s : 65.30% event_method : 0.000021s : 0.06% auto_monad : 0.000079s : 0.21% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000034s : 0.09% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000232s : 0.61% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000081s : 0.21% optimize.opt_a.loop_unroll : 0.000038s : 0.10% optimize.opt_a.a_1 : 0.000692s : 1.82% optimize.opt_a.with_stream_mark : 0.000024s : 0.06% optimize.opt_a.recompute_prepare : 0.000013s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000133s : 0.35% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.03% optimize.opt_a.merge_send_recv : 0.000012s : 0.03% optimize.opt_a.auto_parallel : 0.000011s : 0.03% optimize.opt_a.parallel : 0.000021s : 0.05% optimize.opt_a.flash_sp : 0.000010s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.04% optimize.opt_a.virtual_dataset : 0.000011s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.03% optimize.opt_a.virtual_output : 0.000011s : 0.03% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000015s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000018s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000016s : 0.04% optimize.opt_a.a_after_grad : 0.000016s : 0.04% optimize.opt_a.renormalize : 0.000963s : 2.54% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.05% optimize.opt_a.cse : 0.000046s : 0.12% optimize.opt_a.a_3 : 0.000073s : 0.19% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000016s : 0.04% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000459s : 1.21% optimize.opt_b.b_1 : 0.000108s : 0.29% optimize.opt_b.b_2 : 0.000007s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000022s : 0.06% optimize.loop_unroll : 0.000422s : 1.11% optimize.opt_after_cconv.c_1 : 0.000025s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000018s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.04% optimize.tuple_transform.d_1 : 0.000035s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000044s : 0.12% optimize.cse_after_recomputation.cse : 0.000013s : 0.03% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000011s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000016s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000018s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000459s : 1.21% validate : 0.000037s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.007960s : 21.00% execute : 0.000007s : 0.02% Time group info: ------[substitution.] 0.000178 26 0.99% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000001s : 2: substitution.fold_const_symbol 2.83% : 0.000005s : 3: substitution.graph_param_transform 79.94% : 0.000143s : 6: substitution.inline 1.65% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.43% : 0.000004s : 4: substitution.remove_not_recompute_node 1.63% : 0.000003s : 2: substitution.replace_old_param 3.55% : 0.000006s : 1: substitution.switch_simplify 6.25% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.024697 2 95.17% : 0.023504s : 1: type_inference.infer 4.83% : 0.001193s : 1: type_inference.specialize ------[replace.] 0.000079 9 60.06% : 0.000047s : 6: replace.inline 20.40% : 0.000016s : 1: replace.switch_simplify 19.54% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000155 9 89.95% : 0.000139s : 6: match.inline 3.67% : 0.000006s : 1: match.switch_simplify 6.38% : 0.000010s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000173 1092 0.99% : 0.000002s : 12: predicate.accumulaten_eliminater 0.80% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 6: predicate.addn_check_dump 0.91% : 0.000002s : 12: predicate.addn_zero_filter 0.89% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.37% : 0.000004s : 18: predicate.arithmetic_simplify 1.08% : 0.000002s : 12: predicate.cast_eliminate 0.50% : 0.000001s : 6: predicate.check_bprop_eliminate 0.53% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.48% : 0.000001s : 6: predicate.depend_value_elim 1.00% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.08% : 0.000002s : 12: predicate.dict_get_item_eliminator 1.00% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 3: predicate.elim_not_effective 0.41% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.13% : 0.000002s : 15: predicate.environ_get_depend_swap 1.66% : 0.000003s : 21: predicate.environ_get_eliminate 1.11% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.60% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.50% : 0.000004s : 20: predicate.float_depend_g_call 0.46% : 0.000001s : 6: predicate.float_environ_get_switch 0.73% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.63% : 0.000001s : 6: predicate.get_grad_eliminate 0.29% : 0.000001s : 3: predicate.graph_param_transform 0.51% : 0.000001s : 6: predicate.incorporate_call 0.46% : 0.000001s : 6: predicate.incorporate_call_switch 6.02% : 0.000010s : 50: predicate.inline 0.64% : 0.000001s : 6: predicate.inline_without_move 0.29% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.78% : 0.000001s : 6: predicate.less_batch_normalization 1.73% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.36% : 0.000004s : 32: predicate.load_eliminater 0.95% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.93% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.75% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 6: predicate.merge_addn 0.48% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.87% : 0.000001s : 12: predicate.minmaximum_grad 1.04% : 0.000002s : 3: predicate.mutable_eliminate 0.36% : 0.000001s : 3: predicate.opt_reshape 0.39% : 0.000001s : 3: predicate.parallel_virtual_node 1.96% : 0.000003s : 20: predicate.partial_defer_inline 1.43% : 0.000002s : 17: predicate.partial_eliminate 0.96% : 0.000002s : 12: predicate.print_const_string_wrapper 0.47% : 0.000001s : 6: predicate.reduce_all_const_elim 1.52% : 0.000003s : 12: predicate.reduce_eliminate 2.48% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000001s : 6: predicate.remove_not_recompute_node 1.25% : 0.000002s : 20: predicate.replace_applicator 0.54% : 0.000001s : 6: predicate.replace_old_param 0.27% : 0.000000s : 3: predicate.reset_defer_inline 1.04% : 0.000002s : 12: predicate.reshape_eliminate 0.50% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 3: predicate.row_tensor_eliminate 0.58% : 0.000001s : 6: predicate.same_eliminate 0.41% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.73% : 0.000001s : 6: predicate.shard_identity_eliminate 0.78% : 0.000001s : 6: predicate.special_op_eliminate 0.61% : 0.000001s : 6: predicate.specialize_transform 0.79% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.26% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.80% : 0.000003s : 20: predicate.switch_defer_inline 2.29% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.16% : 0.000011s : 68: predicate.switch_simplify 0.95% : 0.000002s : 12: predicate.tile_eliminate 0.99% : 0.000002s : 12: predicate.transpose_eliminate 1.62% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.83% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.64% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.30% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.94% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 3: predicate.value_based_eliminate 0.59% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 6: predicate.virtual_output_eliminate 0.21% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001077 16 58.57% : 0.000631s : 8: func_graph_cloner_run.FuncGraphClonerGraph 41.43% : 0.000446s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.052153 196 0.01% : 0.000003s : 1: ForceFp32Comm 6.01% : 0.003133s : 1: add_attr 5.99% : 0.003123s : 1: add_attr_with_inline 0.01% : 0.000003s : 1: add_comm_op_reuse_tag 0.09% : 0.000048s : 1: add_recomputation 0.01% : 0.000005s : 1: assign_add_opt 0.16% : 0.000085s : 1: auto_monad 0.04% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000005s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.92% : 0.000480s : 1: bootstrap 0.05% : 0.000026s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000014s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.05% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.05% : 0.000026s : 1: event_method 0.02% : 0.000011s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000005s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.82% : 0.000430s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.89% : 0.000467s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000014s : 1: opt.transform.mutable_eliminate 2.09% : 0.001091s : 78: opt.transform.opt_a 0.05% : 0.000024s : 1: opt.transform.opt_after_cconv 0.04% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000088s : 28: opt.transform.opt_b 0.08% : 0.000040s : 2: opt.transform.opt_trans_graph 0.07% : 0.000034s : 4: opt.transform.symbol_engine_opt 5.41% : 0.002824s : 1: opt_a 0.18% : 0.000096s : 1: opt_after_cconv 0.90% : 0.000468s : 1: opt_after_jit_grad 0.36% : 0.000188s : 1: opt_b 9.28% : 0.004842s : 1: optimize 0.03% : 0.000018s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000020s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000038s : 1: pre_auto_parallel 0.01% : 0.000007s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000019s : 1: remove_dup_value 0.92% : 0.000477s : 1: renormalize.infer 0.92% : 0.000478s : 1: renormalize.specialize 0.01% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000020s : 1: rewriter_after_opt_a 0.46% : 0.000237s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000083s : 1: symbol_engine_optimizer 15.28% : 0.007971s : 1: task_emit 0.13% : 0.000068s : 1: tuple_transform 47.49% : 0.024768s : 1: type_inference 0.12% : 0.000065s : 1: validate TotalTime = 0.127067, [24] [bootstrap]: 0.00045671 [type_inference]: 0.0441968 [event_method]: 0.0002232 [auto_monad]: 0.00014659 [graph_reusing]: 9.54999e-06 [inline]: 1.77001e-06 [add_attr]: 0.00303747, [1] [add_attr_with_inline]: 0.0030291, [1] [Cycle 1]: 8.015e-05, [2] [tag_attr]: 4.084e-05 [meta_addattr_fg_expand]: 1.079e-05 [parallel-infer-symbol]: 3.31001e-06 [pre_auto_parallel]: 5.73e-05 [insert-virtual-dataset]: 2.32999e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 1.77999e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.0412304, [53] [py_interpret_to_execute]: 4.58001e-06 [rewriter_before_opt_a]: 0.00035179 [opt_a]: 0.031196, [3] [Cycle 1]: 0.0161532, [45] [expand_dump_flag]: 3.89002e-06 [switch_simplify]: 0.00015036 [loop_unroll]: 6.466e-05 [a_1]: 0.00139485 [with_stream_mark]: 2.305e-05 [recompute_prepare]: 1.966e-05 [updatestate_depend_eliminate]: 7.95e-06 [updatestate_assign_eliminate]: 6.91001e-06 [updatestate_loads_eliminate]: 7.54002e-06 [parameter_eliminate]: 2.59001e-06 [a_2]: 0.00020768 [accelerated_algorithm]: 1.383e-05 [shard]: 1.50999e-06 [meta_shard_fg_expand]: 3.24001e-06 [shard_inline]: 1.397e-05 [merge_send_recv]: 1.442e-05 [auto_parallel]: 9.47999e-06 [parallel]: 1.677e-05 [flash_sp]: 8.45001e-06 [merge_comm]: 8.41002e-06 [allreduce_fusion]: 7.58001e-06 [matmul_add_comm_reduction]: 2.509e-05 [allreduce_slice_to_reducescatter]: 8.49977e-07 [virtual_shard_identity]: 1.518e-05 [virtual_dataset]: 1.341e-05 [get_grad_eliminate_]: 1.357e-05 [virtual_output]: 1.35e-05 [merge_forward]: 8.43999e-06 [cell_reuse_recompute_pass]: 1.07e-06 [offload_activation]: 1.614e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.558e-05 [merge_recompute_call_nodes]: 1.62999e-06 [before_grad]: 2.493e-05 [set_forward_comm_id_for_comm_node_pass]: 7.93999e-06 [meta_fg_expand]: 0.0014575 [flash_sp_send_recv_attached]: 3.54002e-06 [receive_attached]: 2.61999e-06 [after_resolve]: 6.278e-05 [a_after_grad]: 8.447e-05 [renormalize]: 0.0108606 [add_forward_monad_depend]: 9.59e-06 [auto_monad_grad]: 5.62999e-06 [auto_monad_eliminator]: 0.00012858 [cse]: 0.00038875 [a_3]: 0.00070859 [Cycle 2]: 0.00977489, [45] [expand_dump_flag]: 1.84e-06 [switch_simplify]: 9.525e-05 [loop_unroll]: 9.176e-05 [a_1]: 0.00303631 [with_stream_mark]: 5.416e-05 [recompute_prepare]: 6.308e-05 [updatestate_depend_eliminate]: 3.673e-05 [updatestate_assign_eliminate]: 3.481e-05 [updatestate_loads_eliminate]: 3.359e-05 [parameter_eliminate]: 1.16997e-06 [a_2]: 0.00097414 [accelerated_algorithm]: 0.00011551 [shard]: 9.89996e-07 [meta_shard_fg_expand]: 1.263e-05 [shard_inline]: 6.255e-05 [merge_send_recv]: 4.195e-05 [auto_parallel]: 3.868e-05 [parallel]: 5.07e-06 [flash_sp]: 3.06001e-06 [merge_comm]: 3.937e-05 [allreduce_fusion]: 3.874e-05 [matmul_add_comm_reduction]: 4.598e-05 [allreduce_slice_to_reducescatter]: 4.7998e-07 [virtual_shard_identity]: 7.46e-05 [virtual_dataset]: 6.082e-05 [get_grad_eliminate_]: 6.055e-05 [virtual_output]: 6.046e-05 [merge_forward]: 3.615e-05 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 4.58e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00011429 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 0.00010491 [set_forward_comm_id_for_comm_node_pass]: 4.102e-05 [meta_fg_expand]: 0.00011519 [flash_sp_send_recv_attached]: 1.03001e-06 [receive_attached]: 1.00001e-06 [after_resolve]: 6.673e-05 [a_after_grad]: 0.00010065 [renormalize]: 0.00302091 [add_forward_monad_depend]: 4.71002e-06 [auto_monad_grad]: 1.28002e-06 [auto_monad_eliminator]: 0.00010001 [cse]: 0.00022315 [a_3]: 0.00045063 [Cycle 3]: 0.00525386, [45] [expand_dump_flag]: 1.25999e-06 [switch_simplify]: 6.362e-05 [loop_unroll]: 6.067e-05 [a_1]: 0.00186547 [with_stream_mark]: 4.393e-05 [recompute_prepare]: 6.068e-05 [updatestate_depend_eliminate]: 3.73e-05 [updatestate_assign_eliminate]: 3.508e-05 [updatestate_loads_eliminate]: 3.469e-05 [parameter_eliminate]: 1.13001e-06 [a_2]: 0.00098384 [accelerated_algorithm]: 7.207e-05 [shard]: 9.70002e-07 [meta_shard_fg_expand]: 1.311e-05 [shard_inline]: 6.217e-05 [merge_send_recv]: 4.34e-05 [auto_parallel]: 4.208e-05 [parallel]: 4.03001e-06 [flash_sp]: 1.10001e-06 [merge_comm]: 4.081e-05 [allreduce_fusion]: 3.991e-05 [matmul_add_comm_reduction]: 4.499e-05 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 6.331e-05 [virtual_dataset]: 6.068e-05 [get_grad_eliminate_]: 6.021e-05 [virtual_output]: 5.96e-05 [merge_forward]: 3.613e-05 [cell_reuse_recompute_pass]: 1.58002e-06 [offload_activation]: 4.746e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00011417 [merge_recompute_call_nodes]: 7.40023e-07 [before_grad]: 0.00010462 [set_forward_comm_id_for_comm_node_pass]: 4.176e-05 [meta_fg_expand]: 2.631e-05 [flash_sp_send_recv_attached]: 8.60018e-07 [receive_attached]: 1.06002e-06 [after_resolve]: 6.236e-05 [a_after_grad]: 0.00010145 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.76e-06 [auto_monad_grad]: 1.09003e-06 [auto_monad_eliminator]: 6.518e-05 [cse]: 0.00019339 [a_3]: 0.00044431 [py_interpret_to_execute_after_opt_a]: 4.56002e-06 [slice_cell_reuse_recomputed_activation]: 1.96e-06 [rewriter_after_opt_a]: 0.0001844 [convert_after_rewriter]: 1.38002e-06 [order_py_execute_after_rewriter]: 1.17e-06 [mutable_eliminate]: 0.00054206 [opt_b]: 0.00543671, [2] [Cycle 1]: 0.00351448, [7] [b_1]: 0.00308957 [b_2]: 6.509e-05 [updatestate_depend_eliminate]: 4.456e-05 [updatestate_assign_eliminate]: 3.546e-05 [updatestate_loads_eliminate]: 3.512e-05 [renormalize]: 4.50003e-07 [cse]: 0.00019983 [Cycle 2]: 0.00191211, [7] [b_1]: 0.00149975 [b_2]: 6.284e-05 [updatestate_depend_eliminate]: 3.976e-05 [updatestate_assign_eliminate]: 3.498e-05 [updatestate_loads_eliminate]: 3.564e-05 [renormalize]: 7.99773e-08 [cse]: 0.00018711 [optimize_parallel_all_gather_comm]: 7.579e-05 [overlap_param_gather]: 2.17001e-06 [cconv]: 3.267e-05 [loop_unroll]: 0.00051363 [opt_after_cconv]: 0.00068854, [1] [Cycle 1]: 0.00068254, [7] [c_1]: 0.00032925 [parameter_eliminate]: 2.31e-06 [updatestate_depend_eliminate]: 4.784e-05 [updatestate_assign_eliminate]: 3.492e-05 [updatestate_loads_eliminate]: 3.483e-05 [cse]: 0.00019648 [renormalize]: 4.30009e-07 [remove_dup_value]: 0.00037361 [tuple_transform]: 0.00051347, [1] [Cycle 1]: 0.00050765, [4] [d_1]: 0.00042577 [none_parameter_eliminate]: 2.14e-06 [renormalize]: 4.69998e-07 [switch_simplify]: 5.537e-05 [partial_unused_args_eliminate]: 1.94999e-06 [add_recomputation]: 0.00021267 [cse_after_recomputation]: 0.00012832, [1] [Cycle 1]: 0.00012311, [1] [cse]: 0.00011614 [environ_conv]: 2.304e-05 [swap_dp_allreduce_reducescatter]: 3.791e-05 [bias_add_comm_swap]: 2.44999e-06 [label_micro_interleaved_index]: 3.97e-06 [label_fine_grained_interleaved_index]: 2.70002e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 2.02999e-06 [micro_interleaved_order_control]: 2.06e-06 [assign_add_opt]: 1.34e-06 [ForceFp32Comm]: 9.79984e-07 [remove_cast_before_assign_add]: 1.52001e-06 [full_micro_interleaved_order_control]: 2.09999e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.29984e-07 [interleave_split_concat_branches]: 1.11002e-06 [interleave_parallel_branches]: 1.02998e-06 [overlap_opt_shard_in_pipeline]: 1.45001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.80001e-06 [control_data_broadcast_order]: 8.721e-05 [grouped_pairwise_exchange_alltoall]: 1.62999e-06 [offloading_packed_experts]: 2.181e-05 [overlap_recompute_and_grad_model_parallel]: 2.145e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29998e-06 [overlap_recompute_comm]: 2.15002e-06 [overlap_grad_ring_attention]: 2.093e-05 [overlap_grad_flash_sp]: 0.00011889 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.04999e-06 [split_layernorm_comm]: 1.90001e-06 [handle_group_info]: 1.31998e-06 [symbol_engine_optimizer]: 0.00035998, [1] [Cycle 1]: 0.00035514, [6] [build]: 1.451e-05 [elim_shapecalc]: 6.207e-05 [elim_not_effective]: 9.796e-05 [opt_reshape]: 5.48e-05 [fold_const_symbol]: 9.525e-05 [renormalize]: 2.79979e-07 [detach_backward]: 2.02999e-06 [pipeline_parallel_scheduler]: 1.42e-06 [auto_monad_reorder]: 8.15e-05 [get_jit_bprop_graph]: 1.22999e-06 [rewriter_after_jit_bprop_graph]: 3.78001e-06 [opt_after_jit_grad]: 0.00067272 [validate]: 0.00013384 [backend_pass]: 9.89996e-07 [task_emit]: 0.0365231 [execute]: 7.21001e-06 Sums bootstrap : 0.000457s : 0.37% type_inference : 0.044197s : 36.05% event_method : 0.000223s : 0.18% auto_monad : 0.000147s : 0.12% graph_reusing : 0.000010s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000041s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000011s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000057s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000352s : 0.29% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000309s : 0.25% optimize.opt_a.loop_unroll : 0.000217s : 0.18% optimize.opt_a.a_1 : 0.006297s : 5.14% optimize.opt_a.with_stream_mark : 0.000121s : 0.10% optimize.opt_a.recompute_prepare : 0.000143s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000082s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000077s : 0.06% optimize.opt_a.updatestate_loads_eliminate : 0.000076s : 0.06% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.002166s : 1.77% optimize.opt_a.accelerated_algorithm : 0.000201s : 0.16% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000029s : 0.02% optimize.opt_a.shard_inline : 0.000139s : 0.11% optimize.opt_a.merge_send_recv : 0.000100s : 0.08% optimize.opt_a.auto_parallel : 0.000090s : 0.07% optimize.opt_a.parallel : 0.000026s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000089s : 0.07% optimize.opt_a.allreduce_fusion : 0.000086s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000116s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000153s : 0.12% optimize.opt_a.virtual_dataset : 0.000135s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000134s : 0.11% optimize.opt_a.virtual_output : 0.000134s : 0.11% optimize.opt_a.merge_forward : 0.000081s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000109s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000254s : 0.21% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000234s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000091s : 0.07% optimize.opt_a.meta_fg_expand : 0.001599s : 1.30% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000192s : 0.16% optimize.opt_a.a_after_grad : 0.000287s : 0.23% optimize.opt_a.renormalize : 0.013882s : 11.32% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.01% optimize.opt_a.auto_monad_grad : 0.000008s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000294s : 0.24% optimize.opt_a.cse : 0.000805s : 0.66% optimize.opt_a.a_3 : 0.001604s : 1.31% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000184s : 0.15% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000542s : 0.44% optimize.opt_b.b_1 : 0.004589s : 3.74% optimize.opt_b.b_2 : 0.000128s : 0.10% optimize.opt_b.updatestate_depend_eliminate : 0.000084s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000070s : 0.06% optimize.opt_b.updatestate_loads_eliminate : 0.000071s : 0.06% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000387s : 0.32% optimize.optimize_parallel_all_gather_comm : 0.000076s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.03% optimize.loop_unroll : 0.000514s : 0.42% optimize.opt_after_cconv.c_1 : 0.000329s : 0.27% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000048s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000035s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000035s : 0.03% optimize.opt_after_cconv.cse : 0.000196s : 0.16% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000374s : 0.30% optimize.tuple_transform.d_1 : 0.000426s : 0.35% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000055s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000213s : 0.17% optimize.cse_after_recomputation.cse : 0.000116s : 0.09% optimize.environ_conv : 0.000023s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000038s : 0.03% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000087s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000022s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000021s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000021s : 0.02% optimize.overlap_grad_flash_sp : 0.000119s : 0.10% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000062s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000098s : 0.08% optimize.symbol_engine_optimizer.opt_reshape : 0.000055s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000095s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000082s : 0.07% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000673s : 0.55% validate : 0.000134s : 0.11% backend_pass : 0.000001s : 0.00% task_emit : 0.036523s : 29.79% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.001263 650 6.94% : 0.000088s : 36: substitution.arithmetic_simplify 1.14% : 0.000014s : 46: substitution.elim_not_effective 0.57% : 0.000007s : 11: substitution.float_depend_g_call 1.31% : 0.000017s : 17: substitution.float_tuple_getitem_switch 1.13% : 0.000014s : 46: substitution.fold_const_symbol 2.64% : 0.000033s : 51: substitution.graph_param_transform 0.18% : 0.000002s : 2: substitution.incorporate_call 0.17% : 0.000002s : 2: substitution.incorporate_call_switch 44.17% : 0.000558s : 21: substitution.inline 1.22% : 0.000015s : 2: substitution.inline_without_move 3.11% : 0.000039s : 102: substitution.j_node_and_user_rematch 3.71% : 0.000047s : 10: substitution.less_batch_normalization 1.32% : 0.000017s : 13: substitution.minmaximum_grad 0.65% : 0.000008s : 11: substitution.partial_eliminate 4.47% : 0.000056s : 102: substitution.remove_not_recompute_node 1.79% : 0.000023s : 9: substitution.replace_applicator 0.68% : 0.000009s : 11: substitution.replace_old_param 0.20% : 0.000002s : 1: substitution.set_cell_output_no_recompute 0.94% : 0.000012s : 4: substitution.switch_simplify 1.72% : 0.000022s : 12: substitution.transpose_eliminate 4.62% : 0.000058s : 25: substitution.tuple_list_convert_item_index_to_positive 2.32% : 0.000029s : 25: substitution.tuple_list_get_item_const_eliminator 3.02% : 0.000038s : 25: substitution.tuple_list_get_item_depend_reorder 6.91% : 0.000087s : 40: substitution.tuple_list_get_item_eliminator 3.07% : 0.000039s : 25: substitution.tuple_list_get_set_item_eliminator 2.02% : 0.000026s : 1: substitution.zero_like_fill_zero ------[type_inference.] 0.044113 2 94.53% : 0.041700s : 1: type_inference.infer 5.47% : 0.002413s : 1: type_inference.specialize ------[replace.] 0.000261 33 56.79% : 0.000148s : 21: replace.inline 15.46% : 0.000040s : 4: replace.switch_simplify 22.19% : 0.000058s : 7: replace.tuple_list_get_item_eliminator 5.56% : 0.000015s : 1: replace.zero_like_fill_zero ------[match.] 0.000600 33 91.10% : 0.000547s : 21: match.inline 1.52% : 0.000009s : 4: match.switch_simplify 3.27% : 0.000020s : 7: match.tuple_list_get_item_eliminator 4.10% : 0.000025s : 1: match.zero_like_fill_zero ------[predicate.] 0.002643 20376 0.77% : 0.000020s : 165: predicate.accumulaten_eliminater 0.42% : 0.000011s : 51: predicate.ad_related_special_op_eliminate 0.64% : 0.000017s : 132: predicate.addn_check_dump 0.75% : 0.000020s : 165: predicate.addn_zero_filter 0.73% : 0.000019s : 165: predicate.adjust_all_reduce_mul_add 1.77% : 0.000047s : 297: predicate.arithmetic_simplify 0.80% : 0.000021s : 165: predicate.cast_eliminate 1.06% : 0.000028s : 217: predicate.check_bprop_eliminate 0.63% : 0.000017s : 132: predicate.compare_switch_simplify 0.49% : 0.000013s : 177: predicate.const_output_eliminate 0.64% : 0.000017s : 132: predicate.depend_value_elim 0.85% : 0.000022s : 165: predicate.dict_get_item_const_eliminator 0.87% : 0.000023s : 165: predicate.dict_get_item_eliminator 0.76% : 0.000020s : 165: predicate.dict_set_item_eliminator 1.17% : 0.000031s : 228: predicate.dumpgradient_eliminate 0.13% : 0.000004s : 51: predicate.elim_not_effective 0.29% : 0.000008s : 51: predicate.elim_shapecalc_of_broadcastargs 1.60% : 0.000042s : 342: predicate.environ_add_const_eliminate 1.61% : 0.000043s : 342: predicate.environ_get_add_eliminate 1.58% : 0.000042s : 342: predicate.environ_get_depend_swap 2.25% : 0.000059s : 474: predicate.environ_get_eliminate 1.60% : 0.000042s : 342: predicate.environ_get_set_eliminate 0.93% : 0.000025s : 193: predicate.exchange_switch_depend_value 1.19% : 0.000032s : 193: predicate.float_depend_g_call 0.63% : 0.000017s : 132: predicate.float_environ_get_switch 1.52% : 0.000040s : 309: predicate.float_tuple_getitem_switch 0.13% : 0.000003s : 51: predicate.fold_const_symbol 0.67% : 0.000018s : 132: predicate.get_grad_eliminate 0.15% : 0.000004s : 51: predicate.graph_param_transform 0.64% : 0.000017s : 132: predicate.incorporate_call 0.63% : 0.000017s : 132: predicate.incorporate_call_switch 5.16% : 0.000136s : 858: predicate.inline 1.03% : 0.000027s : 159: predicate.inline_without_move 0.34% : 0.000009s : 132: predicate.j_node_and_user_rematch 0.72% : 0.000019s : 132: predicate.less_batch_normalization 1.98% : 0.000052s : 400: predicate.list_to_tuple_eliminator_ 2.67% : 0.000070s : 573: predicate.load_eliminater 0.48% : 0.000013s : 59: predicate.loop_unroll_after_grad 1.17% : 0.000031s : 233: predicate.loop_unroll_before_grad 1.91% : 0.000050s : 401: predicate.make_slice_get_slice_eliminator 0.65% : 0.000017s : 132: predicate.merge_addn 1.04% : 0.000028s : 217: predicate.micro_step_allgather_replace 1.05% : 0.000028s : 217: predicate.mini_step_allgather_replace 0.77% : 0.000020s : 165: predicate.minmaximum_grad 0.48% : 0.000013s : 60: predicate.mutable_eliminate 0.27% : 0.000007s : 51: predicate.opt_reshape 0.87% : 0.000023s : 177: predicate.parallel_virtual_node 1.18% : 0.000031s : 193: predicate.partial_defer_inline 1.23% : 0.000033s : 231: predicate.partial_eliminate 0.75% : 0.000020s : 165: predicate.print_const_string_wrapper 0.64% : 0.000017s : 132: predicate.reduce_all_const_elim 0.92% : 0.000024s : 165: predicate.reduce_eliminate 2.68% : 0.000071s : 573: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000009s : 132: predicate.remove_not_recompute_node 1.28% : 0.000034s : 389: predicate.replace_applicator 0.45% : 0.000012s : 159: predicate.replace_old_param 0.50% : 0.000013s : 177: predicate.reset_defer_inline 0.77% : 0.000020s : 165: predicate.reshape_eliminate 1.05% : 0.000028s : 217: predicate.row_tensor_add_zeros_like 0.64% : 0.000017s : 118: predicate.row_tensor_eliminate 1.15% : 0.000030s : 217: predicate.same_eliminate 0.39% : 0.000010s : 132: predicate.set_cell_output_no_recompute 1.11% : 0.000029s : 132: predicate.shard_identity_eliminate 1.16% : 0.000031s : 228: predicate.special_op_eliminate 0.72% : 0.000019s : 132: predicate.specialize_transform 1.08% : 0.000028s : 217: predicate.split_environ_get_set_with_tuple_value 0.92% : 0.000024s : 159: predicate.stack_unstack_eliminate 0.32% : 0.000008s : 59: predicate.switch_call_monad_eliminater 1.01% : 0.000027s : 193: predicate.switch_defer_inline 2.08% : 0.000055s : 410: predicate.switch_layer_defer_inline 3.21% : 0.000085s : 617: predicate.switch_simplify 0.79% : 0.000021s : 165: predicate.tile_eliminate 0.81% : 0.000021s : 165: predicate.transpose_eliminate 1.94% : 0.000051s : 393: predicate.tuple_list_convert_item_index_to_positive 1.94% : 0.000051s : 393: predicate.tuple_list_get_item_const_eliminator 1.89% : 0.000050s : 393: predicate.tuple_list_get_item_depend_reorder 2.77% : 0.000073s : 532: predicate.tuple_list_get_item_eliminator 1.90% : 0.000050s : 393: predicate.tuple_list_get_set_item_eliminator 2.55% : 0.000067s : 525: predicate.tuple_list_set_item_eliminator 1.92% : 0.000051s : 400: predicate.tuple_to_list_eliminator_ 2.64% : 0.000070s : 573: predicate.updatestate_pure_node_eliminater 3.35% : 0.000089s : 705: predicate.updatestate_useless_node_eliminater 0.85% : 0.000022s : 177: predicate.value_based_eliminate 0.68% : 0.000018s : 132: predicate.virtual_dataset_eliminate 0.69% : 0.000018s : 132: predicate.virtual_output_eliminate 0.26% : 0.000007s : 51: predicate.virtual_view_grad_eliminate 0.93% : 0.000025s : 179: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003691 51 65.57% : 0.002420s : 26: func_graph_cloner_run.FuncGraphClonerGraph 34.43% : 0.001271s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.203648 292 0.00% : 0.000004s : 1: ForceFp32Comm 1.49% : 0.003042s : 1: add_attr 1.49% : 0.003032s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.11% : 0.000218s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.08% : 0.000156s : 1: auto_monad 0.04% : 0.000086s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.24% : 0.000484s : 1: bootstrap 0.02% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000091s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.06% : 0.000131s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000026s : 1: environ_conv 0.12% : 0.000235s : 1: event_method 0.01% : 0.000012s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.26% : 0.000522s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.27% : 0.000550s : 1: mutable_eliminate 0.01% : 0.000025s : 1: offloading_packed_experts 0.04% : 0.000081s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000082s : 1: opt.transform.mutable_eliminate 6.12% : 0.012455s : 117: opt.transform.opt_a 0.16% : 0.000328s : 1: opt.transform.opt_after_cconv 0.08% : 0.000170s : 1: opt.transform.opt_after_jit_grad 2.28% : 0.004634s : 83: opt.transform.opt_b 0.23% : 0.000478s : 2: opt.transform.opt_trans_graph 0.15% : 0.000306s : 4: opt.transform.symbol_engine_opt 15.32% : 0.031199s : 1: opt_a 0.34% : 0.000692s : 1: opt_after_cconv 0.33% : 0.000682s : 1: opt_after_jit_grad 2.67% : 0.005440s : 1: opt_b 20.25% : 0.041235s : 1: optimize 0.04% : 0.000080s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.06% : 0.000122s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000024s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000024s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.03% : 0.000062s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.19% : 0.000381s : 1: remove_dup_value 4.84% : 0.009863s : 2: renormalize.infer 1.97% : 0.004002s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.09% : 0.000188s : 1: rewriter_after_opt_a 0.18% : 0.000358s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000041s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000363s : 1: symbol_engine_optimizer 17.94% : 0.036536s : 1: task_emit 0.25% : 0.000517s : 1: tuple_transform 21.71% : 0.044211s : 1: type_inference 0.11% : 0.000224s : 1: validate TotalTime = 0.0411917, [24] [bootstrap]: 0.00046848 [type_inference]: 0.0241814 [event_method]: 2.134e-05 [auto_monad]: 7.718e-05 [graph_reusing]: 6.24001e-06 [inline]: 1.98002e-06 [add_attr]: 0.00312061, [1] [add_attr_with_inline]: 0.00311309, [1] [Cycle 1]: 5.396e-05, [2] [tag_attr]: 2.109e-05 [meta_addattr_fg_expand]: 6.56e-06 [parallel-infer-symbol]: 3.39001e-06 [pre_auto_parallel]: 3.294e-05 [insert-virtual-dataset]: 2.66e-06 [parallel-infer-symbol-second]: 6.80011e-07 [dataset_repeat_opt]: 2.17999e-06 [pipeline_split]: 1.49e-06 [optimize]: 0.00482002, [53] [py_interpret_to_execute]: 4.77998e-06 [rewriter_before_opt_a]: 0.00022714 [opt_a]: 0.0028054, [2] [Cycle 1]: 0.00224195, [45] [expand_dump_flag]: 3.22997e-06 [switch_simplify]: 7.262e-05 [loop_unroll]: 3.121e-05 [a_1]: 0.00057709 [with_stream_mark]: 1.415e-05 [recompute_prepare]: 7.06001e-06 [updatestate_depend_eliminate]: 3.54002e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 3.10002e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 6.941e-05 [accelerated_algorithm]: 5.96e-06 [shard]: 1.47999e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 5.38002e-06 [merge_send_recv]: 7.53e-06 [auto_parallel]: 5.40001e-06 [parallel]: 2.042e-05 [flash_sp]: 6.96999e-06 [merge_comm]: 3.78999e-06 [allreduce_fusion]: 3.31999e-06 [matmul_add_comm_reduction]: 8.60001e-06 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 7.08e-06 [virtual_dataset]: 6.02001e-06 [get_grad_eliminate_]: 5.26998e-06 [virtual_output]: 6.00002e-06 [merge_forward]: 4.22998e-06 [cell_reuse_recompute_pass]: 1.11002e-06 [offload_activation]: 9.81e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.23e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 8.90001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.46001e-06 [meta_fg_expand]: 2.98e-06 [flash_sp_send_recv_attached]: 2.31998e-06 [receive_attached]: 2.23002e-06 [after_resolve]: 9.49e-06 [a_after_grad]: 8.60999e-06 [renormalize]: 0.00095324 [add_forward_monad_depend]: 5.22e-06 [auto_monad_grad]: 1.77001e-06 [auto_monad_eliminator]: 1.494e-05 [cse]: 3.466e-05 [a_3]: 4.281e-05 [Cycle 2]: 0.00055468, [45] [expand_dump_flag]: 1.04998e-06 [switch_simplify]: 7.2e-06 [loop_unroll]: 5.84e-06 [a_1]: 9.793e-05 [with_stream_mark]: 1.066e-05 [recompute_prepare]: 5.55001e-06 [updatestate_depend_eliminate]: 2.79999e-06 [updatestate_assign_eliminate]: 2.21e-06 [updatestate_loads_eliminate]: 2.21998e-06 [parameter_eliminate]: 1.07e-06 [a_2]: 6.065e-05 [accelerated_algorithm]: 5.38002e-06 [shard]: 1.05001e-06 [meta_shard_fg_expand]: 1.32e-06 [shard_inline]: 5.35001e-06 [merge_send_recv]: 4.33999e-06 [auto_parallel]: 5.18002e-06 [parallel]: 3.93001e-06 [flash_sp]: 2.83e-06 [merge_comm]: 2.91999e-06 [allreduce_fusion]: 2.64001e-06 [matmul_add_comm_reduction]: 4.91997e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 5.84e-06 [virtual_dataset]: 4.94998e-06 [get_grad_eliminate_]: 5.78997e-06 [virtual_output]: 4.94e-06 [merge_forward]: 2.66e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 5.92001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.162e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 8.29998e-06 [set_forward_comm_id_for_comm_node_pass]: 2.88e-06 [meta_fg_expand]: 1.62999e-06 [flash_sp_send_recv_attached]: 7.60017e-07 [receive_attached]: 9.50007e-07 [after_resolve]: 7.66001e-06 [a_after_grad]: 7.55e-06 [renormalize]: 5.00004e-08 [add_forward_monad_depend]: 1.15001e-06 [auto_monad_grad]: 9.5999e-07 [auto_monad_eliminator]: 6.17999e-06 [cse]: 1.558e-05 [a_3]: 3.07e-05 [py_interpret_to_execute_after_opt_a]: 3.78999e-06 [slice_cell_reuse_recomputed_activation]: 1.95001e-06 [rewriter_after_opt_a]: 1.601e-05 [convert_after_rewriter]: 1.19e-06 [order_py_execute_after_rewriter]: 1.07998e-06 [mutable_eliminate]: 0.00045452 [opt_b]: 0.00017835, [1] [Cycle 1]: 0.00017241, [7] [b_1]: 0.00010329 [b_2]: 6.72002e-06 [updatestate_depend_eliminate]: 4.97999e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.20002e-06 [renormalize]: 4.7998e-07 [cse]: 1.963e-05 [optimize_parallel_all_gather_comm]: 1.445e-05 [overlap_param_gather]: 1.97001e-06 [cconv]: 2.202e-05 [loop_unroll]: 0.00041389 [opt_after_cconv]: 0.00012536, [1] [Cycle 1]: 0.00011957, [7] [c_1]: 2.477e-05 [parameter_eliminate]: 2.36e-06 [updatestate_depend_eliminate]: 5.14e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.24999e-06 [cse]: 1.995e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 2.795e-05 [tuple_transform]: 6.549e-05, [1] [Cycle 1]: 6.111e-05, [4] [d_1]: 3.466e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 1.29978e-07 [switch_simplify]: 5.83002e-06 [partial_unused_args_eliminate]: 2.04e-06 [add_recomputation]: 4.201e-05 [cse_after_recomputation]: 2.022e-05, [1] [Cycle 1]: 1.586e-05, [1] [cse]: 1.088e-05 [environ_conv]: 7.56999e-06 [swap_dp_allreduce_reducescatter]: 4.65999e-06 [bias_add_comm_swap]: 2.17001e-06 [label_micro_interleaved_index]: 3.85998e-06 [label_fine_grained_interleaved_index]: 2.38998e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 1.91998e-06 [micro_interleaved_order_control]: 1.97999e-06 [assign_add_opt]: 1.15001e-06 [ForceFp32Comm]: 7.10017e-07 [remove_cast_before_assign_add]: 1.14003e-06 [full_micro_interleaved_order_control]: 2.49999e-06 [reorder_send_recv_between_fp_bp]: 2.88e-06 [comm_op_add_attrs]: 1.14e-06 [add_comm_op_reuse_tag]: 9.50007e-07 [interleave_split_concat_branches]: 1.07998e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.10001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92001e-06 [control_data_broadcast_order]: 1.113e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 3.46001e-06 [overlap_recompute_and_grad_model_parallel]: 4.30999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30001e-06 [overlap_recompute_comm]: 1.96e-06 [overlap_grad_ring_attention]: 3.66001e-06 [overlap_grad_flash_sp]: 1.76e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.02999e-06 [split_layernorm_comm]: 1.72001e-06 [handle_group_info]: 1.28002e-06 [symbol_engine_optimizer]: 7.668e-05, [1] [Cycle 1]: 7.265e-05, [6] [build]: 9.06002e-06 [elim_shapecalc]: 8.72998e-06 [elim_not_effective]: 1.165e-05 [opt_reshape]: 6.26998e-06 [fold_const_symbol]: 9.39e-06 [renormalize]: 1.90019e-07 [detach_backward]: 1.54e-06 [pipeline_parallel_scheduler]: 1.67999e-06 [auto_monad_reorder]: 1.474e-05 [get_jit_bprop_graph]: 1.13001e-06 [rewriter_after_jit_bprop_graph]: 3.43e-06 [opt_after_jit_grad]: 0.0004815 [validate]: 3.667e-05 [backend_pass]: 8.59989e-07 [task_emit]: 0.00770909 [execute]: 6.42001e-06 Sums bootstrap : 0.000468s : 1.26% type_inference : 0.024181s : 65.22% event_method : 0.000021s : 0.06% auto_monad : 0.000077s : 0.21% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000033s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000227s : 0.61% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000080s : 0.22% optimize.opt_a.loop_unroll : 0.000037s : 0.10% optimize.opt_a.a_1 : 0.000675s : 1.82% optimize.opt_a.with_stream_mark : 0.000025s : 0.07% optimize.opt_a.recompute_prepare : 0.000013s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000130s : 0.35% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.03% optimize.opt_a.merge_send_recv : 0.000012s : 0.03% optimize.opt_a.auto_parallel : 0.000011s : 0.03% optimize.opt_a.parallel : 0.000024s : 0.07% optimize.opt_a.flash_sp : 0.000010s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.03% optimize.opt_a.virtual_dataset : 0.000011s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.03% optimize.opt_a.virtual_output : 0.000011s : 0.03% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000017s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000006s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000017s : 0.05% optimize.opt_a.a_after_grad : 0.000016s : 0.04% optimize.opt_a.renormalize : 0.000953s : 2.57% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.06% optimize.opt_a.cse : 0.000050s : 0.14% optimize.opt_a.a_3 : 0.000074s : 0.20% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000016s : 0.04% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000455s : 1.23% optimize.opt_b.b_1 : 0.000103s : 0.28% optimize.opt_b.b_2 : 0.000007s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000014s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000022s : 0.06% optimize.loop_unroll : 0.000414s : 1.12% optimize.opt_after_cconv.c_1 : 0.000025s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000028s : 0.08% optimize.tuple_transform.d_1 : 0.000035s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000042s : 0.11% optimize.cse_after_recomputation.cse : 0.000011s : 0.03% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000011s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000018s : 0.05% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000015s : 0.04% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000481s : 1.30% validate : 0.000037s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.007709s : 20.79% execute : 0.000006s : 0.02% Time group info: ------[substitution.] 0.000169 26 0.98% : 0.000002s : 2: substitution.elim_not_effective 0.81% : 0.000001s : 2: substitution.fold_const_symbol 2.84% : 0.000005s : 3: substitution.graph_param_transform 79.83% : 0.000135s : 6: substitution.inline 1.70% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.46% : 0.000004s : 4: substitution.remove_not_recompute_node 1.72% : 0.000003s : 2: substitution.replace_old_param 3.75% : 0.000006s : 1: substitution.switch_simplify 5.91% : 0.000010s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.024124 2 95.13% : 0.022948s : 1: type_inference.infer 4.87% : 0.001176s : 1: type_inference.specialize ------[replace.] 0.000076 9 59.49% : 0.000045s : 6: replace.inline 20.68% : 0.000016s : 1: replace.switch_simplify 19.83% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000146 9 90.14% : 0.000131s : 6: match.inline 3.83% : 0.000006s : 1: match.switch_simplify 6.03% : 0.000009s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000170 1092 1.24% : 0.000002s : 12: predicate.accumulaten_eliminater 0.72% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.53% : 0.000001s : 6: predicate.addn_check_dump 0.99% : 0.000002s : 12: predicate.addn_zero_filter 0.90% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.56% : 0.000004s : 18: predicate.arithmetic_simplify 1.00% : 0.000002s : 12: predicate.cast_eliminate 0.52% : 0.000001s : 6: predicate.check_bprop_eliminate 0.47% : 0.000001s : 6: predicate.compare_switch_simplify 0.18% : 0.000000s : 3: predicate.const_output_eliminate 0.44% : 0.000001s : 6: predicate.depend_value_elim 1.02% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.18% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.97% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 3: predicate.elim_not_effective 0.38% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.21% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.13% : 0.000002s : 15: predicate.environ_get_depend_swap 1.72% : 0.000003s : 21: predicate.environ_get_eliminate 1.19% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.61% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.57% : 0.000004s : 20: predicate.float_depend_g_call 0.45% : 0.000001s : 6: predicate.float_environ_get_switch 0.66% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 3: predicate.fold_const_symbol 0.66% : 0.000001s : 6: predicate.get_grad_eliminate 0.20% : 0.000000s : 3: predicate.graph_param_transform 0.48% : 0.000001s : 6: predicate.incorporate_call 0.44% : 0.000001s : 6: predicate.incorporate_call_switch 6.07% : 0.000010s : 50: predicate.inline 0.63% : 0.000001s : 6: predicate.inline_without_move 0.30% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.75% : 0.000001s : 6: predicate.less_batch_normalization 1.72% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.57% : 0.000004s : 32: predicate.load_eliminater 0.82% : 0.000001s : 3: predicate.loop_unroll_after_grad 2.84% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.77% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 6: predicate.merge_addn 0.49% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.47% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.90% : 0.000002s : 12: predicate.minmaximum_grad 1.03% : 0.000002s : 3: predicate.mutable_eliminate 0.35% : 0.000001s : 3: predicate.opt_reshape 0.31% : 0.000001s : 3: predicate.parallel_virtual_node 1.99% : 0.000003s : 20: predicate.partial_defer_inline 1.41% : 0.000002s : 17: predicate.partial_eliminate 1.01% : 0.000002s : 12: predicate.print_const_string_wrapper 0.48% : 0.000001s : 6: predicate.reduce_all_const_elim 1.43% : 0.000002s : 12: predicate.reduce_eliminate 2.45% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 6: predicate.remove_not_recompute_node 1.29% : 0.000002s : 20: predicate.replace_applicator 0.49% : 0.000001s : 6: predicate.replace_old_param 0.27% : 0.000000s : 3: predicate.reset_defer_inline 1.08% : 0.000002s : 12: predicate.reshape_eliminate 0.53% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 3: predicate.row_tensor_eliminate 0.89% : 0.000002s : 6: predicate.same_eliminate 0.42% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.66% : 0.000001s : 6: predicate.shard_identity_eliminate 0.59% : 0.000001s : 6: predicate.special_op_eliminate 0.58% : 0.000001s : 6: predicate.specialize_transform 0.86% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.67% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.27% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.76% : 0.000003s : 20: predicate.switch_defer_inline 2.57% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.04% : 0.000010s : 68: predicate.switch_simplify 1.07% : 0.000002s : 12: predicate.tile_eliminate 0.99% : 0.000002s : 12: predicate.transpose_eliminate 1.43% : 0.000002s : 18: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000002s : 18: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000002s : 18: predicate.tuple_list_get_item_depend_reorder 2.98% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.11% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.65% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.29% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 3.01% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 3: predicate.value_based_eliminate 0.62% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 6: predicate.virtual_output_eliminate 0.21% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000995 16 57.42% : 0.000571s : 8: func_graph_cloner_run.FuncGraphClonerGraph 42.58% : 0.000424s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.051287 196 0.01% : 0.000003s : 1: ForceFp32Comm 6.09% : 0.003125s : 1: add_attr 6.08% : 0.003116s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000046s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.16% : 0.000082s : 1: auto_monad 0.04% : 0.000018s : 1: auto_monad_reorder 0.01% : 0.000005s : 1: backend_pass 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.98% : 0.000500s : 1: bootstrap 0.05% : 0.000026s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000014s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.04% : 0.000023s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.05% : 0.000026s : 1: event_method 0.02% : 0.000011s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.82% : 0.000422s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.90% : 0.000462s : 1: mutable_eliminate 0.01% : 0.000006s : 1: offloading_packed_experts 0.02% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000013s : 1: opt.transform.mutable_eliminate 2.09% : 0.001071s : 78: opt.transform.opt_a 0.05% : 0.000024s : 1: opt.transform.opt_after_cconv 0.04% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000083s : 28: opt.transform.opt_b 0.07% : 0.000038s : 2: opt.transform.opt_trans_graph 0.06% : 0.000032s : 4: opt.transform.symbol_engine_opt 5.48% : 0.002809s : 1: opt_a 0.25% : 0.000129s : 1: opt_after_cconv 0.96% : 0.000491s : 1: opt_after_jit_grad 0.35% : 0.000182s : 1: opt_b 9.41% : 0.004824s : 1: optimize 0.04% : 0.000018s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000021s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000037s : 1: pre_auto_parallel 0.02% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000032s : 1: remove_dup_value 0.93% : 0.000476s : 1: renormalize.infer 0.91% : 0.000469s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000019s : 1: rewriter_after_opt_a 0.45% : 0.000232s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000004s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.16% : 0.000080s : 1: symbol_engine_optimizer 15.05% : 0.007719s : 1: task_emit 0.13% : 0.000068s : 1: tuple_transform 47.17% : 0.024195s : 1: type_inference 0.13% : 0.000066s : 1: validate TotalTime = 0.144106, [24] [bootstrap]: 0.00047177 [type_inference]: 0.0436716 [event_method]: 0.00020749 [auto_monad]: 0.00016141 [graph_reusing]: 9.82001e-06 [inline]: 1.65001e-06 [add_attr]: 0.00301877, [1] [add_attr_with_inline]: 0.00301046, [1] [Cycle 1]: 8.131e-05, [2] [tag_attr]: 4.102e-05 [meta_addattr_fg_expand]: 1.157e-05 [parallel-infer-symbol]: 2.58998e-06 [pre_auto_parallel]: 5.768e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 1.61002e-06 [optimize]: 0.0465225, [53] [py_interpret_to_execute]: 4.22e-06 [rewriter_before_opt_a]: 0.00035611 [opt_a]: 0.0347183, [3] [Cycle 1]: 0.0169471, [45] [expand_dump_flag]: 4.34002e-06 [switch_simplify]: 0.00015468 [loop_unroll]: 6.749e-05 [a_1]: 0.0014154 [with_stream_mark]: 2.251e-05 [recompute_prepare]: 2.045e-05 [updatestate_depend_eliminate]: 7.87003e-06 [updatestate_assign_eliminate]: 7.42998e-06 [updatestate_loads_eliminate]: 7.49002e-06 [parameter_eliminate]: 2.73e-06 [a_2]: 0.00021035 [accelerated_algorithm]: 1.436e-05 [shard]: 1.54e-06 [meta_shard_fg_expand]: 3.86999e-06 [shard_inline]: 1.422e-05 [merge_send_recv]: 1.516e-05 [auto_parallel]: 9.97999e-06 [parallel]: 1.652e-05 [flash_sp]: 8.79e-06 [merge_comm]: 8.73001e-06 [allreduce_fusion]: 7.75e-06 [matmul_add_comm_reduction]: 2.473e-05 [allreduce_slice_to_reducescatter]: 7.09988e-07 [virtual_shard_identity]: 1.557e-05 [virtual_dataset]: 1.362e-05 [get_grad_eliminate_]: 1.343e-05 [virtual_output]: 1.366e-05 [merge_forward]: 8.50001e-06 [cell_reuse_recompute_pass]: 1.14e-06 [offload_activation]: 1.524e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.523e-05 [merge_recompute_call_nodes]: 1.41998e-06 [before_grad]: 2.398e-05 [set_forward_comm_id_for_comm_node_pass]: 7.98999e-06 [meta_fg_expand]: 0.00145679 [flash_sp_send_recv_attached]: 3.73999e-06 [receive_attached]: 2.28002e-06 [after_resolve]: 6.366e-05 [a_after_grad]: 8.372e-05 [renormalize]: 0.0114131 [add_forward_monad_depend]: 9.71e-06 [auto_monad_grad]: 5.46e-06 [auto_monad_eliminator]: 0.00014126 [cse]: 0.00045511 [a_3]: 0.00083859 [Cycle 2]: 0.0114329, [45] [expand_dump_flag]: 1.77001e-06 [switch_simplify]: 0.00011387 [loop_unroll]: 0.00010605 [a_1]: 0.00346773 [with_stream_mark]: 6.202e-05 [recompute_prepare]: 7.713e-05 [updatestate_depend_eliminate]: 4.337e-05 [updatestate_assign_eliminate]: 4.152e-05 [updatestate_loads_eliminate]: 4.035e-05 [parameter_eliminate]: 1.37e-06 [a_2]: 0.00118422 [accelerated_algorithm]: 0.00013363 [shard]: 1.03001e-06 [meta_shard_fg_expand]: 1.523e-05 [shard_inline]: 7.634e-05 [merge_send_recv]: 4.917e-05 [auto_parallel]: 4.607e-05 [parallel]: 4.86997e-06 [flash_sp]: 3.19001e-06 [merge_comm]: 4.716e-05 [allreduce_fusion]: 4.517e-05 [matmul_add_comm_reduction]: 5.154e-05 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 7.564e-05 [virtual_dataset]: 7.368e-05 [get_grad_eliminate_]: 7.402e-05 [virtual_output]: 7.38e-05 [merge_forward]: 4.168e-05 [cell_reuse_recompute_pass]: 1.14998e-06 [offload_activation]: 5.339e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00013035 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 0.00012875 [set_forward_comm_id_for_comm_node_pass]: 5.048e-05 [meta_fg_expand]: 0.00012754 [flash_sp_send_recv_attached]: 1.04003e-06 [receive_attached]: 1.27e-06 [after_resolve]: 8.145e-05 [a_after_grad]: 0.00012176 [renormalize]: 0.0035373 [add_forward_monad_depend]: 4.63001e-06 [auto_monad_grad]: 1.36002e-06 [auto_monad_eliminator]: 0.00010011 [cse]: 0.00028755 [a_3]: 0.00058339 [Cycle 3]: 0.00632381, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 8.021e-05 [loop_unroll]: 7.44e-05 [a_1]: 0.00224122 [with_stream_mark]: 5.301e-05 [recompute_prepare]: 7.489e-05 [updatestate_depend_eliminate]: 4.545e-05 [updatestate_assign_eliminate]: 4.268e-05 [updatestate_loads_eliminate]: 4.284e-05 [parameter_eliminate]: 1.23002e-06 [a_2]: 0.00119147 [accelerated_algorithm]: 9.515e-05 [shard]: 1.10001e-06 [meta_shard_fg_expand]: 1.608e-05 [shard_inline]: 7.755e-05 [merge_send_recv]: 5.211e-05 [auto_parallel]: 4.835e-05 [parallel]: 4.42e-06 [flash_sp]: 9.80013e-07 [merge_comm]: 4.777e-05 [allreduce_fusion]: 4.687e-05 [matmul_add_comm_reduction]: 5.427e-05 [allreduce_slice_to_reducescatter]: 4.70027e-07 [virtual_shard_identity]: 7.659e-05 [virtual_dataset]: 7.448e-05 [get_grad_eliminate_]: 7.45e-05 [virtual_output]: 7.38e-05 [merge_forward]: 4.29e-05 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 5.637e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00013224 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 0.00012482 [set_forward_comm_id_for_comm_node_pass]: 5.022e-05 [meta_fg_expand]: 3.261e-05 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 9.20001e-07 [after_resolve]: 7.64e-05 [a_after_grad]: 0.00012221 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.99e-06 [auto_monad_grad]: 1.13001e-06 [auto_monad_eliminator]: 7.883e-05 [cse]: 0.00023581 [a_3]: 0.00055135 [py_interpret_to_execute_after_opt_a]: 4.65999e-06 [slice_cell_reuse_recomputed_activation]: 2.32999e-06 [rewriter_after_opt_a]: 0.00021797 [convert_after_rewriter]: 1.44e-06 [order_py_execute_after_rewriter]: 1.22e-06 [mutable_eliminate]: 0.00056469 [opt_b]: 0.00662381, [2] [Cycle 1]: 0.00427429, [7] [b_1]: 0.00376719 [b_2]: 7.842e-05 [updatestate_depend_eliminate]: 5.277e-05 [updatestate_assign_eliminate]: 4.302e-05 [updatestate_loads_eliminate]: 4.268e-05 [renormalize]: 4.90021e-07 [cse]: 0.00024296 [Cycle 2]: 0.00233901, [7] [b_1]: 0.00185921 [b_2]: 7.7e-05 [updatestate_depend_eliminate]: 4.803e-05 [updatestate_assign_eliminate]: 4.232e-05 [updatestate_loads_eliminate]: 4.224e-05 [renormalize]: 6.99947e-08 [cse]: 0.00023031 [optimize_parallel_all_gather_comm]: 8.918e-05 [overlap_param_gather]: 2.19999e-06 [cconv]: 3.579e-05 [loop_unroll]: 0.00053409 [opt_after_cconv]: 0.00082718, [1] [Cycle 1]: 0.00082082, [7] [c_1]: 0.00039543 [parameter_eliminate]: 2.27999e-06 [updatestate_depend_eliminate]: 5.453e-05 [updatestate_assign_eliminate]: 4.355e-05 [updatestate_loads_eliminate]: 4.272e-05 [cse]: 0.00024302 [renormalize]: 4.30009e-07 [remove_dup_value]: 0.00047122 [tuple_transform]: 0.00059901, [1] [Cycle 1]: 0.00059301, [4] [d_1]: 0.00049868 [none_parameter_eliminate]: 2.17999e-06 [renormalize]: 3.10014e-07 [switch_simplify]: 6.763e-05 [partial_unused_args_eliminate]: 1.89e-06 [add_recomputation]: 0.00024113 [cse_after_recomputation]: 0.00014732, [1] [Cycle 1]: 0.0001422, [1] [cse]: 0.00013465 [environ_conv]: 2.606e-05 [swap_dp_allreduce_reducescatter]: 4.446e-05 [bias_add_comm_swap]: 2.19001e-06 [label_micro_interleaved_index]: 4.02e-06 [label_fine_grained_interleaved_index]: 2.43998e-06 [merge_cast_opt]: 1.32e-06 [slice_recompute_activation]: 2.24999e-06 [micro_interleaved_order_control]: 2.05002e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 7.60017e-07 [remove_cast_before_assign_add]: 1.07998e-06 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 2.63998e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 9.49978e-07 [interleave_split_concat_branches]: 1.07998e-06 [interleave_parallel_branches]: 9.89996e-07 [overlap_opt_shard_in_pipeline]: 1.66e-06 [overlap_opt_shard_grad_in_pipeline]: 1.64998e-06 [control_data_broadcast_order]: 0.00010238 [grouped_pairwise_exchange_alltoall]: 1.81e-06 [offloading_packed_experts]: 2.581e-05 [overlap_recompute_and_grad_model_parallel]: 2.574e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42e-06 [overlap_recompute_comm]: 2.16998e-06 [overlap_grad_ring_attention]: 2.541e-05 [overlap_grad_flash_sp]: 0.00013462 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 1.86003e-06 [split_layernorm_comm]: 1.94e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 0.00042029, [1] [Cycle 1]: 0.00041529, [6] [build]: 1.587e-05 [elim_shapecalc]: 7.199e-05 [elim_not_effective]: 0.00011855 [opt_reshape]: 6.417e-05 [fold_const_symbol]: 0.00011416 [renormalize]: 2.59985e-07 [detach_backward]: 2.03997e-06 [pipeline_parallel_scheduler]: 1.82001e-06 [auto_monad_reorder]: 9.574e-05 [get_jit_bprop_graph]: 1.19e-06 [rewriter_after_jit_bprop_graph]: 3.58e-06 [opt_after_jit_grad]: 0.00068031 [validate]: 0.00015333 [backend_pass]: 8.79983e-07 [task_emit]: 0.0487162 [execute]: 1.003e-05 Sums bootstrap : 0.000472s : 0.34% type_inference : 0.043672s : 31.29% event_method : 0.000207s : 0.15% auto_monad : 0.000161s : 0.12% graph_reusing : 0.000010s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000041s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000058s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000356s : 0.26% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000349s : 0.25% optimize.opt_a.loop_unroll : 0.000248s : 0.18% optimize.opt_a.a_1 : 0.007124s : 5.10% optimize.opt_a.with_stream_mark : 0.000138s : 0.10% optimize.opt_a.recompute_prepare : 0.000172s : 0.12% optimize.opt_a.updatestate_depend_eliminate : 0.000097s : 0.07% optimize.opt_a.updatestate_assign_eliminate : 0.000092s : 0.07% optimize.opt_a.updatestate_loads_eliminate : 0.000091s : 0.06% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.002586s : 1.85% optimize.opt_a.accelerated_algorithm : 0.000243s : 0.17% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000035s : 0.03% optimize.opt_a.shard_inline : 0.000168s : 0.12% optimize.opt_a.merge_send_recv : 0.000116s : 0.08% optimize.opt_a.auto_parallel : 0.000104s : 0.07% optimize.opt_a.parallel : 0.000026s : 0.02% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000104s : 0.07% optimize.opt_a.allreduce_fusion : 0.000100s : 0.07% optimize.opt_a.matmul_add_comm_reduction : 0.000131s : 0.09% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000168s : 0.12% optimize.opt_a.virtual_dataset : 0.000162s : 0.12% optimize.opt_a.get_grad_eliminate_ : 0.000162s : 0.12% optimize.opt_a.virtual_output : 0.000161s : 0.12% optimize.opt_a.merge_forward : 0.000093s : 0.07% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000125s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000288s : 0.21% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000278s : 0.20% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000109s : 0.08% optimize.opt_a.meta_fg_expand : 0.001617s : 1.16% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000222s : 0.16% optimize.opt_a.a_after_grad : 0.000328s : 0.23% optimize.opt_a.renormalize : 0.014950s : 10.71% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.01% optimize.opt_a.auto_monad_grad : 0.000008s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000320s : 0.23% optimize.opt_a.cse : 0.000978s : 0.70% optimize.opt_a.a_3 : 0.001973s : 1.41% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000218s : 0.16% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000565s : 0.40% optimize.opt_b.b_1 : 0.005626s : 4.03% optimize.opt_b.b_2 : 0.000155s : 0.11% optimize.opt_b.updatestate_depend_eliminate : 0.000101s : 0.07% optimize.opt_b.updatestate_assign_eliminate : 0.000085s : 0.06% optimize.opt_b.updatestate_loads_eliminate : 0.000085s : 0.06% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000473s : 0.34% optimize.optimize_parallel_all_gather_comm : 0.000089s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000036s : 0.03% optimize.loop_unroll : 0.000534s : 0.38% optimize.opt_after_cconv.c_1 : 0.000395s : 0.28% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000055s : 0.04% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000044s : 0.03% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000043s : 0.03% optimize.opt_after_cconv.cse : 0.000243s : 0.17% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000471s : 0.34% optimize.tuple_transform.d_1 : 0.000499s : 0.36% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000068s : 0.05% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000241s : 0.17% optimize.cse_after_recomputation.cse : 0.000135s : 0.10% optimize.environ_conv : 0.000026s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000044s : 0.03% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000102s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000026s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000026s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000025s : 0.02% optimize.overlap_grad_flash_sp : 0.000135s : 0.10% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000016s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000072s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000119s : 0.08% optimize.symbol_engine_optimizer.opt_reshape : 0.000064s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000114s : 0.08% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000096s : 0.07% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000680s : 0.49% validate : 0.000153s : 0.11% backend_pass : 0.000001s : 0.00% task_emit : 0.048716s : 34.90% execute : 0.000010s : 0.01% Time group info: ------[substitution.] 0.001332 696 6.72% : 0.000090s : 36: substitution.arithmetic_simplify 1.31% : 0.000017s : 52: substitution.elim_not_effective 0.57% : 0.000008s : 11: substitution.float_depend_g_call 1.22% : 0.000016s : 17: substitution.float_tuple_getitem_switch 1.26% : 0.000017s : 52: substitution.fold_const_symbol 2.89% : 0.000039s : 59: substitution.graph_param_transform 0.20% : 0.000003s : 2: substitution.incorporate_call 0.14% : 0.000002s : 2: substitution.incorporate_call_switch 43.80% : 0.000583s : 21: substitution.inline 1.16% : 0.000015s : 2: substitution.inline_without_move 3.46% : 0.000046s : 114: substitution.j_node_and_user_rematch 4.01% : 0.000053s : 10: substitution.less_batch_normalization 1.27% : 0.000017s : 13: substitution.minmaximum_grad 0.64% : 0.000008s : 11: substitution.partial_eliminate 4.68% : 0.000062s : 114: substitution.remove_not_recompute_node 1.55% : 0.000021s : 9: substitution.replace_applicator 0.66% : 0.000009s : 11: substitution.replace_old_param 0.17% : 0.000002s : 1: substitution.set_cell_output_no_recompute 0.87% : 0.000012s : 4: substitution.switch_simplify 1.87% : 0.000025s : 14: substitution.transpose_eliminate 4.50% : 0.000060s : 25: substitution.tuple_list_convert_item_index_to_positive 2.30% : 0.000031s : 25: substitution.tuple_list_get_item_const_eliminator 2.94% : 0.000039s : 25: substitution.tuple_list_get_item_depend_reorder 6.98% : 0.000093s : 40: substitution.tuple_list_get_item_eliminator 2.94% : 0.000039s : 25: substitution.tuple_list_get_set_item_eliminator 1.90% : 0.000025s : 1: substitution.zero_like_fill_zero ------[type_inference.] 0.043585 2 94.40% : 0.041144s : 1: type_inference.infer 5.60% : 0.002440s : 1: type_inference.specialize ------[replace.] 0.000274 33 56.01% : 0.000154s : 21: replace.inline 15.35% : 0.000042s : 4: replace.switch_simplify 23.11% : 0.000063s : 7: replace.tuple_list_get_item_eliminator 5.52% : 0.000015s : 1: replace.zero_like_fill_zero ------[match.] 0.000627 33 91.27% : 0.000572s : 21: match.inline 1.43% : 0.000009s : 4: match.switch_simplify 3.42% : 0.000021s : 7: match.tuple_list_get_item_eliminator 3.87% : 0.000024s : 1: match.zero_like_fill_zero ------[predicate.] 0.003097 24043 0.73% : 0.000023s : 191: predicate.accumulaten_eliminater 0.42% : 0.000013s : 59: predicate.ad_related_special_op_eliminate 0.64% : 0.000020s : 158: predicate.addn_check_dump 0.74% : 0.000023s : 191: predicate.addn_zero_filter 0.72% : 0.000022s : 191: predicate.adjust_all_reduce_mul_add 1.83% : 0.000057s : 349: predicate.arithmetic_simplify 0.75% : 0.000023s : 191: predicate.cast_eliminate 1.07% : 0.000033s : 256: predicate.check_bprop_eliminate 0.65% : 0.000020s : 158: predicate.compare_switch_simplify 0.53% : 0.000017s : 216: predicate.const_output_eliminate 0.65% : 0.000020s : 158: predicate.depend_value_elim 0.82% : 0.000025s : 191: predicate.dict_get_item_const_eliminator 0.84% : 0.000026s : 191: predicate.dict_get_item_eliminator 0.74% : 0.000023s : 191: predicate.dict_set_item_eliminator 1.19% : 0.000037s : 275: predicate.dumpgradient_eliminate 0.13% : 0.000004s : 59: predicate.elim_not_effective 0.28% : 0.000009s : 59: predicate.elim_shapecalc_of_broadcastargs 1.60% : 0.000050s : 407: predicate.environ_add_const_eliminate 1.60% : 0.000049s : 407: predicate.environ_get_add_eliminate 1.59% : 0.000049s : 407: predicate.environ_get_depend_swap 2.25% : 0.000070s : 565: predicate.environ_get_eliminate 1.60% : 0.000050s : 407: predicate.environ_get_set_eliminate 0.87% : 0.000027s : 219: predicate.exchange_switch_depend_value 1.14% : 0.000035s : 219: predicate.float_depend_g_call 0.65% : 0.000020s : 158: predicate.float_environ_get_switch 1.57% : 0.000049s : 374: predicate.float_tuple_getitem_switch 0.13% : 0.000004s : 59: predicate.fold_const_symbol 0.68% : 0.000021s : 158: predicate.get_grad_eliminate 0.16% : 0.000005s : 59: predicate.graph_param_transform 0.65% : 0.000020s : 158: predicate.incorporate_call 0.62% : 0.000019s : 158: predicate.incorporate_call_switch 5.12% : 0.000158s : 1014: predicate.inline 1.00% : 0.000031s : 185: predicate.inline_without_move 0.37% : 0.000011s : 158: predicate.j_node_and_user_rematch 0.73% : 0.000022s : 158: predicate.less_batch_normalization 1.96% : 0.000061s : 473: predicate.list_to_tuple_eliminator_ 2.61% : 0.000081s : 677: predicate.load_eliminater 0.50% : 0.000015s : 72: predicate.loop_unroll_after_grad 1.12% : 0.000035s : 259: predicate.loop_unroll_before_grad 1.94% : 0.000060s : 479: predicate.make_slice_get_slice_eliminator 0.66% : 0.000020s : 158: predicate.merge_addn 1.04% : 0.000032s : 256: predicate.micro_step_allgather_replace 1.06% : 0.000033s : 256: predicate.mini_step_allgather_replace 0.73% : 0.000023s : 191: predicate.minmaximum_grad 0.50% : 0.000015s : 73: predicate.mutable_eliminate 0.27% : 0.000008s : 59: predicate.opt_reshape 0.93% : 0.000029s : 216: predicate.parallel_virtual_node 1.12% : 0.000035s : 219: predicate.partial_defer_inline 1.22% : 0.000038s : 270: predicate.partial_eliminate 0.73% : 0.000023s : 191: predicate.print_const_string_wrapper 0.65% : 0.000020s : 158: predicate.reduce_all_const_elim 0.94% : 0.000029s : 191: predicate.reduce_eliminate 2.64% : 0.000082s : 677: predicate.redundant_stop_gradient_eliminater 0.36% : 0.000011s : 158: predicate.remove_not_recompute_node 1.32% : 0.000041s : 454: predicate.replace_applicator 0.44% : 0.000014s : 185: predicate.replace_old_param 0.54% : 0.000017s : 216: predicate.reset_defer_inline 0.76% : 0.000024s : 191: predicate.reshape_eliminate 1.07% : 0.000033s : 256: predicate.row_tensor_add_zeros_like 0.65% : 0.000020s : 144: predicate.row_tensor_eliminate 1.19% : 0.000037s : 256: predicate.same_eliminate 0.42% : 0.000013s : 158: predicate.set_cell_output_no_recompute 0.70% : 0.000022s : 158: predicate.shard_identity_eliminate 1.20% : 0.000037s : 275: predicate.special_op_eliminate 0.72% : 0.000022s : 158: predicate.specialize_transform 1.10% : 0.000034s : 256: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000028s : 185: predicate.stack_unstack_eliminate 0.32% : 0.000010s : 72: predicate.switch_call_monad_eliminater 0.95% : 0.000030s : 219: predicate.switch_defer_inline 2.04% : 0.000063s : 475: predicate.switch_layer_defer_inline 3.08% : 0.000095s : 703: predicate.switch_simplify 0.75% : 0.000023s : 191: predicate.tile_eliminate 0.78% : 0.000024s : 191: predicate.transpose_eliminate 1.99% : 0.000062s : 466: predicate.tuple_list_convert_item_index_to_positive 2.06% : 0.000064s : 466: predicate.tuple_list_get_item_const_eliminator 1.96% : 0.000061s : 466: predicate.tuple_list_get_item_depend_reorder 2.90% : 0.000090s : 631: predicate.tuple_list_get_item_eliminator 1.97% : 0.000061s : 466: predicate.tuple_list_get_set_item_eliminator 2.69% : 0.000083s : 624: predicate.tuple_list_set_item_eliminator 1.93% : 0.000060s : 473: predicate.tuple_to_list_eliminator_ 2.63% : 0.000081s : 677: predicate.updatestate_pure_node_eliminater 3.34% : 0.000103s : 835: predicate.updatestate_useless_node_eliminater 0.92% : 0.000028s : 216: predicate.value_based_eliminate 0.68% : 0.000021s : 158: predicate.virtual_dataset_eliminate 0.70% : 0.000022s : 158: predicate.virtual_output_eliminate 0.28% : 0.000009s : 59: predicate.virtual_view_grad_eliminate 0.98% : 0.000030s : 218: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003827 51 64.69% : 0.002476s : 26: func_graph_cloner_run.FuncGraphClonerGraph 35.31% : 0.001351s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.230376 292 0.00% : 0.000003s : 1: ForceFp32Comm 1.31% : 0.003023s : 1: add_attr 1.31% : 0.003014s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.11% : 0.000246s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.07% : 0.000171s : 1: auto_monad 0.04% : 0.000100s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.22% : 0.000501s : 1: bootstrap 0.02% : 0.000040s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000106s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.07% : 0.000151s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000030s : 1: environ_conv 0.09% : 0.000218s : 1: event_method 0.01% : 0.000018s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.24% : 0.000543s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.25% : 0.000573s : 1: mutable_eliminate 0.01% : 0.000029s : 1: offloading_packed_experts 0.04% : 0.000096s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000099s : 1: opt.transform.mutable_eliminate 6.29% : 0.014481s : 117: opt.transform.opt_a 0.17% : 0.000394s : 1: opt.transform.opt_after_cconv 0.09% : 0.000197s : 1: opt.transform.opt_after_jit_grad 2.47% : 0.005694s : 83: opt.transform.opt_b 0.24% : 0.000563s : 2: opt.transform.opt_trans_graph 0.16% : 0.000365s : 4: opt.transform.symbol_engine_opt 15.07% : 0.034722s : 1: opt_a 0.36% : 0.000831s : 1: opt_after_cconv 0.30% : 0.000690s : 1: opt_after_jit_grad 2.88% : 0.006628s : 1: opt_b 20.20% : 0.046527s : 1: optimize 0.04% : 0.000093s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.06% : 0.000138s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000028s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000029s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.03% : 0.000062s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.21% : 0.000479s : 1: remove_dup_value 4.55% : 0.010477s : 2: renormalize.infer 1.93% : 0.004457s : 2: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.10% : 0.000222s : 1: rewriter_after_opt_a 0.16% : 0.000363s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000048s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000423s : 1: symbol_engine_optimizer 21.16% : 0.048741s : 1: task_emit 0.26% : 0.000602s : 1: tuple_transform 18.96% : 0.043687s : 1: type_inference 0.11% : 0.000256s : 1: validate . TotalTime = 23.3126, [24] [bootstrap]: 0.00088137 [type_inference]: 0.0808673 [event_method]: 0.00025938 [auto_monad]: 0.00029461 [graph_reusing]: 1.878e-05 [inline]: 4e-06 [add_attr]: 0.0068864, [1] [add_attr_with_inline]: 0.00686579, [1] [Cycle 1]: 0.00014799, [2] [tag_attr]: 8.783e-05 [meta_addattr_fg_expand]: 1.386e-05 [parallel-infer-symbol]: 3.84002e-06 [pre_auto_parallel]: 8.431e-05 [insert-virtual-dataset]: 3.16001e-06 [parallel-infer-symbol-second]: 1.52999e-06 [dataset_repeat_opt]: 1.91998e-06 [pipeline_split]: 1.58002e-06 [optimize]: 0.0832837, [53] [py_interpret_to_execute]: 8.13001e-06 [rewriter_before_opt_a]: 0.00050353 [opt_a]: 0.0795204, [3] [Cycle 1]: 0.0711741, [45] [expand_dump_flag]: 6.37001e-06 [switch_simplify]: 0.00020612 [loop_unroll]: 7.516e-05 [a_1]: 0.00185587 [with_stream_mark]: 3.574e-05 [recompute_prepare]: 2.599e-05 [updatestate_depend_eliminate]: 1.142e-05 [updatestate_assign_eliminate]: 8.78001e-06 [updatestate_loads_eliminate]: 8.91002e-06 [parameter_eliminate]: 3.64002e-06 [a_2]: 0.00023958 [accelerated_algorithm]: 1.643e-05 [shard]: 2.41e-06 [meta_shard_fg_expand]: 5.93002e-06 [shard_inline]: 1.498e-05 [merge_send_recv]: 2.157e-05 [auto_parallel]: 1.741e-05 [parallel]: 7.403e-05 [flash_sp]: 1.7e-05 [merge_comm]: 1.223e-05 [allreduce_fusion]: 9.67001e-06 [matmul_add_comm_reduction]: 3.745e-05 [allreduce_slice_to_reducescatter]: 1.24e-06 [virtual_shard_identity]: 2.363e-05 [virtual_dataset]: 1.601e-05 [get_grad_eliminate_]: 1.716e-05 [virtual_output]: 1.601e-05 [merge_forward]: 1.001e-05 [cell_reuse_recompute_pass]: 1.89e-06 [offload_activation]: 1.979e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.472e-05 [merge_recompute_call_nodes]: 1.63002e-06 [before_grad]: 2.877e-05 [set_forward_comm_id_for_comm_node_pass]: 1.087e-05 [meta_fg_expand]: 0.0255192 [flash_sp_send_recv_attached]: 6.05002e-06 [receive_attached]: 3.21999e-06 [after_resolve]: 0.00011765 [a_after_grad]: 0.00014618 [renormalize]: 0.0399824 [add_forward_monad_depend]: 2.73e-05 [auto_monad_grad]: 1.66e-05 [auto_monad_eliminator]: 0.00014161 [cse]: 0.00037252 [a_3]: 0.00149878 [Cycle 2]: 0.00686309, [45] [expand_dump_flag]: 4.03001e-06 [switch_simplify]: 9.446e-05 [loop_unroll]: 8.45e-05 [a_1]: 0.00198307 [with_stream_mark]: 4.057e-05 [recompute_prepare]: 2.076e-05 [updatestate_depend_eliminate]: 1.07e-05 [updatestate_assign_eliminate]: 8.08001e-06 [updatestate_loads_eliminate]: 7.58999e-06 [parameter_eliminate]: 2.74001e-06 [a_2]: 0.00021005 [accelerated_algorithm]: 4.194e-05 [shard]: 2.56e-06 [meta_shard_fg_expand]: 5.29e-06 [shard_inline]: 1.534e-05 [merge_send_recv]: 1.66e-05 [auto_parallel]: 1.665e-05 [parallel]: 1.161e-05 [flash_sp]: 4.43999e-06 [merge_comm]: 9.51e-06 [allreduce_fusion]: 8.62e-06 [matmul_add_comm_reduction]: 1.608e-05 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 1.754e-05 [virtual_dataset]: 1.421e-05 [get_grad_eliminate_]: 1.385e-05 [virtual_output]: 1.513e-05 [merge_forward]: 9.86e-06 [cell_reuse_recompute_pass]: 2.31998e-06 [offload_activation]: 1.768e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.131e-05 [merge_recompute_call_nodes]: 1.54998e-06 [before_grad]: 2.328e-05 [set_forward_comm_id_for_comm_node_pass]: 9.20999e-06 [meta_fg_expand]: 0.00037464 [flash_sp_send_recv_attached]: 2.97002e-06 [receive_attached]: 4.2e-06 [after_resolve]: 3.131e-05 [a_after_grad]: 2.207e-05 [renormalize]: 0.00292489 [add_forward_monad_depend]: 1.124e-05 [auto_monad_grad]: 2.48e-06 [auto_monad_eliminator]: 3.772e-05 [cse]: 0.00020339 [a_3]: 0.00012023 [Cycle 3]: 0.00145815, [45] [expand_dump_flag]: 3.13e-06 [switch_simplify]: 1.796e-05 [loop_unroll]: 1.386e-05 [a_1]: 0.0003915 [with_stream_mark]: 2.593e-05 [recompute_prepare]: 1.563e-05 [updatestate_depend_eliminate]: 1.035e-05 [updatestate_assign_eliminate]: 7.97e-06 [updatestate_loads_eliminate]: 8.08999e-06 [parameter_eliminate]: 2.12999e-06 [a_2]: 0.00021511 [accelerated_algorithm]: 2.132e-05 [shard]: 2.51e-06 [meta_shard_fg_expand]: 4.22e-06 [shard_inline]: 1.412e-05 [merge_send_recv]: 1.694e-05 [auto_parallel]: 1.675e-05 [parallel]: 8.93002e-06 [flash_sp]: 1.17999e-06 [merge_comm]: 8.69e-06 [allreduce_fusion]: 9.20001e-06 [matmul_add_comm_reduction]: 1.595e-05 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 1.515e-05 [virtual_dataset]: 1.484e-05 [get_grad_eliminate_]: 1.329e-05 [virtual_output]: 1.336e-05 [merge_forward]: 9.86998e-06 [cell_reuse_recompute_pass]: 3.61001e-06 [offload_activation]: 1.75e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.695e-05 [merge_recompute_call_nodes]: 1.67999e-06 [before_grad]: 2.339e-05 [set_forward_comm_id_for_comm_node_pass]: 8.85999e-06 [meta_fg_expand]: 5.99999e-06 [flash_sp_send_recv_attached]: 2.34001e-06 [receive_attached]: 2.42001e-06 [after_resolve]: 1.953e-05 [a_after_grad]: 2.101e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 2.89999e-06 [auto_monad_grad]: 1.59e-06 [auto_monad_eliminator]: 2.444e-05 [cse]: 7.166e-05 [a_3]: 9.297e-05 [py_interpret_to_execute_after_opt_a]: 9.78998e-06 [slice_cell_reuse_recomputed_activation]: 2.22999e-06 [rewriter_after_opt_a]: 5.629e-05 [convert_after_rewriter]: 1.28002e-06 [order_py_execute_after_rewriter]: 1.19e-06 [mutable_eliminate]: 0.00083463 [opt_b]: 0.00052354, [1] [Cycle 1]: 0.00051367, [7] [b_1]: 0.00035637 [b_2]: 1.607e-05 [updatestate_depend_eliminate]: 1.364e-05 [updatestate_assign_eliminate]: 7.10002e-06 [updatestate_loads_eliminate]: 7.50998e-06 [renormalize]: 1.07e-06 [cse]: 7.026e-05 [optimize_parallel_all_gather_comm]: 3.112e-05 [overlap_param_gather]: 2.34999e-06 [cconv]: 3.684e-05 [loop_unroll]: 0.00053108 [opt_after_cconv]: 0.00021292, [1] [Cycle 1]: 0.00020456, [7] [c_1]: 7.277e-05 [parameter_eliminate]: 4.30999e-06 [updatestate_depend_eliminate]: 1.157e-05 [updatestate_assign_eliminate]: 7.36001e-06 [updatestate_loads_eliminate]: 7.07002e-06 [cse]: 6.508e-05 [renormalize]: 5.59987e-07 [remove_dup_value]: 0.00011924 [tuple_transform]: 0.00017755, [1] [Cycle 1]: 0.00017146, [4] [d_1]: 0.00013035 [none_parameter_eliminate]: 2.80997e-06 [renormalize]: 4.39992e-07 [switch_simplify]: 1.547e-05 [partial_unused_args_eliminate]: 1.81e-06 [add_recomputation]: 9.065e-05 [cse_after_recomputation]: 5.719e-05, [1] [Cycle 1]: 5.14e-05, [1] [cse]: 4.534e-05 [environ_conv]: 1.636e-05 [swap_dp_allreduce_reducescatter]: 1.285e-05 [bias_add_comm_swap]: 3.38999e-06 [label_micro_interleaved_index]: 6.38998e-06 [label_fine_grained_interleaved_index]: 2.54999e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.22999e-06 [micro_interleaved_order_control]: 2.71999e-06 [assign_add_opt]: 1.16997e-06 [ForceFp32Comm]: 1.29e-06 [remove_cast_before_assign_add]: 1.52999e-06 [full_micro_interleaved_order_control]: 2.27999e-06 [reorder_send_recv_between_fp_bp]: 2.66e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 1.01997e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.24e-06 [overlap_opt_shard_in_pipeline]: 1.57001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.31e-06 [control_data_broadcast_order]: 2.629e-05 [grouped_pairwise_exchange_alltoall]: 1.54998e-06 [offloading_packed_experts]: 7.42002e-06 [overlap_recompute_and_grad_model_parallel]: 9.09e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.17999e-06 [overlap_grad_ring_attention]: 7.10998e-06 [overlap_grad_flash_sp]: 3.681e-05 [begin_end_overlap_inline]: 6.59988e-07 [split_matmul_comm_elemetwise]: 2.14e-06 [split_layernorm_comm]: 2.13002e-06 [handle_group_info]: 1.16002e-06 [symbol_engine_optimizer]: 0.00013914, [1] [Cycle 1]: 0.00013308, [6] [build]: 1.432e-05 [elim_shapecalc]: 2.111e-05 [elim_not_effective]: 2.889e-05 [opt_reshape]: 1.506e-05 [fold_const_symbol]: 2.332e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.23998e-06 [pipeline_parallel_scheduler]: 1.56002e-06 [auto_monad_reorder]: 3.536e-05 [get_jit_bprop_graph]: 2.63998e-06 [rewriter_after_jit_bprop_graph]: 6.06e-06 [opt_after_jit_grad]: 0.00061819 [validate]: 8.68e-05 [backend_pass]: 1.20999e-06 [task_emit]: 23.1387 [execute]: 1.063e-05 Sums bootstrap : 0.000881s : 0.00% type_inference : 0.080867s : 0.35% event_method : 0.000259s : 0.00% auto_monad : 0.000295s : 0.00% graph_reusing : 0.000019s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000088s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000084s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000002s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.00% optimize.rewriter_before_opt_a : 0.000504s : 0.00% optimize.opt_a.expand_dump_flag : 0.000014s : 0.00% optimize.opt_a.switch_simplify : 0.000319s : 0.00% optimize.opt_a.loop_unroll : 0.000174s : 0.00% optimize.opt_a.a_1 : 0.004230s : 0.02% optimize.opt_a.with_stream_mark : 0.000102s : 0.00% optimize.opt_a.recompute_prepare : 0.000062s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000032s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000025s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000025s : 0.00% optimize.opt_a.parameter_eliminate : 0.000009s : 0.00% optimize.opt_a.a_2 : 0.000665s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000080s : 0.00% optimize.opt_a.shard : 0.000007s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000015s : 0.00% optimize.opt_a.shard_inline : 0.000044s : 0.00% optimize.opt_a.merge_send_recv : 0.000055s : 0.00% optimize.opt_a.auto_parallel : 0.000051s : 0.00% optimize.opt_a.parallel : 0.000095s : 0.00% optimize.opt_a.flash_sp : 0.000023s : 0.00% optimize.opt_a.merge_comm : 0.000030s : 0.00% optimize.opt_a.allreduce_fusion : 0.000027s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000069s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000056s : 0.00% optimize.opt_a.virtual_dataset : 0.000045s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000044s : 0.00% optimize.opt_a.virtual_output : 0.000044s : 0.00% optimize.opt_a.merge_forward : 0.000030s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000008s : 0.00% optimize.opt_a.offload_activation : 0.000055s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000093s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000075s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000029s : 0.00% optimize.opt_a.meta_fg_expand : 0.025900s : 0.11% optimize.opt_a.flash_sp_send_recv_attached : 0.000011s : 0.00% optimize.opt_a.receive_attached : 0.000010s : 0.00% optimize.opt_a.after_resolve : 0.000168s : 0.00% optimize.opt_a.a_after_grad : 0.000189s : 0.00% optimize.opt_a.renormalize : 0.042907s : 0.18% optimize.opt_a.add_forward_monad_depend : 0.000041s : 0.00% optimize.opt_a.auto_monad_grad : 0.000021s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000204s : 0.00% optimize.opt_a.cse : 0.000648s : 0.00% optimize.opt_a.a_3 : 0.001712s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000056s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000835s : 0.00% optimize.opt_b.b_1 : 0.000356s : 0.00% optimize.opt_b.b_2 : 0.000016s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000070s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000037s : 0.00% optimize.loop_unroll : 0.000531s : 0.00% optimize.opt_after_cconv.c_1 : 0.000073s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.cse : 0.000065s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000119s : 0.00% optimize.tuple_transform.d_1 : 0.000130s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000015s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000091s : 0.00% optimize.cse_after_recomputation.cse : 0.000045s : 0.00% optimize.environ_conv : 0.000016s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000013s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000026s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000037s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000014s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000021s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000029s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000015s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000023s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000035s : 0.00% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000618s : 0.00% validate : 0.000087s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 23.138728s : 99.29% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.002289 315 0.19% : 0.000004s : 8: substitution.elim_not_effective 0.46% : 0.000011s : 12: substitution.float_depend_g_call 0.71% : 0.000016s : 9: substitution.float_tuple_getitem_switch 0.16% : 0.000004s : 8: substitution.fold_const_symbol 35.15% : 0.000805s : 5: substitution.getattr_setattr_resolve 0.46% : 0.000011s : 10: substitution.graph_param_transform 0.14% : 0.000003s : 2: substitution.incorporate_call 0.10% : 0.000002s : 2: substitution.incorporate_call_switch 38.45% : 0.000880s : 24: substitution.inline 1.52% : 0.000035s : 3: substitution.inline_without_move 0.70% : 0.000016s : 25: substitution.j_node_and_user_rematch 0.94% : 0.000022s : 4: substitution.less_batch_normalization 1.03% : 0.000024s : 13: substitution.minmaximum_grad 1.11% : 0.000025s : 12: substitution.partial_eliminate 0.90% : 0.000021s : 25: substitution.remove_not_recompute_node 4.41% : 0.000101s : 32: substitution.replace_applicator 0.79% : 0.000018s : 14: substitution.replace_old_param 0.12% : 0.000003s : 1: substitution.set_cell_output_no_recompute 0.63% : 0.000014s : 4: substitution.switch_simplify 0.58% : 0.000013s : 2: substitution.transpose_eliminate 2.41% : 0.000055s : 17: substitution.tuple_list_convert_item_index_to_positive 1.12% : 0.000026s : 17: substitution.tuple_list_get_item_const_eliminator 1.48% : 0.000034s : 17: substitution.tuple_list_get_item_depend_reorder 4.86% : 0.000111s : 32: substitution.tuple_list_get_item_eliminator 1.58% : 0.000036s : 17: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.080690 2 93.82% : 0.075701s : 1: type_inference.infer 6.18% : 0.004989s : 1: type_inference.specialize ------[replace.] 0.000639 45 12.64% : 0.000081s : 4: replace.getattr_setattr_resolve 50.45% : 0.000322s : 24: replace.inline 13.02% : 0.000083s : 5: replace.replace_applicator 10.50% : 0.000067s : 4: replace.switch_simplify 13.40% : 0.000086s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.001693 45 44.36% : 0.000751s : 4: match.getattr_setattr_resolve 50.96% : 0.000863s : 24: match.inline 2.18% : 0.000037s : 5: match.replace_applicator 0.68% : 0.000012s : 4: match.switch_simplify 1.82% : 0.000031s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.001073 7110 0.94% : 0.000010s : 68: predicate.accumulaten_eliminater 0.30% : 0.000003s : 10: predicate.ad_related_special_op_eliminate 0.44% : 0.000005s : 32: predicate.addn_check_dump 0.87% : 0.000009s : 68: predicate.addn_zero_filter 0.82% : 0.000009s : 68: predicate.adjust_all_reduce_mul_add 1.85% : 0.000020s : 100: predicate.arithmetic_simplify 1.02% : 0.000011s : 68: predicate.cast_eliminate 2.71% : 0.000029s : 215: predicate.check_bprop_eliminate 0.39% : 0.000004s : 32: predicate.compare_switch_simplify 0.08% : 0.000001s : 10: predicate.const_output_eliminate 0.45% : 0.000005s : 32: predicate.depend_value_elim 0.95% : 0.000010s : 68: predicate.dict_get_item_const_eliminator 1.05% : 0.000011s : 68: predicate.dict_get_item_eliminator 0.91% : 0.000010s : 68: predicate.dict_set_item_eliminator 0.37% : 0.000004s : 20: predicate.dumpgradient_eliminate 0.09% : 0.000001s : 10: predicate.elim_not_effective 0.18% : 0.000002s : 10: predicate.elim_shapecalc_of_broadcastargs 1.01% : 0.000011s : 78: predicate.environ_add_const_eliminate 0.98% : 0.000010s : 78: predicate.environ_get_add_eliminate 1.00% : 0.000011s : 78: predicate.environ_get_depend_swap 1.41% : 0.000015s : 110: predicate.environ_get_eliminate 0.95% : 0.000010s : 78: predicate.environ_get_set_eliminate 1.39% : 0.000015s : 100: predicate.exchange_switch_depend_value 1.94% : 0.000021s : 100: predicate.float_depend_g_call 0.45% : 0.000005s : 32: predicate.float_environ_get_switch 0.66% : 0.000007s : 42: predicate.float_tuple_getitem_switch 0.07% : 0.000001s : 10: predicate.fold_const_symbol 0.53% : 0.000006s : 32: predicate.get_grad_eliminate 0.75% : 0.000008s : 31: predicate.getattr_setattr_resolve 0.11% : 0.000001s : 10: predicate.graph_param_transform 0.43% : 0.000005s : 32: predicate.incorporate_call 0.38% : 0.000004s : 32: predicate.incorporate_call_switch 4.58% : 0.000049s : 252: predicate.inline 1.43% : 0.000015s : 82: predicate.inline_without_move 0.21% : 0.000002s : 32: predicate.j_node_and_user_rematch 0.76% : 0.000008s : 32: predicate.less_batch_normalization 1.36% : 0.000015s : 96: predicate.list_to_tuple_eliminator_ 2.13% : 0.000023s : 164: predicate.load_eliminater 0.37% : 0.000004s : 10: predicate.loop_unroll_after_grad 2.52% : 0.000027s : 182: predicate.loop_unroll_before_grad 1.17% : 0.000013s : 88: predicate.make_slice_get_slice_eliminator 0.52% : 0.000006s : 32: predicate.merge_addn 2.53% : 0.000027s : 198: predicate.micro_step_allgather_replace 2.51% : 0.000027s : 198: predicate.mini_step_allgather_replace 0.86% : 0.000009s : 68: predicate.minmaximum_grad 0.48% : 0.000005s : 10: predicate.mutable_eliminate 0.17% : 0.000002s : 10: predicate.opt_reshape 0.15% : 0.000002s : 10: predicate.parallel_virtual_node 2.01% : 0.000022s : 100: predicate.partial_defer_inline 1.21% : 0.000013s : 86: predicate.partial_eliminate 0.88% : 0.000009s : 68: predicate.print_const_string_wrapper 0.46% : 0.000005s : 32: predicate.reduce_all_const_elim 1.09% : 0.000012s : 68: predicate.reduce_eliminate 2.00% : 0.000021s : 164: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000003s : 32: predicate.remove_not_recompute_node 2.27% : 0.000024s : 284: predicate.replace_applicator 0.71% : 0.000008s : 82: predicate.replace_old_param 0.09% : 0.000001s : 10: predicate.reset_defer_inline 0.92% : 0.000010s : 68: predicate.reshape_eliminate 2.61% : 0.000028s : 198: predicate.row_tensor_add_zeros_like 0.18% : 0.000002s : 10: predicate.row_tensor_eliminate 2.97% : 0.000032s : 215: predicate.same_eliminate 0.28% : 0.000003s : 32: predicate.set_cell_output_no_recompute 0.68% : 0.000007s : 32: predicate.shard_identity_eliminate 0.32% : 0.000003s : 20: predicate.special_op_eliminate 0.49% : 0.000005s : 32: predicate.specialize_transform 2.78% : 0.000030s : 198: predicate.split_environ_get_set_with_tuple_value 1.45% : 0.000016s : 82: predicate.stack_unstack_eliminate 0.14% : 0.000002s : 10: predicate.switch_call_monad_eliminater 1.46% : 0.000016s : 100: predicate.switch_defer_inline 4.20% : 0.000045s : 315: predicate.switch_layer_defer_inline 5.15% : 0.000055s : 332: predicate.switch_simplify 0.92% : 0.000010s : 68: predicate.tile_eliminate 0.84% : 0.000009s : 68: predicate.transpose_eliminate 1.31% : 0.000014s : 88: predicate.tuple_list_convert_item_index_to_positive 1.30% : 0.000014s : 88: predicate.tuple_list_get_item_const_eliminator 1.27% : 0.000014s : 88: predicate.tuple_list_get_item_depend_reorder 2.26% : 0.000024s : 128: predicate.tuple_list_get_item_eliminator 1.24% : 0.000013s : 88: predicate.tuple_list_get_set_item_eliminator 1.82% : 0.000020s : 120: predicate.tuple_list_set_item_eliminator 1.34% : 0.000014s : 96: predicate.tuple_to_list_eliminator_ 1.99% : 0.000021s : 164: predicate.updatestate_pure_node_eliminater 2.37% : 0.000025s : 196: predicate.updatestate_useless_node_eliminater 0.16% : 0.000002s : 10: predicate.value_based_eliminate 0.52% : 0.000006s : 32: predicate.virtual_dataset_eliminate 0.47% : 0.000005s : 32: predicate.virtual_output_eliminate 0.14% : 0.000002s : 10: predicate.virtual_view_grad_eliminate 0.20% : 0.000002s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.007456 75 63.95% : 0.004768s : 36: func_graph_cloner_run.FuncGraphClonerGraph 36.05% : 0.002688s : 39: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 23.454895 247 0.00% : 0.000005s : 1: ForceFp32Comm 0.03% : 0.006893s : 1: add_attr 0.03% : 0.006871s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000096s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000310s : 1: auto_monad 0.00% : 0.000039s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.00% : 0.000920s : 1: bootstrap 0.00% : 0.000041s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000030s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000060s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000019s : 1: detach_backward 0.00% : 0.000020s : 1: environ_conv 0.00% : 0.000274s : 1: event_method 0.00% : 0.000037s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000024s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.00% : 0.000542s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.00% : 0.000850s : 1: mutable_eliminate 0.00% : 0.000011s : 1: offloading_packed_experts 0.00% : 0.000026s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000031s : 1: opt.transform.mutable_eliminate 0.03% : 0.007792s : 125: opt.transform.opt_a 0.00% : 0.000071s : 1: opt.transform.opt_after_cconv 0.00% : 0.000052s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000338s : 28: opt.transform.opt_b 0.00% : 0.000961s : 2: opt.transform.opt_resolve 0.00% : 0.000143s : 2: opt.transform.opt_trans_graph 0.00% : 0.000084s : 4: opt.transform.symbol_engine_opt 0.34% : 0.079525s : 1: opt_a 0.00% : 0.000217s : 1: opt_after_cconv 0.00% : 0.000632s : 1: opt_after_jit_grad 0.00% : 0.000527s : 1: opt_b 0.36% : 0.083290s : 1: optimize 0.00% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000040s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000005s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000089s : 1: pre_auto_parallel 0.00% : 0.000012s : 1: py_interpret_to_execute 0.00% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000124s : 1: remove_dup_value 0.16% : 0.036551s : 2: renormalize.infer 0.03% : 0.006323s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000061s : 1: rewriter_after_opt_a 0.00% : 0.000512s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000016s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000142s : 1: symbol_engine_optimizer 98.65% : 23.138788s : 1: task_emit 0.00% : 0.000181s : 1: tuple_transform 0.34% : 0.080906s : 1: type_inference 0.00% : 0.000141s : 1: validate TotalTime = 0.086137, [24] [bootstrap]: 0.00059985 [type_inference]: 0.0312342 [event_method]: 0.00011325 [auto_monad]: 0.00018565 [graph_reusing]: 1.255e-05 [inline]: 2.21e-06 [add_attr]: 0.00351187, [1] [add_attr_with_inline]: 0.00350358, [1] [Cycle 1]: 6.55e-05, [2] [tag_attr]: 2.908e-05 [meta_addattr_fg_expand]: 8.32e-06 [parallel-infer-symbol]: 2.89001e-06 [pre_auto_parallel]: 3.925e-05 [insert-virtual-dataset]: 2.64001e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 2.22001e-06 [pipeline_split]: 1.64998e-06 [optimize]: 0.00572384, [53] [py_interpret_to_execute]: 4.29002e-06 [rewriter_before_opt_a]: 0.00023467 [opt_a]: 0.00351633, [2] [Cycle 1]: 0.0028318, [45] [expand_dump_flag]: 4.02002e-06 [switch_simplify]: 8.222e-05 [loop_unroll]: 3.884e-05 [a_1]: 0.00078876 [with_stream_mark]: 1.538e-05 [recompute_prepare]: 9.67001e-06 [updatestate_depend_eliminate]: 5.10999e-06 [updatestate_assign_eliminate]: 4.53001e-06 [updatestate_loads_eliminate]: 3.81999e-06 [parameter_eliminate]: 1.92001e-06 [a_2]: 9.52e-05 [accelerated_algorithm]: 7.85e-06 [shard]: 1.83002e-06 [meta_shard_fg_expand]: 2.39999e-06 [shard_inline]: 6.81999e-06 [merge_send_recv]: 9.13002e-06 [auto_parallel]: 6.24001e-06 [parallel]: 3.964e-05 [flash_sp]: 7.48e-06 [merge_comm]: 4.84e-06 [allreduce_fusion]: 4.18001e-06 [matmul_add_comm_reduction]: 1.008e-05 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 8.94e-06 [virtual_dataset]: 7.40998e-06 [get_grad_eliminate_]: 6.68e-06 [virtual_output]: 6.87002e-06 [merge_forward]: 4.50001e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 9.76e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.348e-05 [merge_recompute_call_nodes]: 1.82001e-06 [before_grad]: 1.165e-05 [set_forward_comm_id_for_comm_node_pass]: 4.67e-06 [meta_fg_expand]: 4.12e-06 [flash_sp_send_recv_attached]: 2.40997e-06 [receive_attached]: 2.00002e-06 [after_resolve]: 9.93998e-06 [a_after_grad]: 1.021e-05 [renormalize]: 0.00122541 [add_forward_monad_depend]: 5.09e-06 [auto_monad_grad]: 1.69e-06 [auto_monad_eliminator]: 1.726e-05 [cse]: 3.647e-05 [a_3]: 4.973e-05 [Cycle 2]: 0.00067538, [45] [expand_dump_flag]: 1.29e-06 [switch_simplify]: 7.96001e-06 [loop_unroll]: 7.21001e-06 [a_1]: 0.00015082 [with_stream_mark]: 1.231e-05 [recompute_prepare]: 7.33e-06 [updatestate_depend_eliminate]: 3.67998e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 3.38999e-06 [parameter_eliminate]: 9.5999e-07 [a_2]: 8.4e-05 [accelerated_algorithm]: 6.86001e-06 [shard]: 9.39996e-07 [meta_shard_fg_expand]: 1.74998e-06 [shard_inline]: 6.66e-06 [merge_send_recv]: 5.49998e-06 [auto_parallel]: 5.86e-06 [parallel]: 4.73001e-06 [flash_sp]: 2.89999e-06 [merge_comm]: 3.85e-06 [allreduce_fusion]: 3.36001e-06 [matmul_add_comm_reduction]: 6.06998e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 6.90998e-06 [virtual_dataset]: 6.18998e-06 [get_grad_eliminate_]: 6.07001e-06 [virtual_output]: 5.87001e-06 [merge_forward]: 3.56999e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 6.23998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.135e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 9.91e-06 [set_forward_comm_id_for_comm_node_pass]: 4.51002e-06 [meta_fg_expand]: 2.56e-06 [flash_sp_send_recv_attached]: 7.80012e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 8.77e-06 [a_after_grad]: 9.17001e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.22e-06 [auto_monad_grad]: 7.90023e-07 [auto_monad_eliminator]: 7.91001e-06 [cse]: 1.89e-05 [a_3]: 3.824e-05 [py_interpret_to_execute_after_opt_a]: 4.03001e-06 [slice_cell_reuse_recomputed_activation]: 2.01e-06 [rewriter_after_opt_a]: 2.214e-05 [convert_after_rewriter]: 1.19e-06 [order_py_execute_after_rewriter]: 1.12999e-06 [mutable_eliminate]: 0.00046497 [opt_b]: 0.00023936, [1] [Cycle 1]: 0.00023367, [7] [b_1]: 0.00015462 [b_2]: 8.17998e-06 [updatestate_depend_eliminate]: 6.10002e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 2.94999e-06 [renormalize]: 3.69997e-07 [cse]: 2.379e-05 [optimize_parallel_all_gather_comm]: 3.362e-05 [overlap_param_gather]: 2.21998e-06 [cconv]: 2.268e-05 [loop_unroll]: 0.00043705 [opt_after_cconv]: 0.00010752, [1] [Cycle 1]: 0.00010214, [7] [c_1]: 3.09e-05 [parameter_eliminate]: 2.43e-06 [updatestate_depend_eliminate]: 5.92001e-06 [updatestate_assign_eliminate]: 3.6e-06 [updatestate_loads_eliminate]: 2.88e-06 [cse]: 2.379e-05 [renormalize]: 6.10016e-07 [remove_dup_value]: 1.727e-05 [tuple_transform]: 9.295e-05, [1] [Cycle 1]: 8.802e-05, [4] [d_1]: 6.05e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 7.6e-06 [partial_unused_args_eliminate]: 1.72001e-06 [add_recomputation]: 5.065e-05 [cse_after_recomputation]: 2.693e-05, [1] [Cycle 1]: 2.262e-05, [1] [cse]: 1.741e-05 [environ_conv]: 8.69e-06 [swap_dp_allreduce_reducescatter]: 6.38e-06 [bias_add_comm_swap]: 2.46e-06 [label_micro_interleaved_index]: 4.17e-06 [label_fine_grained_interleaved_index]: 2.81e-06 [merge_cast_opt]: 1.40001e-06 [slice_recompute_activation]: 2.11e-06 [micro_interleaved_order_control]: 2.66e-06 [assign_add_opt]: 1.16002e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.09998e-06 [full_micro_interleaved_order_control]: 2.49999e-06 [reorder_send_recv_between_fp_bp]: 2.71999e-06 [comm_op_add_attrs]: 1.30001e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.15999e-06 [overlap_opt_shard_in_pipeline]: 9.5999e-07 [overlap_opt_shard_grad_in_pipeline]: 1.69998e-06 [control_data_broadcast_order]: 1.364e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 4.77998e-06 [overlap_recompute_and_grad_model_parallel]: 5.26002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.11e-06 [overlap_grad_ring_attention]: 5.07e-06 [overlap_grad_flash_sp]: 1.961e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.40002e-06 [split_layernorm_comm]: 1.90001e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 0.00011948, [1] [Cycle 1]: 0.00011495, [6] [build]: 9.77999e-06 [elim_shapecalc]: 1.034e-05 [elim_not_effective]: 4.657e-05 [opt_reshape]: 7.97e-06 [fold_const_symbol]: 1.186e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.67999e-06 [pipeline_parallel_scheduler]: 1.55001e-06 [auto_monad_reorder]: 2.255e-05 [get_jit_bprop_graph]: 1.07e-06 [rewriter_after_jit_bprop_graph]: 3.75998e-06 [opt_after_jit_grad]: 0.00049324 [validate]: 4.238e-05 [backend_pass]: 8.40024e-07 [task_emit]: 0.0439086 [execute]: 8.77999e-06 Sums bootstrap : 0.000600s : 0.73% type_inference : 0.031234s : 38.26% event_method : 0.000113s : 0.14% auto_monad : 0.000186s : 0.23% graph_reusing : 0.000013s : 0.02% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000039s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000235s : 0.29% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000090s : 0.11% optimize.opt_a.loop_unroll : 0.000046s : 0.06% optimize.opt_a.a_1 : 0.000940s : 1.15% optimize.opt_a.with_stream_mark : 0.000028s : 0.03% optimize.opt_a.recompute_prepare : 0.000017s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000179s : 0.22% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000015s : 0.02% optimize.opt_a.auto_parallel : 0.000012s : 0.01% optimize.opt_a.parallel : 0.000044s : 0.05% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000016s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.02% optimize.opt_a.a_after_grad : 0.000019s : 0.02% optimize.opt_a.renormalize : 0.001225s : 1.50% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.01% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.03% optimize.opt_a.cse : 0.000055s : 0.07% optimize.opt_a.a_3 : 0.000088s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000022s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000465s : 0.57% optimize.opt_b.b_1 : 0.000155s : 0.19% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000034s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.03% optimize.loop_unroll : 0.000437s : 0.54% optimize.opt_after_cconv.c_1 : 0.000031s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.02% optimize.tuple_transform.d_1 : 0.000060s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000051s : 0.06% optimize.cse_after_recomputation.cse : 0.000017s : 0.02% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000047s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000023s : 0.03% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000493s : 0.60% validate : 0.000042s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.043909s : 53.78% execute : 0.000009s : 0.01% Time group info: ------[substitution.] 0.000313 62 0.77% : 0.000002s : 3: substitution.elim_not_effective 2.09% : 0.000007s : 3: substitution.float_tuple_getitem_switch 0.66% : 0.000002s : 3: substitution.fold_const_symbol 1.93% : 0.000006s : 4: substitution.graph_param_transform 56.69% : 0.000177s : 8: substitution.inline 1.33% : 0.000004s : 6: substitution.j_node_and_user_rematch 8.74% : 0.000027s : 2: substitution.minmaximum_grad 1.83% : 0.000006s : 6: substitution.remove_not_recompute_node 0.93% : 0.000003s : 2: substitution.replace_old_param 2.33% : 0.000007s : 1: substitution.switch_simplify 4.74% : 0.000015s : 4: substitution.tuple_list_convert_item_index_to_positive 2.15% : 0.000007s : 4: substitution.tuple_list_get_item_const_eliminator 3.23% : 0.000010s : 4: substitution.tuple_list_get_item_depend_reorder 9.55% : 0.000030s : 8: substitution.tuple_list_get_item_eliminator 3.01% : 0.000009s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.031167 2 94.10% : 0.029328s : 1: type_inference.infer 5.90% : 0.001840s : 1: type_inference.specialize ------[replace.] 0.000090 11 64.06% : 0.000058s : 8: replace.inline 16.22% : 0.000015s : 1: replace.switch_simplify 19.72% : 0.000018s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000184 11 94.04% : 0.000173s : 8: match.inline 3.41% : 0.000006s : 1: match.switch_simplify 2.54% : 0.000005s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000230 1438 1.08% : 0.000002s : 16: predicate.accumulaten_eliminater 0.82% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.45% : 0.000001s : 8: predicate.addn_check_dump 0.98% : 0.000002s : 16: predicate.addn_zero_filter 0.95% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.17% : 0.000005s : 24: predicate.arithmetic_simplify 1.01% : 0.000002s : 16: predicate.cast_eliminate 0.52% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.45% : 0.000001s : 8: predicate.depend_value_elim 1.05% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.08% : 0.000002s : 16: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 4: predicate.elim_not_effective 0.33% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.18% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.19% : 0.000003s : 20: predicate.environ_get_depend_swap 1.68% : 0.000004s : 28: predicate.environ_get_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.55% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.49% : 0.000006s : 26: predicate.float_depend_g_call 0.59% : 0.000001s : 8: predicate.float_environ_get_switch 0.84% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.58% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 6.01% : 0.000014s : 66: predicate.inline 0.70% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.87% : 0.000002s : 8: predicate.less_batch_normalization 1.70% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.42% : 0.000006s : 42: predicate.load_eliminater 0.97% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.79% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.53% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 8: predicate.merge_addn 0.48% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.47% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.00% : 0.000002s : 16: predicate.minmaximum_grad 0.98% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.35% : 0.000001s : 4: predicate.parallel_virtual_node 2.03% : 0.000005s : 26: predicate.partial_defer_inline 1.43% : 0.000003s : 22: predicate.partial_eliminate 1.01% : 0.000002s : 16: predicate.print_const_string_wrapper 0.52% : 0.000001s : 8: predicate.reduce_all_const_elim 1.40% : 0.000003s : 16: predicate.reduce_eliminate 2.33% : 0.000005s : 42: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000001s : 8: predicate.remove_not_recompute_node 1.24% : 0.000003s : 26: predicate.replace_applicator 0.35% : 0.000001s : 8: predicate.replace_old_param 0.19% : 0.000000s : 4: predicate.reset_defer_inline 0.91% : 0.000002s : 16: predicate.reshape_eliminate 0.55% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 4: predicate.row_tensor_eliminate 0.62% : 0.000001s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.70% : 0.000002s : 8: predicate.shard_identity_eliminate 0.63% : 0.000001s : 8: predicate.special_op_eliminate 0.59% : 0.000001s : 8: predicate.specialize_transform 0.68% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.73% : 0.000004s : 26: predicate.switch_defer_inline 2.18% : 0.000005s : 34: predicate.switch_layer_defer_inline 6.12% : 0.000014s : 86: predicate.switch_simplify 1.01% : 0.000002s : 16: predicate.tile_eliminate 0.93% : 0.000002s : 16: predicate.transpose_eliminate 1.78% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.84% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.61% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.52% : 0.000008s : 34: predicate.tuple_list_get_item_eliminator 1.57% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.76% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.36% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 3.08% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.29% : 0.000001s : 4: predicate.value_based_eliminate 0.60% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001673 23 60.41% : 0.001010s : 11: func_graph_cloner_run.FuncGraphClonerGraph 39.59% : 0.000662s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.098317 196 0.00% : 0.000003s : 1: ForceFp32Comm 3.58% : 0.003517s : 1: add_attr 3.57% : 0.003507s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000055s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.20% : 0.000196s : 1: auto_monad 0.03% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.65% : 0.000635s : 1: bootstrap 0.03% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000030s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.12% : 0.000123s : 1: event_method 0.02% : 0.000015s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000017s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.45% : 0.000445s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.48% : 0.000473s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 1.47% : 0.001447s : 78: opt.transform.opt_a 0.03% : 0.000030s : 1: opt.transform.opt_after_cconv 0.03% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000135s : 28: opt.transform.opt_b 0.07% : 0.000066s : 2: opt.transform.opt_trans_graph 0.07% : 0.000073s : 4: opt.transform.symbol_engine_opt 3.58% : 0.003519s : 1: opt_a 0.11% : 0.000111s : 1: opt_after_cconv 0.51% : 0.000503s : 1: opt_after_jit_grad 0.25% : 0.000243s : 1: opt_b 5.83% : 0.005728s : 1: optimize 0.04% : 0.000038s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000023s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.04% : 0.000044s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000021s : 1: remove_dup_value 0.64% : 0.000633s : 1: renormalize.infer 0.60% : 0.000585s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000025s : 1: rewriter_after_opt_a 0.24% : 0.000240s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000122s : 1: symbol_engine_optimizer 44.68% : 0.043926s : 1: task_emit 0.10% : 0.000096s : 1: tuple_transform 31.78% : 0.031248s : 1: type_inference 0.07% : 0.000067s : 1: validate TotalTime = 0.17963, [24] [bootstrap]: 0.00047348 [type_inference]: 0.0470896 [event_method]: 0.00022935 [auto_monad]: 0.00023792 [graph_reusing]: 1.902e-05 [inline]: 1.89999e-06 [add_attr]: 0.00317746, [1] [add_attr_with_inline]: 0.00316862, [1] [Cycle 1]: 8.898e-05, [2] [tag_attr]: 4.624e-05 [meta_addattr_fg_expand]: 1.381e-05 [parallel-infer-symbol]: 3.18e-06 [pre_auto_parallel]: 6.476e-05 [insert-virtual-dataset]: 2.68e-06 [parallel-infer-symbol-second]: 1.17e-06 [dataset_repeat_opt]: 1.83002e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.0414286, [53] [py_interpret_to_execute]: 4.92999e-06 [rewriter_before_opt_a]: 0.00037471 [opt_a]: 0.0383999, [3] [Cycle 1]: 0.0321155, [45] [expand_dump_flag]: 5.10001e-06 [switch_simplify]: 0.00015981 [loop_unroll]: 7.258e-05 [a_1]: 0.00154843 [with_stream_mark]: 2.377e-05 [recompute_prepare]: 2.106e-05 [updatestate_depend_eliminate]: 9.25001e-06 [updatestate_assign_eliminate]: 7.77e-06 [updatestate_loads_eliminate]: 7.41999e-06 [parameter_eliminate]: 3.09999e-06 [a_2]: 0.00024133 [accelerated_algorithm]: 1.604e-05 [shard]: 2.07999e-06 [meta_shard_fg_expand]: 4.58999e-06 [shard_inline]: 1.438e-05 [merge_send_recv]: 1.712e-05 [auto_parallel]: 1.116e-05 [parallel]: 1.767e-05 [flash_sp]: 9.20999e-06 [merge_comm]: 9.12001e-06 [allreduce_fusion]: 8.43999e-06 [matmul_add_comm_reduction]: 2.576e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 1.613e-05 [virtual_dataset]: 1.43e-05 [get_grad_eliminate_]: 1.412e-05 [virtual_output]: 1.381e-05 [merge_forward]: 9.31998e-06 [cell_reuse_recompute_pass]: 1.16002e-06 [offload_activation]: 1.632e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.584e-05 [merge_recompute_call_nodes]: 1.81e-06 [before_grad]: 2.628e-05 [set_forward_comm_id_for_comm_node_pass]: 9.37001e-06 [meta_fg_expand]: 0.00461737 [flash_sp_send_recv_attached]: 4.65999e-06 [receive_attached]: 2.17999e-06 [after_resolve]: 8.735e-05 [a_after_grad]: 0.00012623 [renormalize]: 0.0227204 [add_forward_monad_depend]: 1.58e-05 [auto_monad_grad]: 1.359e-05 [auto_monad_eliminator]: 0.00011353 [cse]: 0.00033639 [a_3]: 0.00140006 [Cycle 2]: 0.0049734, [45] [expand_dump_flag]: 2.69999e-06 [switch_simplify]: 8.666e-05 [loop_unroll]: 8.315e-05 [a_1]: 0.00172305 [with_stream_mark]: 1.925e-05 [recompute_prepare]: 1.604e-05 [updatestate_depend_eliminate]: 8.37e-06 [updatestate_assign_eliminate]: 7.53e-06 [updatestate_loads_eliminate]: 6.83998e-06 [parameter_eliminate]: 1.29e-06 [a_2]: 0.00019834 [accelerated_algorithm]: 3.251e-05 [shard]: 1.27e-06 [meta_shard_fg_expand]: 3.57997e-06 [shard_inline]: 1.42e-05 [merge_send_recv]: 1.032e-05 [auto_parallel]: 1.084e-05 [parallel]: 4.42e-06 [flash_sp]: 3.14999e-06 [merge_comm]: 8.52998e-06 [allreduce_fusion]: 7.75998e-06 [matmul_add_comm_reduction]: 1.12e-05 [allreduce_slice_to_reducescatter]: 6.79982e-07 [virtual_shard_identity]: 1.421e-05 [virtual_dataset]: 1.363e-05 [get_grad_eliminate_]: 1.289e-05 [virtual_output]: 1.296e-05 [merge_forward]: 7.11999e-06 [cell_reuse_recompute_pass]: 1.11002e-06 [offload_activation]: 1.152e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.401e-05 [merge_recompute_call_nodes]: 8.2e-07 [before_grad]: 2.304e-05 [set_forward_comm_id_for_comm_node_pass]: 8.15e-06 [meta_fg_expand]: 0.00022379 [flash_sp_send_recv_attached]: 1.14998e-06 [receive_attached]: 1.42e-06 [after_resolve]: 2.073e-05 [a_after_grad]: 2.082e-05 [renormalize]: 0.00174378 [add_forward_monad_depend]: 4.94e-06 [auto_monad_grad]: 1.22999e-06 [auto_monad_eliminator]: 2.328e-05 [cse]: 0.00018219 [a_3]: 0.00010229 [Cycle 3]: 0.00129604, [45] [expand_dump_flag]: 1.30001e-06 [switch_simplify]: 1.67e-05 [loop_unroll]: 1.352e-05 [a_1]: 0.00035967 [with_stream_mark]: 1.567e-05 [recompute_prepare]: 1.438e-05 [updatestate_depend_eliminate]: 8.99e-06 [updatestate_assign_eliminate]: 7.15e-06 [updatestate_loads_eliminate]: 6.97002e-06 [parameter_eliminate]: 9.09989e-07 [a_2]: 0.00019887 [accelerated_algorithm]: 1.859e-05 [shard]: 1.04998e-06 [meta_shard_fg_expand]: 3.00998e-06 [shard_inline]: 1.376e-05 [merge_send_recv]: 1.054e-05 [auto_parallel]: 1.132e-05 [parallel]: 4.15e-06 [flash_sp]: 9.09989e-07 [merge_comm]: 8.27998e-06 [allreduce_fusion]: 8.05e-06 [matmul_add_comm_reduction]: 1.099e-05 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 2.771e-05 [virtual_dataset]: 1.349e-05 [get_grad_eliminate_]: 1.321e-05 [virtual_output]: 1.375e-05 [merge_forward]: 7.50998e-06 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 1.208e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.529e-05 [merge_recompute_call_nodes]: 7.90023e-07 [before_grad]: 2.145e-05 [set_forward_comm_id_for_comm_node_pass]: 8.38001e-06 [meta_fg_expand]: 5.64998e-06 [flash_sp_send_recv_attached]: 8.50006e-07 [receive_attached]: 1.01002e-06 [after_resolve]: 1.606e-05 [a_after_grad]: 2.056e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 1.09e-06 [auto_monad_eliminator]: 1.71e-05 [cse]: 5.759e-05 [a_3]: 9.207e-05 [py_interpret_to_execute_after_opt_a]: 4.05e-06 [slice_cell_reuse_recomputed_activation]: 2.39999e-06 [rewriter_after_opt_a]: 4.496e-05 [convert_after_rewriter]: 1.25999e-06 [order_py_execute_after_rewriter]: 1.13001e-06 [mutable_eliminate]: 0.00052539 [opt_b]: 0.00048246, [1] [Cycle 1]: 0.00047582, [7] [b_1]: 0.00034083 [b_2]: 1.586e-05 [updatestate_depend_eliminate]: 1.042e-05 [updatestate_assign_eliminate]: 7.08e-06 [updatestate_loads_eliminate]: 6.86999e-06 [renormalize]: 4.00003e-07 [cse]: 5.79e-05 [optimize_parallel_all_gather_comm]: 2.575e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 2.197e-05 [loop_unroll]: 0.00045763 [opt_after_cconv]: 0.00019035, [1] [Cycle 1]: 0.00018467, [7] [c_1]: 6.759e-05 [parameter_eliminate]: 2.32999e-06 [updatestate_depend_eliminate]: 1.109e-05 [updatestate_assign_eliminate]: 7.26001e-06 [updatestate_loads_eliminate]: 7.13998e-06 [cse]: 5.457e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 8.799e-05 [tuple_transform]: 0.00017038, [1] [Cycle 1]: 0.00016524, [4] [d_1]: 0.00012622 [none_parameter_eliminate]: 2.17999e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 1.58e-05 [partial_unused_args_eliminate]: 1.89e-06 [add_recomputation]: 7.16e-05 [cse_after_recomputation]: 5.682e-05, [1] [Cycle 1]: 5.154e-05, [1] [cse]: 4.536e-05 [environ_conv]: 1.18e-05 [swap_dp_allreduce_reducescatter]: 1.173e-05 [bias_add_comm_swap]: 3.09999e-06 [label_micro_interleaved_index]: 4.67e-06 [label_fine_grained_interleaved_index]: 2.63998e-06 [merge_cast_opt]: 1.49e-06 [slice_recompute_activation]: 1.99e-06 [micro_interleaved_order_control]: 2.21e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 9.89996e-07 [remove_cast_before_assign_add]: 1.35999e-06 [full_micro_interleaved_order_control]: 2.14999e-06 [reorder_send_recv_between_fp_bp]: 2.66999e-06 [comm_op_add_attrs]: 1.04003e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.09998e-06 [interleave_parallel_branches]: 1.33002e-06 [overlap_opt_shard_in_pipeline]: 9.10019e-07 [overlap_opt_shard_grad_in_pipeline]: 1.78002e-06 [control_data_broadcast_order]: 2.693e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 6.95998e-06 [overlap_recompute_and_grad_model_parallel]: 7.32997e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.47001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.21e-06 [overlap_grad_ring_attention]: 6.93998e-06 [overlap_grad_flash_sp]: 3.318e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.26e-06 [split_layernorm_comm]: 1.91003e-06 [handle_group_info]: 1.40999e-06 [symbol_engine_optimizer]: 0.00012846, [1] [Cycle 1]: 0.00012354, [6] [build]: 1.205e-05 [elim_shapecalc]: 1.85e-05 [elim_not_effective]: 2.617e-05 [opt_reshape]: 1.496e-05 [fold_const_symbol]: 2.299e-05 [renormalize]: 2.19996e-07 [detach_backward]: 1.79998e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 3.054e-05 [get_jit_bprop_graph]: 1.05001e-06 [rewriter_after_jit_bprop_graph]: 3.43e-06 [opt_after_jit_grad]: 0.00051285 [validate]: 6.275e-05 [backend_pass]: 9.20001e-07 [task_emit]: 0.0860148 [execute]: 8.28001e-06 Sums bootstrap : 0.000473s : 0.27% type_inference : 0.047090s : 26.89% event_method : 0.000229s : 0.13% auto_monad : 0.000238s : 0.14% graph_reusing : 0.000019s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000046s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000065s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000375s : 0.21% optimize.opt_a.expand_dump_flag : 0.000009s : 0.01% optimize.opt_a.switch_simplify : 0.000263s : 0.15% optimize.opt_a.loop_unroll : 0.000169s : 0.10% optimize.opt_a.a_1 : 0.003631s : 2.07% optimize.opt_a.with_stream_mark : 0.000059s : 0.03% optimize.opt_a.recompute_prepare : 0.000051s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000027s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000022s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000021s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000639s : 0.36% optimize.opt_a.accelerated_algorithm : 0.000067s : 0.04% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000011s : 0.01% optimize.opt_a.shard_inline : 0.000042s : 0.02% optimize.opt_a.merge_send_recv : 0.000038s : 0.02% optimize.opt_a.auto_parallel : 0.000033s : 0.02% optimize.opt_a.parallel : 0.000026s : 0.01% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000026s : 0.01% optimize.opt_a.allreduce_fusion : 0.000024s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000048s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000058s : 0.03% optimize.opt_a.virtual_dataset : 0.000041s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000040s : 0.02% optimize.opt_a.virtual_output : 0.000041s : 0.02% optimize.opt_a.merge_forward : 0.000024s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000040s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000075s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000071s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000026s : 0.01% optimize.opt_a.meta_fg_expand : 0.004847s : 2.77% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000124s : 0.07% optimize.opt_a.a_after_grad : 0.000168s : 0.10% optimize.opt_a.renormalize : 0.024464s : 13.97% optimize.opt_a.add_forward_monad_depend : 0.000022s : 0.01% optimize.opt_a.auto_monad_grad : 0.000016s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000154s : 0.09% optimize.opt_a.cse : 0.000576s : 0.33% optimize.opt_a.a_3 : 0.001594s : 0.91% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000045s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000525s : 0.30% optimize.opt_b.b_1 : 0.000341s : 0.19% optimize.opt_b.b_2 : 0.000016s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000058s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000022s : 0.01% optimize.loop_unroll : 0.000458s : 0.26% optimize.opt_after_cconv.c_1 : 0.000068s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.cse : 0.000055s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000088s : 0.05% optimize.tuple_transform.d_1 : 0.000126s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000016s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000072s : 0.04% optimize.cse_after_recomputation.cse : 0.000045s : 0.03% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000012s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000027s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000033s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000026s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000015s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000023s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000031s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000513s : 0.29% validate : 0.000063s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.086015s : 49.12% execute : 0.000008s : 0.00% Time group info: ------[substitution.] 0.001596 315 0.24% : 0.000004s : 8: substitution.elim_not_effective 0.52% : 0.000008s : 12: substitution.float_depend_g_call 0.73% : 0.000012s : 9: substitution.float_tuple_getitem_switch 0.20% : 0.000003s : 8: substitution.fold_const_symbol 35.12% : 0.000560s : 5: substitution.getattr_setattr_resolve 0.68% : 0.000011s : 10: substitution.graph_param_transform 0.16% : 0.000003s : 2: substitution.incorporate_call 0.15% : 0.000002s : 2: substitution.incorporate_call_switch 36.56% : 0.000583s : 24: substitution.inline 1.28% : 0.000020s : 3: substitution.inline_without_move 0.83% : 0.000013s : 25: substitution.j_node_and_user_rematch 1.01% : 0.000016s : 4: substitution.less_batch_normalization 1.08% : 0.000017s : 13: substitution.minmaximum_grad 0.62% : 0.000010s : 12: substitution.partial_eliminate 1.04% : 0.000017s : 25: substitution.remove_not_recompute_node 4.80% : 0.000077s : 32: substitution.replace_applicator 0.62% : 0.000010s : 14: substitution.replace_old_param 0.16% : 0.000003s : 1: substitution.set_cell_output_no_recompute 0.74% : 0.000012s : 4: substitution.switch_simplify 0.52% : 0.000008s : 2: substitution.transpose_eliminate 2.93% : 0.000047s : 17: substitution.tuple_list_convert_item_index_to_positive 1.38% : 0.000022s : 17: substitution.tuple_list_get_item_const_eliminator 1.82% : 0.000029s : 17: substitution.tuple_list_get_item_depend_reorder 5.01% : 0.000080s : 32: substitution.tuple_list_get_item_eliminator 1.80% : 0.000029s : 17: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.046980 2 92.71% : 0.043554s : 1: type_inference.infer 7.29% : 0.003426s : 1: type_inference.specialize ------[replace.] 0.000476 45 12.22% : 0.000058s : 4: replace.getattr_setattr_resolve 50.62% : 0.000241s : 24: replace.inline 14.66% : 0.000070s : 5: replace.replace_applicator 8.55% : 0.000041s : 4: replace.switch_simplify 13.95% : 0.000066s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.001143 45 45.17% : 0.000516s : 4: match.getattr_setattr_resolve 49.79% : 0.000569s : 24: match.inline 2.32% : 0.000027s : 5: match.replace_applicator 0.82% : 0.000009s : 4: match.switch_simplify 1.89% : 0.000022s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.001019 7110 0.87% : 0.000009s : 68: predicate.accumulaten_eliminater 0.35% : 0.000004s : 10: predicate.ad_related_special_op_eliminate 0.41% : 0.000004s : 32: predicate.addn_check_dump 0.89% : 0.000009s : 68: predicate.addn_zero_filter 0.82% : 0.000008s : 68: predicate.adjust_all_reduce_mul_add 1.73% : 0.000018s : 100: predicate.arithmetic_simplify 0.92% : 0.000009s : 68: predicate.cast_eliminate 2.79% : 0.000028s : 215: predicate.check_bprop_eliminate 0.43% : 0.000004s : 32: predicate.compare_switch_simplify 0.09% : 0.000001s : 10: predicate.const_output_eliminate 0.42% : 0.000004s : 32: predicate.depend_value_elim 0.96% : 0.000010s : 68: predicate.dict_get_item_const_eliminator 1.02% : 0.000010s : 68: predicate.dict_get_item_eliminator 0.89% : 0.000009s : 68: predicate.dict_set_item_eliminator 0.35% : 0.000004s : 20: predicate.dumpgradient_eliminate 0.10% : 0.000001s : 10: predicate.elim_not_effective 0.17% : 0.000002s : 10: predicate.elim_shapecalc_of_broadcastargs 0.98% : 0.000010s : 78: predicate.environ_add_const_eliminate 0.97% : 0.000010s : 78: predicate.environ_get_add_eliminate 0.98% : 0.000010s : 78: predicate.environ_get_depend_swap 1.46% : 0.000015s : 110: predicate.environ_get_eliminate 0.97% : 0.000010s : 78: predicate.environ_get_set_eliminate 1.31% : 0.000013s : 100: predicate.exchange_switch_depend_value 3.17% : 0.000032s : 100: predicate.float_depend_g_call 0.42% : 0.000004s : 32: predicate.float_environ_get_switch 0.61% : 0.000006s : 42: predicate.float_tuple_getitem_switch 0.08% : 0.000001s : 10: predicate.fold_const_symbol 0.50% : 0.000005s : 32: predicate.get_grad_eliminate 0.65% : 0.000007s : 31: predicate.getattr_setattr_resolve 0.10% : 0.000001s : 10: predicate.graph_param_transform 0.42% : 0.000004s : 32: predicate.incorporate_call 0.39% : 0.000004s : 32: predicate.incorporate_call_switch 4.51% : 0.000046s : 252: predicate.inline 1.44% : 0.000015s : 82: predicate.inline_without_move 0.22% : 0.000002s : 32: predicate.j_node_and_user_rematch 0.60% : 0.000006s : 32: predicate.less_batch_normalization 1.28% : 0.000013s : 96: predicate.list_to_tuple_eliminator_ 2.13% : 0.000022s : 164: predicate.load_eliminater 0.35% : 0.000004s : 10: predicate.loop_unroll_after_grad 2.51% : 0.000026s : 182: predicate.loop_unroll_before_grad 1.17% : 0.000012s : 88: predicate.make_slice_get_slice_eliminator 0.46% : 0.000005s : 32: predicate.merge_addn 2.54% : 0.000026s : 198: predicate.micro_step_allgather_replace 2.57% : 0.000026s : 198: predicate.mini_step_allgather_replace 0.90% : 0.000009s : 68: predicate.minmaximum_grad 0.41% : 0.000004s : 10: predicate.mutable_eliminate 0.16% : 0.000002s : 10: predicate.opt_reshape 0.15% : 0.000001s : 10: predicate.parallel_virtual_node 1.77% : 0.000018s : 100: predicate.partial_defer_inline 1.25% : 0.000013s : 86: predicate.partial_eliminate 0.90% : 0.000009s : 68: predicate.print_const_string_wrapper 0.43% : 0.000004s : 32: predicate.reduce_all_const_elim 1.15% : 0.000012s : 68: predicate.reduce_eliminate 2.04% : 0.000021s : 164: predicate.redundant_stop_gradient_eliminater 0.24% : 0.000002s : 32: predicate.remove_not_recompute_node 2.36% : 0.000024s : 284: predicate.replace_applicator 0.67% : 0.000007s : 82: predicate.replace_old_param 0.09% : 0.000001s : 10: predicate.reset_defer_inline 0.93% : 0.000009s : 68: predicate.reshape_eliminate 2.68% : 0.000027s : 198: predicate.row_tensor_add_zeros_like 0.17% : 0.000002s : 10: predicate.row_tensor_eliminate 3.05% : 0.000031s : 215: predicate.same_eliminate 0.28% : 0.000003s : 32: predicate.set_cell_output_no_recompute 0.51% : 0.000005s : 32: predicate.shard_identity_eliminate 0.32% : 0.000003s : 20: predicate.special_op_eliminate 0.51% : 0.000005s : 32: predicate.specialize_transform 2.67% : 0.000027s : 198: predicate.split_environ_get_set_with_tuple_value 1.25% : 0.000013s : 82: predicate.stack_unstack_eliminate 0.15% : 0.000002s : 10: predicate.switch_call_monad_eliminater 1.49% : 0.000015s : 100: predicate.switch_defer_inline 4.23% : 0.000043s : 315: predicate.switch_layer_defer_inline 4.85% : 0.000049s : 332: predicate.switch_simplify 0.94% : 0.000010s : 68: predicate.tile_eliminate 0.94% : 0.000010s : 68: predicate.transpose_eliminate 1.35% : 0.000014s : 88: predicate.tuple_list_convert_item_index_to_positive 1.37% : 0.000014s : 88: predicate.tuple_list_get_item_const_eliminator 1.24% : 0.000013s : 88: predicate.tuple_list_get_item_depend_reorder 2.23% : 0.000023s : 128: predicate.tuple_list_get_item_eliminator 1.30% : 0.000013s : 88: predicate.tuple_list_get_set_item_eliminator 1.82% : 0.000019s : 120: predicate.tuple_list_set_item_eliminator 1.24% : 0.000013s : 96: predicate.tuple_to_list_eliminator_ 2.00% : 0.000020s : 164: predicate.updatestate_pure_node_eliminater 2.51% : 0.000026s : 196: predicate.updatestate_useless_node_eliminater 0.15% : 0.000002s : 10: predicate.value_based_eliminate 0.48% : 0.000005s : 32: predicate.virtual_dataset_eliminate 0.46% : 0.000005s : 32: predicate.virtual_output_eliminate 0.15% : 0.000002s : 10: predicate.virtual_view_grad_eliminate 0.18% : 0.000002s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005268 75 61.81% : 0.003256s : 36: func_graph_cloner_run.FuncGraphClonerGraph 38.19% : 0.002012s : 39: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.256885 247 0.00% : 0.000004s : 1: ForceFp32Comm 1.24% : 0.003182s : 1: add_attr 1.24% : 0.003173s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000076s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.10% : 0.000249s : 1: auto_monad 0.01% : 0.000034s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.20% : 0.000506s : 1: bootstrap 0.01% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000030s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000060s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.09% : 0.000240s : 1: event_method 0.01% : 0.000016s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000023s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.18% : 0.000466s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.21% : 0.000534s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000025s : 1: opt.transform.mutable_eliminate 2.70% : 0.006924s : 125: opt.transform.opt_a 0.03% : 0.000066s : 1: opt.transform.opt_after_cconv 0.02% : 0.000048s : 1: opt.transform.opt_after_jit_grad 0.13% : 0.000326s : 28: opt.transform.opt_b 0.26% : 0.000675s : 2: opt.transform.opt_resolve 0.05% : 0.000139s : 2: opt.transform.opt_trans_graph 0.03% : 0.000079s : 4: opt.transform.symbol_engine_opt 14.95% : 0.038404s : 1: opt_a 0.08% : 0.000194s : 1: opt_after_cconv 0.20% : 0.000522s : 1: opt_after_jit_grad 0.19% : 0.000486s : 1: opt_b 16.13% : 0.041433s : 1: optimize 0.01% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000036s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000069s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000093s : 1: remove_dup_value 7.68% : 0.019732s : 2: renormalize.infer 1.84% : 0.004716s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000048s : 1: rewriter_after_opt_a 0.15% : 0.000382s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000015s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000131s : 1: symbol_engine_optimizer 33.50% : 0.086046s : 1: task_emit 0.07% : 0.000173s : 1: tuple_transform 18.34% : 0.047105s : 1: type_inference 0.04% : 0.000091s : 1: validate [WARNING] CORE(61814,ffffbf434f30,python3.9):2026-01-29-17:51:05.083.140 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph4 TotalTime = 0.0728179, [24] [bootstrap]: 0.00053854 [type_inference]: 0.0256838 [event_method]: 2.266e-05 [auto_monad]: 8.175e-05 [graph_reusing]: 6.19999e-06 [inline]: 1.96e-06 [add_attr]: 0.00331321, [1] [add_attr_with_inline]: 0.00330528, [1] [Cycle 1]: 5.599e-05, [2] [tag_attr]: 2.087e-05 [meta_addattr_fg_expand]: 6.94001e-06 [parallel-infer-symbol]: 2.88998e-06 [pre_auto_parallel]: 3.48e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 1.86e-06 [pipeline_split]: 1.53002e-06 [optimize]: 0.00485845, [53] [py_interpret_to_execute]: 4.62e-06 [rewriter_before_opt_a]: 0.00024252 [opt_a]: 0.00282835, [2] [Cycle 1]: 0.00226329, [45] [expand_dump_flag]: 3.91999e-06 [switch_simplify]: 7.728e-05 [loop_unroll]: 3.264e-05 [a_1]: 0.00058872 [with_stream_mark]: 1.427e-05 [recompute_prepare]: 8.05999e-06 [updatestate_depend_eliminate]: 3.97002e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.04001e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 7.032e-05 [accelerated_algorithm]: 5.85002e-06 [shard]: 1.81e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 5.45001e-06 [merge_send_recv]: 7.87e-06 [auto_parallel]: 5.84999e-06 [parallel]: 1.78e-05 [flash_sp]: 6.86999e-06 [merge_comm]: 3.63999e-06 [allreduce_fusion]: 3.18998e-06 [matmul_add_comm_reduction]: 8.28999e-06 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 6.74999e-06 [virtual_dataset]: 5.61e-06 [get_grad_eliminate_]: 5.40999e-06 [virtual_output]: 5.82001e-06 [merge_forward]: 3.75e-06 [cell_reuse_recompute_pass]: 1.09e-06 [offload_activation]: 9.27001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.115e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 9.59e-06 [set_forward_comm_id_for_comm_node_pass]: 3.36999e-06 [meta_fg_expand]: 2.98998e-06 [flash_sp_send_recv_attached]: 2.61999e-06 [receive_attached]: 2.62001e-06 [after_resolve]: 8.85001e-06 [a_after_grad]: 8.22e-06 [renormalize]: 0.00097146 [add_forward_monad_depend]: 5.05001e-06 [auto_monad_grad]: 1.88002e-06 [auto_monad_eliminator]: 1.467e-05 [cse]: 3.337e-05 [a_3]: 4.145e-05 [Cycle 2]: 0.00055526, [45] [expand_dump_flag]: 1.03001e-06 [switch_simplify]: 6.85998e-06 [loop_unroll]: 5.85002e-06 [a_1]: 9.449e-05 [with_stream_mark]: 1.06e-05 [recompute_prepare]: 5.79e-06 [updatestate_depend_eliminate]: 2.94999e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.26e-06 [parameter_eliminate]: 9.79984e-07 [a_2]: 6.144e-05 [accelerated_algorithm]: 5.47001e-06 [shard]: 1.05999e-06 [meta_shard_fg_expand]: 1.27e-06 [shard_inline]: 5.15999e-06 [merge_send_recv]: 4.47998e-06 [auto_parallel]: 5.17999e-06 [parallel]: 4.19002e-06 [flash_sp]: 2.98e-06 [merge_comm]: 3.07002e-06 [allreduce_fusion]: 2.80002e-06 [matmul_add_comm_reduction]: 4.84003e-06 [allreduce_slice_to_reducescatter]: 5.3001e-07 [virtual_shard_identity]: 6.36e-06 [virtual_dataset]: 5.15001e-06 [get_grad_eliminate_]: 4.98001e-06 [virtual_output]: 5.05999e-06 [merge_forward]: 2.74001e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 5.54998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.134e-05 [merge_recompute_call_nodes]: 6.89994e-07 [before_grad]: 8.1e-06 [set_forward_comm_id_for_comm_node_pass]: 3.11999e-06 [meta_fg_expand]: 1.84e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 9.79984e-07 [after_resolve]: 7.55998e-06 [a_after_grad]: 8.05999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.15001e-06 [auto_monad_grad]: 9.79984e-07 [auto_monad_eliminator]: 6.12999e-06 [cse]: 1.477e-05 [a_3]: 3.122e-05 [py_interpret_to_execute_after_opt_a]: 4.12e-06 [slice_cell_reuse_recomputed_activation]: 1.87999e-06 [rewriter_after_opt_a]: 1.582e-05 [convert_after_rewriter]: 1.40999e-06 [order_py_execute_after_rewriter]: 1.84e-06 [mutable_eliminate]: 0.00045881 [opt_b]: 0.00018029, [1] [Cycle 1]: 0.00017437, [7] [b_1]: 0.00010449 [b_2]: 6.54999e-06 [updatestate_depend_eliminate]: 5.05999e-06 [updatestate_assign_eliminate]: 2.36998e-06 [updatestate_loads_eliminate]: 2.29999e-06 [renormalize]: 3.89991e-07 [cse]: 1.996e-05 [optimize_parallel_all_gather_comm]: 1.596e-05 [overlap_param_gather]: 2.18002e-06 [cconv]: 2.269e-05 [loop_unroll]: 0.00042186 [opt_after_cconv]: 9.361e-05, [1] [Cycle 1]: 8.846e-05, [7] [c_1]: 2.439e-05 [parameter_eliminate]: 2.34001e-06 [updatestate_depend_eliminate]: 5.15001e-06 [updatestate_assign_eliminate]: 2.38002e-06 [updatestate_loads_eliminate]: 2.25002e-06 [cse]: 1.903e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 1.545e-05 [tuple_transform]: 6.688e-05, [1] [Cycle 1]: 6.285e-05, [4] [d_1]: 3.63e-05 [none_parameter_eliminate]: 1.71e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 5.88002e-06 [partial_unused_args_eliminate]: 2.26e-06 [add_recomputation]: 4.211e-05 [cse_after_recomputation]: 2.332e-05, [1] [Cycle 1]: 1.929e-05, [1] [cse]: 1.352e-05 [environ_conv]: 8.23999e-06 [swap_dp_allreduce_reducescatter]: 5.62999e-06 [bias_add_comm_swap]: 3.06999e-06 [label_micro_interleaved_index]: 4.03001e-06 [label_fine_grained_interleaved_index]: 2.56e-06 [merge_cast_opt]: 1.57001e-06 [slice_recompute_activation]: 1.99999e-06 [micro_interleaved_order_control]: 2.16e-06 [assign_add_opt]: 1.47001e-06 [ForceFp32Comm]: 9.70002e-07 [remove_cast_before_assign_add]: 1.08001e-06 [full_micro_interleaved_order_control]: 2.07001e-06 [reorder_send_recv_between_fp_bp]: 2.84999e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.10999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.74e-06 [control_data_broadcast_order]: 1.13e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 4.13001e-06 [overlap_recompute_and_grad_model_parallel]: 5.15001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.40999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39998e-06 [overlap_recompute_comm]: 2.04999e-06 [overlap_grad_ring_attention]: 4.36002e-06 [overlap_grad_flash_sp]: 1.617e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.25002e-06 [split_layernorm_comm]: 2.02999e-06 [handle_group_info]: 1.09998e-06 [symbol_engine_optimizer]: 7.918e-05, [1] [Cycle 1]: 7.501e-05, [6] [build]: 9.26998e-06 [elim_shapecalc]: 9.20999e-06 [elim_not_effective]: 1.206e-05 [opt_reshape]: 6.29001e-06 [fold_const_symbol]: 9.64999e-06 [renormalize]: 2.20025e-07 [detach_backward]: 1.82001e-06 [pipeline_parallel_scheduler]: 1.44998e-06 [auto_monad_reorder]: 1.92e-05 [get_jit_bprop_graph]: 1.05001e-06 [rewriter_after_jit_bprop_graph]: 3.83999e-06 [opt_after_jit_grad]: 0.00046139 [validate]: 3.972e-05 [backend_pass]: 9.89996e-07 [task_emit]: 0.0375338 [execute]: 7.95e-06 Sums bootstrap : 0.000539s : 0.79% type_inference : 0.025684s : 37.48% event_method : 0.000023s : 0.03% auto_monad : 0.000082s : 0.12% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000035s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000243s : 0.35% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000084s : 0.12% optimize.opt_a.loop_unroll : 0.000038s : 0.06% optimize.opt_a.a_1 : 0.000683s : 1.00% optimize.opt_a.with_stream_mark : 0.000025s : 0.04% optimize.opt_a.recompute_prepare : 0.000014s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000132s : 0.19% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.02% optimize.opt_a.merge_send_recv : 0.000012s : 0.02% optimize.opt_a.auto_parallel : 0.000011s : 0.02% optimize.opt_a.parallel : 0.000022s : 0.03% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000013s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.02% optimize.opt_a.virtual_dataset : 0.000011s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000010s : 0.02% optimize.opt_a.virtual_output : 0.000011s : 0.02% optimize.opt_a.merge_forward : 0.000006s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000015s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000022s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000018s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000006s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000016s : 0.02% optimize.opt_a.a_after_grad : 0.000016s : 0.02% optimize.opt_a.renormalize : 0.000972s : 1.42% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.03% optimize.opt_a.cse : 0.000048s : 0.07% optimize.opt_a.a_3 : 0.000073s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000016s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000459s : 0.67% optimize.opt_b.b_1 : 0.000104s : 0.15% optimize.opt_b.b_2 : 0.000007s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.03% optimize.loop_unroll : 0.000422s : 0.62% optimize.opt_after_cconv.c_1 : 0.000024s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.02% optimize.tuple_transform.d_1 : 0.000036s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000042s : 0.06% optimize.cse_after_recomputation.cse : 0.000014s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000011s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000016s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000019s : 0.03% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000461s : 0.67% validate : 0.000040s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.037534s : 54.77% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.000182 26 1.09% : 0.000002s : 2: substitution.elim_not_effective 0.82% : 0.000001s : 2: substitution.fold_const_symbol 2.91% : 0.000005s : 3: substitution.graph_param_transform 79.27% : 0.000145s : 6: substitution.inline 1.84% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.42% : 0.000004s : 4: substitution.remove_not_recompute_node 1.55% : 0.000003s : 2: substitution.replace_old_param 3.89% : 0.000007s : 1: substitution.switch_simplify 6.22% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.025611 2 94.82% : 0.024283s : 1: type_inference.infer 5.18% : 0.001327s : 1: type_inference.specialize ------[replace.] 0.000083 9 59.51% : 0.000049s : 6: replace.inline 21.63% : 0.000018s : 1: replace.switch_simplify 18.87% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000158 9 89.62% : 0.000141s : 6: match.inline 3.97% : 0.000006s : 1: match.switch_simplify 6.41% : 0.000010s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000172 1092 0.99% : 0.000002s : 12: predicate.accumulaten_eliminater 0.97% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 6: predicate.addn_check_dump 0.97% : 0.000002s : 12: predicate.addn_zero_filter 0.90% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.24% : 0.000004s : 18: predicate.arithmetic_simplify 1.00% : 0.000002s : 12: predicate.cast_eliminate 0.47% : 0.000001s : 6: predicate.check_bprop_eliminate 0.50% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.51% : 0.000001s : 6: predicate.depend_value_elim 0.96% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.09% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.85% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 3: predicate.elim_not_effective 0.37% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.13% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.13% : 0.000002s : 15: predicate.environ_get_depend_swap 1.71% : 0.000003s : 21: predicate.environ_get_eliminate 1.32% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.64% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.51% : 0.000004s : 20: predicate.float_depend_g_call 0.53% : 0.000001s : 6: predicate.float_environ_get_switch 0.67% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 3: predicate.fold_const_symbol 0.59% : 0.000001s : 6: predicate.get_grad_eliminate 0.28% : 0.000000s : 3: predicate.graph_param_transform 0.50% : 0.000001s : 6: predicate.incorporate_call 0.47% : 0.000001s : 6: predicate.incorporate_call_switch 5.98% : 0.000010s : 50: predicate.inline 0.70% : 0.000001s : 6: predicate.inline_without_move 0.29% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.67% : 0.000001s : 6: predicate.less_batch_normalization 1.80% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.40% : 0.000004s : 32: predicate.load_eliminater 0.83% : 0.000001s : 3: predicate.loop_unroll_after_grad 2.98% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.63% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 6: predicate.merge_addn 0.44% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.66% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.93% : 0.000002s : 12: predicate.minmaximum_grad 1.07% : 0.000002s : 3: predicate.mutable_eliminate 0.36% : 0.000001s : 3: predicate.opt_reshape 0.37% : 0.000001s : 3: predicate.parallel_virtual_node 2.16% : 0.000004s : 20: predicate.partial_defer_inline 1.39% : 0.000002s : 17: predicate.partial_eliminate 0.96% : 0.000002s : 12: predicate.print_const_string_wrapper 0.49% : 0.000001s : 6: predicate.reduce_all_const_elim 1.39% : 0.000002s : 12: predicate.reduce_eliminate 2.35% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 6: predicate.remove_not_recompute_node 1.23% : 0.000002s : 20: predicate.replace_applicator 0.49% : 0.000001s : 6: predicate.replace_old_param 0.23% : 0.000000s : 3: predicate.reset_defer_inline 0.93% : 0.000002s : 12: predicate.reshape_eliminate 0.52% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 3: predicate.row_tensor_eliminate 0.64% : 0.000001s : 6: predicate.same_eliminate 0.38% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.69% : 0.000001s : 6: predicate.shard_identity_eliminate 0.63% : 0.000001s : 6: predicate.special_op_eliminate 0.61% : 0.000001s : 6: predicate.specialize_transform 0.82% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.26% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.71% : 0.000003s : 20: predicate.switch_defer_inline 2.17% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.21% : 0.000011s : 68: predicate.switch_simplify 0.94% : 0.000002s : 12: predicate.tile_eliminate 1.00% : 0.000002s : 12: predicate.transpose_eliminate 1.59% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.21% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.58% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.46% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.66% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.37% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 3.12% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 3: predicate.value_based_eliminate 0.59% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 6: predicate.virtual_output_eliminate 0.30% : 0.000001s : 3: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001122 16 57.35% : 0.000644s : 8: func_graph_cloner_run.FuncGraphClonerGraph 42.65% : 0.000479s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.083187 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.99% : 0.003318s : 1: add_attr 3.98% : 0.003309s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000046s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.11% : 0.000087s : 1: auto_monad 0.03% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.69% : 0.000571s : 1: bootstrap 0.03% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000014s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.03% : 0.000028s : 1: event_method 0.02% : 0.000013s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000005s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.52% : 0.000430s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.56% : 0.000467s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000013s : 1: opt.transform.mutable_eliminate 1.30% : 0.001083s : 78: opt.transform.opt_a 0.03% : 0.000023s : 1: opt.transform.opt_after_cconv 0.03% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000084s : 28: opt.transform.opt_b 0.05% : 0.000040s : 2: opt.transform.opt_trans_graph 0.04% : 0.000034s : 4: opt.transform.symbol_engine_opt 3.40% : 0.002831s : 1: opt_a 0.12% : 0.000098s : 1: opt_after_cconv 0.57% : 0.000471s : 1: opt_after_jit_grad 0.22% : 0.000184s : 1: opt_b 5.85% : 0.004863s : 1: optimize 0.02% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000005s : 1: order_py_execute_after_rewriter 0.02% : 0.000019s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.05% : 0.000039s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000019s : 1: remove_dup_value 0.61% : 0.000505s : 1: renormalize.infer 0.55% : 0.000459s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000019s : 1: rewriter_after_opt_a 0.30% : 0.000248s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000004s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.03% : 0.000027s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000082s : 1: symbol_engine_optimizer 45.14% : 0.037548s : 1: task_emit 0.08% : 0.000070s : 1: tuple_transform 30.89% : 0.025698s : 1: type_inference 0.08% : 0.000063s : 1: validate ... TotalTime = 24.9218, [24] [bootstrap]: 0.00059729 [type_inference]: 0.0487309 [event_method]: 0.00024845 [auto_monad]: 0.00015509 [graph_reusing]: 9.92001e-06 [inline]: 1.86e-06 [add_attr]: 0.00337354, [1] [add_attr_with_inline]: 0.003365, [1] [Cycle 1]: 8.544e-05, [2] [tag_attr]: 4.236e-05 [meta_addattr_fg_expand]: 1.299e-05 [parallel-infer-symbol]: 2.85002e-06 [pre_auto_parallel]: 6.176e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 1.82999e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.0451374, [53] [py_interpret_to_execute]: 4.77e-06 [rewriter_before_opt_a]: 0.00037432 [opt_a]: 0.0345078, [3] [Cycle 1]: 0.0184874, [45] [expand_dump_flag]: 4.97e-06 [switch_simplify]: 0.00016114 [loop_unroll]: 6.747e-05 [a_1]: 0.00147407 [with_stream_mark]: 2.352e-05 [recompute_prepare]: 2.082e-05 [updatestate_depend_eliminate]: 8.30999e-06 [updatestate_assign_eliminate]: 7.46999e-06 [updatestate_loads_eliminate]: 7.23e-06 [parameter_eliminate]: 2.67001e-06 [a_2]: 0.00021234 [accelerated_algorithm]: 1.466e-05 [shard]: 1.62001e-06 [meta_shard_fg_expand]: 4.35999e-06 [shard_inline]: 1.39e-05 [merge_send_recv]: 1.64e-05 [auto_parallel]: 1.006e-05 [parallel]: 2.075e-05 [flash_sp]: 9.65002e-06 [merge_comm]: 8.67998e-06 [allreduce_fusion]: 7.85e-06 [matmul_add_comm_reduction]: 2.568e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 1.593e-05 [virtual_dataset]: 1.393e-05 [get_grad_eliminate_]: 1.405e-05 [virtual_output]: 1.365e-05 [merge_forward]: 8.84e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.605e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.507e-05 [merge_recompute_call_nodes]: 1.99999e-06 [before_grad]: 2.408e-05 [set_forward_comm_id_for_comm_node_pass]: 8.72e-06 [meta_fg_expand]: 0.00153912 [flash_sp_send_recv_attached]: 3.81999e-06 [receive_attached]: 2.43e-06 [after_resolve]: 6.455e-05 [a_after_grad]: 8.898e-05 [renormalize]: 0.0128215 [add_forward_monad_depend]: 1.041e-05 [auto_monad_grad]: 6.24999e-06 [auto_monad_eliminator]: 0.00016352 [cse]: 0.00049278 [a_3]: 0.00073009 [Cycle 2]: 0.0104508, [45] [expand_dump_flag]: 1.67001e-06 [switch_simplify]: 0.00010027 [loop_unroll]: 9.498e-05 [a_1]: 0.00315979 [with_stream_mark]: 6.136e-05 [recompute_prepare]: 6.641e-05 [updatestate_depend_eliminate]: 3.993e-05 [updatestate_assign_eliminate]: 3.806e-05 [updatestate_loads_eliminate]: 3.577e-05 [parameter_eliminate]: 1.35999e-06 [a_2]: 0.00101268 [accelerated_algorithm]: 0.00012981 [shard]: 1.15999e-06 [meta_shard_fg_expand]: 1.527e-05 [shard_inline]: 6.532e-05 [merge_send_recv]: 4.395e-05 [auto_parallel]: 4.1e-05 [parallel]: 4.84e-06 [flash_sp]: 3.44001e-06 [merge_comm]: 4.157e-05 [allreduce_fusion]: 3.978e-05 [matmul_add_comm_reduction]: 4.724e-05 [allreduce_slice_to_reducescatter]: 5.39992e-07 [virtual_shard_identity]: 6.462e-05 [virtual_dataset]: 6.268e-05 [get_grad_eliminate_]: 6.222e-05 [virtual_output]: 6.3e-05 [merge_forward]: 3.759e-05 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 4.885e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00011312 [merge_recompute_call_nodes]: 7.29982e-07 [before_grad]: 0.00010648 [set_forward_comm_id_for_comm_node_pass]: 4.34e-05 [meta_fg_expand]: 0.00012427 [flash_sp_send_recv_attached]: 1.10999e-06 [receive_attached]: 1.17e-06 [after_resolve]: 7.083e-05 [a_after_grad]: 0.00010454 [renormalize]: 0.00338582 [add_forward_monad_depend]: 4.89e-06 [auto_monad_grad]: 1.30001e-06 [auto_monad_eliminator]: 9.744e-05 [cse]: 0.00026525 [a_3]: 0.00047015 [Cycle 3]: 0.00555468, [45] [expand_dump_flag]: 1.14e-06 [switch_simplify]: 6.645e-05 [loop_unroll]: 8.711e-05 [a_1]: 0.00190201 [with_stream_mark]: 4.825e-05 [recompute_prepare]: 6.43e-05 [updatestate_depend_eliminate]: 4.04e-05 [updatestate_assign_eliminate]: 3.789e-05 [updatestate_loads_eliminate]: 3.698e-05 [parameter_eliminate]: 1.32999e-06 [a_2]: 0.00100969 [accelerated_algorithm]: 8.475e-05 [shard]: 1.04e-06 [meta_shard_fg_expand]: 1.537e-05 [shard_inline]: 6.565e-05 [merge_send_recv]: 4.695e-05 [auto_parallel]: 4.5e-05 [parallel]: 4e-06 [flash_sp]: 1.09e-06 [merge_comm]: 4.391e-05 [allreduce_fusion]: 4.393e-05 [matmul_add_comm_reduction]: 4.877e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 6.592e-05 [virtual_dataset]: 6.342e-05 [get_grad_eliminate_]: 6.282e-05 [virtual_output]: 6.259e-05 [merge_forward]: 3.93e-05 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 5.039e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00011667 [merge_recompute_call_nodes]: 8.59989e-07 [before_grad]: 0.00010834 [set_forward_comm_id_for_comm_node_pass]: 4.548e-05 [meta_fg_expand]: 3.11e-05 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 1.10001e-06 [after_resolve]: 6.623e-05 [a_after_grad]: 0.00010428 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.84e-06 [auto_monad_grad]: 1.29e-06 [auto_monad_eliminator]: 7.649e-05 [cse]: 0.00023744 [a_3]: 0.00046214 [py_interpret_to_execute_after_opt_a]: 4.82e-06 [slice_cell_reuse_recomputed_activation]: 2.34999e-06 [rewriter_after_opt_a]: 0.0001873 [convert_after_rewriter]: 1.42999e-06 [order_py_execute_after_rewriter]: 1.20999e-06 [mutable_eliminate]: 0.00056604 [opt_b]: 0.00571433, [2] [Cycle 1]: 0.00369184, [7] [b_1]: 0.00321023 [b_2]: 6.69e-05 [updatestate_depend_eliminate]: 4.768e-05 [updatestate_assign_eliminate]: 3.801e-05 [updatestate_loads_eliminate]: 3.721e-05 [renormalize]: 4.49974e-07 [cse]: 0.00024323 [Cycle 2]: 0.00201242, [7] [b_1]: 0.00156369 [b_2]: 6.499e-05 [updatestate_depend_eliminate]: 4.39e-05 [updatestate_assign_eliminate]: 3.776e-05 [updatestate_loads_eliminate]: 3.712e-05 [renormalize]: 6.99947e-08 [cse]: 0.00022417 [optimize_parallel_all_gather_comm]: 0.0001047 [overlap_param_gather]: 2.73998e-06 [cconv]: 3.552e-05 [loop_unroll]: 0.00053357 [opt_after_cconv]: 0.00075649, [1] [Cycle 1]: 0.00075013, [7] [c_1]: 0.00033938 [parameter_eliminate]: 2.31e-06 [updatestate_depend_eliminate]: 4.95e-05 [updatestate_assign_eliminate]: 3.864e-05 [updatestate_loads_eliminate]: 3.787e-05 [cse]: 0.00024193 [renormalize]: 4.10015e-07 [remove_dup_value]: 0.00040768 [tuple_transform]: 0.00053314, [1] [Cycle 1]: 0.00052663, [4] [d_1]: 0.00044079 [none_parameter_eliminate]: 2.53e-06 [renormalize]: 3.19997e-07 [switch_simplify]: 5.789e-05 [partial_unused_args_eliminate]: 2.28002e-06 [add_recomputation]: 0.00022892 [cse_after_recomputation]: 0.00014584, [1] [Cycle 1]: 0.00014019, [1] [cse]: 0.00013224 [environ_conv]: 3.102e-05 [swap_dp_allreduce_reducescatter]: 4.209e-05 [bias_add_comm_swap]: 2.76e-06 [label_micro_interleaved_index]: 4.43999e-06 [label_fine_grained_interleaved_index]: 2.56e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.11998e-06 [micro_interleaved_order_control]: 2.51e-06 [assign_add_opt]: 1.23002e-06 [ForceFp32Comm]: 1.10999e-06 [remove_cast_before_assign_add]: 1.19e-06 [full_micro_interleaved_order_control]: 2.69001e-06 [reorder_send_recv_between_fp_bp]: 2.78998e-06 [comm_op_add_attrs]: 1.14e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.21002e-06 [overlap_opt_shard_in_pipeline]: 1.11002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.74e-06 [control_data_broadcast_order]: 9.366e-05 [grouped_pairwise_exchange_alltoall]: 2.01998e-06 [offloading_packed_experts]: 2.288e-05 [overlap_recompute_and_grad_model_parallel]: 2.376e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.28002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42e-06 [overlap_recompute_comm]: 2.20002e-06 [overlap_grad_ring_attention]: 2.274e-05 [overlap_grad_flash_sp]: 0.00012245 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.34999e-06 [split_layernorm_comm]: 2.09e-06 [handle_group_info]: 1.15999e-06 [symbol_engine_optimizer]: 0.00037628, [1] [Cycle 1]: 0.00037106, [6] [build]: 1.997e-05 [elim_shapecalc]: 6.329e-05 [elim_not_effective]: 0.0001026 [opt_reshape]: 5.631e-05 [fold_const_symbol]: 9.737e-05 [renormalize]: 3.59985e-07 [detach_backward]: 1.69e-06 [pipeline_parallel_scheduler]: 1.50001e-06 [auto_monad_reorder]: 8.816e-05 [get_jit_bprop_graph]: 1.15999e-06 [rewriter_after_jit_bprop_graph]: 4.05e-06 [opt_after_jit_grad]: 0.00069827 [validate]: 0.0001514 [backend_pass]: 1.07e-06 [task_emit]: 24.8223 [execute]: 1e-05 Sums bootstrap : 0.000597s : 0.00% type_inference : 0.048731s : 0.20% event_method : 0.000248s : 0.00% auto_monad : 0.000155s : 0.00% graph_reusing : 0.000010s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000042s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000062s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000374s : 0.00% optimize.opt_a.expand_dump_flag : 0.000008s : 0.00% optimize.opt_a.switch_simplify : 0.000328s : 0.00% optimize.opt_a.loop_unroll : 0.000250s : 0.00% optimize.opt_a.a_1 : 0.006536s : 0.03% optimize.opt_a.with_stream_mark : 0.000133s : 0.00% optimize.opt_a.recompute_prepare : 0.000152s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000089s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000083s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000080s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.002235s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000229s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000035s : 0.00% optimize.opt_a.shard_inline : 0.000145s : 0.00% optimize.opt_a.merge_send_recv : 0.000107s : 0.00% optimize.opt_a.auto_parallel : 0.000096s : 0.00% optimize.opt_a.parallel : 0.000030s : 0.00% optimize.opt_a.flash_sp : 0.000014s : 0.00% optimize.opt_a.merge_comm : 0.000094s : 0.00% optimize.opt_a.allreduce_fusion : 0.000092s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000122s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000146s : 0.00% optimize.opt_a.virtual_dataset : 0.000140s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000139s : 0.00% optimize.opt_a.virtual_output : 0.000139s : 0.00% optimize.opt_a.merge_forward : 0.000086s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000115s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000255s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000239s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000098s : 0.00% optimize.opt_a.meta_fg_expand : 0.001694s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000202s : 0.00% optimize.opt_a.a_after_grad : 0.000298s : 0.00% optimize.opt_a.renormalize : 0.016207s : 0.07% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.00% optimize.opt_a.auto_monad_grad : 0.000009s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000337s : 0.00% optimize.opt_a.cse : 0.000995s : 0.00% optimize.opt_a.a_3 : 0.001662s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000187s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000566s : 0.00% optimize.opt_b.b_1 : 0.004774s : 0.02% optimize.opt_b.b_2 : 0.000132s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000092s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000076s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000074s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000467s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000105s : 0.00% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000036s : 0.00% optimize.loop_unroll : 0.000534s : 0.00% optimize.opt_after_cconv.c_1 : 0.000339s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000049s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000039s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000038s : 0.00% optimize.opt_after_cconv.cse : 0.000242s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000408s : 0.00% optimize.tuple_transform.d_1 : 0.000441s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000003s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000058s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000229s : 0.00% optimize.cse_after_recomputation.cse : 0.000132s : 0.00% optimize.environ_conv : 0.000031s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000042s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000094s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000023s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000024s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000023s : 0.00% optimize.overlap_grad_flash_sp : 0.000122s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000020s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000063s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000103s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000056s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000097s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000088s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000698s : 0.00% validate : 0.000151s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 24.822289s : 99.62% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.001350 670 7.50% : 0.000101s : 36: substitution.arithmetic_simplify 1.12% : 0.000015s : 46: substitution.elim_not_effective 0.53% : 0.000007s : 11: substitution.float_depend_g_call 1.22% : 0.000016s : 17: substitution.float_tuple_getitem_switch 1.08% : 0.000015s : 46: substitution.fold_const_symbol 2.55% : 0.000034s : 51: substitution.graph_param_transform 0.20% : 0.000003s : 2: substitution.incorporate_call 0.14% : 0.000002s : 2: substitution.incorporate_call_switch 43.79% : 0.000591s : 21: substitution.inline 1.27% : 0.000017s : 2: substitution.inline_without_move 2.98% : 0.000040s : 102: substitution.j_node_and_user_rematch 4.32% : 0.000058s : 30: substitution.less_batch_normalization 1.22% : 0.000017s : 13: substitution.minmaximum_grad 0.65% : 0.000009s : 11: substitution.partial_eliminate 4.17% : 0.000056s : 102: substitution.remove_not_recompute_node 1.68% : 0.000023s : 9: substitution.replace_applicator 0.65% : 0.000009s : 11: substitution.replace_old_param 0.17% : 0.000002s : 1: substitution.set_cell_output_no_recompute 0.92% : 0.000012s : 4: substitution.switch_simplify 1.60% : 0.000022s : 12: substitution.transpose_eliminate 4.48% : 0.000060s : 25: substitution.tuple_list_convert_item_index_to_positive 2.21% : 0.000030s : 25: substitution.tuple_list_get_item_const_eliminator 2.90% : 0.000039s : 25: substitution.tuple_list_get_item_depend_reorder 6.89% : 0.000093s : 40: substitution.tuple_list_get_item_eliminator 2.95% : 0.000040s : 25: substitution.tuple_list_get_set_item_eliminator 2.80% : 0.000038s : 1: substitution.zero_like_fill_zero ------[type_inference.] 0.048634 2 92.79% : 0.045126s : 1: type_inference.infer 7.21% : 0.003509s : 1: type_inference.specialize ------[replace.] 0.000314 33 60.93% : 0.000191s : 21: replace.inline 14.00% : 0.000044s : 4: replace.switch_simplify 20.11% : 0.000063s : 7: replace.tuple_list_get_item_eliminator 4.96% : 0.000016s : 1: replace.zero_like_fill_zero ------[match.] 0.000646 33 89.62% : 0.000579s : 21: match.inline 1.48% : 0.000010s : 4: match.switch_simplify 3.25% : 0.000021s : 7: match.tuple_list_get_item_eliminator 5.64% : 0.000036s : 1: match.zero_like_fill_zero ------[predicate.] 0.002779 20376 0.73% : 0.000020s : 165: predicate.accumulaten_eliminater 0.45% : 0.000013s : 51: predicate.ad_related_special_op_eliminate 0.63% : 0.000017s : 132: predicate.addn_check_dump 0.74% : 0.000021s : 165: predicate.addn_zero_filter 0.72% : 0.000020s : 165: predicate.adjust_all_reduce_mul_add 1.81% : 0.000050s : 297: predicate.arithmetic_simplify 0.75% : 0.000021s : 165: predicate.cast_eliminate 1.06% : 0.000030s : 217: predicate.check_bprop_eliminate 0.63% : 0.000017s : 132: predicate.compare_switch_simplify 0.49% : 0.000014s : 177: predicate.const_output_eliminate 0.62% : 0.000017s : 132: predicate.depend_value_elim 0.81% : 0.000022s : 165: predicate.dict_get_item_const_eliminator 0.87% : 0.000024s : 165: predicate.dict_get_item_eliminator 0.79% : 0.000022s : 165: predicate.dict_set_item_eliminator 1.15% : 0.000032s : 228: predicate.dumpgradient_eliminate 0.13% : 0.000004s : 51: predicate.elim_not_effective 0.28% : 0.000008s : 51: predicate.elim_shapecalc_of_broadcastargs 1.55% : 0.000043s : 342: predicate.environ_add_const_eliminate 1.57% : 0.000044s : 342: predicate.environ_get_add_eliminate 1.54% : 0.000043s : 342: predicate.environ_get_depend_swap 2.26% : 0.000063s : 474: predicate.environ_get_eliminate 1.56% : 0.000043s : 342: predicate.environ_get_set_eliminate 0.88% : 0.000024s : 193: predicate.exchange_switch_depend_value 1.18% : 0.000033s : 193: predicate.float_depend_g_call 0.63% : 0.000017s : 132: predicate.float_environ_get_switch 1.51% : 0.000042s : 309: predicate.float_tuple_getitem_switch 0.13% : 0.000004s : 51: predicate.fold_const_symbol 0.68% : 0.000019s : 132: predicate.get_grad_eliminate 0.15% : 0.000004s : 51: predicate.graph_param_transform 0.63% : 0.000017s : 132: predicate.incorporate_call 0.61% : 0.000017s : 132: predicate.incorporate_call_switch 5.09% : 0.000142s : 858: predicate.inline 1.00% : 0.000028s : 159: predicate.inline_without_move 0.33% : 0.000009s : 132: predicate.j_node_and_user_rematch 0.76% : 0.000021s : 132: predicate.less_batch_normalization 1.92% : 0.000053s : 400: predicate.list_to_tuple_eliminator_ 2.57% : 0.000071s : 573: predicate.load_eliminater 0.54% : 0.000015s : 59: predicate.loop_unroll_after_grad 1.99% : 0.000055s : 233: predicate.loop_unroll_before_grad 1.89% : 0.000052s : 401: predicate.make_slice_get_slice_eliminator 0.64% : 0.000018s : 132: predicate.merge_addn 1.02% : 0.000028s : 217: predicate.micro_step_allgather_replace 1.02% : 0.000028s : 217: predicate.mini_step_allgather_replace 0.74% : 0.000021s : 165: predicate.minmaximum_grad 0.55% : 0.000015s : 60: predicate.mutable_eliminate 0.27% : 0.000007s : 51: predicate.opt_reshape 0.91% : 0.000025s : 177: predicate.parallel_virtual_node 1.26% : 0.000035s : 193: predicate.partial_defer_inline 1.22% : 0.000034s : 231: predicate.partial_eliminate 0.77% : 0.000021s : 165: predicate.print_const_string_wrapper 0.63% : 0.000017s : 132: predicate.reduce_all_const_elim 0.93% : 0.000026s : 165: predicate.reduce_eliminate 2.61% : 0.000073s : 573: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000009s : 132: predicate.remove_not_recompute_node 1.24% : 0.000034s : 389: predicate.replace_applicator 0.43% : 0.000012s : 159: predicate.replace_old_param 0.49% : 0.000014s : 177: predicate.reset_defer_inline 0.73% : 0.000020s : 165: predicate.reshape_eliminate 1.05% : 0.000029s : 217: predicate.row_tensor_add_zeros_like 0.64% : 0.000018s : 118: predicate.row_tensor_eliminate 1.24% : 0.000035s : 217: predicate.same_eliminate 0.39% : 0.000011s : 132: predicate.set_cell_output_no_recompute 0.68% : 0.000019s : 132: predicate.shard_identity_eliminate 1.18% : 0.000033s : 228: predicate.special_op_eliminate 0.71% : 0.000020s : 132: predicate.specialize_transform 1.10% : 0.000031s : 217: predicate.split_environ_get_set_with_tuple_value 0.86% : 0.000024s : 159: predicate.stack_unstack_eliminate 0.31% : 0.000009s : 59: predicate.switch_call_monad_eliminater 0.96% : 0.000027s : 193: predicate.switch_defer_inline 1.98% : 0.000055s : 410: predicate.switch_layer_defer_inline 3.17% : 0.000088s : 617: predicate.switch_simplify 0.74% : 0.000021s : 165: predicate.tile_eliminate 0.76% : 0.000021s : 165: predicate.transpose_eliminate 2.00% : 0.000056s : 393: predicate.tuple_list_convert_item_index_to_positive 2.05% : 0.000057s : 393: predicate.tuple_list_get_item_const_eliminator 1.94% : 0.000054s : 393: predicate.tuple_list_get_item_depend_reorder 2.89% : 0.000080s : 532: predicate.tuple_list_get_item_eliminator 1.97% : 0.000055s : 393: predicate.tuple_list_get_set_item_eliminator 2.64% : 0.000073s : 525: predicate.tuple_list_set_item_eliminator 1.87% : 0.000052s : 400: predicate.tuple_to_list_eliminator_ 2.62% : 0.000073s : 573: predicate.updatestate_pure_node_eliminater 3.29% : 0.000091s : 705: predicate.updatestate_useless_node_eliminater 0.90% : 0.000025s : 177: predicate.value_based_eliminate 0.68% : 0.000019s : 132: predicate.virtual_dataset_eliminate 0.66% : 0.000018s : 132: predicate.virtual_output_eliminate 0.28% : 0.000008s : 51: predicate.virtual_view_grad_eliminate 0.99% : 0.000028s : 179: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005063 51 58.98% : 0.002986s : 26: func_graph_cloner_run.FuncGraphClonerGraph 41.02% : 0.002077s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 25.005692 292 0.00% : 0.000004s : 1: ForceFp32Comm 0.01% : 0.003378s : 1: add_attr 0.01% : 0.003368s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000234s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.00% : 0.000164s : 1: auto_monad 0.00% : 0.000092s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.00% : 0.000630s : 1: bootstrap 0.00% : 0.000040s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000098s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000149s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000035s : 1: environ_conv 0.00% : 0.000261s : 1: event_method 0.00% : 0.000017s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.00% : 0.000542s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.00% : 0.000574s : 1: mutable_eliminate 0.00% : 0.000026s : 1: offloading_packed_experts 0.00% : 0.000088s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000090s : 1: opt.transform.mutable_eliminate 0.05% : 0.012934s : 117: opt.transform.opt_a 0.00% : 0.000338s : 1: opt.transform.opt_after_cconv 0.00% : 0.000178s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.004811s : 83: opt.transform.opt_b 0.00% : 0.000495s : 2: opt.transform.opt_trans_graph 0.00% : 0.000315s : 4: opt.transform.symbol_engine_opt 0.14% : 0.034512s : 1: opt_a 0.00% : 0.000760s : 1: opt_after_cconv 0.00% : 0.000708s : 1: opt_after_jit_grad 0.02% : 0.005718s : 1: opt_b 0.18% : 0.045142s : 1: optimize 0.00% : 0.000110s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000126s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000026s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000027s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000066s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000416s : 1: remove_dup_value 0.04% : 0.010956s : 2: renormalize.infer 0.02% : 0.005234s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000192s : 1: rewriter_after_opt_a 0.00% : 0.000382s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000045s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000379s : 1: symbol_engine_optimizer 99.27% : 24.822315s : 1: task_emit 0.00% : 0.000537s : 1: tuple_transform 0.19% : 0.048747s : 1: type_inference 0.00% : 0.000207s : 1: validate [WARNING] CORE(61814,ffffbf434f30,python3.9):2026-01-29-17:51:30.632.874 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph6 TotalTime = 0.0714967, [24] [bootstrap]: 0.00047042 [type_inference]: 0.0242134 [event_method]: 2.207e-05 [auto_monad]: 8.191e-05 [graph_reusing]: 6.17999e-06 [inline]: 1.62999e-06 [add_attr]: 0.00310955, [1] [add_attr_with_inline]: 0.00310086, [1] [Cycle 1]: 5.425e-05, [2] [tag_attr]: 2.032e-05 [meta_addattr_fg_expand]: 6.43998e-06 [parallel-infer-symbol]: 2.78e-06 [pre_auto_parallel]: 3.265e-05 [insert-virtual-dataset]: 2.74999e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.76998e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.00474088, [53] [py_interpret_to_execute]: 4.54998e-06 [rewriter_before_opt_a]: 0.00023475 [opt_a]: 0.00269561, [2] [Cycle 1]: 0.00213908, [45] [expand_dump_flag]: 3.21999e-06 [switch_simplify]: 7.473e-05 [loop_unroll]: 3.182e-05 [a_1]: 0.00058558 [with_stream_mark]: 1.449e-05 [recompute_prepare]: 7.31999e-06 [updatestate_depend_eliminate]: 3.95e-06 [updatestate_assign_eliminate]: 3.55003e-06 [updatestate_loads_eliminate]: 3.04001e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 6.832e-05 [accelerated_algorithm]: 6.05002e-06 [shard]: 1.85001e-06 [meta_shard_fg_expand]: 1.77001e-06 [shard_inline]: 5.69e-06 [merge_send_recv]: 7.80998e-06 [auto_parallel]: 5.44e-06 [parallel]: 1.77e-05 [flash_sp]: 6.89001e-06 [merge_comm]: 3.58e-06 [allreduce_fusion]: 3.24001e-06 [matmul_add_comm_reduction]: 8.69e-06 [allreduce_slice_to_reducescatter]: 8.70001e-07 [virtual_shard_identity]: 6.71999e-06 [virtual_dataset]: 5.80002e-06 [get_grad_eliminate_]: 5.37001e-06 [virtual_output]: 5.59998e-06 [merge_forward]: 3.81999e-06 [cell_reuse_recompute_pass]: 1.31002e-06 [offload_activation]: 9.81e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.132e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 9.19e-06 [set_forward_comm_id_for_comm_node_pass]: 3.36001e-06 [meta_fg_expand]: 3.13e-06 [flash_sp_send_recv_attached]: 2.28002e-06 [receive_attached]: 2.19001e-06 [after_resolve]: 9.25001e-06 [a_after_grad]: 7.97998e-06 [renormalize]: 0.00085718 [add_forward_monad_depend]: 5.10001e-06 [auto_monad_grad]: 1.77001e-06 [auto_monad_eliminator]: 1.462e-05 [cse]: 3.747e-05 [a_3]: 4.036e-05 [Cycle 2]: 0.00054758, [45] [expand_dump_flag]: 9.29984e-07 [switch_simplify]: 6.76999e-06 [loop_unroll]: 5.76998e-06 [a_1]: 9.595e-05 [with_stream_mark]: 1.062e-05 [recompute_prepare]: 5.47999e-06 [updatestate_depend_eliminate]: 2.88e-06 [updatestate_assign_eliminate]: 2.35002e-06 [updatestate_loads_eliminate]: 2.14e-06 [parameter_eliminate]: 9.39996e-07 [a_2]: 6.136e-05 [accelerated_algorithm]: 5.10001e-06 [shard]: 9.80013e-07 [meta_shard_fg_expand]: 1.33002e-06 [shard_inline]: 5.12e-06 [merge_send_recv]: 4.30999e-06 [auto_parallel]: 5.30999e-06 [parallel]: 4.25e-06 [flash_sp]: 2.91999e-06 [merge_comm]: 2.93e-06 [allreduce_fusion]: 2.71e-06 [matmul_add_comm_reduction]: 5.10001e-06 [allreduce_slice_to_reducescatter]: 3.9002e-07 [virtual_shard_identity]: 6.24001e-06 [virtual_dataset]: 5.05001e-06 [get_grad_eliminate_]: 4.86997e-06 [virtual_output]: 4.83001e-06 [merge_forward]: 2.69001e-06 [cell_reuse_recompute_pass]: 1.33002e-06 [offload_activation]: 5.72001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.122e-05 [merge_recompute_call_nodes]: 7.29982e-07 [before_grad]: 7.97e-06 [set_forward_comm_id_for_comm_node_pass]: 2.94001e-06 [meta_fg_expand]: 1.71e-06 [flash_sp_send_recv_attached]: 7.30011e-07 [receive_attached]: 1.04e-06 [after_resolve]: 7.65e-06 [a_after_grad]: 7.9e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.08001e-06 [auto_monad_grad]: 8.39995e-07 [auto_monad_eliminator]: 5.85002e-06 [cse]: 1.546e-05 [a_3]: 2.996e-05 [py_interpret_to_execute_after_opt_a]: 4.15e-06 [slice_cell_reuse_recomputed_activation]: 1.79e-06 [rewriter_after_opt_a]: 1.601e-05 [convert_after_rewriter]: 1.20999e-06 [order_py_execute_after_rewriter]: 1.17e-06 [mutable_eliminate]: 0.00046959 [opt_b]: 0.00018089, [1] [Cycle 1]: 0.00017529, [7] [b_1]: 0.0001048 [b_2]: 6.46e-06 [updatestate_depend_eliminate]: 5.22e-06 [updatestate_assign_eliminate]: 2.48998e-06 [updatestate_loads_eliminate]: 2.76e-06 [renormalize]: 6.19999e-07 [cse]: 2.013e-05 [optimize_parallel_all_gather_comm]: 1.848e-05 [overlap_param_gather]: 2.10002e-06 [cconv]: 2.399e-05 [loop_unroll]: 0.00042132 [opt_after_cconv]: 9.331e-05, [1] [Cycle 1]: 8.807e-05, [7] [c_1]: 2.405e-05 [parameter_eliminate]: 2.27999e-06 [updatestate_depend_eliminate]: 4.86002e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.18998e-06 [cse]: 1.982e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 2.753e-05 [tuple_transform]: 8.465e-05, [1] [Cycle 1]: 8.019e-05, [4] [d_1]: 5.325e-05 [none_parameter_eliminate]: 1.85001e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 6.50997e-06 [partial_unused_args_eliminate]: 1.78002e-06 [add_recomputation]: 4.429e-05 [cse_after_recomputation]: 2.347e-05, [1] [Cycle 1]: 1.899e-05, [1] [cse]: 1.33e-05 [environ_conv]: 7.58999e-06 [swap_dp_allreduce_reducescatter]: 5.66e-06 [bias_add_comm_swap]: 3.05998e-06 [label_micro_interleaved_index]: 4.35e-06 [label_fine_grained_interleaved_index]: 2.59999e-06 [merge_cast_opt]: 1.32e-06 [slice_recompute_activation]: 2.09999e-06 [micro_interleaved_order_control]: 2.36e-06 [assign_add_opt]: 1.44998e-06 [ForceFp32Comm]: 7.80012e-07 [remove_cast_before_assign_add]: 1.25001e-06 [full_micro_interleaved_order_control]: 2.31e-06 [reorder_send_recv_between_fp_bp]: 2.78998e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.15999e-06 [overlap_opt_shard_in_pipeline]: 9.09989e-07 [overlap_opt_shard_grad_in_pipeline]: 1.74e-06 [control_data_broadcast_order]: 1.2e-05 [grouped_pairwise_exchange_alltoall]: 2.29999e-06 [offloading_packed_experts]: 3.66999e-06 [overlap_recompute_and_grad_model_parallel]: 4.33999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.34999e-06 [overlap_grad_ring_attention]: 4.20999e-06 [overlap_grad_flash_sp]: 1.68e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.21e-06 [split_layernorm_comm]: 1.74998e-06 [handle_group_info]: 9.29984e-07 [symbol_engine_optimizer]: 7.726e-05, [1] [Cycle 1]: 7.315e-05, [6] [build]: 9.49e-06 [elim_shapecalc]: 8.45001e-06 [elim_not_effective]: 1.217e-05 [opt_reshape]: 6.30997e-06 [fold_const_symbol]: 9.28002e-06 [renormalize]: 2.3999e-07 [detach_backward]: 2.04e-06 [pipeline_parallel_scheduler]: 1.52999e-06 [auto_monad_reorder]: 1.599e-05 [get_jit_bprop_graph]: 9.70002e-07 [rewriter_after_jit_bprop_graph]: 3.44001e-06 [opt_after_jit_grad]: 0.0004555 [validate]: 3.871e-05 [backend_pass]: 8.50006e-07 [task_emit]: 0.0380862 [execute]: 8.72998e-06 Sums bootstrap : 0.000470s : 0.70% type_inference : 0.024213s : 35.90% event_method : 0.000022s : 0.03% auto_monad : 0.000082s : 0.12% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000033s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000235s : 0.35% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000082s : 0.12% optimize.opt_a.loop_unroll : 0.000038s : 0.06% optimize.opt_a.a_1 : 0.000682s : 1.01% optimize.opt_a.with_stream_mark : 0.000025s : 0.04% optimize.opt_a.recompute_prepare : 0.000013s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000130s : 0.19% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.02% optimize.opt_a.merge_send_recv : 0.000012s : 0.02% optimize.opt_a.auto_parallel : 0.000011s : 0.02% optimize.opt_a.parallel : 0.000022s : 0.03% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.02% optimize.opt_a.virtual_dataset : 0.000011s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000010s : 0.02% optimize.opt_a.virtual_output : 0.000010s : 0.02% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000016s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000017s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000006s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000017s : 0.03% optimize.opt_a.a_after_grad : 0.000016s : 0.02% optimize.opt_a.renormalize : 0.000857s : 1.27% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000020s : 0.03% optimize.opt_a.cse : 0.000053s : 0.08% optimize.opt_a.a_3 : 0.000070s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000016s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000470s : 0.70% optimize.opt_b.b_1 : 0.000105s : 0.16% optimize.opt_b.b_2 : 0.000006s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000024s : 0.04% optimize.loop_unroll : 0.000421s : 0.62% optimize.opt_after_cconv.c_1 : 0.000024s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000028s : 0.04% optimize.tuple_transform.d_1 : 0.000053s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000044s : 0.07% optimize.cse_after_recomputation.cse : 0.000013s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000017s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000008s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000016s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000456s : 0.68% validate : 0.000039s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.038086s : 56.47% execute : 0.000009s : 0.01% Time group info: ------[substitution.] 0.000200 26 0.99% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000001s : 2: substitution.fold_const_symbol 11.09% : 0.000022s : 3: substitution.graph_param_transform 72.76% : 0.000145s : 6: substitution.inline 1.49% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.25% : 0.000004s : 4: substitution.remove_not_recompute_node 1.47% : 0.000003s : 2: substitution.replace_old_param 3.52% : 0.000007s : 1: substitution.switch_simplify 5.74% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.024154 2 95.06% : 0.022961s : 1: type_inference.infer 4.94% : 0.001193s : 1: type_inference.specialize ------[replace.] 0.000079 9 58.85% : 0.000046s : 6: replace.inline 21.55% : 0.000017s : 1: replace.switch_simplify 19.60% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000158 9 89.75% : 0.000142s : 6: match.inline 3.83% : 0.000006s : 1: match.switch_simplify 6.42% : 0.000010s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000173 1092 1.00% : 0.000002s : 12: predicate.accumulaten_eliminater 0.91% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 6: predicate.addn_check_dump 1.12% : 0.000002s : 12: predicate.addn_zero_filter 0.88% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.08% : 0.000004s : 18: predicate.arithmetic_simplify 0.97% : 0.000002s : 12: predicate.cast_eliminate 0.51% : 0.000001s : 6: predicate.check_bprop_eliminate 0.50% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.51% : 0.000001s : 6: predicate.depend_value_elim 0.98% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.82% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 3: predicate.elim_not_effective 0.35% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.14% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.15% : 0.000002s : 15: predicate.environ_get_depend_swap 1.72% : 0.000003s : 21: predicate.environ_get_eliminate 1.19% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.57% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.63% : 0.000005s : 20: predicate.float_depend_g_call 0.50% : 0.000001s : 6: predicate.float_environ_get_switch 0.66% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 3: predicate.fold_const_symbol 0.60% : 0.000001s : 6: predicate.get_grad_eliminate 0.24% : 0.000000s : 3: predicate.graph_param_transform 0.50% : 0.000001s : 6: predicate.incorporate_call 0.45% : 0.000001s : 6: predicate.incorporate_call_switch 5.89% : 0.000010s : 50: predicate.inline 0.66% : 0.000001s : 6: predicate.inline_without_move 0.28% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.77% : 0.000001s : 6: predicate.less_batch_normalization 1.81% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.40% : 0.000004s : 32: predicate.load_eliminater 0.80% : 0.000001s : 3: predicate.loop_unroll_after_grad 2.88% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.59% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 6: predicate.merge_addn 0.48% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 12: predicate.minmaximum_grad 1.08% : 0.000002s : 3: predicate.mutable_eliminate 0.36% : 0.000001s : 3: predicate.opt_reshape 0.35% : 0.000001s : 3: predicate.parallel_virtual_node 2.10% : 0.000004s : 20: predicate.partial_defer_inline 1.44% : 0.000002s : 17: predicate.partial_eliminate 0.95% : 0.000002s : 12: predicate.print_const_string_wrapper 0.54% : 0.000001s : 6: predicate.reduce_all_const_elim 1.45% : 0.000003s : 12: predicate.reduce_eliminate 2.49% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 6: predicate.remove_not_recompute_node 1.27% : 0.000002s : 20: predicate.replace_applicator 0.54% : 0.000001s : 6: predicate.replace_old_param 0.27% : 0.000000s : 3: predicate.reset_defer_inline 1.10% : 0.000002s : 12: predicate.reshape_eliminate 0.56% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 3: predicate.row_tensor_eliminate 0.64% : 0.000001s : 6: predicate.same_eliminate 0.40% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.69% : 0.000001s : 6: predicate.shard_identity_eliminate 0.62% : 0.000001s : 6: predicate.special_op_eliminate 0.64% : 0.000001s : 6: predicate.specialize_transform 0.78% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.74% : 0.000003s : 20: predicate.switch_defer_inline 2.19% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.08% : 0.000011s : 68: predicate.switch_simplify 0.95% : 0.000002s : 12: predicate.tile_eliminate 0.96% : 0.000002s : 12: predicate.transpose_eliminate 1.63% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000002s : 18: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.56% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.36% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.71% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.35% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 3.01% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.49% : 0.000001s : 3: predicate.value_based_eliminate 0.65% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 6: predicate.virtual_output_eliminate 0.28% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001005 16 57.64% : 0.000579s : 8: func_graph_cloner_run.FuncGraphClonerGraph 42.36% : 0.000426s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.081435 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.82% : 0.003114s : 1: add_attr 3.81% : 0.003104s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000048s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.11% : 0.000087s : 1: auto_monad 0.02% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.61% : 0.000500s : 1: bootstrap 0.03% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.03% : 0.000027s : 1: event_method 0.02% : 0.000014s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.53% : 0.000429s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.59% : 0.000478s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000012s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000014s : 1: opt.transform.mutable_eliminate 1.32% : 0.001074s : 78: opt.transform.opt_a 0.03% : 0.000023s : 1: opt.transform.opt_after_cconv 0.03% : 0.000021s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000085s : 28: opt.transform.opt_b 0.07% : 0.000058s : 2: opt.transform.opt_trans_graph 0.04% : 0.000033s : 4: opt.transform.symbol_engine_opt 3.31% : 0.002699s : 1: opt_a 0.12% : 0.000097s : 1: opt_after_cconv 0.57% : 0.000464s : 1: opt_after_jit_grad 0.23% : 0.000184s : 1: opt_b 5.83% : 0.004745s : 1: optimize 0.03% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000020s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.05% : 0.000037s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000031s : 1: remove_dup_value 0.55% : 0.000451s : 1: renormalize.infer 0.49% : 0.000397s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000019s : 1: rewriter_after_opt_a 0.29% : 0.000240s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000080s : 1: symbol_engine_optimizer 46.79% : 0.038102s : 1: task_emit 0.11% : 0.000088s : 1: tuple_transform 29.75% : 0.024228s : 1: type_inference 0.08% : 0.000063s : 1: validate TotalTime = 2.34555, [24] [bootstrap]: 0.00044875 [type_inference]: 0.0485592 [event_method]: 0.00018942 [auto_monad]: 0.00015309 [graph_reusing]: 1.02e-05 [inline]: 2.24001e-06 [add_attr]: 0.00322936, [1] [add_attr_with_inline]: 0.00322071, [1] [Cycle 1]: 8.247e-05, [2] [tag_attr]: 4.089e-05 [meta_addattr_fg_expand]: 1.198e-05 [parallel-infer-symbol]: 3.03e-06 [pre_auto_parallel]: 6.262e-05 [insert-virtual-dataset]: 2.61999e-06 [parallel-infer-symbol-second]: 9.39996e-07 [dataset_repeat_opt]: 2.19999e-06 [pipeline_split]: 1.89e-06 [optimize]: 0.0533462, [53] [py_interpret_to_execute]: 4.53999e-06 [rewriter_before_opt_a]: 0.00036902 [opt_a]: 0.037105, [3] [Cycle 1]: 0.0188605, [45] [expand_dump_flag]: 5.05001e-06 [switch_simplify]: 0.00015397 [loop_unroll]: 6.605e-05 [a_1]: 0.00141441 [with_stream_mark]: 2.316e-05 [recompute_prepare]: 2.071e-05 [updatestate_depend_eliminate]: 8.52e-06 [updatestate_assign_eliminate]: 7.2e-06 [updatestate_loads_eliminate]: 6.84001e-06 [parameter_eliminate]: 2.86999e-06 [a_2]: 0.00020874 [accelerated_algorithm]: 1.428e-05 [shard]: 1.60001e-06 [meta_shard_fg_expand]: 4.37e-06 [shard_inline]: 1.381e-05 [merge_send_recv]: 1.58e-05 [auto_parallel]: 9.72999e-06 [parallel]: 1.645e-05 [flash_sp]: 9.04998e-06 [merge_comm]: 8.47998e-06 [allreduce_fusion]: 7.71001e-06 [matmul_add_comm_reduction]: 2.504e-05 [allreduce_slice_to_reducescatter]: 7.50006e-07 [virtual_shard_identity]: 1.494e-05 [virtual_dataset]: 1.351e-05 [get_grad_eliminate_]: 1.331e-05 [virtual_output]: 1.376e-05 [merge_forward]: 8.80001e-06 [cell_reuse_recompute_pass]: 1.09003e-06 [offload_activation]: 1.537e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.834e-05 [merge_recompute_call_nodes]: 1.54998e-06 [before_grad]: 2.406e-05 [set_forward_comm_id_for_comm_node_pass]: 8.77e-06 [meta_fg_expand]: 0.00149004 [flash_sp_send_recv_attached]: 4.55001e-06 [receive_attached]: 2.49001e-06 [after_resolve]: 6.302e-05 [a_after_grad]: 8.486e-05 [renormalize]: 0.0131359 [add_forward_monad_depend]: 1.043e-05 [auto_monad_grad]: 5.95002e-06 [auto_monad_eliminator]: 0.00015398 [cse]: 0.00057305 [a_3]: 0.00083708 [Cycle 2]: 0.0116666, [45] [expand_dump_flag]: 2.42001e-06 [switch_simplify]: 0.00011883 [loop_unroll]: 0.00011254 [a_1]: 0.00347649 [with_stream_mark]: 6.542e-05 [recompute_prepare]: 7.96e-05 [updatestate_depend_eliminate]: 4.475e-05 [updatestate_assign_eliminate]: 4.173e-05 [updatestate_loads_eliminate]: 4.193e-05 [parameter_eliminate]: 1.25001e-06 [a_2]: 0.0011876 [accelerated_algorithm]: 0.00014836 [shard]: 9.70002e-07 [meta_shard_fg_expand]: 1.543e-05 [shard_inline]: 8.435e-05 [merge_send_recv]: 5.294e-05 [auto_parallel]: 5.113e-05 [parallel]: 4.53001e-06 [flash_sp]: 3.58999e-06 [merge_comm]: 5.028e-05 [allreduce_fusion]: 4.866e-05 [matmul_add_comm_reduction]: 5.508e-05 [allreduce_slice_to_reducescatter]: 3.4002e-07 [virtual_shard_identity]: 7.663e-05 [virtual_dataset]: 7.378e-05 [get_grad_eliminate_]: 7.412e-05 [virtual_output]: 7.38e-05 [merge_forward]: 4.311e-05 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 5.56e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00013169 [merge_recompute_call_nodes]: 7.59988e-07 [before_grad]: 0.00012284 [set_forward_comm_id_for_comm_node_pass]: 5.037e-05 [meta_fg_expand]: 0.00012771 [flash_sp_send_recv_attached]: 1.03001e-06 [receive_attached]: 1.40999e-06 [after_resolve]: 8.118e-05 [a_after_grad]: 0.0001222 [renormalize]: 0.00365576 [add_forward_monad_depend]: 4.84003e-06 [auto_monad_grad]: 1.32e-06 [auto_monad_eliminator]: 0.00010653 [cse]: 0.00032176 [a_3]: 0.0005725 [Cycle 3]: 0.00656339, [45] [expand_dump_flag]: 1.35001e-06 [switch_simplify]: 7.834e-05 [loop_unroll]: 7.443e-05 [a_1]: 0.00224686 [with_stream_mark]: 5.333e-05 [recompute_prepare]: 7.505e-05 [updatestate_depend_eliminate]: 4.623e-05 [updatestate_assign_eliminate]: 4.331e-05 [updatestate_loads_eliminate]: 4.377e-05 [parameter_eliminate]: 1.16002e-06 [a_2]: 0.0012386 [accelerated_algorithm]: 0.00020198 [shard]: 1.22999e-06 [meta_shard_fg_expand]: 1.756e-05 [shard_inline]: 7.857e-05 [merge_send_recv]: 5.502e-05 [auto_parallel]: 4.957e-05 [parallel]: 4.75001e-06 [flash_sp]: 1.10001e-06 [merge_comm]: 4.877e-05 [allreduce_fusion]: 4.694e-05 [matmul_add_comm_reduction]: 5.433e-05 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 7.662e-05 [virtual_dataset]: 7.46e-05 [get_grad_eliminate_]: 7.438e-05 [virtual_output]: 7.41e-05 [merge_forward]: 4.342e-05 [cell_reuse_recompute_pass]: 1.96e-06 [offload_activation]: 5.707e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00013222 [merge_recompute_call_nodes]: 8.09989e-07 [before_grad]: 0.00012448 [set_forward_comm_id_for_comm_node_pass]: 5.067e-05 [meta_fg_expand]: 3.389e-05 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 9.5999e-07 [after_resolve]: 7.655e-05 [a_after_grad]: 0.00012771 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.10002e-06 [auto_monad_grad]: 1.17999e-06 [auto_monad_eliminator]: 8.56e-05 [cse]: 0.0002781 [a_3]: 0.0005531 [py_interpret_to_execute_after_opt_a]: 4.48999e-06 [slice_cell_reuse_recomputed_activation]: 2.22999e-06 [rewriter_after_opt_a]: 0.00022031 [convert_after_rewriter]: 1.49e-06 [order_py_execute_after_rewriter]: 1.25001e-06 [mutable_eliminate]: 0.00057934 [opt_b]: 0.0104611, [2] [Cycle 1]: 0.00433441, [7] [b_1]: 0.00378709 [b_2]: 7.855e-05 [updatestate_depend_eliminate]: 5.365e-05 [updatestate_assign_eliminate]: 4.344e-05 [updatestate_loads_eliminate]: 4.223e-05 [renormalize]: 4.30009e-07 [cse]: 0.0002809 [Cycle 2]: 0.00260045, [7] [b_1]: 0.00199119 [b_2]: 7.835e-05 [updatestate_depend_eliminate]: 6.094e-05 [updatestate_assign_eliminate]: 4.406e-05 [updatestate_loads_eliminate]: 4.244e-05 [renormalize]: 6.99947e-08 [cse]: 0.00031996 [optimize_parallel_all_gather_comm]: 0.00010247 [overlap_param_gather]: 2.39001e-06 [cconv]: 4.626e-05 [loop_unroll]: 0.0007688 [opt_after_cconv]: 0.00090584, [1] [Cycle 1]: 0.00089694, [7] [c_1]: 0.00040662 [parameter_eliminate]: 3.71001e-06 [updatestate_depend_eliminate]: 6.031e-05 [updatestate_assign_eliminate]: 4.374e-05 [updatestate_loads_eliminate]: 4.291e-05 [cse]: 0.00029698 [renormalize]: 3.30008e-07 [remove_dup_value]: 0.00053941 [tuple_transform]: 0.00064117, [1] [Cycle 1]: 0.00063223, [4] [d_1]: 0.00053752 [none_parameter_eliminate]: 2.49001e-06 [renormalize]: 4.19997e-07 [switch_simplify]: 6.674e-05 [partial_unused_args_eliminate]: 2.28998e-06 [add_recomputation]: 0.00026571 [cse_after_recomputation]: 0.00016717, [1] [Cycle 1]: 0.00016174, [1] [cse]: 0.00015371 [environ_conv]: 3.443e-05 [swap_dp_allreduce_reducescatter]: 4.748e-05 [bias_add_comm_swap]: 6.41998e-06 [label_micro_interleaved_index]: 4.69998e-06 [label_fine_grained_interleaved_index]: 2.53e-06 [merge_cast_opt]: 1.56002e-06 [slice_recompute_activation]: 2.17001e-06 [micro_interleaved_order_control]: 2.39001e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 9.09989e-07 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 2.73e-06 [reorder_send_recv_between_fp_bp]: 2.94999e-06 [comm_op_add_attrs]: 1.05999e-06 [add_comm_op_reuse_tag]: 1.17e-06 [interleave_split_concat_branches]: 1.17999e-06 [interleave_parallel_branches]: 1.09003e-06 [overlap_opt_shard_in_pipeline]: 8.99978e-07 [overlap_opt_shard_grad_in_pipeline]: 2.00002e-06 [control_data_broadcast_order]: 0.00010465 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 2.641e-05 [overlap_recompute_and_grad_model_parallel]: 2.607e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.02001e-06 [overlap_grad_ring_attention]: 2.526e-05 [overlap_grad_flash_sp]: 0.00013834 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.06e-06 [split_layernorm_comm]: 1.80001e-06 [handle_group_info]: 1.26997e-06 [symbol_engine_optimizer]: 0.00045944, [1] [Cycle 1]: 0.00045255, [6] [build]: 2.475e-05 [elim_shapecalc]: 7.472e-05 [elim_not_effective]: 0.00012417 [opt_reshape]: 7.54e-05 [fold_const_symbol]: 0.00011819 [renormalize]: 1.80007e-07 [detach_backward]: 2.07999e-06 [pipeline_parallel_scheduler]: 1.91998e-06 [auto_monad_reorder]: 0.00010045 [get_jit_bprop_graph]: 1.45001e-06 [rewriter_after_jit_bprop_graph]: 4.59998e-06 [opt_after_jit_grad]: 0.00073588 [validate]: 0.00018051 [backend_pass]: 1.23002e-06 [task_emit]: 2.23822 [execute]: 1.032e-05 Sums bootstrap : 0.000449s : 0.02% type_inference : 0.048559s : 2.08% event_method : 0.000189s : 0.01% auto_monad : 0.000153s : 0.01% graph_reusing : 0.000010s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000041s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000063s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000369s : 0.02% optimize.opt_a.expand_dump_flag : 0.000009s : 0.00% optimize.opt_a.switch_simplify : 0.000351s : 0.02% optimize.opt_a.loop_unroll : 0.000253s : 0.01% optimize.opt_a.a_1 : 0.007138s : 0.31% optimize.opt_a.with_stream_mark : 0.000142s : 0.01% optimize.opt_a.recompute_prepare : 0.000175s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000100s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000092s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000093s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.002635s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000365s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000037s : 0.00% optimize.opt_a.shard_inline : 0.000177s : 0.01% optimize.opt_a.merge_send_recv : 0.000124s : 0.01% optimize.opt_a.auto_parallel : 0.000110s : 0.00% optimize.opt_a.parallel : 0.000026s : 0.00% optimize.opt_a.flash_sp : 0.000014s : 0.00% optimize.opt_a.merge_comm : 0.000108s : 0.00% optimize.opt_a.allreduce_fusion : 0.000103s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000134s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000168s : 0.01% optimize.opt_a.virtual_dataset : 0.000162s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000162s : 0.01% optimize.opt_a.virtual_output : 0.000162s : 0.01% optimize.opt_a.merge_forward : 0.000095s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000128s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000312s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000271s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000110s : 0.00% optimize.opt_a.meta_fg_expand : 0.001652s : 0.07% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000221s : 0.01% optimize.opt_a.a_after_grad : 0.000335s : 0.01% optimize.opt_a.renormalize : 0.016792s : 0.72% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.00% optimize.opt_a.auto_monad_grad : 0.000008s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000346s : 0.01% optimize.opt_a.cse : 0.001173s : 0.05% optimize.opt_a.a_3 : 0.001963s : 0.08% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000220s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000579s : 0.02% optimize.opt_b.b_1 : 0.005778s : 0.25% optimize.opt_b.b_2 : 0.000157s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000115s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000087s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000085s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000601s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000102s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000046s : 0.00% optimize.loop_unroll : 0.000769s : 0.03% optimize.opt_after_cconv.c_1 : 0.000407s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000060s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000044s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000043s : 0.00% optimize.opt_after_cconv.cse : 0.000297s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000539s : 0.02% optimize.tuple_transform.d_1 : 0.000538s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000067s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000266s : 0.01% optimize.cse_after_recomputation.cse : 0.000154s : 0.01% optimize.environ_conv : 0.000034s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000047s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000105s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000026s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000026s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000025s : 0.00% optimize.overlap_grad_flash_sp : 0.000138s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000025s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000075s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000124s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000075s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000118s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000100s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000736s : 0.03% validate : 0.000181s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 2.238220s : 95.76% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.001376 720 6.47% : 0.000089s : 36: substitution.arithmetic_simplify 1.25% : 0.000017s : 52: substitution.elim_not_effective 0.59% : 0.000008s : 11: substitution.float_depend_g_call 1.31% : 0.000018s : 17: substitution.float_tuple_getitem_switch 1.20% : 0.000016s : 52: substitution.fold_const_symbol 3.02% : 0.000041s : 59: substitution.graph_param_transform 0.19% : 0.000003s : 2: substitution.incorporate_call 0.14% : 0.000002s : 2: substitution.incorporate_call_switch 42.00% : 0.000578s : 21: substitution.inline 1.18% : 0.000016s : 2: substitution.inline_without_move 3.11% : 0.000043s : 114: substitution.j_node_and_user_rematch 4.85% : 0.000067s : 34: substitution.less_batch_normalization 1.43% : 0.000020s : 13: substitution.minmaximum_grad 0.59% : 0.000008s : 11: substitution.partial_eliminate 6.21% : 0.000085s : 114: substitution.remove_not_recompute_node 1.59% : 0.000022s : 9: substitution.replace_applicator 0.62% : 0.000009s : 11: substitution.replace_old_param 0.18% : 0.000002s : 1: substitution.set_cell_output_no_recompute 0.87% : 0.000012s : 4: substitution.switch_simplify 1.71% : 0.000024s : 14: substitution.transpose_eliminate 4.58% : 0.000063s : 25: substitution.tuple_list_convert_item_index_to_positive 2.20% : 0.000030s : 25: substitution.tuple_list_get_item_const_eliminator 2.98% : 0.000041s : 25: substitution.tuple_list_get_item_depend_reorder 6.80% : 0.000094s : 40: substitution.tuple_list_get_item_eliminator 2.98% : 0.000041s : 25: substitution.tuple_list_get_set_item_eliminator 1.98% : 0.000027s : 1: substitution.zero_like_fill_zero ------[type_inference.] 0.048467 2 92.89% : 0.045019s : 1: type_inference.infer 7.11% : 0.003448s : 1: type_inference.specialize ------[replace.] 0.000272 33 56.52% : 0.000154s : 21: replace.inline 15.28% : 0.000042s : 4: replace.switch_simplify 22.74% : 0.000062s : 7: replace.tuple_list_get_item_eliminator 5.46% : 0.000015s : 1: replace.zero_like_fill_zero ------[match.] 0.000621 33 91.12% : 0.000566s : 21: match.inline 1.50% : 0.000009s : 4: match.switch_simplify 3.19% : 0.000020s : 7: match.tuple_list_get_item_eliminator 4.18% : 0.000026s : 1: match.zero_like_fill_zero ------[predicate.] 0.003234 24043 0.69% : 0.000022s : 191: predicate.accumulaten_eliminater 0.48% : 0.000016s : 59: predicate.ad_related_special_op_eliminate 0.62% : 0.000020s : 158: predicate.addn_check_dump 0.72% : 0.000023s : 191: predicate.addn_zero_filter 0.69% : 0.000022s : 191: predicate.adjust_all_reduce_mul_add 1.80% : 0.000058s : 349: predicate.arithmetic_simplify 0.72% : 0.000023s : 191: predicate.cast_eliminate 1.04% : 0.000034s : 256: predicate.check_bprop_eliminate 0.62% : 0.000020s : 158: predicate.compare_switch_simplify 0.51% : 0.000017s : 216: predicate.const_output_eliminate 0.65% : 0.000021s : 158: predicate.depend_value_elim 0.78% : 0.000025s : 191: predicate.dict_get_item_const_eliminator 0.79% : 0.000026s : 191: predicate.dict_get_item_eliminator 0.71% : 0.000023s : 191: predicate.dict_set_item_eliminator 1.15% : 0.000037s : 275: predicate.dumpgradient_eliminate 0.14% : 0.000005s : 59: predicate.elim_not_effective 0.28% : 0.000009s : 59: predicate.elim_shapecalc_of_broadcastargs 1.54% : 0.000050s : 407: predicate.environ_add_const_eliminate 1.55% : 0.000050s : 407: predicate.environ_get_add_eliminate 1.55% : 0.000050s : 407: predicate.environ_get_depend_swap 3.30% : 0.000107s : 565: predicate.environ_get_eliminate 1.54% : 0.000050s : 407: predicate.environ_get_set_eliminate 0.83% : 0.000027s : 219: predicate.exchange_switch_depend_value 1.10% : 0.000036s : 219: predicate.float_depend_g_call 0.62% : 0.000020s : 158: predicate.float_environ_get_switch 1.50% : 0.000049s : 374: predicate.float_tuple_getitem_switch 0.13% : 0.000004s : 59: predicate.fold_const_symbol 0.66% : 0.000021s : 158: predicate.get_grad_eliminate 0.16% : 0.000005s : 59: predicate.graph_param_transform 0.64% : 0.000021s : 158: predicate.incorporate_call 0.63% : 0.000020s : 158: predicate.incorporate_call_switch 5.02% : 0.000162s : 1014: predicate.inline 0.96% : 0.000031s : 185: predicate.inline_without_move 0.35% : 0.000011s : 158: predicate.j_node_and_user_rematch 0.75% : 0.000024s : 158: predicate.less_batch_normalization 1.93% : 0.000063s : 473: predicate.list_to_tuple_eliminator_ 2.54% : 0.000082s : 677: predicate.load_eliminater 0.61% : 0.000020s : 72: predicate.loop_unroll_after_grad 1.13% : 0.000037s : 259: predicate.loop_unroll_before_grad 1.86% : 0.000060s : 479: predicate.make_slice_get_slice_eliminator 0.63% : 0.000020s : 158: predicate.merge_addn 1.01% : 0.000033s : 256: predicate.micro_step_allgather_replace 1.01% : 0.000033s : 256: predicate.mini_step_allgather_replace 0.75% : 0.000024s : 191: predicate.minmaximum_grad 0.50% : 0.000016s : 73: predicate.mutable_eliminate 0.31% : 0.000010s : 59: predicate.opt_reshape 0.93% : 0.000030s : 216: predicate.parallel_virtual_node 1.14% : 0.000037s : 219: predicate.partial_defer_inline 1.17% : 0.000038s : 270: predicate.partial_eliminate 0.72% : 0.000023s : 191: predicate.print_const_string_wrapper 0.63% : 0.000020s : 158: predicate.reduce_all_const_elim 0.87% : 0.000028s : 191: predicate.reduce_eliminate 2.53% : 0.000082s : 677: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000011s : 158: predicate.remove_not_recompute_node 1.25% : 0.000041s : 454: predicate.replace_applicator 0.43% : 0.000014s : 185: predicate.replace_old_param 0.52% : 0.000017s : 216: predicate.reset_defer_inline 0.72% : 0.000023s : 191: predicate.reshape_eliminate 1.36% : 0.000044s : 256: predicate.row_tensor_add_zeros_like 0.63% : 0.000020s : 144: predicate.row_tensor_eliminate 1.21% : 0.000039s : 256: predicate.same_eliminate 0.40% : 0.000013s : 158: predicate.set_cell_output_no_recompute 0.68% : 0.000022s : 158: predicate.shard_identity_eliminate 1.18% : 0.000038s : 275: predicate.special_op_eliminate 0.70% : 0.000023s : 158: predicate.specialize_transform 1.06% : 0.000034s : 256: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000027s : 185: predicate.stack_unstack_eliminate 0.31% : 0.000010s : 72: predicate.switch_call_monad_eliminater 0.90% : 0.000029s : 219: predicate.switch_defer_inline 2.47% : 0.000080s : 475: predicate.switch_layer_defer_inline 3.05% : 0.000098s : 703: predicate.switch_simplify 0.71% : 0.000023s : 191: predicate.tile_eliminate 0.75% : 0.000024s : 191: predicate.transpose_eliminate 1.95% : 0.000063s : 466: predicate.tuple_list_convert_item_index_to_positive 2.05% : 0.000066s : 466: predicate.tuple_list_get_item_const_eliminator 1.94% : 0.000063s : 466: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000100s : 631: predicate.tuple_list_get_item_eliminator 1.97% : 0.000064s : 466: predicate.tuple_list_get_set_item_eliminator 2.62% : 0.000085s : 624: predicate.tuple_list_set_item_eliminator 1.85% : 0.000060s : 473: predicate.tuple_to_list_eliminator_ 2.52% : 0.000082s : 677: predicate.updatestate_pure_node_eliminater 3.22% : 0.000104s : 835: predicate.updatestate_useless_node_eliminater 0.89% : 0.000029s : 216: predicate.value_based_eliminate 0.66% : 0.000021s : 158: predicate.virtual_dataset_eliminate 0.68% : 0.000022s : 158: predicate.virtual_output_eliminate 0.24% : 0.000008s : 59: predicate.virtual_view_grad_eliminate 1.34% : 0.000043s : 218: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005431 51 59.04% : 0.003206s : 26: func_graph_cloner_run.FuncGraphClonerGraph 40.96% : 0.002224s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.441139 292 0.00% : 0.000004s : 1: ForceFp32Comm 0.13% : 0.003234s : 1: add_attr 0.13% : 0.003224s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000271s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000163s : 1: auto_monad 0.00% : 0.000105s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000010s : 1: bias_add_comm_swap 0.02% : 0.000480s : 1: bootstrap 0.00% : 0.000051s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000108s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000170s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000038s : 1: environ_conv 0.01% : 0.000200s : 1: event_method 0.00% : 0.000018s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.03% : 0.000778s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000588s : 1: mutable_eliminate 0.00% : 0.000030s : 1: offloading_packed_experts 0.00% : 0.000111s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000101s : 1: opt.transform.mutable_eliminate 0.60% : 0.014698s : 117: opt.transform.opt_a 0.02% : 0.000405s : 1: opt.transform.opt_after_cconv 0.01% : 0.000207s : 1: opt.transform.opt_after_jit_grad 0.24% : 0.005828s : 83: opt.transform.opt_b 0.02% : 0.000601s : 2: opt.transform.opt_trans_graph 0.02% : 0.000388s : 4: opt.transform.symbol_engine_opt 1.52% : 0.037108s : 1: opt_a 0.04% : 0.000910s : 1: opt_after_cconv 0.03% : 0.000746s : 1: opt_after_jit_grad 0.43% : 0.010465s : 1: opt_b 2.19% : 0.053351s : 1: optimize 0.00% : 0.000108s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000142s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000028s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000029s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000068s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000549s : 1: remove_dup_value 0.45% : 0.011055s : 2: renormalize.infer 0.23% : 0.005721s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000225s : 1: rewriter_after_opt_a 0.02% : 0.000375s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000051s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000463s : 1: symbol_engine_optimizer 91.69% : 2.238248s : 1: task_emit 0.03% : 0.000645s : 1: tuple_transform 1.99% : 0.048574s : 1: type_inference 0.01% : 0.000243s : 1: validate TotalTime = 0.0811823, [24] [bootstrap]: 0.00044893 [type_inference]: 0.0281751 [event_method]: 9.088e-05 [auto_monad]: 0.00016247 [graph_reusing]: 1.128e-05 [inline]: 2.02001e-06 [add_attr]: 0.00302779, [1] [add_attr_with_inline]: 0.00301971, [1] [Cycle 1]: 5.98e-05, [2] [tag_attr]: 2.555e-05 [meta_addattr_fg_expand]: 7.54002e-06 [parallel-infer-symbol]: 3.33e-06 [pre_auto_parallel]: 3.841e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.69998e-06 [optimize]: 0.00544953, [53] [py_interpret_to_execute]: 4.31002e-06 [rewriter_before_opt_a]: 0.00022446 [opt_a]: 0.00328827, [2] [Cycle 1]: 0.00260913, [45] [expand_dump_flag]: 3.83001e-06 [switch_simplify]: 7.992e-05 [loop_unroll]: 3.797e-05 [a_1]: 0.00073637 [with_stream_mark]: 1.405e-05 [recompute_prepare]: 8.89e-06 [updatestate_depend_eliminate]: 4.95999e-06 [updatestate_assign_eliminate]: 4.65999e-06 [updatestate_loads_eliminate]: 4.2e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 9.324e-05 [accelerated_algorithm]: 7.23e-06 [shard]: 1.72999e-06 [meta_shard_fg_expand]: 2.17999e-06 [shard_inline]: 6.82002e-06 [merge_send_recv]: 8.94e-06 [auto_parallel]: 6.17999e-06 [parallel]: 1.738e-05 [flash_sp]: 7.36001e-06 [merge_comm]: 4.44002e-06 [allreduce_fusion]: 4.13001e-06 [matmul_add_comm_reduction]: 9.33002e-06 [allreduce_slice_to_reducescatter]: 9.00007e-07 [virtual_shard_identity]: 7.92e-06 [virtual_dataset]: 6.64001e-06 [get_grad_eliminate_]: 6.46999e-06 [virtual_output]: 6.29001e-06 [merge_forward]: 4.33001e-06 [cell_reuse_recompute_pass]: 1.16002e-06 [offload_activation]: 9.89999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.297e-05 [merge_recompute_call_nodes]: 1.70001e-06 [before_grad]: 1.057e-05 [set_forward_comm_id_for_comm_node_pass]: 4.16001e-06 [meta_fg_expand]: 3.26999e-06 [flash_sp_send_recv_attached]: 2.32001e-06 [receive_attached]: 2.32001e-06 [after_resolve]: 1.002e-05 [a_after_grad]: 9.76998e-06 [renormalize]: 0.0010973 [add_forward_monad_depend]: 5.77999e-06 [auto_monad_grad]: 1.92999e-06 [auto_monad_eliminator]: 1.696e-05 [cse]: 4.235e-05 [a_3]: 4.99e-05 [Cycle 2]: 0.0006701, [45] [expand_dump_flag]: 1.19998e-06 [switch_simplify]: 8.03001e-06 [loop_unroll]: 7.16999e-06 [a_1]: 0.00014879 [with_stream_mark]: 1.201e-05 [recompute_prepare]: 7.06999e-06 [updatestate_depend_eliminate]: 3.81999e-06 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 3.26001e-06 [parameter_eliminate]: 9.49978e-07 [a_2]: 8.375e-05 [accelerated_algorithm]: 6.49999e-06 [shard]: 1.00001e-06 [meta_shard_fg_expand]: 1.47999e-06 [shard_inline]: 6.64001e-06 [merge_send_recv]: 5.47999e-06 [auto_parallel]: 5.94e-06 [parallel]: 4.06001e-06 [flash_sp]: 2.84999e-06 [merge_comm]: 3.74002e-06 [allreduce_fusion]: 3.50998e-06 [matmul_add_comm_reduction]: 6.26e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 6.81001e-06 [virtual_dataset]: 6.33e-06 [get_grad_eliminate_]: 5.92999e-06 [virtual_output]: 6.01998e-06 [merge_forward]: 3.4e-06 [cell_reuse_recompute_pass]: 1.47001e-06 [offload_activation]: 6.16e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.192e-05 [merge_recompute_call_nodes]: 6.39993e-07 [before_grad]: 9.84001e-06 [set_forward_comm_id_for_comm_node_pass]: 4.15e-06 [meta_fg_expand]: 2.53e-06 [flash_sp_send_recv_attached]: 9.09989e-07 [receive_attached]: 9.20001e-07 [after_resolve]: 8.70001e-06 [a_after_grad]: 9.19998e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.11997e-06 [auto_monad_grad]: 1.05001e-06 [auto_monad_eliminator]: 8.07998e-06 [cse]: 2.032e-05 [a_3]: 3.774e-05 [py_interpret_to_execute_after_opt_a]: 3.9e-06 [slice_cell_reuse_recomputed_activation]: 2.06e-06 [rewriter_after_opt_a]: 2.239e-05 [convert_after_rewriter]: 1.24998e-06 [order_py_execute_after_rewriter]: 1.15001e-06 [mutable_eliminate]: 0.00045706 [opt_b]: 0.00024153, [1] [Cycle 1]: 0.00023585, [7] [b_1]: 0.00015445 [b_2]: 8.23999e-06 [updatestate_depend_eliminate]: 5.97999e-06 [updatestate_assign_eliminate]: 3.14999e-06 [updatestate_loads_eliminate]: 3.48e-06 [renormalize]: 4.2998e-07 [cse]: 2.553e-05 [optimize_parallel_all_gather_comm]: 1.747e-05 [overlap_param_gather]: 2.24999e-06 [cconv]: 2.348e-05 [loop_unroll]: 0.00042577 [opt_after_cconv]: 0.00010682, [1] [Cycle 1]: 0.00010128, [7] [c_1]: 2.967e-05 [parameter_eliminate]: 2.31e-06 [updatestate_depend_eliminate]: 5.77999e-06 [updatestate_assign_eliminate]: 3.41001e-06 [updatestate_loads_eliminate]: 2.86e-06 [cse]: 2.398e-05 [renormalize]: 5.69999e-07 [remove_dup_value]: 2.913e-05 [tuple_transform]: 9.342e-05, [1] [Cycle 1]: 8.93e-05, [4] [d_1]: 6.026e-05 [none_parameter_eliminate]: 2.00002e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.6e-06 [partial_unused_args_eliminate]: 2.00002e-06 [add_recomputation]: 4.933e-05 [cse_after_recomputation]: 2.649e-05, [1] [Cycle 1]: 2.195e-05, [1] [cse]: 1.657e-05 [environ_conv]: 8.79e-06 [swap_dp_allreduce_reducescatter]: 6.59001e-06 [bias_add_comm_swap]: 2.48e-06 [label_micro_interleaved_index]: 4.50001e-06 [label_fine_grained_interleaved_index]: 2.59999e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 2.31e-06 [micro_interleaved_order_control]: 2.08002e-06 [assign_add_opt]: 1.20001e-06 [ForceFp32Comm]: 7.79983e-07 [remove_cast_before_assign_add]: 1.14e-06 [full_micro_interleaved_order_control]: 2.69999e-06 [reorder_send_recv_between_fp_bp]: 2.68003e-06 [comm_op_add_attrs]: 1.15001e-06 [add_comm_op_reuse_tag]: 1.17e-06 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.09998e-06 [overlap_opt_shard_in_pipeline]: 9.00007e-07 [overlap_opt_shard_grad_in_pipeline]: 1.96998e-06 [control_data_broadcast_order]: 1.598e-05 [grouped_pairwise_exchange_alltoall]: 1.62001e-06 [offloading_packed_experts]: 4.09002e-06 [overlap_recompute_and_grad_model_parallel]: 5.04998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.45002e-06 [overlap_grad_ring_attention]: 4.60001e-06 [overlap_grad_flash_sp]: 1.963e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.19001e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 8.532e-05, [1] [Cycle 1]: 8.104e-05, [6] [build]: 9.94001e-06 [elim_shapecalc]: 1.059e-05 [elim_not_effective]: 1.449e-05 [opt_reshape]: 7.28999e-06 [fold_const_symbol]: 1.103e-05 [renormalize]: 1.99972e-07 [detach_backward]: 1.71e-06 [pipeline_parallel_scheduler]: 1.50001e-06 [auto_monad_reorder]: 2.045e-05 [get_jit_bprop_graph]: 1.12999e-06 [rewriter_after_jit_bprop_graph]: 3.3e-06 [opt_after_jit_grad]: 0.00055299 [validate]: 3.995e-05 [backend_pass]: 1.07e-06 [task_emit]: 0.0429245 [execute]: 9.32001e-06 Sums bootstrap : 0.000449s : 0.58% type_inference : 0.028175s : 36.51% event_method : 0.000091s : 0.12% auto_monad : 0.000162s : 0.21% graph_reusing : 0.000011s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000038s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000224s : 0.29% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000088s : 0.11% optimize.opt_a.loop_unroll : 0.000045s : 0.06% optimize.opt_a.a_1 : 0.000885s : 1.15% optimize.opt_a.with_stream_mark : 0.000026s : 0.03% optimize.opt_a.recompute_prepare : 0.000016s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000177s : 0.23% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000014s : 0.02% optimize.opt_a.auto_parallel : 0.000012s : 0.02% optimize.opt_a.parallel : 0.000021s : 0.03% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.02% optimize.opt_a.virtual_output : 0.000012s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000016s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.02% optimize.opt_a.a_after_grad : 0.000019s : 0.02% optimize.opt_a.renormalize : 0.001097s : 1.42% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.03% optimize.opt_a.cse : 0.000063s : 0.08% optimize.opt_a.a_3 : 0.000088s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000022s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000457s : 0.59% optimize.opt_b.b_1 : 0.000154s : 0.20% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000026s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.03% optimize.loop_unroll : 0.000426s : 0.55% optimize.opt_after_cconv.c_1 : 0.000030s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000029s : 0.04% optimize.tuple_transform.d_1 : 0.000060s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000049s : 0.06% optimize.cse_after_recomputation.cse : 0.000017s : 0.02% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.03% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000553s : 0.72% validate : 0.000040s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.042924s : 55.62% execute : 0.000009s : 0.01% Time group info: ------[substitution.] 0.000273 62 0.88% : 0.000002s : 3: substitution.elim_not_effective 1.98% : 0.000005s : 3: substitution.float_tuple_getitem_switch 0.73% : 0.000002s : 3: substitution.fold_const_symbol 2.18% : 0.000006s : 4: substitution.graph_param_transform 60.32% : 0.000164s : 8: substitution.inline 1.35% : 0.000004s : 6: substitution.j_node_and_user_rematch 1.55% : 0.000004s : 2: substitution.minmaximum_grad 2.02% : 0.000006s : 6: substitution.remove_not_recompute_node 1.11% : 0.000003s : 2: substitution.replace_old_param 2.57% : 0.000007s : 1: substitution.switch_simplify 5.27% : 0.000014s : 4: substitution.tuple_list_convert_item_index_to_positive 2.46% : 0.000007s : 4: substitution.tuple_list_get_item_const_eliminator 3.71% : 0.000010s : 4: substitution.tuple_list_get_item_depend_reorder 10.55% : 0.000029s : 8: substitution.tuple_list_get_item_eliminator 3.32% : 0.000009s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.028111 2 94.06% : 0.026441s : 1: type_inference.infer 5.94% : 0.001670s : 1: type_inference.specialize ------[replace.] 0.000082 11 64.29% : 0.000053s : 8: replace.inline 17.45% : 0.000014s : 1: replace.switch_simplify 18.26% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000170 11 94.21% : 0.000160s : 8: match.inline 3.61% : 0.000006s : 1: match.switch_simplify 2.18% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000229 1438 0.93% : 0.000002s : 16: predicate.accumulaten_eliminater 0.98% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 8: predicate.addn_check_dump 1.02% : 0.000002s : 16: predicate.addn_zero_filter 0.92% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.16% : 0.000005s : 24: predicate.arithmetic_simplify 1.06% : 0.000002s : 16: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.46% : 0.000001s : 8: predicate.depend_value_elim 1.09% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.32% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.91% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.80% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000001s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_depend_swap 1.86% : 0.000004s : 28: predicate.environ_get_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.60% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.41% : 0.000006s : 26: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.84% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.52% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 5.94% : 0.000014s : 66: predicate.inline 0.66% : 0.000001s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.72% : 0.000002s : 8: predicate.less_batch_normalization 1.72% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.45% : 0.000006s : 42: predicate.load_eliminater 0.82% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.72% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.48% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.02% : 0.000002s : 16: predicate.minmaximum_grad 0.99% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.31% : 0.000001s : 4: predicate.parallel_virtual_node 1.96% : 0.000004s : 26: predicate.partial_defer_inline 1.42% : 0.000003s : 22: predicate.partial_eliminate 0.96% : 0.000002s : 16: predicate.print_const_string_wrapper 0.48% : 0.000001s : 8: predicate.reduce_all_const_elim 1.36% : 0.000003s : 16: predicate.reduce_eliminate 2.52% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000001s : 8: predicate.remove_not_recompute_node 1.16% : 0.000003s : 26: predicate.replace_applicator 0.44% : 0.000001s : 8: predicate.replace_old_param 0.18% : 0.000000s : 4: predicate.reset_defer_inline 1.12% : 0.000003s : 16: predicate.reshape_eliminate 0.56% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 4: predicate.row_tensor_eliminate 0.92% : 0.000002s : 8: predicate.same_eliminate 0.35% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 8: predicate.shard_identity_eliminate 0.70% : 0.000002s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 0.70% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.80% : 0.000004s : 26: predicate.switch_defer_inline 2.24% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.78% : 0.000013s : 86: predicate.switch_simplify 1.03% : 0.000002s : 16: predicate.tile_eliminate 0.98% : 0.000002s : 16: predicate.transpose_eliminate 1.79% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.86% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000007s : 34: predicate.tuple_list_get_item_eliminator 1.66% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.65% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.42% : 0.000006s : 42: predicate.updatestate_pure_node_eliminater 3.04% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.52% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001389 23 57.60% : 0.000800s : 11: func_graph_cloner_run.FuncGraphClonerGraph 42.40% : 0.000589s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.092376 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.28% : 0.003032s : 1: add_attr 3.27% : 0.003023s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000053s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.19% : 0.000171s : 1: auto_monad 0.03% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.03% : 0.000027s : 1: bias_add_comm_swap 0.52% : 0.000480s : 1: bootstrap 0.03% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.11% : 0.000099s : 1: event_method 0.02% : 0.000015s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000016s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.47% : 0.000434s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.50% : 0.000466s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 1.50% : 0.001382s : 78: opt.transform.opt_a 0.03% : 0.000029s : 1: opt.transform.opt_after_cconv 0.03% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000134s : 28: opt.transform.opt_b 0.07% : 0.000066s : 2: opt.transform.opt_trans_graph 0.04% : 0.000040s : 4: opt.transform.symbol_engine_opt 3.56% : 0.003291s : 1: opt_a 0.12% : 0.000110s : 1: opt_after_cconv 0.61% : 0.000563s : 1: opt_after_jit_grad 0.27% : 0.000245s : 1: opt_b 5.90% : 0.005453s : 1: optimize 0.02% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000023s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000043s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000033s : 1: remove_dup_value 0.62% : 0.000572s : 1: renormalize.infer 0.56% : 0.000517s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000026s : 1: rewriter_after_opt_a 0.25% : 0.000230s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000088s : 1: symbol_engine_optimizer 46.49% : 0.042942s : 1: task_emit 0.10% : 0.000096s : 1: tuple_transform 30.51% : 0.028188s : 1: type_inference 0.07% : 0.000066s : 1: validate TotalTime = 2.31, [24] [bootstrap]: 0.00045825 [type_inference]: 0.0490263 [event_method]: 0.00020174 [auto_monad]: 0.00023807 [graph_reusing]: 1.882e-05 [inline]: 1.82999e-06 [add_attr]: 0.00322678, [1] [add_attr_with_inline]: 0.00321822, [1] [Cycle 1]: 8.543e-05, [2] [tag_attr]: 4.515e-05 [meta_addattr_fg_expand]: 1.244e-05 [parallel-infer-symbol]: 3.35998e-06 [pre_auto_parallel]: 6.718e-05 [insert-virtual-dataset]: 2.37001e-06 [parallel-infer-symbol-second]: 1.40001e-06 [dataset_repeat_opt]: 1.78002e-06 [pipeline_split]: 1.74998e-06 [optimize]: 0.041322, [53] [py_interpret_to_execute]: 5.30999e-06 [rewriter_before_opt_a]: 0.00037244 [opt_a]: 0.0383258, [3] [Cycle 1]: 0.0321928, [45] [expand_dump_flag]: 4.97e-06 [switch_simplify]: 0.00015908 [loop_unroll]: 7.248e-05 [a_1]: 0.00153245 [with_stream_mark]: 2.316e-05 [recompute_prepare]: 2.138e-05 [updatestate_depend_eliminate]: 9.27999e-06 [updatestate_assign_eliminate]: 8.47998e-06 [updatestate_loads_eliminate]: 7.73001e-06 [parameter_eliminate]: 2.61999e-06 [a_2]: 0.00022005 [accelerated_algorithm]: 1.452e-05 [shard]: 1.74e-06 [meta_shard_fg_expand]: 4.35e-06 [shard_inline]: 1.432e-05 [merge_send_recv]: 1.729e-05 [auto_parallel]: 1.116e-05 [parallel]: 1.715e-05 [flash_sp]: 9.46e-06 [merge_comm]: 9.51e-06 [allreduce_fusion]: 8.69e-06 [matmul_add_comm_reduction]: 2.604e-05 [allreduce_slice_to_reducescatter]: 6.39993e-07 [virtual_shard_identity]: 1.564e-05 [virtual_dataset]: 1.451e-05 [get_grad_eliminate_]: 1.394e-05 [virtual_output]: 1.385e-05 [merge_forward]: 9.77999e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 1.693e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.633e-05 [merge_recompute_call_nodes]: 1.86998e-06 [before_grad]: 2.562e-05 [set_forward_comm_id_for_comm_node_pass]: 9.57001e-06 [meta_fg_expand]: 0.00448807 [flash_sp_send_recv_attached]: 4.02e-06 [receive_attached]: 2.37001e-06 [after_resolve]: 9.994e-05 [a_after_grad]: 0.00012635 [renormalize]: 0.0229491 [add_forward_monad_depend]: 1.557e-05 [auto_monad_grad]: 1.318e-05 [auto_monad_eliminator]: 0.00011358 [cse]: 0.00033473 [a_3]: 0.00141168 [Cycle 2]: 0.00487237, [45] [expand_dump_flag]: 2.72001e-06 [switch_simplify]: 0.00010061 [loop_unroll]: 9.335e-05 [a_1]: 0.00172228 [with_stream_mark]: 1.896e-05 [recompute_prepare]: 1.607e-05 [updatestate_depend_eliminate]: 8.09002e-06 [updatestate_assign_eliminate]: 7.30998e-06 [updatestate_loads_eliminate]: 6.71e-06 [parameter_eliminate]: 1.07e-06 [a_2]: 0.0001967 [accelerated_algorithm]: 1.407e-05 [shard]: 9.89996e-07 [meta_shard_fg_expand]: 3.40998e-06 [shard_inline]: 1.388e-05 [merge_send_recv]: 1.032e-05 [auto_parallel]: 1.11e-05 [parallel]: 4.57e-06 [flash_sp]: 3.14001e-06 [merge_comm]: 8.35001e-06 [allreduce_fusion]: 7.88001e-06 [matmul_add_comm_reduction]: 1.092e-05 [allreduce_slice_to_reducescatter]: 4.90021e-07 [virtual_shard_identity]: 1.437e-05 [virtual_dataset]: 1.323e-05 [get_grad_eliminate_]: 1.295e-05 [virtual_output]: 1.285e-05 [merge_forward]: 7.38e-06 [cell_reuse_recompute_pass]: 9.79984e-07 [offload_activation]: 1.192e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.423e-05 [merge_recompute_call_nodes]: 6.59988e-07 [before_grad]: 2.28e-05 [set_forward_comm_id_for_comm_node_pass]: 8.2e-06 [meta_fg_expand]: 0.00016541 [flash_sp_send_recv_attached]: 1.29e-06 [receive_attached]: 1.17e-06 [after_resolve]: 2.022e-05 [a_after_grad]: 2.096e-05 [renormalize]: 0.00171824 [add_forward_monad_depend]: 4.4e-06 [auto_monad_grad]: 1.15001e-06 [auto_monad_eliminator]: 2.264e-05 [cse]: 0.00016515 [a_3]: 0.00010085 [Cycle 3]: 0.00124595, [45] [expand_dump_flag]: 1.39e-06 [switch_simplify]: 1.516e-05 [loop_unroll]: 1.337e-05 [a_1]: 0.00035621 [with_stream_mark]: 1.534e-05 [recompute_prepare]: 1.383e-05 [updatestate_depend_eliminate]: 8.26002e-06 [updatestate_assign_eliminate]: 7.08e-06 [updatestate_loads_eliminate]: 7.16001e-06 [parameter_eliminate]: 9.90025e-07 [a_2]: 0.00019382 [accelerated_algorithm]: 1.35e-05 [shard]: 1.20999e-06 [meta_shard_fg_expand]: 3.05998e-06 [shard_inline]: 1.328e-05 [merge_send_recv]: 1.016e-05 [auto_parallel]: 1.035e-05 [parallel]: 4.16001e-06 [flash_sp]: 1.02e-06 [merge_comm]: 7.98999e-06 [allreduce_fusion]: 7.83001e-06 [matmul_add_comm_reduction]: 1.118e-05 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 1.406e-05 [virtual_dataset]: 1.308e-05 [get_grad_eliminate_]: 1.297e-05 [virtual_output]: 1.277e-05 [merge_forward]: 7.46999e-06 [cell_reuse_recompute_pass]: 1.72001e-06 [offload_activation]: 1.175e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.385e-05 [merge_recompute_call_nodes]: 6.50005e-07 [before_grad]: 2.144e-05 [set_forward_comm_id_for_comm_node_pass]: 8.27e-06 [meta_fg_expand]: 5.35999e-06 [flash_sp_send_recv_attached]: 8.99978e-07 [receive_attached]: 9.99979e-07 [after_resolve]: 1.551e-05 [a_after_grad]: 1.952e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.38002e-06 [auto_monad_grad]: 1.22e-06 [auto_monad_eliminator]: 1.636e-05 [cse]: 5.197e-05 [a_3]: 8.79e-05 [py_interpret_to_execute_after_opt_a]: 4.21001e-06 [slice_cell_reuse_recomputed_activation]: 2.16e-06 [rewriter_after_opt_a]: 4.609e-05 [convert_after_rewriter]: 1.29e-06 [order_py_execute_after_rewriter]: 1.07998e-06 [mutable_eliminate]: 0.00050498 [opt_b]: 0.00049204, [1] [Cycle 1]: 0.00048561, [7] [b_1]: 0.00034954 [b_2]: 1.505e-05 [updatestate_depend_eliminate]: 1.064e-05 [updatestate_assign_eliminate]: 7.26001e-06 [updatestate_loads_eliminate]: 6.96001e-06 [renormalize]: 4.00003e-07 [cse]: 5.783e-05 [optimize_parallel_all_gather_comm]: 2.62e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 2.173e-05 [loop_unroll]: 0.00045166 [opt_after_cconv]: 0.00019326, [1] [Cycle 1]: 0.00018742, [7] [c_1]: 6.6e-05 [parameter_eliminate]: 2.31e-06 [updatestate_depend_eliminate]: 9.92999e-06 [updatestate_assign_eliminate]: 7.1e-06 [updatestate_loads_eliminate]: 8.03999e-06 [cse]: 5.929e-05 [renormalize]: 6.19999e-07 [remove_dup_value]: 8.223e-05 [tuple_transform]: 0.00016876, [1] [Cycle 1]: 0.00016361, [4] [d_1]: 0.00012501 [none_parameter_eliminate]: 2.14e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 1.57e-05 [partial_unused_args_eliminate]: 1.69998e-06 [add_recomputation]: 6.942e-05 [cse_after_recomputation]: 5.49e-05, [1] [Cycle 1]: 4.988e-05, [1] [cse]: 4.37e-05 [environ_conv]: 1.228e-05 [swap_dp_allreduce_reducescatter]: 1.187e-05 [bias_add_comm_swap]: 2.48002e-06 [label_micro_interleaved_index]: 4.18001e-06 [label_fine_grained_interleaved_index]: 2.56e-06 [merge_cast_opt]: 1.33002e-06 [slice_recompute_activation]: 2.22999e-06 [micro_interleaved_order_control]: 2.21e-06 [assign_add_opt]: 1.23002e-06 [ForceFp32Comm]: 7.60017e-07 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.44001e-06 [reorder_send_recv_between_fp_bp]: 3.02002e-06 [comm_op_add_attrs]: 1.39e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.07998e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.12999e-06 [control_data_broadcast_order]: 2.645e-05 [grouped_pairwise_exchange_alltoall]: 1.89e-06 [offloading_packed_experts]: 7.15e-06 [overlap_recompute_and_grad_model_parallel]: 8.06001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.26002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 2.22001e-06 [overlap_grad_ring_attention]: 7.23e-06 [overlap_grad_flash_sp]: 3.238e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.51e-06 [split_layernorm_comm]: 1.86e-06 [handle_group_info]: 1.00001e-06 [symbol_engine_optimizer]: 0.00012412, [1] [Cycle 1]: 0.00011902, [6] [build]: 1.077e-05 [elim_shapecalc]: 1.799e-05 [elim_not_effective]: 2.633e-05 [opt_reshape]: 1.453e-05 [fold_const_symbol]: 2.177e-05 [renormalize]: 2.3999e-07 [detach_backward]: 1.64998e-06 [pipeline_parallel_scheduler]: 1.49003e-06 [auto_monad_reorder]: 2.961e-05 [get_jit_bprop_graph]: 1.14003e-06 [rewriter_after_jit_bprop_graph]: 3.58999e-06 [opt_after_jit_grad]: 0.0005052 [validate]: 6.365e-05 [backend_pass]: 9.5999e-07 [task_emit]: 2.21458 [execute]: 8.77e-06 Sums bootstrap : 0.000458s : 0.02% type_inference : 0.049026s : 2.13% event_method : 0.000202s : 0.01% auto_monad : 0.000238s : 0.01% graph_reusing : 0.000019s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000045s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000067s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000372s : 0.02% optimize.opt_a.expand_dump_flag : 0.000009s : 0.00% optimize.opt_a.switch_simplify : 0.000275s : 0.01% optimize.opt_a.loop_unroll : 0.000179s : 0.01% optimize.opt_a.a_1 : 0.003611s : 0.16% optimize.opt_a.with_stream_mark : 0.000057s : 0.00% optimize.opt_a.recompute_prepare : 0.000051s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000026s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000023s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000022s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000611s : 0.03% optimize.opt_a.accelerated_algorithm : 0.000042s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000011s : 0.00% optimize.opt_a.shard_inline : 0.000041s : 0.00% optimize.opt_a.merge_send_recv : 0.000038s : 0.00% optimize.opt_a.auto_parallel : 0.000033s : 0.00% optimize.opt_a.parallel : 0.000026s : 0.00% optimize.opt_a.flash_sp : 0.000014s : 0.00% optimize.opt_a.merge_comm : 0.000026s : 0.00% optimize.opt_a.allreduce_fusion : 0.000024s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000048s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000044s : 0.00% optimize.opt_a.virtual_dataset : 0.000041s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000040s : 0.00% optimize.opt_a.virtual_output : 0.000039s : 0.00% optimize.opt_a.merge_forward : 0.000025s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000041s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000074s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000070s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000026s : 0.00% optimize.opt_a.meta_fg_expand : 0.004659s : 0.20% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000136s : 0.01% optimize.opt_a.a_after_grad : 0.000167s : 0.01% optimize.opt_a.renormalize : 0.024667s : 1.07% optimize.opt_a.add_forward_monad_depend : 0.000021s : 0.00% optimize.opt_a.auto_monad_grad : 0.000016s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000153s : 0.01% optimize.opt_a.cse : 0.000552s : 0.02% optimize.opt_a.a_3 : 0.001600s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000046s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000505s : 0.02% optimize.opt_b.b_1 : 0.000350s : 0.02% optimize.opt_b.b_2 : 0.000015s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000058s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000022s : 0.00% optimize.loop_unroll : 0.000452s : 0.02% optimize.opt_after_cconv.c_1 : 0.000066s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.cse : 0.000059s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000082s : 0.00% optimize.tuple_transform.d_1 : 0.000125s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000016s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000069s : 0.00% optimize.cse_after_recomputation.cse : 0.000044s : 0.00% optimize.environ_conv : 0.000012s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000012s : 0.00% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000026s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000032s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000026s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000015s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000022s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000030s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000505s : 0.02% validate : 0.000064s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.214578s : 96.06% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.001518 311 0.26% : 0.000004s : 8: substitution.elim_not_effective 0.53% : 0.000008s : 12: substitution.float_depend_g_call 0.75% : 0.000011s : 9: substitution.float_tuple_getitem_switch 0.20% : 0.000003s : 8: substitution.fold_const_symbol 33.67% : 0.000511s : 5: substitution.getattr_setattr_resolve 0.69% : 0.000010s : 10: substitution.graph_param_transform 0.18% : 0.000003s : 2: substitution.incorporate_call 0.13% : 0.000002s : 2: substitution.incorporate_call_switch 37.10% : 0.000563s : 24: substitution.inline 1.36% : 0.000021s : 3: substitution.inline_without_move 0.83% : 0.000013s : 25: substitution.j_node_and_user_rematch 1.18% : 0.000018s : 13: substitution.minmaximum_grad 0.61% : 0.000009s : 12: substitution.partial_eliminate 1.10% : 0.000017s : 25: substitution.remove_not_recompute_node 4.97% : 0.000075s : 32: substitution.replace_applicator 1.47% : 0.000022s : 14: substitution.replace_old_param 0.16% : 0.000002s : 1: substitution.set_cell_output_no_recompute 0.78% : 0.000012s : 4: substitution.switch_simplify 0.52% : 0.000008s : 2: substitution.transpose_eliminate 2.99% : 0.000045s : 17: substitution.tuple_list_convert_item_index_to_positive 1.47% : 0.000022s : 17: substitution.tuple_list_get_item_const_eliminator 1.93% : 0.000029s : 17: substitution.tuple_list_get_item_depend_reorder 5.22% : 0.000079s : 32: substitution.tuple_list_get_item_eliminator 1.92% : 0.000029s : 17: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.048921 2 91.51% : 0.044769s : 1: type_inference.infer 8.49% : 0.004152s : 1: type_inference.specialize ------[replace.] 0.000467 45 11.47% : 0.000054s : 4: replace.getattr_setattr_resolve 51.14% : 0.000239s : 24: replace.inline 14.76% : 0.000069s : 5: replace.replace_applicator 8.70% : 0.000041s : 4: replace.switch_simplify 13.94% : 0.000065s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.001076 45 43.56% : 0.000468s : 4: match.getattr_setattr_resolve 51.08% : 0.000549s : 24: match.inline 2.43% : 0.000026s : 5: match.replace_applicator 0.88% : 0.000009s : 4: match.switch_simplify 2.06% : 0.000022s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.001008 7110 0.87% : 0.000009s : 68: predicate.accumulaten_eliminater 0.32% : 0.000003s : 10: predicate.ad_related_special_op_eliminate 0.42% : 0.000004s : 32: predicate.addn_check_dump 0.91% : 0.000009s : 68: predicate.addn_zero_filter 0.87% : 0.000009s : 68: predicate.adjust_all_reduce_mul_add 1.88% : 0.000019s : 100: predicate.arithmetic_simplify 0.90% : 0.000009s : 68: predicate.cast_eliminate 2.80% : 0.000028s : 215: predicate.check_bprop_eliminate 0.43% : 0.000004s : 32: predicate.compare_switch_simplify 0.08% : 0.000001s : 10: predicate.const_output_eliminate 0.43% : 0.000004s : 32: predicate.depend_value_elim 1.04% : 0.000010s : 68: predicate.dict_get_item_const_eliminator 1.05% : 0.000011s : 68: predicate.dict_get_item_eliminator 0.90% : 0.000009s : 68: predicate.dict_set_item_eliminator 0.36% : 0.000004s : 20: predicate.dumpgradient_eliminate 0.10% : 0.000001s : 10: predicate.elim_not_effective 0.16% : 0.000002s : 10: predicate.elim_shapecalc_of_broadcastargs 1.08% : 0.000011s : 78: predicate.environ_add_const_eliminate 0.98% : 0.000010s : 78: predicate.environ_get_add_eliminate 1.01% : 0.000010s : 78: predicate.environ_get_depend_swap 1.47% : 0.000015s : 110: predicate.environ_get_eliminate 0.99% : 0.000010s : 78: predicate.environ_get_set_eliminate 1.33% : 0.000013s : 100: predicate.exchange_switch_depend_value 1.89% : 0.000019s : 100: predicate.float_depend_g_call 0.42% : 0.000004s : 32: predicate.float_environ_get_switch 0.62% : 0.000006s : 42: predicate.float_tuple_getitem_switch 0.08% : 0.000001s : 10: predicate.fold_const_symbol 0.47% : 0.000005s : 32: predicate.get_grad_eliminate 0.61% : 0.000006s : 31: predicate.getattr_setattr_resolve 0.09% : 0.000001s : 10: predicate.graph_param_transform 0.43% : 0.000004s : 32: predicate.incorporate_call 0.39% : 0.000004s : 32: predicate.incorporate_call_switch 4.50% : 0.000045s : 252: predicate.inline 1.45% : 0.000015s : 82: predicate.inline_without_move 0.23% : 0.000002s : 32: predicate.j_node_and_user_rematch 0.54% : 0.000005s : 32: predicate.less_batch_normalization 1.39% : 0.000014s : 96: predicate.list_to_tuple_eliminator_ 2.13% : 0.000021s : 164: predicate.load_eliminater 0.36% : 0.000004s : 10: predicate.loop_unroll_after_grad 2.62% : 0.000026s : 182: predicate.loop_unroll_before_grad 1.18% : 0.000012s : 88: predicate.make_slice_get_slice_eliminator 0.46% : 0.000005s : 32: predicate.merge_addn 2.67% : 0.000027s : 198: predicate.micro_step_allgather_replace 2.57% : 0.000026s : 198: predicate.mini_step_allgather_replace 0.92% : 0.000009s : 68: predicate.minmaximum_grad 0.36% : 0.000004s : 10: predicate.mutable_eliminate 0.17% : 0.000002s : 10: predicate.opt_reshape 0.16% : 0.000002s : 10: predicate.parallel_virtual_node 1.75% : 0.000018s : 100: predicate.partial_defer_inline 1.26% : 0.000013s : 86: predicate.partial_eliminate 0.89% : 0.000009s : 68: predicate.print_const_string_wrapper 0.44% : 0.000004s : 32: predicate.reduce_all_const_elim 1.26% : 0.000013s : 68: predicate.reduce_eliminate 2.06% : 0.000021s : 164: predicate.redundant_stop_gradient_eliminater 0.23% : 0.000002s : 32: predicate.remove_not_recompute_node 2.37% : 0.000024s : 284: predicate.replace_applicator 0.66% : 0.000007s : 82: predicate.replace_old_param 0.08% : 0.000001s : 10: predicate.reset_defer_inline 0.87% : 0.000009s : 68: predicate.reshape_eliminate 2.64% : 0.000027s : 198: predicate.row_tensor_add_zeros_like 0.16% : 0.000002s : 10: predicate.row_tensor_eliminate 2.97% : 0.000030s : 215: predicate.same_eliminate 0.28% : 0.000003s : 32: predicate.set_cell_output_no_recompute 0.52% : 0.000005s : 32: predicate.shard_identity_eliminate 0.35% : 0.000004s : 20: predicate.special_op_eliminate 0.50% : 0.000005s : 32: predicate.specialize_transform 2.70% : 0.000027s : 198: predicate.split_environ_get_set_with_tuple_value 1.29% : 0.000013s : 82: predicate.stack_unstack_eliminate 0.15% : 0.000002s : 10: predicate.switch_call_monad_eliminater 1.54% : 0.000016s : 100: predicate.switch_defer_inline 4.23% : 0.000043s : 315: predicate.switch_layer_defer_inline 5.07% : 0.000051s : 332: predicate.switch_simplify 0.91% : 0.000009s : 68: predicate.tile_eliminate 0.92% : 0.000009s : 68: predicate.transpose_eliminate 1.34% : 0.000014s : 88: predicate.tuple_list_convert_item_index_to_positive 1.38% : 0.000014s : 88: predicate.tuple_list_get_item_const_eliminator 1.26% : 0.000013s : 88: predicate.tuple_list_get_item_depend_reorder 2.33% : 0.000023s : 128: predicate.tuple_list_get_item_eliminator 1.30% : 0.000013s : 88: predicate.tuple_list_get_set_item_eliminator 1.87% : 0.000019s : 120: predicate.tuple_list_set_item_eliminator 1.31% : 0.000013s : 96: predicate.tuple_to_list_eliminator_ 2.08% : 0.000021s : 164: predicate.updatestate_pure_node_eliminater 2.52% : 0.000025s : 196: predicate.updatestate_useless_node_eliminater 0.15% : 0.000002s : 10: predicate.value_based_eliminate 0.48% : 0.000005s : 32: predicate.virtual_dataset_eliminate 0.47% : 0.000005s : 32: predicate.virtual_output_eliminate 0.15% : 0.000002s : 10: predicate.virtual_view_grad_eliminate 0.17% : 0.000002s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.006008 75 61.43% : 0.003691s : 36: func_graph_cloner_run.FuncGraphClonerGraph 38.57% : 0.002317s : 39: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.387313 247 0.00% : 0.000003s : 1: ForceFp32Comm 0.14% : 0.003231s : 1: add_attr 0.13% : 0.003223s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000074s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000249s : 1: auto_monad 0.00% : 0.000033s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.02% : 0.000490s : 1: bootstrap 0.00% : 0.000025s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000030s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000058s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000016s : 1: environ_conv 0.01% : 0.000212s : 1: event_method 0.00% : 0.000016s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000023s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.02% : 0.000460s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000513s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000025s : 1: opt.transform.mutable_eliminate 0.29% : 0.006873s : 125: opt.transform.opt_a 0.00% : 0.000065s : 1: opt.transform.opt_after_cconv 0.00% : 0.000047s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000335s : 28: opt.transform.opt_b 0.03% : 0.000619s : 2: opt.transform.opt_resolve 0.01% : 0.000138s : 2: opt.transform.opt_trans_graph 0.00% : 0.000077s : 4: opt.transform.symbol_engine_opt 1.61% : 0.038329s : 1: opt_a 0.01% : 0.000197s : 1: opt_after_cconv 0.02% : 0.000515s : 1: opt_after_jit_grad 0.02% : 0.000495s : 1: opt_b 1.73% : 0.041327s : 1: optimize 0.00% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000036s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000072s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000087s : 1: remove_dup_value 0.84% : 0.020095s : 2: renormalize.infer 0.19% : 0.004557s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000050s : 1: rewriter_after_opt_a 0.02% : 0.000379s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000015s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000127s : 1: symbol_engine_optimizer 92.77% : 2.214602s : 1: task_emit 0.01% : 0.000172s : 1: tuple_transform 2.05% : 0.049042s : 1: type_inference 0.00% : 0.000091s : 1: validate TotalTime = 0.0758541, [24] [bootstrap]: 0.00050935 [type_inference]: 0.0293374 [event_method]: 0.00010238 [auto_monad]: 0.00016572 [graph_reusing]: 1.126e-05 [inline]: 1.74e-06 [add_attr]: 0.00308758, [1] [add_attr_with_inline]: 0.00307944, [1] [Cycle 1]: 6.255e-05, [2] [tag_attr]: 2.695e-05 [meta_addattr_fg_expand]: 7.26999e-06 [parallel-infer-symbol]: 3.18e-06 [pre_auto_parallel]: 3.795e-05 [insert-virtual-dataset]: 3.10998e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 2.04e-06 [optimize]: 0.00541656, [53] [py_interpret_to_execute]: 4.05e-06 [rewriter_before_opt_a]: 0.00022598 [opt_a]: 0.00329606, [2] [Cycle 1]: 0.00261912, [45] [expand_dump_flag]: 4.07e-06 [switch_simplify]: 8.259e-05 [loop_unroll]: 3.915e-05 [a_1]: 0.00074776 [with_stream_mark]: 1.481e-05 [recompute_prepare]: 9.05001e-06 [updatestate_depend_eliminate]: 4.94e-06 [updatestate_assign_eliminate]: 3.91999e-06 [updatestate_loads_eliminate]: 3.97e-06 [parameter_eliminate]: 1.91998e-06 [a_2]: 9.549e-05 [accelerated_algorithm]: 2.168e-05 [shard]: 1.72999e-06 [meta_shard_fg_expand]: 2.32001e-06 [shard_inline]: 6.72002e-06 [merge_send_recv]: 9.02e-06 [auto_parallel]: 6.39001e-06 [parallel]: 1.764e-05 [flash_sp]: 7.42002e-06 [merge_comm]: 4.38001e-06 [allreduce_fusion]: 4e-06 [matmul_add_comm_reduction]: 9.67999e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 7.88001e-06 [virtual_dataset]: 6.73e-06 [get_grad_eliminate_]: 6.46e-06 [virtual_output]: 6.84001e-06 [merge_forward]: 4.36002e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 1.031e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.328e-05 [merge_recompute_call_nodes]: 1.70001e-06 [before_grad]: 1.116e-05 [set_forward_comm_id_for_comm_node_pass]: 4.3e-06 [meta_fg_expand]: 3.36999e-06 [flash_sp_send_recv_attached]: 2.30002e-06 [receive_attached]: 2.07001e-06 [after_resolve]: 9.57001e-06 [a_after_grad]: 9.82001e-06 [renormalize]: 0.0010737 [add_forward_monad_depend]: 5.37001e-06 [auto_monad_grad]: 1.72001e-06 [auto_monad_eliminator]: 1.727e-05 [cse]: 3.804e-05 [a_3]: 4.947e-05 [Cycle 2]: 0.0006676, [45] [expand_dump_flag]: 1.10999e-06 [switch_simplify]: 8.17998e-06 [loop_unroll]: 6.84999e-06 [a_1]: 0.00014965 [with_stream_mark]: 1.197e-05 [recompute_prepare]: 6.74999e-06 [updatestate_depend_eliminate]: 3.75e-06 [updatestate_assign_eliminate]: 3.01001e-06 [updatestate_loads_eliminate]: 3.04999e-06 [parameter_eliminate]: 9.29984e-07 [a_2]: 8.487e-05 [accelerated_algorithm]: 6.56e-06 [shard]: 1.02e-06 [meta_shard_fg_expand]: 1.50999e-06 [shard_inline]: 6.58e-06 [merge_send_recv]: 5.28002e-06 [auto_parallel]: 5.81e-06 [parallel]: 4.42e-06 [flash_sp]: 3.28998e-06 [merge_comm]: 3.63e-06 [allreduce_fusion]: 3.41999e-06 [matmul_add_comm_reduction]: 6.11e-06 [allreduce_slice_to_reducescatter]: 4.40021e-07 [virtual_shard_identity]: 6.63e-06 [virtual_dataset]: 6.25002e-06 [get_grad_eliminate_]: 6.37001e-06 [virtual_output]: 6.21e-06 [merge_forward]: 3.23998e-06 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 6.16998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.156e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 9.79999e-06 [set_forward_comm_id_for_comm_node_pass]: 4.07e-06 [meta_fg_expand]: 2.53998e-06 [flash_sp_send_recv_attached]: 7.99977e-07 [receive_attached]: 9.5999e-07 [after_resolve]: 9.24e-06 [a_after_grad]: 9.51998e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.20001e-06 [auto_monad_grad]: 9.70002e-07 [auto_monad_eliminator]: 7.68001e-06 [cse]: 1.826e-05 [a_3]: 3.828e-05 [py_interpret_to_execute_after_opt_a]: 4.15e-06 [slice_cell_reuse_recomputed_activation]: 1.81e-06 [rewriter_after_opt_a]: 2.216e-05 [convert_after_rewriter]: 1.22e-06 [order_py_execute_after_rewriter]: 1.10001e-06 [mutable_eliminate]: 0.00045489 [opt_b]: 0.00023794, [1] [Cycle 1]: 0.00023204, [7] [b_1]: 0.00015379 [b_2]: 8.59998e-06 [updatestate_depend_eliminate]: 5.96e-06 [updatestate_assign_eliminate]: 3.25002e-06 [updatestate_loads_eliminate]: 2.83e-06 [renormalize]: 5.39992e-07 [cse]: 2.309e-05 [optimize_parallel_all_gather_comm]: 1.714e-05 [overlap_param_gather]: 2.04e-06 [cconv]: 2.277e-05 [loop_unroll]: 0.00041871 [opt_after_cconv]: 0.00010714, [1] [Cycle 1]: 0.00010188, [7] [c_1]: 3.112e-05 [parameter_eliminate]: 2.43e-06 [updatestate_depend_eliminate]: 5.94e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 3.11001e-06 [cse]: 2.388e-05 [renormalize]: 6.19999e-07 [remove_dup_value]: 1.698e-05 [tuple_transform]: 9.07e-05, [1] [Cycle 1]: 8.647e-05, [4] [d_1]: 5.853e-05 [none_parameter_eliminate]: 1.66e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.27002e-06 [partial_unused_args_eliminate]: 1.77001e-06 [add_recomputation]: 5.13e-05 [cse_after_recomputation]: 2.527e-05, [1] [Cycle 1]: 2.117e-05, [1] [cse]: 1.612e-05 [environ_conv]: 8.78001e-06 [swap_dp_allreduce_reducescatter]: 5.81e-06 [bias_add_comm_swap]: 2.34999e-06 [label_micro_interleaved_index]: 4.11001e-06 [label_fine_grained_interleaved_index]: 2.55002e-06 [merge_cast_opt]: 1.77999e-06 [slice_recompute_activation]: 2.27001e-06 [micro_interleaved_order_control]: 2.58e-06 [assign_add_opt]: 1.21002e-06 [ForceFp32Comm]: 7.40023e-07 [remove_cast_before_assign_add]: 1.24998e-06 [full_micro_interleaved_order_control]: 2.07001e-06 [reorder_send_recv_between_fp_bp]: 3.07002e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.35001e-06 [overlap_opt_shard_in_pipeline]: 9.99979e-07 [overlap_opt_shard_grad_in_pipeline]: 1.98002e-06 [control_data_broadcast_order]: 1.437e-05 [grouped_pairwise_exchange_alltoall]: 1.82001e-06 [offloading_packed_experts]: 4.2e-06 [overlap_recompute_and_grad_model_parallel]: 4.82e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.24001e-06 [overlap_grad_ring_attention]: 4.48999e-06 [overlap_grad_flash_sp]: 1.83e-05 [begin_end_overlap_inline]: 5.20027e-07 [split_matmul_comm_elemetwise]: 2.19001e-06 [split_layernorm_comm]: 1.57999e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 9.96e-05, [1] [Cycle 1]: 9.486e-05, [6] [build]: 2.383e-05 [elim_shapecalc]: 9.92999e-06 [elim_not_effective]: 1.402e-05 [opt_reshape]: 7.78001e-06 [fold_const_symbol]: 1.067e-05 [renormalize]: 1.80007e-07 [detach_backward]: 1.57999e-06 [pipeline_parallel_scheduler]: 1.34e-06 [auto_monad_reorder]: 4.271e-05 [get_jit_bprop_graph]: 1.18001e-06 [rewriter_after_jit_bprop_graph]: 3.98001e-06 [opt_after_jit_grad]: 0.00046503 [validate]: 4.048e-05 [backend_pass]: 1.12e-06 [task_emit]: 0.0363926 [execute]: 8.05999e-06 Sums bootstrap : 0.000509s : 0.71% type_inference : 0.029337s : 40.86% event_method : 0.000102s : 0.14% auto_monad : 0.000166s : 0.23% graph_reusing : 0.000011s : 0.02% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000038s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000226s : 0.31% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000091s : 0.13% optimize.opt_a.loop_unroll : 0.000046s : 0.06% optimize.opt_a.a_1 : 0.000897s : 1.25% optimize.opt_a.with_stream_mark : 0.000027s : 0.04% optimize.opt_a.recompute_prepare : 0.000016s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000180s : 0.25% optimize.opt_a.accelerated_algorithm : 0.000028s : 0.04% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000014s : 0.02% optimize.opt_a.auto_parallel : 0.000012s : 0.02% optimize.opt_a.parallel : 0.000022s : 0.03% optimize.opt_a.flash_sp : 0.000011s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000016s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.03% optimize.opt_a.a_after_grad : 0.000019s : 0.03% optimize.opt_a.renormalize : 0.001074s : 1.50% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.03% optimize.opt_a.cse : 0.000056s : 0.08% optimize.opt_a.a_3 : 0.000088s : 0.12% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000022s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000455s : 0.63% optimize.opt_b.b_1 : 0.000154s : 0.21% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.03% optimize.loop_unroll : 0.000419s : 0.58% optimize.opt_after_cconv.c_1 : 0.000031s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.02% optimize.tuple_transform.d_1 : 0.000059s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000051s : 0.07% optimize.cse_after_recomputation.cse : 0.000016s : 0.02% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000018s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000024s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000043s : 0.06% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000465s : 0.65% validate : 0.000040s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.036393s : 50.68% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.000274 62 0.78% : 0.000002s : 3: substitution.elim_not_effective 2.16% : 0.000006s : 3: substitution.float_tuple_getitem_switch 0.57% : 0.000002s : 3: substitution.fold_const_symbol 2.10% : 0.000006s : 4: substitution.graph_param_transform 60.92% : 0.000167s : 8: substitution.inline 1.39% : 0.000004s : 6: substitution.j_node_and_user_rematch 1.52% : 0.000004s : 2: substitution.minmaximum_grad 1.96% : 0.000005s : 6: substitution.remove_not_recompute_node 1.26% : 0.000003s : 2: substitution.replace_old_param 2.92% : 0.000008s : 1: substitution.switch_simplify 5.14% : 0.000014s : 4: substitution.tuple_list_convert_item_index_to_positive 2.40% : 0.000007s : 4: substitution.tuple_list_get_item_const_eliminator 3.44% : 0.000009s : 4: substitution.tuple_list_get_item_depend_reorder 10.24% : 0.000028s : 8: substitution.tuple_list_get_item_eliminator 3.19% : 0.000009s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.029274 2 94.07% : 0.027537s : 1: type_inference.infer 5.93% : 0.001736s : 1: type_inference.specialize ------[replace.] 0.000085 11 64.84% : 0.000055s : 8: replace.inline 17.04% : 0.000014s : 1: replace.switch_simplify 18.12% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000173 11 94.04% : 0.000162s : 8: match.inline 4.00% : 0.000007s : 1: match.switch_simplify 1.97% : 0.000003s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000227 1438 0.96% : 0.000002s : 16: predicate.accumulaten_eliminater 0.74% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 8: predicate.addn_check_dump 0.96% : 0.000002s : 16: predicate.addn_zero_filter 0.89% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.35% : 0.000005s : 24: predicate.arithmetic_simplify 1.02% : 0.000002s : 16: predicate.cast_eliminate 0.50% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.54% : 0.000001s : 8: predicate.depend_value_elim 1.10% : 0.000003s : 16: predicate.dict_get_item_const_eliminator 1.14% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.99% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.87% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.18% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.16% : 0.000003s : 20: predicate.environ_get_depend_swap 1.78% : 0.000004s : 28: predicate.environ_get_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.54% : 0.000003s : 26: predicate.exchange_switch_depend_value 2.50% : 0.000006s : 26: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.83% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 5.96% : 0.000014s : 66: predicate.inline 0.73% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 8: predicate.less_batch_normalization 1.75% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.49% : 0.000006s : 42: predicate.load_eliminater 0.81% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.83% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 8: predicate.merge_addn 0.45% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.48% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.97% : 0.000002s : 16: predicate.minmaximum_grad 1.00% : 0.000002s : 4: predicate.mutable_eliminate 0.30% : 0.000001s : 4: predicate.opt_reshape 0.31% : 0.000001s : 4: predicate.parallel_virtual_node 2.02% : 0.000005s : 26: predicate.partial_defer_inline 1.42% : 0.000003s : 22: predicate.partial_eliminate 1.07% : 0.000002s : 16: predicate.print_const_string_wrapper 0.49% : 0.000001s : 8: predicate.reduce_all_const_elim 1.48% : 0.000003s : 16: predicate.reduce_eliminate 2.43% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.29% : 0.000001s : 8: predicate.remove_not_recompute_node 1.22% : 0.000003s : 26: predicate.replace_applicator 0.32% : 0.000001s : 8: predicate.replace_old_param 0.19% : 0.000000s : 4: predicate.reset_defer_inline 1.01% : 0.000002s : 16: predicate.reshape_eliminate 0.54% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.62% : 0.000001s : 8: predicate.same_eliminate 0.37% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.66% : 0.000001s : 8: predicate.shard_identity_eliminate 0.70% : 0.000002s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 0.70% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.75% : 0.000004s : 26: predicate.switch_defer_inline 2.34% : 0.000005s : 34: predicate.switch_layer_defer_inline 6.09% : 0.000014s : 86: predicate.switch_simplify 0.96% : 0.000002s : 16: predicate.tile_eliminate 1.02% : 0.000002s : 16: predicate.transpose_eliminate 1.70% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.75% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.18% : 0.000007s : 34: predicate.tuple_list_get_item_eliminator 1.62% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.64% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.52% : 0.000006s : 42: predicate.updatestate_pure_node_eliminater 2.98% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001487 23 60.04% : 0.000893s : 11: func_graph_cloner_run.FuncGraphClonerGraph 39.96% : 0.000594s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.087083 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.55% : 0.003092s : 1: add_attr 3.54% : 0.003083s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.06% : 0.000055s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.20% : 0.000175s : 1: auto_monad 0.05% : 0.000047s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.63% : 0.000545s : 1: bootstrap 0.03% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.13% : 0.000112s : 1: event_method 0.02% : 0.000013s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000016s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000005s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.49% : 0.000427s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.53% : 0.000463s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 1.63% : 0.001417s : 78: opt.transform.opt_a 0.03% : 0.000030s : 1: opt.transform.opt_after_cconv 0.03% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000135s : 28: opt.transform.opt_b 0.07% : 0.000064s : 2: opt.transform.opt_trans_graph 0.04% : 0.000039s : 4: opt.transform.symbol_engine_opt 3.79% : 0.003299s : 1: opt_a 0.13% : 0.000110s : 1: opt_after_cconv 0.54% : 0.000474s : 1: opt_after_jit_grad 0.28% : 0.000241s : 1: opt_b 6.23% : 0.005421s : 1: optimize 0.02% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000021s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.05% : 0.000042s : 1: pre_auto_parallel 0.01% : 0.000007s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000020s : 1: remove_dup_value 0.62% : 0.000536s : 1: renormalize.infer 0.61% : 0.000530s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000025s : 1: rewriter_after_opt_a 0.27% : 0.000232s : 1: rewriter_before_opt_a 0.01% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.12% : 0.000102s : 1: symbol_engine_optimizer 41.81% : 0.036408s : 1: task_emit 0.11% : 0.000093s : 1: tuple_transform 33.70% : 0.029351s : 1: type_inference 0.08% : 0.000068s : 1: validate TotalTime = 0.169863, [24] [bootstrap]: 0.00045147 [type_inference]: 0.0465673 [event_method]: 0.00021618 [auto_monad]: 0.00024309 [graph_reusing]: 1.814e-05 [inline]: 2.26e-06 [add_attr]: 0.00313502, [1] [add_attr_with_inline]: 0.00312658, [1] [Cycle 1]: 0.00011405, [2] [tag_attr]: 7.144e-05 [meta_addattr_fg_expand]: 1.334e-05 [parallel-infer-symbol]: 3.04999e-06 [pre_auto_parallel]: 6.666e-05 [insert-virtual-dataset]: 2.59001e-06 [parallel-infer-symbol-second]: 1.20999e-06 [dataset_repeat_opt]: 2.12001e-06 [pipeline_split]: 1.50999e-06 [optimize]: 0.0414812, [53] [py_interpret_to_execute]: 5.21998e-06 [rewriter_before_opt_a]: 0.0003729 [opt_a]: 0.0384821, [3] [Cycle 1]: 0.0324044, [45] [expand_dump_flag]: 4.84998e-06 [switch_simplify]: 0.00016006 [loop_unroll]: 7.242e-05 [a_1]: 0.00153408 [with_stream_mark]: 2.33e-05 [recompute_prepare]: 2.056e-05 [updatestate_depend_eliminate]: 8.27e-06 [updatestate_assign_eliminate]: 8.66002e-06 [updatestate_loads_eliminate]: 7.86001e-06 [parameter_eliminate]: 3.01999e-06 [a_2]: 0.00021941 [accelerated_algorithm]: 1.472e-05 [shard]: 1.59998e-06 [meta_shard_fg_expand]: 4.58001e-06 [shard_inline]: 1.369e-05 [merge_send_recv]: 1.538e-05 [auto_parallel]: 1.062e-05 [parallel]: 1.748e-05 [flash_sp]: 9.70002e-06 [merge_comm]: 9.34998e-06 [allreduce_fusion]: 8.49998e-06 [matmul_add_comm_reduction]: 2.543e-05 [allreduce_slice_to_reducescatter]: 7.79983e-07 [virtual_shard_identity]: 1.589e-05 [virtual_dataset]: 1.446e-05 [get_grad_eliminate_]: 1.365e-05 [virtual_output]: 1.378e-05 [merge_forward]: 9.48997e-06 [cell_reuse_recompute_pass]: 1.17999e-06 [offload_activation]: 1.618e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.647e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 2.541e-05 [set_forward_comm_id_for_comm_node_pass]: 8.94e-06 [meta_fg_expand]: 0.0044906 [flash_sp_send_recv_attached]: 3.81999e-06 [receive_attached]: 2.24001e-06 [after_resolve]: 8.792e-05 [a_after_grad]: 0.00012752 [renormalize]: 0.0231664 [add_forward_monad_depend]: 1.565e-05 [auto_monad_grad]: 1.384e-05 [auto_monad_eliminator]: 0.00011321 [cse]: 0.00033523 [a_3]: 0.00141485 [Cycle 2]: 0.00480718, [45] [expand_dump_flag]: 2.73003e-06 [switch_simplify]: 8.64e-05 [loop_unroll]: 8.299e-05 [a_1]: 0.0017012 [with_stream_mark]: 1.932e-05 [recompute_prepare]: 1.552e-05 [updatestate_depend_eliminate]: 8.60999e-06 [updatestate_assign_eliminate]: 7.55e-06 [updatestate_loads_eliminate]: 6.72002e-06 [parameter_eliminate]: 1.24998e-06 [a_2]: 0.00019659 [accelerated_algorithm]: 1.379e-05 [shard]: 9.80013e-07 [meta_shard_fg_expand]: 3.40998e-06 [shard_inline]: 1.305e-05 [merge_send_recv]: 1.102e-05 [auto_parallel]: 1.159e-05 [parallel]: 4.23999e-06 [flash_sp]: 3.29001e-06 [merge_comm]: 8.90001e-06 [allreduce_fusion]: 7.87003e-06 [matmul_add_comm_reduction]: 1.083e-05 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 1.425e-05 [virtual_dataset]: 1.368e-05 [get_grad_eliminate_]: 1.285e-05 [virtual_output]: 1.277e-05 [merge_forward]: 7.35e-06 [cell_reuse_recompute_pass]: 9.10019e-07 [offload_activation]: 1.138e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.453e-05 [merge_recompute_call_nodes]: 6.90023e-07 [before_grad]: 2.239e-05 [set_forward_comm_id_for_comm_node_pass]: 8.80999e-06 [meta_fg_expand]: 0.00016881 [flash_sp_send_recv_attached]: 1.26997e-06 [receive_attached]: 1.72999e-06 [after_resolve]: 2.051e-05 [a_after_grad]: 2.123e-05 [renormalize]: 0.00168953 [add_forward_monad_depend]: 4.48999e-06 [auto_monad_grad]: 1.20999e-06 [auto_monad_eliminator]: 2.28e-05 [cse]: 0.00017312 [a_3]: 0.00010116 [Cycle 3]: 0.00125594, [45] [expand_dump_flag]: 1.38002e-06 [switch_simplify]: 1.6e-05 [loop_unroll]: 1.342e-05 [a_1]: 0.00035878 [with_stream_mark]: 1.495e-05 [recompute_prepare]: 1.343e-05 [updatestate_depend_eliminate]: 8.62e-06 [updatestate_assign_eliminate]: 7.08e-06 [updatestate_loads_eliminate]: 6.88998e-06 [parameter_eliminate]: 9.89996e-07 [a_2]: 0.00019645 [accelerated_algorithm]: 1.371e-05 [shard]: 1.14e-06 [meta_shard_fg_expand]: 2.99001e-06 [shard_inline]: 1.323e-05 [merge_send_recv]: 9.77999e-06 [auto_parallel]: 1.119e-05 [parallel]: 4.13999e-06 [flash_sp]: 1.04e-06 [merge_comm]: 8.15999e-06 [allreduce_fusion]: 7.8e-06 [matmul_add_comm_reduction]: 1.094e-05 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 1.357e-05 [virtual_dataset]: 1.298e-05 [get_grad_eliminate_]: 1.249e-05 [virtual_output]: 1.326e-05 [merge_forward]: 7.15998e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 1.138e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.463e-05 [merge_recompute_call_nodes]: 6.59988e-07 [before_grad]: 2.182e-05 [set_forward_comm_id_for_comm_node_pass]: 8.21002e-06 [meta_fg_expand]: 5.34998e-06 [flash_sp_send_recv_attached]: 8.79983e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 1.54e-05 [a_after_grad]: 1.966e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.39e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 1.706e-05 [cse]: 5.686e-05 [a_3]: 9.029e-05 [py_interpret_to_execute_after_opt_a]: 4.47e-06 [slice_cell_reuse_recomputed_activation]: 2.17999e-06 [rewriter_after_opt_a]: 4.471e-05 [convert_after_rewriter]: 1.38002e-06 [order_py_execute_after_rewriter]: 1.37999e-06 [mutable_eliminate]: 0.00050767 [opt_b]: 0.00049307, [1] [Cycle 1]: 0.00048661, [7] [b_1]: 0.00035118 [b_2]: 1.612e-05 [updatestate_depend_eliminate]: 1.038e-05 [updatestate_assign_eliminate]: 6.97002e-06 [updatestate_loads_eliminate]: 6.96001e-06 [renormalize]: 4.69998e-07 [cse]: 5.729e-05 [optimize_parallel_all_gather_comm]: 2.499e-05 [overlap_param_gather]: 1.99e-06 [cconv]: 2.178e-05 [loop_unroll]: 0.00045469 [opt_after_cconv]: 0.00018823, [1] [Cycle 1]: 0.00018213, [7] [c_1]: 6.625e-05 [parameter_eliminate]: 2.26e-06 [updatestate_depend_eliminate]: 1.07e-05 [updatestate_assign_eliminate]: 7.01999e-06 [updatestate_loads_eliminate]: 6.81999e-06 [cse]: 5.508e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 8.024e-05 [tuple_transform]: 0.00016568, [1] [Cycle 1]: 0.00016034, [4] [d_1]: 0.00012433 [none_parameter_eliminate]: 2.14e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 1.434e-05 [partial_unused_args_eliminate]: 2.32001e-06 [add_recomputation]: 7.225e-05 [cse_after_recomputation]: 5.514e-05, [1] [Cycle 1]: 5.025e-05, [1] [cse]: 4.373e-05 [environ_conv]: 1.215e-05 [swap_dp_allreduce_reducescatter]: 1.177e-05 [bias_add_comm_swap]: 3.11001e-06 [label_micro_interleaved_index]: 4.03001e-06 [label_fine_grained_interleaved_index]: 2.53998e-06 [merge_cast_opt]: 1.32e-06 [slice_recompute_activation]: 2.19001e-06 [micro_interleaved_order_control]: 2.26e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 7.59988e-07 [remove_cast_before_assign_add]: 1.10999e-06 [full_micro_interleaved_order_control]: 2.37999e-06 [reorder_send_recv_between_fp_bp]: 2.76999e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.11002e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 9.00007e-07 [overlap_opt_shard_grad_in_pipeline]: 2.21e-06 [control_data_broadcast_order]: 2.786e-05 [grouped_pairwise_exchange_alltoall]: 1.72999e-06 [offloading_packed_experts]: 6.99001e-06 [overlap_recompute_and_grad_model_parallel]: 7.20998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 2.34001e-06 [overlap_grad_ring_attention]: 6.98e-06 [overlap_grad_flash_sp]: 3.222e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.71e-06 [split_layernorm_comm]: 1.67999e-06 [handle_group_info]: 1.05999e-06 [symbol_engine_optimizer]: 0.0001254, [1] [Cycle 1]: 0.00012048, [6] [build]: 1.09e-05 [elim_shapecalc]: 1.807e-05 [elim_not_effective]: 2.54e-05 [opt_reshape]: 1.438e-05 [fold_const_symbol]: 2.3e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.87001e-06 [pipeline_parallel_scheduler]: 1.57999e-06 [auto_monad_reorder]: 3.157e-05 [get_jit_bprop_graph]: 1.08001e-06 [rewriter_after_jit_bprop_graph]: 3.78001e-06 [opt_after_jit_grad]: 0.00051433 [validate]: 6.488e-05 [backend_pass]: 1.02e-06 [task_emit]: 0.0768228 [execute]: 9.38002e-06 Sums bootstrap : 0.000451s : 0.27% type_inference : 0.046567s : 28.14% event_method : 0.000216s : 0.13% auto_monad : 0.000243s : 0.15% graph_reusing : 0.000018s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000071s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000067s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000373s : 0.23% optimize.opt_a.expand_dump_flag : 0.000009s : 0.01% optimize.opt_a.switch_simplify : 0.000262s : 0.16% optimize.opt_a.loop_unroll : 0.000169s : 0.10% optimize.opt_a.a_1 : 0.003594s : 2.17% optimize.opt_a.with_stream_mark : 0.000058s : 0.03% optimize.opt_a.recompute_prepare : 0.000050s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000025s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000023s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000021s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000612s : 0.37% optimize.opt_a.accelerated_algorithm : 0.000042s : 0.03% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000011s : 0.01% optimize.opt_a.shard_inline : 0.000040s : 0.02% optimize.opt_a.merge_send_recv : 0.000036s : 0.02% optimize.opt_a.auto_parallel : 0.000033s : 0.02% optimize.opt_a.parallel : 0.000026s : 0.02% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000026s : 0.02% optimize.opt_a.allreduce_fusion : 0.000024s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000047s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000044s : 0.03% optimize.opt_a.virtual_dataset : 0.000041s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000039s : 0.02% optimize.opt_a.virtual_output : 0.000040s : 0.02% optimize.opt_a.merge_forward : 0.000024s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000039s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000076s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000070s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000026s : 0.02% optimize.opt_a.meta_fg_expand : 0.004665s : 2.82% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000124s : 0.07% optimize.opt_a.a_after_grad : 0.000168s : 0.10% optimize.opt_a.renormalize : 0.024856s : 15.02% optimize.opt_a.add_forward_monad_depend : 0.000022s : 0.01% optimize.opt_a.auto_monad_grad : 0.000016s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000153s : 0.09% optimize.opt_a.cse : 0.000565s : 0.34% optimize.opt_a.a_3 : 0.001606s : 0.97% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000045s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000508s : 0.31% optimize.opt_b.b_1 : 0.000351s : 0.21% optimize.opt_b.b_2 : 0.000016s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000057s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000022s : 0.01% optimize.loop_unroll : 0.000455s : 0.27% optimize.opt_after_cconv.c_1 : 0.000066s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.cse : 0.000055s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000080s : 0.05% optimize.tuple_transform.d_1 : 0.000124s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000014s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000072s : 0.04% optimize.cse_after_recomputation.cse : 0.000044s : 0.03% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000012s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000028s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000032s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000025s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000014s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000023s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000032s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000514s : 0.31% validate : 0.000065s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.076823s : 46.43% execute : 0.000009s : 0.01% Time group info: ------[substitution.] 0.001499 311 0.24% : 0.000004s : 8: substitution.elim_not_effective 0.54% : 0.000008s : 12: substitution.float_depend_g_call 0.71% : 0.000011s : 9: substitution.float_tuple_getitem_switch 0.21% : 0.000003s : 8: substitution.fold_const_symbol 33.25% : 0.000498s : 5: substitution.getattr_setattr_resolve 0.65% : 0.000010s : 10: substitution.graph_param_transform 0.17% : 0.000002s : 2: substitution.incorporate_call 0.13% : 0.000002s : 2: substitution.incorporate_call_switch 37.71% : 0.000565s : 24: substitution.inline 1.40% : 0.000021s : 3: substitution.inline_without_move 0.87% : 0.000013s : 25: substitution.j_node_and_user_rematch 1.18% : 0.000018s : 13: substitution.minmaximum_grad 0.63% : 0.000009s : 12: substitution.partial_eliminate 1.17% : 0.000018s : 25: substitution.remove_not_recompute_node 5.15% : 0.000077s : 32: substitution.replace_applicator 0.68% : 0.000010s : 14: substitution.replace_old_param 0.15% : 0.000002s : 1: substitution.set_cell_output_no_recompute 0.77% : 0.000011s : 4: substitution.switch_simplify 0.54% : 0.000008s : 2: substitution.transpose_eliminate 2.99% : 0.000045s : 17: substitution.tuple_list_convert_item_index_to_positive 1.46% : 0.000022s : 17: substitution.tuple_list_get_item_const_eliminator 1.95% : 0.000029s : 17: substitution.tuple_list_get_item_depend_reorder 5.53% : 0.000083s : 32: substitution.tuple_list_get_item_eliminator 1.92% : 0.000029s : 17: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.046463 2 92.84% : 0.043137s : 1: type_inference.infer 7.16% : 0.003326s : 1: type_inference.specialize ------[replace.] 0.000485 45 11.34% : 0.000055s : 4: replace.getattr_setattr_resolve 49.13% : 0.000238s : 24: replace.inline 17.52% : 0.000085s : 5: replace.replace_applicator 8.48% : 0.000041s : 4: replace.switch_simplify 13.53% : 0.000066s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.001068 45 42.74% : 0.000456s : 4: match.getattr_setattr_resolve 51.69% : 0.000552s : 24: match.inline 2.51% : 0.000027s : 5: match.replace_applicator 0.88% : 0.000009s : 4: match.switch_simplify 2.17% : 0.000023s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.000998 7110 0.85% : 0.000008s : 68: predicate.accumulaten_eliminater 0.35% : 0.000004s : 10: predicate.ad_related_special_op_eliminate 0.42% : 0.000004s : 32: predicate.addn_check_dump 0.95% : 0.000009s : 68: predicate.addn_zero_filter 0.84% : 0.000008s : 68: predicate.adjust_all_reduce_mul_add 1.79% : 0.000018s : 100: predicate.arithmetic_simplify 0.92% : 0.000009s : 68: predicate.cast_eliminate 2.85% : 0.000028s : 215: predicate.check_bprop_eliminate 0.42% : 0.000004s : 32: predicate.compare_switch_simplify 0.08% : 0.000001s : 10: predicate.const_output_eliminate 0.42% : 0.000004s : 32: predicate.depend_value_elim 0.95% : 0.000009s : 68: predicate.dict_get_item_const_eliminator 1.07% : 0.000011s : 68: predicate.dict_get_item_eliminator 0.95% : 0.000009s : 68: predicate.dict_set_item_eliminator 0.35% : 0.000004s : 20: predicate.dumpgradient_eliminate 0.09% : 0.000001s : 10: predicate.elim_not_effective 0.17% : 0.000002s : 10: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000010s : 78: predicate.environ_add_const_eliminate 1.03% : 0.000010s : 78: predicate.environ_get_add_eliminate 0.99% : 0.000010s : 78: predicate.environ_get_depend_swap 1.44% : 0.000014s : 110: predicate.environ_get_eliminate 0.98% : 0.000010s : 78: predicate.environ_get_set_eliminate 1.34% : 0.000013s : 100: predicate.exchange_switch_depend_value 1.96% : 0.000020s : 100: predicate.float_depend_g_call 0.42% : 0.000004s : 32: predicate.float_environ_get_switch 0.62% : 0.000006s : 42: predicate.float_tuple_getitem_switch 0.08% : 0.000001s : 10: predicate.fold_const_symbol 0.48% : 0.000005s : 32: predicate.get_grad_eliminate 0.65% : 0.000006s : 31: predicate.getattr_setattr_resolve 0.10% : 0.000001s : 10: predicate.graph_param_transform 0.44% : 0.000004s : 32: predicate.incorporate_call 0.39% : 0.000004s : 32: predicate.incorporate_call_switch 4.59% : 0.000046s : 252: predicate.inline 1.44% : 0.000014s : 82: predicate.inline_without_move 0.23% : 0.000002s : 32: predicate.j_node_and_user_rematch 0.52% : 0.000005s : 32: predicate.less_batch_normalization 1.35% : 0.000014s : 96: predicate.list_to_tuple_eliminator_ 2.11% : 0.000021s : 164: predicate.load_eliminater 0.36% : 0.000004s : 10: predicate.loop_unroll_after_grad 2.51% : 0.000025s : 182: predicate.loop_unroll_before_grad 1.17% : 0.000012s : 88: predicate.make_slice_get_slice_eliminator 0.46% : 0.000005s : 32: predicate.merge_addn 2.60% : 0.000026s : 198: predicate.micro_step_allgather_replace 2.63% : 0.000026s : 198: predicate.mini_step_allgather_replace 0.91% : 0.000009s : 68: predicate.minmaximum_grad 0.35% : 0.000004s : 10: predicate.mutable_eliminate 0.17% : 0.000002s : 10: predicate.opt_reshape 0.16% : 0.000002s : 10: predicate.parallel_virtual_node 1.76% : 0.000018s : 100: predicate.partial_defer_inline 1.32% : 0.000013s : 86: predicate.partial_eliminate 0.87% : 0.000009s : 68: predicate.print_const_string_wrapper 0.45% : 0.000004s : 32: predicate.reduce_all_const_elim 1.14% : 0.000011s : 68: predicate.reduce_eliminate 2.10% : 0.000021s : 164: predicate.redundant_stop_gradient_eliminater 0.23% : 0.000002s : 32: predicate.remove_not_recompute_node 2.39% : 0.000024s : 284: predicate.replace_applicator 0.65% : 0.000006s : 82: predicate.replace_old_param 0.09% : 0.000001s : 10: predicate.reset_defer_inline 0.95% : 0.000009s : 68: predicate.reshape_eliminate 2.72% : 0.000027s : 198: predicate.row_tensor_add_zeros_like 0.18% : 0.000002s : 10: predicate.row_tensor_eliminate 3.12% : 0.000031s : 215: predicate.same_eliminate 0.28% : 0.000003s : 32: predicate.set_cell_output_no_recompute 0.51% : 0.000005s : 32: predicate.shard_identity_eliminate 0.31% : 0.000003s : 20: predicate.special_op_eliminate 0.51% : 0.000005s : 32: predicate.specialize_transform 2.68% : 0.000027s : 198: predicate.split_environ_get_set_with_tuple_value 1.33% : 0.000013s : 82: predicate.stack_unstack_eliminate 0.16% : 0.000002s : 10: predicate.switch_call_monad_eliminater 1.48% : 0.000015s : 100: predicate.switch_defer_inline 4.27% : 0.000043s : 315: predicate.switch_layer_defer_inline 4.86% : 0.000049s : 332: predicate.switch_simplify 0.93% : 0.000009s : 68: predicate.tile_eliminate 0.90% : 0.000009s : 68: predicate.transpose_eliminate 1.35% : 0.000013s : 88: predicate.tuple_list_convert_item_index_to_positive 1.34% : 0.000013s : 88: predicate.tuple_list_get_item_const_eliminator 1.23% : 0.000012s : 88: predicate.tuple_list_get_item_depend_reorder 2.34% : 0.000023s : 128: predicate.tuple_list_get_item_eliminator 1.40% : 0.000014s : 88: predicate.tuple_list_get_set_item_eliminator 1.89% : 0.000019s : 120: predicate.tuple_list_set_item_eliminator 1.32% : 0.000013s : 96: predicate.tuple_to_list_eliminator_ 2.03% : 0.000020s : 164: predicate.updatestate_pure_node_eliminater 2.50% : 0.000025s : 196: predicate.updatestate_useless_node_eliminater 0.15% : 0.000002s : 10: predicate.value_based_eliminate 0.48% : 0.000005s : 32: predicate.virtual_dataset_eliminate 0.46% : 0.000005s : 32: predicate.virtual_output_eliminate 0.15% : 0.000001s : 10: predicate.virtual_view_grad_eliminate 0.18% : 0.000002s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005153 75 64.34% : 0.003315s : 36: func_graph_cloner_run.FuncGraphClonerGraph 35.66% : 0.001838s : 39: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.247381 247 0.00% : 0.000003s : 1: ForceFp32Comm 1.27% : 0.003139s : 1: add_attr 1.27% : 0.003131s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000077s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.10% : 0.000255s : 1: auto_monad 0.01% : 0.000036s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.19% : 0.000482s : 1: bootstrap 0.01% : 0.000025s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000031s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000058s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.09% : 0.000226s : 1: event_method 0.01% : 0.000015s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000023s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.19% : 0.000463s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.21% : 0.000516s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000025s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000025s : 1: opt.transform.mutable_eliminate 2.76% : 0.006830s : 125: opt.transform.opt_a 0.03% : 0.000065s : 1: opt.transform.opt_after_cconv 0.02% : 0.000050s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000336s : 28: opt.transform.opt_b 0.25% : 0.000609s : 2: opt.transform.opt_resolve 0.05% : 0.000136s : 2: opt.transform.opt_trans_graph 0.03% : 0.000077s : 4: opt.transform.symbol_engine_opt 15.56% : 0.038486s : 1: opt_a 0.08% : 0.000192s : 1: opt_after_cconv 0.21% : 0.000524s : 1: opt_after_jit_grad 0.20% : 0.000497s : 1: opt_b 16.77% : 0.041486s : 1: optimize 0.01% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000035s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.03% : 0.000071s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000086s : 1: remove_dup_value 8.23% : 0.020371s : 2: renormalize.infer 1.81% : 0.004468s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000048s : 1: rewriter_after_opt_a 0.15% : 0.000380s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000015s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000128s : 1: symbol_engine_optimizer 31.06% : 0.076839s : 1: task_emit 0.07% : 0.000169s : 1: tuple_transform 18.83% : 0.046582s : 1: type_inference 0.04% : 0.000092s : 1: validate [WARNING] CORE(61814,ffffbf434f30,python3.9):2026-01-29-17:51:37.275.531 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph12 TotalTime = 0.0718782, [24] [bootstrap]: 0.00044864 [type_inference]: 0.024475 [event_method]: 2.241e-05 [auto_monad]: 8.04e-05 [graph_reusing]: 6.41e-06 [inline]: 1.92999e-06 [add_attr]: 0.00312695, [1] [add_attr_with_inline]: 0.00311901, [1] [Cycle 1]: 5.596e-05, [2] [tag_attr]: 2.093e-05 [meta_addattr_fg_expand]: 6.87002e-06 [parallel-infer-symbol]: 3.45e-06 [pre_auto_parallel]: 3.486e-05 [insert-virtual-dataset]: 2.34001e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.87999e-06 [pipeline_split]: 1.58002e-06 [optimize]: 0.00477952, [53] [py_interpret_to_execute]: 4.33001e-06 [rewriter_before_opt_a]: 0.00023968 [opt_a]: 0.00278592, [2] [Cycle 1]: 0.00221905, [45] [expand_dump_flag]: 3.33e-06 [switch_simplify]: 7.6e-05 [loop_unroll]: 3.286e-05 [a_1]: 0.0006126 [with_stream_mark]: 1.451e-05 [recompute_prepare]: 7.06999e-06 [updatestate_depend_eliminate]: 3.91999e-06 [updatestate_assign_eliminate]: 3.18998e-06 [updatestate_loads_eliminate]: 3.14001e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 7.198e-05 [accelerated_algorithm]: 6.02999e-06 [shard]: 1.73997e-06 [meta_shard_fg_expand]: 2.11e-06 [shard_inline]: 5.67999e-06 [merge_send_recv]: 8.58001e-06 [auto_parallel]: 5.51998e-06 [parallel]: 1.811e-05 [flash_sp]: 6.98998e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 3.28998e-06 [matmul_add_comm_reduction]: 9.41e-06 [allreduce_slice_to_reducescatter]: 8.60018e-07 [virtual_shard_identity]: 6.98998e-06 [virtual_dataset]: 5.86e-06 [get_grad_eliminate_]: 5.31002e-06 [virtual_output]: 5.52001e-06 [merge_forward]: 4.2e-06 [cell_reuse_recompute_pass]: 1.08001e-06 [offload_activation]: 9.14e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.137e-05 [merge_recompute_call_nodes]: 1.43002e-06 [before_grad]: 1.009e-05 [set_forward_comm_id_for_comm_node_pass]: 3.36001e-06 [meta_fg_expand]: 2.93e-06 [flash_sp_send_recv_attached]: 2.36e-06 [receive_attached]: 2.44001e-06 [after_resolve]: 9.12001e-06 [a_after_grad]: 8.05999e-06 [renormalize]: 0.00089673 [add_forward_monad_depend]: 5.37999e-06 [auto_monad_grad]: 1.77999e-06 [auto_monad_eliminator]: 1.473e-05 [cse]: 3.248e-05 [a_3]: 4.302e-05 [Cycle 2]: 0.00055738, [45] [expand_dump_flag]: 1.15001e-06 [switch_simplify]: 7.23999e-06 [loop_unroll]: 5.84e-06 [a_1]: 9.836e-05 [with_stream_mark]: 1.04e-05 [recompute_prepare]: 5.69999e-06 [updatestate_depend_eliminate]: 2.89999e-06 [updatestate_assign_eliminate]: 2.32001e-06 [updatestate_loads_eliminate]: 2.13002e-06 [parameter_eliminate]: 1.02e-06 [a_2]: 6.31e-05 [accelerated_algorithm]: 5.58002e-06 [shard]: 1.05001e-06 [meta_shard_fg_expand]: 1.27999e-06 [shard_inline]: 5.31998e-06 [merge_send_recv]: 4.45e-06 [auto_parallel]: 5.10001e-06 [parallel]: 3.93001e-06 [flash_sp]: 3.18e-06 [merge_comm]: 3.08e-06 [allreduce_fusion]: 2.89999e-06 [matmul_add_comm_reduction]: 5.09e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 6.38e-06 [virtual_dataset]: 5.15999e-06 [get_grad_eliminate_]: 4.99e-06 [virtual_output]: 4.84e-06 [merge_forward]: 2.64999e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 5.41002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.213e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 8.38001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.05002e-06 [meta_fg_expand]: 1.84e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.17e-06 [after_resolve]: 8.02e-06 [a_after_grad]: 7.8e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.25001e-06 [auto_monad_grad]: 7.59988e-07 [auto_monad_eliminator]: 6.18998e-06 [cse]: 1.439e-05 [a_3]: 3.077e-05 [py_interpret_to_execute_after_opt_a]: 4.25999e-06 [slice_cell_reuse_recomputed_activation]: 1.91e-06 [rewriter_after_opt_a]: 1.602e-05 [convert_after_rewriter]: 1.76998e-06 [order_py_execute_after_rewriter]: 1.15999e-06 [mutable_eliminate]: 0.00045197 [opt_b]: 0.00018215, [1] [Cycle 1]: 0.00017648, [7] [b_1]: 0.00010631 [b_2]: 6.56e-06 [updatestate_depend_eliminate]: 5.07e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.24001e-06 [renormalize]: 5.69999e-07 [cse]: 1.912e-05 [optimize_parallel_all_gather_comm]: 1.671e-05 [overlap_param_gather]: 2.26e-06 [cconv]: 2.326e-05 [loop_unroll]: 0.00041741 [opt_after_cconv]: 9.535e-05, [1] [Cycle 1]: 8.962e-05, [7] [c_1]: 2.488e-05 [parameter_eliminate]: 2.24999e-06 [updatestate_depend_eliminate]: 4.86002e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.19999e-06 [cse]: 1.919e-05 [renormalize]: 3.49974e-07 [remove_dup_value]: 1.523e-05 [tuple_transform]: 6.517e-05, [1] [Cycle 1]: 6.1e-05, [4] [d_1]: 3.529e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.05002e-06 [partial_unused_args_eliminate]: 1.74e-06 [add_recomputation]: 4.498e-05 [cse_after_recomputation]: 2.276e-05, [1] [Cycle 1]: 1.879e-05, [1] [cse]: 1.369e-05 [environ_conv]: 7.80998e-06 [swap_dp_allreduce_reducescatter]: 5.09e-06 [bias_add_comm_swap]: 2.67001e-06 [label_micro_interleaved_index]: 4.26001e-06 [label_fine_grained_interleaved_index]: 2.89001e-06 [merge_cast_opt]: 1.30001e-06 [slice_recompute_activation]: 2.14e-06 [micro_interleaved_order_control]: 2.49999e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 7.39994e-07 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 2.48998e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 1.12e-06 [add_comm_op_reuse_tag]: 1.25999e-06 [interleave_split_concat_branches]: 1.36998e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 8.70001e-07 [overlap_opt_shard_grad_in_pipeline]: 1.65001e-06 [control_data_broadcast_order]: 1.115e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 4.63001e-06 [overlap_recompute_and_grad_model_parallel]: 4.23999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.46002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.21e-06 [overlap_grad_ring_attention]: 3.72002e-06 [overlap_grad_flash_sp]: 1.737e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 1.89e-06 [split_layernorm_comm]: 1.86e-06 [handle_group_info]: 1.27e-06 [symbol_engine_optimizer]: 7.637e-05, [1] [Cycle 1]: 7.245e-05, [6] [build]: 9.76998e-06 [elim_shapecalc]: 9.00001e-06 [elim_not_effective]: 1.178e-05 [opt_reshape]: 6.10002e-06 [fold_const_symbol]: 9.29e-06 [renormalize]: 2.10013e-07 [detach_backward]: 1.75001e-06 [pipeline_parallel_scheduler]: 1.38002e-06 [auto_monad_reorder]: 1.839e-05 [get_jit_bprop_graph]: 1.853e-05 [rewriter_after_jit_bprop_graph]: 3.87998e-06 [opt_after_jit_grad]: 0.00048051 [validate]: 3.92e-05 [backend_pass]: 9.70002e-07 [task_emit]: 0.0381239 [execute]: 7.34002e-06 Sums bootstrap : 0.000449s : 0.66% type_inference : 0.024475s : 36.10% event_method : 0.000022s : 0.03% auto_monad : 0.000080s : 0.12% graph_reusing : 0.000006s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000035s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000240s : 0.35% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000083s : 0.12% optimize.opt_a.loop_unroll : 0.000039s : 0.06% optimize.opt_a.a_1 : 0.000711s : 1.05% optimize.opt_a.with_stream_mark : 0.000025s : 0.04% optimize.opt_a.recompute_prepare : 0.000013s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000135s : 0.20% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.02% optimize.opt_a.merge_send_recv : 0.000013s : 0.02% optimize.opt_a.auto_parallel : 0.000011s : 0.02% optimize.opt_a.parallel : 0.000022s : 0.03% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.02% optimize.opt_a.virtual_dataset : 0.000011s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000010s : 0.02% optimize.opt_a.virtual_output : 0.000010s : 0.02% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000015s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000018s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000006s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000017s : 0.03% optimize.opt_a.a_after_grad : 0.000016s : 0.02% optimize.opt_a.renormalize : 0.000897s : 1.32% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.03% optimize.opt_a.cse : 0.000047s : 0.07% optimize.opt_a.a_3 : 0.000074s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000016s : 0.02% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000452s : 0.67% optimize.opt_b.b_1 : 0.000106s : 0.16% optimize.opt_b.b_2 : 0.000007s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000019s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.03% optimize.loop_unroll : 0.000417s : 0.62% optimize.opt_after_cconv.c_1 : 0.000025s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.02% optimize.tuple_transform.d_1 : 0.000035s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000045s : 0.07% optimize.cse_after_recomputation.cse : 0.000014s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000011s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000017s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000018s : 0.03% get_jit_bprop_graph : 0.000019s : 0.03% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000481s : 0.71% validate : 0.000039s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.038124s : 56.23% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.000181 26 0.96% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000001s : 2: substitution.fold_const_symbol 2.78% : 0.000005s : 3: substitution.graph_param_transform 80.01% : 0.000144s : 6: substitution.inline 1.89% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.62% : 0.000005s : 4: substitution.remove_not_recompute_node 1.70% : 0.000003s : 2: substitution.replace_old_param 3.51% : 0.000006s : 1: substitution.switch_simplify 5.79% : 0.000010s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.024416 2 95.03% : 0.023203s : 1: type_inference.infer 4.97% : 0.001213s : 1: type_inference.specialize ------[replace.] 0.000082 9 59.54% : 0.000049s : 6: replace.inline 20.76% : 0.000017s : 1: replace.switch_simplify 19.70% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000156 9 90.47% : 0.000141s : 6: match.inline 3.62% : 0.000006s : 1: match.switch_simplify 5.91% : 0.000009s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000173 1092 0.96% : 0.000002s : 12: predicate.accumulaten_eliminater 0.88% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 6: predicate.addn_check_dump 1.03% : 0.000002s : 12: predicate.addn_zero_filter 0.84% : 0.000001s : 12: predicate.adjust_all_reduce_mul_add 2.34% : 0.000004s : 18: predicate.arithmetic_simplify 0.97% : 0.000002s : 12: predicate.cast_eliminate 0.55% : 0.000001s : 6: predicate.check_bprop_eliminate 0.48% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.55% : 0.000001s : 6: predicate.depend_value_elim 0.98% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.18% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.84% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 3: predicate.elim_not_effective 0.39% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.10% : 0.000002s : 15: predicate.environ_get_depend_swap 1.71% : 0.000003s : 21: predicate.environ_get_eliminate 1.17% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.66% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.52% : 0.000004s : 20: predicate.float_depend_g_call 0.48% : 0.000001s : 6: predicate.float_environ_get_switch 0.80% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.61% : 0.000001s : 6: predicate.get_grad_eliminate 0.20% : 0.000000s : 3: predicate.graph_param_transform 0.53% : 0.000001s : 6: predicate.incorporate_call 0.44% : 0.000001s : 6: predicate.incorporate_call_switch 5.94% : 0.000010s : 50: predicate.inline 0.65% : 0.000001s : 6: predicate.inline_without_move 0.31% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.68% : 0.000001s : 6: predicate.less_batch_normalization 1.74% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.41% : 0.000004s : 32: predicate.load_eliminater 1.03% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.94% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.74% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 6: predicate.merge_addn 0.62% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 12: predicate.minmaximum_grad 1.08% : 0.000002s : 3: predicate.mutable_eliminate 0.31% : 0.000001s : 3: predicate.opt_reshape 0.50% : 0.000001s : 3: predicate.parallel_virtual_node 1.99% : 0.000003s : 20: predicate.partial_defer_inline 1.42% : 0.000002s : 17: predicate.partial_eliminate 0.96% : 0.000002s : 12: predicate.print_const_string_wrapper 0.53% : 0.000001s : 6: predicate.reduce_all_const_elim 1.34% : 0.000002s : 12: predicate.reduce_eliminate 2.45% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 6: predicate.remove_not_recompute_node 1.25% : 0.000002s : 20: predicate.replace_applicator 0.53% : 0.000001s : 6: predicate.replace_old_param 0.22% : 0.000000s : 3: predicate.reset_defer_inline 1.14% : 0.000002s : 12: predicate.reshape_eliminate 0.56% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 3: predicate.row_tensor_eliminate 0.72% : 0.000001s : 6: predicate.same_eliminate 0.37% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.70% : 0.000001s : 6: predicate.shard_identity_eliminate 0.63% : 0.000001s : 6: predicate.special_op_eliminate 0.61% : 0.000001s : 6: predicate.specialize_transform 0.70% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.59% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.27% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.69% : 0.000003s : 20: predicate.switch_defer_inline 2.19% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.36% : 0.000011s : 68: predicate.switch_simplify 1.04% : 0.000002s : 12: predicate.tile_eliminate 1.08% : 0.000002s : 12: predicate.transpose_eliminate 1.58% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.14% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.39% : 0.000002s : 18: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.69% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.37% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.97% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 3: predicate.value_based_eliminate 0.61% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 6: predicate.virtual_output_eliminate 0.23% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001029 16 57.39% : 0.000590s : 8: func_graph_cloner_run.FuncGraphClonerGraph 42.61% : 0.000438s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.081943 196 0.00% : 0.000003s : 1: ForceFp32Comm 3.82% : 0.003131s : 1: add_attr 3.81% : 0.003123s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000049s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.10% : 0.000086s : 1: auto_monad 0.03% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.59% : 0.000480s : 1: bootstrap 0.03% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000014s : 1: control_data_broadcast_order 0.01% : 0.000005s : 1: convert_after_rewriter 0.03% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.03% : 0.000028s : 1: event_method 0.02% : 0.000012s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.03% : 0.000023s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.52% : 0.000425s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.56% : 0.000461s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000014s : 1: opt.transform.mutable_eliminate 1.36% : 0.001117s : 78: opt.transform.opt_a 0.03% : 0.000024s : 1: opt.transform.opt_after_cconv 0.03% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000086s : 28: opt.transform.opt_b 0.05% : 0.000039s : 2: opt.transform.opt_trans_graph 0.04% : 0.000032s : 4: opt.transform.symbol_engine_opt 3.40% : 0.002789s : 1: opt_a 0.12% : 0.000099s : 1: opt_after_cconv 0.60% : 0.000490s : 1: opt_after_jit_grad 0.23% : 0.000185s : 1: opt_b 5.84% : 0.004783s : 1: optimize 0.02% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.03% : 0.000021s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000006s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.05% : 0.000039s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000019s : 1: remove_dup_value 0.57% : 0.000467s : 1: renormalize.infer 0.52% : 0.000422s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000019s : 1: rewriter_after_opt_a 0.30% : 0.000245s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000079s : 1: symbol_engine_optimizer 46.54% : 0.038139s : 1: task_emit 0.08% : 0.000068s : 1: tuple_transform 29.88% : 0.024488s : 1: type_inference 0.08% : 0.000063s : 1: validate .. TotalTime = 13.0487, [24] [bootstrap]: 0.00045395 [type_inference]: 0.0447585 [event_method]: 0.00021894 [auto_monad]: 0.00015278 [graph_reusing]: 1.001e-05 [inline]: 2.81999e-06 [add_attr]: 0.00311476, [1] [add_attr_with_inline]: 0.0031064, [1] [Cycle 1]: 8.268e-05, [2] [tag_attr]: 4.137e-05 [meta_addattr_fg_expand]: 1.239e-05 [parallel-infer-symbol]: 3.20002e-06 [pre_auto_parallel]: 5.902e-05 [insert-virtual-dataset]: 2.84001e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 1.89999e-06 [pipeline_split]: 1.98002e-06 [optimize]: 0.0432327, [53] [py_interpret_to_execute]: 4.53999e-06 [rewriter_before_opt_a]: 0.00039087 [opt_a]: 0.0327072, [3] [Cycle 1]: 0.0168971, [45] [expand_dump_flag]: 4.77998e-06 [switch_simplify]: 0.00015683 [loop_unroll]: 6.677e-05 [a_1]: 0.00141499 [with_stream_mark]: 2.325e-05 [recompute_prepare]: 1.974e-05 [updatestate_depend_eliminate]: 8.38999e-06 [updatestate_assign_eliminate]: 7.45e-06 [updatestate_loads_eliminate]: 7.63999e-06 [parameter_eliminate]: 2.79001e-06 [a_2]: 0.00021169 [accelerated_algorithm]: 1.454e-05 [shard]: 1.91998e-06 [meta_shard_fg_expand]: 4.13001e-06 [shard_inline]: 1.409e-05 [merge_send_recv]: 1.57e-05 [auto_parallel]: 1.003e-05 [parallel]: 1.888e-05 [flash_sp]: 8.93002e-06 [merge_comm]: 8.48999e-06 [allreduce_fusion]: 7.83999e-06 [matmul_add_comm_reduction]: 2.617e-05 [allreduce_slice_to_reducescatter]: 1.12999e-06 [virtual_shard_identity]: 1.575e-05 [virtual_dataset]: 1.402e-05 [get_grad_eliminate_]: 1.404e-05 [virtual_output]: 1.323e-05 [merge_forward]: 8.35999e-06 [cell_reuse_recompute_pass]: 1.10999e-06 [offload_activation]: 1.587e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.513e-05 [merge_recompute_call_nodes]: 1.85001e-06 [before_grad]: 2.447e-05 [set_forward_comm_id_for_comm_node_pass]: 8.24998e-06 [meta_fg_expand]: 0.00153336 [flash_sp_send_recv_attached]: 4.3e-06 [receive_attached]: 2.29001e-06 [after_resolve]: 6.259e-05 [a_after_grad]: 8.549e-05 [renormalize]: 0.0113565 [add_forward_monad_depend]: 1.073e-05 [auto_monad_grad]: 5.51e-06 [auto_monad_eliminator]: 0.00016302 [cse]: 0.00045987 [a_3]: 0.00072527 [Cycle 2]: 0.0103019, [45] [expand_dump_flag]: 2.07001e-06 [switch_simplify]: 9.897e-05 [loop_unroll]: 9.376e-05 [a_1]: 0.00313101 [with_stream_mark]: 5.878e-05 [recompute_prepare]: 6.643e-05 [updatestate_depend_eliminate]: 3.927e-05 [updatestate_assign_eliminate]: 3.704e-05 [updatestate_loads_eliminate]: 3.553e-05 [parameter_eliminate]: 1.12999e-06 [a_2]: 0.00100733 [accelerated_algorithm]: 0.00011822 [shard]: 1.07998e-06 [meta_shard_fg_expand]: 1.374e-05 [shard_inline]: 6.497e-05 [merge_send_recv]: 4.376e-05 [auto_parallel]: 4.124e-05 [parallel]: 4.88001e-06 [flash_sp]: 3.44001e-06 [merge_comm]: 4.168e-05 [allreduce_fusion]: 4.051e-05 [matmul_add_comm_reduction]: 4.829e-05 [allreduce_slice_to_reducescatter]: 3.60014e-07 [virtual_shard_identity]: 6.409e-05 [virtual_dataset]: 6.241e-05 [get_grad_eliminate_]: 6.201e-05 [virtual_output]: 6.177e-05 [merge_forward]: 3.732e-05 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 4.862e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00011251 [merge_recompute_call_nodes]: 7.89994e-07 [before_grad]: 0.00010601 [set_forward_comm_id_for_comm_node_pass]: 4.304e-05 [meta_fg_expand]: 0.00012141 [flash_sp_send_recv_attached]: 1.02e-06 [receive_attached]: 9.90025e-07 [after_resolve]: 6.946e-05 [a_after_grad]: 0.00010168 [renormalize]: 0.0033176 [add_forward_monad_depend]: 4.85999e-06 [auto_monad_grad]: 1.30001e-06 [auto_monad_eliminator]: 9.553e-05 [cse]: 0.00025942 [a_3]: 0.00046819 [Cycle 3]: 0.00549399, [45] [expand_dump_flag]: 1.37e-06 [switch_simplify]: 6.639e-05 [loop_unroll]: 6.268e-05 [a_1]: 0.00191646 [with_stream_mark]: 4.74e-05 [recompute_prepare]: 6.338e-05 [updatestate_depend_eliminate]: 4.09e-05 [updatestate_assign_eliminate]: 3.802e-05 [updatestate_loads_eliminate]: 3.674e-05 [parameter_eliminate]: 1.08001e-06 [a_2]: 0.00100358 [accelerated_algorithm]: 7.327e-05 [shard]: 1.26997e-06 [meta_shard_fg_expand]: 1.41e-05 [shard_inline]: 6.396e-05 [merge_send_recv]: 4.94e-05 [auto_parallel]: 4.446e-05 [parallel]: 4.35e-06 [flash_sp]: 1.07e-06 [merge_comm]: 4.413e-05 [allreduce_fusion]: 4.313e-05 [matmul_add_comm_reduction]: 4.872e-05 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 6.456e-05 [virtual_dataset]: 6.335e-05 [get_grad_eliminate_]: 6.254e-05 [virtual_output]: 6.245e-05 [merge_forward]: 3.909e-05 [cell_reuse_recompute_pass]: 1.67999e-06 [offload_activation]: 5.046e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00011457 [merge_recompute_call_nodes]: 6.80011e-07 [before_grad]: 0.00010756 [set_forward_comm_id_for_comm_node_pass]: 6.194e-05 [meta_fg_expand]: 2.928e-05 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 9.99979e-07 [after_resolve]: 6.506e-05 [a_after_grad]: 0.00010305 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.99999e-06 [auto_monad_grad]: 1.27e-06 [auto_monad_eliminator]: 7.55e-05 [cse]: 0.00023282 [a_3]: 0.00045915 [py_interpret_to_execute_after_opt_a]: 4.60001e-06 [slice_cell_reuse_recomputed_activation]: 2.16998e-06 [rewriter_after_opt_a]: 0.00018692 [convert_after_rewriter]: 1.37e-06 [order_py_execute_after_rewriter]: 1.29998e-06 [mutable_eliminate]: 0.00055631 [opt_b]: 0.00566647, [2] [Cycle 1]: 0.00365383, [7] [b_1]: 0.00317855 [b_2]: 6.674e-05 [updatestate_depend_eliminate]: 4.738e-05 [updatestate_assign_eliminate]: 3.832e-05 [updatestate_loads_eliminate]: 3.692e-05 [renormalize]: 3.80009e-07 [cse]: 0.00023884 [Cycle 2]: 0.00200169, [7] [b_1]: 0.00155858 [b_2]: 6.473e-05 [updatestate_depend_eliminate]: 4.309e-05 [updatestate_assign_eliminate]: 3.753e-05 [updatestate_loads_eliminate]: 3.708e-05 [renormalize]: 7.00238e-08 [cse]: 0.00022064 [optimize_parallel_all_gather_comm]: 8.19e-05 [overlap_param_gather]: 2.41e-06 [cconv]: 3.474e-05 [loop_unroll]: 0.00054238 [opt_after_cconv]: 0.00074918, [1] [Cycle 1]: 0.0007429, [7] [c_1]: 0.00033879 [parameter_eliminate]: 2.31e-06 [updatestate_depend_eliminate]: 4.998e-05 [updatestate_assign_eliminate]: 3.911e-05 [updatestate_loads_eliminate]: 3.782e-05 [cse]: 0.00023474 [renormalize]: 4.89992e-07 [remove_dup_value]: 0.00039352 [tuple_transform]: 0.00052886, [1] [Cycle 1]: 0.00052263, [4] [d_1]: 0.000439 [none_parameter_eliminate]: 2.23998e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 5.788e-05 [partial_unused_args_eliminate]: 1.84e-06 [add_recomputation]: 0.00022171 [cse_after_recomputation]: 0.00014409, [1] [Cycle 1]: 0.00013874, [1] [cse]: 0.0001308 [environ_conv]: 3.016e-05 [swap_dp_allreduce_reducescatter]: 4.139e-05 [bias_add_comm_swap]: 2.74999e-06 [label_micro_interleaved_index]: 4.55001e-06 [label_fine_grained_interleaved_index]: 2.89001e-06 [merge_cast_opt]: 1.38002e-06 [slice_recompute_activation]: 2.26998e-06 [micro_interleaved_order_control]: 2.37999e-06 [assign_add_opt]: 1.18001e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.09e-06 [full_micro_interleaved_order_control]: 2.40002e-06 [reorder_send_recv_between_fp_bp]: 3.08e-06 [comm_op_add_attrs]: 1.14998e-06 [add_comm_op_reuse_tag]: 1.14998e-06 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 9.09989e-07 [overlap_opt_shard_grad_in_pipeline]: 2.13002e-06 [control_data_broadcast_order]: 9.18e-05 [grouped_pairwise_exchange_alltoall]: 1.61002e-06 [offloading_packed_experts]: 2.228e-05 [overlap_recompute_and_grad_model_parallel]: 2.253e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.42999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.06998e-06 [overlap_grad_ring_attention]: 2.204e-05 [overlap_grad_flash_sp]: 0.00012098 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 2.09e-06 [split_layernorm_comm]: 1.73002e-06 [handle_group_info]: 9.5999e-07 [symbol_engine_optimizer]: 0.00037492, [1] [Cycle 1]: 0.00036982, [6] [build]: 1.829e-05 [elim_shapecalc]: 6.273e-05 [elim_not_effective]: 0.00010184 [opt_reshape]: 5.694e-05 [fold_const_symbol]: 9.853e-05 [renormalize]: 2.69996e-07 [detach_backward]: 1.72001e-06 [pipeline_parallel_scheduler]: 1.53002e-06 [auto_monad_reorder]: 8.763e-05 [get_jit_bprop_graph]: 1.07998e-06 [rewriter_after_jit_bprop_graph]: 3.91001e-06 [opt_after_jit_grad]: 0.0006626 [validate]: 0.00015232 [backend_pass]: 1.00001e-06 [task_emit]: 12.9555 [execute]: 1.078e-05 Sums bootstrap : 0.000454s : 0.00% type_inference : 0.044759s : 0.34% event_method : 0.000219s : 0.00% auto_monad : 0.000153s : 0.00% graph_reusing : 0.000010s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000041s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000059s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000391s : 0.00% optimize.opt_a.expand_dump_flag : 0.000008s : 0.00% optimize.opt_a.switch_simplify : 0.000322s : 0.00% optimize.opt_a.loop_unroll : 0.000223s : 0.00% optimize.opt_a.a_1 : 0.006462s : 0.05% optimize.opt_a.with_stream_mark : 0.000129s : 0.00% optimize.opt_a.recompute_prepare : 0.000150s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000089s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000083s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000080s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.002223s : 0.02% optimize.opt_a.accelerated_algorithm : 0.000206s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000032s : 0.00% optimize.opt_a.shard_inline : 0.000143s : 0.00% optimize.opt_a.merge_send_recv : 0.000109s : 0.00% optimize.opt_a.auto_parallel : 0.000096s : 0.00% optimize.opt_a.parallel : 0.000028s : 0.00% optimize.opt_a.flash_sp : 0.000013s : 0.00% optimize.opt_a.merge_comm : 0.000094s : 0.00% optimize.opt_a.allreduce_fusion : 0.000091s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000123s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000144s : 0.00% optimize.opt_a.virtual_dataset : 0.000140s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000139s : 0.00% optimize.opt_a.virtual_output : 0.000137s : 0.00% optimize.opt_a.merge_forward : 0.000085s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000115s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000252s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000238s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000113s : 0.00% optimize.opt_a.meta_fg_expand : 0.001684s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000197s : 0.00% optimize.opt_a.a_after_grad : 0.000290s : 0.00% optimize.opt_a.renormalize : 0.014674s : 0.11% optimize.opt_a.add_forward_monad_depend : 0.000018s : 0.00% optimize.opt_a.auto_monad_grad : 0.000008s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000334s : 0.00% optimize.opt_a.cse : 0.000952s : 0.01% optimize.opt_a.a_3 : 0.001653s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000187s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000556s : 0.00% optimize.opt_b.b_1 : 0.004737s : 0.04% optimize.opt_b.b_2 : 0.000131s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000090s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000076s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000074s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000459s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000082s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000035s : 0.00% optimize.loop_unroll : 0.000542s : 0.00% optimize.opt_after_cconv.c_1 : 0.000339s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000050s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000039s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000038s : 0.00% optimize.opt_after_cconv.cse : 0.000235s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000394s : 0.00% optimize.tuple_transform.d_1 : 0.000439s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000058s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000222s : 0.00% optimize.cse_after_recomputation.cse : 0.000131s : 0.00% optimize.environ_conv : 0.000030s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000041s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000092s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000022s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000023s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000022s : 0.00% optimize.overlap_grad_flash_sp : 0.000121s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000018s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000063s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000102s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000057s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000099s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000088s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000663s : 0.01% validate : 0.000152s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 12.955472s : 99.32% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.001292 650 6.88% : 0.000089s : 36: substitution.arithmetic_simplify 1.14% : 0.000015s : 46: substitution.elim_not_effective 0.56% : 0.000007s : 11: substitution.float_depend_g_call 1.30% : 0.000017s : 17: substitution.float_tuple_getitem_switch 1.10% : 0.000014s : 46: substitution.fold_const_symbol 2.71% : 0.000035s : 51: substitution.graph_param_transform 0.21% : 0.000003s : 2: substitution.incorporate_call 0.15% : 0.000002s : 2: substitution.incorporate_call_switch 43.91% : 0.000567s : 21: substitution.inline 1.23% : 0.000016s : 2: substitution.inline_without_move 3.01% : 0.000039s : 102: substitution.j_node_and_user_rematch 3.86% : 0.000050s : 10: substitution.less_batch_normalization 1.33% : 0.000017s : 13: substitution.minmaximum_grad 0.61% : 0.000008s : 11: substitution.partial_eliminate 4.34% : 0.000056s : 102: substitution.remove_not_recompute_node 1.80% : 0.000023s : 9: substitution.replace_applicator 0.64% : 0.000008s : 11: substitution.replace_old_param 0.17% : 0.000002s : 1: substitution.set_cell_output_no_recompute 1.03% : 0.000013s : 4: substitution.switch_simplify 1.87% : 0.000024s : 12: substitution.transpose_eliminate 4.71% : 0.000061s : 25: substitution.tuple_list_convert_item_index_to_positive 2.30% : 0.000030s : 25: substitution.tuple_list_get_item_const_eliminator 2.99% : 0.000039s : 25: substitution.tuple_list_get_item_depend_reorder 7.15% : 0.000092s : 40: substitution.tuple_list_get_item_eliminator 3.05% : 0.000039s : 25: substitution.tuple_list_get_set_item_eliminator 1.96% : 0.000025s : 1: substitution.zero_like_fill_zero ------[type_inference.] 0.044669 2 93.78% : 0.041889s : 1: type_inference.infer 6.22% : 0.002780s : 1: type_inference.specialize ------[replace.] 0.000273 33 56.34% : 0.000154s : 21: replace.inline 15.49% : 0.000042s : 4: replace.switch_simplify 22.47% : 0.000061s : 7: replace.tuple_list_get_item_eliminator 5.70% : 0.000016s : 1: replace.zero_like_fill_zero ------[match.] 0.000610 33 90.88% : 0.000554s : 21: match.inline 1.71% : 0.000010s : 4: match.switch_simplify 3.45% : 0.000021s : 7: match.tuple_list_get_item_eliminator 3.95% : 0.000024s : 1: match.zero_like_fill_zero ------[predicate.] 0.002736 20376 0.76% : 0.000021s : 165: predicate.accumulaten_eliminater 0.43% : 0.000012s : 51: predicate.ad_related_special_op_eliminate 0.64% : 0.000017s : 132: predicate.addn_check_dump 0.78% : 0.000021s : 165: predicate.addn_zero_filter 0.72% : 0.000020s : 165: predicate.adjust_all_reduce_mul_add 1.81% : 0.000050s : 297: predicate.arithmetic_simplify 0.77% : 0.000021s : 165: predicate.cast_eliminate 1.06% : 0.000029s : 217: predicate.check_bprop_eliminate 0.65% : 0.000018s : 132: predicate.compare_switch_simplify 0.49% : 0.000013s : 177: predicate.const_output_eliminate 0.62% : 0.000017s : 132: predicate.depend_value_elim 0.82% : 0.000022s : 165: predicate.dict_get_item_const_eliminator 0.84% : 0.000023s : 165: predicate.dict_get_item_eliminator 0.75% : 0.000020s : 165: predicate.dict_set_item_eliminator 1.15% : 0.000032s : 228: predicate.dumpgradient_eliminate 0.14% : 0.000004s : 51: predicate.elim_not_effective 0.28% : 0.000008s : 51: predicate.elim_shapecalc_of_broadcastargs 1.58% : 0.000043s : 342: predicate.environ_add_const_eliminate 1.56% : 0.000043s : 342: predicate.environ_get_add_eliminate 1.58% : 0.000043s : 342: predicate.environ_get_depend_swap 2.22% : 0.000061s : 474: predicate.environ_get_eliminate 1.56% : 0.000043s : 342: predicate.environ_get_set_eliminate 0.92% : 0.000025s : 193: predicate.exchange_switch_depend_value 1.21% : 0.000033s : 193: predicate.float_depend_g_call 0.63% : 0.000017s : 132: predicate.float_environ_get_switch 1.53% : 0.000042s : 309: predicate.float_tuple_getitem_switch 0.13% : 0.000004s : 51: predicate.fold_const_symbol 0.67% : 0.000018s : 132: predicate.get_grad_eliminate 0.15% : 0.000004s : 51: predicate.graph_param_transform 0.63% : 0.000017s : 132: predicate.incorporate_call 0.61% : 0.000017s : 132: predicate.incorporate_call_switch 5.13% : 0.000141s : 858: predicate.inline 1.01% : 0.000028s : 159: predicate.inline_without_move 0.34% : 0.000009s : 132: predicate.j_node_and_user_rematch 0.75% : 0.000020s : 132: predicate.less_batch_normalization 1.95% : 0.000053s : 400: predicate.list_to_tuple_eliminator_ 2.60% : 0.000071s : 573: predicate.load_eliminater 0.50% : 0.000014s : 59: predicate.loop_unroll_after_grad 1.18% : 0.000032s : 233: predicate.loop_unroll_before_grad 1.90% : 0.000052s : 401: predicate.make_slice_get_slice_eliminator 0.65% : 0.000018s : 132: predicate.merge_addn 1.04% : 0.000028s : 217: predicate.micro_step_allgather_replace 1.04% : 0.000028s : 217: predicate.mini_step_allgather_replace 0.77% : 0.000021s : 165: predicate.minmaximum_grad 0.51% : 0.000014s : 60: predicate.mutable_eliminate 0.27% : 0.000007s : 51: predicate.opt_reshape 0.93% : 0.000025s : 177: predicate.parallel_virtual_node 1.22% : 0.000033s : 193: predicate.partial_defer_inline 1.22% : 0.000033s : 231: predicate.partial_eliminate 0.75% : 0.000020s : 165: predicate.print_const_string_wrapper 0.64% : 0.000018s : 132: predicate.reduce_all_const_elim 0.93% : 0.000025s : 165: predicate.reduce_eliminate 2.59% : 0.000071s : 573: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000009s : 132: predicate.remove_not_recompute_node 1.24% : 0.000034s : 389: predicate.replace_applicator 0.44% : 0.000012s : 159: predicate.replace_old_param 0.49% : 0.000013s : 177: predicate.reset_defer_inline 0.74% : 0.000020s : 165: predicate.reshape_eliminate 1.05% : 0.000029s : 217: predicate.row_tensor_add_zeros_like 0.64% : 0.000017s : 118: predicate.row_tensor_eliminate 1.23% : 0.000034s : 217: predicate.same_eliminate 0.39% : 0.000011s : 132: predicate.set_cell_output_no_recompute 0.69% : 0.000019s : 132: predicate.shard_identity_eliminate 1.19% : 0.000032s : 228: predicate.special_op_eliminate 0.71% : 0.000019s : 132: predicate.specialize_transform 1.10% : 0.000030s : 217: predicate.split_environ_get_set_with_tuple_value 0.88% : 0.000024s : 159: predicate.stack_unstack_eliminate 0.31% : 0.000009s : 59: predicate.switch_call_monad_eliminater 0.97% : 0.000026s : 193: predicate.switch_defer_inline 2.04% : 0.000056s : 410: predicate.switch_layer_defer_inline 3.18% : 0.000087s : 617: predicate.switch_simplify 0.76% : 0.000021s : 165: predicate.tile_eliminate 0.79% : 0.000022s : 165: predicate.transpose_eliminate 2.01% : 0.000055s : 393: predicate.tuple_list_convert_item_index_to_positive 2.03% : 0.000056s : 393: predicate.tuple_list_get_item_const_eliminator 1.94% : 0.000053s : 393: predicate.tuple_list_get_item_depend_reorder 2.89% : 0.000079s : 532: predicate.tuple_list_get_item_eliminator 2.04% : 0.000056s : 393: predicate.tuple_list_get_set_item_eliminator 2.65% : 0.000073s : 525: predicate.tuple_list_set_item_eliminator 1.88% : 0.000051s : 400: predicate.tuple_to_list_eliminator_ 2.99% : 0.000082s : 573: predicate.updatestate_pure_node_eliminater 3.30% : 0.000090s : 705: predicate.updatestate_useless_node_eliminater 0.90% : 0.000025s : 177: predicate.value_based_eliminate 0.68% : 0.000019s : 132: predicate.virtual_dataset_eliminate 0.67% : 0.000018s : 132: predicate.virtual_output_eliminate 0.25% : 0.000007s : 51: predicate.virtual_view_grad_eliminate 0.97% : 0.000026s : 179: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004079 51 63.24% : 0.002580s : 26: func_graph_cloner_run.FuncGraphClonerGraph 36.76% : 0.001499s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 13.128639 292 0.00% : 0.000004s : 1: ForceFp32Comm 0.02% : 0.003119s : 1: add_attr 0.02% : 0.003110s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000227s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000162s : 1: auto_monad 0.00% : 0.000092s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.00% : 0.000483s : 1: bootstrap 0.00% : 0.000039s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000096s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000147s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000034s : 1: environ_conv 0.00% : 0.000231s : 1: event_method 0.00% : 0.000018s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.00% : 0.000551s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.00% : 0.000565s : 1: mutable_eliminate 0.00% : 0.000025s : 1: offloading_packed_experts 0.00% : 0.000085s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000086s : 1: opt.transform.mutable_eliminate 0.10% : 0.012768s : 117: opt.transform.opt_a 0.00% : 0.000337s : 1: opt.transform.opt_after_cconv 0.00% : 0.000175s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.004780s : 83: opt.transform.opt_b 0.00% : 0.000493s : 2: opt.transform.opt_trans_graph 0.00% : 0.000316s : 4: opt.transform.symbol_engine_opt 0.25% : 0.032711s : 1: opt_a 0.01% : 0.000753s : 1: opt_after_cconv 0.01% : 0.000690s : 1: opt_after_jit_grad 0.04% : 0.005671s : 1: opt_b 0.33% : 0.043238s : 1: optimize 0.00% : 0.000086s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000125s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000025s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000025s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000064s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000401s : 1: remove_dup_value 0.08% : 0.010183s : 2: renormalize.infer 0.03% : 0.004474s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000191s : 1: rewriter_after_opt_a 0.00% : 0.000398s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000045s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000378s : 1: symbol_engine_optimizer 98.68% : 12.955500s : 1: task_emit 0.00% : 0.000532s : 1: tuple_transform 0.34% : 0.044774s : 1: type_inference 0.00% : 0.000206s : 1: validate [WARNING] CORE(61814,ffffbf434f30,python3.9):2026-01-29-17:51:50.950.351 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph14 TotalTime = 0.0724859, [24] [bootstrap]: 0.00046694 [type_inference]: 0.0245953 [event_method]: 2.307e-05 [auto_monad]: 8.158e-05 [graph_reusing]: 7.02002e-06 [inline]: 1.93002e-06 [add_attr]: 0.00312514, [1] [add_attr_with_inline]: 0.0031175, [1] [Cycle 1]: 5.399e-05, [2] [tag_attr]: 2.108e-05 [meta_addattr_fg_expand]: 6.60002e-06 [parallel-infer-symbol]: 3.21999e-06 [pre_auto_parallel]: 3.353e-05 [insert-virtual-dataset]: 2.74999e-06 [parallel-infer-symbol-second]: 6.90023e-07 [dataset_repeat_opt]: 1.72999e-06 [pipeline_split]: 1.54998e-06 [optimize]: 0.00480114, [53] [py_interpret_to_execute]: 4.57e-06 [rewriter_before_opt_a]: 0.0002418 [opt_a]: 0.00276651, [2] [Cycle 1]: 0.00220239, [45] [expand_dump_flag]: 3.87002e-06 [switch_simplify]: 7.706e-05 [loop_unroll]: 3.296e-05 [a_1]: 0.000588 [with_stream_mark]: 1.32e-05 [recompute_prepare]: 6.90998e-06 [updatestate_depend_eliminate]: 3.48999e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.07002e-06 [parameter_eliminate]: 1.86e-06 [a_2]: 6.909e-05 [accelerated_algorithm]: 5.96e-06 [shard]: 1.75001e-06 [meta_shard_fg_expand]: 2.06998e-06 [shard_inline]: 5.69e-06 [merge_send_recv]: 8.1e-06 [auto_parallel]: 5.72001e-06 [parallel]: 2.052e-05 [flash_sp]: 7.16001e-06 [merge_comm]: 3.69002e-06 [allreduce_fusion]: 3.2e-06 [matmul_add_comm_reduction]: 9.05001e-06 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 6.58e-06 [virtual_dataset]: 5.71e-06 [get_grad_eliminate_]: 5.30999e-06 [virtual_output]: 5.52001e-06 [merge_forward]: 4.05e-06 [cell_reuse_recompute_pass]: 1.17999e-06 [offload_activation]: 9.12001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.121e-05 [merge_recompute_call_nodes]: 1.77001e-06 [before_grad]: 9.03002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.39001e-06 [meta_fg_expand]: 2.93e-06 [flash_sp_send_recv_attached]: 2.34999e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 8.98002e-06 [a_after_grad]: 7.87998e-06 [renormalize]: 0.00091175 [add_forward_monad_depend]: 5.28002e-06 [auto_monad_grad]: 1.92999e-06 [auto_monad_eliminator]: 1.489e-05 [cse]: 3.517e-05 [a_3]: 4.138e-05 [Cycle 2]: 0.00055479, [45] [expand_dump_flag]: 1.16997e-06 [switch_simplify]: 6.93e-06 [loop_unroll]: 5.82999e-06 [a_1]: 9.634e-05 [with_stream_mark]: 1.048e-05 [recompute_prepare]: 5.44e-06 [updatestate_depend_eliminate]: 2.79001e-06 [updatestate_assign_eliminate]: 2.24001e-06 [updatestate_loads_eliminate]: 2.20002e-06 [parameter_eliminate]: 9.00007e-07 [a_2]: 6.029e-05 [accelerated_algorithm]: 5.34e-06 [shard]: 1.00999e-06 [meta_shard_fg_expand]: 1.29998e-06 [shard_inline]: 4.97999e-06 [merge_send_recv]: 4.57e-06 [auto_parallel]: 5.00001e-06 [parallel]: 4.3e-06 [flash_sp]: 3.55e-06 [merge_comm]: 3.01001e-06 [allreduce_fusion]: 2.84001e-06 [matmul_add_comm_reduction]: 4.80999e-06 [allreduce_slice_to_reducescatter]: 3.19997e-07 [virtual_shard_identity]: 6.31e-06 [virtual_dataset]: 5.47001e-06 [get_grad_eliminate_]: 5.30001e-06 [virtual_output]: 5.09e-06 [merge_forward]: 2.89001e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 5.54e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.184e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 7.73999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.08998e-06 [meta_fg_expand]: 1.82999e-06 [flash_sp_send_recv_attached]: 8.30012e-07 [receive_attached]: 9.20001e-07 [after_resolve]: 8.01001e-06 [a_after_grad]: 7.15e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.17e-06 [auto_monad_grad]: 7.7e-07 [auto_monad_eliminator]: 6.06e-06 [cse]: 1.538e-05 [a_3]: 3.067e-05 [py_interpret_to_execute_after_opt_a]: 4.15999e-06 [slice_cell_reuse_recomputed_activation]: 2.07001e-06 [rewriter_after_opt_a]: 1.666e-05 [convert_after_rewriter]: 1.14e-06 [order_py_execute_after_rewriter]: 1.12999e-06 [mutable_eliminate]: 0.00045596 [opt_b]: 0.0001803, [1] [Cycle 1]: 0.00017435, [7] [b_1]: 0.00010399 [b_2]: 6.36e-06 [updatestate_depend_eliminate]: 5.25001e-06 [updatestate_assign_eliminate]: 2.96999e-06 [updatestate_loads_eliminate]: 2.32999e-06 [renormalize]: 3.80009e-07 [cse]: 1.984e-05 [optimize_parallel_all_gather_comm]: 1.597e-05 [overlap_param_gather]: 2.64001e-06 [cconv]: 2.273e-05 [loop_unroll]: 0.00041877 [opt_after_cconv]: 0.0001126, [1] [Cycle 1]: 8.946e-05, [7] [c_1]: 2.413e-05 [parameter_eliminate]: 2.27999e-06 [updatestate_depend_eliminate]: 5.07999e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.32999e-06 [cse]: 2.052e-05 [renormalize]: 4.59986e-07 [remove_dup_value]: 2.785e-05 [tuple_transform]: 6.888e-05, [1] [Cycle 1]: 6.392e-05, [4] [d_1]: 3.629e-05 [none_parameter_eliminate]: 2.03997e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 6.43e-06 [partial_unused_args_eliminate]: 1.72999e-06 [add_recomputation]: 4.577e-05 [cse_after_recomputation]: 2.316e-05, [1] [Cycle 1]: 1.881e-05, [1] [cse]: 1.341e-05 [environ_conv]: 7.73999e-06 [swap_dp_allreduce_reducescatter]: 5.13002e-06 [bias_add_comm_swap]: 2.64001e-06 [label_micro_interleaved_index]: 4.26001e-06 [label_fine_grained_interleaved_index]: 2.87002e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 2.04e-06 [micro_interleaved_order_control]: 2.39001e-06 [assign_add_opt]: 1.20001e-06 [ForceFp32Comm]: 7.50006e-07 [remove_cast_before_assign_add]: 1.09e-06 [full_micro_interleaved_order_control]: 2.41e-06 [reorder_send_recv_between_fp_bp]: 2.89999e-06 [comm_op_add_attrs]: 1.25999e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.16002e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 8.99978e-07 [overlap_opt_shard_grad_in_pipeline]: 1.94999e-06 [control_data_broadcast_order]: 1.165e-05 [grouped_pairwise_exchange_alltoall]: 1.54e-06 [offloading_packed_experts]: 3.88999e-06 [overlap_recompute_and_grad_model_parallel]: 4.60999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39998e-06 [overlap_recompute_comm]: 2.22999e-06 [overlap_grad_ring_attention]: 4.42e-06 [overlap_grad_flash_sp]: 1.692e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.29001e-06 [split_layernorm_comm]: 1.81e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 7.583e-05, [1] [Cycle 1]: 7.166e-05, [6] [build]: 9.25999e-06 [elim_shapecalc]: 8.69e-06 [elim_not_effective]: 1.146e-05 [opt_reshape]: 6.22001e-06 [fold_const_symbol]: 9.11998e-06 [renormalize]: 1.30007e-07 [detach_backward]: 1.40001e-06 [pipeline_parallel_scheduler]: 1.49e-06 [auto_monad_reorder]: 1.598e-05 [get_jit_bprop_graph]: 9.70002e-07 [rewriter_after_jit_bprop_graph]: 3.35e-06 [opt_after_jit_grad]: 0.00045502 [validate]: 3.757e-05 [backend_pass]: 1.14e-06 [task_emit]: 0.0386181 [execute]: 7.96001e-06 Sums bootstrap : 0.000467s : 0.68% type_inference : 0.024595s : 35.96% event_method : 0.000023s : 0.03% auto_monad : 0.000082s : 0.12% graph_reusing : 0.000007s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000034s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000242s : 0.35% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000084s : 0.12% optimize.opt_a.loop_unroll : 0.000039s : 0.06% optimize.opt_a.a_1 : 0.000684s : 1.00% optimize.opt_a.with_stream_mark : 0.000024s : 0.03% optimize.opt_a.recompute_prepare : 0.000012s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000129s : 0.19% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.02% optimize.opt_a.merge_send_recv : 0.000013s : 0.02% optimize.opt_a.auto_parallel : 0.000011s : 0.02% optimize.opt_a.parallel : 0.000025s : 0.04% optimize.opt_a.flash_sp : 0.000011s : 0.02% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.02% optimize.opt_a.virtual_dataset : 0.000011s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.02% optimize.opt_a.virtual_output : 0.000011s : 0.02% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000015s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000017s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000006s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000017s : 0.02% optimize.opt_a.a_after_grad : 0.000015s : 0.02% optimize.opt_a.renormalize : 0.000912s : 1.33% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.03% optimize.opt_a.cse : 0.000051s : 0.07% optimize.opt_a.a_3 : 0.000072s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000017s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000456s : 0.67% optimize.opt_b.b_1 : 0.000104s : 0.15% optimize.opt_b.b_2 : 0.000006s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.02% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000023s : 0.03% optimize.loop_unroll : 0.000419s : 0.61% optimize.opt_after_cconv.c_1 : 0.000024s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000028s : 0.04% optimize.tuple_transform.d_1 : 0.000036s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000046s : 0.07% optimize.cse_after_recomputation.cse : 0.000013s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000017s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000011s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000016s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000455s : 0.67% validate : 0.000038s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.038618s : 56.47% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.000182 26 1.03% : 0.000002s : 2: substitution.elim_not_effective 0.77% : 0.000001s : 2: substitution.fold_const_symbol 2.93% : 0.000005s : 3: substitution.graph_param_transform 79.24% : 0.000144s : 6: substitution.inline 1.61% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.41% : 0.000004s : 4: substitution.remove_not_recompute_node 1.51% : 0.000003s : 2: substitution.replace_old_param 3.85% : 0.000007s : 1: substitution.switch_simplify 6.66% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.024535 2 94.65% : 0.023221s : 1: type_inference.infer 5.35% : 0.001314s : 1: type_inference.specialize ------[replace.] 0.000080 9 59.79% : 0.000048s : 6: replace.inline 20.66% : 0.000017s : 1: replace.switch_simplify 19.56% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000158 9 89.19% : 0.000141s : 6: match.inline 3.87% : 0.000006s : 1: match.switch_simplify 6.94% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000172 1092 1.06% : 0.000002s : 12: predicate.accumulaten_eliminater 0.74% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.45% : 0.000001s : 6: predicate.addn_check_dump 0.96% : 0.000002s : 12: predicate.addn_zero_filter 0.88% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.12% : 0.000004s : 18: predicate.arithmetic_simplify 1.01% : 0.000002s : 12: predicate.cast_eliminate 0.53% : 0.000001s : 6: predicate.check_bprop_eliminate 0.50% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.45% : 0.000001s : 6: predicate.depend_value_elim 1.02% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.10% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.81% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 3: predicate.elim_not_effective 0.32% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.12% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 15: predicate.environ_get_depend_swap 1.65% : 0.000003s : 21: predicate.environ_get_eliminate 1.14% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.59% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.51% : 0.000004s : 20: predicate.float_depend_g_call 0.45% : 0.000001s : 6: predicate.float_environ_get_switch 0.68% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 3: predicate.fold_const_symbol 0.66% : 0.000001s : 6: predicate.get_grad_eliminate 0.28% : 0.000000s : 3: predicate.graph_param_transform 0.50% : 0.000001s : 6: predicate.incorporate_call 0.45% : 0.000001s : 6: predicate.incorporate_call_switch 5.88% : 0.000010s : 50: predicate.inline 0.64% : 0.000001s : 6: predicate.inline_without_move 0.28% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.75% : 0.000001s : 6: predicate.less_batch_normalization 1.64% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.67% : 0.000005s : 32: predicate.load_eliminater 0.94% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.99% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.62% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 6: predicate.merge_addn 0.50% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 12: predicate.minmaximum_grad 1.08% : 0.000002s : 3: predicate.mutable_eliminate 0.31% : 0.000001s : 3: predicate.opt_reshape 0.34% : 0.000001s : 3: predicate.parallel_virtual_node 2.20% : 0.000004s : 20: predicate.partial_defer_inline 1.43% : 0.000002s : 17: predicate.partial_eliminate 1.06% : 0.000002s : 12: predicate.print_const_string_wrapper 0.53% : 0.000001s : 6: predicate.reduce_all_const_elim 1.39% : 0.000002s : 12: predicate.reduce_eliminate 2.44% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 6: predicate.remove_not_recompute_node 1.28% : 0.000002s : 20: predicate.replace_applicator 0.71% : 0.000001s : 6: predicate.replace_old_param 0.26% : 0.000000s : 3: predicate.reset_defer_inline 1.03% : 0.000002s : 12: predicate.reshape_eliminate 0.53% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 3: predicate.row_tensor_eliminate 0.70% : 0.000001s : 6: predicate.same_eliminate 0.37% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.68% : 0.000001s : 6: predicate.shard_identity_eliminate 0.65% : 0.000001s : 6: predicate.special_op_eliminate 0.63% : 0.000001s : 6: predicate.specialize_transform 0.74% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.67% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.72% : 0.000003s : 20: predicate.switch_defer_inline 2.21% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.46% : 0.000011s : 68: predicate.switch_simplify 1.02% : 0.000002s : 12: predicate.tile_eliminate 1.01% : 0.000002s : 12: predicate.transpose_eliminate 1.62% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.51% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.70% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.49% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 3.13% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 3: predicate.value_based_eliminate 0.65% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 6: predicate.virtual_output_eliminate 0.24% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001135 16 53.14% : 0.000603s : 8: func_graph_cloner_run.FuncGraphClonerGraph 46.86% : 0.000532s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.082542 196 0.00% : 0.000003s : 1: ForceFp32Comm 3.79% : 0.003130s : 1: add_attr 3.78% : 0.003121s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000050s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.11% : 0.000087s : 1: auto_monad 0.02% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.61% : 0.000500s : 1: bootstrap 0.03% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.03% : 0.000028s : 1: event_method 0.02% : 0.000014s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.52% : 0.000427s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.56% : 0.000464s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000013s : 1: opt.transform.mutable_eliminate 1.31% : 0.001080s : 78: opt.transform.opt_a 0.03% : 0.000023s : 1: opt.transform.opt_after_cconv 0.03% : 0.000021s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000083s : 28: opt.transform.opt_b 0.05% : 0.000041s : 2: opt.transform.opt_trans_graph 0.04% : 0.000032s : 4: opt.transform.symbol_engine_opt 3.36% : 0.002770s : 1: opt_a 0.14% : 0.000117s : 1: opt_after_cconv 0.56% : 0.000464s : 1: opt_after_jit_grad 0.22% : 0.000184s : 1: opt_b 5.82% : 0.004805s : 1: optimize 0.02% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000020s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.05% : 0.000038s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000032s : 1: remove_dup_value 0.56% : 0.000459s : 1: renormalize.infer 0.54% : 0.000445s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000020s : 1: rewriter_after_opt_a 0.30% : 0.000248s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000079s : 1: symbol_engine_optimizer 46.80% : 0.038633s : 1: task_emit 0.09% : 0.000072s : 1: tuple_transform 29.81% : 0.024608s : 1: type_inference 0.08% : 0.000062s : 1: validate TotalTime = 2.68673, [24] [bootstrap]: 0.00046219 [type_inference]: 0.0451147 [event_method]: 0.00019445 [auto_monad]: 0.00015703 [graph_reusing]: 1.072e-05 [inline]: 1.98002e-06 [add_attr]: 0.00313685, [1] [add_attr_with_inline]: 0.00312796, [1] [Cycle 1]: 8.374e-05, [2] [tag_attr]: 4.18e-05 [meta_addattr_fg_expand]: 1.226e-05 [parallel-infer-symbol]: 3.46999e-06 [pre_auto_parallel]: 6.21e-05 [insert-virtual-dataset]: 2.45002e-06 [parallel-infer-symbol-second]: 6.80011e-07 [dataset_repeat_opt]: 1.94999e-06 [pipeline_split]: 2.07001e-06 [optimize]: 0.048654, [53] [py_interpret_to_execute]: 4.48001e-06 [rewriter_before_opt_a]: 0.00037647 [opt_a]: 0.0364071, [3] [Cycle 1]: 0.0179626, [45] [expand_dump_flag]: 4.89e-06 [switch_simplify]: 0.00016034 [loop_unroll]: 6.845e-05 [a_1]: 0.00142223 [with_stream_mark]: 2.268e-05 [recompute_prepare]: 2.058e-05 [updatestate_depend_eliminate]: 8.58001e-06 [updatestate_assign_eliminate]: 7.25e-06 [updatestate_loads_eliminate]: 7.09001e-06 [parameter_eliminate]: 3.01001e-06 [a_2]: 0.00021198 [accelerated_algorithm]: 1.485e-05 [shard]: 1.63002e-06 [meta_shard_fg_expand]: 4.23999e-06 [shard_inline]: 1.484e-05 [merge_send_recv]: 1.505e-05 [auto_parallel]: 9.99999e-06 [parallel]: 1.857e-05 [flash_sp]: 9.22001e-06 [merge_comm]: 8.84e-06 [allreduce_fusion]: 8.17e-06 [matmul_add_comm_reduction]: 2.588e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 1.564e-05 [virtual_dataset]: 1.386e-05 [get_grad_eliminate_]: 1.342e-05 [virtual_output]: 1.388e-05 [merge_forward]: 8.66997e-06 [cell_reuse_recompute_pass]: 1.07998e-06 [offload_activation]: 1.603e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.486e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 2.416e-05 [set_forward_comm_id_for_comm_node_pass]: 8.43999e-06 [meta_fg_expand]: 0.00152555 [flash_sp_send_recv_attached]: 4.43001e-06 [receive_attached]: 2.32999e-06 [after_resolve]: 6.338e-05 [a_after_grad]: 8.927e-05 [renormalize]: 0.0122242 [add_forward_monad_depend]: 1.021e-05 [auto_monad_grad]: 5.67999e-06 [auto_monad_eliminator]: 0.00015407 [cse]: 0.00053523 [a_3]: 0.00081856 [Cycle 2]: 0.0119049, [45] [expand_dump_flag]: 1.83997e-06 [switch_simplify]: 0.00014278 [loop_unroll]: 0.00010824 [a_1]: 0.00351839 [with_stream_mark]: 6.574e-05 [recompute_prepare]: 7.933e-05 [updatestate_depend_eliminate]: 4.543e-05 [updatestate_assign_eliminate]: 4.243e-05 [updatestate_loads_eliminate]: 4.221e-05 [parameter_eliminate]: 1.67001e-06 [a_2]: 0.00120648 [accelerated_algorithm]: 0.00013751 [shard]: 1.01002e-06 [meta_shard_fg_expand]: 1.633e-05 [shard_inline]: 7.92e-05 [merge_send_recv]: 5.116e-05 [auto_parallel]: 4.76e-05 [parallel]: 4.65001e-06 [flash_sp]: 3.31999e-06 [merge_comm]: 4.866e-05 [allreduce_fusion]: 4.724e-05 [matmul_add_comm_reduction]: 5.552e-05 [allreduce_slice_to_reducescatter]: 2.80008e-07 [virtual_shard_identity]: 8.405e-05 [virtual_dataset]: 7.768e-05 [get_grad_eliminate_]: 7.73e-05 [virtual_output]: 7.569e-05 [merge_forward]: 4.658e-05 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 5.5e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00013304 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 0.00013006 [set_forward_comm_id_for_comm_node_pass]: 5.213e-05 [meta_fg_expand]: 0.00012791 [flash_sp_send_recv_attached]: 1.20001e-06 [receive_attached]: 1.27e-06 [after_resolve]: 8.387e-05 [a_after_grad]: 0.000125 [renormalize]: 0.00378328 [add_forward_monad_depend]: 4.87e-06 [auto_monad_grad]: 1.35999e-06 [auto_monad_eliminator]: 0.00010686 [cse]: 0.0003249 [a_3]: 0.00056542 [Cycle 3]: 0.00652456, [45] [expand_dump_flag]: 1.29e-06 [switch_simplify]: 7.884e-05 [loop_unroll]: 8.394e-05 [a_1]: 0.00228308 [with_stream_mark]: 5.415e-05 [recompute_prepare]: 7.569e-05 [updatestate_depend_eliminate]: 4.607e-05 [updatestate_assign_eliminate]: 4.335e-05 [updatestate_loads_eliminate]: 4.35e-05 [parameter_eliminate]: 1.30999e-06 [a_2]: 0.00120718 [accelerated_algorithm]: 8.646e-05 [shard]: 1.23002e-06 [meta_shard_fg_expand]: 1.592e-05 [shard_inline]: 9.821e-05 [merge_send_recv]: 5.571e-05 [auto_parallel]: 5.497e-05 [parallel]: 4.15999e-06 [flash_sp]: 1.05999e-06 [merge_comm]: 5.213e-05 [allreduce_fusion]: 4.953e-05 [matmul_add_comm_reduction]: 5.749e-05 [allreduce_slice_to_reducescatter]: 5.09986e-07 [virtual_shard_identity]: 7.858e-05 [virtual_dataset]: 7.69e-05 [get_grad_eliminate_]: 7.653e-05 [virtual_output]: 7.538e-05 [merge_forward]: 4.622e-05 [cell_reuse_recompute_pass]: 1.69e-06 [offload_activation]: 6.006e-05 [cell_reuse_handle_not_recompute_node_pass]: 0.00013568 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 0.00012608 [set_forward_comm_id_for_comm_node_pass]: 5.294e-05 [meta_fg_expand]: 3.457e-05 [flash_sp_send_recv_attached]: 9.50007e-07 [receive_attached]: 1.02e-06 [after_resolve]: 7.979e-05 [a_after_grad]: 0.00012448 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.01e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 8.639e-05 [cse]: 0.00027748 [a_3]: 0.00055782 [py_interpret_to_execute_after_opt_a]: 4.85999e-06 [slice_cell_reuse_recomputed_activation]: 2.54999e-06 [rewriter_after_opt_a]: 0.0002198 [convert_after_rewriter]: 1.41998e-06 [order_py_execute_after_rewriter]: 1.29e-06 [mutable_eliminate]: 0.00057022 [opt_b]: 0.00684364, [2] [Cycle 1]: 0.00440594, [7] [b_1]: 0.00383838 [b_2]: 8.043e-05 [updatestate_depend_eliminate]: 5.668e-05 [updatestate_assign_eliminate]: 4.642e-05 [updatestate_loads_eliminate]: 4.724e-05 [renormalize]: 4.80009e-07 [cse]: 0.0002869 [Cycle 2]: 0.00242616, [7] [b_1]: 0.00188883 [b_2]: 7.947e-05 [updatestate_depend_eliminate]: 5.223e-05 [updatestate_assign_eliminate]: 4.558e-05 [updatestate_loads_eliminate]: 4.61e-05 [renormalize]: 5.9983e-08 [cse]: 0.00027094 [optimize_parallel_all_gather_comm]: 9.64e-05 [overlap_param_gather]: 2.48e-06 [cconv]: 3.824e-05 [loop_unroll]: 0.00054655 [opt_after_cconv]: 0.00089504, [1] [Cycle 1]: 0.000889, [7] [c_1]: 0.00040885 [parameter_eliminate]: 2.36e-06 [updatestate_depend_eliminate]: 5.989e-05 [updatestate_assign_eliminate]: 4.668e-05 [updatestate_loads_eliminate]: 4.598e-05 [cse]: 0.00028276 [renormalize]: 4.80009e-07 [remove_dup_value]: 0.00051688 [tuple_transform]: 0.00060579, [1] [Cycle 1]: 0.00059924, [4] [d_1]: 0.0005043 [none_parameter_eliminate]: 2.49001e-06 [renormalize]: 4.60015e-07 [switch_simplify]: 6.703e-05 [partial_unused_args_eliminate]: 2.41e-06 [add_recomputation]: 0.00024947 [cse_after_recomputation]: 0.00015881, [1] [Cycle 1]: 0.00015314, [1] [cse]: 0.00014496 [environ_conv]: 3.332e-05 [swap_dp_allreduce_reducescatter]: 4.792e-05 [bias_add_comm_swap]: 2.99001e-06 [label_micro_interleaved_index]: 4.61002e-06 [label_fine_grained_interleaved_index]: 2.46e-06 [merge_cast_opt]: 1.55001e-06 [slice_recompute_activation]: 2.37999e-06 [micro_interleaved_order_control]: 2.49999e-06 [assign_add_opt]: 1.24998e-06 [ForceFp32Comm]: 7.59988e-07 [remove_cast_before_assign_add]: 1.07e-06 [full_micro_interleaved_order_control]: 2.34999e-06 [reorder_send_recv_between_fp_bp]: 3.01001e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 1.01002e-06 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.02e-06 [overlap_opt_shard_in_pipeline]: 9.00007e-07 [overlap_opt_shard_grad_in_pipeline]: 1.87001e-06 [control_data_broadcast_order]: 0.00010457 [grouped_pairwise_exchange_alltoall]: 1.77001e-06 [offloading_packed_experts]: 2.548e-05 [overlap_recompute_and_grad_model_parallel]: 2.591e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42e-06 [overlap_recompute_comm]: 2.31998e-06 [overlap_grad_ring_attention]: 2.511e-05 [overlap_grad_flash_sp]: 0.00013863 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.27001e-06 [split_layernorm_comm]: 1.94e-06 [handle_group_info]: 1.18001e-06 [symbol_engine_optimizer]: 0.00042381, [1] [Cycle 1]: 0.00041877, [6] [build]: 2.041e-05 [elim_shapecalc]: 7.351e-05 [elim_not_effective]: 0.00011517 [opt_reshape]: 6.559e-05 [fold_const_symbol]: 0.00011191 [renormalize]: 2.9002e-07 [detach_backward]: 1.96003e-06 [pipeline_parallel_scheduler]: 1.99e-06 [auto_monad_reorder]: 9.708e-05 [get_jit_bprop_graph]: 1.34e-06 [rewriter_after_jit_bprop_graph]: 4.13001e-06 [opt_after_jit_grad]: 0.00069417 [validate]: 0.00016491 [backend_pass]: 1.09e-06 [task_emit]: 2.58768 [execute]: 1.012e-05 Sums bootstrap : 0.000462s : 0.02% type_inference : 0.045115s : 1.68% event_method : 0.000194s : 0.01% auto_monad : 0.000157s : 0.01% graph_reusing : 0.000011s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000042s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000062s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000376s : 0.01% optimize.opt_a.expand_dump_flag : 0.000008s : 0.00% optimize.opt_a.switch_simplify : 0.000382s : 0.01% optimize.opt_a.loop_unroll : 0.000261s : 0.01% optimize.opt_a.a_1 : 0.007224s : 0.27% optimize.opt_a.with_stream_mark : 0.000143s : 0.01% optimize.opt_a.recompute_prepare : 0.000176s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000100s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000093s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000093s : 0.00% optimize.opt_a.parameter_eliminate : 0.000006s : 0.00% optimize.opt_a.a_2 : 0.002626s : 0.10% optimize.opt_a.accelerated_algorithm : 0.000239s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000036s : 0.00% optimize.opt_a.shard_inline : 0.000192s : 0.01% optimize.opt_a.merge_send_recv : 0.000122s : 0.00% optimize.opt_a.auto_parallel : 0.000113s : 0.00% optimize.opt_a.parallel : 0.000027s : 0.00% optimize.opt_a.flash_sp : 0.000014s : 0.00% optimize.opt_a.merge_comm : 0.000110s : 0.00% optimize.opt_a.allreduce_fusion : 0.000105s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000139s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000178s : 0.01% optimize.opt_a.virtual_dataset : 0.000168s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000167s : 0.01% optimize.opt_a.virtual_output : 0.000165s : 0.01% optimize.opt_a.merge_forward : 0.000101s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000131s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000294s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000280s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000114s : 0.00% optimize.opt_a.meta_fg_expand : 0.001688s : 0.06% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000227s : 0.01% optimize.opt_a.a_after_grad : 0.000339s : 0.01% optimize.opt_a.renormalize : 0.016008s : 0.60% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.00% optimize.opt_a.auto_monad_grad : 0.000008s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000347s : 0.01% optimize.opt_a.cse : 0.001138s : 0.04% optimize.opt_a.a_3 : 0.001942s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000220s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000570s : 0.02% optimize.opt_b.b_1 : 0.005727s : 0.21% optimize.opt_b.b_2 : 0.000160s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000109s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000092s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000093s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000558s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000096s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000038s : 0.00% optimize.loop_unroll : 0.000547s : 0.02% optimize.opt_after_cconv.c_1 : 0.000409s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000060s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000047s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000046s : 0.00% optimize.opt_after_cconv.cse : 0.000283s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000517s : 0.02% optimize.tuple_transform.d_1 : 0.000504s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000067s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000249s : 0.01% optimize.cse_after_recomputation.cse : 0.000145s : 0.01% optimize.environ_conv : 0.000033s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000048s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000105s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000025s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000026s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000025s : 0.00% optimize.overlap_grad_flash_sp : 0.000139s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000020s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000074s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000115s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000066s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000112s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000097s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000694s : 0.03% validate : 0.000165s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 2.587680s : 96.48% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.001339 696 6.62% : 0.000089s : 36: substitution.arithmetic_simplify 1.23% : 0.000016s : 52: substitution.elim_not_effective 0.60% : 0.000008s : 11: substitution.float_depend_g_call 1.24% : 0.000017s : 17: substitution.float_tuple_getitem_switch 1.18% : 0.000016s : 52: substitution.fold_const_symbol 2.98% : 0.000040s : 59: substitution.graph_param_transform 0.21% : 0.000003s : 2: substitution.incorporate_call 0.15% : 0.000002s : 2: substitution.incorporate_call_switch 43.50% : 0.000583s : 21: substitution.inline 1.26% : 0.000017s : 2: substitution.inline_without_move 3.23% : 0.000043s : 114: substitution.j_node_and_user_rematch 4.18% : 0.000056s : 10: substitution.less_batch_normalization 1.23% : 0.000016s : 13: substitution.minmaximum_grad 0.64% : 0.000009s : 11: substitution.partial_eliminate 4.74% : 0.000063s : 114: substitution.remove_not_recompute_node 1.64% : 0.000022s : 9: substitution.replace_applicator 0.64% : 0.000009s : 11: substitution.replace_old_param 0.18% : 0.000002s : 1: substitution.set_cell_output_no_recompute 0.89% : 0.000012s : 4: substitution.switch_simplify 1.90% : 0.000025s : 14: substitution.transpose_eliminate 4.52% : 0.000061s : 25: substitution.tuple_list_convert_item_index_to_positive 2.24% : 0.000030s : 25: substitution.tuple_list_get_item_const_eliminator 2.93% : 0.000039s : 25: substitution.tuple_list_get_item_depend_reorder 7.09% : 0.000095s : 40: substitution.tuple_list_get_item_eliminator 2.97% : 0.000040s : 25: substitution.tuple_list_get_set_item_eliminator 2.00% : 0.000027s : 1: substitution.zero_like_fill_zero ------[type_inference.] 0.045023 2 93.65% : 0.042165s : 1: type_inference.infer 6.35% : 0.002858s : 1: type_inference.specialize ------[replace.] 0.000271 33 56.35% : 0.000153s : 21: replace.inline 15.57% : 0.000042s : 4: replace.switch_simplify 22.49% : 0.000061s : 7: replace.tuple_list_get_item_eliminator 5.59% : 0.000015s : 1: replace.zero_like_fill_zero ------[match.] 0.000628 33 90.92% : 0.000571s : 21: match.inline 1.53% : 0.000010s : 4: match.switch_simplify 3.44% : 0.000022s : 7: match.tuple_list_get_item_eliminator 4.10% : 0.000026s : 1: match.zero_like_fill_zero ------[predicate.] 0.003153 24043 0.71% : 0.000022s : 191: predicate.accumulaten_eliminater 0.42% : 0.000013s : 59: predicate.ad_related_special_op_eliminate 0.64% : 0.000020s : 158: predicate.addn_check_dump 0.73% : 0.000023s : 191: predicate.addn_zero_filter 0.71% : 0.000022s : 191: predicate.adjust_all_reduce_mul_add 1.75% : 0.000055s : 349: predicate.arithmetic_simplify 0.75% : 0.000024s : 191: predicate.cast_eliminate 1.08% : 0.000034s : 256: predicate.check_bprop_eliminate 0.64% : 0.000020s : 158: predicate.compare_switch_simplify 0.53% : 0.000017s : 216: predicate.const_output_eliminate 0.65% : 0.000020s : 158: predicate.depend_value_elim 0.82% : 0.000026s : 191: predicate.dict_get_item_const_eliminator 0.84% : 0.000026s : 191: predicate.dict_get_item_eliminator 0.72% : 0.000023s : 191: predicate.dict_set_item_eliminator 1.18% : 0.000037s : 275: predicate.dumpgradient_eliminate 0.14% : 0.000004s : 59: predicate.elim_not_effective 0.29% : 0.000009s : 59: predicate.elim_shapecalc_of_broadcastargs 1.61% : 0.000051s : 407: predicate.environ_add_const_eliminate 1.60% : 0.000050s : 407: predicate.environ_get_add_eliminate 1.61% : 0.000051s : 407: predicate.environ_get_depend_swap 2.25% : 0.000071s : 565: predicate.environ_get_eliminate 1.60% : 0.000051s : 407: predicate.environ_get_set_eliminate 0.85% : 0.000027s : 219: predicate.exchange_switch_depend_value 1.16% : 0.000037s : 219: predicate.float_depend_g_call 0.64% : 0.000020s : 158: predicate.float_environ_get_switch 1.55% : 0.000049s : 374: predicate.float_tuple_getitem_switch 0.13% : 0.000004s : 59: predicate.fold_const_symbol 0.68% : 0.000022s : 158: predicate.get_grad_eliminate 0.19% : 0.000006s : 59: predicate.graph_param_transform 0.65% : 0.000021s : 158: predicate.incorporate_call 0.64% : 0.000020s : 158: predicate.incorporate_call_switch 5.22% : 0.000165s : 1014: predicate.inline 1.00% : 0.000031s : 185: predicate.inline_without_move 0.36% : 0.000011s : 158: predicate.j_node_and_user_rematch 0.75% : 0.000024s : 158: predicate.less_batch_normalization 1.96% : 0.000062s : 473: predicate.list_to_tuple_eliminator_ 2.61% : 0.000082s : 677: predicate.load_eliminater 0.50% : 0.000016s : 72: predicate.loop_unroll_after_grad 1.13% : 0.000036s : 259: predicate.loop_unroll_before_grad 1.91% : 0.000060s : 479: predicate.make_slice_get_slice_eliminator 0.64% : 0.000020s : 158: predicate.merge_addn 1.04% : 0.000033s : 256: predicate.micro_step_allgather_replace 1.05% : 0.000033s : 256: predicate.mini_step_allgather_replace 0.73% : 0.000023s : 191: predicate.minmaximum_grad 0.51% : 0.000016s : 73: predicate.mutable_eliminate 0.27% : 0.000008s : 59: predicate.opt_reshape 0.96% : 0.000030s : 216: predicate.parallel_virtual_node 1.19% : 0.000037s : 219: predicate.partial_defer_inline 1.25% : 0.000040s : 270: predicate.partial_eliminate 0.73% : 0.000023s : 191: predicate.print_const_string_wrapper 0.64% : 0.000020s : 158: predicate.reduce_all_const_elim 0.93% : 0.000029s : 191: predicate.reduce_eliminate 2.58% : 0.000081s : 677: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000011s : 158: predicate.remove_not_recompute_node 1.31% : 0.000041s : 454: predicate.replace_applicator 0.44% : 0.000014s : 185: predicate.replace_old_param 0.54% : 0.000017s : 216: predicate.reset_defer_inline 0.74% : 0.000023s : 191: predicate.reshape_eliminate 1.05% : 0.000033s : 256: predicate.row_tensor_add_zeros_like 0.67% : 0.000021s : 144: predicate.row_tensor_eliminate 1.28% : 0.000040s : 256: predicate.same_eliminate 0.42% : 0.000013s : 158: predicate.set_cell_output_no_recompute 0.71% : 0.000022s : 158: predicate.shard_identity_eliminate 1.21% : 0.000038s : 275: predicate.special_op_eliminate 0.72% : 0.000023s : 158: predicate.specialize_transform 1.11% : 0.000035s : 256: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000027s : 185: predicate.stack_unstack_eliminate 0.32% : 0.000010s : 72: predicate.switch_call_monad_eliminater 0.94% : 0.000030s : 219: predicate.switch_defer_inline 1.96% : 0.000062s : 475: predicate.switch_layer_defer_inline 3.11% : 0.000098s : 703: predicate.switch_simplify 0.74% : 0.000023s : 191: predicate.tile_eliminate 0.76% : 0.000024s : 191: predicate.transpose_eliminate 1.97% : 0.000062s : 466: predicate.tuple_list_convert_item_index_to_positive 2.04% : 0.000064s : 466: predicate.tuple_list_get_item_const_eliminator 1.99% : 0.000063s : 466: predicate.tuple_list_get_item_depend_reorder 2.89% : 0.000091s : 631: predicate.tuple_list_get_item_eliminator 2.00% : 0.000063s : 466: predicate.tuple_list_get_set_item_eliminator 2.69% : 0.000085s : 624: predicate.tuple_list_set_item_eliminator 1.91% : 0.000060s : 473: predicate.tuple_to_list_eliminator_ 2.60% : 0.000082s : 677: predicate.updatestate_pure_node_eliminater 3.34% : 0.000105s : 835: predicate.updatestate_useless_node_eliminater 0.93% : 0.000029s : 216: predicate.value_based_eliminate 0.70% : 0.000022s : 158: predicate.virtual_dataset_eliminate 0.70% : 0.000022s : 158: predicate.virtual_output_eliminate 0.25% : 0.000008s : 59: predicate.virtual_view_grad_eliminate 1.01% : 0.000032s : 218: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004438 51 63.56% : 0.002821s : 26: func_graph_cloner_run.FuncGraphClonerGraph 36.44% : 0.001617s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.776650 292 0.00% : 0.000003s : 1: ForceFp32Comm 0.11% : 0.003142s : 1: add_attr 0.11% : 0.003131s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000255s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000167s : 1: auto_monad 0.00% : 0.000102s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.02% : 0.000494s : 1: bootstrap 0.00% : 0.000042s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000108s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000162s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000037s : 1: environ_conv 0.01% : 0.000206s : 1: event_method 0.00% : 0.000018s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000015s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.02% : 0.000556s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000579s : 1: mutable_eliminate 0.00% : 0.000029s : 1: offloading_packed_experts 0.00% : 0.000099s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000100s : 1: opt.transform.mutable_eliminate 0.53% : 0.014702s : 117: opt.transform.opt_a 0.01% : 0.000407s : 1: opt.transform.opt_after_cconv 0.01% : 0.000200s : 1: opt.transform.opt_after_jit_grad 0.21% : 0.005792s : 83: opt.transform.opt_b 0.02% : 0.000568s : 2: opt.transform.opt_trans_graph 0.01% : 0.000362s : 4: opt.transform.symbol_engine_opt 1.31% : 0.036411s : 1: opt_a 0.03% : 0.000899s : 1: opt_after_cconv 0.03% : 0.000704s : 1: opt_after_jit_grad 0.25% : 0.006847s : 1: opt_b 1.75% : 0.048659s : 1: optimize 0.00% : 0.000101s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000142s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000028s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000029s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000067s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000526s : 1: remove_dup_value 0.39% : 0.010916s : 2: renormalize.infer 0.18% : 0.005075s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000224s : 1: rewriter_after_opt_a 0.01% : 0.000384s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000051s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000427s : 1: symbol_engine_optimizer 93.20% : 2.587708s : 1: task_emit 0.02% : 0.000609s : 1: tuple_transform 1.63% : 0.045130s : 1: type_inference 0.01% : 0.000224s : 1: validate group_cases_16 have all been run, results of sub cases are below: case: ('ge', ) {} pass. case: ('kbk', ) {} pass. case: ('pynative', ) {} pass. case: ('pynative', ) {} pass. case: ('kbk', ) {} pass. case: ('kbk', ) {} pass. case: ('kbk', ) {} pass. case: ('ge', ) {} pass. ops group_cases_17 with 8 cases start to running, all cases are below: case: (, 'ge', ) case: (, 'ge', ) case: (, 'pynative', ) case: (, 'kbk', ) case: (, 'ge', ) case: (, 0, 0) case: (, 0, 2) case: (, 0, -1) ops group_cases_17 total running memory: 32M, memory threshold: 51200M random_generator: generate a numpy.ndarray(shape=(2, 3), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' random_generator: generate a numpy.ndarray(shape=(2, 3), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' random_generator: generate a numpy.ndarray(shape=(2, 3), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' TotalTime = 3.15802, [24] [bootstrap]: 0.00077162 [type_inference]: 0.0447783 [event_method]: 1.77e-05 [auto_monad]: 7.955e-05 [graph_reusing]: 3.95e-06 [inline]: 1.50001e-06 [add_attr]: 0.00673237, [1] [add_attr_with_inline]: 0.00672258, [1] [Cycle 1]: 8.568e-05, [2] [tag_attr]: 2.227e-05 [meta_addattr_fg_expand]: 9.56998e-06 [parallel-infer-symbol]: 1.78002e-06 [pre_auto_parallel]: 3.566e-05 [insert-virtual-dataset]: 1.31002e-06 [parallel-infer-symbol-second]: 7.90023e-07 [dataset_repeat_opt]: 1.22e-06 [pipeline_split]: 8.60018e-07 [optimize]: 0.00498484, [53] [py_interpret_to_execute]: 3.7e-06 [rewriter_before_opt_a]: 0.00017964 [opt_a]: 0.00275258, [2] [Cycle 1]: 0.00216269, [45] [expand_dump_flag]: 2.09e-06 [switch_simplify]: 6.16e-05 [loop_unroll]: 2.94e-05 [a_1]: 0.00049752 [with_stream_mark]: 1.102e-05 [recompute_prepare]: 7.06999e-06 [updatestate_depend_eliminate]: 8.10999e-06 [updatestate_assign_eliminate]: 6.71e-06 [updatestate_loads_eliminate]: 2.12999e-06 [parameter_eliminate]: 1.25999e-06 [a_2]: 6.778e-05 [accelerated_algorithm]: 5.91998e-06 [shard]: 9.70002e-07 [meta_shard_fg_expand]: 1.37999e-06 [shard_inline]: 5.66e-06 [merge_send_recv]: 2.595e-05 [auto_parallel]: 6.26998e-06 [parallel]: 5.616e-05 [flash_sp]: 3.427e-05 [merge_comm]: 4.13001e-06 [allreduce_fusion]: 1.496e-05 [matmul_add_comm_reduction]: 2.074e-05 [allreduce_slice_to_reducescatter]: 1.146e-05 [virtual_shard_identity]: 8.99998e-06 [virtual_dataset]: 6.14999e-06 [get_grad_eliminate_]: 5.43002e-06 [virtual_output]: 5.64e-06 [merge_forward]: 3.71001e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 1.809e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.359e-05 [merge_recompute_call_nodes]: 1.28002e-06 [before_grad]: 9.05999e-06 [set_forward_comm_id_for_comm_node_pass]: 1.348e-05 [meta_fg_expand]: 2.89999e-06 [flash_sp_send_recv_attached]: 2.31998e-06 [receive_attached]: 2.25e-05 [after_resolve]: 8.80999e-06 [a_after_grad]: 7.87e-06 [renormalize]: 0.0007819 [add_forward_monad_depend]: 5.90002e-06 [auto_monad_grad]: 2.20002e-06 [auto_monad_eliminator]: 2.346e-05 [cse]: 5.589e-05 [a_3]: 4.027e-05 [Cycle 2]: 0.00058061, [45] [expand_dump_flag]: 1.50001e-06 [switch_simplify]: 1.936e-05 [loop_unroll]: 6.02999e-06 [a_1]: 9.971e-05 [with_stream_mark]: 1.14e-05 [recompute_prepare]: 5.69e-06 [updatestate_depend_eliminate]: 3.09999e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.50002e-06 [parameter_eliminate]: 1.00001e-06 [a_2]: 6.078e-05 [accelerated_algorithm]: 5.10999e-06 [shard]: 1.22e-06 [meta_shard_fg_expand]: 1.32999e-06 [shard_inline]: 5.32001e-06 [merge_send_recv]: 4.2e-06 [auto_parallel]: 5.52001e-06 [parallel]: 4.25999e-06 [flash_sp]: 7.61001e-06 [merge_comm]: 2.83e-06 [allreduce_fusion]: 2.54001e-06 [matmul_add_comm_reduction]: 5.46e-06 [allreduce_slice_to_reducescatter]: 3.7998e-07 [virtual_shard_identity]: 5.85002e-06 [virtual_dataset]: 5.17e-06 [get_grad_eliminate_]: 4.83001e-06 [virtual_output]: 5.00999e-06 [merge_forward]: 2.63e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 6.25002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.953e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 8.42e-06 [set_forward_comm_id_for_comm_node_pass]: 3.20998e-06 [meta_fg_expand]: 1.77001e-06 [flash_sp_send_recv_attached]: 9.49978e-07 [receive_attached]: 1.10999e-06 [after_resolve]: 8.06001e-06 [a_after_grad]: 7.18e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.38002e-06 [auto_monad_grad]: 9.29984e-07 [auto_monad_eliminator]: 6.28e-06 [cse]: 1.251e-05 [a_3]: 3.024e-05 [py_interpret_to_execute_after_opt_a]: 4.05e-06 [slice_cell_reuse_recomputed_activation]: 4.99998e-06 [rewriter_after_opt_a]: 2.394e-05 [convert_after_rewriter]: 1.29e-06 [order_py_execute_after_rewriter]: 1.20999e-06 [mutable_eliminate]: 0.00054399 [opt_b]: 0.00017632, [1] [Cycle 1]: 0.00017, [7] [b_1]: 0.00010189 [b_2]: 7.22002e-06 [updatestate_depend_eliminate]: 5.24e-06 [updatestate_assign_eliminate]: 2.26998e-06 [updatestate_loads_eliminate]: 2.14999e-06 [renormalize]: 4.50003e-07 [cse]: 1.867e-05 [optimize_parallel_all_gather_comm]: 2.859e-05 [overlap_param_gather]: 1.377e-05 [cconv]: 2.484e-05 [loop_unroll]: 0.00041058 [opt_after_cconv]: 9.103e-05, [1] [Cycle 1]: 8.512e-05, [7] [c_1]: 2.415e-05 [parameter_eliminate]: 3.06999e-06 [updatestate_depend_eliminate]: 4.99e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.06e-06 [cse]: 1.615e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 1.34e-05 [tuple_transform]: 6.363e-05, [1] [Cycle 1]: 5.957e-05, [4] [d_1]: 3.465e-05 [none_parameter_eliminate]: 1.15001e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 5.94e-06 [partial_unused_args_eliminate]: 1.66998e-06 [add_recomputation]: 6.057e-05 [cse_after_recomputation]: 2.026e-05, [1] [Cycle 1]: 1.636e-05, [1] [cse]: 1.113e-05 [environ_conv]: 1.378e-05 [swap_dp_allreduce_reducescatter]: 2.791e-05 [bias_add_comm_swap]: 1.347e-05 [label_micro_interleaved_index]: 1.497e-05 [label_fine_grained_interleaved_index]: 2.64001e-06 [merge_cast_opt]: 1.67999e-06 [slice_recompute_activation]: 1.77999e-06 [micro_interleaved_order_control]: 2.04999e-06 [assign_add_opt]: 1.14e-06 [ForceFp32Comm]: 5.11002e-06 [remove_cast_before_assign_add]: 7.18998e-06 [full_micro_interleaved_order_control]: 1.122e-05 [reorder_send_recv_between_fp_bp]: 2.26e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.00001e-06 [interleave_parallel_branches]: 1.318e-05 [overlap_opt_shard_in_pipeline]: 2.303e-05 [overlap_opt_shard_grad_in_pipeline]: 9.09989e-07 [control_data_broadcast_order]: 1.174e-05 [grouped_pairwise_exchange_alltoall]: 1.30001e-06 [offloading_packed_experts]: 3.53e-06 [overlap_recompute_and_grad_model_parallel]: 1.217e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.35999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.20001e-06 [overlap_recompute_comm]: 2.12001e-06 [overlap_grad_ring_attention]: 2.609e-05 [overlap_grad_flash_sp]: 5.099e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 1.532e-05 [split_layernorm_comm]: 7.59988e-07 [handle_group_info]: 4.68001e-06 [symbol_engine_optimizer]: 6.826e-05, [1] [Cycle 1]: 6.415e-05, [6] [build]: 1.76e-06 [elim_shapecalc]: 9.61003e-06 [elim_not_effective]: 1.094e-05 [opt_reshape]: 6.06e-06 [fold_const_symbol]: 8.76002e-06 [renormalize]: 2.00002e-07 [detach_backward]: 1.05999e-06 [pipeline_parallel_scheduler]: 9.10019e-07 [auto_monad_reorder]: 2.049e-05 [get_jit_bprop_graph]: 1.49998e-06 [rewriter_after_jit_bprop_graph]: 3.11001e-06 [opt_after_jit_grad]: 0.00044045 [validate]: 5.66e-05 [backend_pass]: 8.09989e-07 [task_emit]: 3.0998 [execute]: 1.084e-05 Sums bootstrap : 0.000772s : 0.02% type_inference : 0.044778s : 1.42% event_method : 0.000018s : 0.00% auto_monad : 0.000080s : 0.00% graph_reusing : 0.000004s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000010s : 0.00% parallel-infer-symbol : 0.000002s : 0.00% pre_auto_parallel : 0.000036s : 0.00% insert-virtual-dataset : 0.000001s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000001s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000180s : 0.01% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000081s : 0.00% optimize.opt_a.loop_unroll : 0.000035s : 0.00% optimize.opt_a.a_1 : 0.000597s : 0.02% optimize.opt_a.with_stream_mark : 0.000022s : 0.00% optimize.opt_a.recompute_prepare : 0.000013s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000129s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.00% optimize.opt_a.shard : 0.000002s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.00% optimize.opt_a.merge_send_recv : 0.000030s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000060s : 0.00% optimize.opt_a.flash_sp : 0.000042s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000018s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000012s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.00% optimize.opt_a.virtual_dataset : 0.000011s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000010s : 0.00% optimize.opt_a.virtual_output : 0.000011s : 0.00% optimize.opt_a.merge_forward : 0.000006s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000043s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000017s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000017s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000024s : 0.00% optimize.opt_a.after_resolve : 0.000017s : 0.00% optimize.opt_a.a_after_grad : 0.000015s : 0.00% optimize.opt_a.renormalize : 0.000782s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.00% optimize.opt_a.cse : 0.000068s : 0.00% optimize.opt_a.a_3 : 0.000071s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000024s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000544s : 0.02% optimize.opt_b.b_1 : 0.000102s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.00% optimize.overlap_param_gather : 0.000014s : 0.00% optimize.cconv : 0.000025s : 0.00% optimize.loop_unroll : 0.000411s : 0.01% optimize.opt_after_cconv.c_1 : 0.000024s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000016s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.00% optimize.tuple_transform.d_1 : 0.000035s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000061s : 0.00% optimize.cse_after_recomputation.cse : 0.000011s : 0.00% optimize.environ_conv : 0.000014s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000028s : 0.00% optimize.bias_add_comm_swap : 0.000013s : 0.00% optimize.label_micro_interleaved_index : 0.000015s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000005s : 0.00% optimize.remove_cast_before_assign_add : 0.000007s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000013s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000023s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000001s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000026s : 0.00% optimize.overlap_grad_flash_sp : 0.000051s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000015s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000005s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000011s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000020s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000440s : 0.01% validate : 0.000057s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 3.099802s : 98.40% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000156 24 0.74% : 0.000001s : 2: substitution.elim_not_effective 0.65% : 0.000001s : 2: substitution.fold_const_symbol 3.15% : 0.000005s : 3: substitution.graph_param_transform 68.98% : 0.000108s : 5: substitution.inline 2.08% : 0.000003s : 4: substitution.j_node_and_user_rematch 14.53% : 0.000023s : 4: substitution.remove_not_recompute_node 1.83% : 0.000003s : 2: substitution.replace_old_param 8.04% : 0.000013s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.044713 2 97.65% : 0.043662s : 1: type_inference.infer 2.35% : 0.001051s : 1: type_inference.specialize ------[replace.] 0.000054 7 73.30% : 0.000039s : 5: replace.inline 26.70% : 0.000014s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000116 7 90.16% : 0.000105s : 5: match.inline 9.84% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000163 1031 0.97% : 0.000002s : 11: predicate.accumulaten_eliminater 0.88% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 6: predicate.addn_check_dump 0.95% : 0.000002s : 11: predicate.addn_zero_filter 0.85% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.56% : 0.000004s : 17: predicate.arithmetic_simplify 0.94% : 0.000002s : 11: predicate.cast_eliminate 0.60% : 0.000001s : 6: predicate.check_bprop_eliminate 0.49% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.49% : 0.000001s : 6: predicate.depend_value_elim 0.92% : 0.000001s : 11: predicate.dict_get_item_const_eliminator 1.04% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 11: predicate.dict_set_item_eliminator 1.01% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 3: predicate.elim_not_effective 0.65% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.14% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.13% : 0.000002s : 14: predicate.environ_get_depend_swap 1.65% : 0.000003s : 20: predicate.environ_get_eliminate 1.08% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.49% : 0.000002s : 18: predicate.exchange_switch_depend_value 2.37% : 0.000004s : 18: predicate.float_depend_g_call 0.51% : 0.000001s : 6: predicate.float_environ_get_switch 0.73% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 3: predicate.fold_const_symbol 0.69% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.52% : 0.000001s : 6: predicate.incorporate_call 0.47% : 0.000001s : 6: predicate.incorporate_call_switch 5.83% : 0.000009s : 47: predicate.inline 0.65% : 0.000001s : 6: predicate.inline_without_move 0.33% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.80% : 0.000001s : 6: predicate.less_batch_normalization 1.74% : 0.000003s : 19: predicate.list_to_tuple_eliminator_ 2.42% : 0.000004s : 30: predicate.load_eliminater 0.98% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.81% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.54% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.59% : 0.000001s : 6: predicate.merge_addn 0.56% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 6: predicate.mini_step_allgather_replace 1.06% : 0.000002s : 11: predicate.minmaximum_grad 1.49% : 0.000002s : 3: predicate.mutable_eliminate 0.38% : 0.000001s : 3: predicate.opt_reshape 0.31% : 0.000001s : 3: predicate.parallel_virtual_node 1.97% : 0.000003s : 18: predicate.partial_defer_inline 1.44% : 0.000002s : 16: predicate.partial_eliminate 0.94% : 0.000002s : 11: predicate.print_const_string_wrapper 0.53% : 0.000001s : 6: predicate.reduce_all_const_elim 1.27% : 0.000002s : 11: predicate.reduce_eliminate 2.51% : 0.000004s : 30: predicate.redundant_stop_gradient_eliminater 0.49% : 0.000001s : 6: predicate.remove_not_recompute_node 1.29% : 0.000002s : 19: predicate.replace_applicator 0.44% : 0.000001s : 6: predicate.replace_old_param 0.39% : 0.000001s : 3: predicate.reset_defer_inline 1.19% : 0.000002s : 11: predicate.reshape_eliminate 0.57% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 3: predicate.row_tensor_eliminate 0.72% : 0.000001s : 6: predicate.same_eliminate 0.38% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.76% : 0.000001s : 6: predicate.shard_identity_eliminate 0.56% : 0.000001s : 6: predicate.special_op_eliminate 0.73% : 0.000001s : 6: predicate.specialize_transform 0.77% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.42% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.58% : 0.000003s : 18: predicate.switch_defer_inline 2.10% : 0.000003s : 24: predicate.switch_layer_defer_inline 5.55% : 0.000009s : 61: predicate.switch_simplify 0.90% : 0.000001s : 11: predicate.tile_eliminate 0.96% : 0.000002s : 11: predicate.transpose_eliminate 1.67% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000003s : 17: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.19% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.44% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.93% : 0.000003s : 19: predicate.tuple_to_list_eliminator_ 2.28% : 0.000004s : 30: predicate.updatestate_pure_node_eliminater 2.88% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 3: predicate.value_based_eliminate 0.68% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.66% : 0.000001s : 6: predicate.virtual_output_eliminate 0.28% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000634 13 52.07% : 0.000330s : 6: func_graph_cloner_run.FuncGraphClonerGraph 47.93% : 0.000304s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.171643 196 0.00% : 0.000008s : 1: ForceFp32Comm 0.21% : 0.006737s : 1: add_attr 0.21% : 0.006726s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000065s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000086s : 1: auto_monad 0.00% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000016s : 1: bias_add_comm_swap 0.03% : 0.000813s : 1: bootstrap 0.00% : 0.000028s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000023s : 1: cse_after_recomputation 0.00% : 0.000004s : 1: dataset_repeat_opt 0.00% : 0.000004s : 1: detach_backward 0.00% : 0.000018s : 1: environ_conv 0.00% : 0.000023s : 1: event_method 0.00% : 0.000020s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000008s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000004s : 1: inline 0.00% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000016s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000018s : 1: label_micro_interleaved_index 0.01% : 0.000419s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000553s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000015s : 1: opt.transform.mutable_eliminate 0.03% : 0.001007s : 78: opt.transform.opt_a 0.00% : 0.000023s : 1: opt.transform.opt_after_cconv 0.00% : 0.000021s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000083s : 28: opt.transform.opt_b 0.00% : 0.000039s : 2: opt.transform.opt_trans_graph 0.00% : 0.000032s : 4: opt.transform.symbol_engine_opt 0.09% : 0.002755s : 1: opt_a 0.00% : 0.000094s : 1: opt_after_cconv 0.01% : 0.000449s : 1: opt_after_jit_grad 0.01% : 0.000179s : 1: opt_b 0.16% : 0.004989s : 1: optimize 0.00% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000054s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000029s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000027s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000017s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000005s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000040s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000010s : 1: remove_cast_before_assign_add 0.00% : 0.000017s : 1: remove_dup_value 0.01% : 0.000437s : 1: renormalize.infer 0.01% : 0.000337s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000027s : 1: rewriter_after_opt_a 0.01% : 0.000185s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000018s : 1: split_matmul_comm_elemetwise 0.00% : 0.000031s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000071s : 1: symbol_engine_optimizer 97.74% : 3.099854s : 1: task_emit 0.00% : 0.000067s : 1: tuple_transform 1.41% : 0.044794s : 1: type_inference 0.00% : 0.000085s : 1: validate TotalTime = 3.18864, [24] [bootstrap]: 0.00077504 [type_inference]: 0.0448074 [event_method]: 1.73e-05 [auto_monad]: 8.619e-05 [graph_reusing]: 3.78001e-06 [inline]: 1.58002e-06 [add_attr]: 0.00687621, [1] [add_attr_with_inline]: 0.00686592, [1] [Cycle 1]: 7.55e-05, [2] [tag_attr]: 1.976e-05 [meta_addattr_fg_expand]: 9.29e-06 [parallel-infer-symbol]: 1.76003e-06 [pre_auto_parallel]: 3.836e-05 [insert-virtual-dataset]: 1.05999e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 1.03001e-06 [pipeline_split]: 7.39994e-07 [optimize]: 0.00480394, [53] [py_interpret_to_execute]: 3.48e-06 [rewriter_before_opt_a]: 0.00017772 [opt_a]: 0.00257482, [2] [Cycle 1]: 0.00198666, [45] [expand_dump_flag]: 1.63002e-06 [switch_simplify]: 5.505e-05 [loop_unroll]: 2.912e-05 [a_1]: 0.00050584 [with_stream_mark]: 1.984e-05 [recompute_prepare]: 6.71999e-06 [updatestate_depend_eliminate]: 6.54001e-06 [updatestate_assign_eliminate]: 5.79e-06 [updatestate_loads_eliminate]: 2.14999e-06 [parameter_eliminate]: 8.2e-07 [a_2]: 6.856e-05 [accelerated_algorithm]: 6.73e-06 [shard]: 8.00006e-07 [meta_shard_fg_expand]: 1.14e-06 [shard_inline]: 6.01e-06 [merge_send_recv]: 2.265e-05 [auto_parallel]: 5.16998e-06 [parallel]: 5.847e-05 [flash_sp]: 1.975e-05 [merge_comm]: 3.61001e-06 [allreduce_fusion]: 8.67998e-06 [matmul_add_comm_reduction]: 9.34998e-06 [allreduce_slice_to_reducescatter]: 6.24001e-06 [virtual_shard_identity]: 8.52e-06 [virtual_dataset]: 5.99e-06 [get_grad_eliminate_]: 5.47001e-06 [virtual_output]: 5.30999e-06 [merge_forward]: 2.83e-06 [cell_reuse_recompute_pass]: 4.68999e-06 [offload_activation]: 1.406e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.568e-05 [merge_recompute_call_nodes]: 6.69999e-07 [before_grad]: 9.35001e-06 [set_forward_comm_id_for_comm_node_pass]: 7.56999e-06 [meta_fg_expand]: 2.43998e-06 [flash_sp_send_recv_attached]: 1.36002e-06 [receive_attached]: 8.75001e-06 [after_resolve]: 8.69003e-06 [a_after_grad]: 8.25e-06 [renormalize]: 0.0006744 [add_forward_monad_depend]: 4.31002e-06 [auto_monad_grad]: 1.22999e-06 [auto_monad_eliminator]: 1.961e-05 [cse]: 5.626e-05 [a_3]: 4.078e-05 [Cycle 2]: 0.00057864, [45] [expand_dump_flag]: 1.07e-06 [switch_simplify]: 3.653e-05 [loop_unroll]: 5.81e-06 [a_1]: 9.576e-05 [with_stream_mark]: 9.04003e-06 [recompute_prepare]: 5.22999e-06 [updatestate_depend_eliminate]: 2.63e-06 [updatestate_assign_eliminate]: 2.17001e-06 [updatestate_loads_eliminate]: 2.43e-06 [parameter_eliminate]: 8.30012e-07 [a_2]: 5.959e-05 [accelerated_algorithm]: 5.24998e-06 [shard]: 1.14e-06 [meta_shard_fg_expand]: 1.15999e-06 [shard_inline]: 5.00999e-06 [merge_send_recv]: 4.45999e-06 [auto_parallel]: 5.11002e-06 [parallel]: 4.05e-06 [flash_sp]: 6.83e-06 [merge_comm]: 3.00998e-06 [allreduce_fusion]: 2.46e-06 [matmul_add_comm_reduction]: 3.88001e-06 [allreduce_slice_to_reducescatter]: 2.80008e-07 [virtual_shard_identity]: 5.90002e-06 [virtual_dataset]: 5.02999e-06 [get_grad_eliminate_]: 5.42999e-06 [virtual_output]: 4.87998e-06 [merge_forward]: 2.49001e-06 [cell_reuse_recompute_pass]: 1.08001e-06 [offload_activation]: 4.70001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.878e-05 [merge_recompute_call_nodes]: 5.50004e-07 [before_grad]: 8.22e-06 [set_forward_comm_id_for_comm_node_pass]: 3.11001e-06 [meta_fg_expand]: 1.74998e-06 [flash_sp_send_recv_attached]: 7.10017e-07 [receive_attached]: 7.50006e-07 [after_resolve]: 7.3e-06 [a_after_grad]: 7.16001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 9.39996e-07 [auto_monad_grad]: 6.89994e-07 [auto_monad_eliminator]: 5.47001e-06 [cse]: 1.235e-05 [a_3]: 2.964e-05 [py_interpret_to_execute_after_opt_a]: 4.84998e-06 [slice_cell_reuse_recomputed_activation]: 1.52001e-06 [rewriter_after_opt_a]: 2.823e-05 [convert_after_rewriter]: 1.14998e-06 [order_py_execute_after_rewriter]: 1.03001e-06 [mutable_eliminate]: 0.00054378 [opt_b]: 0.0001767, [1] [Cycle 1]: 0.00017004, [7] [b_1]: 0.00010192 [b_2]: 6.41e-06 [updatestate_depend_eliminate]: 5.72999e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.29999e-06 [renormalize]: 4.19997e-07 [cse]: 1.926e-05 [optimize_parallel_all_gather_comm]: 2.809e-05 [overlap_param_gather]: 1.393e-05 [cconv]: 2.504e-05 [loop_unroll]: 0.00041412 [opt_after_cconv]: 8.928e-05, [1] [Cycle 1]: 8.399e-05, [7] [c_1]: 2.395e-05 [parameter_eliminate]: 2.58e-06 [updatestate_depend_eliminate]: 4.89e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.21e-06 [cse]: 1.638e-05 [renormalize]: 3.29979e-07 [remove_dup_value]: 1.118e-05 [tuple_transform]: 6.788e-05, [1] [Cycle 1]: 6.379e-05, [4] [d_1]: 3.894e-05 [none_parameter_eliminate]: 7.39994e-07 [renormalize]: 1.19995e-07 [switch_simplify]: 5.96e-06 [partial_unused_args_eliminate]: 8.30012e-07 [add_recomputation]: 5.676e-05 [cse_after_recomputation]: 1.946e-05, [1] [Cycle 1]: 1.512e-05, [1] [cse]: 9.87999e-06 [environ_conv]: 1.524e-05 [swap_dp_allreduce_reducescatter]: 2.828e-05 [bias_add_comm_swap]: 1.303e-05 [label_micro_interleaved_index]: 1.509e-05 [label_fine_grained_interleaved_index]: 5.04e-06 [merge_cast_opt]: 7.59988e-07 [slice_recompute_activation]: 6.29982e-07 [micro_interleaved_order_control]: 1.50999e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 6.69999e-07 [remove_cast_before_assign_add]: 1.121e-05 [full_micro_interleaved_order_control]: 1.206e-05 [reorder_send_recv_between_fp_bp]: 2.16998e-06 [comm_op_add_attrs]: 8.80013e-07 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.09e-06 [interleave_parallel_branches]: 1.297e-05 [overlap_opt_shard_in_pipeline]: 1.903e-05 [overlap_opt_shard_grad_in_pipeline]: 1.97999e-06 [control_data_broadcast_order]: 1.522e-05 [grouped_pairwise_exchange_alltoall]: 6.69001e-06 [offloading_packed_experts]: 2.84999e-06 [overlap_recompute_and_grad_model_parallel]: 1.437e-05 [overlap_grad_matmul_and_grad_allreduce]: 8.00006e-07 [overlap_recompute_allgather_and_fa_grad]: 6.89994e-07 [overlap_recompute_comm]: 5.05001e-06 [overlap_grad_ring_attention]: 1.65e-05 [overlap_grad_flash_sp]: 5.033e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 1.227e-05 [split_layernorm_comm]: 1.65001e-06 [handle_group_info]: 4.85999e-06 [symbol_engine_optimizer]: 6.816e-05, [1] [Cycle 1]: 6.383e-05, [6] [build]: 3.04999e-06 [elim_shapecalc]: 8.22e-06 [elim_not_effective]: 1.13e-05 [opt_reshape]: 5.86e-06 [fold_const_symbol]: 8.38001e-06 [renormalize]: 1.80007e-07 [detach_backward]: 1.80001e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 2.097e-05 [get_jit_bprop_graph]: 1.20999e-06 [rewriter_after_jit_bprop_graph]: 3.11999e-06 [opt_after_jit_grad]: 0.00044763 [validate]: 4.86e-05 [backend_pass]: 8.80013e-07 [task_emit]: 3.13042 [execute]: 1.104e-05 Sums bootstrap : 0.000775s : 0.02% type_inference : 0.044807s : 1.41% event_method : 0.000017s : 0.00% auto_monad : 0.000086s : 0.00% graph_reusing : 0.000004s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.00% parallel-infer-symbol : 0.000002s : 0.00% pre_auto_parallel : 0.000038s : 0.00% insert-virtual-dataset : 0.000001s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000001s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000003s : 0.00% optimize.rewriter_before_opt_a : 0.000178s : 0.01% optimize.opt_a.expand_dump_flag : 0.000003s : 0.00% optimize.opt_a.switch_simplify : 0.000092s : 0.00% optimize.opt_a.loop_unroll : 0.000035s : 0.00% optimize.opt_a.a_1 : 0.000602s : 0.02% optimize.opt_a.with_stream_mark : 0.000029s : 0.00% optimize.opt_a.recompute_prepare : 0.000012s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000128s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.00% optimize.opt_a.shard : 0.000002s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000002s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.00% optimize.opt_a.merge_send_recv : 0.000027s : 0.00% optimize.opt_a.auto_parallel : 0.000010s : 0.00% optimize.opt_a.parallel : 0.000063s : 0.00% optimize.opt_a.flash_sp : 0.000027s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000011s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000013s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000007s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.00% optimize.opt_a.virtual_dataset : 0.000011s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.00% optimize.opt_a.virtual_output : 0.000010s : 0.00% optimize.opt_a.merge_forward : 0.000005s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000001s : 0.00% optimize.opt_a.before_grad : 0.000018s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.00% optimize.opt_a.meta_fg_expand : 0.000004s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000002s : 0.00% optimize.opt_a.receive_attached : 0.000010s : 0.00% optimize.opt_a.after_resolve : 0.000016s : 0.00% optimize.opt_a.a_after_grad : 0.000015s : 0.00% optimize.opt_a.renormalize : 0.000674s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000005s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.00% optimize.opt_a.cse : 0.000069s : 0.00% optimize.opt_a.a_3 : 0.000070s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000544s : 0.02% optimize.opt_b.b_1 : 0.000102s : 0.00% optimize.opt_b.b_2 : 0.000006s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.00% optimize.overlap_param_gather : 0.000014s : 0.00% optimize.cconv : 0.000025s : 0.00% optimize.loop_unroll : 0.000414s : 0.01% optimize.opt_after_cconv.c_1 : 0.000024s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000016s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000011s : 0.00% optimize.tuple_transform.d_1 : 0.000039s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000057s : 0.00% optimize.cse_after_recomputation.cse : 0.000010s : 0.00% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000028s : 0.00% optimize.bias_add_comm_swap : 0.000013s : 0.00% optimize.label_micro_interleaved_index : 0.000015s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000001s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000011s : 0.00% optimize.full_micro_interleaved_order_control : 0.000012s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000013s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000019s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000007s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000014s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000005s : 0.00% optimize.overlap_grad_ring_attention : 0.000016s : 0.00% optimize.overlap_grad_flash_sp : 0.000050s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000012s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000005s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000008s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000011s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000008s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000448s : 0.01% validate : 0.000049s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 3.130422s : 98.42% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000165 24 1.04% : 0.000002s : 2: substitution.elim_not_effective 0.72% : 0.000001s : 2: substitution.fold_const_symbol 5.89% : 0.000010s : 3: substitution.graph_param_transform 72.95% : 0.000120s : 5: substitution.inline 1.77% : 0.000003s : 4: substitution.j_node_and_user_rematch 9.84% : 0.000016s : 4: substitution.remove_not_recompute_node 1.34% : 0.000002s : 2: substitution.replace_old_param 6.46% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.044744 2 97.64% : 0.043688s : 1: type_inference.infer 2.36% : 0.001056s : 1: type_inference.specialize ------[replace.] 0.000051 7 72.65% : 0.000037s : 5: replace.inline 27.35% : 0.000014s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000127 7 92.54% : 0.000117s : 5: match.inline 7.46% : 0.000009s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000162 1031 0.95% : 0.000002s : 11: predicate.accumulaten_eliminater 0.82% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 6: predicate.addn_check_dump 1.04% : 0.000002s : 11: predicate.addn_zero_filter 0.85% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.29% : 0.000004s : 17: predicate.arithmetic_simplify 1.02% : 0.000002s : 11: predicate.cast_eliminate 0.56% : 0.000001s : 6: predicate.check_bprop_eliminate 0.54% : 0.000001s : 6: predicate.compare_switch_simplify 0.17% : 0.000000s : 3: predicate.const_output_eliminate 0.57% : 0.000001s : 6: predicate.depend_value_elim 0.97% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.17% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 11: predicate.dict_set_item_eliminator 1.02% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 3: predicate.elim_not_effective 0.36% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.12% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 14: predicate.environ_get_depend_swap 1.66% : 0.000003s : 20: predicate.environ_get_eliminate 1.09% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.49% : 0.000002s : 18: predicate.exchange_switch_depend_value 2.09% : 0.000003s : 18: predicate.float_depend_g_call 0.52% : 0.000001s : 6: predicate.float_environ_get_switch 0.71% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 3: predicate.fold_const_symbol 0.65% : 0.000001s : 6: predicate.get_grad_eliminate 0.30% : 0.000000s : 3: predicate.graph_param_transform 0.53% : 0.000001s : 6: predicate.incorporate_call 0.48% : 0.000001s : 6: predicate.incorporate_call_switch 5.83% : 0.000009s : 47: predicate.inline 0.68% : 0.000001s : 6: predicate.inline_without_move 0.32% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.91% : 0.000001s : 6: predicate.less_batch_normalization 1.82% : 0.000003s : 19: predicate.list_to_tuple_eliminator_ 2.65% : 0.000004s : 30: predicate.load_eliminater 1.22% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.82% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.66% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 6: predicate.merge_addn 0.51% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.86% : 0.000001s : 11: predicate.minmaximum_grad 1.26% : 0.000002s : 3: predicate.mutable_eliminate 0.33% : 0.000001s : 3: predicate.opt_reshape 0.36% : 0.000001s : 3: predicate.parallel_virtual_node 1.90% : 0.000003s : 18: predicate.partial_defer_inline 1.46% : 0.000002s : 16: predicate.partial_eliminate 0.95% : 0.000002s : 11: predicate.print_const_string_wrapper 0.55% : 0.000001s : 6: predicate.reduce_all_const_elim 1.27% : 0.000002s : 11: predicate.reduce_eliminate 2.38% : 0.000004s : 30: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 6: predicate.remove_not_recompute_node 1.26% : 0.000002s : 19: predicate.replace_applicator 0.40% : 0.000001s : 6: predicate.replace_old_param 0.36% : 0.000001s : 3: predicate.reset_defer_inline 1.03% : 0.000002s : 11: predicate.reshape_eliminate 0.57% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 3: predicate.row_tensor_eliminate 0.82% : 0.000001s : 6: predicate.same_eliminate 0.41% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.99% : 0.000002s : 6: predicate.shard_identity_eliminate 0.66% : 0.000001s : 6: predicate.special_op_eliminate 0.69% : 0.000001s : 6: predicate.specialize_transform 0.82% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.64% : 0.000003s : 18: predicate.switch_defer_inline 2.16% : 0.000003s : 24: predicate.switch_layer_defer_inline 5.57% : 0.000009s : 61: predicate.switch_simplify 0.95% : 0.000002s : 11: predicate.tile_eliminate 1.04% : 0.000002s : 11: predicate.transpose_eliminate 1.60% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000002s : 17: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000005s : 25: predicate.tuple_list_get_item_eliminator 1.48% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.71% : 0.000003s : 19: predicate.tuple_to_list_eliminator_ 2.31% : 0.000004s : 30: predicate.updatestate_pure_node_eliminater 3.01% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 3: predicate.value_based_eliminate 0.65% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 6: predicate.virtual_output_eliminate 0.25% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000600 13 52.66% : 0.000316s : 6: func_graph_cloner_run.FuncGraphClonerGraph 47.34% : 0.000284s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.202124 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.21% : 0.006880s : 1: add_attr 0.21% : 0.006869s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000061s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000092s : 1: auto_monad 0.00% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000016s : 1: bias_add_comm_swap 0.03% : 0.000817s : 1: bootstrap 0.00% : 0.000029s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000022s : 1: cse_after_recomputation 0.00% : 0.000004s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000019s : 1: environ_conv 0.00% : 0.000022s : 1: event_method 0.00% : 0.000023s : 1: execute 0.00% : 0.000015s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000007s : 1: graph_reusing 0.00% : 0.000009s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000008s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000004s : 1: insert-virtual-dataset 0.00% : 0.000016s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000018s : 1: label_micro_interleaved_index 0.01% : 0.000422s : 1: loop_unroll 0.00% : 0.000003s : 1: merge_cast_opt 0.00% : 0.000004s : 1: micro_interleaved_order_control 0.02% : 0.000553s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.03% : 0.001013s : 78: opt.transform.opt_a 0.00% : 0.000023s : 1: opt.transform.opt_after_cconv 0.00% : 0.000021s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000082s : 28: opt.transform.opt_b 0.00% : 0.000043s : 2: opt.transform.opt_trans_graph 0.00% : 0.000030s : 4: opt.transform.symbol_engine_opt 0.08% : 0.002578s : 1: opt_a 0.00% : 0.000093s : 1: opt_after_cconv 0.01% : 0.000457s : 1: opt_after_jit_grad 0.01% : 0.000180s : 1: opt_b 0.15% : 0.004808s : 1: optimize 0.00% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000054s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000020s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000023s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000017s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000008s : 1: overlap_recompute_comm 0.00% : 0.000005s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000042s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000014s : 1: remove_cast_before_assign_add 0.00% : 0.000014s : 1: remove_dup_value 0.01% : 0.000363s : 1: renormalize.infer 0.01% : 0.000306s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000032s : 1: rewriter_after_opt_a 0.01% : 0.000183s : 1: rewriter_before_opt_a 0.00% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000003s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000015s : 1: split_matmul_comm_elemetwise 0.00% : 0.000031s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000071s : 1: symbol_engine_optimizer 97.76% : 3.130465s : 1: task_emit 0.00% : 0.000071s : 1: tuple_transform 1.40% : 0.044822s : 1: type_inference 0.00% : 0.000077s : 1: validate TotalTime = 3.24916, [24] [bootstrap]: 0.00076974 [type_inference]: 0.044807 [event_method]: 1.73e-05 [auto_monad]: 8.601e-05 [graph_reusing]: 3.90998e-06 [inline]: 2.19999e-06 [add_attr]: 0.00687588, [1] [add_attr_with_inline]: 0.0068652, [1] [Cycle 1]: 8.362e-05, [2] [tag_attr]: 2.236e-05 [meta_addattr_fg_expand]: 7.91001e-06 [parallel-infer-symbol]: 1.87999e-06 [pre_auto_parallel]: 3.741e-05 [insert-virtual-dataset]: 1.24e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 1.02998e-06 [pipeline_split]: 1.22999e-06 [optimize]: 0.00480634, [53] [py_interpret_to_execute]: 4.04997e-06 [rewriter_before_opt_a]: 0.00017696 [opt_a]: 0.00257466, [2] [Cycle 1]: 0.00198658, [45] [expand_dump_flag]: 1.76e-06 [switch_simplify]: 5.464e-05 [loop_unroll]: 2.941e-05 [a_1]: 0.00050581 [with_stream_mark]: 1.055e-05 [recompute_prepare]: 8.2e-06 [updatestate_depend_eliminate]: 8.13001e-06 [updatestate_assign_eliminate]: 6.83e-06 [updatestate_loads_eliminate]: 2.29999e-06 [parameter_eliminate]: 1.12e-06 [a_2]: 6.979e-05 [accelerated_algorithm]: 6.45002e-06 [shard]: 1.02e-06 [meta_shard_fg_expand]: 1.37999e-06 [shard_inline]: 5.45001e-06 [merge_send_recv]: 2.651e-05 [auto_parallel]: 5.95002e-06 [parallel]: 5.645e-05 [flash_sp]: 2.136e-05 [merge_comm]: 3.43e-06 [allreduce_fusion]: 7.98001e-06 [matmul_add_comm_reduction]: 1e-05 [allreduce_slice_to_reducescatter]: 6.24001e-06 [virtual_shard_identity]: 7.77e-06 [virtual_dataset]: 5.82999e-06 [get_grad_eliminate_]: 5.40999e-06 [virtual_output]: 5.80002e-06 [merge_forward]: 6.83e-06 [cell_reuse_recompute_pass]: 1.16002e-06 [offload_activation]: 2.28e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.423e-05 [merge_recompute_call_nodes]: 4.79e-06 [before_grad]: 9.55001e-06 [set_forward_comm_id_for_comm_node_pass]: 6.76e-06 [meta_fg_expand]: 2.32999e-06 [flash_sp_send_recv_attached]: 1.35001e-06 [receive_attached]: 7.01001e-06 [after_resolve]: 8.05999e-06 [a_after_grad]: 8.60001e-06 [renormalize]: 0.00066749 [add_forward_monad_depend]: 4.34002e-06 [auto_monad_grad]: 1.09e-06 [auto_monad_eliminator]: 2.128e-05 [cse]: 5.57e-05 [a_3]: 3.976e-05 [Cycle 2]: 0.00057892, [45] [expand_dump_flag]: 1.07998e-06 [switch_simplify]: 3.644e-05 [loop_unroll]: 5.61e-06 [a_1]: 9.548e-05 [with_stream_mark]: 9.78002e-06 [recompute_prepare]: 5.27999e-06 [updatestate_depend_eliminate]: 2.71e-06 [updatestate_assign_eliminate]: 2.27001e-06 [updatestate_loads_eliminate]: 2.61999e-06 [parameter_eliminate]: 9.5999e-07 [a_2]: 6.099e-05 [accelerated_algorithm]: 5.34e-06 [shard]: 9.09989e-07 [meta_shard_fg_expand]: 1.03001e-06 [shard_inline]: 5.00999e-06 [merge_send_recv]: 3.91001e-06 [auto_parallel]: 4.90999e-06 [parallel]: 3.55e-06 [flash_sp]: 1.97001e-06 [merge_comm]: 2.69999e-06 [allreduce_fusion]: 2.52001e-06 [matmul_add_comm_reduction]: 5.49e-06 [allreduce_slice_to_reducescatter]: 4.50003e-07 [virtual_shard_identity]: 5.66e-06 [virtual_dataset]: 4.93001e-06 [get_grad_eliminate_]: 4.88001e-06 [virtual_output]: 5.10001e-06 [merge_forward]: 2.56998e-06 [cell_reuse_recompute_pass]: 1.50001e-06 [offload_activation]: 6.56999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.157e-05 [merge_recompute_call_nodes]: 9.29984e-07 [before_grad]: 7.6e-06 [set_forward_comm_id_for_comm_node_pass]: 2.76e-06 [meta_fg_expand]: 1.81e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 1.12e-06 [after_resolve]: 8.05e-06 [a_after_grad]: 7.8e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.17999e-06 [auto_monad_grad]: 8.2e-07 [auto_monad_eliminator]: 6.61999e-06 [cse]: 1.242e-05 [a_3]: 3.002e-05 [py_interpret_to_execute_after_opt_a]: 4.89e-06 [slice_cell_reuse_recomputed_activation]: 1.84e-06 [rewriter_after_opt_a]: 2.828e-05 [convert_after_rewriter]: 1.14e-06 [order_py_execute_after_rewriter]: 1.17e-06 [mutable_eliminate]: 0.00054386 [opt_b]: 0.00017656, [1] [Cycle 1]: 0.00016995, [7] [b_1]: 0.00010297 [b_2]: 6.68e-06 [updatestate_depend_eliminate]: 4.68001e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.29999e-06 [renormalize]: 4.30009e-07 [cse]: 1.91e-05 [optimize_parallel_all_gather_comm]: 2.825e-05 [overlap_param_gather]: 1.393e-05 [cconv]: 2.499e-05 [loop_unroll]: 0.00041422 [opt_after_cconv]: 8.943e-05, [1] [Cycle 1]: 8.401e-05, [7] [c_1]: 2.4e-05 [parameter_eliminate]: 2.64001e-06 [updatestate_depend_eliminate]: 4.76002e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.26e-06 [cse]: 1.644e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 1.111e-05 [tuple_transform]: 6.364e-05, [1] [Cycle 1]: 5.944e-05, [4] [d_1]: 3.443e-05 [none_parameter_eliminate]: 1.38002e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 6.09999e-06 [partial_unused_args_eliminate]: 1.72999e-06 [add_recomputation]: 6.032e-05 [cse_after_recomputation]: 1.954e-05, [1] [Cycle 1]: 1.529e-05, [1] [cse]: 1.013e-05 [environ_conv]: 1.537e-05 [swap_dp_allreduce_reducescatter]: 2.81e-05 [bias_add_comm_swap]: 1.847e-05 [label_micro_interleaved_index]: 1.213e-05 [label_fine_grained_interleaved_index]: 1.20999e-06 [merge_cast_opt]: 8.70001e-07 [slice_recompute_activation]: 1.97001e-06 [micro_interleaved_order_control]: 2.17001e-06 [assign_add_opt]: 1.24e-06 [ForceFp32Comm]: 7.30011e-07 [remove_cast_before_assign_add]: 1.156e-05 [full_micro_interleaved_order_control]: 1.189e-05 [reorder_send_recv_between_fp_bp]: 2.27001e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 9.99979e-07 [interleave_parallel_branches]: 1.281e-05 [overlap_opt_shard_in_pipeline]: 2.363e-05 [overlap_opt_shard_grad_in_pipeline]: 6.90023e-07 [control_data_broadcast_order]: 1.232e-05 [grouped_pairwise_exchange_alltoall]: 6.64001e-06 [offloading_packed_experts]: 2.72001e-06 [overlap_recompute_and_grad_model_parallel]: 1.457e-05 [overlap_grad_matmul_and_grad_allreduce]: 7.7e-07 [overlap_recompute_allgather_and_fa_grad]: 7.59988e-07 [overlap_recompute_comm]: 1.02e-06 [overlap_grad_ring_attention]: 2.064e-05 [overlap_grad_flash_sp]: 5.084e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 1.664e-05 [split_layernorm_comm]: 4.13001e-06 [handle_group_info]: 4.49998e-06 [symbol_engine_optimizer]: 6.472e-05, [1] [Cycle 1]: 6.086e-05, [6] [build]: 1.48002e-06 [elim_shapecalc]: 8.43001e-06 [elim_not_effective]: 9.96e-06 [opt_reshape]: 5.96e-06 [fold_const_symbol]: 8.37e-06 [renormalize]: 2.19996e-07 [detach_backward]: 9.89996e-07 [pipeline_parallel_scheduler]: 1.22e-06 [auto_monad_reorder]: 2.328e-05 [get_jit_bprop_graph]: 8.70001e-07 [rewriter_after_jit_bprop_graph]: 2.39001e-06 [opt_after_jit_grad]: 0.00044617 [validate]: 4.874e-05 [backend_pass]: 9.29984e-07 [task_emit]: 3.19094 [execute]: 1.123e-05 Sums bootstrap : 0.000770s : 0.02% type_inference : 0.044807s : 1.38% event_method : 0.000017s : 0.00% auto_monad : 0.000086s : 0.00% graph_reusing : 0.000004s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.00% parallel-infer-symbol : 0.000002s : 0.00% pre_auto_parallel : 0.000037s : 0.00% insert-virtual-dataset : 0.000001s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000001s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000177s : 0.01% optimize.opt_a.expand_dump_flag : 0.000003s : 0.00% optimize.opt_a.switch_simplify : 0.000091s : 0.00% optimize.opt_a.loop_unroll : 0.000035s : 0.00% optimize.opt_a.a_1 : 0.000601s : 0.02% optimize.opt_a.with_stream_mark : 0.000020s : 0.00% optimize.opt_a.recompute_prepare : 0.000013s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000131s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.00% optimize.opt_a.shard : 0.000002s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000002s : 0.00% optimize.opt_a.shard_inline : 0.000010s : 0.00% optimize.opt_a.merge_send_recv : 0.000030s : 0.00% optimize.opt_a.auto_parallel : 0.000011s : 0.00% optimize.opt_a.parallel : 0.000060s : 0.00% optimize.opt_a.flash_sp : 0.000023s : 0.00% optimize.opt_a.merge_comm : 0.000006s : 0.00% optimize.opt_a.allreduce_fusion : 0.000011s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000007s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.00% optimize.opt_a.virtual_dataset : 0.000011s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000010s : 0.00% optimize.opt_a.virtual_output : 0.000011s : 0.00% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000029s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000006s : 0.00% optimize.opt_a.before_grad : 0.000017s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.00% optimize.opt_a.meta_fg_expand : 0.000004s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000002s : 0.00% optimize.opt_a.receive_attached : 0.000008s : 0.00% optimize.opt_a.after_resolve : 0.000016s : 0.00% optimize.opt_a.a_after_grad : 0.000016s : 0.00% optimize.opt_a.renormalize : 0.000668s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.00% optimize.opt_a.cse : 0.000068s : 0.00% optimize.opt_a.a_3 : 0.000070s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000544s : 0.02% optimize.opt_b.b_1 : 0.000103s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.00% optimize.overlap_param_gather : 0.000014s : 0.00% optimize.cconv : 0.000025s : 0.00% optimize.loop_unroll : 0.000414s : 0.01% optimize.opt_after_cconv.c_1 : 0.000024s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000016s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000011s : 0.00% optimize.tuple_transform.d_1 : 0.000034s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000060s : 0.00% optimize.cse_after_recomputation.cse : 0.000010s : 0.00% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000028s : 0.00% optimize.bias_add_comm_swap : 0.000018s : 0.00% optimize.label_micro_interleaved_index : 0.000012s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000001s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000012s : 0.00% optimize.full_micro_interleaved_order_control : 0.000012s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000013s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000024s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000001s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000007s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000015s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000001s : 0.00% optimize.overlap_grad_ring_attention : 0.000021s : 0.00% optimize.overlap_grad_flash_sp : 0.000051s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000017s : 0.00% optimize.split_layernorm_comm : 0.000004s : 0.00% optimize.handle_group_info : 0.000004s : 0.00% optimize.symbol_engine_optimizer.build : 0.000001s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000008s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000010s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000008s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000023s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000002s : 0.00% opt_after_jit_grad : 0.000446s : 0.01% validate : 0.000049s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 3.190935s : 98.45% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000147 24 0.79% : 0.000001s : 2: substitution.elim_not_effective 0.73% : 0.000001s : 2: substitution.fold_const_symbol 3.44% : 0.000005s : 3: substitution.graph_param_transform 78.13% : 0.000115s : 5: substitution.inline 1.83% : 0.000003s : 4: substitution.j_node_and_user_rematch 5.08% : 0.000007s : 4: substitution.remove_not_recompute_node 1.67% : 0.000002s : 2: substitution.replace_old_param 8.33% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.044744 2 97.63% : 0.043682s : 1: type_inference.infer 2.37% : 0.001062s : 1: type_inference.specialize ------[replace.] 0.000054 7 72.90% : 0.000039s : 5: replace.inline 27.10% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000124 7 90.94% : 0.000112s : 5: match.inline 9.06% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000164 1031 0.89% : 0.000001s : 11: predicate.accumulaten_eliminater 0.95% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 6: predicate.addn_check_dump 0.94% : 0.000002s : 11: predicate.addn_zero_filter 0.90% : 0.000001s : 11: predicate.adjust_all_reduce_mul_add 2.23% : 0.000004s : 17: predicate.arithmetic_simplify 0.89% : 0.000001s : 11: predicate.cast_eliminate 0.58% : 0.000001s : 6: predicate.check_bprop_eliminate 0.53% : 0.000001s : 6: predicate.compare_switch_simplify 0.18% : 0.000000s : 3: predicate.const_output_eliminate 0.55% : 0.000001s : 6: predicate.depend_value_elim 0.96% : 0.000002s : 11: predicate.dict_get_item_const_eliminator 1.08% : 0.000002s : 11: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 11: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 3: predicate.elim_not_effective 0.43% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.33% : 0.000002s : 14: predicate.environ_get_depend_swap 1.67% : 0.000003s : 20: predicate.environ_get_eliminate 1.12% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.49% : 0.000002s : 18: predicate.exchange_switch_depend_value 2.19% : 0.000004s : 18: predicate.float_depend_g_call 0.46% : 0.000001s : 6: predicate.float_environ_get_switch 0.68% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 3: predicate.fold_const_symbol 0.64% : 0.000001s : 6: predicate.get_grad_eliminate 0.17% : 0.000000s : 3: predicate.graph_param_transform 0.54% : 0.000001s : 6: predicate.incorporate_call 0.47% : 0.000001s : 6: predicate.incorporate_call_switch 5.80% : 0.000009s : 47: predicate.inline 0.81% : 0.000001s : 6: predicate.inline_without_move 0.32% : 0.000001s : 6: predicate.j_node_and_user_rematch 1.09% : 0.000002s : 6: predicate.less_batch_normalization 1.72% : 0.000003s : 19: predicate.list_to_tuple_eliminator_ 2.41% : 0.000004s : 30: predicate.load_eliminater 1.16% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.85% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.75% : 0.000003s : 17: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 6: predicate.merge_addn 0.55% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.89% : 0.000001s : 11: predicate.minmaximum_grad 1.28% : 0.000002s : 3: predicate.mutable_eliminate 0.38% : 0.000001s : 3: predicate.opt_reshape 0.40% : 0.000001s : 3: predicate.parallel_virtual_node 1.90% : 0.000003s : 18: predicate.partial_defer_inline 1.46% : 0.000002s : 16: predicate.partial_eliminate 0.96% : 0.000002s : 11: predicate.print_const_string_wrapper 0.60% : 0.000001s : 6: predicate.reduce_all_const_elim 1.22% : 0.000002s : 11: predicate.reduce_eliminate 2.53% : 0.000004s : 30: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 6: predicate.remove_not_recompute_node 1.36% : 0.000002s : 19: predicate.replace_applicator 0.35% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000001s : 3: predicate.reset_defer_inline 0.98% : 0.000002s : 11: predicate.reshape_eliminate 0.56% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 3: predicate.row_tensor_eliminate 0.81% : 0.000001s : 6: predicate.same_eliminate 0.42% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.88% : 0.000001s : 6: predicate.shard_identity_eliminate 0.61% : 0.000001s : 6: predicate.special_op_eliminate 0.76% : 0.000001s : 6: predicate.specialize_transform 0.76% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.30% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.57% : 0.000003s : 18: predicate.switch_defer_inline 2.12% : 0.000003s : 24: predicate.switch_layer_defer_inline 5.50% : 0.000009s : 61: predicate.switch_simplify 0.98% : 0.000002s : 11: predicate.tile_eliminate 0.92% : 0.000002s : 11: predicate.transpose_eliminate 1.61% : 0.000003s : 17: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000002s : 17: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000002s : 17: predicate.tuple_list_get_item_depend_reorder 3.55% : 0.000006s : 25: predicate.tuple_list_get_item_eliminator 1.43% : 0.000002s : 17: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000004s : 23: predicate.tuple_list_set_item_eliminator 1.68% : 0.000003s : 19: predicate.tuple_to_list_eliminator_ 2.27% : 0.000004s : 30: predicate.updatestate_pure_node_eliminater 3.08% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 3: predicate.value_based_eliminate 0.69% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.65% : 0.000001s : 6: predicate.virtual_output_eliminate 0.29% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000610 13 52.96% : 0.000323s : 6: func_graph_cloner_run.FuncGraphClonerGraph 47.04% : 0.000287s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 3.262635 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.21% : 0.006880s : 1: add_attr 0.21% : 0.006869s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000064s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000092s : 1: auto_monad 0.00% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000021s : 1: bias_add_comm_swap 0.02% : 0.000811s : 1: bootstrap 0.00% : 0.000029s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000022s : 1: cse_after_recomputation 0.00% : 0.000004s : 1: dataset_repeat_opt 0.00% : 0.000004s : 1: detach_backward 0.00% : 0.000019s : 1: environ_conv 0.00% : 0.000022s : 1: event_method 0.00% : 0.000025s : 1: execute 0.00% : 0.000015s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000007s : 1: graph_reusing 0.00% : 0.000009s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000007s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000016s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000004s : 1: label_fine_grained_interleaved_index 0.00% : 0.000015s : 1: label_micro_interleaved_index 0.01% : 0.000423s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000553s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.03% : 0.001005s : 78: opt.transform.opt_a 0.00% : 0.000023s : 1: opt.transform.opt_after_cconv 0.00% : 0.000021s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000083s : 28: opt.transform.opt_b 0.00% : 0.000038s : 2: opt.transform.opt_trans_graph 0.00% : 0.000029s : 4: opt.transform.symbol_engine_opt 0.08% : 0.002578s : 1: opt_a 0.00% : 0.000093s : 1: opt_after_cconv 0.01% : 0.000456s : 1: opt_after_jit_grad 0.01% : 0.000180s : 1: opt_b 0.15% : 0.004810s : 1: optimize 0.00% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000054s : 1: overlap_grad_flash_sp 0.00% : 0.000003s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000024s : 1: overlap_grad_ring_attention 0.00% : 0.000003s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000027s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000017s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000004s : 1: overlap_recompute_comm 0.00% : 0.000005s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000042s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000014s : 1: remove_cast_before_assign_add 0.00% : 0.000014s : 1: remove_dup_value 0.01% : 0.000356s : 1: renormalize.infer 0.01% : 0.000306s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000005s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000032s : 1: rewriter_after_opt_a 0.01% : 0.000182s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000007s : 1: split_layernorm_comm 0.00% : 0.000019s : 1: split_matmul_comm_elemetwise 0.00% : 0.000031s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000067s : 1: symbol_engine_optimizer 97.80% : 3.190995s : 1: task_emit 0.00% : 0.000067s : 1: tuple_transform 1.37% : 0.044822s : 1: type_inference 0.00% : 0.000077s : 1: validate TotalTime = 0.106547, [24] [bootstrap]: 0.00060333 [type_inference]: 0.0663893 [event_method]: 0.00026317 [auto_monad]: 0.00017878 [graph_reusing]: 9.40001e-06 [inline]: 3.02002e-06 [add_attr]: 0.00431547, [1] [add_attr_with_inline]: 0.00429954, [1] [Cycle 1]: 0.00011243, [2] [tag_attr]: 5.349e-05 [meta_addattr_fg_expand]: 9.34998e-06 [parallel-infer-symbol]: 3.42997e-06 [pre_auto_parallel]: 7.087e-05 [insert-virtual-dataset]: 2.71999e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.0238571, [53] [py_interpret_to_execute]: 8.12e-06 [rewriter_before_opt_a]: 0.00036808 [opt_a]: 0.0210959, [3] [Cycle 1]: 0.017149, [45] [expand_dump_flag]: 5.09e-06 [switch_simplify]: 0.00017767 [loop_unroll]: 6.281e-05 [a_1]: 0.00152348 [with_stream_mark]: 3.593e-05 [recompute_prepare]: 2.542e-05 [updatestate_depend_eliminate]: 8.94998e-06 [updatestate_assign_eliminate]: 7.29001e-06 [updatestate_loads_eliminate]: 6.53e-06 [parameter_eliminate]: 3.69002e-06 [a_2]: 0.00020573 [accelerated_algorithm]: 1.569e-05 [shard]: 1.97001e-06 [meta_shard_fg_expand]: 3.75e-06 [shard_inline]: 1.308e-05 [merge_send_recv]: 1.805e-05 [auto_parallel]: 1.285e-05 [parallel]: 3.563e-05 [flash_sp]: 1.304e-05 [merge_comm]: 9.50001e-06 [allreduce_fusion]: 8.37e-06 [matmul_add_comm_reduction]: 3.371e-05 [allreduce_slice_to_reducescatter]: 8.99978e-07 [virtual_shard_identity]: 1.88e-05 [virtual_dataset]: 1.361e-05 [get_grad_eliminate_]: 1.278e-05 [virtual_output]: 1.326e-05 [merge_forward]: 2.03e-05 [cell_reuse_recompute_pass]: 1.46998e-06 [offload_activation]: 1.767e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.833e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 2.488e-05 [set_forward_comm_id_for_comm_node_pass]: 1.041e-05 [meta_fg_expand]: 0.00224014 [flash_sp_send_recv_attached]: 4.89e-06 [receive_attached]: 2.34999e-06 [after_resolve]: 7.054e-05 [a_after_grad]: 8.637e-05 [renormalize]: 0.0111863 [add_forward_monad_depend]: 1.505e-05 [auto_monad_grad]: 6.94001e-06 [auto_monad_eliminator]: 6.022e-05 [cse]: 0.00033409 [a_3]: 0.00033161 [Cycle 2]: 0.00325073, [45] [expand_dump_flag]: 2.95002e-06 [switch_simplify]: 4.297e-05 [loop_unroll]: 3.759e-05 [a_1]: 0.0012914 [with_stream_mark]: 3.019e-05 [recompute_prepare]: 1.364e-05 [updatestate_depend_eliminate]: 5.10001e-06 [updatestate_assign_eliminate]: 3.9e-06 [updatestate_loads_eliminate]: 3.39001e-06 [parameter_eliminate]: 2.27999e-06 [a_2]: 7.155e-05 [accelerated_algorithm]: 6.78e-06 [shard]: 2.54001e-06 [meta_shard_fg_expand]: 2.36e-06 [shard_inline]: 6.33998e-06 [merge_send_recv]: 9.44e-06 [auto_parallel]: 9.94001e-06 [parallel]: 8.99e-06 [flash_sp]: 4.42e-06 [merge_comm]: 3.66001e-06 [allreduce_fusion]: 3.38e-06 [matmul_add_comm_reduction]: 1.02e-05 [allreduce_slice_to_reducescatter]: 1.00001e-06 [virtual_shard_identity]: 8.61002e-06 [virtual_dataset]: 5.79999e-06 [get_grad_eliminate_]: 6.48e-06 [virtual_output]: 5.59e-06 [merge_forward]: 4.50001e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 1.053e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.558e-05 [merge_recompute_call_nodes]: 1.41998e-06 [before_grad]: 1.063e-05 [set_forward_comm_id_for_comm_node_pass]: 4.48999e-06 [meta_fg_expand]: 9.678e-05 [flash_sp_send_recv_attached]: 1.87999e-06 [receive_attached]: 2.27001e-06 [after_resolve]: 1.764e-05 [a_after_grad]: 9.57001e-06 [renormalize]: 0.0010593 [add_forward_monad_depend]: 8.2e-06 [auto_monad_grad]: 3.65998e-06 [auto_monad_eliminator]: 1.996e-05 [cse]: 4.35e-05 [a_3]: 4.988e-05 [Cycle 3]: 0.00067527, [45] [expand_dump_flag]: 2.71999e-06 [switch_simplify]: 7.96001e-06 [loop_unroll]: 6.07001e-06 [a_1]: 0.00010032 [with_stream_mark]: 1.53e-05 [recompute_prepare]: 6.07001e-06 [updatestate_depend_eliminate]: 4.14997e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 2.47001e-06 [parameter_eliminate]: 2.81e-06 [a_2]: 6.654e-05 [accelerated_algorithm]: 7.25e-06 [shard]: 2.20002e-06 [meta_shard_fg_expand]: 2.01e-06 [shard_inline]: 5.67999e-06 [merge_send_recv]: 7.28e-06 [auto_parallel]: 8e-06 [parallel]: 8.10999e-06 [flash_sp]: 1.55001e-06 [merge_comm]: 3.61001e-06 [allreduce_fusion]: 3.07002e-06 [matmul_add_comm_reduction]: 8.43999e-06 [allreduce_slice_to_reducescatter]: 5.99975e-07 [virtual_shard_identity]: 7.97e-06 [virtual_dataset]: 6.01998e-06 [get_grad_eliminate_]: 6.16998e-06 [virtual_output]: 6.45997e-06 [merge_forward]: 4.19997e-06 [cell_reuse_recompute_pass]: 2.43e-06 [offload_activation]: 9.74e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.249e-05 [merge_recompute_call_nodes]: 1.72001e-06 [before_grad]: 1.162e-05 [set_forward_comm_id_for_comm_node_pass]: 4.97e-06 [meta_fg_expand]: 2.06003e-06 [flash_sp_send_recv_attached]: 1.64e-06 [receive_attached]: 1.71e-06 [after_resolve]: 1.018e-05 [a_after_grad]: 7.58001e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.62001e-06 [auto_monad_grad]: 1.79e-06 [auto_monad_eliminator]: 8.92e-06 [cse]: 2.428e-05 [a_3]: 3.492e-05 [py_interpret_to_execute_after_opt_a]: 8.33999e-06 [slice_cell_reuse_recomputed_activation]: 2.18002e-06 [rewriter_after_opt_a]: 2.299e-05 [convert_after_rewriter]: 1.19e-06 [order_py_execute_after_rewriter]: 1.10001e-06 [mutable_eliminate]: 0.00077817 [opt_b]: 0.0002386, [1] [Cycle 1]: 0.0002303, [7] [b_1]: 0.00011649 [b_2]: 2.21e-05 [updatestate_depend_eliminate]: 9.00999e-06 [updatestate_assign_eliminate]: 2.37999e-06 [updatestate_loads_eliminate]: 3.18998e-06 [renormalize]: 8.2e-07 [cse]: 3.548e-05 [optimize_parallel_all_gather_comm]: 2.17e-05 [overlap_param_gather]: 2.11e-06 [cconv]: 3.384e-05 [loop_unroll]: 0.00051192 [opt_after_cconv]: 0.00012051, [1] [Cycle 1]: 0.00011347, [7] [c_1]: 2.691e-05 [parameter_eliminate]: 5.46998e-06 [updatestate_depend_eliminate]: 7.86001e-06 [updatestate_assign_eliminate]: 2.87002e-06 [updatestate_loads_eliminate]: 2.54001e-06 [cse]: 3.039e-05 [renormalize]: 9.49978e-07 [remove_dup_value]: 1.753e-05 [tuple_transform]: 7.449e-05, [1] [Cycle 1]: 6.838e-05, [4] [d_1]: 4.086e-05 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 1.00001e-07 [switch_simplify]: 6.77002e-06 [partial_unused_args_eliminate]: 2.05002e-06 [add_recomputation]: 5.59e-05 [cse_after_recomputation]: 2.918e-05, [1] [Cycle 1]: 2.336e-05, [1] [cse]: 1.693e-05 [environ_conv]: 6.90998e-06 [swap_dp_allreduce_reducescatter]: 6.26e-06 [bias_add_comm_swap]: 2.94999e-06 [label_micro_interleaved_index]: 6.11e-06 [label_fine_grained_interleaved_index]: 2.51e-06 [merge_cast_opt]: 1.25999e-06 [slice_recompute_activation]: 2.61e-06 [micro_interleaved_order_control]: 2.27999e-06 [assign_add_opt]: 1.40999e-06 [ForceFp32Comm]: 1.01002e-06 [remove_cast_before_assign_add]: 1.08001e-06 [full_micro_interleaved_order_control]: 2.54999e-06 [reorder_send_recv_between_fp_bp]: 2.81999e-06 [comm_op_add_attrs]: 1.35999e-06 [add_comm_op_reuse_tag]: 1.04003e-06 [interleave_split_concat_branches]: 1.04003e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 8.60999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.10002e-06 [control_data_broadcast_order]: 1.581e-05 [grouped_pairwise_exchange_alltoall]: 1.79e-06 [offloading_packed_experts]: 4.41002e-06 [overlap_recompute_and_grad_model_parallel]: 4.65001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 4.22e-06 [overlap_grad_flash_sp]: 2.483e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 2.75997e-06 [split_layernorm_comm]: 1.89999e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 8.528e-05, [1] [Cycle 1]: 7.979e-05, [6] [build]: 4.25e-06 [elim_shapecalc]: 1.348e-05 [elim_not_effective]: 1.371e-05 [opt_reshape]: 6.51e-06 [fold_const_symbol]: 1.078e-05 [renormalize]: 5.00004e-07 [detach_backward]: 2.57001e-06 [pipeline_parallel_scheduler]: 1.77001e-06 [auto_monad_reorder]: 2.21e-05 [get_jit_bprop_graph]: 2.14999e-06 [rewriter_after_jit_bprop_graph]: 5.69e-06 [opt_after_jit_grad]: 0.00057309 [validate]: 5.333e-05 [backend_pass]: 1.23002e-06 [task_emit]: 0.00987222 [execute]: 9.22999e-06 Sums bootstrap : 0.000603s : 0.60% type_inference : 0.066389s : 65.97% event_method : 0.000263s : 0.26% auto_monad : 0.000179s : 0.18% graph_reusing : 0.000009s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000053s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000071s : 0.07% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.01% optimize.rewriter_before_opt_a : 0.000368s : 0.37% optimize.opt_a.expand_dump_flag : 0.000011s : 0.01% optimize.opt_a.switch_simplify : 0.000229s : 0.23% optimize.opt_a.loop_unroll : 0.000106s : 0.11% optimize.opt_a.a_1 : 0.002915s : 2.90% optimize.opt_a.with_stream_mark : 0.000081s : 0.08% optimize.opt_a.recompute_prepare : 0.000045s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000018s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000012s : 0.01% optimize.opt_a.parameter_eliminate : 0.000009s : 0.01% optimize.opt_a.a_2 : 0.000344s : 0.34% optimize.opt_a.accelerated_algorithm : 0.000030s : 0.03% optimize.opt_a.shard : 0.000007s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000008s : 0.01% optimize.opt_a.shard_inline : 0.000025s : 0.02% optimize.opt_a.merge_send_recv : 0.000035s : 0.03% optimize.opt_a.auto_parallel : 0.000031s : 0.03% optimize.opt_a.parallel : 0.000053s : 0.05% optimize.opt_a.flash_sp : 0.000019s : 0.02% optimize.opt_a.merge_comm : 0.000017s : 0.02% optimize.opt_a.allreduce_fusion : 0.000015s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000052s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000035s : 0.04% optimize.opt_a.virtual_dataset : 0.000025s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000025s : 0.03% optimize.opt_a.virtual_output : 0.000025s : 0.03% optimize.opt_a.merge_forward : 0.000029s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000038s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000076s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000047s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000020s : 0.02% optimize.opt_a.meta_fg_expand : 0.002339s : 2.32% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.01% optimize.opt_a.receive_attached : 0.000006s : 0.01% optimize.opt_a.after_resolve : 0.000098s : 0.10% optimize.opt_a.a_after_grad : 0.000104s : 0.10% optimize.opt_a.renormalize : 0.012246s : 12.17% optimize.opt_a.add_forward_monad_depend : 0.000026s : 0.03% optimize.opt_a.auto_monad_grad : 0.000012s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000089s : 0.09% optimize.opt_a.cse : 0.000402s : 0.40% optimize.opt_a.a_3 : 0.000416s : 0.41% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000023s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000778s : 0.77% optimize.opt_b.b_1 : 0.000116s : 0.12% optimize.opt_b.b_2 : 0.000022s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000035s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.03% optimize.loop_unroll : 0.000512s : 0.51% optimize.opt_after_cconv.c_1 : 0.000027s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000030s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.02% optimize.tuple_transform.d_1 : 0.000041s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000056s : 0.06% optimize.cse_after_recomputation.cse : 0.000017s : 0.02% optimize.environ_conv : 0.000007s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000009s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000573s : 0.57% validate : 0.000053s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.009872s : 9.81% execute : 0.000009s : 0.01% Time group info: ------[substitution.] 0.000924 154 0.23% : 0.000002s : 2: substitution.elim_not_effective 0.94% : 0.000009s : 11: substitution.float_depend_g_call 0.39% : 0.000004s : 2: substitution.float_tuple_getitem_switch 0.20% : 0.000002s : 2: substitution.fold_const_symbol 0.64% : 0.000006s : 3: substitution.graph_param_transform 0.35% : 0.000003s : 2: substitution.incorporate_call 0.26% : 0.000002s : 2: substitution.incorporate_call_switch 65.55% : 0.000605s : 20: substitution.inline 2.41% : 0.000022s : 2: substitution.inline_without_move 1.33% : 0.000012s : 12: substitution.j_node_and_user_rematch 1.25% : 0.000012s : 7: substitution.minmaximum_grad 2.36% : 0.000022s : 11: substitution.partial_eliminate 2.83% : 0.000026s : 12: substitution.remove_not_recompute_node 3.09% : 0.000029s : 9: substitution.replace_applicator 1.39% : 0.000013s : 9: substitution.replace_old_param 0.38% : 0.000003s : 1: substitution.set_cell_output_no_recompute 2.93% : 0.000027s : 3: substitution.switch_simplify 2.55% : 0.000024s : 7: substitution.tuple_list_convert_item_index_to_positive 1.07% : 0.000010s : 7: substitution.tuple_list_get_item_const_eliminator 1.69% : 0.000016s : 7: substitution.tuple_list_get_item_depend_reorder 6.52% : 0.000060s : 16: substitution.tuple_list_get_item_eliminator 1.64% : 0.000015s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.066264 2 95.29% : 0.063142s : 1: type_inference.infer 4.71% : 0.003122s : 1: type_inference.specialize ------[replace.] 0.000297 30 62.08% : 0.000184s : 20: replace.inline 16.18% : 0.000048s : 3: replace.switch_simplify 21.74% : 0.000064s : 7: replace.tuple_list_get_item_eliminator ------[match.] 0.000648 30 91.72% : 0.000594s : 20: match.inline 3.88% : 0.000025s : 3: match.switch_simplify 4.41% : 0.000029s : 7: match.tuple_list_get_item_eliminator ------[predicate.] 0.000617 3823 1.07% : 0.000007s : 49: predicate.accumulaten_eliminater 0.39% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.40% : 0.000002s : 17: predicate.addn_check_dump 1.08% : 0.000007s : 49: predicate.addn_zero_filter 0.98% : 0.000006s : 49: predicate.adjust_all_reduce_mul_add 1.97% : 0.000012s : 66: predicate.arithmetic_simplify 1.11% : 0.000007s : 49: predicate.cast_eliminate 1.00% : 0.000006s : 44: predicate.check_bprop_eliminate 0.43% : 0.000003s : 17: predicate.compare_switch_simplify 0.05% : 0.000000s : 3: predicate.const_output_eliminate 0.40% : 0.000002s : 17: predicate.depend_value_elim 1.15% : 0.000007s : 49: predicate.dict_get_item_const_eliminator 1.42% : 0.000009s : 49: predicate.dict_get_item_eliminator 1.03% : 0.000006s : 49: predicate.dict_set_item_eliminator 0.51% : 0.000003s : 6: predicate.dumpgradient_eliminate 0.06% : 0.000000s : 3: predicate.elim_not_effective 0.20% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000007s : 52: predicate.environ_add_const_eliminate 1.05% : 0.000006s : 52: predicate.environ_get_add_eliminate 1.14% : 0.000007s : 52: predicate.environ_get_depend_swap 1.49% : 0.000009s : 69: predicate.environ_get_eliminate 1.12% : 0.000007s : 52: predicate.environ_get_set_eliminate 1.69% : 0.000010s : 76: predicate.exchange_switch_depend_value 2.83% : 0.000017s : 76: predicate.float_depend_g_call 0.41% : 0.000003s : 17: predicate.float_environ_get_switch 0.47% : 0.000003s : 20: predicate.float_tuple_getitem_switch 0.05% : 0.000000s : 3: predicate.fold_const_symbol 0.50% : 0.000003s : 17: predicate.get_grad_eliminate 0.08% : 0.000001s : 3: predicate.graph_param_transform 0.38% : 0.000002s : 17: predicate.incorporate_call 0.35% : 0.000002s : 17: predicate.incorporate_call_switch 5.14% : 0.000032s : 165: predicate.inline 1.47% : 0.000009s : 41: predicate.inline_without_move 0.21% : 0.000001s : 17: predicate.j_node_and_user_rematch 0.62% : 0.000004s : 17: predicate.less_batch_normalization 1.43% : 0.000009s : 62: predicate.list_to_tuple_eliminator_ 2.30% : 0.000014s : 111: predicate.load_eliminater 0.51% : 0.000003s : 3: predicate.loop_unroll_after_grad 2.66% : 0.000016s : 113: predicate.loop_unroll_before_grad 1.22% : 0.000008s : 55: predicate.make_slice_get_slice_eliminator 0.42% : 0.000003s : 17: predicate.merge_addn 0.98% : 0.000006s : 44: predicate.micro_step_allgather_replace 0.97% : 0.000006s : 44: predicate.mini_step_allgather_replace 1.00% : 0.000006s : 49: predicate.minmaximum_grad 0.62% : 0.000004s : 3: predicate.mutable_eliminate 0.13% : 0.000001s : 3: predicate.opt_reshape 0.13% : 0.000001s : 3: predicate.parallel_virtual_node 2.70% : 0.000017s : 76: predicate.partial_defer_inline 1.42% : 0.000009s : 59: predicate.partial_eliminate 1.03% : 0.000006s : 49: predicate.print_const_string_wrapper 0.44% : 0.000003s : 17: predicate.reduce_all_const_elim 1.32% : 0.000008s : 49: predicate.reduce_eliminate 2.28% : 0.000014s : 111: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000002s : 17: predicate.remove_not_recompute_node 1.65% : 0.000010s : 100: predicate.replace_applicator 0.64% : 0.000004s : 41: predicate.replace_old_param 0.09% : 0.000001s : 3: predicate.reset_defer_inline 1.06% : 0.000007s : 49: predicate.reshape_eliminate 1.07% : 0.000007s : 44: predicate.row_tensor_add_zeros_like 0.11% : 0.000001s : 3: predicate.row_tensor_eliminate 1.26% : 0.000008s : 44: predicate.same_eliminate 0.30% : 0.000002s : 17: predicate.set_cell_output_no_recompute 0.72% : 0.000004s : 17: predicate.shard_identity_eliminate 0.20% : 0.000001s : 6: predicate.special_op_eliminate 0.43% : 0.000003s : 17: predicate.specialize_transform 1.32% : 0.000008s : 44: predicate.split_environ_get_set_with_tuple_value 1.30% : 0.000008s : 41: predicate.stack_unstack_eliminate 0.08% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.84% : 0.000011s : 76: predicate.switch_defer_inline 2.80% : 0.000017s : 120: predicate.switch_layer_defer_inline 5.55% : 0.000034s : 215: predicate.switch_simplify 1.05% : 0.000006s : 49: predicate.tile_eliminate 1.10% : 0.000007s : 49: predicate.transpose_eliminate 1.26% : 0.000008s : 55: predicate.tuple_list_convert_item_index_to_positive 1.33% : 0.000008s : 55: predicate.tuple_list_get_item_const_eliminator 1.34% : 0.000008s : 55: predicate.tuple_list_get_item_depend_reorder 2.73% : 0.000017s : 79: predicate.tuple_list_get_item_eliminator 1.37% : 0.000008s : 55: predicate.tuple_list_get_set_item_eliminator 6.50% : 0.000040s : 72: predicate.tuple_list_set_item_eliminator 1.45% : 0.000009s : 62: predicate.tuple_to_list_eliminator_ 2.25% : 0.000014s : 111: predicate.updatestate_pure_node_eliminater 2.67% : 0.000017s : 128: predicate.updatestate_useless_node_eliminater 0.10% : 0.000001s : 3: predicate.value_based_eliminate 0.50% : 0.000003s : 17: predicate.virtual_dataset_eliminate 0.51% : 0.000003s : 17: predicate.virtual_output_eliminate 0.06% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.11% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003164 41 63.34% : 0.002004s : 17: func_graph_cloner_run.FuncGraphClonerGraph 36.66% : 0.001160s : 24: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.151505 237 0.00% : 0.000004s : 1: ForceFp32Comm 2.85% : 0.004322s : 1: add_attr 2.84% : 0.004304s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000061s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.13% : 0.000192s : 1: auto_monad 0.02% : 0.000029s : 1: auto_monad_reorder 0.00% : 0.000008s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.42% : 0.000642s : 1: bootstrap 0.03% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.18% : 0.000278s : 1: event_method 0.01% : 0.000017s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.34% : 0.000522s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.52% : 0.000792s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000021s : 1: opt.transform.mutable_eliminate 2.89% : 0.004386s : 117: opt.transform.opt_a 0.02% : 0.000026s : 1: opt.transform.opt_after_cconv 0.02% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000109s : 28: opt.transform.opt_b 0.03% : 0.000045s : 2: opt.transform.opt_trans_graph 0.03% : 0.000040s : 4: opt.transform.symbol_engine_opt 13.93% : 0.021100s : 1: opt_a 0.08% : 0.000125s : 1: opt_after_cconv 0.39% : 0.000588s : 1: opt_after_jit_grad 0.16% : 0.000242s : 1: opt_b 15.75% : 0.023863s : 1: optimize 0.02% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000013s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.05% : 0.000077s : 1: pre_auto_parallel 0.01% : 0.000012s : 1: py_interpret_to_execute 0.01% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 6.53% : 0.009897s : 2: renormalize.infer 1.53% : 0.002323s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000027s : 1: rewriter_after_opt_a 0.25% : 0.000379s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000088s : 1: symbol_engine_optimizer 6.53% : 0.009895s : 1: task_emit 0.05% : 0.000077s : 1: tuple_transform 43.84% : 0.066420s : 1: type_inference 0.07% : 0.000101s : 1: validate TotalTime = 0.110887, [24] [bootstrap]: 0.00164016 [type_inference]: 0.0688865 [event_method]: 0.00027084 [auto_monad]: 0.00017883 [graph_reusing]: 1.005e-05 [inline]: 3.83001e-06 [add_attr]: 0.00580708, [1] [add_attr_with_inline]: 0.00576697, [1] [Cycle 1]: 0.00016098, [2] [tag_attr]: 8.05e-05 [meta_addattr_fg_expand]: 1.07e-05 [parallel-infer-symbol]: 4.12998e-06 [pre_auto_parallel]: 6.737e-05 [insert-virtual-dataset]: 2.66999e-06 [parallel-infer-symbol-second]: 9.30013e-07 [dataset_repeat_opt]: 2.17999e-06 [pipeline_split]: 1.63997e-06 [optimize]: 0.0231833, [53] [py_interpret_to_execute]: 7.25e-06 [rewriter_before_opt_a]: 0.00036466 [opt_a]: 0.020529, [3] [Cycle 1]: 0.0170589, [45] [expand_dump_flag]: 5.22e-06 [switch_simplify]: 0.00017358 [loop_unroll]: 6.197e-05 [a_1]: 0.00142162 [with_stream_mark]: 6.254e-05 [recompute_prepare]: 2.44e-05 [updatestate_depend_eliminate]: 1.011e-05 [updatestate_assign_eliminate]: 6.80998e-06 [updatestate_loads_eliminate]: 6.41998e-06 [parameter_eliminate]: 3.44001e-06 [a_2]: 0.00020054 [accelerated_algorithm]: 1.405e-05 [shard]: 2.16e-06 [meta_shard_fg_expand]: 4.01001e-06 [shard_inline]: 1.305e-05 [merge_send_recv]: 1.67e-05 [auto_parallel]: 1.205e-05 [parallel]: 0.00013517 [flash_sp]: 1.255e-05 [merge_comm]: 1.06e-05 [allreduce_fusion]: 8.03999e-06 [matmul_add_comm_reduction]: 3.368e-05 [allreduce_slice_to_reducescatter]: 9.90025e-07 [virtual_shard_identity]: 1.741e-05 [virtual_dataset]: 1.325e-05 [get_grad_eliminate_]: 1.26e-05 [virtual_output]: 1.265e-05 [merge_forward]: 9.09e-06 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 1.93e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.62e-05 [merge_recompute_call_nodes]: 1.60001e-06 [before_grad]: 2.407e-05 [set_forward_comm_id_for_comm_node_pass]: 8.61002e-06 [meta_fg_expand]: 0.00218428 [flash_sp_send_recv_attached]: 5.23002e-06 [receive_attached]: 2.88e-06 [after_resolve]: 6.938e-05 [a_after_grad]: 8.3e-05 [renormalize]: 0.0112981 [add_forward_monad_depend]: 1.393e-05 [auto_monad_grad]: 6.35002e-06 [auto_monad_eliminator]: 5.98e-05 [cse]: 0.00029474 [a_3]: 0.00030749 [Cycle 2]: 0.00287981, [45] [expand_dump_flag]: 3.48e-06 [switch_simplify]: 4.146e-05 [loop_unroll]: 3.66e-05 [a_1]: 0.00117126 [with_stream_mark]: 2.172e-05 [recompute_prepare]: 9.41e-06 [updatestate_depend_eliminate]: 4.3e-06 [updatestate_assign_eliminate]: 3.46999e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 2.45002e-06 [a_2]: 6.822e-05 [accelerated_algorithm]: 6.22001e-06 [shard]: 2.53e-06 [meta_shard_fg_expand]: 2.51e-06 [shard_inline]: 5.61998e-06 [merge_send_recv]: 9.85002e-06 [auto_parallel]: 9.76e-06 [parallel]: 9.81e-06 [flash_sp]: 4.2e-06 [merge_comm]: 3.19001e-06 [allreduce_fusion]: 3.30998e-06 [matmul_add_comm_reduction]: 1.07e-05 [allreduce_slice_to_reducescatter]: 8.80013e-07 [virtual_shard_identity]: 7.70998e-06 [virtual_dataset]: 5.81998e-06 [get_grad_eliminate_]: 6.17999e-06 [virtual_output]: 5.78002e-06 [merge_forward]: 4.01001e-06 [cell_reuse_recompute_pass]: 1.46998e-06 [offload_activation]: 1.077e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.261e-05 [merge_recompute_call_nodes]: 2.04e-06 [before_grad]: 9.89999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.68999e-06 [meta_fg_expand]: 8.011e-05 [flash_sp_send_recv_attached]: 2.29999e-06 [receive_attached]: 3.09999e-06 [after_resolve]: 1.255e-05 [a_after_grad]: 8.28999e-06 [renormalize]: 0.00093981 [add_forward_monad_depend]: 5.24e-06 [auto_monad_grad]: 2.22001e-06 [auto_monad_eliminator]: 1.41e-05 [cse]: 2.765e-05 [a_3]: 4.351e-05 [Cycle 3]: 0.0005705, [45] [expand_dump_flag]: 1.59998e-06 [switch_simplify]: 6.43e-06 [loop_unroll]: 6.17999e-06 [a_1]: 9.451e-05 [with_stream_mark]: 1.034e-05 [recompute_prepare]: 6.38e-06 [updatestate_depend_eliminate]: 2.98e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.30002e-06 [parameter_eliminate]: 1.45999e-06 [a_2]: 6.396e-05 [accelerated_algorithm]: 5.38002e-06 [shard]: 1.27e-06 [meta_shard_fg_expand]: 2.03002e-06 [shard_inline]: 5.47999e-06 [merge_send_recv]: 4.46002e-06 [auto_parallel]: 6.51999e-06 [parallel]: 5.79e-06 [flash_sp]: 8.19971e-07 [merge_comm]: 3.26001e-06 [allreduce_fusion]: 3.28e-06 [matmul_add_comm_reduction]: 6.16e-06 [allreduce_slice_to_reducescatter]: 8.60018e-07 [virtual_shard_identity]: 6.36e-06 [virtual_dataset]: 5.15999e-06 [get_grad_eliminate_]: 5.14e-06 [virtual_output]: 5.37001e-06 [merge_forward]: 3.42002e-06 [cell_reuse_recompute_pass]: 1.86e-06 [offload_activation]: 6.16998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.268e-05 [merge_recompute_call_nodes]: 1.00001e-06 [before_grad]: 8.37e-06 [set_forward_comm_id_for_comm_node_pass]: 3.21001e-06 [meta_fg_expand]: 2.37999e-06 [flash_sp_send_recv_attached]: 1.12999e-06 [receive_attached]: 1.19998e-06 [after_resolve]: 8.35001e-06 [a_after_grad]: 7.87e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.12e-06 [auto_monad_grad]: 1.45999e-06 [auto_monad_eliminator]: 6.74999e-06 [cse]: 1.615e-05 [a_3]: 3.174e-05 [py_interpret_to_execute_after_opt_a]: 5.57001e-06 [slice_cell_reuse_recomputed_activation]: 2.64999e-06 [rewriter_after_opt_a]: 2.336e-05 [convert_after_rewriter]: 1.67001e-06 [order_py_execute_after_rewriter]: 1.07998e-06 [mutable_eliminate]: 0.00075345 [opt_b]: 0.00020248, [1] [Cycle 1]: 0.00019461, [7] [b_1]: 0.00010884 [b_2]: 1.657e-05 [updatestate_depend_eliminate]: 5.91998e-06 [updatestate_assign_eliminate]: 2.81999e-06 [updatestate_loads_eliminate]: 2.37999e-06 [renormalize]: 5.19998e-07 [cse]: 2.288e-05 [optimize_parallel_all_gather_comm]: 6.223e-05 [overlap_param_gather]: 2.39001e-06 [cconv]: 2.795e-05 [loop_unroll]: 0.00046378 [opt_after_cconv]: 9.844e-05, [1] [Cycle 1]: 9.215e-05, [7] [c_1]: 2.594e-05 [parameter_eliminate]: 3.01999e-06 [updatestate_depend_eliminate]: 5.27001e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.43e-06 [cse]: 1.981e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 1.488e-05 [tuple_transform]: 6.754e-05, [1] [Cycle 1]: 6.195e-05, [4] [d_1]: 3.6e-05 [none_parameter_eliminate]: 1.64998e-06 [renormalize]: 1.40019e-07 [switch_simplify]: 6.03002e-06 [partial_unused_args_eliminate]: 1.66e-06 [add_recomputation]: 7.6e-05 [cse_after_recomputation]: 2.494e-05, [1] [Cycle 1]: 2e-05, [1] [cse]: 1.413e-05 [environ_conv]: 6.12999e-06 [swap_dp_allreduce_reducescatter]: 5.82999e-06 [bias_add_comm_swap]: 2.66999e-06 [label_micro_interleaved_index]: 4.95999e-06 [label_fine_grained_interleaved_index]: 2.78998e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 2.04e-06 [micro_interleaved_order_control]: 2.76e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 8.09989e-07 [remove_cast_before_assign_add]: 1.22e-06 [full_micro_interleaved_order_control]: 2.31e-06 [reorder_send_recv_between_fp_bp]: 2.86e-06 [comm_op_add_attrs]: 1.08001e-06 [add_comm_op_reuse_tag]: 1.09998e-06 [interleave_split_concat_branches]: 1.17e-06 [interleave_parallel_branches]: 1.35001e-06 [overlap_opt_shard_in_pipeline]: 2.684e-05 [overlap_opt_shard_grad_in_pipeline]: 1.81e-06 [control_data_broadcast_order]: 1.429e-05 [grouped_pairwise_exchange_alltoall]: 1.56002e-06 [offloading_packed_experts]: 4.21001e-06 [overlap_recompute_and_grad_model_parallel]: 5.14e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36002e-06 [overlap_recompute_comm]: 2.56e-06 [overlap_grad_ring_attention]: 4.1e-06 [overlap_grad_flash_sp]: 2.292e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.58e-06 [split_layernorm_comm]: 1.79998e-06 [handle_group_info]: 1.12e-06 [symbol_engine_optimizer]: 7.294e-05, [1] [Cycle 1]: 6.819e-05, [6] [build]: 2.94001e-06 [elim_shapecalc]: 9.52001e-06 [elim_not_effective]: 1.258e-05 [opt_reshape]: 6.58998e-06 [fold_const_symbol]: 9.27999e-06 [renormalize]: 1.69995e-07 [detach_backward]: 2.58e-06 [pipeline_parallel_scheduler]: 1.74e-06 [auto_monad_reorder]: 1.999e-05 [get_jit_bprop_graph]: 2.44999e-06 [rewriter_after_jit_bprop_graph]: 3.52002e-06 [opt_after_jit_grad]: 0.00049901 [validate]: 7.307e-05 [backend_pass]: 1.04e-06 [task_emit]: 0.00992325 [execute]: 9.91e-06 Sums bootstrap : 0.001640s : 1.58% type_inference : 0.068887s : 66.43% event_method : 0.000271s : 0.26% auto_monad : 0.000179s : 0.17% graph_reusing : 0.000010s : 0.01% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000081s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000011s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000067s : 0.06% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.01% optimize.rewriter_before_opt_a : 0.000365s : 0.35% optimize.opt_a.expand_dump_flag : 0.000010s : 0.01% optimize.opt_a.switch_simplify : 0.000221s : 0.21% optimize.opt_a.loop_unroll : 0.000105s : 0.10% optimize.opt_a.a_1 : 0.002687s : 2.59% optimize.opt_a.with_stream_mark : 0.000095s : 0.09% optimize.opt_a.recompute_prepare : 0.000040s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000017s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000013s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000012s : 0.01% optimize.opt_a.parameter_eliminate : 0.000007s : 0.01% optimize.opt_a.a_2 : 0.000333s : 0.32% optimize.opt_a.accelerated_algorithm : 0.000026s : 0.02% optimize.opt_a.shard : 0.000006s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000009s : 0.01% optimize.opt_a.shard_inline : 0.000024s : 0.02% optimize.opt_a.merge_send_recv : 0.000031s : 0.03% optimize.opt_a.auto_parallel : 0.000028s : 0.03% optimize.opt_a.parallel : 0.000151s : 0.15% optimize.opt_a.flash_sp : 0.000018s : 0.02% optimize.opt_a.merge_comm : 0.000017s : 0.02% optimize.opt_a.allreduce_fusion : 0.000015s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000051s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000003s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000031s : 0.03% optimize.opt_a.virtual_dataset : 0.000024s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000024s : 0.02% optimize.opt_a.virtual_output : 0.000024s : 0.02% optimize.opt_a.merge_forward : 0.000017s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000036s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000051s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000005s : 0.00% optimize.opt_a.before_grad : 0.000042s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.01% optimize.opt_a.meta_fg_expand : 0.002267s : 2.19% optimize.opt_a.flash_sp_send_recv_attached : 0.000009s : 0.01% optimize.opt_a.receive_attached : 0.000007s : 0.01% optimize.opt_a.after_resolve : 0.000090s : 0.09% optimize.opt_a.a_after_grad : 0.000099s : 0.10% optimize.opt_a.renormalize : 0.012238s : 11.80% optimize.opt_a.add_forward_monad_depend : 0.000020s : 0.02% optimize.opt_a.auto_monad_grad : 0.000010s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000081s : 0.08% optimize.opt_a.cse : 0.000339s : 0.33% optimize.opt_a.a_3 : 0.000383s : 0.37% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000023s : 0.02% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000753s : 0.73% optimize.opt_b.b_1 : 0.000109s : 0.10% optimize.opt_b.b_2 : 0.000017s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000023s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000062s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000028s : 0.03% optimize.loop_unroll : 0.000464s : 0.45% optimize.opt_after_cconv.c_1 : 0.000026s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000036s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000076s : 0.07% optimize.cse_after_recomputation.cse : 0.000014s : 0.01% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000027s : 0.03% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000023s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000499s : 0.48% validate : 0.000073s : 0.07% backend_pass : 0.000001s : 0.00% task_emit : 0.009923s : 9.57% execute : 0.000010s : 0.01% Time group info: ------[substitution.] 0.000812 154 0.24% : 0.000002s : 2: substitution.elim_not_effective 1.09% : 0.000009s : 11: substitution.float_depend_g_call 0.43% : 0.000004s : 2: substitution.float_tuple_getitem_switch 0.18% : 0.000001s : 2: substitution.fold_const_symbol 0.68% : 0.000006s : 3: substitution.graph_param_transform 0.47% : 0.000004s : 2: substitution.incorporate_call 0.28% : 0.000002s : 2: substitution.incorporate_call_switch 63.95% : 0.000519s : 20: substitution.inline 2.65% : 0.000022s : 2: substitution.inline_without_move 1.19% : 0.000010s : 12: substitution.j_node_and_user_rematch 1.39% : 0.000011s : 7: substitution.minmaximum_grad 2.75% : 0.000022s : 11: substitution.partial_eliminate 1.44% : 0.000012s : 12: substitution.remove_not_recompute_node 4.49% : 0.000036s : 9: substitution.replace_applicator 1.38% : 0.000011s : 9: substitution.replace_old_param 0.45% : 0.000004s : 1: substitution.set_cell_output_no_recompute 3.45% : 0.000028s : 3: substitution.switch_simplify 2.67% : 0.000022s : 7: substitution.tuple_list_convert_item_index_to_positive 1.19% : 0.000010s : 7: substitution.tuple_list_get_item_const_eliminator 1.76% : 0.000014s : 7: substitution.tuple_list_get_item_depend_reorder 6.03% : 0.000049s : 16: substitution.tuple_list_get_item_eliminator 1.83% : 0.000015s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.068732 2 95.66% : 0.065752s : 1: type_inference.infer 4.34% : 0.002980s : 1: type_inference.specialize ------[replace.] 0.000255 30 59.72% : 0.000152s : 20: replace.inline 17.65% : 0.000045s : 3: replace.switch_simplify 22.63% : 0.000058s : 7: replace.tuple_list_get_item_eliminator ------[match.] 0.000556 30 91.49% : 0.000509s : 20: match.inline 4.65% : 0.000026s : 3: match.switch_simplify 3.86% : 0.000021s : 7: match.tuple_list_get_item_eliminator ------[predicate.] 0.000559 3823 1.18% : 0.000007s : 49: predicate.accumulaten_eliminater 0.23% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.41% : 0.000002s : 17: predicate.addn_check_dump 1.22% : 0.000007s : 49: predicate.addn_zero_filter 1.06% : 0.000006s : 49: predicate.adjust_all_reduce_mul_add 2.13% : 0.000012s : 66: predicate.arithmetic_simplify 1.17% : 0.000007s : 49: predicate.cast_eliminate 1.08% : 0.000006s : 44: predicate.check_bprop_eliminate 0.42% : 0.000002s : 17: predicate.compare_switch_simplify 0.05% : 0.000000s : 3: predicate.const_output_eliminate 0.45% : 0.000002s : 17: predicate.depend_value_elim 1.19% : 0.000007s : 49: predicate.dict_get_item_const_eliminator 1.45% : 0.000008s : 49: predicate.dict_get_item_eliminator 1.21% : 0.000007s : 49: predicate.dict_set_item_eliminator 0.28% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.08% : 0.000000s : 3: predicate.elim_not_effective 0.13% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000007s : 52: predicate.environ_add_const_eliminate 1.15% : 0.000006s : 52: predicate.environ_get_add_eliminate 1.17% : 0.000007s : 52: predicate.environ_get_depend_swap 1.57% : 0.000009s : 69: predicate.environ_get_eliminate 1.17% : 0.000007s : 52: predicate.environ_get_set_eliminate 1.85% : 0.000010s : 76: predicate.exchange_switch_depend_value 2.57% : 0.000014s : 76: predicate.float_depend_g_call 0.44% : 0.000002s : 17: predicate.float_environ_get_switch 0.52% : 0.000003s : 20: predicate.float_tuple_getitem_switch 0.05% : 0.000000s : 3: predicate.fold_const_symbol 0.54% : 0.000003s : 17: predicate.get_grad_eliminate 0.09% : 0.000001s : 3: predicate.graph_param_transform 0.42% : 0.000002s : 17: predicate.incorporate_call 0.38% : 0.000002s : 17: predicate.incorporate_call_switch 5.61% : 0.000031s : 165: predicate.inline 1.38% : 0.000008s : 41: predicate.inline_without_move 0.23% : 0.000001s : 17: predicate.j_node_and_user_rematch 0.71% : 0.000004s : 17: predicate.less_batch_normalization 1.51% : 0.000008s : 62: predicate.list_to_tuple_eliminator_ 2.52% : 0.000014s : 111: predicate.load_eliminater 0.31% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.86% : 0.000016s : 113: predicate.loop_unroll_before_grad 1.38% : 0.000008s : 55: predicate.make_slice_get_slice_eliminator 0.48% : 0.000003s : 17: predicate.merge_addn 1.03% : 0.000006s : 44: predicate.micro_step_allgather_replace 1.11% : 0.000006s : 44: predicate.mini_step_allgather_replace 1.11% : 0.000006s : 49: predicate.minmaximum_grad 0.45% : 0.000002s : 3: predicate.mutable_eliminate 0.12% : 0.000001s : 3: predicate.opt_reshape 0.11% : 0.000001s : 3: predicate.parallel_virtual_node 2.37% : 0.000013s : 76: predicate.partial_defer_inline 1.58% : 0.000009s : 59: predicate.partial_eliminate 1.13% : 0.000006s : 49: predicate.print_const_string_wrapper 0.53% : 0.000003s : 17: predicate.reduce_all_const_elim 1.31% : 0.000007s : 49: predicate.reduce_eliminate 2.52% : 0.000014s : 111: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000002s : 17: predicate.remove_not_recompute_node 1.91% : 0.000011s : 100: predicate.replace_applicator 0.72% : 0.000004s : 41: predicate.replace_old_param 0.06% : 0.000000s : 3: predicate.reset_defer_inline 1.17% : 0.000007s : 49: predicate.reshape_eliminate 1.11% : 0.000006s : 44: predicate.row_tensor_add_zeros_like 0.11% : 0.000001s : 3: predicate.row_tensor_eliminate 1.36% : 0.000008s : 44: predicate.same_eliminate 0.34% : 0.000002s : 17: predicate.set_cell_output_no_recompute 0.75% : 0.000004s : 17: predicate.shard_identity_eliminate 0.21% : 0.000001s : 6: predicate.special_op_eliminate 0.49% : 0.000003s : 17: predicate.specialize_transform 1.36% : 0.000008s : 44: predicate.split_environ_get_set_with_tuple_value 1.32% : 0.000007s : 41: predicate.stack_unstack_eliminate 0.09% : 0.000000s : 3: predicate.switch_call_monad_eliminater 2.04% : 0.000011s : 76: predicate.switch_defer_inline 3.01% : 0.000017s : 120: predicate.switch_layer_defer_inline 5.80% : 0.000032s : 215: predicate.switch_simplify 1.11% : 0.000006s : 49: predicate.tile_eliminate 1.11% : 0.000006s : 49: predicate.transpose_eliminate 1.41% : 0.000008s : 55: predicate.tuple_list_convert_item_index_to_positive 1.45% : 0.000008s : 55: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000008s : 55: predicate.tuple_list_get_item_depend_reorder 2.65% : 0.000015s : 79: predicate.tuple_list_get_item_eliminator 1.43% : 0.000008s : 55: predicate.tuple_list_get_set_item_eliminator 1.86% : 0.000010s : 72: predicate.tuple_list_set_item_eliminator 1.49% : 0.000008s : 62: predicate.tuple_to_list_eliminator_ 2.44% : 0.000014s : 111: predicate.updatestate_pure_node_eliminater 2.94% : 0.000016s : 128: predicate.updatestate_useless_node_eliminater 0.10% : 0.000001s : 3: predicate.value_based_eliminate 0.52% : 0.000003s : 17: predicate.virtual_dataset_eliminate 0.54% : 0.000003s : 17: predicate.virtual_output_eliminate 0.07% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.14% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003278 41 66.66% : 0.002185s : 17: func_graph_cloner_run.FuncGraphClonerGraph 33.34% : 0.001093s : 24: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.156254 237 0.00% : 0.000004s : 1: ForceFp32Comm 3.72% : 0.005813s : 1: add_attr 3.69% : 0.005771s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000081s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.12% : 0.000190s : 1: auto_monad 0.02% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 1.08% : 0.001693s : 1: bootstrap 0.02% : 0.000031s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.18% : 0.000288s : 1: event_method 0.01% : 0.000016s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000015s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.30% : 0.000472s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.49% : 0.000764s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 2.60% : 0.004062s : 117: opt.transform.opt_a 0.02% : 0.000025s : 1: opt.transform.opt_after_cconv 0.01% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000090s : 28: opt.transform.opt_b 0.03% : 0.000040s : 2: opt.transform.opt_trans_graph 0.02% : 0.000034s : 4: opt.transform.symbol_engine_opt 13.14% : 0.020532s : 1: opt_a 0.07% : 0.000102s : 1: opt_after_cconv 0.33% : 0.000508s : 1: opt_after_jit_grad 0.13% : 0.000206s : 1: opt_b 14.84% : 0.023189s : 1: optimize 0.04% : 0.000067s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000030s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.05% : 0.000072s : 1: pre_auto_parallel 0.01% : 0.000011s : 1: py_interpret_to_execute 0.01% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.01% : 0.000018s : 1: remove_dup_value 6.43% : 0.010053s : 2: renormalize.infer 1.38% : 0.002164s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000027s : 1: rewriter_after_opt_a 0.24% : 0.000372s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000076s : 1: symbol_engine_optimizer 6.36% : 0.009945s : 1: task_emit 0.05% : 0.000070s : 1: tuple_transform 44.11% : 0.068917s : 1: type_inference 0.07% : 0.000114s : 1: validate TotalTime = 4.75287, [24] [bootstrap]: 0.00088308 [type_inference]: 0.0585459 [event_method]: 9.167e-05 [auto_monad]: 0.00017074 [graph_reusing]: 9.54999e-06 [inline]: 1.79e-06 [add_attr]: 0.00735908, [1] [add_attr_with_inline]: 0.00734211, [1] [Cycle 1]: 9.597e-05, [2] [tag_attr]: 3.4e-05 [meta_addattr_fg_expand]: 9.69999e-06 [parallel-infer-symbol]: 1.86e-06 [pre_auto_parallel]: 4.412e-05 [insert-virtual-dataset]: 1.20999e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 9.79984e-07 [pipeline_split]: 1.00999e-06 [optimize]: 0.00584415, [53] [py_interpret_to_execute]: 4.42998e-06 [rewriter_before_opt_a]: 0.00023151 [opt_a]: 0.00360977, [2] [Cycle 1]: 0.00290264, [45] [expand_dump_flag]: 2.16998e-06 [switch_simplify]: 9.955e-05 [loop_unroll]: 3.916e-05 [a_1]: 0.00076078 [with_stream_mark]: 1.411e-05 [recompute_prepare]: 8.98002e-06 [updatestate_depend_eliminate]: 8.07e-06 [updatestate_assign_eliminate]: 6.39001e-06 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 1.14003e-06 [a_2]: 9.581e-05 [accelerated_algorithm]: 8.07998e-06 [shard]: 1.01002e-06 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 6.98e-06 [merge_send_recv]: 2.302e-05 [auto_parallel]: 6.86001e-06 [parallel]: 4.417e-05 [flash_sp]: 1.583e-05 [merge_comm]: 4.85999e-06 [allreduce_fusion]: 7.18998e-06 [matmul_add_comm_reduction]: 1.008e-05 [allreduce_slice_to_reducescatter]: 3.3e-06 [virtual_shard_identity]: 9.85002e-06 [virtual_dataset]: 7.63001e-06 [get_grad_eliminate_]: 6.58e-06 [virtual_output]: 6.63e-06 [merge_forward]: 3.35e-06 [cell_reuse_recompute_pass]: 9.89996e-07 [offload_activation]: 1.008e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.322e-05 [merge_recompute_call_nodes]: 7.61001e-06 [before_grad]: 1.125e-05 [set_forward_comm_id_for_comm_node_pass]: 7.64002e-06 [meta_fg_expand]: 3.26001e-06 [flash_sp_send_recv_attached]: 1.69e-06 [receive_attached]: 7.46999e-06 [after_resolve]: 9.90002e-06 [a_after_grad]: 1.059e-05 [renormalize]: 0.00123938 [add_forward_monad_depend]: 5.14998e-06 [auto_monad_grad]: 1.49e-06 [auto_monad_eliminator]: 2.25e-05 [cse]: 4.587e-05 [a_3]: 5.032e-05 [Cycle 2]: 0.00069696, [45] [expand_dump_flag]: 1.05999e-06 [switch_simplify]: 8.08999e-06 [loop_unroll]: 6.63998e-06 [a_1]: 0.00014833 [with_stream_mark]: 1.089e-05 [recompute_prepare]: 6.59001e-06 [updatestate_depend_eliminate]: 3.76999e-06 [updatestate_assign_eliminate]: 3.13998e-06 [updatestate_loads_eliminate]: 3.21999e-06 [parameter_eliminate]: 1.12999e-06 [a_2]: 8.356e-05 [accelerated_algorithm]: 6.58003e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 1.44e-06 [shard_inline]: 6.48e-06 [merge_send_recv]: 5.51e-06 [auto_parallel]: 6.26998e-06 [parallel]: 4.79e-06 [flash_sp]: 9.25001e-06 [merge_comm]: 4.58999e-06 [allreduce_fusion]: 3.68e-06 [matmul_add_comm_reduction]: 6.39999e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 7.56001e-06 [virtual_dataset]: 6.39001e-06 [get_grad_eliminate_]: 6.17999e-06 [virtual_output]: 5.80002e-06 [merge_forward]: 3.56999e-06 [cell_reuse_recompute_pass]: 1.24e-06 [offload_activation]: 6.34001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.122e-05 [merge_recompute_call_nodes]: 8.59989e-07 [before_grad]: 9.66998e-06 [set_forward_comm_id_for_comm_node_pass]: 4.15999e-06 [meta_fg_expand]: 2.56998e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 9.09998e-06 [a_after_grad]: 8.87e-06 [renormalize]: 2.29978e-07 [add_forward_monad_depend]: 1.22999e-06 [auto_monad_grad]: 8.70001e-07 [auto_monad_eliminator]: 7.85998e-06 [cse]: 1.991e-05 [a_3]: 3.728e-05 [py_interpret_to_execute_after_opt_a]: 4.42003e-06 [slice_cell_reuse_recomputed_activation]: 9.89996e-07 [rewriter_after_opt_a]: 2.416e-05 [convert_after_rewriter]: 1.19e-06 [order_py_execute_after_rewriter]: 9.30013e-07 [mutable_eliminate]: 0.00049881 [opt_b]: 0.00025644, [1] [Cycle 1]: 0.00025004, [7] [b_1]: 0.0001518 [b_2]: 7.78999e-06 [updatestate_depend_eliminate]: 7.23e-06 [updatestate_assign_eliminate]: 3.08e-06 [updatestate_loads_eliminate]: 3.01999e-06 [renormalize]: 4.39992e-07 [cse]: 2.535e-05 [optimize_parallel_all_gather_comm]: 1.871e-05 [overlap_param_gather]: 5.29998e-06 [cconv]: 1.45e-05 [loop_unroll]: 0.00042914 [opt_after_cconv]: 0.00010656, [1] [Cycle 1]: 0.0001009, [7] [c_1]: 2.905e-05 [parameter_eliminate]: 2.34001e-06 [updatestate_depend_eliminate]: 6.04001e-06 [updatestate_assign_eliminate]: 3.41999e-06 [updatestate_loads_eliminate]: 3.19001e-06 [cse]: 2.372e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 2.22e-05 [tuple_transform]: 8.843e-05, [1] [Cycle 1]: 8.381e-05, [4] [d_1]: 5.716e-05 [none_parameter_eliminate]: 1.12999e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7e-06 [partial_unused_args_eliminate]: 1.07e-06 [add_recomputation]: 4.28e-05 [cse_after_recomputation]: 2.568e-05, [1] [Cycle 1]: 2.132e-05, [1] [cse]: 1.609e-05 [environ_conv]: 1.5e-05 [swap_dp_allreduce_reducescatter]: 1.429e-05 [bias_add_comm_swap]: 5.15001e-06 [label_micro_interleaved_index]: 6.57002e-06 [label_fine_grained_interleaved_index]: 1.59998e-06 [merge_cast_opt]: 6.29982e-07 [slice_recompute_activation]: 9.09989e-07 [micro_interleaved_order_control]: 1.20999e-06 [assign_add_opt]: 6.30011e-07 [ForceFp32Comm]: 4.39992e-07 [remove_cast_before_assign_add]: 3.91001e-06 [full_micro_interleaved_order_control]: 5.35999e-06 [reorder_send_recv_between_fp_bp]: 1.22e-06 [comm_op_add_attrs]: 4.69998e-07 [add_comm_op_reuse_tag]: 4.20026e-07 [interleave_split_concat_branches]: 7.39994e-07 [interleave_parallel_branches]: 3.99002e-06 [overlap_opt_shard_in_pipeline]: 1.351e-05 [overlap_opt_shard_grad_in_pipeline]: 7.59988e-07 [control_data_broadcast_order]: 1.188e-05 [grouped_pairwise_exchange_alltoall]: 5.69999e-07 [offloading_packed_experts]: 3.91001e-06 [overlap_recompute_and_grad_model_parallel]: 7.42002e-06 [overlap_grad_matmul_and_grad_allreduce]: 8.2e-07 [overlap_recompute_allgather_and_fa_grad]: 7.49977e-07 [overlap_recompute_comm]: 1.29003e-06 [overlap_grad_ring_attention]: 1.021e-05 [overlap_grad_flash_sp]: 2.482e-05 [begin_end_overlap_inline]: 3.29979e-07 [split_matmul_comm_elemetwise]: 4.37e-06 [split_layernorm_comm]: 9.40025e-07 [handle_group_info]: 4.80009e-07 [symbol_engine_optimizer]: 9.016e-05, [1] [Cycle 1]: 8.534e-05, [6] [build]: 1.433e-05 [elim_shapecalc]: 1.094e-05 [elim_not_effective]: 1.329e-05 [opt_reshape]: 7.42998e-06 [fold_const_symbol]: 1.067e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.09e-06 [pipeline_parallel_scheduler]: 9.29984e-07 [auto_monad_reorder]: 2.078e-05 [get_jit_bprop_graph]: 1.19e-06 [rewriter_after_jit_bprop_graph]: 3.06001e-06 [opt_after_jit_grad]: 0.00046881 [validate]: 5.6e-05 [backend_pass]: 6.90023e-07 [task_emit]: 4.67901 [execute]: 1.096e-05 Sums bootstrap : 0.000883s : 0.02% type_inference : 0.058546s : 1.23% event_method : 0.000092s : 0.00% auto_monad : 0.000171s : 0.00% graph_reusing : 0.000010s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000034s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000010s : 0.00% parallel-infer-symbol : 0.000002s : 0.00% pre_auto_parallel : 0.000044s : 0.00% insert-virtual-dataset : 0.000001s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000001s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000232s : 0.00% optimize.opt_a.expand_dump_flag : 0.000003s : 0.00% optimize.opt_a.switch_simplify : 0.000108s : 0.00% optimize.opt_a.loop_unroll : 0.000046s : 0.00% optimize.opt_a.a_1 : 0.000909s : 0.02% optimize.opt_a.with_stream_mark : 0.000025s : 0.00% optimize.opt_a.recompute_prepare : 0.000016s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000012s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000179s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.00% optimize.opt_a.shard : 0.000002s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000029s : 0.00% optimize.opt_a.auto_parallel : 0.000013s : 0.00% optimize.opt_a.parallel : 0.000049s : 0.00% optimize.opt_a.flash_sp : 0.000025s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000011s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.00% optimize.opt_a.virtual_dataset : 0.000014s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000016s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000008s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000002s : 0.00% optimize.opt_a.receive_attached : 0.000008s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.00% optimize.opt_a.a_after_grad : 0.000019s : 0.00% optimize.opt_a.renormalize : 0.001240s : 0.03% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.00% optimize.opt_a.cse : 0.000066s : 0.00% optimize.opt_a.a_3 : 0.000088s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000001s : 0.00% optimize.rewriter_after_opt_a : 0.000024s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000499s : 0.01% optimize.opt_b.b_1 : 0.000152s : 0.00% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000025s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.00% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000014s : 0.00% optimize.loop_unroll : 0.000429s : 0.01% optimize.opt_after_cconv.c_1 : 0.000029s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000022s : 0.00% optimize.tuple_transform.d_1 : 0.000057s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000043s : 0.00% optimize.cse_after_recomputation.cse : 0.000016s : 0.00% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000014s : 0.00% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000001s : 0.00% optimize.micro_interleaved_order_control : 0.000001s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000000s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000001s : 0.00% optimize.comm_op_add_attrs : 0.000000s : 0.00% optimize.add_comm_op_reuse_tag : 0.000000s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000014s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000001s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000001s : 0.00% optimize.overlap_grad_ring_attention : 0.000010s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000000s : 0.00% optimize.symbol_engine_optimizer.build : 0.000014s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000021s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000469s : 0.01% validate : 0.000056s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 4.679015s : 98.62% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000290 62 0.60% : 0.000002s : 3: substitution.elim_not_effective 1.74% : 0.000005s : 3: substitution.float_tuple_getitem_switch 0.54% : 0.000002s : 3: substitution.fold_const_symbol 1.42% : 0.000004s : 4: substitution.graph_param_transform 61.48% : 0.000178s : 8: substitution.inline 1.22% : 0.000004s : 6: substitution.j_node_and_user_rematch 1.22% : 0.000004s : 2: substitution.minmaximum_grad 1.73% : 0.000005s : 6: substitution.remove_not_recompute_node 0.92% : 0.000003s : 2: substitution.replace_old_param 4.44% : 0.000013s : 1: substitution.switch_simplify 4.39% : 0.000013s : 4: substitution.tuple_list_convert_item_index_to_positive 3.74% : 0.000011s : 4: substitution.tuple_list_get_item_const_eliminator 2.93% : 0.000008s : 4: substitution.tuple_list_get_item_depend_reorder 10.70% : 0.000031s : 8: substitution.tuple_list_get_item_eliminator 2.91% : 0.000008s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.058469 2 96.96% : 0.056691s : 1: type_inference.infer 3.04% : 0.001778s : 1: type_inference.specialize ------[replace.] 0.000092 11 62.54% : 0.000058s : 8: replace.inline 18.79% : 0.000017s : 1: replace.switch_simplify 18.67% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000194 11 89.75% : 0.000174s : 8: match.inline 6.25% : 0.000012s : 1: match.switch_simplify 4.00% : 0.000008s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000226 1438 0.97% : 0.000002s : 16: predicate.accumulaten_eliminater 0.68% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 1.03% : 0.000002s : 16: predicate.addn_zero_filter 0.91% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.19% : 0.000005s : 24: predicate.arithmetic_simplify 1.12% : 0.000003s : 16: predicate.cast_eliminate 0.52% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.46% : 0.000001s : 8: predicate.depend_value_elim 1.00% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.13% : 0.000003s : 16: predicate.dict_get_item_eliminator 1.05% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.84% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 4: predicate.elim_not_effective 0.47% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.30% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 20: predicate.environ_get_depend_swap 1.71% : 0.000004s : 28: predicate.environ_get_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.59% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.20% : 0.000005s : 26: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.16% : 0.000000s : 4: predicate.graph_param_transform 0.56% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 5.64% : 0.000013s : 66: predicate.inline 0.66% : 0.000001s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.02% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.55% : 0.000006s : 42: predicate.load_eliminater 0.82% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.87% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.75% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.97% : 0.000002s : 16: predicate.minmaximum_grad 0.83% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.41% : 0.000001s : 4: predicate.parallel_virtual_node 2.08% : 0.000005s : 26: predicate.partial_defer_inline 1.42% : 0.000003s : 22: predicate.partial_eliminate 0.93% : 0.000002s : 16: predicate.print_const_string_wrapper 0.52% : 0.000001s : 8: predicate.reduce_all_const_elim 1.50% : 0.000003s : 16: predicate.reduce_eliminate 2.46% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.30% : 0.000001s : 8: predicate.remove_not_recompute_node 1.27% : 0.000003s : 26: predicate.replace_applicator 0.36% : 0.000001s : 8: predicate.replace_old_param 0.16% : 0.000000s : 4: predicate.reset_defer_inline 1.02% : 0.000002s : 16: predicate.reshape_eliminate 0.55% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.70% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 8: predicate.shard_identity_eliminate 0.75% : 0.000002s : 8: predicate.special_op_eliminate 0.69% : 0.000002s : 8: predicate.specialize_transform 0.71% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.75% : 0.000004s : 26: predicate.switch_defer_inline 2.22% : 0.000005s : 34: predicate.switch_layer_defer_inline 6.09% : 0.000014s : 86: predicate.switch_simplify 0.96% : 0.000002s : 16: predicate.tile_eliminate 0.98% : 0.000002s : 16: predicate.transpose_eliminate 1.68% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.76% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.61% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000007s : 34: predicate.tuple_list_get_item_eliminator 1.71% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.78% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.43% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 3.08% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 4: predicate.value_based_eliminate 0.61% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.55% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001347 23 56.38% : 0.000760s : 11: func_graph_cloner_run.FuncGraphClonerGraph 43.62% : 0.000588s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 4.768914 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.15% : 0.007364s : 1: add_attr 0.15% : 0.007346s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000047s : 1: add_recomputation 0.00% : 0.000003s : 1: assign_add_opt 0.00% : 0.000181s : 1: auto_monad 0.00% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.02% : 0.000934s : 1: bootstrap 0.00% : 0.000018s : 1: cconv 0.00% : 0.000003s : 1: comm_op_add_attrs 0.00% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000029s : 1: cse_after_recomputation 0.00% : 0.000004s : 1: dataset_repeat_opt 0.00% : 0.000004s : 1: detach_backward 0.00% : 0.000019s : 1: environ_conv 0.00% : 0.000100s : 1: event_method 0.00% : 0.000036s : 1: execute 0.00% : 0.000009s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000003s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000004s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000003s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.01% : 0.000438s : 1: loop_unroll 0.00% : 0.000003s : 1: merge_cast_opt 0.00% : 0.000004s : 1: micro_interleaved_order_control 0.01% : 0.000508s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.03% : 0.001432s : 78: opt.transform.opt_a 0.00% : 0.000028s : 1: opt.transform.opt_after_cconv 0.00% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000133s : 28: opt.transform.opt_b 0.00% : 0.000062s : 2: opt.transform.opt_trans_graph 0.00% : 0.000039s : 4: opt.transform.symbol_engine_opt 0.08% : 0.003613s : 1: opt_a 0.00% : 0.000110s : 1: opt_after_cconv 0.01% : 0.000479s : 1: opt_after_jit_grad 0.01% : 0.000260s : 1: opt_b 0.12% : 0.005848s : 1: optimize 0.00% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000013s : 1: overlap_grad_ring_attention 0.00% : 0.000003s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000017s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000003s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000004s : 1: overlap_recompute_comm 0.00% : 0.000005s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000049s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.00% : 0.000026s : 1: remove_dup_value 0.01% : 0.000685s : 1: renormalize.infer 0.01% : 0.000547s : 1: renormalize.specialize 0.00% : 0.000004s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000028s : 1: rewriter_after_opt_a 0.00% : 0.000237s : 1: rewriter_before_opt_a 0.00% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000004s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.00% : 0.000018s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000093s : 1: symbol_engine_optimizer 98.12% : 4.679067s : 1: task_emit 0.00% : 0.000091s : 1: tuple_transform 1.23% : 0.058563s : 1: type_inference 0.00% : 0.000079s : 1: validate [WARNING] CORE(87365,ffffbf434f30,python3.9):2026-01-29-17:52:00.321.325 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph0 TotalTime = 4.92625, [24] [bootstrap]: 0.00088323 [type_inference]: 0.0515079 [event_method]: 2.113e-05 [auto_monad]: 0.00014985 [graph_reusing]: 5.81e-06 [inline]: 1.96e-06 [add_attr]: 0.00728537, [1] [add_attr_with_inline]: 0.00727371, [1] [Cycle 1]: 0.00012716, [2] [tag_attr]: 2.95e-05 [meta_addattr_fg_expand]: 1.709e-05 [parallel-infer-symbol]: 3.12997e-06 [pre_auto_parallel]: 5.652e-05 [insert-virtual-dataset]: 3.03998e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 1.82999e-06 [pipeline_split]: 1.70001e-06 [optimize]: 0.00537047, [53] [py_interpret_to_execute]: 4.77e-06 [rewriter_before_opt_a]: 0.00025698 [opt_a]: 0.00319609, [2] [Cycle 1]: 0.00263907, [45] [expand_dump_flag]: 3.18e-06 [switch_simplify]: 0.00012036 [loop_unroll]: 3.271e-05 [a_1]: 0.00062389 [with_stream_mark]: 1.436e-05 [recompute_prepare]: 7.24001e-06 [updatestate_depend_eliminate]: 1.361e-05 [updatestate_assign_eliminate]: 1.311e-05 [updatestate_loads_eliminate]: 5.42999e-06 [parameter_eliminate]: 8.09989e-07 [a_2]: 7.351e-05 [accelerated_algorithm]: 6.33e-06 [shard]: 1.82001e-06 [meta_shard_fg_expand]: 5.29e-06 [shard_inline]: 5.69e-06 [merge_send_recv]: 4.481e-05 [auto_parallel]: 5.59998e-06 [parallel]: 8.356e-05 [flash_sp]: 4.265e-05 [merge_comm]: 7.31001e-06 [allreduce_fusion]: 1.239e-05 [matmul_add_comm_reduction]: 2.016e-05 [allreduce_slice_to_reducescatter]: 1.147e-05 [virtual_shard_identity]: 9.00001e-06 [virtual_dataset]: 5.96998e-06 [get_grad_eliminate_]: 5.48002e-06 [virtual_output]: 8.03001e-06 [merge_forward]: 3.02002e-06 [cell_reuse_recompute_pass]: 9.70002e-07 [offload_activation]: 1.68e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.247e-05 [merge_recompute_call_nodes]: 1.37999e-06 [before_grad]: 9.17001e-06 [set_forward_comm_id_for_comm_node_pass]: 1.365e-05 [meta_fg_expand]: 2.63e-06 [flash_sp_send_recv_attached]: 2.32001e-06 [receive_attached]: 2.274e-05 [after_resolve]: 8.85999e-06 [a_after_grad]: 8.18001e-06 [renormalize]: 0.00096302 [add_forward_monad_depend]: 5.29e-06 [auto_monad_grad]: 1.15999e-06 [auto_monad_eliminator]: 2.209e-05 [cse]: 5.812e-05 [a_3]: 3.762e-05 [Cycle 2]: 0.00054751, [45] [expand_dump_flag]: 1.09e-06 [switch_simplify]: 6.59999e-06 [loop_unroll]: 5.56e-06 [a_1]: 9.42e-05 [with_stream_mark]: 8.95001e-06 [recompute_prepare]: 5.23002e-06 [updatestate_depend_eliminate]: 2.86e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.22001e-06 [parameter_eliminate]: 9.80013e-07 [a_2]: 6.026e-05 [accelerated_algorithm]: 5.34e-06 [shard]: 8.00006e-07 [meta_shard_fg_expand]: 1.16997e-06 [shard_inline]: 5.05001e-06 [merge_send_recv]: 4.05e-06 [auto_parallel]: 4.82e-06 [parallel]: 3.96001e-06 [flash_sp]: 8.07998e-06 [merge_comm]: 3.18998e-06 [allreduce_fusion]: 2.74001e-06 [matmul_add_comm_reduction]: 5.02e-06 [allreduce_slice_to_reducescatter]: 4.09986e-07 [virtual_shard_identity]: 6.16e-06 [virtual_dataset]: 4.99998e-06 [get_grad_eliminate_]: 4.84003e-06 [virtual_output]: 5.39e-06 [merge_forward]: 2.66e-06 [cell_reuse_recompute_pass]: 1.80001e-06 [offload_activation]: 5.51998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.138e-05 [merge_recompute_call_nodes]: 6.99976e-07 [before_grad]: 8.07e-06 [set_forward_comm_id_for_comm_node_pass]: 2.86999e-06 [meta_fg_expand]: 2.11e-06 [flash_sp_send_recv_attached]: 8.2e-07 [receive_attached]: 1.02e-06 [after_resolve]: 7.5e-06 [a_after_grad]: 7.52998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.05001e-06 [auto_monad_grad]: 6.89994e-07 [auto_monad_eliminator]: 5.99999e-06 [cse]: 1.403e-05 [a_3]: 3.043e-05 [py_interpret_to_execute_after_opt_a]: 4e-06 [slice_cell_reuse_recomputed_activation]: 5.00001e-06 [rewriter_after_opt_a]: 1.815e-05 [convert_after_rewriter]: 1.03001e-06 [order_py_execute_after_rewriter]: 1.39998e-06 [mutable_eliminate]: 0.00048433 [opt_b]: 0.00017646, [1] [Cycle 1]: 0.00017083, [7] [b_1]: 0.00010153 [b_2]: 6.66e-06 [updatestate_depend_eliminate]: 5.44e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 2.27999e-06 [renormalize]: 4.59986e-07 [cse]: 2.066e-05 [optimize_parallel_all_gather_comm]: 1.825e-05 [overlap_param_gather]: 5.86e-06 [cconv]: 1.417e-05 [loop_unroll]: 0.00040872 [opt_after_cconv]: 9.226e-05, [1] [Cycle 1]: 8.669e-05, [7] [c_1]: 2.356e-05 [parameter_eliminate]: 2.41998e-06 [updatestate_depend_eliminate]: 5.02999e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.32001e-06 [cse]: 1.917e-05 [renormalize]: 3.00002e-07 [remove_dup_value]: 2.551e-05 [tuple_transform]: 6.607e-05, [1] [Cycle 1]: 6.187e-05, [4] [d_1]: 3.752e-05 [none_parameter_eliminate]: 7.7e-07 [renormalize]: 1.60013e-07 [switch_simplify]: 5.77999e-06 [partial_unused_args_eliminate]: 8.60018e-07 [add_recomputation]: 3.513e-05 [cse_after_recomputation]: 1.943e-05, [1] [Cycle 1]: 1.549e-05, [1] [cse]: 1.044e-05 [environ_conv]: 1.525e-05 [swap_dp_allreduce_reducescatter]: 1.585e-05 [bias_add_comm_swap]: 1.169e-05 [label_micro_interleaved_index]: 5.64e-06 [label_fine_grained_interleaved_index]: 1.44e-06 [merge_cast_opt]: 6.49976e-07 [slice_recompute_activation]: 8.59989e-07 [micro_interleaved_order_control]: 1.25999e-06 [assign_add_opt]: 6.00005e-07 [ForceFp32Comm]: 4.68999e-06 [remove_cast_before_assign_add]: 3.72998e-06 [full_micro_interleaved_order_control]: 4.24002e-06 [reorder_send_recv_between_fp_bp]: 9.89996e-07 [comm_op_add_attrs]: 2.84001e-06 [add_comm_op_reuse_tag]: 3.00002e-07 [interleave_split_concat_branches]: 6.69999e-07 [interleave_parallel_branches]: 3.9e-06 [overlap_opt_shard_in_pipeline]: 1.095e-05 [overlap_opt_shard_grad_in_pipeline]: 6.80011e-07 [control_data_broadcast_order]: 1.284e-05 [grouped_pairwise_exchange_alltoall]: 4.10015e-07 [offloading_packed_experts]: 2.62001e-06 [overlap_recompute_and_grad_model_parallel]: 6.68998e-06 [overlap_grad_matmul_and_grad_allreduce]: 7.39994e-07 [overlap_recompute_allgather_and_fa_grad]: 7.00005e-07 [overlap_recompute_comm]: 9.20001e-07 [overlap_grad_ring_attention]: 2.31e-05 [overlap_grad_flash_sp]: 5.325e-05 [begin_end_overlap_inline]: 3.49974e-07 [split_matmul_comm_elemetwise]: 1.632e-05 [split_layernorm_comm]: 6.80011e-07 [handle_group_info]: 3.89991e-07 [symbol_engine_optimizer]: 0.00010012, [1] [Cycle 1]: 9.593e-05, [6] [build]: 3.313e-05 [elim_shapecalc]: 9.50001e-06 [elim_not_effective]: 1.094e-05 [opt_reshape]: 6.18002e-06 [fold_const_symbol]: 8.95001e-06 [renormalize]: 1.79978e-07 [detach_backward]: 1.03001e-06 [pipeline_parallel_scheduler]: 9.00007e-07 [auto_monad_reorder]: 1.309e-05 [get_jit_bprop_graph]: 1.00999e-06 [rewriter_after_jit_bprop_graph]: 3.01001e-06 [opt_after_jit_grad]: 0.00044805 [validate]: 5.403e-05 [backend_pass]: 7.80012e-07 [task_emit]: 4.85991 [execute]: 1.39e-05 Sums bootstrap : 0.000883s : 0.02% type_inference : 0.051508s : 1.05% event_method : 0.000021s : 0.00% auto_monad : 0.000150s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000017s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000057s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000257s : 0.01% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000127s : 0.00% optimize.opt_a.loop_unroll : 0.000038s : 0.00% optimize.opt_a.a_1 : 0.000718s : 0.01% optimize.opt_a.with_stream_mark : 0.000023s : 0.00% optimize.opt_a.recompute_prepare : 0.000012s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000134s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.00% optimize.opt_a.merge_send_recv : 0.000049s : 0.00% optimize.opt_a.auto_parallel : 0.000010s : 0.00% optimize.opt_a.parallel : 0.000088s : 0.00% optimize.opt_a.flash_sp : 0.000051s : 0.00% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000015s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000012s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.00% optimize.opt_a.virtual_dataset : 0.000011s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000010s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000006s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000017s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000017s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000024s : 0.00% optimize.opt_a.after_resolve : 0.000016s : 0.00% optimize.opt_a.a_after_grad : 0.000016s : 0.00% optimize.opt_a.renormalize : 0.000963s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.00% optimize.opt_a.cse : 0.000072s : 0.00% optimize.opt_a.a_3 : 0.000068s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000018s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000484s : 0.01% optimize.opt_b.b_1 : 0.000102s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.00% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000014s : 0.00% optimize.loop_unroll : 0.000409s : 0.01% optimize.opt_after_cconv.c_1 : 0.000024s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000026s : 0.00% optimize.tuple_transform.d_1 : 0.000038s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000035s : 0.00% optimize.cse_after_recomputation.cse : 0.000010s : 0.00% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000016s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000001s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000001s : 0.00% optimize.micro_interleaved_order_control : 0.000001s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000005s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000004s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000001s : 0.00% optimize.comm_op_add_attrs : 0.000003s : 0.00% optimize.add_comm_op_reuse_tag : 0.000000s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000011s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000001s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000000s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000001s : 0.00% optimize.overlap_grad_ring_attention : 0.000023s : 0.00% optimize.overlap_grad_flash_sp : 0.000053s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000016s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000000s : 0.00% optimize.symbol_engine_optimizer.build : 0.000033s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000011s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000013s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000448s : 0.01% validate : 0.000054s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 4.859915s : 98.83% execute : 0.000014s : 0.00% Time group info: ------[substitution.] 0.000253 26 0.49% : 0.000001s : 2: substitution.elim_not_effective 0.38% : 0.000001s : 2: substitution.fold_const_symbol 3.35% : 0.000009s : 3: substitution.graph_param_transform 71.77% : 0.000182s : 6: substitution.inline 1.21% : 0.000003s : 4: substitution.j_node_and_user_rematch 6.02% : 0.000015s : 4: substitution.remove_not_recompute_node 1.16% : 0.000003s : 2: substitution.replace_old_param 8.00% : 0.000020s : 1: substitution.switch_simplify 7.61% : 0.000019s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.051420 2 97.63% : 0.050202s : 1: type_inference.infer 2.37% : 0.001218s : 1: type_inference.specialize ------[replace.] 0.000078 9 60.97% : 0.000047s : 6: replace.inline 18.01% : 0.000014s : 1: replace.switch_simplify 21.02% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000216 9 82.57% : 0.000179s : 6: match.inline 9.02% : 0.000020s : 1: match.switch_simplify 8.41% : 0.000018s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000171 1092 0.96% : 0.000002s : 12: predicate.accumulaten_eliminater 0.82% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 6: predicate.addn_check_dump 1.07% : 0.000002s : 12: predicate.addn_zero_filter 0.90% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.25% : 0.000004s : 18: predicate.arithmetic_simplify 0.99% : 0.000002s : 12: predicate.cast_eliminate 0.51% : 0.000001s : 6: predicate.check_bprop_eliminate 0.49% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.68% : 0.000001s : 6: predicate.depend_value_elim 0.99% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.10% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.91% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 3: predicate.elim_not_effective 0.39% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.27% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.17% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.13% : 0.000002s : 15: predicate.environ_get_depend_swap 1.62% : 0.000003s : 21: predicate.environ_get_eliminate 1.10% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.54% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.32% : 0.000004s : 20: predicate.float_depend_g_call 0.46% : 0.000001s : 6: predicate.float_environ_get_switch 0.76% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.58% : 0.000001s : 6: predicate.get_grad_eliminate 0.18% : 0.000000s : 3: predicate.graph_param_transform 0.52% : 0.000001s : 6: predicate.incorporate_call 0.44% : 0.000001s : 6: predicate.incorporate_call_switch 6.07% : 0.000010s : 50: predicate.inline 0.75% : 0.000001s : 6: predicate.inline_without_move 0.28% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.81% : 0.000001s : 6: predicate.less_batch_normalization 1.68% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.52% : 0.000004s : 32: predicate.load_eliminater 0.97% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.93% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.72% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.57% : 0.000001s : 6: predicate.merge_addn 0.46% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.96% : 0.000002s : 12: predicate.minmaximum_grad 1.14% : 0.000002s : 3: predicate.mutable_eliminate 0.38% : 0.000001s : 3: predicate.opt_reshape 0.42% : 0.000001s : 3: predicate.parallel_virtual_node 2.18% : 0.000004s : 20: predicate.partial_defer_inline 1.45% : 0.000002s : 17: predicate.partial_eliminate 1.04% : 0.000002s : 12: predicate.print_const_string_wrapper 0.67% : 0.000001s : 6: predicate.reduce_all_const_elim 1.59% : 0.000003s : 12: predicate.reduce_eliminate 2.49% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 6: predicate.remove_not_recompute_node 1.21% : 0.000002s : 20: predicate.replace_applicator 0.40% : 0.000001s : 6: predicate.replace_old_param 0.26% : 0.000000s : 3: predicate.reset_defer_inline 1.00% : 0.000002s : 12: predicate.reshape_eliminate 0.66% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 3: predicate.row_tensor_eliminate 0.67% : 0.000001s : 6: predicate.same_eliminate 0.38% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.78% : 0.000001s : 6: predicate.shard_identity_eliminate 0.64% : 0.000001s : 6: predicate.special_op_eliminate 0.63% : 0.000001s : 6: predicate.specialize_transform 0.64% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.68% : 0.000003s : 20: predicate.switch_defer_inline 2.20% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.12% : 0.000010s : 68: predicate.switch_simplify 0.97% : 0.000002s : 12: predicate.tile_eliminate 0.96% : 0.000002s : 12: predicate.transpose_eliminate 1.57% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.48% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 2.74% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.51% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.85% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.33% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.94% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.28% : 0.000000s : 3: predicate.value_based_eliminate 0.62% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.65% : 0.000001s : 6: predicate.virtual_output_eliminate 0.22% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000982 16 56.65% : 0.000556s : 8: func_graph_cloner_run.FuncGraphClonerGraph 43.35% : 0.000426s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 4.941029 196 0.00% : 0.000008s : 1: ForceFp32Comm 0.15% : 0.007290s : 1: add_attr 0.15% : 0.007278s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000039s : 1: add_recomputation 0.00% : 0.000003s : 1: assign_add_opt 0.00% : 0.000155s : 1: auto_monad 0.00% : 0.000017s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.02% : 0.000934s : 1: bootstrap 0.00% : 0.000018s : 1: cconv 0.00% : 0.000006s : 1: comm_op_add_attrs 0.00% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000022s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000004s : 1: detach_backward 0.00% : 0.000019s : 1: environ_conv 0.00% : 0.000027s : 1: event_method 0.00% : 0.000070s : 1: execute 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000003s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000004s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.01% : 0.000417s : 1: loop_unroll 0.00% : 0.000003s : 1: merge_cast_opt 0.00% : 0.000004s : 1: micro_interleaved_order_control 0.01% : 0.000493s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000012s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000013s : 1: opt.transform.mutable_eliminate 0.02% : 0.001169s : 78: opt.transform.opt_a 0.00% : 0.000022s : 1: opt.transform.opt_after_cconv 0.00% : 0.000021s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000082s : 28: opt.transform.opt_b 0.00% : 0.000041s : 2: opt.transform.opt_trans_graph 0.00% : 0.000032s : 4: opt.transform.symbol_engine_opt 0.06% : 0.003199s : 1: opt_a 0.00% : 0.000096s : 1: opt_after_cconv 0.01% : 0.000457s : 1: opt_after_jit_grad 0.00% : 0.000180s : 1: opt_b 0.11% : 0.005374s : 1: optimize 0.00% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000005s : 1: order_py_execute_after_rewriter 0.00% : 0.000057s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000026s : 1: overlap_grad_ring_attention 0.00% : 0.000003s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000014s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000003s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000004s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000061s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.00% : 0.000030s : 1: remove_dup_value 0.01% : 0.000556s : 1: renormalize.infer 0.01% : 0.000400s : 1: renormalize.specialize 0.00% : 0.000004s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000022s : 1: rewriter_after_opt_a 0.01% : 0.000262s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000003s : 1: slice_recompute_activation 0.00% : 0.000003s : 1: split_layernorm_comm 0.00% : 0.000019s : 1: split_matmul_comm_elemetwise 0.00% : 0.000019s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000103s : 1: symbol_engine_optimizer 98.36% : 4.860043s : 1: task_emit 0.00% : 0.000069s : 1: tuple_transform 1.04% : 0.051527s : 1: type_inference 0.00% : 0.000073s : 1: validate TotalTime = 5.0159, [24] [bootstrap]: 0.00088316 [type_inference]: 0.0580032 [event_method]: 7.875e-05 [auto_monad]: 0.00019358 [graph_reusing]: 1.025e-05 [inline]: 1.55999e-06 [add_attr]: 0.0069597, [1] [add_attr_with_inline]: 0.00694819, [1] [Cycle 1]: 0.00010204, [2] [tag_attr]: 2.96e-05 [meta_addattr_fg_expand]: 1.141e-05 [parallel-infer-symbol]: 2.68998e-06 [pre_auto_parallel]: 4.738e-05 [insert-virtual-dataset]: 1.63002e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 1.25001e-06 [pipeline_split]: 1.39998e-06 [optimize]: 0.00626171, [53] [py_interpret_to_execute]: 3.97e-06 [rewriter_before_opt_a]: 0.00023724 [opt_a]: 0.00378302, [2] [Cycle 1]: 0.00309679, [45] [expand_dump_flag]: 2.71e-06 [switch_simplify]: 0.00010225 [loop_unroll]: 3.848e-05 [a_1]: 0.00078477 [with_stream_mark]: 1.457e-05 [recompute_prepare]: 9.40001e-06 [updatestate_depend_eliminate]: 8.80001e-06 [updatestate_assign_eliminate]: 7.95e-06 [updatestate_loads_eliminate]: 3.04001e-06 [parameter_eliminate]: 1.05001e-06 [a_2]: 9.549e-05 [accelerated_algorithm]: 7.14001e-06 [shard]: 1.32e-06 [meta_shard_fg_expand]: 2.07001e-06 [shard_inline]: 6.69001e-06 [merge_send_recv]: 3.387e-05 [auto_parallel]: 7.13e-06 [parallel]: 6.561e-05 [flash_sp]: 1.948e-05 [merge_comm]: 4.94e-06 [allreduce_fusion]: 1.083e-05 [matmul_add_comm_reduction]: 1.103e-05 [allreduce_slice_to_reducescatter]: 4.18999e-06 [virtual_shard_identity]: 8.85001e-06 [virtual_dataset]: 6.87002e-06 [get_grad_eliminate_]: 6.36e-06 [virtual_output]: 6.58e-06 [merge_forward]: 4.32e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 1.349e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.32e-05 [merge_recompute_call_nodes]: 9.5999e-07 [before_grad]: 1.106e-05 [set_forward_comm_id_for_comm_node_pass]: 8.75001e-06 [meta_fg_expand]: 3.38e-06 [flash_sp_send_recv_attached]: 1.62999e-06 [receive_attached]: 1.27e-05 [after_resolve]: 9.59999e-06 [a_after_grad]: 9.56998e-06 [renormalize]: 0.0013466 [add_forward_monad_depend]: 5.22e-06 [auto_monad_grad]: 1.87999e-06 [auto_monad_eliminator]: 2.627e-05 [cse]: 6.076e-05 [a_3]: 4.97e-05 [Cycle 2]: 0.0006763, [45] [expand_dump_flag]: 1.09003e-06 [switch_simplify]: 8.10999e-06 [loop_unroll]: 7.01999e-06 [a_1]: 0.0001518 [with_stream_mark]: 1.274e-05 [recompute_prepare]: 6.61999e-06 [updatestate_depend_eliminate]: 3.8e-06 [updatestate_assign_eliminate]: 3.24001e-06 [updatestate_loads_eliminate]: 3.23e-06 [parameter_eliminate]: 1.00001e-06 [a_2]: 8.439e-05 [accelerated_algorithm]: 6.52001e-06 [shard]: 9.79984e-07 [meta_shard_fg_expand]: 1.71002e-06 [shard_inline]: 6.59999e-06 [merge_send_recv]: 5.66e-06 [auto_parallel]: 6.22001e-06 [parallel]: 4.65001e-06 [flash_sp]: 2.71999e-06 [merge_comm]: 3.88001e-06 [allreduce_fusion]: 3.6e-06 [matmul_add_comm_reduction]: 6.11e-06 [allreduce_slice_to_reducescatter]: 5.19998e-07 [virtual_shard_identity]: 7.33999e-06 [virtual_dataset]: 5.91003e-06 [get_grad_eliminate_]: 5.91e-06 [virtual_output]: 5.97001e-06 [merge_forward]: 3.28e-06 [cell_reuse_recompute_pass]: 1.82999e-06 [offload_activation]: 6.84001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.152e-05 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 1.012e-05 [set_forward_comm_id_for_comm_node_pass]: 4.06001e-06 [meta_fg_expand]: 2.64999e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.09003e-06 [after_resolve]: 9.30001e-06 [a_after_grad]: 8.99e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.34998e-06 [auto_monad_grad]: 9.80013e-07 [auto_monad_eliminator]: 8.33999e-06 [cse]: 1.94e-05 [a_3]: 3.877e-05 [py_interpret_to_execute_after_opt_a]: 4.77e-06 [slice_cell_reuse_recomputed_activation]: 1.94999e-06 [rewriter_after_opt_a]: 3.3e-05 [convert_after_rewriter]: 1.69e-06 [order_py_execute_after_rewriter]: 1.17999e-06 [mutable_eliminate]: 0.00058758 [opt_b]: 0.00025824, [1] [Cycle 1]: 0.00025171, [7] [b_1]: 0.00015266 [b_2]: 2.516e-05 [updatestate_depend_eliminate]: 6.06e-06 [updatestate_assign_eliminate]: 3.39001e-06 [updatestate_loads_eliminate]: 3.49001e-06 [renormalize]: 6.19999e-07 [cse]: 2.5e-05 [optimize_parallel_all_gather_comm]: 2.717e-05 [overlap_param_gather]: 9.77999e-06 [cconv]: 2.299e-05 [loop_unroll]: 0.00041993 [opt_after_cconv]: 0.00010573, [1] [Cycle 1]: 0.00010009, [7] [c_1]: 2.941e-05 [parameter_eliminate]: 2.16003e-06 [updatestate_depend_eliminate]: 6.07999e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.06001e-06 [cse]: 2.397e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 3.163e-05 [tuple_transform]: 9.057e-05, [1] [Cycle 1]: 8.624e-05, [4] [d_1]: 5.852e-05 [none_parameter_eliminate]: 1.94e-06 [renormalize]: 1.30007e-07 [switch_simplify]: 7.40003e-06 [partial_unused_args_eliminate]: 1.55001e-06 [add_recomputation]: 6.023e-05 [cse_after_recomputation]: 2.577e-05, [1] [Cycle 1]: 2.169e-05, [1] [cse]: 1.646e-05 [environ_conv]: 1.793e-05 [swap_dp_allreduce_reducescatter]: 2.314e-05 [bias_add_comm_swap]: 9.12999e-06 [label_micro_interleaved_index]: 1.162e-05 [label_fine_grained_interleaved_index]: 2.19999e-06 [merge_cast_opt]: 9.79984e-07 [slice_recompute_activation]: 2.25002e-06 [micro_interleaved_order_control]: 2.25002e-06 [assign_add_opt]: 1.47001e-06 [ForceFp32Comm]: 8.59989e-07 [remove_cast_before_assign_add]: 8.01001e-06 [full_micro_interleaved_order_control]: 8.31002e-06 [reorder_send_recv_between_fp_bp]: 1.75001e-06 [comm_op_add_attrs]: 8.09989e-07 [add_comm_op_reuse_tag]: 7.7e-07 [interleave_split_concat_branches]: 1.14003e-06 [interleave_parallel_branches]: 7.92998e-06 [overlap_opt_shard_in_pipeline]: 1.811e-05 [overlap_opt_shard_grad_in_pipeline]: 1.64e-06 [control_data_broadcast_order]: 1.363e-05 [grouped_pairwise_exchange_alltoall]: 1.22e-06 [offloading_packed_experts]: 4.62998e-06 [overlap_recompute_and_grad_model_parallel]: 1.344e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.44e-06 [overlap_recompute_allgather_and_fa_grad]: 9.50007e-07 [overlap_recompute_comm]: 1.96e-06 [overlap_grad_ring_attention]: 1.936e-05 [overlap_grad_flash_sp]: 4.055e-05 [begin_end_overlap_inline]: 3.59985e-07 [split_matmul_comm_elemetwise]: 8.82e-06 [split_layernorm_comm]: 1.59998e-06 [handle_group_info]: 8.00006e-07 [symbol_engine_optimizer]: 0.00010243, [1] [Cycle 1]: 9.799e-05, [6] [build]: 2.501e-05 [elim_shapecalc]: 1.144e-05 [elim_not_effective]: 1.418e-05 [opt_reshape]: 7.61999e-06 [fold_const_symbol]: 1.078e-05 [renormalize]: 1.8999e-07 [detach_backward]: 2.43e-06 [pipeline_parallel_scheduler]: 1.30001e-06 [auto_monad_reorder]: 2.986e-05 [get_jit_bprop_graph]: 1.79e-06 [rewriter_after_jit_bprop_graph]: 3.41001e-06 [opt_after_jit_grad]: 0.0004549 [validate]: 5.732e-05 [backend_pass]: 8.70001e-07 [task_emit]: 4.94246 [execute]: 1.118e-05 Sums bootstrap : 0.000883s : 0.02% type_inference : 0.058003s : 1.16% event_method : 0.000079s : 0.00% auto_monad : 0.000194s : 0.00% graph_reusing : 0.000010s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000011s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000047s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000001s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000237s : 0.00% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000110s : 0.00% optimize.opt_a.loop_unroll : 0.000045s : 0.00% optimize.opt_a.a_1 : 0.000937s : 0.02% optimize.opt_a.with_stream_mark : 0.000027s : 0.00% optimize.opt_a.recompute_prepare : 0.000016s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000013s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000011s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000180s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.00% optimize.opt_a.shard : 0.000002s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000040s : 0.00% optimize.opt_a.auto_parallel : 0.000013s : 0.00% optimize.opt_a.parallel : 0.000070s : 0.00% optimize.opt_a.flash_sp : 0.000022s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000014s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000005s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.00% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000002s : 0.00% optimize.opt_a.receive_attached : 0.000014s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.00% optimize.opt_a.a_after_grad : 0.000019s : 0.00% optimize.opt_a.renormalize : 0.001347s : 0.03% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.00% optimize.opt_a.cse : 0.000080s : 0.00% optimize.opt_a.a_3 : 0.000088s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000033s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000588s : 0.01% optimize.opt_b.b_1 : 0.000153s : 0.00% optimize.opt_b.b_2 : 0.000025s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.00% optimize.overlap_param_gather : 0.000010s : 0.00% optimize.cconv : 0.000023s : 0.00% optimize.loop_unroll : 0.000420s : 0.01% optimize.opt_after_cconv.c_1 : 0.000029s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000032s : 0.00% optimize.tuple_transform.d_1 : 0.000059s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000060s : 0.00% optimize.cse_after_recomputation.cse : 0.000016s : 0.00% optimize.environ_conv : 0.000018s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000023s : 0.00% optimize.bias_add_comm_swap : 0.000009s : 0.00% optimize.label_micro_interleaved_index : 0.000012s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000008s : 0.00% optimize.full_micro_interleaved_order_control : 0.000008s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000018s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.00% optimize.overlap_grad_flash_sp : 0.000041s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000009s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000025s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000030s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000455s : 0.01% validate : 0.000057s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 4.942463s : 98.70% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000299 62 0.65% : 0.000002s : 3: substitution.elim_not_effective 2.19% : 0.000007s : 3: substitution.float_tuple_getitem_switch 0.51% : 0.000002s : 3: substitution.fold_const_symbol 1.87% : 0.000006s : 4: substitution.graph_param_transform 57.27% : 0.000172s : 8: substitution.inline 1.27% : 0.000004s : 6: substitution.j_node_and_user_rematch 1.57% : 0.000005s : 2: substitution.minmaximum_grad 1.80% : 0.000005s : 6: substitution.remove_not_recompute_node 1.13% : 0.000003s : 2: substitution.replace_old_param 3.68% : 0.000011s : 1: substitution.switch_simplify 4.91% : 0.000015s : 4: substitution.tuple_list_convert_item_index_to_positive 4.54% : 0.000014s : 4: substitution.tuple_list_get_item_const_eliminator 3.33% : 0.000010s : 4: substitution.tuple_list_get_item_depend_reorder 12.02% : 0.000036s : 8: substitution.tuple_list_get_item_eliminator 3.26% : 0.000010s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.057924 2 97.11% : 0.056247s : 1: type_inference.infer 2.89% : 0.001677s : 1: type_inference.specialize ------[replace.] 0.000094 11 61.61% : 0.000058s : 8: replace.inline 20.26% : 0.000019s : 1: replace.switch_simplify 18.13% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000188 11 89.29% : 0.000168s : 8: match.inline 5.27% : 0.000010s : 1: match.switch_simplify 5.44% : 0.000010s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000227 1438 0.94% : 0.000002s : 16: predicate.accumulaten_eliminater 1.02% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.47% : 0.000001s : 8: predicate.addn_check_dump 1.07% : 0.000002s : 16: predicate.addn_zero_filter 0.91% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.16% : 0.000005s : 24: predicate.arithmetic_simplify 0.99% : 0.000002s : 16: predicate.cast_eliminate 0.53% : 0.000001s : 8: predicate.check_bprop_eliminate 0.51% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.48% : 0.000001s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.19% : 0.000003s : 16: predicate.dict_get_item_eliminator 1.02% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.40% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.19% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.21% : 0.000003s : 20: predicate.environ_get_depend_swap 1.72% : 0.000004s : 28: predicate.environ_get_eliminate 1.18% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.64% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.22% : 0.000005s : 26: predicate.float_depend_g_call 0.48% : 0.000001s : 8: predicate.float_environ_get_switch 0.84% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.62% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.44% : 0.000001s : 8: predicate.incorporate_call_switch 5.67% : 0.000013s : 66: predicate.inline 0.68% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.44% : 0.000006s : 42: predicate.load_eliminater 0.94% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.82% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.58% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.49% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.98% : 0.000002s : 16: predicate.minmaximum_grad 0.88% : 0.000002s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 2.03% : 0.000005s : 26: predicate.partial_defer_inline 1.42% : 0.000003s : 22: predicate.partial_eliminate 1.12% : 0.000003s : 16: predicate.print_const_string_wrapper 0.48% : 0.000001s : 8: predicate.reduce_all_const_elim 1.39% : 0.000003s : 16: predicate.reduce_eliminate 2.53% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 8: predicate.remove_not_recompute_node 1.19% : 0.000003s : 26: predicate.replace_applicator 0.35% : 0.000001s : 8: predicate.replace_old_param 0.22% : 0.000000s : 4: predicate.reset_defer_inline 0.98% : 0.000002s : 16: predicate.reshape_eliminate 0.55% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.37% : 0.000001s : 4: predicate.row_tensor_eliminate 0.81% : 0.000002s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.67% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000002s : 8: predicate.special_op_eliminate 0.78% : 0.000002s : 8: predicate.specialize_transform 0.70% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.64% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.75% : 0.000004s : 26: predicate.switch_defer_inline 2.23% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.95% : 0.000014s : 86: predicate.switch_simplify 0.99% : 0.000002s : 16: predicate.tile_eliminate 1.04% : 0.000002s : 16: predicate.transpose_eliminate 1.70% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.76% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.79% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000007s : 34: predicate.tuple_list_get_item_eliminator 1.57% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.22% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.67% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.44% : 0.000006s : 42: predicate.updatestate_pure_node_eliminater 3.02% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.30% : 0.000001s : 4: predicate.value_based_eliminate 0.57% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001379 23 56.57% : 0.000780s : 11: func_graph_cloner_run.FuncGraphClonerGraph 43.43% : 0.000599s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 5.032081 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.14% : 0.006964s : 1: add_attr 0.14% : 0.006952s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000064s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000203s : 1: auto_monad 0.00% : 0.000034s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000012s : 1: bias_add_comm_swap 0.02% : 0.000934s : 1: bootstrap 0.00% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000029s : 1: cse_after_recomputation 0.00% : 0.000004s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000022s : 1: environ_conv 0.00% : 0.000086s : 1: event_method 0.00% : 0.000028s : 1: execute 0.00% : 0.000012s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000015s : 1: label_micro_interleaved_index 0.01% : 0.000429s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.01% : 0.000596s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000015s : 1: opt.transform.mutable_eliminate 0.03% : 0.001459s : 78: opt.transform.opt_a 0.00% : 0.000028s : 1: opt.transform.opt_after_cconv 0.00% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000150s : 28: opt.transform.opt_b 0.00% : 0.000064s : 2: opt.transform.opt_trans_graph 0.00% : 0.000040s : 4: opt.transform.symbol_engine_opt 0.08% : 0.003786s : 1: opt_a 0.00% : 0.000109s : 1: opt_after_cconv 0.01% : 0.000464s : 1: opt_after_jit_grad 0.01% : 0.000262s : 1: opt_b 0.12% : 0.006266s : 1: optimize 0.00% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000044s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000022s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000022s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000013s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000052s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000011s : 1: remove_cast_before_assign_add 0.00% : 0.000036s : 1: remove_dup_value 0.02% : 0.000768s : 1: renormalize.infer 0.01% : 0.000570s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000037s : 1: rewriter_after_opt_a 0.00% : 0.000243s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000012s : 1: split_matmul_comm_elemetwise 0.00% : 0.000026s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000105s : 1: symbol_engine_optimizer 98.22% : 4.942578s : 1: task_emit 0.00% : 0.000093s : 1: tuple_transform 1.15% : 0.058019s : 1: type_inference 0.00% : 0.000081s : 1: validate [LOG_WARNING] can not open file, file: /home/jenkins/ascend/log/debug/plog/plog-171921_20260129175155549.log, possible reason: Permission denied. TotalTime = 0.3996, [24] [bootstrap]: 0.0411796 [type_inference]: 0.24671 [event_method]: 0.00023285 [auto_monad]: 0.00017732 [graph_reusing]: 1.024e-05 [inline]: 2.51e-06 [add_attr]: 0.0162403, [1] [add_attr_with_inline]: 0.0162206, [1] [Cycle 1]: 0.0122383, [2] [tag_attr]: 4.579e-05 [meta_addattr_fg_expand]: 4.438e-05 [parallel-infer-symbol]: 5.64e-06 [pre_auto_parallel]: 8.793e-05 [insert-virtual-dataset]: 3.63e-06 [parallel-infer-symbol-second]: 7.09988e-07 [dataset_repeat_opt]: 2.53998e-06 [pipeline_split]: 1.79e-06 [optimize]: 0.085847, [53] [py_interpret_to_execute]: 1.124e-05 [rewriter_before_opt_a]: 0.00036719 [opt_a]: 0.0187926, [3] [Cycle 1]: 0.0156056, [45] [expand_dump_flag]: 4.75001e-06 [switch_simplify]: 0.00017087 [loop_unroll]: 6.213e-05 [a_1]: 0.00139081 [with_stream_mark]: 2.937e-05 [recompute_prepare]: 2.017e-05 [updatestate_depend_eliminate]: 8.07998e-06 [updatestate_assign_eliminate]: 6.69999e-06 [updatestate_loads_eliminate]: 6.29999e-06 [parameter_eliminate]: 2.94001e-06 [a_2]: 0.00020105 [accelerated_algorithm]: 1.386e-05 [shard]: 2.63e-06 [meta_shard_fg_expand]: 3.88001e-06 [shard_inline]: 1.274e-05 [merge_send_recv]: 1.604e-05 [auto_parallel]: 1.119e-05 [parallel]: 9.238e-05 [flash_sp]: 1.207e-05 [merge_comm]: 9.42999e-06 [allreduce_fusion]: 7.45998e-06 [matmul_add_comm_reduction]: 3.188e-05 [allreduce_slice_to_reducescatter]: 9.89996e-07 [virtual_shard_identity]: 1.503e-05 [virtual_dataset]: 1.284e-05 [get_grad_eliminate_]: 1.266e-05 [virtual_output]: 1.304e-05 [merge_forward]: 8.64e-06 [cell_reuse_recompute_pass]: 1.62999e-06 [offload_activation]: 1.765e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.579e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 2.397e-05 [set_forward_comm_id_for_comm_node_pass]: 7.95e-06 [meta_fg_expand]: 0.00201698 [flash_sp_send_recv_attached]: 3.9e-06 [receive_attached]: 2.61999e-06 [after_resolve]: 6.216e-05 [a_after_grad]: 7.976e-05 [renormalize]: 0.0101942 [add_forward_monad_depend]: 1.114e-05 [auto_monad_grad]: 6.19001e-06 [auto_monad_eliminator]: 5.319e-05 [cse]: 0.00030273 [a_3]: 0.00029902 [Cycle 2]: 0.00261659, [45] [expand_dump_flag]: 3.23e-06 [switch_simplify]: 3.955e-05 [loop_unroll]: 3.734e-05 [a_1]: 0.00115069 [with_stream_mark]: 1.81e-05 [recompute_prepare]: 8.08001e-06 [updatestate_depend_eliminate]: 4.22e-06 [updatestate_assign_eliminate]: 3.23e-06 [updatestate_loads_eliminate]: 2.71e-06 [parameter_eliminate]: 1.47001e-06 [a_2]: 6.895e-05 [accelerated_algorithm]: 6.48e-06 [shard]: 1.54e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 5.78997e-06 [merge_send_recv]: 7.71999e-06 [auto_parallel]: 9.84001e-06 [parallel]: 8.25999e-06 [flash_sp]: 4.01001e-06 [merge_comm]: 3.04001e-06 [allreduce_fusion]: 3.21999e-06 [matmul_add_comm_reduction]: 8.47e-06 [allreduce_slice_to_reducescatter]: 7.79983e-07 [virtual_shard_identity]: 6.91999e-06 [virtual_dataset]: 6.09999e-06 [get_grad_eliminate_]: 7.05e-06 [virtual_output]: 5.62001e-06 [merge_forward]: 4.28001e-06 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 8.38999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.211e-05 [merge_recompute_call_nodes]: 1.25999e-06 [before_grad]: 9.82999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.65e-06 [meta_fg_expand]: 7.196e-05 [flash_sp_send_recv_attached]: 1.91e-06 [receive_attached]: 1.93002e-06 [after_resolve]: 1.142e-05 [a_after_grad]: 8.57998e-06 [renormalize]: 0.00073883 [add_forward_monad_depend]: 5.20999e-06 [auto_monad_grad]: 1.54998e-06 [auto_monad_eliminator]: 1.149e-05 [cse]: 2.303e-05 [a_3]: 4.209e-05 [Cycle 3]: 0.00054884, [45] [expand_dump_flag]: 1.61002e-06 [switch_simplify]: 6.73e-06 [loop_unroll]: 5.71e-06 [a_1]: 9.611e-05 [with_stream_mark]: 8.55999e-06 [recompute_prepare]: 5.83997e-06 [updatestate_depend_eliminate]: 2.80002e-06 [updatestate_assign_eliminate]: 2.34001e-06 [updatestate_loads_eliminate]: 2.26e-06 [parameter_eliminate]: 9.09989e-07 [a_2]: 6.402e-05 [accelerated_algorithm]: 5.69e-06 [shard]: 9.80013e-07 [meta_shard_fg_expand]: 1.24e-06 [shard_inline]: 5.61e-06 [merge_send_recv]: 4.12e-06 [auto_parallel]: 6.06998e-06 [parallel]: 4.12e-06 [flash_sp]: 9.5999e-07 [merge_comm]: 3.03e-06 [allreduce_fusion]: 2.61e-06 [matmul_add_comm_reduction]: 5.07e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 6.26e-06 [virtual_dataset]: 5.25001e-06 [get_grad_eliminate_]: 5.37999e-06 [virtual_output]: 5.19e-06 [merge_forward]: 2.66e-06 [cell_reuse_recompute_pass]: 1.44998e-06 [offload_activation]: 5.59e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.157e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 8.1e-06 [set_forward_comm_id_for_comm_node_pass]: 2.97002e-06 [meta_fg_expand]: 1.89999e-06 [flash_sp_send_recv_attached]: 8.30012e-07 [receive_attached]: 8.70001e-07 [after_resolve]: 8.03001e-06 [a_after_grad]: 7.58999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 9.89996e-07 [auto_monad_grad]: 8.39995e-07 [auto_monad_eliminator]: 5.91e-06 [cse]: 1.487e-05 [a_3]: 3.09e-05 [py_interpret_to_execute_after_opt_a]: 4.95999e-06 [slice_cell_reuse_recomputed_activation]: 2.03002e-06 [rewriter_after_opt_a]: 1.99e-05 [convert_after_rewriter]: 1.22999e-06 [order_py_execute_after_rewriter]: 1.66e-06 [mutable_eliminate]: 0.0649423 [opt_b]: 0.00028847, [1] [Cycle 1]: 0.00027672, [7] [b_1]: 0.00015572 [b_2]: 8.54e-06 [updatestate_depend_eliminate]: 1.379e-05 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 3.26001e-06 [renormalize]: 6.40022e-07 [cse]: 5.345e-05 [optimize_parallel_all_gather_comm]: 2.71e-05 [overlap_param_gather]: 2.19999e-06 [cconv]: 4.272e-05 [loop_unroll]: 0.00059851 [opt_after_cconv]: 0.00010548, [1] [Cycle 1]: 9.949e-05, [7] [c_1]: 2.726e-05 [parameter_eliminate]: 6.43e-06 [updatestate_depend_eliminate]: 5.36002e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.27001e-06 [cse]: 2.132e-05 [renormalize]: 4.59986e-07 [remove_dup_value]: 1.788e-05 [tuple_transform]: 7.652e-05, [1] [Cycle 1]: 7.017e-05, [4] [d_1]: 4.35e-05 [none_parameter_eliminate]: 1.89999e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 6.43998e-06 [partial_unused_args_eliminate]: 1.71e-06 [add_recomputation]: 5.346e-05 [cse_after_recomputation]: 2.506e-05, [1] [Cycle 1]: 2.01e-05, [1] [cse]: 1.435e-05 [environ_conv]: 6.96999e-06 [swap_dp_allreduce_reducescatter]: 5.38002e-06 [bias_add_comm_swap]: 3.7e-06 [label_micro_interleaved_index]: 4.79002e-06 [label_fine_grained_interleaved_index]: 2.84999e-06 [merge_cast_opt]: 1.52001e-06 [slice_recompute_activation]: 2.51998e-06 [micro_interleaved_order_control]: 2.38998e-06 [assign_add_opt]: 1.45001e-06 [ForceFp32Comm]: 9.20001e-07 [remove_cast_before_assign_add]: 1.12999e-06 [full_micro_interleaved_order_control]: 2.29001e-06 [reorder_send_recv_between_fp_bp]: 3.06999e-06 [comm_op_add_attrs]: 1.38002e-06 [add_comm_op_reuse_tag]: 1.11002e-06 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.10999e-06 [overlap_opt_shard_in_pipeline]: 2.905e-05 [overlap_opt_shard_grad_in_pipeline]: 2.07999e-06 [control_data_broadcast_order]: 1.265e-05 [grouped_pairwise_exchange_alltoall]: 1.62999e-06 [offloading_packed_experts]: 4.48001e-06 [overlap_recompute_and_grad_model_parallel]: 5.09e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.64998e-06 [overlap_recompute_comm]: 2.31998e-06 [overlap_grad_ring_attention]: 4.50999e-06 [overlap_grad_flash_sp]: 2.283e-05 [begin_end_overlap_inline]: 4.59986e-07 [split_matmul_comm_elemetwise]: 2.58e-06 [split_layernorm_comm]: 2.21e-06 [handle_group_info]: 1.23002e-06 [symbol_engine_optimizer]: 7.623e-05, [1] [Cycle 1]: 7.18e-05, [6] [build]: 2.98003e-06 [elim_shapecalc]: 9.66e-06 [elim_not_effective]: 1.358e-05 [opt_reshape]: 7.28e-06 [fold_const_symbol]: 9.90002e-06 [renormalize]: 1.79978e-07 [detach_backward]: 2.31e-06 [pipeline_parallel_scheduler]: 1.54e-06 [auto_monad_reorder]: 1.93e-05 [get_jit_bprop_graph]: 2.35002e-06 [rewriter_after_jit_bprop_graph]: 6.94001e-06 [opt_after_jit_grad]: 0.00049821 [validate]: 4.832e-05 [backend_pass]: 8.30012e-07 [task_emit]: 0.00823335 [execute]: 7.48e-06 Sums bootstrap : 0.041180s : 10.78% type_inference : 0.246710s : 64.58% event_method : 0.000233s : 0.06% auto_monad : 0.000177s : 0.05% graph_reusing : 0.000010s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000046s : 0.01% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000044s : 0.01% parallel-infer-symbol : 0.000006s : 0.00% pre_auto_parallel : 0.000088s : 0.02% insert-virtual-dataset : 0.000004s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000011s : 0.00% optimize.rewriter_before_opt_a : 0.000367s : 0.10% optimize.opt_a.expand_dump_flag : 0.000010s : 0.00% optimize.opt_a.switch_simplify : 0.000217s : 0.06% optimize.opt_a.loop_unroll : 0.000105s : 0.03% optimize.opt_a.a_1 : 0.002638s : 0.69% optimize.opt_a.with_stream_mark : 0.000056s : 0.01% optimize.opt_a.recompute_prepare : 0.000034s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000012s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000334s : 0.09% optimize.opt_a.accelerated_algorithm : 0.000026s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.00% optimize.opt_a.shard_inline : 0.000024s : 0.01% optimize.opt_a.merge_send_recv : 0.000028s : 0.01% optimize.opt_a.auto_parallel : 0.000027s : 0.01% optimize.opt_a.parallel : 0.000105s : 0.03% optimize.opt_a.flash_sp : 0.000017s : 0.00% optimize.opt_a.merge_comm : 0.000016s : 0.00% optimize.opt_a.allreduce_fusion : 0.000013s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000045s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000028s : 0.01% optimize.opt_a.virtual_dataset : 0.000024s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000025s : 0.01% optimize.opt_a.virtual_output : 0.000024s : 0.01% optimize.opt_a.merge_forward : 0.000016s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000032s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000049s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000042s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000015s : 0.00% optimize.opt_a.meta_fg_expand : 0.002091s : 0.55% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000082s : 0.02% optimize.opt_a.a_after_grad : 0.000096s : 0.03% optimize.opt_a.renormalize : 0.010933s : 2.86% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.00% optimize.opt_a.auto_monad_grad : 0.000009s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000071s : 0.02% optimize.opt_a.cse : 0.000341s : 0.09% optimize.opt_a.a_3 : 0.000372s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000020s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.064942s : 17.00% optimize.opt_b.b_1 : 0.000156s : 0.04% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000014s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000053s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000043s : 0.01% optimize.loop_unroll : 0.000599s : 0.16% optimize.opt_after_cconv.c_1 : 0.000027s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.00% optimize.tuple_transform.d_1 : 0.000044s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000053s : 0.01% optimize.cse_after_recomputation.cse : 0.000014s : 0.00% optimize.environ_conv : 0.000007s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000029s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000023s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.00% opt_after_jit_grad : 0.000498s : 0.13% validate : 0.000048s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 0.008233s : 2.16% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.000764 154 0.34% : 0.000003s : 2: substitution.elim_not_effective 1.11% : 0.000008s : 11: substitution.float_depend_g_call 0.61% : 0.000005s : 2: substitution.float_tuple_getitem_switch 0.23% : 0.000002s : 2: substitution.fold_const_symbol 0.82% : 0.000006s : 3: substitution.graph_param_transform 0.48% : 0.000004s : 2: substitution.incorporate_call 0.27% : 0.000002s : 2: substitution.incorporate_call_switch 64.36% : 0.000492s : 20: substitution.inline 2.36% : 0.000018s : 2: substitution.inline_without_move 1.28% : 0.000010s : 12: substitution.j_node_and_user_rematch 1.31% : 0.000010s : 7: substitution.minmaximum_grad 2.65% : 0.000020s : 11: substitution.partial_eliminate 1.42% : 0.000011s : 12: substitution.remove_not_recompute_node 3.16% : 0.000024s : 9: substitution.replace_applicator 1.25% : 0.000010s : 9: substitution.replace_old_param 0.37% : 0.000003s : 1: substitution.set_cell_output_no_recompute 3.67% : 0.000028s : 3: substitution.switch_simplify 2.85% : 0.000022s : 7: substitution.tuple_list_convert_item_index_to_positive 1.28% : 0.000010s : 7: substitution.tuple_list_get_item_const_eliminator 1.80% : 0.000014s : 7: substitution.tuple_list_get_item_depend_reorder 6.53% : 0.000050s : 16: substitution.tuple_list_get_item_eliminator 1.84% : 0.000014s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.246604 2 98.86% : 0.243801s : 1: type_inference.infer 1.14% : 0.002803s : 1: type_inference.specialize ------[replace.] 0.000247 30 58.99% : 0.000146s : 20: replace.inline 17.53% : 0.000043s : 3: replace.switch_simplify 23.48% : 0.000058s : 7: replace.tuple_list_get_item_eliminator ------[match.] 0.000531 30 90.76% : 0.000482s : 20: match.inline 4.90% : 0.000026s : 3: match.switch_simplify 4.34% : 0.000023s : 7: match.tuple_list_get_item_eliminator ------[predicate.] 0.000574 3823 1.14% : 0.000007s : 49: predicate.accumulaten_eliminater 0.23% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.55% : 0.000003s : 17: predicate.addn_check_dump 1.16% : 0.000007s : 49: predicate.addn_zero_filter 1.08% : 0.000006s : 49: predicate.adjust_all_reduce_mul_add 2.03% : 0.000012s : 66: predicate.arithmetic_simplify 1.14% : 0.000007s : 49: predicate.cast_eliminate 1.04% : 0.000006s : 44: predicate.check_bprop_eliminate 0.44% : 0.000003s : 17: predicate.compare_switch_simplify 0.05% : 0.000000s : 3: predicate.const_output_eliminate 0.40% : 0.000002s : 17: predicate.depend_value_elim 1.20% : 0.000007s : 49: predicate.dict_get_item_const_eliminator 1.36% : 0.000008s : 49: predicate.dict_get_item_eliminator 1.17% : 0.000007s : 49: predicate.dict_set_item_eliminator 0.31% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.11% : 0.000001s : 3: predicate.elim_not_effective 0.13% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000007s : 52: predicate.environ_add_const_eliminate 1.18% : 0.000007s : 52: predicate.environ_get_add_eliminate 1.16% : 0.000007s : 52: predicate.environ_get_depend_swap 1.58% : 0.000009s : 69: predicate.environ_get_eliminate 1.15% : 0.000007s : 52: predicate.environ_get_set_eliminate 1.79% : 0.000010s : 76: predicate.exchange_switch_depend_value 2.59% : 0.000015s : 76: predicate.float_depend_g_call 0.43% : 0.000002s : 17: predicate.float_environ_get_switch 0.52% : 0.000003s : 20: predicate.float_tuple_getitem_switch 0.05% : 0.000000s : 3: predicate.fold_const_symbol 0.54% : 0.000003s : 17: predicate.get_grad_eliminate 0.09% : 0.000001s : 3: predicate.graph_param_transform 0.40% : 0.000002s : 17: predicate.incorporate_call 0.37% : 0.000002s : 17: predicate.incorporate_call_switch 5.53% : 0.000032s : 165: predicate.inline 1.32% : 0.000008s : 41: predicate.inline_without_move 0.23% : 0.000001s : 17: predicate.j_node_and_user_rematch 0.62% : 0.000004s : 17: predicate.less_batch_normalization 1.57% : 0.000009s : 62: predicate.list_to_tuple_eliminator_ 2.49% : 0.000014s : 111: predicate.load_eliminater 0.36% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.77% : 0.000016s : 113: predicate.loop_unroll_before_grad 1.39% : 0.000008s : 55: predicate.make_slice_get_slice_eliminator 0.45% : 0.000003s : 17: predicate.merge_addn 1.05% : 0.000006s : 44: predicate.micro_step_allgather_replace 1.03% : 0.000006s : 44: predicate.mini_step_allgather_replace 1.12% : 0.000006s : 49: predicate.minmaximum_grad 1.14% : 0.000007s : 3: predicate.mutable_eliminate 0.12% : 0.000001s : 3: predicate.opt_reshape 0.17% : 0.000001s : 3: predicate.parallel_virtual_node 2.28% : 0.000013s : 76: predicate.partial_defer_inline 1.53% : 0.000009s : 59: predicate.partial_eliminate 1.15% : 0.000007s : 49: predicate.print_const_string_wrapper 0.46% : 0.000003s : 17: predicate.reduce_all_const_elim 1.31% : 0.000008s : 49: predicate.reduce_eliminate 2.52% : 0.000014s : 111: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000002s : 17: predicate.remove_not_recompute_node 1.79% : 0.000010s : 100: predicate.replace_applicator 0.67% : 0.000004s : 41: predicate.replace_old_param 0.19% : 0.000001s : 3: predicate.reset_defer_inline 1.18% : 0.000007s : 49: predicate.reshape_eliminate 1.08% : 0.000006s : 44: predicate.row_tensor_add_zeros_like 0.14% : 0.000001s : 3: predicate.row_tensor_eliminate 1.29% : 0.000007s : 44: predicate.same_eliminate 0.29% : 0.000002s : 17: predicate.set_cell_output_no_recompute 0.54% : 0.000003s : 17: predicate.shard_identity_eliminate 0.24% : 0.000001s : 6: predicate.special_op_eliminate 0.47% : 0.000003s : 17: predicate.specialize_transform 1.29% : 0.000007s : 44: predicate.split_environ_get_set_with_tuple_value 1.40% : 0.000008s : 41: predicate.stack_unstack_eliminate 0.11% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.98% : 0.000011s : 76: predicate.switch_defer_inline 3.01% : 0.000017s : 120: predicate.switch_layer_defer_inline 5.60% : 0.000032s : 215: predicate.switch_simplify 1.16% : 0.000007s : 49: predicate.tile_eliminate 1.10% : 0.000006s : 49: predicate.transpose_eliminate 1.48% : 0.000008s : 55: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000009s : 55: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000008s : 55: predicate.tuple_list_get_item_depend_reorder 2.79% : 0.000016s : 79: predicate.tuple_list_get_item_eliminator 1.50% : 0.000009s : 55: predicate.tuple_list_get_set_item_eliminator 1.96% : 0.000011s : 72: predicate.tuple_list_set_item_eliminator 1.58% : 0.000009s : 62: predicate.tuple_to_list_eliminator_ 2.42% : 0.000014s : 111: predicate.updatestate_pure_node_eliminater 2.89% : 0.000017s : 128: predicate.updatestate_useless_node_eliminater 0.13% : 0.000001s : 3: predicate.value_based_eliminate 0.55% : 0.000003s : 17: predicate.virtual_dataset_eliminate 0.51% : 0.000003s : 17: predicate.virtual_output_eliminate 0.07% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.17% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002859 41 65.67% : 0.001877s : 17: func_graph_cloner_run.FuncGraphClonerGraph 34.33% : 0.000981s : 24: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.516795 237 0.00% : 0.000004s : 1: ForceFp32Comm 3.14% : 0.016248s : 1: add_attr 3.14% : 0.016226s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000058s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.04% : 0.000188s : 1: auto_monad 0.00% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 7.98% : 0.041227s : 1: bootstrap 0.01% : 0.000046s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000010s : 1: environ_conv 0.05% : 0.000245s : 1: event_method 0.00% : 0.000013s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000015s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.12% : 0.000607s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 12.57% : 0.064969s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000045s : 1: opt.transform.mutable_eliminate 0.77% : 0.003984s : 117: opt.transform.opt_a 0.01% : 0.000026s : 1: opt.transform.opt_after_cconv 0.00% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000130s : 28: opt.transform.opt_b 0.01% : 0.000048s : 2: opt.transform.opt_trans_graph 0.01% : 0.000036s : 4: opt.transform.symbol_engine_opt 3.64% : 0.018796s : 1: opt_a 0.02% : 0.000109s : 1: opt_after_cconv 0.10% : 0.000508s : 1: opt_after_jit_grad 0.06% : 0.000293s : 1: opt_b 16.61% : 0.085853s : 1: optimize 0.01% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000033s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000009s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.02% : 0.000092s : 1: pre_auto_parallel 0.00% : 0.000015s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.00% : 0.000021s : 1: remove_dup_value 1.73% : 0.008929s : 2: renormalize.infer 0.38% : 0.001987s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000023s : 1: rewriter_after_opt_a 0.07% : 0.000373s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000079s : 1: symbol_engine_optimizer 1.60% : 0.008248s : 1: task_emit 0.02% : 0.000080s : 1: tuple_transform 47.74% : 0.246740s : 1: type_inference 0.02% : 0.000090s : 1: validate TotalTime = 0.100578, [24] [bootstrap]: 0.00077597 [type_inference]: 0.0389191 [event_method]: 0.0001045 [auto_monad]: 0.00018066 [graph_reusing]: 1.246e-05 [inline]: 4.12e-06 [add_attr]: 0.00546004, [1] [add_attr_with_inline]: 0.00544152, [1] [Cycle 1]: 9.341e-05, [2] [tag_attr]: 3.602e-05 [meta_addattr_fg_expand]: 7.45998e-06 [parallel-infer-symbol]: 3.71001e-06 [pre_auto_parallel]: 5.231e-05 [insert-virtual-dataset]: 2.84999e-06 [parallel-infer-symbol-second]: 1.10001e-06 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.97999e-06 [optimize]: 0.00715436, [53] [py_interpret_to_execute]: 8.62e-06 [rewriter_before_opt_a]: 0.00029172 [opt_a]: 0.00436064, [2] [Cycle 1]: 0.00352653, [45] [expand_dump_flag]: 3.63e-06 [switch_simplify]: 0.0001064 [loop_unroll]: 4.259e-05 [a_1]: 0.00081421 [with_stream_mark]: 2.09e-05 [recompute_prepare]: 1.004e-05 [updatestate_depend_eliminate]: 5.15999e-06 [updatestate_assign_eliminate]: 3.92998e-06 [updatestate_loads_eliminate]: 3.61001e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 9.801e-05 [accelerated_algorithm]: 7.6e-06 [shard]: 2.14999e-06 [meta_shard_fg_expand]: 2.59001e-06 [shard_inline]: 6.95002e-06 [merge_send_recv]: 1.092e-05 [auto_parallel]: 9.20001e-06 [parallel]: 5.631e-05 [flash_sp]: 1.105e-05 [merge_comm]: 5.22999e-06 [allreduce_fusion]: 4e-06 [matmul_add_comm_reduction]: 1.145e-05 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 9.48002e-06 [virtual_dataset]: 7.65e-06 [get_grad_eliminate_]: 7.15e-06 [virtual_output]: 6.96001e-06 [merge_forward]: 4.43999e-06 [cell_reuse_recompute_pass]: 1.47999e-06 [offload_activation]: 1.103e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.429e-05 [merge_recompute_call_nodes]: 1.81e-06 [before_grad]: 1.233e-05 [set_forward_comm_id_for_comm_node_pass]: 4.64998e-06 [meta_fg_expand]: 4.06001e-06 [flash_sp_send_recv_attached]: 2.34001e-06 [receive_attached]: 2.12001e-06 [after_resolve]: 1.147e-05 [a_after_grad]: 1.09e-05 [renormalize]: 0.00175729 [add_forward_monad_depend]: 8.40001e-06 [auto_monad_grad]: 2.92002e-06 [auto_monad_eliminator]: 2.421e-05 [cse]: 3.998e-05 [a_3]: 6.271e-05 [Cycle 2]: 0.00081809, [45] [expand_dump_flag]: 2.09e-06 [switch_simplify]: 9.27999e-06 [loop_unroll]: 7.45e-06 [a_1]: 0.00022661 [with_stream_mark]: 2.073e-05 [recompute_prepare]: 7.58001e-06 [updatestate_depend_eliminate]: 4.47e-06 [updatestate_assign_eliminate]: 3.86999e-06 [updatestate_loads_eliminate]: 4.12998e-06 [parameter_eliminate]: 2.18998e-06 [a_2]: 8.9e-05 [accelerated_algorithm]: 7.04001e-06 [shard]: 1.97001e-06 [meta_shard_fg_expand]: 2.86999e-06 [shard_inline]: 6.81001e-06 [merge_send_recv]: 8.65001e-06 [auto_parallel]: 9.34e-06 [parallel]: 7.65e-06 [flash_sp]: 4.08999e-06 [merge_comm]: 4.42998e-06 [allreduce_fusion]: 4.18001e-06 [matmul_add_comm_reduction]: 9.76998e-06 [allreduce_slice_to_reducescatter]: 6.59988e-07 [virtual_shard_identity]: 7.78999e-06 [virtual_dataset]: 6.68e-06 [get_grad_eliminate_]: 6.56999e-06 [virtual_output]: 6.07999e-06 [merge_forward]: 4.68001e-06 [cell_reuse_recompute_pass]: 2.12001e-06 [offload_activation]: 1.015e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.224e-05 [merge_recompute_call_nodes]: 1.42999e-06 [before_grad]: 1.058e-05 [set_forward_comm_id_for_comm_node_pass]: 4.92999e-06 [meta_fg_expand]: 3.16999e-06 [flash_sp_send_recv_attached]: 1.52001e-06 [receive_attached]: 1.60001e-06 [after_resolve]: 1.01e-05 [a_after_grad]: 9.62999e-06 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.50001e-06 [auto_monad_grad]: 1.07e-06 [auto_monad_eliminator]: 1.01e-05 [cse]: 2.368e-05 [a_3]: 3.895e-05 [py_interpret_to_execute_after_opt_a]: 9.09e-06 [slice_cell_reuse_recomputed_activation]: 2.19001e-06 [rewriter_after_opt_a]: 2.898e-05 [convert_after_rewriter]: 1.25999e-06 [order_py_execute_after_rewriter]: 1.13001e-06 [mutable_eliminate]: 0.00078468 [opt_b]: 0.00031182, [1] [Cycle 1]: 0.00030268, [7] [b_1]: 0.00020427 [b_2]: 9.65002e-06 [updatestate_depend_eliminate]: 8.1e-06 [updatestate_assign_eliminate]: 3.33e-06 [updatestate_loads_eliminate]: 3.15998e-06 [renormalize]: 6.29982e-07 [cse]: 3.307e-05 [optimize_parallel_all_gather_comm]: 2.124e-05 [overlap_param_gather]: 2.24001e-06 [cconv]: 3.316e-05 [loop_unroll]: 0.00051623 [opt_after_cconv]: 0.00011747, [1] [Cycle 1]: 0.0001104, [7] [c_1]: 3.275e-05 [parameter_eliminate]: 4.22998e-06 [updatestate_depend_eliminate]: 6.56e-06 [updatestate_assign_eliminate]: 3.35e-06 [updatestate_loads_eliminate]: 3.36999e-06 [cse]: 2.63e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.853e-05 [tuple_transform]: 9.943e-05, [1] [Cycle 1]: 9.487e-05, [4] [d_1]: 6.497e-05 [none_parameter_eliminate]: 1.73002e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.73999e-06 [partial_unused_args_eliminate]: 2.06e-06 [add_recomputation]: 5.415e-05 [cse_after_recomputation]: 2.823e-05, [1] [Cycle 1]: 2.363e-05, [1] [cse]: 1.844e-05 [environ_conv]: 1.257e-05 [swap_dp_allreduce_reducescatter]: 6.53998e-06 [bias_add_comm_swap]: 3.53e-06 [label_micro_interleaved_index]: 5.69999e-06 [label_fine_grained_interleaved_index]: 2.86e-06 [merge_cast_opt]: 1.37999e-06 [slice_recompute_activation]: 2.32999e-06 [micro_interleaved_order_control]: 2.68998e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 1.08001e-06 [remove_cast_before_assign_add]: 1.10001e-06 [full_micro_interleaved_order_control]: 2.89999e-06 [reorder_send_recv_between_fp_bp]: 2.84999e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.25999e-06 [interleave_parallel_branches]: 1.24e-06 [overlap_opt_shard_in_pipeline]: 1.17999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04999e-06 [control_data_broadcast_order]: 1.516e-05 [grouped_pairwise_exchange_alltoall]: 2.03997e-06 [offloading_packed_experts]: 5.05999e-06 [overlap_recompute_and_grad_model_parallel]: 5.80002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29003e-06 [overlap_recompute_comm]: 2.39001e-06 [overlap_grad_ring_attention]: 5.00999e-06 [overlap_grad_flash_sp]: 2.435e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.56e-06 [split_layernorm_comm]: 1.84998e-06 [handle_group_info]: 1.03001e-06 [symbol_engine_optimizer]: 9.197e-05, [1] [Cycle 1]: 8.744e-05, [6] [build]: 1.505e-05 [elim_shapecalc]: 1.078e-05 [elim_not_effective]: 1.399e-05 [opt_reshape]: 7.71001e-06 [fold_const_symbol]: 1.183e-05 [renormalize]: 2.19996e-07 [detach_backward]: 2.32001e-06 [pipeline_parallel_scheduler]: 1.79998e-06 [auto_monad_reorder]: 2.018e-05 [get_jit_bprop_graph]: 2.12999e-06 [rewriter_after_jit_bprop_graph]: 5.05999e-06 [opt_after_jit_grad]: 0.00056806 [validate]: 5.505e-05 [backend_pass]: 9.10019e-07 [task_emit]: 0.0468777 [execute]: 1.093e-05 Sums bootstrap : 0.000776s : 0.83% type_inference : 0.038919s : 41.45% event_method : 0.000105s : 0.11% auto_monad : 0.000181s : 0.19% graph_reusing : 0.000012s : 0.01% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000036s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000052s : 0.06% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000009s : 0.01% optimize.rewriter_before_opt_a : 0.000292s : 0.31% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000116s : 0.12% optimize.opt_a.loop_unroll : 0.000050s : 0.05% optimize.opt_a.a_1 : 0.001041s : 1.11% optimize.opt_a.with_stream_mark : 0.000042s : 0.04% optimize.opt_a.recompute_prepare : 0.000018s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000187s : 0.20% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000020s : 0.02% optimize.opt_a.auto_parallel : 0.000019s : 0.02% optimize.opt_a.parallel : 0.000064s : 0.07% optimize.opt_a.flash_sp : 0.000015s : 0.02% optimize.opt_a.merge_comm : 0.000010s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000021s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000022s : 0.02% optimize.opt_a.a_after_grad : 0.000021s : 0.02% optimize.opt_a.renormalize : 0.001757s : 1.87% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.04% optimize.opt_a.cse : 0.000064s : 0.07% optimize.opt_a.a_3 : 0.000102s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000029s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000785s : 0.84% optimize.opt_b.b_1 : 0.000204s : 0.22% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000033s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.04% optimize.loop_unroll : 0.000516s : 0.55% optimize.opt_after_cconv.c_1 : 0.000033s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000026s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.02% optimize.tuple_transform.d_1 : 0.000065s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000054s : 0.06% optimize.cse_after_recomputation.cse : 0.000018s : 0.02% optimize.environ_conv : 0.000013s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000015s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000568s : 0.60% validate : 0.000055s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.046878s : 49.92% execute : 0.000011s : 0.01% Time group info: ------[substitution.] 0.000388 62 0.60% : 0.000002s : 3: substitution.elim_not_effective 2.21% : 0.000009s : 3: substitution.float_tuple_getitem_switch 0.55% : 0.000002s : 3: substitution.fold_const_symbol 1.73% : 0.000007s : 4: substitution.graph_param_transform 55.27% : 0.000214s : 8: substitution.inline 1.32% : 0.000005s : 6: substitution.j_node_and_user_rematch 1.61% : 0.000006s : 2: substitution.minmaximum_grad 1.76% : 0.000007s : 6: substitution.remove_not_recompute_node 1.31% : 0.000005s : 2: substitution.replace_old_param 2.20% : 0.000009s : 1: substitution.switch_simplify 4.73% : 0.000018s : 4: substitution.tuple_list_convert_item_index_to_positive 2.31% : 0.000009s : 4: substitution.tuple_list_get_item_const_eliminator 3.21% : 0.000012s : 4: substitution.tuple_list_get_item_depend_reorder 18.20% : 0.000071s : 8: substitution.tuple_list_get_item_eliminator 3.01% : 0.000012s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.038817 2 94.42% : 0.036649s : 1: type_inference.infer 5.58% : 0.002168s : 1: type_inference.specialize ------[replace.] 0.000107 11 58.84% : 0.000063s : 8: replace.inline 24.26% : 0.000026s : 1: replace.switch_simplify 16.90% : 0.000018s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000222 11 94.62% : 0.000210s : 8: match.inline 3.44% : 0.000008s : 1: match.switch_simplify 1.94% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000247 1438 0.97% : 0.000002s : 16: predicate.accumulaten_eliminater 0.68% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.42% : 0.000001s : 8: predicate.addn_check_dump 1.09% : 0.000003s : 16: predicate.addn_zero_filter 0.91% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.25% : 0.000006s : 24: predicate.arithmetic_simplify 1.09% : 0.000003s : 16: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.43% : 0.000001s : 8: predicate.compare_switch_simplify 0.15% : 0.000000s : 4: predicate.const_output_eliminate 0.43% : 0.000001s : 8: predicate.depend_value_elim 1.04% : 0.000003s : 16: predicate.dict_get_item_const_eliminator 1.21% : 0.000003s : 16: predicate.dict_get_item_eliminator 1.13% : 0.000003s : 16: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.33% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.24% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_depend_swap 1.82% : 0.000005s : 28: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.47% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.30% : 0.000006s : 26: predicate.float_depend_g_call 0.47% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000002s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.59% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 5.61% : 0.000014s : 66: predicate.inline 0.70% : 0.000002s : 8: predicate.inline_without_move 0.24% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.88% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.55% : 0.000006s : 42: predicate.load_eliminater 1.09% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.97% : 0.000007s : 46: predicate.loop_unroll_before_grad 1.64% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 8: predicate.merge_addn 0.46% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.98% : 0.000002s : 16: predicate.minmaximum_grad 1.19% : 0.000003s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 2.04% : 0.000005s : 26: predicate.partial_defer_inline 1.33% : 0.000003s : 22: predicate.partial_eliminate 1.09% : 0.000003s : 16: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.36% : 0.000003s : 16: predicate.reduce_eliminate 2.46% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000001s : 8: predicate.remove_not_recompute_node 1.26% : 0.000003s : 26: predicate.replace_applicator 0.41% : 0.000001s : 8: predicate.replace_old_param 0.18% : 0.000000s : 4: predicate.reset_defer_inline 1.00% : 0.000002s : 16: predicate.reshape_eliminate 0.50% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.74% : 0.000002s : 8: predicate.same_eliminate 0.33% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 8: predicate.shard_identity_eliminate 0.69% : 0.000002s : 8: predicate.special_op_eliminate 0.59% : 0.000001s : 8: predicate.specialize_transform 0.94% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.60% : 0.000004s : 26: predicate.switch_defer_inline 2.05% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.78% : 0.000014s : 86: predicate.switch_simplify 0.94% : 0.000002s : 16: predicate.tile_eliminate 0.96% : 0.000002s : 16: predicate.transpose_eliminate 1.86% : 0.000005s : 24: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.58% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.49% : 0.000009s : 34: predicate.tuple_list_get_item_eliminator 1.55% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000006s : 32: predicate.tuple_list_set_item_eliminator 1.75% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.18% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 2.87% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 4: predicate.value_based_eliminate 0.65% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 8: predicate.virtual_output_eliminate 0.28% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001953 23 60.47% : 0.001181s : 11: func_graph_cloner_run.FuncGraphClonerGraph 39.53% : 0.000772s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.116790 196 0.00% : 0.000004s : 1: ForceFp32Comm 4.68% : 0.005466s : 1: add_attr 4.66% : 0.005446s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000058s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.16% : 0.000191s : 1: auto_monad 0.02% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.70% : 0.000821s : 1: bootstrap 0.03% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000031s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000016s : 1: environ_conv 0.10% : 0.000115s : 1: event_method 0.02% : 0.000018s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000017s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.05% : 0.000055s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.45% : 0.000526s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.68% : 0.000796s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000019s : 1: opt.transform.mutable_eliminate 1.37% : 0.001597s : 78: opt.transform.opt_a 0.03% : 0.000032s : 1: opt.transform.opt_after_cconv 0.02% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000181s : 28: opt.transform.opt_b 0.06% : 0.000070s : 2: opt.transform.opt_trans_graph 0.03% : 0.000041s : 4: opt.transform.symbol_engine_opt 3.74% : 0.004365s : 1: opt_a 0.10% : 0.000121s : 1: opt_after_cconv 0.50% : 0.000580s : 1: opt_after_jit_grad 0.27% : 0.000315s : 1: opt_b 6.13% : 0.007160s : 1: optimize 0.02% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000005s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000057s : 1: pre_auto_parallel 0.01% : 0.000012s : 1: py_interpret_to_execute 0.01% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000022s : 1: remove_dup_value 0.84% : 0.000980s : 1: renormalize.infer 0.66% : 0.000765s : 1: renormalize.specialize 0.01% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000033s : 1: rewriter_after_opt_a 0.26% : 0.000299s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000095s : 1: symbol_engine_optimizer 40.16% : 0.046903s : 1: task_emit 0.09% : 0.000103s : 1: tuple_transform 33.35% : 0.038951s : 1: type_inference 0.09% : 0.000100s : 1: validate [WARNING] CORE(87355,ffffbf434f30,python3.9):2026-01-29-17:52:01.462.781 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph2 TotalTime = 0.100413, [24] [bootstrap]: 0.00050114 [type_inference]: 0.0293733 [event_method]: 2.561e-05 [auto_monad]: 8.661e-05 [graph_reusing]: 6.38e-06 [inline]: 3.6e-06 [add_attr]: 0.0037941, [1] [add_attr_with_inline]: 0.00378169, [1] [Cycle 1]: 7.309e-05, [2] [tag_attr]: 2.509e-05 [meta_addattr_fg_expand]: 6.28998e-06 [parallel-infer-symbol]: 3.46999e-06 [pre_auto_parallel]: 4.334e-05 [insert-virtual-dataset]: 2.35002e-06 [parallel-infer-symbol-second]: 7.79983e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.00584889, [53] [py_interpret_to_execute]: 6.53e-06 [rewriter_before_opt_a]: 0.00027306 [opt_a]: 0.00334633, [2] [Cycle 1]: 0.00271452, [45] [expand_dump_flag]: 3.23998e-06 [switch_simplify]: 8.467e-05 [loop_unroll]: 3.334e-05 [a_1]: 0.00061645 [with_stream_mark]: 1.922e-05 [recompute_prepare]: 8.69998e-06 [updatestate_depend_eliminate]: 4.17e-06 [updatestate_assign_eliminate]: 3.11001e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 2.31e-06 [a_2]: 7.133e-05 [accelerated_algorithm]: 6.20002e-06 [shard]: 2.12001e-06 [meta_shard_fg_expand]: 1.97999e-06 [shard_inline]: 5.70001e-06 [merge_send_recv]: 9.29998e-06 [auto_parallel]: 7.58999e-06 [parallel]: 1.928e-05 [flash_sp]: 9.00999e-06 [merge_comm]: 4.08999e-06 [allreduce_fusion]: 3.44001e-06 [matmul_add_comm_reduction]: 9.32001e-06 [allreduce_slice_to_reducescatter]: 7.09988e-07 [virtual_shard_identity]: 8.37998e-06 [virtual_dataset]: 5.81e-06 [get_grad_eliminate_]: 6.02999e-06 [virtual_output]: 6.07999e-06 [merge_forward]: 4.06001e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.031e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.336e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 1.014e-05 [set_forward_comm_id_for_comm_node_pass]: 3.57997e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 2.74001e-06 [receive_attached]: 2.67001e-06 [after_resolve]: 1.013e-05 [a_after_grad]: 8.64e-06 [renormalize]: 0.00130987 [add_forward_monad_depend]: 7.4e-06 [auto_monad_grad]: 2.53e-06 [auto_monad_eliminator]: 1.783e-05 [cse]: 3.669e-05 [a_3]: 5.112e-05 [Cycle 2]: 0.0006193, [45] [expand_dump_flag]: 1.50001e-06 [switch_simplify]: 7.82e-06 [loop_unroll]: 5.87999e-06 [a_1]: 0.00011282 [with_stream_mark]: 1.699e-05 [recompute_prepare]: 5.54998e-06 [updatestate_depend_eliminate]: 3.27002e-06 [updatestate_assign_eliminate]: 3.39001e-06 [updatestate_loads_eliminate]: 2.32999e-06 [parameter_eliminate]: 1.23002e-06 [a_2]: 6.208e-05 [accelerated_algorithm]: 5.25001e-06 [shard]: 1.59998e-06 [meta_shard_fg_expand]: 1.88002e-06 [shard_inline]: 5.12e-06 [merge_send_recv]: 6.70002e-06 [auto_parallel]: 6.58e-06 [parallel]: 7.33e-06 [flash_sp]: 3.8e-06 [merge_comm]: 3.27002e-06 [allreduce_fusion]: 3.25e-06 [matmul_add_comm_reduction]: 7.15998e-06 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 6.54001e-06 [virtual_dataset]: 5.41998e-06 [get_grad_eliminate_]: 5.34e-06 [virtual_output]: 5.08002e-06 [merge_forward]: 3.56001e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 8.89e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.54e-05 [merge_recompute_call_nodes]: 1.07998e-06 [before_grad]: 8.55999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.57002e-06 [meta_fg_expand]: 2.42001e-06 [flash_sp_send_recv_attached]: 1.68997e-06 [receive_attached]: 1.83002e-06 [after_resolve]: 9.34e-06 [a_after_grad]: 8.66002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.80001e-06 [auto_monad_grad]: 1.30999e-06 [auto_monad_eliminator]: 7.08e-06 [cse]: 1.723e-05 [a_3]: 3.143e-05 [py_interpret_to_execute_after_opt_a]: 6.65002e-06 [slice_cell_reuse_recomputed_activation]: 2.25002e-06 [rewriter_after_opt_a]: 2.404e-05 [convert_after_rewriter]: 1.44e-06 [order_py_execute_after_rewriter]: 1.14003e-06 [mutable_eliminate]: 0.00069391 [opt_b]: 0.00019665, [1] [Cycle 1]: 0.00018966, [7] [b_1]: 0.00010788 [b_2]: 7.48e-06 [updatestate_depend_eliminate]: 6.31e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.64999e-06 [renormalize]: 8.70001e-07 [cse]: 2.669e-05 [optimize_parallel_all_gather_comm]: 1.887e-05 [overlap_param_gather]: 2.10002e-06 [cconv]: 3.315e-05 [loop_unroll]: 0.00052314 [opt_after_cconv]: 0.00011106, [1] [Cycle 1]: 0.00010417, [7] [c_1]: 2.667e-05 [parameter_eliminate]: 5.05999e-06 [updatestate_depend_eliminate]: 6.83e-06 [updatestate_assign_eliminate]: 2.79001e-06 [updatestate_loads_eliminate]: 2.51e-06 [cse]: 2.45e-05 [renormalize]: 9.30013e-07 [remove_dup_value]: 1.659e-05 [tuple_transform]: 7.051e-05, [1] [Cycle 1]: 6.578e-05, [4] [d_1]: 3.831e-05 [none_parameter_eliminate]: 1.49e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 6.44999e-06 [partial_unused_args_eliminate]: 2.02001e-06 [add_recomputation]: 4.895e-05 [cse_after_recomputation]: 2.371e-05, [1] [Cycle 1]: 1.933e-05, [1] [cse]: 1.352e-05 [environ_conv]: 1.113e-05 [swap_dp_allreduce_reducescatter]: 6.14999e-06 [bias_add_comm_swap]: 3.82998e-06 [label_micro_interleaved_index]: 5.49e-06 [label_fine_grained_interleaved_index]: 3.13e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 2.73998e-06 [micro_interleaved_order_control]: 2.98998e-06 [assign_add_opt]: 1.39e-06 [ForceFp32Comm]: 9.50007e-07 [remove_cast_before_assign_add]: 1.35999e-06 [full_micro_interleaved_order_control]: 2.61e-06 [reorder_send_recv_between_fp_bp]: 2.51e-06 [comm_op_add_attrs]: 1.32e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.34998e-06 [interleave_parallel_branches]: 1.25999e-06 [overlap_opt_shard_in_pipeline]: 1.66e-06 [overlap_opt_shard_grad_in_pipeline]: 1.66e-06 [control_data_broadcast_order]: 1.465e-05 [grouped_pairwise_exchange_alltoall]: 1.44e-06 [offloading_packed_experts]: 3.83001e-06 [overlap_recompute_and_grad_model_parallel]: 5.32001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.11998e-06 [overlap_grad_ring_attention]: 4.89998e-06 [overlap_grad_flash_sp]: 2.349e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.73e-06 [split_layernorm_comm]: 1.66e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 8.893e-05, [1] [Cycle 1]: 8.353e-05, [6] [build]: 1.318e-05 [elim_shapecalc]: 1.162e-05 [elim_not_effective]: 1.276e-05 [opt_reshape]: 6.46e-06 [fold_const_symbol]: 9.14e-06 [renormalize]: 2.30008e-07 [detach_backward]: 2.29001e-06 [pipeline_parallel_scheduler]: 1.62999e-06 [auto_monad_reorder]: 1.888e-05 [get_jit_bprop_graph]: 2.32999e-06 [rewriter_after_jit_bprop_graph]: 5.71998e-06 [opt_after_jit_grad]: 0.00054234 [validate]: 5.244e-05 [backend_pass]: 1.07e-06 [task_emit]: 0.0598151 [execute]: 1.122e-05 Sums bootstrap : 0.000501s : 0.52% type_inference : 0.029373s : 30.75% event_method : 0.000026s : 0.03% auto_monad : 0.000087s : 0.09% graph_reusing : 0.000006s : 0.01% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000043s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.01% optimize.rewriter_before_opt_a : 0.000273s : 0.29% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000092s : 0.10% optimize.opt_a.loop_unroll : 0.000039s : 0.04% optimize.opt_a.a_1 : 0.000729s : 0.76% optimize.opt_a.with_stream_mark : 0.000036s : 0.04% optimize.opt_a.recompute_prepare : 0.000014s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000133s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.01% optimize.opt_a.merge_send_recv : 0.000016s : 0.02% optimize.opt_a.auto_parallel : 0.000014s : 0.01% optimize.opt_a.parallel : 0.000027s : 0.03% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.02% optimize.opt_a.virtual_dataset : 0.000011s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.01% optimize.opt_a.virtual_output : 0.000011s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.02% optimize.opt_a.a_after_grad : 0.000017s : 0.02% optimize.opt_a.renormalize : 0.001310s : 1.37% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.03% optimize.opt_a.cse : 0.000054s : 0.06% optimize.opt_a.a_3 : 0.000083s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000024s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000694s : 0.73% optimize.opt_b.b_1 : 0.000108s : 0.11% optimize.opt_b.b_2 : 0.000007s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.03% optimize.loop_unroll : 0.000523s : 0.55% optimize.opt_after_cconv.c_1 : 0.000027s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000025s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000017s : 0.02% optimize.tuple_transform.d_1 : 0.000038s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000049s : 0.05% optimize.cse_after_recomputation.cse : 0.000014s : 0.01% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000023s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000542s : 0.57% validate : 0.000052s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.059815s : 62.62% execute : 0.000011s : 0.01% Time group info: ------[substitution.] 0.000209 26 0.89% : 0.000002s : 2: substitution.elim_not_effective 0.69% : 0.000001s : 2: substitution.fold_const_symbol 2.74% : 0.000006s : 3: substitution.graph_param_transform 79.51% : 0.000166s : 6: substitution.inline 1.93% : 0.000004s : 4: substitution.j_node_and_user_rematch 3.05% : 0.000006s : 4: substitution.remove_not_recompute_node 2.26% : 0.000005s : 2: substitution.replace_old_param 3.65% : 0.000008s : 1: substitution.switch_simplify 5.30% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029293 2 95.04% : 0.027839s : 1: type_inference.infer 4.96% : 0.001454s : 1: type_inference.specialize ------[replace.] 0.000088 9 56.63% : 0.000050s : 6: replace.inline 25.83% : 0.000023s : 1: replace.switch_simplify 17.55% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000179 9 90.67% : 0.000163s : 6: match.inline 3.79% : 0.000007s : 1: match.switch_simplify 5.54% : 0.000010s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000198 1092 0.83% : 0.000002s : 12: predicate.accumulaten_eliminater 0.90% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.43% : 0.000001s : 6: predicate.addn_check_dump 0.93% : 0.000002s : 12: predicate.addn_zero_filter 0.80% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 1.94% : 0.000004s : 18: predicate.arithmetic_simplify 1.19% : 0.000002s : 12: predicate.cast_eliminate 0.80% : 0.000002s : 6: predicate.check_bprop_eliminate 0.46% : 0.000001s : 6: predicate.compare_switch_simplify 0.14% : 0.000000s : 3: predicate.const_output_eliminate 0.42% : 0.000001s : 6: predicate.depend_value_elim 0.87% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.06% : 0.000002s : 12: predicate.dict_get_item_eliminator 1.12% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 3: predicate.elim_not_effective 0.54% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000002s : 15: predicate.environ_add_const_eliminate 0.92% : 0.000002s : 15: predicate.environ_get_add_eliminate 0.96% : 0.000002s : 15: predicate.environ_get_depend_swap 1.44% : 0.000003s : 21: predicate.environ_get_eliminate 0.96% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.39% : 0.000005s : 20: predicate.float_depend_g_call 0.43% : 0.000001s : 6: predicate.float_environ_get_switch 0.63% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 3: predicate.fold_const_symbol 0.78% : 0.000002s : 6: predicate.get_grad_eliminate 0.16% : 0.000000s : 3: predicate.graph_param_transform 0.46% : 0.000001s : 6: predicate.incorporate_call 0.39% : 0.000001s : 6: predicate.incorporate_call_switch 5.59% : 0.000011s : 50: predicate.inline 0.74% : 0.000001s : 6: predicate.inline_without_move 0.25% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.68% : 0.000001s : 6: predicate.less_batch_normalization 1.55% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.20% : 0.000004s : 32: predicate.load_eliminater 1.12% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.66% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.60% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.42% : 0.000001s : 6: predicate.merge_addn 0.43% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.80% : 0.000002s : 12: predicate.minmaximum_grad 1.40% : 0.000003s : 3: predicate.mutable_eliminate 0.36% : 0.000001s : 3: predicate.opt_reshape 0.38% : 0.000001s : 3: predicate.parallel_virtual_node 2.15% : 0.000004s : 20: predicate.partial_defer_inline 1.24% : 0.000002s : 17: predicate.partial_eliminate 0.92% : 0.000002s : 12: predicate.print_const_string_wrapper 0.46% : 0.000001s : 6: predicate.reduce_all_const_elim 1.19% : 0.000002s : 12: predicate.reduce_eliminate 2.15% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000001s : 6: predicate.remove_not_recompute_node 1.19% : 0.000002s : 20: predicate.replace_applicator 0.35% : 0.000001s : 6: predicate.replace_old_param 0.22% : 0.000000s : 3: predicate.reset_defer_inline 1.02% : 0.000002s : 12: predicate.reshape_eliminate 0.52% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 3: predicate.row_tensor_eliminate 0.72% : 0.000001s : 6: predicate.same_eliminate 0.32% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.73% : 0.000001s : 6: predicate.shard_identity_eliminate 0.70% : 0.000001s : 6: predicate.special_op_eliminate 0.62% : 0.000001s : 6: predicate.specialize_transform 0.67% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 1.01% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.45% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.54% : 0.000003s : 20: predicate.switch_defer_inline 1.97% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.53% : 0.000011s : 68: predicate.switch_simplify 0.87% : 0.000002s : 12: predicate.tile_eliminate 0.90% : 0.000002s : 12: predicate.transpose_eliminate 1.42% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.30% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.29% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 2.77% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 8.59% : 0.000017s : 24: predicate.tuple_list_set_item_eliminator 1.64% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.01% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.61% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.44% : 0.000001s : 3: predicate.value_based_eliminate 0.58% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 6: predicate.virtual_output_eliminate 0.22% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.34% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001271 16 57.27% : 0.000728s : 8: func_graph_cloner_run.FuncGraphClonerGraph 42.73% : 0.000543s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.112675 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.37% : 0.003801s : 1: add_attr 3.36% : 0.003786s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000055s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.08% : 0.000092s : 1: auto_monad 0.02% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.48% : 0.000542s : 1: bootstrap 0.03% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.03% : 0.000033s : 1: event_method 0.02% : 0.000020s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.48% : 0.000535s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.63% : 0.000708s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000019s : 1: opt.transform.mutable_eliminate 1.03% : 0.001159s : 78: opt.transform.opt_a 0.02% : 0.000025s : 1: opt.transform.opt_after_cconv 0.02% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000087s : 28: opt.transform.opt_b 0.04% : 0.000043s : 2: opt.transform.opt_trans_graph 0.03% : 0.000036s : 4: opt.transform.symbol_engine_opt 2.97% : 0.003350s : 1: opt_a 0.10% : 0.000115s : 1: opt_after_cconv 0.49% : 0.000555s : 1: opt_after_jit_grad 0.18% : 0.000200s : 1: opt_b 5.20% : 0.005854s : 1: optimize 0.02% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000047s : 1: pre_auto_parallel 0.01% : 0.000010s : 1: py_interpret_to_execute 0.01% : 0.000010s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000020s : 1: remove_dup_value 0.68% : 0.000763s : 1: renormalize.infer 0.48% : 0.000536s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000027s : 1: rewriter_after_opt_a 0.25% : 0.000280s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000092s : 1: symbol_engine_optimizer 53.11% : 0.059844s : 1: task_emit 0.07% : 0.000074s : 1: tuple_transform 26.09% : 0.029401s : 1: type_inference 0.08% : 0.000088s : 1: validate TotalTime = 6.21303, [24] [bootstrap]: 0.00088303 [type_inference]: 0.0515078 [event_method]: 2.085e-05 [auto_monad]: 0.00014999 [graph_reusing]: 5.80002e-06 [inline]: 1.77999e-06 [add_attr]: 0.00728566, [1] [add_attr_with_inline]: 0.00727464, [1] [Cycle 1]: 0.00012844, [2] [tag_attr]: 3.404e-05 [meta_addattr_fg_expand]: 1.867e-05 [parallel-infer-symbol]: 3.14999e-06 [pre_auto_parallel]: 5.64e-05 [insert-virtual-dataset]: 2.89999e-06 [parallel-infer-symbol-second]: 7.29982e-07 [dataset_repeat_opt]: 1.78002e-06 [pipeline_split]: 1.89999e-06 [optimize]: 0.00537061, [53] [py_interpret_to_execute]: 4.37e-06 [rewriter_before_opt_a]: 0.00025699 [opt_a]: 0.00319606, [2] [Cycle 1]: 0.00263882, [45] [expand_dump_flag]: 3.25e-06 [switch_simplify]: 0.00012029 [loop_unroll]: 3.188e-05 [a_1]: 0.00062428 [with_stream_mark]: 1.46e-05 [recompute_prepare]: 7.31999e-06 [updatestate_depend_eliminate]: 1.343e-05 [updatestate_assign_eliminate]: 1.265e-05 [updatestate_loads_eliminate]: 3.26999e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 7.141e-05 [accelerated_algorithm]: 6.31e-06 [shard]: 4.72e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 5.67001e-06 [merge_send_recv]: 4.486e-05 [auto_parallel]: 5.62001e-06 [parallel]: 8.523e-05 [flash_sp]: 4.095e-05 [merge_comm]: 4.67e-06 [allreduce_fusion]: 1.51e-05 [matmul_add_comm_reduction]: 2.808e-05 [allreduce_slice_to_reducescatter]: 3.85e-06 [virtual_shard_identity]: 8.22998e-06 [virtual_dataset]: 5.66e-06 [get_grad_eliminate_]: 5.36998e-06 [virtual_output]: 8.38001e-06 [merge_forward]: 7.1e-06 [cell_reuse_recompute_pass]: 3.98001e-06 [offload_activation]: 8.78001e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.259e-05 [merge_recompute_call_nodes]: 4.99998e-06 [before_grad]: 8.85999e-06 [set_forward_comm_id_for_comm_node_pass]: 1.033e-05 [meta_fg_expand]: 2.96999e-06 [flash_sp_send_recv_attached]: 6.12001e-06 [receive_attached]: 1.891e-05 [after_resolve]: 8.79e-06 [a_after_grad]: 8.13999e-06 [renormalize]: 0.00096273 [add_forward_monad_depend]: 5.32001e-06 [auto_monad_grad]: 1.19e-06 [auto_monad_eliminator]: 2.21e-05 [cse]: 5.437e-05 [a_3]: 4.111e-05 [Cycle 2]: 0.0005476, [45] [expand_dump_flag]: 9.20001e-07 [switch_simplify]: 6.65002e-06 [loop_unroll]: 5.80002e-06 [a_1]: 9.509e-05 [with_stream_mark]: 7.73001e-06 [recompute_prepare]: 5.38002e-06 [updatestate_depend_eliminate]: 2.58e-06 [updatestate_assign_eliminate]: 2.22001e-06 [updatestate_loads_eliminate]: 2.11e-06 [parameter_eliminate]: 8.89995e-07 [a_2]: 6.003e-05 [accelerated_algorithm]: 5.30999e-06 [shard]: 1.24e-06 [meta_shard_fg_expand]: 1.12e-06 [shard_inline]: 5.09e-06 [merge_send_recv]: 4.28999e-06 [auto_parallel]: 4.94998e-06 [parallel]: 3.96001e-06 [flash_sp]: 1.134e-05 [merge_comm]: 2.89999e-06 [allreduce_fusion]: 2.64999e-06 [matmul_add_comm_reduction]: 4.05e-06 [allreduce_slice_to_reducescatter]: 2.9002e-07 [virtual_shard_identity]: 6.15002e-06 [virtual_dataset]: 5.20001e-06 [get_grad_eliminate_]: 5.09e-06 [virtual_output]: 4.77e-06 [merge_forward]: 2.69001e-06 [cell_reuse_recompute_pass]: 1.14e-06 [offload_activation]: 4.75999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.114e-05 [merge_recompute_call_nodes]: 5.69999e-07 [before_grad]: 7.77998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.01999e-06 [meta_fg_expand]: 1.83002e-06 [flash_sp_send_recv_attached]: 7.40023e-07 [receive_attached]: 1.04e-06 [after_resolve]: 7.55998e-06 [a_after_grad]: 7.18e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.15001e-06 [auto_monad_grad]: 8.09989e-07 [auto_monad_eliminator]: 6.05002e-06 [cse]: 1.426e-05 [a_3]: 3.047e-05 [py_interpret_to_execute_after_opt_a]: 4.35e-06 [slice_cell_reuse_recomputed_activation]: 4.72e-06 [rewriter_after_opt_a]: 1.827e-05 [convert_after_rewriter]: 1.00001e-06 [order_py_execute_after_rewriter]: 1.318e-05 [mutable_eliminate]: 0.00048442 [opt_b]: 0.00017654, [1] [Cycle 1]: 0.00017099, [7] [b_1]: 0.00010253 [b_2]: 6.70002e-06 [updatestate_depend_eliminate]: 4.36002e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.14999e-06 [renormalize]: 2.3999e-07 [cse]: 2.076e-05 [optimize_parallel_all_gather_comm]: 2.052e-05 [overlap_param_gather]: 4.61002e-06 [cconv]: 1.31e-05 [loop_unroll]: 0.00040892 [opt_after_cconv]: 9.238e-05, [1] [Cycle 1]: 8.674e-05, [7] [c_1]: 2.352e-05 [parameter_eliminate]: 2.31998e-06 [updatestate_depend_eliminate]: 5.17e-06 [updatestate_assign_eliminate]: 2.35002e-06 [updatestate_loads_eliminate]: 2.24001e-06 [cse]: 1.918e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 2.531e-05 [tuple_transform]: 6.171e-05, [1] [Cycle 1]: 5.753e-05, [4] [d_1]: 3.307e-05 [none_parameter_eliminate]: 8.00006e-07 [renormalize]: 1.60013e-07 [switch_simplify]: 5.74e-06 [partial_unused_args_eliminate]: 9.30013e-07 [add_recomputation]: 3.785e-05 [cse_after_recomputation]: 2.149e-05, [1] [Cycle 1]: 1.711e-05, [1] [cse]: 1.171e-05 [environ_conv]: 1.52e-05 [swap_dp_allreduce_reducescatter]: 1.55e-05 [bias_add_comm_swap]: 7.28999e-06 [label_micro_interleaved_index]: 1.121e-05 [label_fine_grained_interleaved_index]: 1.15001e-06 [merge_cast_opt]: 4.69998e-07 [slice_recompute_activation]: 6.39993e-07 [micro_interleaved_order_control]: 9.20001e-07 [assign_add_opt]: 5.10016e-07 [ForceFp32Comm]: 3.89991e-07 [remove_cast_before_assign_add]: 5.07e-06 [full_micro_interleaved_order_control]: 5.05999e-06 [reorder_send_recv_between_fp_bp]: 1.22e-06 [comm_op_add_attrs]: 4.69998e-07 [add_comm_op_reuse_tag]: 4.30009e-07 [interleave_split_concat_branches]: 8.00006e-07 [interleave_parallel_branches]: 4.99e-06 [overlap_opt_shard_in_pipeline]: 1.463e-05 [overlap_opt_shard_grad_in_pipeline]: 8.59989e-07 [control_data_broadcast_order]: 9.32001e-06 [grouped_pairwise_exchange_alltoall]: 5.79981e-07 [offloading_packed_experts]: 2.79001e-06 [overlap_recompute_and_grad_model_parallel]: 8.28999e-06 [overlap_grad_matmul_and_grad_allreduce]: 8.29983e-07 [overlap_recompute_allgather_and_fa_grad]: 8.39995e-07 [overlap_recompute_comm]: 1.08001e-06 [overlap_grad_ring_attention]: 2.481e-05 [overlap_grad_flash_sp]: 5.146e-05 [begin_end_overlap_inline]: 6.10016e-07 [split_matmul_comm_elemetwise]: 1.818e-05 [split_layernorm_comm]: 8.00006e-07 [handle_group_info]: 3.4002e-07 [symbol_engine_optimizer]: 0.0001001, [1] [Cycle 1]: 9.59e-05, [6] [build]: 3.292e-05 [elim_shapecalc]: 8.87e-06 [elim_not_effective]: 1.134e-05 [opt_reshape]: 6.07999e-06 [fold_const_symbol]: 8.85999e-06 [renormalize]: 1.60013e-07 [detach_backward]: 1.08001e-06 [pipeline_parallel_scheduler]: 8.10018e-07 [auto_monad_reorder]: 1.317e-05 [get_jit_bprop_graph]: 1.17999e-06 [rewriter_after_jit_bprop_graph]: 3.04999e-06 [opt_after_jit_grad]: 0.00044791 [validate]: 5.146e-05 [backend_pass]: 7.2e-07 [task_emit]: 6.14666 [execute]: 1.311e-05 Sums bootstrap : 0.000883s : 0.01% type_inference : 0.051508s : 0.83% event_method : 0.000021s : 0.00% auto_monad : 0.000150s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000034s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000019s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000056s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000257s : 0.00% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000127s : 0.00% optimize.opt_a.loop_unroll : 0.000038s : 0.00% optimize.opt_a.a_1 : 0.000719s : 0.01% optimize.opt_a.with_stream_mark : 0.000022s : 0.00% optimize.opt_a.recompute_prepare : 0.000013s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000015s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000131s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.00% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.00% optimize.opt_a.merge_send_recv : 0.000049s : 0.00% optimize.opt_a.auto_parallel : 0.000011s : 0.00% optimize.opt_a.parallel : 0.000089s : 0.00% optimize.opt_a.flash_sp : 0.000052s : 0.00% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000018s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000032s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.00% optimize.opt_a.virtual_dataset : 0.000011s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000010s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000010s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000014s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000006s : 0.00% optimize.opt_a.before_grad : 0.000017s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000013s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000020s : 0.00% optimize.opt_a.after_resolve : 0.000016s : 0.00% optimize.opt_a.a_after_grad : 0.000015s : 0.00% optimize.opt_a.renormalize : 0.000963s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.00% optimize.opt_a.cse : 0.000069s : 0.00% optimize.opt_a.a_3 : 0.000072s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000018s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000013s : 0.00% optimize.mutable_eliminate : 0.000484s : 0.01% optimize.opt_b.b_1 : 0.000103s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.00% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000013s : 0.00% optimize.loop_unroll : 0.000409s : 0.01% optimize.opt_after_cconv.c_1 : 0.000024s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000025s : 0.00% optimize.tuple_transform.d_1 : 0.000033s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000038s : 0.00% optimize.cse_after_recomputation.cse : 0.000012s : 0.00% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000016s : 0.00% optimize.bias_add_comm_swap : 0.000007s : 0.00% optimize.label_micro_interleaved_index : 0.000011s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000001s : 0.00% optimize.merge_cast_opt : 0.000000s : 0.00% optimize.slice_recompute_activation : 0.000001s : 0.00% optimize.micro_interleaved_order_control : 0.000001s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000000s : 0.00% optimize.remove_cast_before_assign_add : 0.000005s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000001s : 0.00% optimize.comm_op_add_attrs : 0.000000s : 0.00% optimize.add_comm_op_reuse_tag : 0.000000s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000005s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000015s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000001s : 0.00% optimize.control_data_broadcast_order : 0.000009s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000001s : 0.00% optimize.overlap_grad_ring_attention : 0.000025s : 0.00% optimize.overlap_grad_flash_sp : 0.000051s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000018s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000000s : 0.00% optimize.symbol_engine_optimizer.build : 0.000033s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000011s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000013s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000448s : 0.01% validate : 0.000051s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 6.146658s : 99.07% execute : 0.000013s : 0.00% Time group info: ------[substitution.] 0.000246 26 0.51% : 0.000001s : 2: substitution.elim_not_effective 0.38% : 0.000001s : 2: substitution.fold_const_symbol 1.45% : 0.000004s : 3: substitution.graph_param_transform 73.69% : 0.000181s : 6: substitution.inline 0.99% : 0.000002s : 4: substitution.j_node_and_user_rematch 5.95% : 0.000015s : 4: substitution.remove_not_recompute_node 1.31% : 0.000003s : 2: substitution.replace_old_param 7.13% : 0.000017s : 1: substitution.switch_simplify 8.60% : 0.000021s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.051419 2 97.63% : 0.050200s : 1: type_inference.infer 2.37% : 0.001219s : 1: type_inference.specialize ------[replace.] 0.000079 9 57.93% : 0.000046s : 6: replace.inline 21.26% : 0.000017s : 1: replace.switch_simplify 20.81% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000214 9 82.88% : 0.000178s : 6: match.inline 7.80% : 0.000017s : 1: match.switch_simplify 9.33% : 0.000020s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000171 1092 1.06% : 0.000002s : 12: predicate.accumulaten_eliminater 0.85% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.43% : 0.000001s : 6: predicate.addn_check_dump 0.95% : 0.000002s : 12: predicate.addn_zero_filter 0.88% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.19% : 0.000004s : 18: predicate.arithmetic_simplify 1.01% : 0.000002s : 12: predicate.cast_eliminate 0.52% : 0.000001s : 6: predicate.check_bprop_eliminate 0.48% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.46% : 0.000001s : 6: predicate.depend_value_elim 0.99% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.13% : 0.000002s : 12: predicate.dict_get_item_eliminator 1.00% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.76% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.24% : 0.000000s : 3: predicate.elim_not_effective 0.35% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.14% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 15: predicate.environ_get_depend_swap 1.67% : 0.000003s : 21: predicate.environ_get_eliminate 1.10% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.59% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.56% : 0.000004s : 20: predicate.float_depend_g_call 0.46% : 0.000001s : 6: predicate.float_environ_get_switch 0.67% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.62% : 0.000001s : 6: predicate.get_grad_eliminate 0.19% : 0.000000s : 3: predicate.graph_param_transform 0.51% : 0.000001s : 6: predicate.incorporate_call 0.46% : 0.000001s : 6: predicate.incorporate_call_switch 5.87% : 0.000010s : 50: predicate.inline 0.67% : 0.000001s : 6: predicate.inline_without_move 0.29% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.86% : 0.000001s : 6: predicate.less_batch_normalization 1.87% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.51% : 0.000004s : 32: predicate.load_eliminater 0.86% : 0.000001s : 3: predicate.loop_unroll_after_grad 2.86% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.63% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 6: predicate.merge_addn 0.52% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.65% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 12: predicate.minmaximum_grad 1.09% : 0.000002s : 3: predicate.mutable_eliminate 0.34% : 0.000001s : 3: predicate.opt_reshape 0.33% : 0.000001s : 3: predicate.parallel_virtual_node 1.98% : 0.000003s : 20: predicate.partial_defer_inline 1.46% : 0.000003s : 17: predicate.partial_eliminate 1.37% : 0.000002s : 12: predicate.print_const_string_wrapper 0.62% : 0.000001s : 6: predicate.reduce_all_const_elim 1.49% : 0.000003s : 12: predicate.reduce_eliminate 2.48% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 6: predicate.remove_not_recompute_node 1.28% : 0.000002s : 20: predicate.replace_applicator 0.38% : 0.000001s : 6: predicate.replace_old_param 0.20% : 0.000000s : 3: predicate.reset_defer_inline 1.25% : 0.000002s : 12: predicate.reshape_eliminate 0.57% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 3: predicate.row_tensor_eliminate 0.77% : 0.000001s : 6: predicate.same_eliminate 0.39% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.68% : 0.000001s : 6: predicate.shard_identity_eliminate 0.69% : 0.000001s : 6: predicate.special_op_eliminate 0.64% : 0.000001s : 6: predicate.specialize_transform 0.74% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.69% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.27% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.71% : 0.000003s : 20: predicate.switch_defer_inline 2.29% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.01% : 0.000010s : 68: predicate.switch_simplify 0.95% : 0.000002s : 12: predicate.tile_eliminate 0.99% : 0.000002s : 12: predicate.transpose_eliminate 1.67% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.53% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000002s : 18: predicate.tuple_list_get_item_depend_reorder 2.98% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.82% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.40% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.93% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 3: predicate.value_based_eliminate 0.66% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 6: predicate.virtual_output_eliminate 0.22% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000984 16 56.17% : 0.000553s : 8: func_graph_cloner_run.FuncGraphClonerGraph 43.83% : 0.000431s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 6.227790 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.12% : 0.007290s : 1: add_attr 0.12% : 0.007278s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000042s : 1: add_recomputation 0.00% : 0.000003s : 1: assign_add_opt 0.00% : 0.000156s : 1: auto_monad 0.00% : 0.000017s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000010s : 1: bias_add_comm_swap 0.02% : 0.000934s : 1: bootstrap 0.00% : 0.000016s : 1: cconv 0.00% : 0.000003s : 1: comm_op_add_attrs 0.00% : 0.000012s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000024s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000004s : 1: detach_backward 0.00% : 0.000019s : 1: environ_conv 0.00% : 0.000027s : 1: event_method 0.00% : 0.000027s : 1: execute 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000003s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000008s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000004s : 1: label_fine_grained_interleaved_index 0.00% : 0.000014s : 1: label_micro_interleaved_index 0.01% : 0.000417s : 1: loop_unroll 0.00% : 0.000003s : 1: merge_cast_opt 0.00% : 0.000004s : 1: micro_interleaved_order_control 0.01% : 0.000493s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000012s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000013s : 1: opt.transform.mutable_eliminate 0.02% : 0.001172s : 78: opt.transform.opt_a 0.00% : 0.000022s : 1: opt.transform.opt_after_cconv 0.00% : 0.000021s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000083s : 28: opt.transform.opt_b 0.00% : 0.000037s : 2: opt.transform.opt_trans_graph 0.00% : 0.000032s : 4: opt.transform.symbol_engine_opt 0.05% : 0.003199s : 1: opt_a 0.00% : 0.000096s : 1: opt_after_cconv 0.01% : 0.000457s : 1: opt_after_jit_grad 0.00% : 0.000180s : 1: opt_b 0.09% : 0.005375s : 1: optimize 0.00% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000016s : 1: order_py_execute_after_rewriter 0.00% : 0.000055s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000028s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000018s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000003s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000004s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000061s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000008s : 1: remove_cast_before_assign_add 0.00% : 0.000029s : 1: remove_dup_value 0.01% : 0.000556s : 1: renormalize.infer 0.01% : 0.000399s : 1: renormalize.specialize 0.00% : 0.000004s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000022s : 1: rewriter_after_opt_a 0.00% : 0.000262s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000003s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000021s : 1: split_matmul_comm_elemetwise 0.00% : 0.000019s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000103s : 1: symbol_engine_optimizer 98.70% : 6.146829s : 1: task_emit 0.00% : 0.000065s : 1: tuple_transform 0.83% : 0.051527s : 1: type_inference 0.00% : 0.000077s : 1: validate [WARNING] CORE(87355,ffffbf434f30,python3.9):2026-01-29-17:52:01.852.127 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph3 TotalTime = 0.0892346, [24] [bootstrap]: 0.00052272 [type_inference]: 0.0305432 [event_method]: 2.627e-05 [auto_monad]: 8.674e-05 [graph_reusing]: 6.62002e-06 [inline]: 3.46999e-06 [add_attr]: 0.00397137, [1] [add_attr_with_inline]: 0.00396023, [1] [Cycle 1]: 7.815e-05, [2] [tag_attr]: 2.598e-05 [meta_addattr_fg_expand]: 6.64999e-06 [parallel-infer-symbol]: 3.66999e-06 [pre_auto_parallel]: 4.17e-05 [insert-virtual-dataset]: 2.34999e-06 [parallel-infer-symbol-second]: 8.39995e-07 [dataset_repeat_opt]: 1.94999e-06 [pipeline_split]: 1.77999e-06 [optimize]: 0.00573512, [53] [py_interpret_to_execute]: 7.35e-06 [rewriter_before_opt_a]: 0.00027013 [opt_a]: 0.00327559, [2] [Cycle 1]: 0.00266823, [45] [expand_dump_flag]: 3.41999e-06 [switch_simplify]: 8.422e-05 [loop_unroll]: 3.262e-05 [a_1]: 0.00061606 [with_stream_mark]: 1.839e-05 [recompute_prepare]: 7.9e-06 [updatestate_depend_eliminate]: 4.21001e-06 [updatestate_assign_eliminate]: 3.64002e-06 [updatestate_loads_eliminate]: 2.88e-06 [parameter_eliminate]: 1.95001e-06 [a_2]: 7.05e-05 [accelerated_algorithm]: 6.28998e-06 [shard]: 1.87999e-06 [meta_shard_fg_expand]: 1.84e-06 [shard_inline]: 6.06e-06 [merge_send_recv]: 9.69e-06 [auto_parallel]: 7.12002e-06 [parallel]: 1.88e-05 [flash_sp]: 8.70001e-06 [merge_comm]: 3.78999e-06 [allreduce_fusion]: 3.35e-06 [matmul_add_comm_reduction]: 8.84003e-06 [allreduce_slice_to_reducescatter]: 7.29982e-07 [virtual_shard_identity]: 7.38e-06 [virtual_dataset]: 5.72999e-06 [get_grad_eliminate_]: 5.53002e-06 [virtual_output]: 5.74e-06 [merge_forward]: 4.3e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 1.07e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.394e-05 [merge_recompute_call_nodes]: 1.66998e-06 [before_grad]: 1.128e-05 [set_forward_comm_id_for_comm_node_pass]: 3.61999e-06 [meta_fg_expand]: 2.63e-06 [flash_sp_send_recv_attached]: 2.29001e-06 [receive_attached]: 2.16e-06 [after_resolve]: 9.79999e-06 [a_after_grad]: 8.73001e-06 [renormalize]: 0.00126779 [add_forward_monad_depend]: 7.93001e-06 [auto_monad_grad]: 2.74999e-06 [auto_monad_eliminator]: 1.791e-05 [cse]: 3.934e-05 [a_3]: 4.598e-05 [Cycle 2]: 0.0005962, [45] [expand_dump_flag]: 1.82001e-06 [switch_simplify]: 7.01999e-06 [loop_unroll]: 5.78002e-06 [a_1]: 9.771e-05 [with_stream_mark]: 1.449e-05 [recompute_prepare]: 5.52999e-06 [updatestate_depend_eliminate]: 3.92002e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 2.60002e-06 [parameter_eliminate]: 1.39998e-06 [a_2]: 6.225e-05 [accelerated_algorithm]: 5.32999e-06 [shard]: 1.24e-06 [meta_shard_fg_expand]: 1.39e-06 [shard_inline]: 5.36998e-06 [merge_send_recv]: 6.99001e-06 [auto_parallel]: 6.11998e-06 [parallel]: 6.79999e-06 [flash_sp]: 3.63e-06 [merge_comm]: 3.18e-06 [allreduce_fusion]: 3.31999e-06 [matmul_add_comm_reduction]: 7.25e-06 [allreduce_slice_to_reducescatter]: 7.10017e-07 [virtual_shard_identity]: 6.45002e-06 [virtual_dataset]: 4.99e-06 [get_grad_eliminate_]: 4.78001e-06 [virtual_output]: 4.85999e-06 [merge_forward]: 3.71001e-06 [cell_reuse_recompute_pass]: 2.04999e-06 [offload_activation]: 8.05e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.435e-05 [merge_recompute_call_nodes]: 1.22e-06 [before_grad]: 8.94e-06 [set_forward_comm_id_for_comm_node_pass]: 3.40998e-06 [meta_fg_expand]: 2.19001e-06 [flash_sp_send_recv_attached]: 1.06002e-06 [receive_attached]: 1.67001e-06 [after_resolve]: 8.94998e-06 [a_after_grad]: 7.51999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.34998e-06 [auto_monad_grad]: 1.17e-06 [auto_monad_eliminator]: 8.40001e-06 [cse]: 1.886e-05 [a_3]: 3.061e-05 [py_interpret_to_execute_after_opt_a]: 6.61999e-06 [slice_cell_reuse_recomputed_activation]: 2.37001e-06 [rewriter_after_opt_a]: 2.109e-05 [convert_after_rewriter]: 1.15001e-06 [order_py_execute_after_rewriter]: 1.50999e-06 [mutable_eliminate]: 0.00064985 [opt_b]: 0.0001938, [1] [Cycle 1]: 0.00018683, [7] [b_1]: 0.00010465 [b_2]: 6.56999e-06 [updatestate_depend_eliminate]: 7.64002e-06 [updatestate_assign_eliminate]: 3.13e-06 [updatestate_loads_eliminate]: 2.44999e-06 [renormalize]: 6.19999e-07 [cse]: 2.74e-05 [optimize_parallel_all_gather_comm]: 1.762e-05 [overlap_param_gather]: 2.54001e-06 [cconv]: 3.188e-05 [loop_unroll]: 0.00049692 [opt_after_cconv]: 0.00010374, [1] [Cycle 1]: 9.745e-05, [7] [c_1]: 2.419e-05 [parameter_eliminate]: 4.84003e-06 [updatestate_depend_eliminate]: 6.59999e-06 [updatestate_assign_eliminate]: 2.54999e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 2.338e-05 [renormalize]: 6.10016e-07 [remove_dup_value]: 3.949e-05 [tuple_transform]: 8.967e-05, [1] [Cycle 1]: 8.453e-05, [4] [d_1]: 3.952e-05 [none_parameter_eliminate]: 1.82001e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 6.74001e-06 [partial_unused_args_eliminate]: 1.79998e-06 [add_recomputation]: 4.843e-05 [cse_after_recomputation]: 4.831e-05, [1] [Cycle 1]: 4.311e-05, [1] [cse]: 3.618e-05 [environ_conv]: 1.062e-05 [swap_dp_allreduce_reducescatter]: 6.14999e-06 [bias_add_comm_swap]: 3.15998e-06 [label_micro_interleaved_index]: 5.07999e-06 [label_fine_grained_interleaved_index]: 3.03e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.15002e-06 [micro_interleaved_order_control]: 3.18e-06 [assign_add_opt]: 1.24998e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.03001e-06 [full_micro_interleaved_order_control]: 2.27999e-06 [reorder_send_recv_between_fp_bp]: 2.91999e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 1.16997e-06 [interleave_split_concat_branches]: 1.07e-06 [interleave_parallel_branches]: 1.40999e-06 [overlap_opt_shard_in_pipeline]: 1.32e-06 [overlap_opt_shard_grad_in_pipeline]: 1.69e-06 [control_data_broadcast_order]: 1.282e-05 [grouped_pairwise_exchange_alltoall]: 1.59e-06 [offloading_packed_experts]: 4.02e-06 [overlap_recompute_and_grad_model_parallel]: 5.33002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.26002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.41998e-06 [overlap_recompute_comm]: 2.21e-06 [overlap_grad_ring_attention]: 4.56002e-06 [overlap_grad_flash_sp]: 2.142e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 2.17001e-06 [split_layernorm_comm]: 1.64e-06 [handle_group_info]: 9.50007e-07 [symbol_engine_optimizer]: 8.241e-05, [1] [Cycle 1]: 7.747e-05, [6] [build]: 1.193e-05 [elim_shapecalc]: 1.101e-05 [elim_not_effective]: 1.168e-05 [opt_reshape]: 6.13002e-06 [fold_const_symbol]: 9.34e-06 [renormalize]: 2.50002e-07 [detach_backward]: 2.02999e-06 [pipeline_parallel_scheduler]: 1.64e-06 [auto_monad_reorder]: 1.652e-05 [get_jit_bprop_graph]: 2.25002e-06 [rewriter_after_jit_bprop_graph]: 6.22001e-06 [opt_after_jit_grad]: 0.00052214 [validate]: 5.107e-05 [backend_pass]: 9.39996e-07 [task_emit]: 0.0474117 [execute]: 1.02e-05 Sums bootstrap : 0.000523s : 0.62% type_inference : 0.030543s : 36.29% event_method : 0.000026s : 0.03% auto_monad : 0.000087s : 0.10% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000042s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.01% optimize.rewriter_before_opt_a : 0.000270s : 0.32% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000091s : 0.11% optimize.opt_a.loop_unroll : 0.000038s : 0.05% optimize.opt_a.a_1 : 0.000714s : 0.85% optimize.opt_a.with_stream_mark : 0.000033s : 0.04% optimize.opt_a.recompute_prepare : 0.000013s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000133s : 0.16% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.01% optimize.opt_a.merge_send_recv : 0.000017s : 0.02% optimize.opt_a.auto_parallel : 0.000013s : 0.02% optimize.opt_a.parallel : 0.000026s : 0.03% optimize.opt_a.flash_sp : 0.000012s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.02% optimize.opt_a.virtual_dataset : 0.000011s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000010s : 0.01% optimize.opt_a.virtual_output : 0.000011s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.02% optimize.opt_a.a_after_grad : 0.000016s : 0.02% optimize.opt_a.renormalize : 0.001268s : 1.51% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.03% optimize.opt_a.cse : 0.000058s : 0.07% optimize.opt_a.a_3 : 0.000077s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000021s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000650s : 0.77% optimize.opt_b.b_1 : 0.000105s : 0.12% optimize.opt_b.b_2 : 0.000007s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000027s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000018s : 0.02% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000032s : 0.04% optimize.loop_unroll : 0.000497s : 0.59% optimize.opt_after_cconv.c_1 : 0.000024s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000023s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000039s : 0.05% optimize.tuple_transform.d_1 : 0.000040s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000048s : 0.06% optimize.cse_after_recomputation.cse : 0.000036s : 0.04% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000021s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000522s : 0.62% validate : 0.000051s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.047412s : 56.33% execute : 0.000010s : 0.01% Time group info: ------[substitution.] 0.000213 26 0.91% : 0.000002s : 2: substitution.elim_not_effective 0.74% : 0.000002s : 2: substitution.fold_const_symbol 2.92% : 0.000006s : 3: substitution.graph_param_transform 78.86% : 0.000168s : 6: substitution.inline 2.41% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.87% : 0.000006s : 4: substitution.remove_not_recompute_node 1.98% : 0.000004s : 2: substitution.replace_old_param 3.84% : 0.000008s : 1: substitution.switch_simplify 5.47% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.030455 2 93.42% : 0.028450s : 1: type_inference.infer 6.58% : 0.002005s : 1: type_inference.specialize ------[replace.] 0.000090 9 55.95% : 0.000051s : 6: replace.inline 24.94% : 0.000023s : 1: replace.switch_simplify 19.10% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000183 9 90.25% : 0.000165s : 6: match.inline 4.00% : 0.000007s : 1: match.switch_simplify 5.75% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000185 1092 1.30% : 0.000002s : 12: predicate.accumulaten_eliminater 1.06% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.43% : 0.000001s : 6: predicate.addn_check_dump 0.93% : 0.000002s : 12: predicate.addn_zero_filter 1.09% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.46% : 0.000005s : 18: predicate.arithmetic_simplify 1.01% : 0.000002s : 12: predicate.cast_eliminate 0.49% : 0.000001s : 6: predicate.check_bprop_eliminate 0.46% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.52% : 0.000001s : 6: predicate.depend_value_elim 0.96% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.18% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 12: predicate.dict_set_item_eliminator 1.00% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 3: predicate.elim_not_effective 0.44% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 15: predicate.environ_get_depend_swap 1.61% : 0.000003s : 21: predicate.environ_get_eliminate 1.04% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.52% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.76% : 0.000005s : 20: predicate.float_depend_g_call 0.57% : 0.000001s : 6: predicate.float_environ_get_switch 0.65% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.57% : 0.000001s : 6: predicate.get_grad_eliminate 0.27% : 0.000000s : 3: predicate.graph_param_transform 0.48% : 0.000001s : 6: predicate.incorporate_call 0.40% : 0.000001s : 6: predicate.incorporate_call_switch 6.05% : 0.000011s : 50: predicate.inline 0.74% : 0.000001s : 6: predicate.inline_without_move 0.33% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 6: predicate.less_batch_normalization 1.67% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.44% : 0.000005s : 32: predicate.load_eliminater 1.19% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.74% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.71% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 6: predicate.merge_addn 0.46% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 12: predicate.minmaximum_grad 1.51% : 0.000003s : 3: predicate.mutable_eliminate 0.29% : 0.000001s : 3: predicate.opt_reshape 0.34% : 0.000001s : 3: predicate.parallel_virtual_node 1.89% : 0.000003s : 20: predicate.partial_defer_inline 1.36% : 0.000003s : 17: predicate.partial_eliminate 1.05% : 0.000002s : 12: predicate.print_const_string_wrapper 0.59% : 0.000001s : 6: predicate.reduce_all_const_elim 1.28% : 0.000002s : 12: predicate.reduce_eliminate 2.30% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 6: predicate.remove_not_recompute_node 1.17% : 0.000002s : 20: predicate.replace_applicator 0.41% : 0.000001s : 6: predicate.replace_old_param 0.22% : 0.000000s : 3: predicate.reset_defer_inline 0.97% : 0.000002s : 12: predicate.reshape_eliminate 0.49% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 3: predicate.row_tensor_eliminate 0.77% : 0.000001s : 6: predicate.same_eliminate 0.36% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.68% : 0.000001s : 6: predicate.shard_identity_eliminate 0.81% : 0.000001s : 6: predicate.special_op_eliminate 0.57% : 0.000001s : 6: predicate.specialize_transform 0.90% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.61% : 0.000003s : 20: predicate.switch_defer_inline 2.05% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.84% : 0.000011s : 68: predicate.switch_simplify 0.98% : 0.000002s : 12: predicate.tile_eliminate 0.95% : 0.000002s : 12: predicate.transpose_eliminate 1.53% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.38% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.82% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.38% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.54% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.36% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.83% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 3: predicate.value_based_eliminate 0.60% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.64% : 0.000001s : 6: predicate.virtual_output_eliminate 0.21% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001705 16 44.01% : 0.000750s : 8: func_graph_cloner_run.FuncGraphClonerGraph 55.99% : 0.000955s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.101483 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.92% : 0.003977s : 1: add_attr 3.91% : 0.003964s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000053s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.09% : 0.000092s : 1: auto_monad 0.02% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.55% : 0.000563s : 1: bootstrap 0.04% : 0.000036s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.05% : 0.000051s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.03% : 0.000033s : 1: event_method 0.02% : 0.000017s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.50% : 0.000508s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.65% : 0.000663s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000019s : 1: opt.transform.mutable_eliminate 1.12% : 0.001132s : 78: opt.transform.opt_a 0.02% : 0.000023s : 1: opt.transform.opt_after_cconv 0.02% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.08% : 0.000084s : 28: opt.transform.opt_b 0.04% : 0.000044s : 2: opt.transform.opt_trans_graph 0.03% : 0.000034s : 4: opt.transform.symbol_engine_opt 3.23% : 0.003279s : 1: opt_a 0.11% : 0.000107s : 1: opt_after_cconv 0.53% : 0.000534s : 1: opt_after_jit_grad 0.19% : 0.000198s : 1: opt_b 5.66% : 0.005740s : 1: optimize 0.02% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000046s : 1: pre_auto_parallel 0.01% : 0.000011s : 1: py_interpret_to_execute 0.01% : 0.000010s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000044s : 1: remove_dup_value 0.73% : 0.000741s : 1: renormalize.infer 0.51% : 0.000517s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000024s : 1: rewriter_after_opt_a 0.27% : 0.000278s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000085s : 1: symbol_engine_optimizer 46.74% : 0.047438s : 1: task_emit 0.09% : 0.000093s : 1: tuple_transform 30.13% : 0.030575s : 1: type_inference 0.08% : 0.000086s : 1: validate [WARNING] CORE(87365,ffffbf434f30,python3.9):2026-01-29-17:52:02.235.634 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph1 TotalTime = 0.0942125, [24] [bootstrap]: 0.0005165 [type_inference]: 0.0345347 [event_method]: 0.00010912 [auto_monad]: 0.00018487 [graph_reusing]: 2.636e-05 [inline]: 3.00002e-06 [add_attr]: 0.00379758, [1] [add_attr_with_inline]: 0.00378584, [1] [Cycle 1]: 8.716e-05, [2] [tag_attr]: 3.19e-05 [meta_addattr_fg_expand]: 7.58001e-06 [parallel-infer-symbol]: 4.07e-06 [pre_auto_parallel]: 4.905e-05 [insert-virtual-dataset]: 2.74001e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 1.77999e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.0067457, [53] [py_interpret_to_execute]: 8.43001e-06 [rewriter_before_opt_a]: 0.00035237 [opt_a]: 0.00408303, [2] [Cycle 1]: 0.00328331, [45] [expand_dump_flag]: 4.08999e-06 [switch_simplify]: 9.515e-05 [loop_unroll]: 3.953e-05 [a_1]: 0.00081266 [with_stream_mark]: 2.122e-05 [recompute_prepare]: 1.12e-05 [updatestate_depend_eliminate]: 5.77001e-06 [updatestate_assign_eliminate]: 4e-06 [updatestate_loads_eliminate]: 3.66001e-06 [parameter_eliminate]: 2.31e-06 [a_2]: 9.634e-05 [accelerated_algorithm]: 7.58999e-06 [shard]: 1.93002e-06 [meta_shard_fg_expand]: 2.69001e-06 [shard_inline]: 7.21999e-06 [merge_send_recv]: 1.017e-05 [auto_parallel]: 9.27999e-06 [parallel]: 2.145e-05 [flash_sp]: 1.066e-05 [merge_comm]: 5.12999e-06 [allreduce_fusion]: 4.37e-06 [matmul_add_comm_reduction]: 1.091e-05 [allreduce_slice_to_reducescatter]: 9.20001e-07 [virtual_shard_identity]: 9.87001e-06 [virtual_dataset]: 7.01001e-06 [get_grad_eliminate_]: 6.61e-06 [virtual_output]: 6.83e-06 [merge_forward]: 5.64998e-06 [cell_reuse_recompute_pass]: 1.76003e-06 [offload_activation]: 1.082e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.531e-05 [merge_recompute_call_nodes]: 1.38002e-06 [before_grad]: 1.263e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87e-06 [meta_fg_expand]: 4.07e-06 [flash_sp_send_recv_attached]: 3.08998e-06 [receive_attached]: 2.28998e-06 [after_resolve]: 1.136e-05 [a_after_grad]: 1.182e-05 [renormalize]: 0.00156226 [add_forward_monad_depend]: 8.44002e-06 [auto_monad_grad]: 2.47001e-06 [auto_monad_eliminator]: 2.103e-05 [cse]: 4.483e-05 [a_3]: 6.105e-05 [Cycle 2]: 0.00078655, [45] [expand_dump_flag]: 2.24001e-06 [switch_simplify]: 9.31e-06 [loop_unroll]: 7.1e-06 [a_1]: 0.00016421 [with_stream_mark]: 1.851e-05 [recompute_prepare]: 7.1e-06 [updatestate_depend_eliminate]: 4.47e-06 [updatestate_assign_eliminate]: 3.9e-06 [updatestate_loads_eliminate]: 3.66001e-06 [parameter_eliminate]: 1.60001e-06 [a_2]: 8.848e-05 [accelerated_algorithm]: 7.01999e-06 [shard]: 1.47001e-06 [meta_shard_fg_expand]: 2.12001e-06 [shard_inline]: 6.64001e-06 [merge_send_recv]: 4.634e-05 [auto_parallel]: 9.81e-06 [parallel]: 8e-06 [flash_sp]: 4.66002e-06 [merge_comm]: 4.23999e-06 [allreduce_fusion]: 4.14002e-06 [matmul_add_comm_reduction]: 8.46002e-06 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 8.94e-06 [virtual_dataset]: 6.51e-06 [get_grad_eliminate_]: 6.12999e-06 [virtual_output]: 5.88998e-06 [merge_forward]: 4.91002e-06 [cell_reuse_recompute_pass]: 2.09999e-06 [offload_activation]: 1.063e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.241e-05 [merge_recompute_call_nodes]: 1.39998e-06 [before_grad]: 1.044e-05 [set_forward_comm_id_for_comm_node_pass]: 4.90999e-06 [meta_fg_expand]: 2.99999e-06 [flash_sp_send_recv_attached]: 1.30999e-06 [receive_attached]: 1.58002e-06 [after_resolve]: 9.57999e-06 [a_after_grad]: 9.60001e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.45999e-06 [auto_monad_grad]: 1.25001e-06 [auto_monad_eliminator]: 9.05999e-06 [cse]: 2.512e-05 [a_3]: 3.936e-05 [py_interpret_to_execute_after_opt_a]: 7.24001e-06 [slice_cell_reuse_recomputed_activation]: 2.46998e-06 [rewriter_after_opt_a]: 2.911e-05 [convert_after_rewriter]: 1.34e-06 [order_py_execute_after_rewriter]: 1.12e-06 [mutable_eliminate]: 0.00070492 [opt_b]: 0.00025711, [1] [Cycle 1]: 0.00025023, [7] [b_1]: 0.00015976 [b_2]: 9.06998e-06 [updatestate_depend_eliminate]: 7.97e-06 [updatestate_assign_eliminate]: 3.18998e-06 [updatestate_loads_eliminate]: 3.25e-06 [renormalize]: 8.70001e-07 [cse]: 3.052e-05 [optimize_parallel_all_gather_comm]: 2.017e-05 [overlap_param_gather]: 2.02001e-06 [cconv]: 3.079e-05 [loop_unroll]: 0.00046089 [opt_after_cconv]: 0.00011343, [1] [Cycle 1]: 0.00010689, [7] [c_1]: 3.11e-05 [parameter_eliminate]: 3.36999e-06 [updatestate_depend_eliminate]: 6.52001e-06 [updatestate_assign_eliminate]: 3.28998e-06 [updatestate_loads_eliminate]: 3.01999e-06 [cse]: 2.687e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 4.189e-05 [tuple_transform]: 9.487e-05, [1] [Cycle 1]: 9.067e-05, [4] [d_1]: 6.211e-05 [none_parameter_eliminate]: 1.91e-06 [renormalize]: 2.3999e-07 [switch_simplify]: 7.51999e-06 [partial_unused_args_eliminate]: 1.92001e-06 [add_recomputation]: 5.082e-05 [cse_after_recomputation]: 2.75e-05, [1] [Cycle 1]: 2.336e-05, [1] [cse]: 1.794e-05 [environ_conv]: 9.41e-06 [swap_dp_allreduce_reducescatter]: 6.12999e-06 [bias_add_comm_swap]: 2.86999e-06 [label_micro_interleaved_index]: 4.53999e-06 [label_fine_grained_interleaved_index]: 2.79999e-06 [merge_cast_opt]: 1.72001e-06 [slice_recompute_activation]: 2.13002e-06 [micro_interleaved_order_control]: 2.54001e-06 [assign_add_opt]: 1.25001e-06 [ForceFp32Comm]: 7.50006e-07 [remove_cast_before_assign_add]: 1.37999e-06 [full_micro_interleaved_order_control]: 3.18e-06 [reorder_send_recv_between_fp_bp]: 2.52001e-06 [comm_op_add_attrs]: 1.25999e-06 [add_comm_op_reuse_tag]: 1.01002e-06 [interleave_split_concat_branches]: 1.06002e-06 [interleave_parallel_branches]: 1.19998e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 1.62001e-06 [control_data_broadcast_order]: 1.382e-05 [grouped_pairwise_exchange_alltoall]: 1.49e-06 [offloading_packed_experts]: 2.682e-05 [overlap_recompute_and_grad_model_parallel]: 5.81e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.02001e-06 [overlap_grad_ring_attention]: 4.48001e-06 [overlap_grad_flash_sp]: 2.422e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.18002e-06 [split_layernorm_comm]: 1.92001e-06 [handle_group_info]: 1.10001e-06 [symbol_engine_optimizer]: 9.017e-05, [1] [Cycle 1]: 8.544e-05, [6] [build]: 1.114e-05 [elim_shapecalc]: 1.135e-05 [elim_not_effective]: 1.512e-05 [opt_reshape]: 7.88001e-06 [fold_const_symbol]: 1.135e-05 [renormalize]: 3.80009e-07 [detach_backward]: 2.46998e-06 [pipeline_parallel_scheduler]: 1.77999e-06 [auto_monad_reorder]: 2.159e-05 [get_jit_bprop_graph]: 2.09999e-06 [rewriter_after_jit_bprop_graph]: 4.41002e-06 [opt_after_jit_grad]: 0.0005021 [validate]: 5.02e-05 [backend_pass]: 8.30012e-07 [task_emit]: 0.0473622 [execute]: 1.04e-05 Sums bootstrap : 0.000517s : 0.58% type_inference : 0.034535s : 38.67% event_method : 0.000109s : 0.12% auto_monad : 0.000185s : 0.21% graph_reusing : 0.000026s : 0.03% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000032s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000049s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.01% optimize.rewriter_before_opt_a : 0.000352s : 0.39% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000104s : 0.12% optimize.opt_a.loop_unroll : 0.000047s : 0.05% optimize.opt_a.a_1 : 0.000977s : 1.09% optimize.opt_a.with_stream_mark : 0.000040s : 0.04% optimize.opt_a.recompute_prepare : 0.000018s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000185s : 0.21% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.02% optimize.opt_a.merge_send_recv : 0.000057s : 0.06% optimize.opt_a.auto_parallel : 0.000019s : 0.02% optimize.opt_a.parallel : 0.000029s : 0.03% optimize.opt_a.flash_sp : 0.000015s : 0.02% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000019s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000021s : 0.02% optimize.opt_a.a_after_grad : 0.000021s : 0.02% optimize.opt_a.renormalize : 0.001562s : 1.75% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000030s : 0.03% optimize.opt_a.cse : 0.000070s : 0.08% optimize.opt_a.a_3 : 0.000100s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000029s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000705s : 0.79% optimize.opt_b.b_1 : 0.000160s : 0.18% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000031s : 0.03% optimize.loop_unroll : 0.000461s : 0.52% optimize.opt_after_cconv.c_1 : 0.000031s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000027s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000042s : 0.05% optimize.tuple_transform.d_1 : 0.000062s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000051s : 0.06% optimize.cse_after_recomputation.cse : 0.000018s : 0.02% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000027s : 0.03% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000022s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000502s : 0.56% validate : 0.000050s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.047362s : 53.03% execute : 0.000010s : 0.01% Time group info: ------[substitution.] 0.000343 62 0.64% : 0.000002s : 3: substitution.elim_not_effective 2.28% : 0.000008s : 3: substitution.float_tuple_getitem_switch 0.52% : 0.000002s : 3: substitution.fold_const_symbol 1.73% : 0.000006s : 4: substitution.graph_param_transform 60.13% : 0.000206s : 8: substitution.inline 1.51% : 0.000005s : 6: substitution.j_node_and_user_rematch 1.82% : 0.000006s : 2: substitution.minmaximum_grad 1.85% : 0.000006s : 6: substitution.remove_not_recompute_node 1.21% : 0.000004s : 2: substitution.replace_old_param 3.09% : 0.000011s : 1: substitution.switch_simplify 5.27% : 0.000018s : 4: substitution.tuple_list_convert_item_index_to_positive 2.27% : 0.000008s : 4: substitution.tuple_list_get_item_const_eliminator 3.69% : 0.000013s : 4: substitution.tuple_list_get_item_depend_reorder 10.52% : 0.000036s : 8: substitution.tuple_list_get_item_eliminator 3.46% : 0.000012s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.034437 2 94.16% : 0.032426s : 1: type_inference.infer 5.84% : 0.002011s : 1: type_inference.specialize ------[replace.] 0.000102 11 58.78% : 0.000060s : 8: replace.inline 22.04% : 0.000022s : 1: replace.switch_simplify 19.18% : 0.000020s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000215 11 93.87% : 0.000201s : 8: match.inline 4.17% : 0.000009s : 1: match.switch_simplify 1.97% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000241 1438 1.02% : 0.000002s : 16: predicate.accumulaten_eliminater 1.12% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 8: predicate.addn_check_dump 0.92% : 0.000002s : 16: predicate.addn_zero_filter 0.92% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.24% : 0.000005s : 24: predicate.arithmetic_simplify 1.08% : 0.000003s : 16: predicate.cast_eliminate 0.54% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.14% : 0.000000s : 4: predicate.const_output_eliminate 0.46% : 0.000001s : 8: predicate.depend_value_elim 1.07% : 0.000003s : 16: predicate.dict_get_item_const_eliminator 1.13% : 0.000003s : 16: predicate.dict_get_item_eliminator 1.03% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.76% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.21% : 0.000001s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_depend_swap 1.63% : 0.000004s : 28: predicate.environ_get_eliminate 1.21% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.50% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.45% : 0.000006s : 26: predicate.float_depend_g_call 0.48% : 0.000001s : 8: predicate.float_environ_get_switch 0.73% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 4: predicate.fold_const_symbol 0.57% : 0.000001s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 6.03% : 0.000015s : 66: predicate.inline 0.81% : 0.000002s : 8: predicate.inline_without_move 0.32% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.66% : 0.000002s : 8: predicate.less_batch_normalization 1.64% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.46% : 0.000006s : 42: predicate.load_eliminater 0.87% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.79% : 0.000007s : 46: predicate.loop_unroll_before_grad 1.49% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 8: predicate.merge_addn 0.48% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.45% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 16: predicate.minmaximum_grad 1.19% : 0.000003s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 2.29% : 0.000006s : 26: predicate.partial_defer_inline 1.34% : 0.000003s : 22: predicate.partial_eliminate 0.88% : 0.000002s : 16: predicate.print_const_string_wrapper 0.51% : 0.000001s : 8: predicate.reduce_all_const_elim 1.29% : 0.000003s : 16: predicate.reduce_eliminate 2.35% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.30% : 0.000001s : 8: predicate.remove_not_recompute_node 1.19% : 0.000003s : 26: predicate.replace_applicator 0.34% : 0.000001s : 8: predicate.replace_old_param 0.17% : 0.000000s : 4: predicate.reset_defer_inline 1.06% : 0.000003s : 16: predicate.reshape_eliminate 0.58% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.46% : 0.000001s : 4: predicate.row_tensor_eliminate 0.76% : 0.000002s : 8: predicate.same_eliminate 0.35% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.88% : 0.000002s : 8: predicate.shard_identity_eliminate 0.60% : 0.000001s : 8: predicate.special_op_eliminate 0.60% : 0.000001s : 8: predicate.specialize_transform 0.84% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.87% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.64% : 0.000004s : 26: predicate.switch_defer_inline 2.18% : 0.000005s : 34: predicate.switch_layer_defer_inline 6.22% : 0.000015s : 86: predicate.switch_simplify 1.02% : 0.000002s : 16: predicate.tile_eliminate 1.00% : 0.000002s : 16: predicate.transpose_eliminate 1.71% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.66% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.59% : 0.000009s : 34: predicate.tuple_list_get_item_eliminator 1.54% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000006s : 32: predicate.tuple_list_set_item_eliminator 1.63% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.22% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 3.01% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 4: predicate.value_based_eliminate 0.64% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001737 23 58.59% : 0.001018s : 11: func_graph_cloner_run.FuncGraphClonerGraph 41.41% : 0.000719s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.108069 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.52% : 0.003803s : 1: add_attr 3.51% : 0.003790s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000055s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.18% : 0.000196s : 1: auto_monad 0.02% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.51% : 0.000555s : 1: bootstrap 0.03% : 0.000034s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.11% : 0.000120s : 1: event_method 0.02% : 0.000018s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000032s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.44% : 0.000471s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.66% : 0.000717s : 1: mutable_eliminate 0.03% : 0.000031s : 1: offloading_packed_experts 0.02% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000018s : 1: opt.transform.mutable_eliminate 1.40% : 0.001518s : 78: opt.transform.opt_a 0.03% : 0.000030s : 1: opt.transform.opt_after_cconv 0.03% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.13% : 0.000140s : 28: opt.transform.opt_b 0.06% : 0.000068s : 2: opt.transform.opt_trans_graph 0.04% : 0.000041s : 4: opt.transform.symbol_engine_opt 3.78% : 0.004087s : 1: opt_a 0.11% : 0.000117s : 1: opt_after_cconv 0.47% : 0.000513s : 1: opt_after_jit_grad 0.24% : 0.000261s : 1: opt_b 6.25% : 0.006751s : 1: optimize 0.02% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.03% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.05% : 0.000053s : 1: pre_auto_parallel 0.01% : 0.000012s : 1: py_interpret_to_execute 0.01% : 0.000010s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000046s : 1: remove_dup_value 0.84% : 0.000903s : 1: renormalize.infer 0.60% : 0.000647s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000033s : 1: rewriter_after_opt_a 0.33% : 0.000360s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000093s : 1: symbol_engine_optimizer 43.85% : 0.047392s : 1: task_emit 0.09% : 0.000098s : 1: tuple_transform 31.98% : 0.034565s : 1: type_inference 0.08% : 0.000084s : 1: validate TotalTime = 0.105345, [24] [bootstrap]: 0.00078752 [type_inference]: 0.0393546 [event_method]: 2.778e-05 [auto_monad]: 9.418e-05 [graph_reusing]: 7.48e-06 [inline]: 3.56999e-06 [add_attr]: 0.00619289, [1] [add_attr_with_inline]: 0.00617648, [1] [Cycle 1]: 0.00016457, [2] [tag_attr]: 3.153e-05 [meta_addattr_fg_expand]: 6.71999e-06 [parallel-infer-symbol]: 3.86999e-06 [pre_auto_parallel]: 5.119e-05 [insert-virtual-dataset]: 3.89002e-06 [parallel-infer-symbol-second]: 6.30011e-07 [dataset_repeat_opt]: 2.06e-06 [pipeline_split]: 2.99001e-06 [optimize]: 0.00709512, [53] [py_interpret_to_execute]: 7.68001e-06 [rewriter_before_opt_a]: 0.00029882 [opt_a]: 0.00431568, [2] [Cycle 1]: 0.00362966, [45] [expand_dump_flag]: 3.59002e-06 [switch_simplify]: 0.00010182 [loop_unroll]: 3.744e-05 [a_1]: 0.00080173 [with_stream_mark]: 2.399e-05 [recompute_prepare]: 1.162e-05 [updatestate_depend_eliminate]: 4.65001e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.39001e-06 [parameter_eliminate]: 2.32999e-06 [a_2]: 8.334e-05 [accelerated_algorithm]: 7.46999e-06 [shard]: 2.29001e-06 [meta_shard_fg_expand]: 3.27002e-06 [shard_inline]: 7.30003e-06 [merge_send_recv]: 1.164e-05 [auto_parallel]: 9.20999e-06 [parallel]: 7.612e-05 [flash_sp]: 1.191e-05 [merge_comm]: 4.28999e-06 [allreduce_fusion]: 3.85e-06 [matmul_add_comm_reduction]: 1.041e-05 [allreduce_slice_to_reducescatter]: 7.80012e-07 [virtual_shard_identity]: 9.00999e-06 [virtual_dataset]: 7.71999e-06 [get_grad_eliminate_]: 7.4e-06 [virtual_output]: 6.69001e-06 [merge_forward]: 4.37998e-06 [cell_reuse_recompute_pass]: 1.96e-06 [offload_activation]: 1.079e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.604e-05 [merge_recompute_call_nodes]: 1.76998e-06 [before_grad]: 1.171e-05 [set_forward_comm_id_for_comm_node_pass]: 4.16001e-06 [meta_fg_expand]: 3.16001e-06 [flash_sp_send_recv_attached]: 2.76e-06 [receive_attached]: 2.63e-06 [after_resolve]: 1.187e-05 [a_after_grad]: 9.46e-06 [renormalize]: 0.00183153 [add_forward_monad_depend]: 9.04e-06 [auto_monad_grad]: 2.94001e-06 [auto_monad_eliminator]: 2.079e-05 [cse]: 4.279e-05 [a_3]: 5.71e-05 [Cycle 2]: 0.00067219, [45] [expand_dump_flag]: 2.31e-06 [switch_simplify]: 9.41003e-06 [loop_unroll]: 6.62002e-06 [a_1]: 0.00010819 [with_stream_mark]: 1.947e-05 [recompute_prepare]: 6.27001e-06 [updatestate_depend_eliminate]: 4.00998e-06 [updatestate_assign_eliminate]: 2.44001e-06 [updatestate_loads_eliminate]: 2.91e-06 [parameter_eliminate]: 1.69e-06 [a_2]: 6.404e-05 [accelerated_algorithm]: 6.17999e-06 [shard]: 2.27999e-06 [meta_shard_fg_expand]: 2.24999e-06 [shard_inline]: 5.22999e-06 [merge_send_recv]: 7.48e-06 [auto_parallel]: 9.34e-06 [parallel]: 8.80001e-06 [flash_sp]: 4.11001e-06 [merge_comm]: 4.21001e-06 [allreduce_fusion]: 3.46999e-06 [matmul_add_comm_reduction]: 7.63999e-06 [allreduce_slice_to_reducescatter]: 7.49977e-07 [virtual_shard_identity]: 7.19001e-06 [virtual_dataset]: 5.63002e-06 [get_grad_eliminate_]: 6.68998e-06 [virtual_output]: 5.61003e-06 [merge_forward]: 3.98001e-06 [cell_reuse_recompute_pass]: 2.88e-06 [offload_activation]: 9.59e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.533e-05 [merge_recompute_call_nodes]: 1.15001e-06 [before_grad]: 1.025e-05 [set_forward_comm_id_for_comm_node_pass]: 3.75e-06 [meta_fg_expand]: 2.66e-06 [flash_sp_send_recv_attached]: 1.69e-06 [receive_attached]: 1.76998e-06 [after_resolve]: 1.019e-05 [a_after_grad]: 8.42e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.66998e-06 [auto_monad_grad]: 1.07e-06 [auto_monad_eliminator]: 8.67998e-06 [cse]: 1.834e-05 [a_3]: 3.255e-05 [py_interpret_to_execute_after_opt_a]: 7.5e-06 [slice_cell_reuse_recomputed_activation]: 2.14999e-06 [rewriter_after_opt_a]: 2.442e-05 [convert_after_rewriter]: 1.24998e-06 [order_py_execute_after_rewriter]: 1.13001e-06 [mutable_eliminate]: 0.00084201 [opt_b]: 0.00023918, [1] [Cycle 1]: 0.00022993, [7] [b_1]: 0.00014041 [b_2]: 7.85e-06 [updatestate_depend_eliminate]: 7.91001e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.57001e-06 [renormalize]: 8.00006e-07 [cse]: 3.076e-05 [optimize_parallel_all_gather_comm]: 1.966e-05 [overlap_param_gather]: 2.39999e-06 [cconv]: 3.399e-05 [loop_unroll]: 0.00052255 [opt_after_cconv]: 0.00011898, [1] [Cycle 1]: 0.00011194, [7] [c_1]: 2.986e-05 [parameter_eliminate]: 3.71001e-06 [updatestate_depend_eliminate]: 6.28e-06 [updatestate_assign_eliminate]: 2.69999e-06 [updatestate_loads_eliminate]: 2.59999e-06 [cse]: 2.642e-05 [renormalize]: 6.00005e-07 [remove_dup_value]: 1.948e-05 [tuple_transform]: 8.314e-05, [1] [Cycle 1]: 7.772e-05, [4] [d_1]: 4.882e-05 [none_parameter_eliminate]: 2.04e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 6.94999e-06 [partial_unused_args_eliminate]: 2.11e-06 [add_recomputation]: 5.804e-05 [cse_after_recomputation]: 2.694e-05, [1] [Cycle 1]: 2.174e-05, [1] [cse]: 1.462e-05 [environ_conv]: 1.171e-05 [swap_dp_allreduce_reducescatter]: 7.15998e-06 [bias_add_comm_swap]: 3.21999e-06 [label_micro_interleaved_index]: 5.53002e-06 [label_fine_grained_interleaved_index]: 2.88e-06 [merge_cast_opt]: 1.42999e-06 [slice_recompute_activation]: 2.30002e-06 [micro_interleaved_order_control]: 2.41998e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 1.00999e-06 [remove_cast_before_assign_add]: 1.24e-06 [full_micro_interleaved_order_control]: 2.40002e-06 [reorder_send_recv_between_fp_bp]: 2.78e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.17999e-06 [interleave_parallel_branches]: 1.34e-06 [overlap_opt_shard_in_pipeline]: 1.39e-06 [overlap_opt_shard_grad_in_pipeline]: 2.01998e-06 [control_data_broadcast_order]: 1.686e-05 [grouped_pairwise_exchange_alltoall]: 1.85001e-06 [offloading_packed_experts]: 4.60999e-06 [overlap_recompute_and_grad_model_parallel]: 5.69e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.40001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34003e-06 [overlap_recompute_comm]: 2.22999e-06 [overlap_grad_ring_attention]: 5.18002e-06 [overlap_grad_flash_sp]: 2.452e-05 [begin_end_overlap_inline]: 5.29981e-07 [split_matmul_comm_elemetwise]: 2.42001e-06 [split_layernorm_comm]: 1.82999e-06 [handle_group_info]: 1.35999e-06 [symbol_engine_optimizer]: 9.248e-05, [1] [Cycle 1]: 8.812e-05, [6] [build]: 1.586e-05 [elim_shapecalc]: 1.18e-05 [elim_not_effective]: 1.369e-05 [opt_reshape]: 7.30998e-06 [fold_const_symbol]: 1.059e-05 [renormalize]: 3.30008e-07 [detach_backward]: 2.39999e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 2.122e-05 [get_jit_bprop_graph]: 2.02999e-06 [rewriter_after_jit_bprop_graph]: 5.92001e-06 [opt_after_jit_grad]: 0.0005866 [validate]: 0.00017022 [backend_pass]: 9.79984e-07 [task_emit]: 0.0506063 [execute]: 1.019e-05 Sums bootstrap : 0.000788s : 0.80% type_inference : 0.039355s : 40.20% event_method : 0.000028s : 0.03% auto_monad : 0.000094s : 0.10% graph_reusing : 0.000007s : 0.01% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000032s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000051s : 0.05% insert-virtual-dataset : 0.000004s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000003s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.01% optimize.rewriter_before_opt_a : 0.000299s : 0.31% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000111s : 0.11% optimize.opt_a.loop_unroll : 0.000044s : 0.05% optimize.opt_a.a_1 : 0.000910s : 0.93% optimize.opt_a.with_stream_mark : 0.000043s : 0.04% optimize.opt_a.recompute_prepare : 0.000018s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000147s : 0.15% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.02% optimize.opt_a.auto_parallel : 0.000019s : 0.02% optimize.opt_a.parallel : 0.000085s : 0.09% optimize.opt_a.flash_sp : 0.000016s : 0.02% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000022s : 0.02% optimize.opt_a.a_after_grad : 0.000018s : 0.02% optimize.opt_a.renormalize : 0.001832s : 1.87% optimize.opt_a.add_forward_monad_depend : 0.000011s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000029s : 0.03% optimize.opt_a.cse : 0.000061s : 0.06% optimize.opt_a.a_3 : 0.000090s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000024s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000842s : 0.86% optimize.opt_b.b_1 : 0.000140s : 0.14% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.03% optimize.loop_unroll : 0.000523s : 0.53% optimize.opt_after_cconv.c_1 : 0.000030s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000026s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.02% optimize.tuple_transform.d_1 : 0.000049s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000058s : 0.06% optimize.cse_after_recomputation.cse : 0.000015s : 0.01% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000025s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000016s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000021s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000587s : 0.60% validate : 0.000170s : 0.17% backend_pass : 0.000001s : 0.00% task_emit : 0.050606s : 51.69% execute : 0.000010s : 0.01% Time group info: ------[substitution.] 0.000252 26 1.33% : 0.000003s : 2: substitution.elim_not_effective 0.55% : 0.000001s : 2: substitution.fold_const_symbol 2.32% : 0.000006s : 3: substitution.graph_param_transform 79.79% : 0.000201s : 6: substitution.inline 1.98% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.75% : 0.000007s : 4: substitution.remove_not_recompute_node 2.21% : 0.000006s : 2: substitution.replace_old_param 3.32% : 0.000008s : 1: substitution.switch_simplify 5.77% : 0.000015s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.039255 2 94.90% : 0.037255s : 1: type_inference.infer 5.10% : 0.002000s : 1: type_inference.specialize ------[replace.] 0.000104 9 55.66% : 0.000058s : 6: replace.inline 24.06% : 0.000025s : 1: replace.switch_simplify 20.29% : 0.000021s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000218 9 90.72% : 0.000197s : 6: match.inline 3.53% : 0.000008s : 1: match.switch_simplify 5.75% : 0.000013s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000192 1092 1.00% : 0.000002s : 12: predicate.accumulaten_eliminater 1.18% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.42% : 0.000001s : 6: predicate.addn_check_dump 0.97% : 0.000002s : 12: predicate.addn_zero_filter 0.89% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.70% : 0.000005s : 18: predicate.arithmetic_simplify 1.01% : 0.000002s : 12: predicate.cast_eliminate 0.55% : 0.000001s : 6: predicate.check_bprop_eliminate 0.44% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.74% : 0.000001s : 6: predicate.depend_value_elim 0.89% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.52% : 0.000003s : 12: predicate.dict_get_item_eliminator 1.05% : 0.000002s : 12: predicate.dict_set_item_eliminator 1.13% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 3: predicate.elim_not_effective 0.42% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.07% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.13% : 0.000002s : 15: predicate.environ_get_depend_swap 1.76% : 0.000003s : 21: predicate.environ_get_eliminate 1.04% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.46% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.47% : 0.000005s : 20: predicate.float_depend_g_call 0.48% : 0.000001s : 6: predicate.float_environ_get_switch 0.62% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.57% : 0.000001s : 6: predicate.get_grad_eliminate 0.17% : 0.000000s : 3: predicate.graph_param_transform 0.47% : 0.000001s : 6: predicate.incorporate_call 0.39% : 0.000001s : 6: predicate.incorporate_call_switch 5.67% : 0.000011s : 50: predicate.inline 0.54% : 0.000001s : 6: predicate.inline_without_move 0.26% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.79% : 0.000002s : 6: predicate.less_batch_normalization 1.85% : 0.000004s : 20: predicate.list_to_tuple_eliminator_ 2.25% : 0.000004s : 32: predicate.load_eliminater 1.10% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.72% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.59% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 6: predicate.merge_addn 0.52% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 12: predicate.minmaximum_grad 1.52% : 0.000003s : 3: predicate.mutable_eliminate 0.35% : 0.000001s : 3: predicate.opt_reshape 0.32% : 0.000001s : 3: predicate.parallel_virtual_node 2.05% : 0.000004s : 20: predicate.partial_defer_inline 1.30% : 0.000002s : 17: predicate.partial_eliminate 0.93% : 0.000002s : 12: predicate.print_const_string_wrapper 0.55% : 0.000001s : 6: predicate.reduce_all_const_elim 1.34% : 0.000003s : 12: predicate.reduce_eliminate 2.42% : 0.000005s : 32: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 6: predicate.remove_not_recompute_node 1.23% : 0.000002s : 20: predicate.replace_applicator 0.34% : 0.000001s : 6: predicate.replace_old_param 0.22% : 0.000000s : 3: predicate.reset_defer_inline 0.99% : 0.000002s : 12: predicate.reshape_eliminate 0.56% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.54% : 0.000001s : 3: predicate.row_tensor_eliminate 0.68% : 0.000001s : 6: predicate.same_eliminate 0.31% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.91% : 0.000002s : 6: predicate.shard_identity_eliminate 0.83% : 0.000002s : 6: predicate.special_op_eliminate 0.54% : 0.000001s : 6: predicate.specialize_transform 0.96% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.74% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.26% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.53% : 0.000003s : 20: predicate.switch_defer_inline 1.98% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.98% : 0.000011s : 68: predicate.switch_simplify 0.93% : 0.000002s : 12: predicate.tile_eliminate 0.94% : 0.000002s : 12: predicate.transpose_eliminate 1.64% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.79% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.28% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000005s : 24: predicate.tuple_list_set_item_eliminator 1.62% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.12% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.90% : 0.000006s : 38: predicate.updatestate_useless_node_eliminater 0.40% : 0.000001s : 3: predicate.value_based_eliminate 0.59% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 6: predicate.virtual_output_eliminate 0.21% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002161 16 59.43% : 0.001284s : 8: func_graph_cloner_run.FuncGraphClonerGraph 40.57% : 0.000877s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.122039 196 0.00% : 0.000004s : 1: ForceFp32Comm 5.08% : 0.006200s : 1: add_attr 5.06% : 0.006181s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000062s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.08% : 0.000100s : 1: auto_monad 0.02% : 0.000026s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.69% : 0.000839s : 1: bootstrap 0.03% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.03% : 0.000036s : 1: event_method 0.01% : 0.000018s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.44% : 0.000534s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000008s : 1: micro_interleaved_order_control 0.70% : 0.000854s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000018s : 1: opt.transform.mutable_eliminate 1.14% : 0.001397s : 78: opt.transform.opt_a 0.02% : 0.000029s : 1: opt.transform.opt_after_cconv 0.03% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.09% : 0.000115s : 28: opt.transform.opt_b 0.04% : 0.000054s : 2: opt.transform.opt_trans_graph 0.03% : 0.000040s : 4: opt.transform.symbol_engine_opt 3.54% : 0.004320s : 1: opt_a 0.10% : 0.000124s : 1: opt_after_cconv 0.49% : 0.000602s : 1: opt_after_jit_grad 0.20% : 0.000243s : 1: opt_b 5.82% : 0.007102s : 1: optimize 0.02% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.05% : 0.000055s : 1: pre_auto_parallel 0.01% : 0.000011s : 1: py_interpret_to_execute 0.01% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000005s : 1: remove_cast_before_assign_add 0.02% : 0.000023s : 1: remove_dup_value 0.79% : 0.000958s : 1: renormalize.infer 0.71% : 0.000862s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000028s : 1: rewriter_after_opt_a 0.25% : 0.000308s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000095s : 1: symbol_engine_optimizer 41.49% : 0.050634s : 1: task_emit 0.07% : 0.000086s : 1: tuple_transform 32.27% : 0.039388s : 1: type_inference 0.19% : 0.000227s : 1: validate TotalTime = 0.10491, [24] [bootstrap]: 0.00090747 [type_inference]: 0.0399301 [event_method]: 0.00010852 [auto_monad]: 0.00017876 [graph_reusing]: 1.25e-05 [inline]: 4.04002e-06 [add_attr]: 0.00567537, [1] [add_attr_with_inline]: 0.00565818, [1] [Cycle 1]: 9.18e-05, [2] [tag_attr]: 3.636e-05 [meta_addattr_fg_expand]: 7.56999e-06 [parallel-infer-symbol]: 4.51002e-06 [pre_auto_parallel]: 5.238e-05 [insert-virtual-dataset]: 3.14001e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 2.36998e-06 [pipeline_split]: 1.72999e-06 [optimize]: 0.00737355, [53] [py_interpret_to_execute]: 5.89e-06 [rewriter_before_opt_a]: 0.00028086 [opt_a]: 0.00432752, [2] [Cycle 1]: 0.00355165, [45] [expand_dump_flag]: 3.45e-06 [switch_simplify]: 9.901e-05 [loop_unroll]: 3.899e-05 [a_1]: 0.00080002 [with_stream_mark]: 2.098e-05 [recompute_prepare]: 9.77001e-06 [updatestate_depend_eliminate]: 5.07e-06 [updatestate_assign_eliminate]: 4.39002e-06 [updatestate_loads_eliminate]: 3.82002e-06 [parameter_eliminate]: 1.85001e-06 [a_2]: 0.00012174 [accelerated_algorithm]: 8.54e-06 [shard]: 1.86e-06 [meta_shard_fg_expand]: 2.60997e-06 [shard_inline]: 6.73e-06 [merge_send_recv]: 1.057e-05 [auto_parallel]: 9.09e-06 [parallel]: 5.6e-05 [flash_sp]: 1.016e-05 [merge_comm]: 4.65001e-06 [allreduce_fusion]: 4.45e-06 [matmul_add_comm_reduction]: 1.134e-05 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 9.86003e-06 [virtual_dataset]: 7.4e-06 [get_grad_eliminate_]: 6.89001e-06 [virtual_output]: 7.1e-06 [merge_forward]: 4.60999e-06 [cell_reuse_recompute_pass]: 1.47999e-06 [offload_activation]: 1.072e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.456e-05 [merge_recompute_call_nodes]: 1.76e-06 [before_grad]: 1.14e-05 [set_forward_comm_id_for_comm_node_pass]: 4.28001e-06 [meta_fg_expand]: 3.52002e-06 [flash_sp_send_recv_attached]: 3.10998e-06 [receive_attached]: 3.14999e-06 [after_resolve]: 1.077e-05 [a_after_grad]: 1.049e-05 [renormalize]: 0.00180666 [add_forward_monad_depend]: 8.62998e-06 [auto_monad_grad]: 2.68e-06 [auto_monad_eliminator]: 2.202e-05 [cse]: 4.256e-05 [a_3]: 5.843e-05 [Cycle 2]: 0.00076145, [45] [expand_dump_flag]: 2.05002e-06 [switch_simplify]: 8.68001e-06 [loop_unroll]: 7.55998e-06 [a_1]: 0.00017095 [with_stream_mark]: 1.891e-05 [recompute_prepare]: 7.36999e-06 [updatestate_depend_eliminate]: 4.92999e-06 [updatestate_assign_eliminate]: 3.58e-06 [updatestate_loads_eliminate]: 3.72998e-06 [parameter_eliminate]: 1.19998e-06 [a_2]: 8.875e-05 [accelerated_algorithm]: 7.28e-06 [shard]: 1.59e-06 [meta_shard_fg_expand]: 1.88002e-06 [shard_inline]: 6.84999e-06 [merge_send_recv]: 8.05999e-06 [auto_parallel]: 1.004e-05 [parallel]: 6.84001e-06 [flash_sp]: 3.38999e-06 [merge_comm]: 4.42998e-06 [allreduce_fusion]: 4.65999e-06 [matmul_add_comm_reduction]: 9.10999e-06 [allreduce_slice_to_reducescatter]: 1.39998e-06 [virtual_shard_identity]: 9.02999e-06 [virtual_dataset]: 7.46001e-06 [get_grad_eliminate_]: 7.31001e-06 [virtual_output]: 7.41001e-06 [merge_forward]: 4.47998e-06 [cell_reuse_recompute_pass]: 2.98998e-06 [offload_activation]: 9.34e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.224e-05 [merge_recompute_call_nodes]: 9.80013e-07 [before_grad]: 1.073e-05 [set_forward_comm_id_for_comm_node_pass]: 4.87998e-06 [meta_fg_expand]: 2.94999e-06 [flash_sp_send_recv_attached]: 1.54e-06 [receive_attached]: 2.11e-06 [after_resolve]: 9.79e-06 [a_after_grad]: 9.92001e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.17e-06 [auto_monad_grad]: 1.47999e-06 [auto_monad_eliminator]: 9.99001e-06 [cse]: 2.204e-05 [a_3]: 3.963e-05 [py_interpret_to_execute_after_opt_a]: 7.34002e-06 [slice_cell_reuse_recomputed_activation]: 1.94e-06 [rewriter_after_opt_a]: 2.942e-05 [convert_after_rewriter]: 1.81e-06 [order_py_execute_after_rewriter]: 1.20001e-06 [mutable_eliminate]: 0.00094999 [opt_b]: 0.00033489, [1] [Cycle 1]: 0.00032604, [7] [b_1]: 0.00021613 [b_2]: 1.108e-05 [updatestate_depend_eliminate]: 1.056e-05 [updatestate_assign_eliminate]: 3.97e-06 [updatestate_loads_eliminate]: 3.79002e-06 [renormalize]: 1.25001e-06 [cse]: 3.609e-05 [optimize_parallel_all_gather_comm]: 2.349e-05 [overlap_param_gather]: 2.44001e-06 [cconv]: 3.428e-05 [loop_unroll]: 0.00056452 [opt_after_cconv]: 0.00012391, [1] [Cycle 1]: 0.00011648, [7] [c_1]: 3.39e-05 [parameter_eliminate]: 4.95001e-06 [updatestate_depend_eliminate]: 6.47001e-06 [updatestate_assign_eliminate]: 3.36999e-06 [updatestate_loads_eliminate]: 3.23e-06 [cse]: 2.955e-05 [renormalize]: 9.10019e-07 [remove_dup_value]: 2.148e-05 [tuple_transform]: 0.0001051, [1] [Cycle 1]: 9.97e-05, [4] [d_1]: 6.828e-05 [none_parameter_eliminate]: 1.59998e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 9.60001e-06 [partial_unused_args_eliminate]: 2.43998e-06 [add_recomputation]: 6.015e-05 [cse_after_recomputation]: 2.883e-05, [1] [Cycle 1]: 2.414e-05, [1] [cse]: 1.821e-05 [environ_conv]: 1.242e-05 [swap_dp_allreduce_reducescatter]: 6.36998e-06 [bias_add_comm_swap]: 3.7e-06 [label_micro_interleaved_index]: 5.40999e-06 [label_fine_grained_interleaved_index]: 2.57001e-06 [merge_cast_opt]: 1.50999e-06 [slice_recompute_activation]: 2.44999e-06 [micro_interleaved_order_control]: 2.69999e-06 [assign_add_opt]: 1.50999e-06 [ForceFp32Comm]: 1.19e-06 [remove_cast_before_assign_add]: 1.08001e-06 [full_micro_interleaved_order_control]: 2.27999e-06 [reorder_send_recv_between_fp_bp]: 2.74001e-06 [comm_op_add_attrs]: 1.07e-06 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.25001e-06 [interleave_parallel_branches]: 1.33002e-06 [overlap_opt_shard_in_pipeline]: 1.53002e-06 [overlap_opt_shard_grad_in_pipeline]: 2.02999e-06 [control_data_broadcast_order]: 1.545e-05 [grouped_pairwise_exchange_alltoall]: 1.62001e-06 [offloading_packed_experts]: 5.37001e-06 [overlap_recompute_and_grad_model_parallel]: 6.43e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.47001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.75002e-06 [overlap_grad_ring_attention]: 5.40999e-06 [overlap_grad_flash_sp]: 2.436e-05 [begin_end_overlap_inline]: 6.30011e-07 [split_matmul_comm_elemetwise]: 2.27001e-06 [split_layernorm_comm]: 2.02999e-06 [handle_group_info]: 1.39e-06 [symbol_engine_optimizer]: 8.985e-05, [1] [Cycle 1]: 8.428e-05, [6] [build]: 1.224e-05 [elim_shapecalc]: 1.132e-05 [elim_not_effective]: 1.392e-05 [opt_reshape]: 7.72998e-06 [fold_const_symbol]: 1.143e-05 [renormalize]: 2.30008e-07 [detach_backward]: 2.86999e-06 [pipeline_parallel_scheduler]: 1.47999e-06 [auto_monad_reorder]: 2.228e-05 [get_jit_bprop_graph]: 2.27999e-06 [rewriter_after_jit_bprop_graph]: 6.09001e-06 [opt_after_jit_grad]: 0.00054206 [validate]: 0.00019934 [backend_pass]: 1.02998e-06 [task_emit]: 0.0495583 [execute]: 9.72999e-06 Sums bootstrap : 0.000907s : 0.93% type_inference : 0.039930s : 40.71% event_method : 0.000109s : 0.11% auto_monad : 0.000179s : 0.18% graph_reusing : 0.000012s : 0.01% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000036s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.01% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000052s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.01% optimize.rewriter_before_opt_a : 0.000281s : 0.29% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000108s : 0.11% optimize.opt_a.loop_unroll : 0.000047s : 0.05% optimize.opt_a.a_1 : 0.000971s : 0.99% optimize.opt_a.with_stream_mark : 0.000040s : 0.04% optimize.opt_a.recompute_prepare : 0.000017s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000210s : 0.21% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.02% optimize.opt_a.auto_parallel : 0.000019s : 0.02% optimize.opt_a.parallel : 0.000063s : 0.06% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.02% optimize.opt_a.virtual_dataset : 0.000015s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000015s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000021s : 0.02% optimize.opt_a.a_after_grad : 0.000020s : 0.02% optimize.opt_a.renormalize : 0.001807s : 1.84% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.03% optimize.opt_a.cse : 0.000065s : 0.07% optimize.opt_a.a_3 : 0.000098s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000029s : 0.03% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000950s : 0.97% optimize.opt_b.b_1 : 0.000216s : 0.22% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000036s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000023s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.03% optimize.loop_unroll : 0.000565s : 0.58% optimize.opt_after_cconv.c_1 : 0.000034s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000030s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000021s : 0.02% optimize.tuple_transform.d_1 : 0.000068s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000060s : 0.06% optimize.cse_after_recomputation.cse : 0.000018s : 0.02% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000024s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000022s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000542s : 0.55% validate : 0.000199s : 0.20% backend_pass : 0.000001s : 0.00% task_emit : 0.049558s : 50.53% execute : 0.000010s : 0.01% Time group info: ------[substitution.] 0.000346 62 0.65% : 0.000002s : 3: substitution.elim_not_effective 2.31% : 0.000008s : 3: substitution.float_tuple_getitem_switch 0.51% : 0.000002s : 3: substitution.fold_const_symbol 2.14% : 0.000007s : 4: substitution.graph_param_transform 60.66% : 0.000210s : 8: substitution.inline 1.42% : 0.000005s : 6: substitution.j_node_and_user_rematch 1.73% : 0.000006s : 2: substitution.minmaximum_grad 1.70% : 0.000006s : 6: substitution.remove_not_recompute_node 1.05% : 0.000004s : 2: substitution.replace_old_param 2.53% : 0.000009s : 1: substitution.switch_simplify 5.23% : 0.000018s : 4: substitution.tuple_list_convert_item_index_to_positive 2.35% : 0.000008s : 4: substitution.tuple_list_get_item_const_eliminator 3.54% : 0.000012s : 4: substitution.tuple_list_get_item_depend_reorder 10.87% : 0.000038s : 8: substitution.tuple_list_get_item_eliminator 3.31% : 0.000011s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.039837 2 94.79% : 0.037759s : 1: type_inference.infer 5.21% : 0.002077s : 1: type_inference.specialize ------[replace.] 0.000106 11 61.24% : 0.000065s : 8: replace.inline 21.63% : 0.000023s : 1: replace.switch_simplify 17.13% : 0.000018s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000218 11 94.45% : 0.000206s : 8: match.inline 3.64% : 0.000008s : 1: match.switch_simplify 1.91% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000248 1438 0.92% : 0.000002s : 16: predicate.accumulaten_eliminater 0.71% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.41% : 0.000001s : 8: predicate.addn_check_dump 1.01% : 0.000003s : 16: predicate.addn_zero_filter 0.95% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.16% : 0.000005s : 24: predicate.arithmetic_simplify 1.01% : 0.000003s : 16: predicate.cast_eliminate 0.58% : 0.000001s : 8: predicate.check_bprop_eliminate 0.43% : 0.000001s : 8: predicate.compare_switch_simplify 0.14% : 0.000000s : 4: predicate.const_output_eliminate 0.46% : 0.000001s : 8: predicate.depend_value_elim 0.93% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.36% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.15% : 0.000000s : 4: predicate.elim_not_effective 0.51% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.14% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.04% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_depend_swap 1.70% : 0.000004s : 28: predicate.environ_get_eliminate 1.07% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.53% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.46% : 0.000006s : 26: predicate.float_depend_g_call 0.64% : 0.000002s : 8: predicate.float_environ_get_switch 0.87% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 4: predicate.fold_const_symbol 0.89% : 0.000002s : 8: predicate.get_grad_eliminate 0.25% : 0.000001s : 4: predicate.graph_param_transform 0.48% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 5.64% : 0.000014s : 66: predicate.inline 0.61% : 0.000002s : 8: predicate.inline_without_move 0.24% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 8: predicate.less_batch_normalization 1.64% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.30% : 0.000006s : 42: predicate.load_eliminater 1.11% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.93% : 0.000007s : 46: predicate.loop_unroll_before_grad 1.74% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.48% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 16: predicate.minmaximum_grad 0.93% : 0.000002s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 1.94% : 0.000005s : 26: predicate.partial_defer_inline 1.30% : 0.000003s : 22: predicate.partial_eliminate 0.95% : 0.000002s : 16: predicate.print_const_string_wrapper 0.49% : 0.000001s : 8: predicate.reduce_all_const_elim 1.43% : 0.000004s : 16: predicate.reduce_eliminate 2.30% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000001s : 8: predicate.remove_not_recompute_node 1.09% : 0.000003s : 26: predicate.replace_applicator 0.31% : 0.000001s : 8: predicate.replace_old_param 0.22% : 0.000001s : 4: predicate.reset_defer_inline 1.32% : 0.000003s : 16: predicate.reshape_eliminate 0.51% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 4: predicate.row_tensor_eliminate 0.97% : 0.000002s : 8: predicate.same_eliminate 0.35% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.03% : 0.000003s : 8: predicate.shard_identity_eliminate 0.85% : 0.000002s : 8: predicate.special_op_eliminate 0.59% : 0.000001s : 8: predicate.specialize_transform 0.96% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.56% : 0.000004s : 26: predicate.switch_defer_inline 1.99% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.59% : 0.000014s : 86: predicate.switch_simplify 0.97% : 0.000002s : 16: predicate.tile_eliminate 1.06% : 0.000003s : 16: predicate.transpose_eliminate 1.99% : 0.000005s : 24: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.59% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.70% : 0.000009s : 34: predicate.tuple_list_get_item_eliminator 1.80% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.47% : 0.000006s : 32: predicate.tuple_list_set_item_eliminator 1.58% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.28% : 0.000006s : 42: predicate.updatestate_pure_node_eliminater 2.80% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.71% : 0.000002s : 8: predicate.virtual_output_eliminate 0.26% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001944 23 62.61% : 0.001217s : 11: func_graph_cloner_run.FuncGraphClonerGraph 37.39% : 0.000727s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.121546 196 0.00% : 0.000004s : 1: ForceFp32Comm 4.67% : 0.005682s : 1: add_attr 4.66% : 0.005663s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000064s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.16% : 0.000189s : 1: auto_monad 0.02% : 0.000027s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.78% : 0.000944s : 1: bootstrap 0.03% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.03% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000016s : 1: environ_conv 0.10% : 0.000120s : 1: event_method 0.01% : 0.000017s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000017s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.47% : 0.000576s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.79% : 0.000965s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.02% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000021s : 1: opt.transform.mutable_eliminate 1.25% : 0.001518s : 78: opt.transform.opt_a 0.03% : 0.000032s : 1: opt.transform.opt_after_cconv 0.02% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000191s : 28: opt.transform.opt_b 0.06% : 0.000075s : 2: opt.transform.opt_trans_graph 0.03% : 0.000040s : 4: opt.transform.symbol_engine_opt 3.56% : 0.004332s : 1: opt_a 0.11% : 0.000128s : 1: opt_after_cconv 0.46% : 0.000553s : 1: opt_after_jit_grad 0.28% : 0.000339s : 1: opt_b 6.07% : 0.007380s : 1: optimize 0.02% : 0.000027s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000057s : 1: pre_auto_parallel 0.01% : 0.000010s : 1: py_interpret_to_execute 0.01% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000025s : 1: remove_dup_value 0.84% : 0.001024s : 1: renormalize.infer 0.64% : 0.000773s : 1: renormalize.specialize 0.01% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000033s : 1: rewriter_after_opt_a 0.24% : 0.000287s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000093s : 1: symbol_engine_optimizer 40.80% : 0.049588s : 1: task_emit 0.09% : 0.000108s : 1: tuple_transform 32.87% : 0.039956s : 1: type_inference 0.20% : 0.000249s : 1: validate [WARNING] CORE(87365,ffffbf434f30,python3.9):2026-01-29-17:52:02.606.809 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph2 TotalTime = 0.0886356, [24] [bootstrap]: 0.0006383 [type_inference]: 0.0300514 [event_method]: 2.521e-05 [auto_monad]: 8.675e-05 [graph_reusing]: 6.34999e-06 [inline]: 2.88998e-06 [add_attr]: 0.00386359, [1] [add_attr_with_inline]: 0.00385054, [1] [Cycle 1]: 7.279e-05, [2] [tag_attr]: 2.697e-05 [meta_addattr_fg_expand]: 7e-06 [parallel-infer-symbol]: 3.50998e-06 [pre_auto_parallel]: 4.399e-05 [insert-virtual-dataset]: 2.71999e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 2.16e-06 [pipeline_split]: 1.55999e-06 [optimize]: 0.00641326, [53] [py_interpret_to_execute]: 6.88e-06 [rewriter_before_opt_a]: 0.00027042 [opt_a]: 0.00396283, [2] [Cycle 1]: 0.00332289, [45] [expand_dump_flag]: 3.84002e-06 [switch_simplify]: 8.48e-05 [loop_unroll]: 3.952e-05 [a_1]: 0.00078228 [with_stream_mark]: 2.221e-05 [recompute_prepare]: 1.007e-05 [updatestate_depend_eliminate]: 4.42e-06 [updatestate_assign_eliminate]: 4.29997e-06 [updatestate_loads_eliminate]: 3.32002e-06 [parameter_eliminate]: 2.35002e-06 [a_2]: 8.288e-05 [accelerated_algorithm]: 7.31999e-06 [shard]: 1.99e-06 [meta_shard_fg_expand]: 2.98e-06 [shard_inline]: 6.02999e-06 [merge_send_recv]: 9.67999e-06 [auto_parallel]: 7.59002e-06 [parallel]: 2.132e-05 [flash_sp]: 1.004e-05 [merge_comm]: 4.31002e-06 [allreduce_fusion]: 3.7e-06 [matmul_add_comm_reduction]: 1.051e-05 [allreduce_slice_to_reducescatter]: 7.10017e-07 [virtual_shard_identity]: 7.88001e-06 [virtual_dataset]: 8.35999e-06 [get_grad_eliminate_]: 6.78e-06 [virtual_output]: 6.31998e-06 [merge_forward]: 4.25e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 1.052e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.536e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 1.156e-05 [set_forward_comm_id_for_comm_node_pass]: 4.37e-06 [meta_fg_expand]: 3.19001e-06 [flash_sp_send_recv_attached]: 2.93e-06 [receive_attached]: 2.64001e-06 [after_resolve]: 1.173e-05 [a_after_grad]: 1.102e-05 [renormalize]: 0.00167086 [add_forward_monad_depend]: 6.72002e-06 [auto_monad_grad]: 2.12001e-06 [auto_monad_eliminator]: 1.868e-05 [cse]: 4.228e-05 [a_3]: 6.565e-05 [Cycle 2]: 0.0006268, [45] [expand_dump_flag]: 2.26998e-06 [switch_simplify]: 8.37998e-06 [loop_unroll]: 6.24001e-06 [a_1]: 0.00010966 [with_stream_mark]: 1.645e-05 [recompute_prepare]: 5.84e-06 [updatestate_depend_eliminate]: 3.85998e-06 [updatestate_assign_eliminate]: 2.67001e-06 [updatestate_loads_eliminate]: 2.56998e-06 [parameter_eliminate]: 1.16002e-06 [a_2]: 6.488e-05 [accelerated_algorithm]: 6.04999e-06 [shard]: 1.69e-06 [meta_shard_fg_expand]: 2.17999e-06 [shard_inline]: 5.59e-06 [merge_send_recv]: 6.07999e-06 [auto_parallel]: 8.05e-06 [parallel]: 7.48999e-06 [flash_sp]: 4.03001e-06 [merge_comm]: 3.26999e-06 [allreduce_fusion]: 3.14999e-06 [matmul_add_comm_reduction]: 7.11999e-06 [allreduce_slice_to_reducescatter]: 8.50006e-07 [virtual_shard_identity]: 7.10998e-06 [virtual_dataset]: 5.64e-06 [get_grad_eliminate_]: 5.35999e-06 [virtual_output]: 5.35999e-06 [merge_forward]: 3.69002e-06 [cell_reuse_recompute_pass]: 1.78002e-06 [offload_activation]: 8.3e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.523e-05 [merge_recompute_call_nodes]: 1.18001e-06 [before_grad]: 9.49e-06 [set_forward_comm_id_for_comm_node_pass]: 3.3e-06 [meta_fg_expand]: 2.96999e-06 [flash_sp_send_recv_attached]: 1.05001e-06 [receive_attached]: 2.44999e-06 [after_resolve]: 8.03001e-06 [a_after_grad]: 8.71997e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.25999e-06 [auto_monad_grad]: 1.17e-06 [auto_monad_eliminator]: 7.75e-06 [cse]: 1.885e-05 [a_3]: 3.257e-05 [py_interpret_to_execute_after_opt_a]: 8.1e-06 [slice_cell_reuse_recomputed_activation]: 1.92001e-06 [rewriter_after_opt_a]: 2.034e-05 [convert_after_rewriter]: 1.89e-06 [order_py_execute_after_rewriter]: 1.18001e-06 [mutable_eliminate]: 0.00065873 [opt_b]: 0.00021739, [1] [Cycle 1]: 0.00021037, [7] [b_1]: 0.00012892 [b_2]: 8.63001e-06 [updatestate_depend_eliminate]: 6.07001e-06 [updatestate_assign_eliminate]: 3.44001e-06 [updatestate_loads_eliminate]: 2.60002e-06 [renormalize]: 9.70002e-07 [cse]: 2.37e-05 [optimize_parallel_all_gather_comm]: 2.425e-05 [overlap_param_gather]: 2.27999e-06 [cconv]: 2.875e-05 [loop_unroll]: 0.00047832 [opt_after_cconv]: 0.00010594, [1] [Cycle 1]: 9.987e-05, [7] [c_1]: 2.939e-05 [parameter_eliminate]: 3.07002e-06 [updatestate_depend_eliminate]: 5.35999e-06 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 2.61999e-06 [cse]: 2.226e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 3.074e-05 [tuple_transform]: 7.64e-05, [1] [Cycle 1]: 7.115e-05, [4] [d_1]: 4.381e-05 [none_parameter_eliminate]: 1.73002e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.39999e-06 [partial_unused_args_eliminate]: 1.77999e-06 [add_recomputation]: 5.097e-05 [cse_after_recomputation]: 2.547e-05, [1] [Cycle 1]: 2.101e-05, [1] [cse]: 1.366e-05 [environ_conv]: 1.026e-05 [swap_dp_allreduce_reducescatter]: 6.76e-06 [bias_add_comm_swap]: 2.88e-06 [label_micro_interleaved_index]: 5.11997e-06 [label_fine_grained_interleaved_index]: 2.76e-06 [merge_cast_opt]: 1.66e-06 [slice_recompute_activation]: 2.34001e-06 [micro_interleaved_order_control]: 2.64001e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 1.07998e-06 [remove_cast_before_assign_add]: 1.25999e-06 [full_micro_interleaved_order_control]: 2.58003e-06 [reorder_send_recv_between_fp_bp]: 2.75002e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 1.04998e-06 [interleave_split_concat_branches]: 1.18001e-06 [interleave_parallel_branches]: 1.37999e-06 [overlap_opt_shard_in_pipeline]: 1.24998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.99e-06 [control_data_broadcast_order]: 1.276e-05 [grouped_pairwise_exchange_alltoall]: 1.65001e-06 [offloading_packed_experts]: 4.38001e-06 [overlap_recompute_and_grad_model_parallel]: 5.35001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36002e-06 [overlap_recompute_comm]: 2.42001e-06 [overlap_grad_ring_attention]: 4.60001e-06 [overlap_grad_flash_sp]: 1.956e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.41e-06 [split_layernorm_comm]: 1.77001e-06 [handle_group_info]: 1.22e-06 [symbol_engine_optimizer]: 8.875e-05, [1] [Cycle 1]: 8.456e-05, [6] [build]: 1.376e-05 [elim_shapecalc]: 1.116e-05 [elim_not_effective]: 1.284e-05 [opt_reshape]: 7.31001e-06 [fold_const_symbol]: 1.039e-05 [renormalize]: 2.10013e-07 [detach_backward]: 2.04e-06 [pipeline_parallel_scheduler]: 1.47999e-06 [auto_monad_reorder]: 1.988e-05 [get_jit_bprop_graph]: 1.77001e-06 [rewriter_after_jit_bprop_graph]: 4.77998e-06 [opt_after_jit_grad]: 0.0005346 [validate]: 4.853e-05 [backend_pass]: 9.49978e-07 [task_emit]: 0.0466 [execute]: 1.203e-05 Sums bootstrap : 0.000638s : 0.76% type_inference : 0.030051s : 35.92% event_method : 0.000025s : 0.03% auto_monad : 0.000087s : 0.10% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000044s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.01% optimize.rewriter_before_opt_a : 0.000270s : 0.32% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000093s : 0.11% optimize.opt_a.loop_unroll : 0.000046s : 0.05% optimize.opt_a.a_1 : 0.000892s : 1.07% optimize.opt_a.with_stream_mark : 0.000039s : 0.05% optimize.opt_a.recompute_prepare : 0.000016s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000148s : 0.18% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000012s : 0.01% optimize.opt_a.merge_send_recv : 0.000016s : 0.02% optimize.opt_a.auto_parallel : 0.000016s : 0.02% optimize.opt_a.parallel : 0.000029s : 0.03% optimize.opt_a.flash_sp : 0.000014s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000020s : 0.02% optimize.opt_a.a_after_grad : 0.000020s : 0.02% optimize.opt_a.renormalize : 0.001671s : 2.00% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.03% optimize.opt_a.cse : 0.000061s : 0.07% optimize.opt_a.a_3 : 0.000098s : 0.12% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000020s : 0.02% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000659s : 0.79% optimize.opt_b.b_1 : 0.000129s : 0.15% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000024s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000029s : 0.03% optimize.loop_unroll : 0.000478s : 0.57% optimize.opt_after_cconv.c_1 : 0.000029s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000031s : 0.04% optimize.tuple_transform.d_1 : 0.000044s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000051s : 0.06% optimize.cse_after_recomputation.cse : 0.000014s : 0.02% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000020s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000014s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000020s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000535s : 0.64% validate : 0.000049s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.046600s : 55.70% execute : 0.000012s : 0.01% Time group info: ------[substitution.] 0.000234 26 1.24% : 0.000003s : 2: substitution.elim_not_effective 0.58% : 0.000001s : 2: substitution.fold_const_symbol 2.39% : 0.000006s : 3: substitution.graph_param_transform 79.85% : 0.000187s : 6: substitution.inline 1.89% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.96% : 0.000007s : 4: substitution.remove_not_recompute_node 1.41% : 0.000003s : 2: substitution.replace_old_param 3.59% : 0.000008s : 1: substitution.switch_simplify 6.08% : 0.000014s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029966 2 94.49% : 0.028315s : 1: type_inference.infer 5.51% : 0.001651s : 1: type_inference.specialize ------[replace.] 0.000092 9 59.86% : 0.000055s : 6: replace.inline 17.65% : 0.000016s : 1: replace.switch_simplify 22.49% : 0.000021s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000203 9 90.07% : 0.000183s : 6: match.inline 3.73% : 0.000008s : 1: match.switch_simplify 6.20% : 0.000013s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000190 1092 1.03% : 0.000002s : 12: predicate.accumulaten_eliminater 0.98% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.82% : 0.000002s : 6: predicate.addn_check_dump 1.00% : 0.000002s : 12: predicate.addn_zero_filter 0.84% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.25% : 0.000004s : 18: predicate.arithmetic_simplify 1.07% : 0.000002s : 12: predicate.cast_eliminate 0.58% : 0.000001s : 6: predicate.check_bprop_eliminate 0.46% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.47% : 0.000001s : 6: predicate.depend_value_elim 1.06% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.26% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.82% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 3: predicate.elim_not_effective 0.41% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.30% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.02% : 0.000002s : 15: predicate.environ_get_depend_swap 1.57% : 0.000003s : 21: predicate.environ_get_eliminate 1.10% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.48% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.44% : 0.000005s : 20: predicate.float_depend_g_call 0.45% : 0.000001s : 6: predicate.float_environ_get_switch 0.73% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 3: predicate.fold_const_symbol 0.61% : 0.000001s : 6: predicate.get_grad_eliminate 0.16% : 0.000000s : 3: predicate.graph_param_transform 0.47% : 0.000001s : 6: predicate.incorporate_call 0.40% : 0.000001s : 6: predicate.incorporate_call_switch 6.07% : 0.000012s : 50: predicate.inline 0.72% : 0.000001s : 6: predicate.inline_without_move 0.25% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 6: predicate.less_batch_normalization 1.99% : 0.000004s : 20: predicate.list_to_tuple_eliminator_ 2.43% : 0.000005s : 32: predicate.load_eliminater 0.96% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.78% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.79% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.46% : 0.000001s : 6: predicate.merge_addn 0.42% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.47% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.93% : 0.000002s : 12: predicate.minmaximum_grad 1.06% : 0.000002s : 3: predicate.mutable_eliminate 0.55% : 0.000001s : 3: predicate.opt_reshape 0.47% : 0.000001s : 3: predicate.parallel_virtual_node 2.26% : 0.000004s : 20: predicate.partial_defer_inline 1.38% : 0.000003s : 17: predicate.partial_eliminate 0.96% : 0.000002s : 12: predicate.print_const_string_wrapper 0.47% : 0.000001s : 6: predicate.reduce_all_const_elim 1.39% : 0.000003s : 12: predicate.reduce_eliminate 2.28% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 6: predicate.remove_not_recompute_node 1.33% : 0.000003s : 20: predicate.replace_applicator 0.46% : 0.000001s : 6: predicate.replace_old_param 0.22% : 0.000000s : 3: predicate.reset_defer_inline 1.01% : 0.000002s : 12: predicate.reshape_eliminate 0.52% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 3: predicate.row_tensor_eliminate 0.82% : 0.000002s : 6: predicate.same_eliminate 0.35% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.75% : 0.000001s : 6: predicate.shard_identity_eliminate 0.80% : 0.000002s : 6: predicate.special_op_eliminate 0.58% : 0.000001s : 6: predicate.specialize_transform 0.94% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.63% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.55% : 0.000003s : 20: predicate.switch_defer_inline 2.32% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.75% : 0.000011s : 68: predicate.switch_simplify 0.90% : 0.000002s : 12: predicate.tile_eliminate 0.89% : 0.000002s : 12: predicate.transpose_eliminate 1.59% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000002s : 18: predicate.tuple_list_get_item_depend_reorder 3.25% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.46% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000005s : 24: predicate.tuple_list_set_item_eliminator 1.74% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.18% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.99% : 0.000006s : 38: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 3: predicate.value_based_eliminate 0.69% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.62% : 0.000001s : 6: predicate.virtual_output_eliminate 0.25% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001477 16 50.29% : 0.000743s : 8: func_graph_cloner_run.FuncGraphClonerGraph 49.71% : 0.000734s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.102098 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.79% : 0.003870s : 1: add_attr 3.78% : 0.003854s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.05% : 0.000055s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.09% : 0.000093s : 1: auto_monad 0.02% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.67% : 0.000680s : 1: bootstrap 0.03% : 0.000033s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.03% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.03% : 0.000032s : 1: event_method 0.02% : 0.000021s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.48% : 0.000489s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.66% : 0.000670s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000017s : 1: opt.transform.mutable_eliminate 1.33% : 0.001355s : 78: opt.transform.opt_a 0.03% : 0.000028s : 1: opt.transform.opt_after_cconv 0.03% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000104s : 28: opt.transform.opt_b 0.05% : 0.000048s : 2: opt.transform.opt_trans_graph 0.04% : 0.000038s : 4: opt.transform.symbol_engine_opt 3.89% : 0.003967s : 1: opt_a 0.11% : 0.000111s : 1: opt_after_cconv 0.53% : 0.000546s : 1: opt_after_jit_grad 0.22% : 0.000221s : 1: opt_b 6.29% : 0.006419s : 1: optimize 0.03% : 0.000028s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000023s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.05% : 0.000049s : 1: pre_auto_parallel 0.01% : 0.000010s : 1: py_interpret_to_execute 0.01% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000035s : 1: remove_dup_value 0.86% : 0.000874s : 1: renormalize.infer 0.77% : 0.000786s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000024s : 1: rewriter_after_opt_a 0.27% : 0.000277s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000006s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000091s : 1: symbol_engine_optimizer 45.67% : 0.046629s : 1: task_emit 0.08% : 0.000079s : 1: tuple_transform 29.46% : 0.030076s : 1: type_inference 0.08% : 0.000081s : 1: validate TotalTime = 0.0947994, [24] [bootstrap]: 0.00075795 [type_inference]: 0.0337033 [event_method]: 0.00010648 [auto_monad]: 0.00016212 [graph_reusing]: 1.188e-05 [inline]: 2.71e-06 [add_attr]: 0.00407504, [1] [add_attr_with_inline]: 0.00406339, [1] [Cycle 1]: 6.427e-05, [2] [tag_attr]: 2.827e-05 [meta_addattr_fg_expand]: 6.89999e-06 [parallel-infer-symbol]: 3.88999e-06 [pre_auto_parallel]: 4.071e-05 [insert-virtual-dataset]: 2.76e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.62999e-06 [pipeline_split]: 1.86e-06 [optimize]: 0.00651174, [53] [py_interpret_to_execute]: 5.52001e-06 [rewriter_before_opt_a]: 0.00025649 [opt_a]: 0.00394896, [2] [Cycle 1]: 0.00320852, [45] [expand_dump_flag]: 3.16001e-06 [switch_simplify]: 8.64e-05 [loop_unroll]: 4.041e-05 [a_1]: 0.00080065 [with_stream_mark]: 1.424e-05 [recompute_prepare]: 9.05999e-06 [updatestate_depend_eliminate]: 4.86002e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 2.98998e-06 [parameter_eliminate]: 1.10001e-06 [a_2]: 9.296e-05 [accelerated_algorithm]: 7.31001e-06 [shard]: 1.09998e-06 [meta_shard_fg_expand]: 2.28002e-06 [shard_inline]: 6.65998e-06 [merge_send_recv]: 7.64002e-06 [auto_parallel]: 7.06001e-06 [parallel]: 1.664e-05 [flash_sp]: 8.37e-06 [merge_comm]: 4.48001e-06 [allreduce_fusion]: 4.48999e-06 [matmul_add_comm_reduction]: 7.23e-06 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 8.35999e-06 [virtual_dataset]: 7.48e-06 [get_grad_eliminate_]: 6.72002e-06 [virtual_output]: 6.71999e-06 [merge_forward]: 4.13999e-06 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 8.05e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.341e-05 [merge_recompute_call_nodes]: 8.89995e-07 [before_grad]: 1.136e-05 [set_forward_comm_id_for_comm_node_pass]: 4.43999e-06 [meta_fg_expand]: 3.06001e-06 [flash_sp_send_recv_attached]: 2.07999e-06 [receive_attached]: 1.92999e-06 [after_resolve]: 9.15999e-06 [a_after_grad]: 1.025e-05 [renormalize]: 0.00162813 [add_forward_monad_depend]: 6.07999e-06 [auto_monad_grad]: 2.17999e-06 [auto_monad_eliminator]: 1.77e-05 [cse]: 2.934e-05 [a_3]: 5.159e-05 [Cycle 2]: 0.00072964, [45] [expand_dump_flag]: 2.31e-06 [switch_simplify]: 8.09997e-06 [loop_unroll]: 7.26999e-06 [a_1]: 0.00015992 [with_stream_mark]: 1.512e-05 [recompute_prepare]: 7.4e-06 [updatestate_depend_eliminate]: 4.29002e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 3.50998e-06 [parameter_eliminate]: 1.39e-06 [a_2]: 8.676e-05 [accelerated_algorithm]: 6.72002e-06 [shard]: 1.69e-06 [meta_shard_fg_expand]: 1.48002e-06 [shard_inline]: 6.43e-06 [merge_send_recv]: 8.33999e-06 [auto_parallel]: 6.53998e-06 [parallel]: 7.33e-06 [flash_sp]: 4e-06 [merge_comm]: 4.2e-06 [allreduce_fusion]: 4.48999e-06 [matmul_add_comm_reduction]: 7.71999e-06 [allreduce_slice_to_reducescatter]: 9.29984e-07 [virtual_shard_identity]: 8.32e-06 [virtual_dataset]: 7.21001e-06 [get_grad_eliminate_]: 6.83e-06 [virtual_output]: 6.77002e-06 [merge_forward]: 4.47998e-06 [cell_reuse_recompute_pass]: 2.72001e-06 [offload_activation]: 7.75e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.178e-05 [merge_recompute_call_nodes]: 1.49998e-06 [before_grad]: 1.087e-05 [set_forward_comm_id_for_comm_node_pass]: 5.37001e-06 [meta_fg_expand]: 2.58e-06 [flash_sp_send_recv_attached]: 1.38002e-06 [receive_attached]: 1.82001e-06 [after_resolve]: 9.81998e-06 [a_after_grad]: 1.01e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 2.06e-06 [auto_monad_grad]: 8.80013e-07 [auto_monad_eliminator]: 8.70999e-06 [cse]: 2.128e-05 [a_3]: 3.895e-05 [py_interpret_to_execute_after_opt_a]: 5.89999e-06 [slice_cell_reuse_recomputed_activation]: 1.85001e-06 [rewriter_after_opt_a]: 2.925e-05 [convert_after_rewriter]: 1.30999e-06 [order_py_execute_after_rewriter]: 1.19e-06 [mutable_eliminate]: 0.00064442 [opt_b]: 0.0002568, [1] [Cycle 1]: 0.00024982, [7] [b_1]: 0.00016462 [b_2]: 8.07003e-06 [updatestate_depend_eliminate]: 6.31e-06 [updatestate_assign_eliminate]: 3.56999e-06 [updatestate_loads_eliminate]: 2.96001e-06 [renormalize]: 4.00003e-07 [cse]: 2.735e-05 [optimize_parallel_all_gather_comm]: 2.211e-05 [overlap_param_gather]: 2.24999e-06 [cconv]: 2.677e-05 [loop_unroll]: 0.00049073 [opt_after_cconv]: 0.00011292, [1] [Cycle 1]: 0.00010683, [7] [c_1]: 3.212e-05 [parameter_eliminate]: 3.57002e-06 [updatestate_depend_eliminate]: 6.88e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.96001e-06 [cse]: 2.365e-05 [renormalize]: 5.09986e-07 [remove_dup_value]: 1.824e-05 [tuple_transform]: 9.508e-05, [1] [Cycle 1]: 9.107e-05, [4] [d_1]: 6.432e-05 [none_parameter_eliminate]: 1.39e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7.71001e-06 [partial_unused_args_eliminate]: 1.85001e-06 [add_recomputation]: 9.984e-05 [cse_after_recomputation]: 2.974e-05, [1] [Cycle 1]: 2.535e-05, [1] [cse]: 1.955e-05 [environ_conv]: 9.04998e-06 [swap_dp_allreduce_reducescatter]: 6.49001e-06 [bias_add_comm_swap]: 3.28e-06 [label_micro_interleaved_index]: 4.85999e-06 [label_fine_grained_interleaved_index]: 2.99999e-06 [merge_cast_opt]: 1.16002e-06 [slice_recompute_activation]: 1.82999e-06 [micro_interleaved_order_control]: 2.43e-06 [assign_add_opt]: 1.06002e-06 [ForceFp32Comm]: 7.50006e-07 [remove_cast_before_assign_add]: 1.29998e-06 [full_micro_interleaved_order_control]: 2.41998e-06 [reorder_send_recv_between_fp_bp]: 2.51e-06 [comm_op_add_attrs]: 1.22e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.19998e-06 [interleave_parallel_branches]: 1.25001e-06 [overlap_opt_shard_in_pipeline]: 1.22e-06 [overlap_opt_shard_grad_in_pipeline]: 1.46002e-06 [control_data_broadcast_order]: 1.426e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 6.19001e-06 [overlap_recompute_and_grad_model_parallel]: 5.32001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.20999e-06 [overlap_recompute_comm]: 2.34001e-06 [overlap_grad_ring_attention]: 5.15001e-06 [overlap_grad_flash_sp]: 2.08e-05 [begin_end_overlap_inline]: 7.29982e-07 [split_matmul_comm_elemetwise]: 2.14e-06 [split_layernorm_comm]: 1.48002e-06 [handle_group_info]: 1.20001e-06 [symbol_engine_optimizer]: 9.256e-05, [1] [Cycle 1]: 8.633e-05, [6] [build]: 9.11002e-06 [elim_shapecalc]: 1.154e-05 [elim_not_effective]: 1.58e-05 [opt_reshape]: 8.22e-06 [fold_const_symbol]: 1.208e-05 [renormalize]: 1.79978e-07 [detach_backward]: 1.85001e-06 [pipeline_parallel_scheduler]: 1.54998e-06 [auto_monad_reorder]: 2.124e-05 [get_jit_bprop_graph]: 2.19001e-06 [rewriter_after_jit_bprop_graph]: 4e-06 [opt_after_jit_grad]: 0.00050389 [validate]: 5.308e-05 [backend_pass]: 7.39994e-07 [task_emit]: 0.0485672 [execute]: 8.94e-06 Sums bootstrap : 0.000758s : 0.85% type_inference : 0.033703s : 37.59% event_method : 0.000106s : 0.12% auto_monad : 0.000162s : 0.18% graph_reusing : 0.000012s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000028s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000041s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.01% optimize.rewriter_before_opt_a : 0.000256s : 0.29% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000094s : 0.11% optimize.opt_a.loop_unroll : 0.000048s : 0.05% optimize.opt_a.a_1 : 0.000961s : 1.07% optimize.opt_a.with_stream_mark : 0.000029s : 0.03% optimize.opt_a.recompute_prepare : 0.000016s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000180s : 0.20% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000016s : 0.02% optimize.opt_a.auto_parallel : 0.000014s : 0.02% optimize.opt_a.parallel : 0.000024s : 0.03% optimize.opt_a.flash_sp : 0.000012s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.02% optimize.opt_a.virtual_dataset : 0.000015s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000016s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000010s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.02% optimize.opt_a.a_after_grad : 0.000020s : 0.02% optimize.opt_a.renormalize : 0.001628s : 1.82% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.03% optimize.opt_a.cse : 0.000051s : 0.06% optimize.opt_a.a_3 : 0.000091s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000029s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000644s : 0.72% optimize.opt_b.b_1 : 0.000165s : 0.18% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000027s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000027s : 0.03% optimize.loop_unroll : 0.000491s : 0.55% optimize.opt_after_cconv.c_1 : 0.000032s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.02% optimize.tuple_transform.d_1 : 0.000064s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000100s : 0.11% optimize.cse_after_recomputation.cse : 0.000020s : 0.02% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000001s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000021s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000504s : 0.56% validate : 0.000053s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.048567s : 54.17% execute : 0.000009s : 0.01% Time group info: ------[substitution.] 0.000327 62 0.68% : 0.000002s : 3: substitution.elim_not_effective 2.20% : 0.000007s : 3: substitution.float_tuple_getitem_switch 0.61% : 0.000002s : 3: substitution.fold_const_symbol 1.92% : 0.000006s : 4: substitution.graph_param_transform 60.01% : 0.000196s : 8: substitution.inline 1.38% : 0.000005s : 6: substitution.j_node_and_user_rematch 2.78% : 0.000009s : 2: substitution.minmaximum_grad 1.57% : 0.000005s : 6: substitution.remove_not_recompute_node 0.90% : 0.000003s : 2: substitution.replace_old_param 2.70% : 0.000009s : 1: substitution.switch_simplify 5.81% : 0.000019s : 4: substitution.tuple_list_convert_item_index_to_positive 2.44% : 0.000008s : 4: substitution.tuple_list_get_item_const_eliminator 3.65% : 0.000012s : 4: substitution.tuple_list_get_item_depend_reorder 9.66% : 0.000032s : 8: substitution.tuple_list_get_item_eliminator 3.69% : 0.000012s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.033496 2 93.93% : 0.031462s : 1: type_inference.infer 6.07% : 0.002034s : 1: type_inference.specialize ------[replace.] 0.000100 11 62.24% : 0.000062s : 8: replace.inline 21.36% : 0.000021s : 1: replace.switch_simplify 16.41% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000203 11 94.35% : 0.000192s : 8: match.inline 3.87% : 0.000008s : 1: match.switch_simplify 1.78% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000235 1438 0.98% : 0.000002s : 16: predicate.accumulaten_eliminater 0.85% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 8: predicate.addn_check_dump 1.05% : 0.000002s : 16: predicate.addn_zero_filter 0.87% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.07% : 0.000005s : 24: predicate.arithmetic_simplify 0.99% : 0.000002s : 16: predicate.cast_eliminate 0.57% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.16% : 0.000000s : 4: predicate.const_output_eliminate 0.47% : 0.000001s : 8: predicate.depend_value_elim 0.96% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.18% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.29% : 0.000001s : 4: predicate.elim_not_effective 0.42% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.18% : 0.000003s : 20: predicate.environ_get_depend_swap 1.75% : 0.000004s : 28: predicate.environ_get_eliminate 1.11% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.60% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.19% : 0.000005s : 26: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.88% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000002s : 8: predicate.get_grad_eliminate 0.19% : 0.000000s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.40% : 0.000001s : 8: predicate.incorporate_call_switch 5.91% : 0.000014s : 66: predicate.inline 0.65% : 0.000002s : 8: predicate.inline_without_move 0.26% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.78% : 0.000002s : 8: predicate.less_batch_normalization 1.82% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.50% : 0.000006s : 42: predicate.load_eliminater 1.14% : 0.000003s : 4: predicate.loop_unroll_after_grad 2.73% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.82% : 0.000004s : 24: predicate.make_slice_get_slice_eliminator 0.60% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.53% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.99% : 0.000002s : 16: predicate.minmaximum_grad 1.27% : 0.000003s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.40% : 0.000001s : 4: predicate.parallel_virtual_node 2.00% : 0.000005s : 26: predicate.partial_defer_inline 1.39% : 0.000003s : 22: predicate.partial_eliminate 0.96% : 0.000002s : 16: predicate.print_const_string_wrapper 0.65% : 0.000002s : 8: predicate.reduce_all_const_elim 1.30% : 0.000003s : 16: predicate.reduce_eliminate 2.47% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.27% : 0.000001s : 8: predicate.remove_not_recompute_node 1.28% : 0.000003s : 26: predicate.replace_applicator 0.34% : 0.000001s : 8: predicate.replace_old_param 0.20% : 0.000000s : 4: predicate.reset_defer_inline 1.05% : 0.000002s : 16: predicate.reshape_eliminate 0.56% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 4: predicate.row_tensor_eliminate 0.86% : 0.000002s : 8: predicate.same_eliminate 0.39% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.68% : 0.000002s : 8: predicate.shard_identity_eliminate 0.66% : 0.000002s : 8: predicate.special_op_eliminate 0.65% : 0.000002s : 8: predicate.specialize_transform 0.73% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.72% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.31% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.70% : 0.000004s : 26: predicate.switch_defer_inline 2.12% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.80% : 0.000014s : 86: predicate.switch_simplify 1.00% : 0.000002s : 16: predicate.tile_eliminate 1.02% : 0.000002s : 16: predicate.transpose_eliminate 1.61% : 0.000004s : 24: predicate.tuple_list_convert_item_index_to_positive 1.68% : 0.000004s : 24: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000007s : 34: predicate.tuple_list_get_item_eliminator 1.58% : 0.000004s : 24: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000005s : 32: predicate.tuple_list_set_item_eliminator 1.74% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.30% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 2.94% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 4: predicate.value_based_eliminate 0.70% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.57% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001656 23 55.35% : 0.000917s : 11: func_graph_cloner_run.FuncGraphClonerGraph 44.65% : 0.000739s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.108715 196 0.00% : 0.000003s : 1: ForceFp32Comm 3.75% : 0.004081s : 1: add_attr 3.74% : 0.004067s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000105s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.16% : 0.000171s : 1: auto_monad 0.02% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.72% : 0.000779s : 1: bootstrap 0.03% : 0.000030s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.11% : 0.000116s : 1: event_method 0.01% : 0.000016s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000016s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.46% : 0.000501s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.60% : 0.000655s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000022s : 1: opt.transform.mutable_eliminate 1.36% : 0.001479s : 78: opt.transform.opt_a 0.03% : 0.000031s : 1: opt.transform.opt_after_cconv 0.02% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.13% : 0.000143s : 28: opt.transform.opt_b 0.06% : 0.000070s : 2: opt.transform.opt_trans_graph 0.04% : 0.000044s : 4: opt.transform.symbol_engine_opt 3.64% : 0.003953s : 1: opt_a 0.11% : 0.000117s : 1: opt_after_cconv 0.47% : 0.000515s : 1: opt_after_jit_grad 0.24% : 0.000260s : 1: opt_b 5.99% : 0.006517s : 1: optimize 0.02% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000024s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000045s : 1: pre_auto_parallel 0.01% : 0.000009s : 1: py_interpret_to_execute 0.01% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000022s : 1: remove_dup_value 0.82% : 0.000897s : 1: renormalize.infer 0.66% : 0.000723s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000033s : 1: rewriter_after_opt_a 0.24% : 0.000263s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000004s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000096s : 1: symbol_engine_optimizer 44.70% : 0.048593s : 1: task_emit 0.09% : 0.000098s : 1: tuple_transform 31.02% : 0.033722s : 1: type_inference 0.10% : 0.000104s : 1: validate [WARNING] CORE(87352,ffffbf434f30,python3.9):2026-01-29-17:52:02.977.031 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph2 [WARNING] CORE(87355,ffffbf434f30,python3.9):2026-01-29-17:52:02.987.514 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph6 [WARNING] CORE(87365,ffffbf434f30,python3.9):2026-01-29-17:52:03.032.936 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph3 TotalTime = 0.114494, [24] [bootstrap]: 0.00110064 [type_inference]: 0.0333396 [event_method]: 2.817e-05 [auto_monad]: 0.00011885 [graph_reusing]: 6.58998e-06 [inline]: 2.31e-06 [add_attr]: 0.00531013, [1] [add_attr_with_inline]: 0.00527694, [1] [Cycle 1]: 8.393e-05, [2] [tag_attr]: 3.071e-05 [meta_addattr_fg_expand]: 7.05e-06 [parallel-infer-symbol]: 3.82998e-06 [pre_auto_parallel]: 4.738e-05 [insert-virtual-dataset]: 2.74999e-06 [parallel-infer-symbol-second]: 7.89994e-07 [dataset_repeat_opt]: 2.01e-06 [pipeline_split]: 1.81998e-06 [optimize]: 0.00637629, [53] [py_interpret_to_execute]: 6.69999e-06 [rewriter_before_opt_a]: 0.00030526 [opt_a]: 0.00359834, [2] [Cycle 1]: 0.00297884, [45] [expand_dump_flag]: 4.18999e-06 [switch_simplify]: 9.043e-05 [loop_unroll]: 3.723e-05 [a_1]: 0.00067293 [with_stream_mark]: 2.003e-05 [recompute_prepare]: 8.50999e-06 [updatestate_depend_eliminate]: 4.47998e-06 [updatestate_assign_eliminate]: 4.62e-06 [updatestate_loads_eliminate]: 3.11999e-06 [parameter_eliminate]: 2.34999e-06 [a_2]: 7.443e-05 [accelerated_algorithm]: 7.16001e-06 [shard]: 2.50002e-06 [meta_shard_fg_expand]: 2.35002e-06 [shard_inline]: 6.24001e-06 [merge_send_recv]: 9.04e-06 [auto_parallel]: 8.28001e-06 [parallel]: 3.191e-05 [flash_sp]: 1.129e-05 [merge_comm]: 4.25e-06 [allreduce_fusion]: 3.49001e-06 [matmul_add_comm_reduction]: 9.89001e-06 [allreduce_slice_to_reducescatter]: 1.33002e-06 [virtual_shard_identity]: 9.41e-06 [virtual_dataset]: 7.35003e-06 [get_grad_eliminate_]: 6.44001e-06 [virtual_output]: 7.6e-06 [merge_forward]: 4.72998e-06 [cell_reuse_recompute_pass]: 1.71e-06 [offload_activation]: 1.205e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.579e-05 [merge_recompute_call_nodes]: 1.40999e-06 [before_grad]: 1.052e-05 [set_forward_comm_id_for_comm_node_pass]: 4.15e-06 [meta_fg_expand]: 3.32002e-06 [flash_sp_send_recv_attached]: 3.08e-06 [receive_attached]: 2.61e-06 [after_resolve]: 1.109e-05 [a_after_grad]: 9.53997e-06 [renormalize]: 0.00145299 [add_forward_monad_depend]: 7.03998e-06 [auto_monad_grad]: 2.84001e-06 [auto_monad_eliminator]: 1.952e-05 [cse]: 3.828e-05 [a_3]: 4.93e-05 [Cycle 2]: 0.00060607, [45] [expand_dump_flag]: 2.28002e-06 [switch_simplify]: 7.38999e-06 [loop_unroll]: 5.79999e-06 [a_1]: 9.893e-05 [with_stream_mark]: 1.684e-05 [recompute_prepare]: 5.79e-06 [updatestate_depend_eliminate]: 3.66001e-06 [updatestate_assign_eliminate]: 2.86e-06 [updatestate_loads_eliminate]: 2.54001e-06 [parameter_eliminate]: 1.37e-06 [a_2]: 6.081e-05 [accelerated_algorithm]: 5.62001e-06 [shard]: 1.69e-06 [meta_shard_fg_expand]: 1.87001e-06 [shard_inline]: 5.16998e-06 [merge_send_recv]: 6.64001e-06 [auto_parallel]: 6.90002e-06 [parallel]: 7.55e-06 [flash_sp]: 3.43e-06 [merge_comm]: 3.08e-06 [allreduce_fusion]: 2.92002e-06 [matmul_add_comm_reduction]: 7.08998e-06 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 7.06999e-06 [virtual_dataset]: 5.54e-06 [get_grad_eliminate_]: 5.77001e-06 [virtual_output]: 5.79999e-06 [merge_forward]: 4.43001e-06 [cell_reuse_recompute_pass]: 2.98e-06 [offload_activation]: 8.52e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.441e-05 [merge_recompute_call_nodes]: 1.32999e-06 [before_grad]: 8.97e-06 [set_forward_comm_id_for_comm_node_pass]: 3.63999e-06 [meta_fg_expand]: 2.09e-06 [flash_sp_send_recv_attached]: 1.14998e-06 [receive_attached]: 1.91e-06 [after_resolve]: 9.10001e-06 [a_after_grad]: 7.74002e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.77001e-06 [auto_monad_grad]: 1.17999e-06 [auto_monad_eliminator]: 7.15e-06 [cse]: 1.525e-05 [a_3]: 3.016e-05 [py_interpret_to_execute_after_opt_a]: 7.26001e-06 [slice_cell_reuse_recomputed_activation]: 1.89e-06 [rewriter_after_opt_a]: 2.141e-05 [convert_after_rewriter]: 1.19998e-06 [order_py_execute_after_rewriter]: 1.14e-06 [mutable_eliminate]: 0.00076082 [opt_b]: 0.00019432, [1] [Cycle 1]: 0.00018608, [7] [b_1]: 0.00010794 [b_2]: 7.51999e-06 [updatestate_depend_eliminate]: 6.89999e-06 [updatestate_assign_eliminate]: 2.60997e-06 [updatestate_loads_eliminate]: 2.35002e-06 [renormalize]: 5.39992e-07 [cse]: 2.483e-05 [optimize_parallel_all_gather_comm]: 0.00013309 [overlap_param_gather]: 2.62001e-06 [cconv]: 3.158e-05 [loop_unroll]: 0.00055035 [opt_after_cconv]: 0.00012938, [1] [Cycle 1]: 0.00012204, [7] [c_1]: 4.842e-05 [parameter_eliminate]: 4.08001e-06 [updatestate_depend_eliminate]: 6.27001e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.47001e-06 [cse]: 2.141e-05 [renormalize]: 5.79981e-07 [remove_dup_value]: 1.81e-05 [tuple_transform]: 7.157e-05, [1] [Cycle 1]: 6.599e-05, [4] [d_1]: 3.879e-05 [none_parameter_eliminate]: 1.96003e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 6.10002e-06 [partial_unused_args_eliminate]: 1.79e-06 [add_recomputation]: 5.889e-05 [cse_after_recomputation]: 2.442e-05, [1] [Cycle 1]: 2.025e-05, [1] [cse]: 1.468e-05 [environ_conv]: 1.088e-05 [swap_dp_allreduce_reducescatter]: 5.79999e-06 [bias_add_comm_swap]: 4.35e-06 [label_micro_interleaved_index]: 5.82001e-06 [label_fine_grained_interleaved_index]: 2.83e-06 [merge_cast_opt]: 1.50999e-06 [slice_recompute_activation]: 2.53003e-06 [micro_interleaved_order_control]: 2.33002e-06 [assign_add_opt]: 1.35999e-06 [ForceFp32Comm]: 8.79983e-07 [remove_cast_before_assign_add]: 1.35999e-06 [full_micro_interleaved_order_control]: 2.54001e-06 [reorder_send_recv_between_fp_bp]: 2.70002e-06 [comm_op_add_attrs]: 1.05999e-06 [add_comm_op_reuse_tag]: 1.10001e-06 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.29e-06 [overlap_opt_shard_in_pipeline]: 1.30999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.37001e-06 [control_data_broadcast_order]: 1.313e-05 [grouped_pairwise_exchange_alltoall]: 1.56998e-06 [offloading_packed_experts]: 3.9e-06 [overlap_recompute_and_grad_model_parallel]: 4.55999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34998e-06 [overlap_recompute_comm]: 2.22001e-06 [overlap_grad_ring_attention]: 4.29002e-06 [overlap_grad_flash_sp]: 2.143e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.67001e-06 [split_layernorm_comm]: 2.08002e-06 [handle_group_info]: 1.07e-06 [symbol_engine_optimizer]: 8.371e-05, [1] [Cycle 1]: 7.879e-05, [6] [build]: 1.23e-05 [elim_shapecalc]: 9.82001e-06 [elim_not_effective]: 1.235e-05 [opt_reshape]: 6.79001e-06 [fold_const_symbol]: 9.34e-06 [renormalize]: 1.80007e-07 [detach_backward]: 2.14e-06 [pipeline_parallel_scheduler]: 1.62999e-06 [auto_monad_reorder]: 1.82e-05 [get_jit_bprop_graph]: 2.47001e-06 [rewriter_after_jit_bprop_graph]: 5.52999e-06 [opt_after_jit_grad]: 0.00051332 [validate]: 8.54e-05 [backend_pass]: 9.39996e-07 [task_emit]: 0.0671893 [execute]: 1.011e-05 Sums bootstrap : 0.001101s : 1.02% type_inference : 0.033340s : 30.87% event_method : 0.000028s : 0.03% auto_monad : 0.000119s : 0.11% graph_reusing : 0.000007s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000047s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.01% optimize.rewriter_before_opt_a : 0.000305s : 0.28% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000098s : 0.09% optimize.opt_a.loop_unroll : 0.000043s : 0.04% optimize.opt_a.a_1 : 0.000772s : 0.71% optimize.opt_a.with_stream_mark : 0.000037s : 0.03% optimize.opt_a.recompute_prepare : 0.000014s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000135s : 0.13% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.01% optimize.opt_a.merge_send_recv : 0.000016s : 0.01% optimize.opt_a.auto_parallel : 0.000015s : 0.01% optimize.opt_a.parallel : 0.000039s : 0.04% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000017s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000021s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000020s : 0.02% optimize.opt_a.a_after_grad : 0.000017s : 0.02% optimize.opt_a.renormalize : 0.001453s : 1.35% optimize.opt_a.add_forward_monad_depend : 0.000009s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000027s : 0.02% optimize.opt_a.cse : 0.000054s : 0.05% optimize.opt_a.a_3 : 0.000079s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000007s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000021s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000761s : 0.70% optimize.opt_b.b_1 : 0.000108s : 0.10% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000025s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000133s : 0.12% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000032s : 0.03% optimize.loop_unroll : 0.000550s : 0.51% optimize.opt_after_cconv.c_1 : 0.000048s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.02% optimize.tuple_transform.d_1 : 0.000039s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000059s : 0.05% optimize.cse_after_recomputation.cse : 0.000015s : 0.01% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000021s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000513s : 0.48% validate : 0.000085s : 0.08% backend_pass : 0.000001s : 0.00% task_emit : 0.067189s : 62.21% execute : 0.000010s : 0.01% Time group info: ------[substitution.] 0.000251 26 0.89% : 0.000002s : 2: substitution.elim_not_effective 0.60% : 0.000002s : 2: substitution.fold_const_symbol 2.35% : 0.000006s : 3: substitution.graph_param_transform 81.94% : 0.000205s : 6: substitution.inline 1.64% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.56% : 0.000006s : 4: substitution.remove_not_recompute_node 1.65% : 0.000004s : 2: substitution.replace_old_param 3.49% : 0.000009s : 1: substitution.switch_simplify 4.88% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.033254 2 95.32% : 0.031698s : 1: type_inference.infer 4.68% : 0.001556s : 1: type_inference.specialize ------[replace.] 0.000094 9 55.81% : 0.000053s : 6: replace.inline 24.42% : 0.000023s : 1: replace.switch_simplify 19.77% : 0.000019s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000220 9 91.44% : 0.000201s : 6: match.inline 3.54% : 0.000008s : 1: match.switch_simplify 5.02% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000187 1092 0.95% : 0.000002s : 12: predicate.accumulaten_eliminater 0.88% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.66% : 0.000001s : 6: predicate.addn_check_dump 0.91% : 0.000002s : 12: predicate.addn_zero_filter 0.80% : 0.000001s : 12: predicate.adjust_all_reduce_mul_add 2.24% : 0.000004s : 18: predicate.arithmetic_simplify 0.93% : 0.000002s : 12: predicate.cast_eliminate 0.50% : 0.000001s : 6: predicate.check_bprop_eliminate 0.48% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.50% : 0.000001s : 6: predicate.depend_value_elim 0.96% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.14% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.99% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.83% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 3: predicate.elim_not_effective 0.39% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.02% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 15: predicate.environ_get_depend_swap 1.50% : 0.000003s : 21: predicate.environ_get_eliminate 1.12% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.51% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.68% : 0.000005s : 20: predicate.float_depend_g_call 0.52% : 0.000001s : 6: predicate.float_environ_get_switch 0.76% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.64% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.49% : 0.000001s : 6: predicate.incorporate_call 0.42% : 0.000001s : 6: predicate.incorporate_call_switch 6.14% : 0.000011s : 50: predicate.inline 0.71% : 0.000001s : 6: predicate.inline_without_move 0.26% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.98% : 0.000002s : 6: predicate.less_batch_normalization 1.60% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.49% : 0.000005s : 32: predicate.load_eliminater 1.23% : 0.000002s : 3: predicate.loop_unroll_after_grad 3.26% : 0.000006s : 37: predicate.loop_unroll_before_grad 1.55% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 6: predicate.merge_addn 0.48% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.45% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.83% : 0.000002s : 12: predicate.minmaximum_grad 1.43% : 0.000003s : 3: predicate.mutable_eliminate 0.36% : 0.000001s : 3: predicate.opt_reshape 0.35% : 0.000001s : 3: predicate.parallel_virtual_node 2.11% : 0.000004s : 20: predicate.partial_defer_inline 1.33% : 0.000002s : 17: predicate.partial_eliminate 0.97% : 0.000002s : 12: predicate.print_const_string_wrapper 0.52% : 0.000001s : 6: predicate.reduce_all_const_elim 1.34% : 0.000002s : 12: predicate.reduce_eliminate 2.35% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000001s : 6: predicate.remove_not_recompute_node 1.35% : 0.000003s : 20: predicate.replace_applicator 0.42% : 0.000001s : 6: predicate.replace_old_param 0.26% : 0.000000s : 3: predicate.reset_defer_inline 0.94% : 0.000002s : 12: predicate.reshape_eliminate 0.44% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 3: predicate.row_tensor_eliminate 0.74% : 0.000001s : 6: predicate.same_eliminate 0.37% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.89% : 0.000002s : 6: predicate.shard_identity_eliminate 0.65% : 0.000001s : 6: predicate.special_op_eliminate 0.56% : 0.000001s : 6: predicate.specialize_transform 0.87% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.26% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.65% : 0.000003s : 20: predicate.switch_defer_inline 2.16% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.20% : 0.000012s : 68: predicate.switch_simplify 0.96% : 0.000002s : 12: predicate.tile_eliminate 0.88% : 0.000002s : 12: predicate.transpose_eliminate 1.58% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.38% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.33% : 0.000002s : 18: predicate.tuple_list_get_item_depend_reorder 3.12% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.45% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.76% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.24% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.97% : 0.000006s : 38: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 3: predicate.value_based_eliminate 0.74% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.79% : 0.000001s : 6: predicate.virtual_output_eliminate 0.20% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001711 16 66.47% : 0.001137s : 8: func_graph_cloner_run.FuncGraphClonerGraph 33.53% : 0.000574s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.128960 196 0.00% : 0.000004s : 1: ForceFp32Comm 4.12% : 0.005316s : 1: add_attr 4.10% : 0.005282s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000063s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.10% : 0.000127s : 1: auto_monad 0.02% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.92% : 0.001191s : 1: bootstrap 0.03% : 0.000035s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.03% : 0.000035s : 1: event_method 0.01% : 0.000018s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.43% : 0.000560s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.60% : 0.000772s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 0.94% : 0.001218s : 78: opt.transform.opt_a 0.04% : 0.000047s : 1: opt.transform.opt_after_cconv 0.02% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000087s : 28: opt.transform.opt_b 0.03% : 0.000043s : 2: opt.transform.opt_trans_graph 0.03% : 0.000035s : 4: opt.transform.symbol_engine_opt 2.79% : 0.003603s : 1: opt_a 0.10% : 0.000133s : 1: opt_after_cconv 0.41% : 0.000524s : 1: opt_after_jit_grad 0.15% : 0.000198s : 1: opt_b 4.95% : 0.006382s : 1: optimize 0.11% : 0.000137s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000052s : 1: pre_auto_parallel 0.01% : 0.000011s : 1: py_interpret_to_execute 0.01% : 0.000010s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000022s : 1: remove_dup_value 0.66% : 0.000849s : 1: renormalize.infer 0.46% : 0.000592s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000025s : 1: rewriter_after_opt_a 0.24% : 0.000313s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000086s : 1: symbol_engine_optimizer 52.12% : 0.067220s : 1: task_emit 0.06% : 0.000074s : 1: tuple_transform 25.88% : 0.033370s : 1: type_inference 0.09% : 0.000121s : 1: validate TotalTime = 0.125195, [24] [bootstrap]: 0.00052152 [type_inference]: 0.0308046 [event_method]: 2.644e-05 [auto_monad]: 0.00014932 [graph_reusing]: 7.23999e-06 [inline]: 3.7e-06 [add_attr]: 0.00442218, [1] [add_attr_with_inline]: 0.00440946, [1] [Cycle 1]: 8.451e-05, [2] [tag_attr]: 2.9e-05 [meta_addattr_fg_expand]: 6.50997e-06 [parallel-infer-symbol]: 4.63999e-06 [pre_auto_parallel]: 4.789e-05 [insert-virtual-dataset]: 2.98998e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 2.29001e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00651282, [53] [py_interpret_to_execute]: 8.55001e-06 [rewriter_before_opt_a]: 0.00030933 [opt_a]: 0.00377726, [2] [Cycle 1]: 0.0031401, [45] [expand_dump_flag]: 3.86999e-06 [switch_simplify]: 9.23e-05 [loop_unroll]: 3.599e-05 [a_1]: 0.00067681 [with_stream_mark]: 2.122e-05 [recompute_prepare]: 9.19998e-06 [updatestate_depend_eliminate]: 5.02999e-06 [updatestate_assign_eliminate]: 3.32002e-06 [updatestate_loads_eliminate]: 3.25e-06 [parameter_eliminate]: 2.43002e-06 [a_2]: 7.271e-05 [accelerated_algorithm]: 7.26999e-06 [shard]: 1.97001e-06 [meta_shard_fg_expand]: 2.58003e-06 [shard_inline]: 6.41998e-06 [merge_send_recv]: 1.044e-05 [auto_parallel]: 8.03999e-06 [parallel]: 2.189e-05 [flash_sp]: 1.039e-05 [merge_comm]: 4.50001e-06 [allreduce_fusion]: 3.83999e-06 [matmul_add_comm_reduction]: 1.006e-05 [allreduce_slice_to_reducescatter]: 7.90023e-07 [virtual_shard_identity]: 9.51e-06 [virtual_dataset]: 6.89999e-06 [get_grad_eliminate_]: 6.01e-06 [virtual_output]: 5.78002e-06 [merge_forward]: 4.05e-06 [cell_reuse_recompute_pass]: 1.15001e-06 [offload_activation]: 1.072e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.55e-05 [merge_recompute_call_nodes]: 1.71e-06 [before_grad]: 1.05e-05 [set_forward_comm_id_for_comm_node_pass]: 3.71001e-06 [meta_fg_expand]: 3.3e-06 [flash_sp_send_recv_attached]: 2.66999e-06 [receive_attached]: 2.54001e-06 [after_resolve]: 1.019e-05 [a_after_grad]: 8.90999e-06 [renormalize]: 0.00154763 [add_forward_monad_depend]: 9.09e-06 [auto_monad_grad]: 2.91e-06 [auto_monad_eliminator]: 2.029e-05 [cse]: 0.00010502 [a_3]: 5.439e-05 [Cycle 2]: 0.00062341, [45] [expand_dump_flag]: 2.88e-06 [switch_simplify]: 8.64e-06 [loop_unroll]: 5.71e-06 [a_1]: 0.00010156 [with_stream_mark]: 1.882e-05 [recompute_prepare]: 5.86e-06 [updatestate_depend_eliminate]: 4.28001e-06 [updatestate_assign_eliminate]: 3.33998e-06 [updatestate_loads_eliminate]: 2.56e-06 [parameter_eliminate]: 1.62001e-06 [a_2]: 6.289e-05 [accelerated_algorithm]: 5.99e-06 [shard]: 2.44999e-06 [meta_shard_fg_expand]: 2.27999e-06 [shard_inline]: 5.06002e-06 [merge_send_recv]: 8.60001e-06 [auto_parallel]: 9.05001e-06 [parallel]: 9.09998e-06 [flash_sp]: 4.32e-06 [merge_comm]: 3.09001e-06 [allreduce_fusion]: 3.4e-06 [matmul_add_comm_reduction]: 7.95e-06 [allreduce_slice_to_reducescatter]: 9.79984e-07 [virtual_shard_identity]: 6.46999e-06 [virtual_dataset]: 5.27001e-06 [get_grad_eliminate_]: 4.96002e-06 [virtual_output]: 5.31998e-06 [merge_forward]: 4.52998e-06 [cell_reuse_recompute_pass]: 1.41998e-06 [offload_activation]: 9.01998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.61e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 9.57001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.71999e-06 [meta_fg_expand]: 2.84999e-06 [flash_sp_send_recv_attached]: 1.54e-06 [receive_attached]: 2.46998e-06 [after_resolve]: 1.015e-05 [a_after_grad]: 7.80998e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.38002e-06 [auto_monad_grad]: 9.29984e-07 [auto_monad_eliminator]: 7.66001e-06 [cse]: 1.72e-05 [a_3]: 3.185e-05 [py_interpret_to_execute_after_opt_a]: 8.02998e-06 [slice_cell_reuse_recomputed_activation]: 2.14999e-06 [rewriter_after_opt_a]: 2.387e-05 [convert_after_rewriter]: 2.12999e-06 [order_py_execute_after_rewriter]: 1.29e-06 [mutable_eliminate]: 0.00082451 [opt_b]: 0.00020975, [1] [Cycle 1]: 0.00020065, [7] [b_1]: 0.00011096 [b_2]: 8.22e-06 [updatestate_depend_eliminate]: 8.90999e-06 [updatestate_assign_eliminate]: 3.04999e-06 [updatestate_loads_eliminate]: 2.39001e-06 [renormalize]: 7.2e-07 [cse]: 2.968e-05 [optimize_parallel_all_gather_comm]: 2.08e-05 [overlap_param_gather]: 2.31e-06 [cconv]: 3.531e-05 [loop_unroll]: 0.00053837 [opt_after_cconv]: 0.00011041, [1] [Cycle 1]: 0.00010338, [7] [c_1]: 2.665e-05 [parameter_eliminate]: 4.76002e-06 [updatestate_depend_eliminate]: 7.84002e-06 [updatestate_assign_eliminate]: 2.39001e-06 [updatestate_loads_eliminate]: 2.41e-06 [cse]: 2.428e-05 [renormalize]: 1.00001e-06 [remove_dup_value]: 1.909e-05 [tuple_transform]: 7.282e-05, [1] [Cycle 1]: 6.779e-05, [4] [d_1]: 4.1e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 1.00001e-07 [switch_simplify]: 6.76e-06 [partial_unused_args_eliminate]: 1.97999e-06 [add_recomputation]: 5.315e-05 [cse_after_recomputation]: 2.427e-05, [1] [Cycle 1]: 2.016e-05, [1] [cse]: 1.474e-05 [environ_conv]: 1.185e-05 [swap_dp_allreduce_reducescatter]: 5.77001e-06 [bias_add_comm_swap]: 4.24002e-06 [label_micro_interleaved_index]: 5.13002e-06 [label_fine_grained_interleaved_index]: 2.64001e-06 [merge_cast_opt]: 1.43002e-06 [slice_recompute_activation]: 2.48e-06 [micro_interleaved_order_control]: 2.81e-06 [assign_add_opt]: 1.28002e-06 [ForceFp32Comm]: 1.00999e-06 [remove_cast_before_assign_add]: 1.23002e-06 [full_micro_interleaved_order_control]: 2.79001e-06 [reorder_send_recv_between_fp_bp]: 2.85002e-06 [comm_op_add_attrs]: 1.09e-06 [add_comm_op_reuse_tag]: 1.07e-06 [interleave_split_concat_branches]: 1.20001e-06 [interleave_parallel_branches]: 1.51998e-06 [overlap_opt_shard_in_pipeline]: 1.74998e-06 [overlap_opt_shard_grad_in_pipeline]: 1.99e-06 [control_data_broadcast_order]: 1.487e-05 [grouped_pairwise_exchange_alltoall]: 1.67001e-06 [offloading_packed_experts]: 4.50001e-06 [overlap_recompute_and_grad_model_parallel]: 5.03002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.16997e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.24001e-06 [overlap_grad_ring_attention]: 4.45e-06 [overlap_grad_flash_sp]: 2.383e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.44001e-06 [split_layernorm_comm]: 1.73002e-06 [handle_group_info]: 1.09e-06 [symbol_engine_optimizer]: 8.996e-05, [1] [Cycle 1]: 8.494e-05, [6] [build]: 1.373e-05 [elim_shapecalc]: 1.1e-05 [elim_not_effective]: 1.381e-05 [opt_reshape]: 7.21999e-06 [fold_const_symbol]: 1.005e-05 [renormalize]: 2.79979e-07 [detach_backward]: 2.31e-06 [pipeline_parallel_scheduler]: 1.67001e-06 [auto_monad_reorder]: 2.049e-05 [get_jit_bprop_graph]: 1.82001e-06 [rewriter_after_jit_bprop_graph]: 6.46e-06 [opt_after_jit_grad]: 0.00055149 [validate]: 5.751e-05 [backend_pass]: 1.40999e-06 [task_emit]: 0.0814591 [execute]: 1.207e-05 Sums bootstrap : 0.000522s : 0.44% type_inference : 0.030805s : 25.81% event_method : 0.000026s : 0.02% auto_monad : 0.000149s : 0.13% graph_reusing : 0.000007s : 0.01% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000005s : 0.00% pre_auto_parallel : 0.000048s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000009s : 0.01% optimize.rewriter_before_opt_a : 0.000309s : 0.26% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000101s : 0.08% optimize.opt_a.loop_unroll : 0.000042s : 0.03% optimize.opt_a.a_1 : 0.000778s : 0.65% optimize.opt_a.with_stream_mark : 0.000040s : 0.03% optimize.opt_a.recompute_prepare : 0.000015s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000136s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.01% optimize.opt_a.merge_send_recv : 0.000019s : 0.02% optimize.opt_a.auto_parallel : 0.000017s : 0.01% optimize.opt_a.parallel : 0.000031s : 0.03% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.01% optimize.opt_a.virtual_dataset : 0.000012s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.01% optimize.opt_a.virtual_output : 0.000011s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000020s : 0.02% optimize.opt_a.a_after_grad : 0.000017s : 0.01% optimize.opt_a.renormalize : 0.001548s : 1.30% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.02% optimize.opt_a.cse : 0.000122s : 0.10% optimize.opt_a.a_3 : 0.000086s : 0.07% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000024s : 0.02% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000825s : 0.69% optimize.opt_b.b_1 : 0.000111s : 0.09% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000030s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000035s : 0.03% optimize.loop_unroll : 0.000538s : 0.45% optimize.opt_after_cconv.c_1 : 0.000027s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.02% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.02% optimize.tuple_transform.d_1 : 0.000041s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000053s : 0.04% optimize.cse_after_recomputation.cse : 0.000015s : 0.01% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000002s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000024s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000014s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000020s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000551s : 0.46% validate : 0.000058s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.081459s : 68.26% execute : 0.000012s : 0.01% Time group info: ------[substitution.] 0.000243 26 1.02% : 0.000002s : 2: substitution.elim_not_effective 0.75% : 0.000002s : 2: substitution.fold_const_symbol 2.52% : 0.000006s : 3: substitution.graph_param_transform 79.43% : 0.000193s : 6: substitution.inline 1.98% : 0.000005s : 4: substitution.j_node_and_user_rematch 3.15% : 0.000008s : 4: substitution.remove_not_recompute_node 2.35% : 0.000006s : 2: substitution.replace_old_param 3.31% : 0.000008s : 1: substitution.switch_simplify 5.50% : 0.000013s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.030718 2 94.93% : 0.029159s : 1: type_inference.infer 5.07% : 0.001559s : 1: type_inference.specialize ------[replace.] 0.000100 9 55.99% : 0.000056s : 6: replace.inline 25.58% : 0.000026s : 1: replace.switch_simplify 18.43% : 0.000018s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000208 9 90.77% : 0.000189s : 6: match.inline 3.51% : 0.000007s : 1: match.switch_simplify 5.71% : 0.000012s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000197 1092 0.99% : 0.000002s : 12: predicate.accumulaten_eliminater 1.03% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.40% : 0.000001s : 6: predicate.addn_check_dump 0.96% : 0.000002s : 12: predicate.addn_zero_filter 0.86% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.42% : 0.000005s : 18: predicate.arithmetic_simplify 0.92% : 0.000002s : 12: predicate.cast_eliminate 0.55% : 0.000001s : 6: predicate.check_bprop_eliminate 0.43% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.59% : 0.000001s : 6: predicate.depend_value_elim 0.88% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.10% : 0.000002s : 12: predicate.dict_get_item_eliminator 1.09% : 0.000002s : 12: predicate.dict_set_item_eliminator 1.31% : 0.000003s : 6: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 3: predicate.elim_not_effective 0.47% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.31% : 0.000003s : 15: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.42% : 0.000003s : 15: predicate.environ_get_depend_swap 1.44% : 0.000003s : 21: predicate.environ_get_eliminate 1.09% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.46% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.55% : 0.000005s : 20: predicate.float_depend_g_call 0.41% : 0.000001s : 6: predicate.float_environ_get_switch 0.73% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 3: predicate.fold_const_symbol 0.56% : 0.000001s : 6: predicate.get_grad_eliminate 0.16% : 0.000000s : 3: predicate.graph_param_transform 0.45% : 0.000001s : 6: predicate.incorporate_call 0.38% : 0.000001s : 6: predicate.incorporate_call_switch 6.42% : 0.000013s : 50: predicate.inline 0.55% : 0.000001s : 6: predicate.inline_without_move 0.23% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.03% : 0.000002s : 6: predicate.less_batch_normalization 1.69% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.25% : 0.000004s : 32: predicate.load_eliminater 1.22% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.93% : 0.000006s : 37: predicate.loop_unroll_before_grad 1.48% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.44% : 0.000001s : 6: predicate.merge_addn 0.40% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.76% : 0.000001s : 12: predicate.minmaximum_grad 2.11% : 0.000004s : 3: predicate.mutable_eliminate 0.30% : 0.000001s : 3: predicate.opt_reshape 0.36% : 0.000001s : 3: predicate.parallel_virtual_node 2.80% : 0.000006s : 20: predicate.partial_defer_inline 1.25% : 0.000002s : 17: predicate.partial_eliminate 0.96% : 0.000002s : 12: predicate.print_const_string_wrapper 0.46% : 0.000001s : 6: predicate.reduce_all_const_elim 1.39% : 0.000003s : 12: predicate.reduce_eliminate 2.20% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.65% : 0.000001s : 6: predicate.remove_not_recompute_node 1.48% : 0.000003s : 20: predicate.replace_applicator 0.55% : 0.000001s : 6: predicate.replace_old_param 0.21% : 0.000000s : 3: predicate.reset_defer_inline 0.98% : 0.000002s : 12: predicate.reshape_eliminate 0.58% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.51% : 0.000001s : 3: predicate.row_tensor_eliminate 0.81% : 0.000002s : 6: predicate.same_eliminate 0.30% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.95% : 0.000002s : 6: predicate.shard_identity_eliminate 0.64% : 0.000001s : 6: predicate.special_op_eliminate 0.64% : 0.000001s : 6: predicate.specialize_transform 1.13% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.25% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.72% : 0.000003s : 20: predicate.switch_defer_inline 1.96% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.66% : 0.000011s : 68: predicate.switch_simplify 0.83% : 0.000002s : 12: predicate.tile_eliminate 0.79% : 0.000002s : 12: predicate.transpose_eliminate 1.41% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.24% : 0.000002s : 18: predicate.tuple_list_get_item_const_eliminator 1.35% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.20% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.30% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.08% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.76% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.07% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.73% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.28% : 0.000001s : 3: predicate.value_based_eliminate 0.73% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.53% : 0.000001s : 6: predicate.virtual_output_eliminate 0.19% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001403 16 58.20% : 0.000816s : 8: func_graph_cloner_run.FuncGraphClonerGraph 41.80% : 0.000586s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.138851 196 0.00% : 0.000005s : 1: ForceFp32Comm 3.19% : 0.004429s : 1: add_attr 3.18% : 0.004414s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000057s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.11% : 0.000157s : 1: auto_monad 0.02% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.41% : 0.000566s : 1: bootstrap 0.03% : 0.000039s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.02% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.02% : 0.000034s : 1: event_method 0.02% : 0.000021s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000005s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.40% : 0.000550s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.61% : 0.000842s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000024s : 1: opt.transform.mutable_eliminate 0.89% : 0.001231s : 78: opt.transform.opt_a 0.02% : 0.000025s : 1: opt.transform.opt_after_cconv 0.02% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000090s : 28: opt.transform.opt_b 0.03% : 0.000046s : 2: opt.transform.opt_trans_graph 0.03% : 0.000038s : 4: opt.transform.symbol_engine_opt 2.72% : 0.003781s : 1: opt_a 0.08% : 0.000114s : 1: opt_after_cconv 0.41% : 0.000567s : 1: opt_after_jit_grad 0.15% : 0.000213s : 1: opt_b 4.69% : 0.006518s : 1: optimize 0.02% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.04% : 0.000053s : 1: pre_auto_parallel 0.01% : 0.000012s : 1: py_interpret_to_execute 0.01% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000023s : 1: remove_dup_value 0.66% : 0.000921s : 1: renormalize.infer 0.44% : 0.000615s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000027s : 1: rewriter_after_opt_a 0.23% : 0.000316s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000093s : 1: symbol_engine_optimizer 58.75% : 0.081578s : 1: task_emit 0.05% : 0.000076s : 1: tuple_transform 22.21% : 0.030839s : 1: type_inference 0.07% : 0.000094s : 1: validate TotalTime = 0.0864626, [24] [bootstrap]: 0.00050394 [type_inference]: 0.0292569 [event_method]: 2.597e-05 [auto_monad]: 8.946e-05 [graph_reusing]: 6.33e-06 [inline]: 2.58e-06 [add_attr]: 0.00417459, [1] [add_attr_with_inline]: 0.00416237, [1] [Cycle 1]: 8.389e-05, [2] [tag_attr]: 3.05e-05 [meta_addattr_fg_expand]: 6.87002e-06 [parallel-infer-symbol]: 3.75e-06 [pre_auto_parallel]: 4.531e-05 [insert-virtual-dataset]: 2.99999e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.00613851, [53] [py_interpret_to_execute]: 6.80998e-06 [rewriter_before_opt_a]: 0.00029473 [opt_a]: 0.00344488, [2] [Cycle 1]: 0.00284386, [45] [expand_dump_flag]: 3.58e-06 [switch_simplify]: 8.702e-05 [loop_unroll]: 3.274e-05 [a_1]: 0.00063148 [with_stream_mark]: 2.021e-05 [recompute_prepare]: 7.90998e-06 [updatestate_depend_eliminate]: 4.35e-06 [updatestate_assign_eliminate]: 4.1e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 2.41e-06 [a_2]: 7.399e-05 [accelerated_algorithm]: 6.53003e-06 [shard]: 2.51e-06 [meta_shard_fg_expand]: 2.46998e-06 [shard_inline]: 5.76e-06 [merge_send_recv]: 9.94001e-06 [auto_parallel]: 7.81001e-06 [parallel]: 2.173e-05 [flash_sp]: 1.028e-05 [merge_comm]: 3.59002e-06 [allreduce_fusion]: 3.46999e-06 [matmul_add_comm_reduction]: 1.03e-05 [allreduce_slice_to_reducescatter]: 8.10018e-07 [virtual_shard_identity]: 8.51002e-06 [virtual_dataset]: 6.04999e-06 [get_grad_eliminate_]: 5.57999e-06 [virtual_output]: 6.17999e-06 [merge_forward]: 3.96001e-06 [cell_reuse_recompute_pass]: 1.36998e-06 [offload_activation]: 9.93002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.363e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.104e-05 [set_forward_comm_id_for_comm_node_pass]: 3.43999e-06 [meta_fg_expand]: 3.09999e-06 [flash_sp_send_recv_attached]: 2.64999e-06 [receive_attached]: 2.81999e-06 [after_resolve]: 9.42999e-06 [a_after_grad]: 8.70001e-06 [renormalize]: 0.00136114 [add_forward_monad_depend]: 7.07002e-06 [auto_monad_grad]: 2.76999e-06 [auto_monad_eliminator]: 1.795e-05 [cse]: 4.188e-05 [a_3]: 4.798e-05 [Cycle 2]: 0.00058779, [45] [expand_dump_flag]: 1.49e-06 [switch_simplify]: 7.25e-06 [loop_unroll]: 5.89999e-06 [a_1]: 0.00010054 [with_stream_mark]: 1.439e-05 [recompute_prepare]: 5.92001e-06 [updatestate_depend_eliminate]: 3.51999e-06 [updatestate_assign_eliminate]: 3.21001e-06 [updatestate_loads_eliminate]: 2.34999e-06 [parameter_eliminate]: 1.60999e-06 [a_2]: 6.185e-05 [accelerated_algorithm]: 5.39e-06 [shard]: 1.60001e-06 [meta_shard_fg_expand]: 1.91e-06 [shard_inline]: 4.84998e-06 [merge_send_recv]: 6.01998e-06 [auto_parallel]: 5.64998e-06 [parallel]: 7.15998e-06 [flash_sp]: 4.05e-06 [merge_comm]: 3.79002e-06 [allreduce_fusion]: 3.32002e-06 [matmul_add_comm_reduction]: 7.20998e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 6.28998e-06 [virtual_dataset]: 5.15999e-06 [get_grad_eliminate_]: 4.98001e-06 [virtual_output]: 4.87998e-06 [merge_forward]: 3.06001e-06 [cell_reuse_recompute_pass]: 1.86e-06 [offload_activation]: 7.78001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.377e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 8.82e-06 [set_forward_comm_id_for_comm_node_pass]: 3.51999e-06 [meta_fg_expand]: 2.14e-06 [flash_sp_send_recv_attached]: 1.03001e-06 [receive_attached]: 1.69998e-06 [after_resolve]: 7.70998e-06 [a_after_grad]: 7.2e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.20999e-06 [auto_monad_grad]: 9.99979e-07 [auto_monad_eliminator]: 7.39002e-06 [cse]: 1.764e-05 [a_3]: 3.099e-05 [py_interpret_to_execute_after_opt_a]: 5.30999e-06 [slice_cell_reuse_recomputed_activation]: 2.07999e-06 [rewriter_after_opt_a]: 2.382e-05 [convert_after_rewriter]: 1.43002e-06 [order_py_execute_after_rewriter]: 1.37e-06 [mutable_eliminate]: 0.00076514 [opt_b]: 0.00020454, [1] [Cycle 1]: 0.00019571, [7] [b_1]: 0.00010806 [b_2]: 7.95e-06 [updatestate_depend_eliminate]: 8.53001e-06 [updatestate_assign_eliminate]: 2.73e-06 [updatestate_loads_eliminate]: 2.42001e-06 [renormalize]: 7.30011e-07 [cse]: 2.938e-05 [optimize_parallel_all_gather_comm]: 2.093e-05 [overlap_param_gather]: 2.65997e-06 [cconv]: 3.228e-05 [loop_unroll]: 0.00057704 [opt_after_cconv]: 0.00010425, [1] [Cycle 1]: 9.706e-05, [7] [c_1]: 2.528e-05 [parameter_eliminate]: 4.03999e-06 [updatestate_depend_eliminate]: 6.09001e-06 [updatestate_assign_eliminate]: 2.81e-06 [updatestate_loads_eliminate]: 2.23998e-06 [cse]: 2.26e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 4.415e-05 [tuple_transform]: 7.448e-05, [1] [Cycle 1]: 6.868e-05, [4] [d_1]: 4.194e-05 [none_parameter_eliminate]: 1.84998e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 6.06e-06 [partial_unused_args_eliminate]: 2.02999e-06 [add_recomputation]: 6.045e-05 [cse_after_recomputation]: 2.395e-05, [1] [Cycle 1]: 1.94e-05, [1] [cse]: 1.344e-05 [environ_conv]: 1.265e-05 [swap_dp_allreduce_reducescatter]: 5.45001e-06 [bias_add_comm_swap]: 3.14999e-06 [label_micro_interleaved_index]: 5.61998e-06 [label_fine_grained_interleaved_index]: 2.68e-06 [merge_cast_opt]: 1.35001e-06 [slice_recompute_activation]: 2.34001e-06 [micro_interleaved_order_control]: 3.22002e-06 [assign_add_opt]: 1.25001e-06 [ForceFp32Comm]: 1.00999e-06 [remove_cast_before_assign_add]: 1.25999e-06 [full_micro_interleaved_order_control]: 2.58e-06 [reorder_send_recv_between_fp_bp]: 2.89999e-06 [comm_op_add_attrs]: 1.10999e-06 [add_comm_op_reuse_tag]: 1.19e-06 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.09003e-06 [overlap_opt_shard_in_pipeline]: 1.24e-06 [overlap_opt_shard_grad_in_pipeline]: 1.97001e-06 [control_data_broadcast_order]: 1.415e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 3.61001e-06 [overlap_recompute_and_grad_model_parallel]: 5.06002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.57999e-06 [overlap_recompute_comm]: 2.24999e-06 [overlap_grad_ring_attention]: 4.62e-06 [overlap_grad_flash_sp]: 2.266e-05 [begin_end_overlap_inline]: 7.50006e-07 [split_matmul_comm_elemetwise]: 2.79001e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 1.29e-06 [symbol_engine_optimizer]: 8.515e-05, [1] [Cycle 1]: 8.052e-05, [6] [build]: 1.277e-05 [elim_shapecalc]: 1.015e-05 [elim_not_effective]: 1.277e-05 [opt_reshape]: 6.59999e-06 [fold_const_symbol]: 9.31e-06 [renormalize]: 1.50001e-07 [detach_backward]: 2.36e-06 [pipeline_parallel_scheduler]: 1.55001e-06 [auto_monad_reorder]: 1.789e-05 [get_jit_bprop_graph]: 2.22999e-06 [rewriter_after_jit_bprop_graph]: 5.46e-06 [opt_after_jit_grad]: 0.00054269 [validate]: 5.637e-05 [backend_pass]: 8.89995e-07 [task_emit]: 0.045314 [execute]: 1.073e-05 Sums bootstrap : 0.000504s : 0.62% type_inference : 0.029257s : 36.05% event_method : 0.000026s : 0.03% auto_monad : 0.000089s : 0.11% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000045s : 0.06% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.01% optimize.rewriter_before_opt_a : 0.000295s : 0.36% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000094s : 0.12% optimize.opt_a.loop_unroll : 0.000039s : 0.05% optimize.opt_a.a_1 : 0.000732s : 0.90% optimize.opt_a.with_stream_mark : 0.000035s : 0.04% optimize.opt_a.recompute_prepare : 0.000014s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000136s : 0.17% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.01% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.01% optimize.opt_a.merge_send_recv : 0.000016s : 0.02% optimize.opt_a.auto_parallel : 0.000013s : 0.02% optimize.opt_a.parallel : 0.000029s : 0.04% optimize.opt_a.flash_sp : 0.000014s : 0.02% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.02% optimize.opt_a.virtual_dataset : 0.000011s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.01% optimize.opt_a.virtual_output : 0.000011s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000017s : 0.02% optimize.opt_a.a_after_grad : 0.000016s : 0.02% optimize.opt_a.renormalize : 0.001361s : 1.68% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.03% optimize.opt_a.cse : 0.000060s : 0.07% optimize.opt_a.a_3 : 0.000079s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000024s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000765s : 0.94% optimize.opt_b.b_1 : 0.000108s : 0.13% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000029s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.03% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000032s : 0.04% optimize.loop_unroll : 0.000577s : 0.71% optimize.opt_after_cconv.c_1 : 0.000025s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000023s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000044s : 0.05% optimize.tuple_transform.d_1 : 0.000042s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000060s : 0.07% optimize.cse_after_recomputation.cse : 0.000013s : 0.02% optimize.environ_conv : 0.000013s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000023s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000013s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000543s : 0.67% validate : 0.000056s : 0.07% backend_pass : 0.000001s : 0.00% task_emit : 0.045314s : 55.83% execute : 0.000011s : 0.01% Time group info: ------[substitution.] 0.000223 26 0.94% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000001s : 2: substitution.fold_const_symbol 3.12% : 0.000007s : 3: substitution.graph_param_transform 79.75% : 0.000178s : 6: substitution.inline 2.29% : 0.000005s : 4: substitution.j_node_and_user_rematch 2.62% : 0.000006s : 4: substitution.remove_not_recompute_node 1.58% : 0.000004s : 2: substitution.replace_old_param 3.58% : 0.000008s : 1: substitution.switch_simplify 5.48% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029176 2 95.17% : 0.027766s : 1: type_inference.infer 4.83% : 0.001410s : 1: type_inference.specialize ------[replace.] 0.000094 9 55.17% : 0.000052s : 6: replace.inline 25.45% : 0.000024s : 1: replace.switch_simplify 19.38% : 0.000018s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000193 9 90.61% : 0.000175s : 6: match.inline 3.64% : 0.000007s : 1: match.switch_simplify 5.75% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000184 1092 0.90% : 0.000002s : 12: predicate.accumulaten_eliminater 0.88% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.43% : 0.000001s : 6: predicate.addn_check_dump 0.90% : 0.000002s : 12: predicate.addn_zero_filter 0.88% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.30% : 0.000004s : 18: predicate.arithmetic_simplify 0.94% : 0.000002s : 12: predicate.cast_eliminate 0.50% : 0.000001s : 6: predicate.check_bprop_eliminate 0.84% : 0.000002s : 6: predicate.compare_switch_simplify 0.14% : 0.000000s : 3: predicate.const_output_eliminate 0.53% : 0.000001s : 6: predicate.depend_value_elim 0.97% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.13% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 3: predicate.elim_not_effective 0.37% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.09% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.01% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.13% : 0.000002s : 15: predicate.environ_get_depend_swap 1.73% : 0.000003s : 21: predicate.environ_get_eliminate 1.08% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.50% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.61% : 0.000005s : 20: predicate.float_depend_g_call 0.41% : 0.000001s : 6: predicate.float_environ_get_switch 0.64% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 3: predicate.fold_const_symbol 0.62% : 0.000001s : 6: predicate.get_grad_eliminate 0.18% : 0.000000s : 3: predicate.graph_param_transform 0.45% : 0.000001s : 6: predicate.incorporate_call 0.41% : 0.000001s : 6: predicate.incorporate_call_switch 6.25% : 0.000011s : 50: predicate.inline 0.63% : 0.000001s : 6: predicate.inline_without_move 0.27% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.95% : 0.000002s : 6: predicate.less_batch_normalization 1.73% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.38% : 0.000004s : 32: predicate.load_eliminater 1.00% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.72% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.68% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 6: predicate.merge_addn 0.45% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.46% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.94% : 0.000002s : 12: predicate.minmaximum_grad 1.47% : 0.000003s : 3: predicate.mutable_eliminate 0.48% : 0.000001s : 3: predicate.opt_reshape 0.32% : 0.000001s : 3: predicate.parallel_virtual_node 2.19% : 0.000004s : 20: predicate.partial_defer_inline 1.36% : 0.000002s : 17: predicate.partial_eliminate 0.90% : 0.000002s : 12: predicate.print_const_string_wrapper 0.53% : 0.000001s : 6: predicate.reduce_all_const_elim 1.31% : 0.000002s : 12: predicate.reduce_eliminate 2.27% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 6: predicate.remove_not_recompute_node 1.47% : 0.000003s : 20: predicate.replace_applicator 0.48% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000001s : 3: predicate.reset_defer_inline 0.95% : 0.000002s : 12: predicate.reshape_eliminate 0.69% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 3: predicate.row_tensor_eliminate 0.90% : 0.000002s : 6: predicate.same_eliminate 0.33% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 6: predicate.shard_identity_eliminate 0.68% : 0.000001s : 6: predicate.special_op_eliminate 0.72% : 0.000001s : 6: predicate.specialize_transform 0.83% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.68% : 0.000003s : 20: predicate.switch_defer_inline 2.01% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.93% : 0.000011s : 68: predicate.switch_simplify 1.12% : 0.000002s : 12: predicate.tile_eliminate 0.88% : 0.000002s : 12: predicate.transpose_eliminate 1.50% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.71% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.03% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.53% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.97% : 0.000004s : 20: predicate.tuple_to_list_eliminator_ 2.24% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.79% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 3: predicate.value_based_eliminate 0.73% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.54% : 0.000001s : 6: predicate.virtual_output_eliminate 0.23% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001215 16 57.49% : 0.000699s : 8: func_graph_cloner_run.FuncGraphClonerGraph 42.51% : 0.000517s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.099445 196 0.00% : 0.000004s : 1: ForceFp32Comm 4.20% : 0.004181s : 1: add_attr 4.19% : 0.004166s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.07% : 0.000065s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.10% : 0.000095s : 1: auto_monad 0.02% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.54% : 0.000533s : 1: bootstrap 0.04% : 0.000036s : 1: cconv 0.01% : 0.000005s : 1: comm_op_add_attrs 0.02% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.03% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000006s : 1: detach_backward 0.02% : 0.000016s : 1: environ_conv 0.03% : 0.000032s : 1: event_method 0.02% : 0.000017s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.59% : 0.000588s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.78% : 0.000780s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000020s : 1: opt.transform.mutable_eliminate 1.16% : 0.001157s : 78: opt.transform.opt_a 0.02% : 0.000024s : 1: opt.transform.opt_after_cconv 0.03% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.09% : 0.000085s : 28: opt.transform.opt_b 0.05% : 0.000046s : 2: opt.transform.opt_trans_graph 0.03% : 0.000035s : 4: opt.transform.symbol_engine_opt 3.47% : 0.003449s : 1: opt_a 0.11% : 0.000108s : 1: opt_after_cconv 0.56% : 0.000556s : 1: opt_after_jit_grad 0.21% : 0.000208s : 1: opt_b 6.18% : 0.006144s : 1: optimize 0.02% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.03% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.05% : 0.000050s : 1: pre_auto_parallel 0.01% : 0.000011s : 1: py_interpret_to_execute 0.01% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000048s : 1: remove_dup_value 0.81% : 0.000809s : 1: renormalize.infer 0.54% : 0.000542s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000027s : 1: rewriter_after_opt_a 0.30% : 0.000302s : 1: rewriter_before_opt_a 0.02% : 0.000016s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000088s : 1: symbol_engine_optimizer 45.59% : 0.045340s : 1: task_emit 0.08% : 0.000077s : 1: tuple_transform 29.45% : 0.029288s : 1: type_inference 0.09% : 0.000094s : 1: validate [WARNING] CORE(87355,ffffbf434f30,python3.9):2026-01-29-17:52:03.457.611 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph7 TotalTime = 0.0952427, [24] [bootstrap]: 0.00071907 [type_inference]: 0.0322434 [event_method]: 2.74e-05 [auto_monad]: 8.772e-05 [graph_reusing]: 6.63e-06 [inline]: 3.46999e-06 [add_attr]: 0.00435854, [1] [add_attr_with_inline]: 0.00434493, [1] [Cycle 1]: 8.693e-05, [2] [tag_attr]: 3.001e-05 [meta_addattr_fg_expand]: 6.36e-06 [parallel-infer-symbol]: 3.87998e-06 [pre_auto_parallel]: 4.766e-05 [insert-virtual-dataset]: 2.74001e-06 [parallel-infer-symbol-second]: 9.30013e-07 [dataset_repeat_opt]: 1.99e-06 [pipeline_split]: 1.59e-06 [optimize]: 0.00621667, [53] [py_interpret_to_execute]: 8.27e-06 [rewriter_before_opt_a]: 0.00029608 [opt_a]: 0.00356763, [2] [Cycle 1]: 0.00294079, [45] [expand_dump_flag]: 3.46001e-06 [switch_simplify]: 9.22e-05 [loop_unroll]: 3.274e-05 [a_1]: 0.00067587 [with_stream_mark]: 2.155e-05 [recompute_prepare]: 9.60001e-06 [updatestate_depend_eliminate]: 4.32998e-06 [updatestate_assign_eliminate]: 3.55e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 2.92002e-06 [a_2]: 7.284e-05 [accelerated_algorithm]: 7.11001e-06 [shard]: 2.10002e-06 [meta_shard_fg_expand]: 2.02999e-06 [shard_inline]: 5.67001e-06 [merge_send_recv]: 9.69e-06 [auto_parallel]: 8.38999e-06 [parallel]: 2.265e-05 [flash_sp]: 1.033e-05 [merge_comm]: 3.95e-06 [allreduce_fusion]: 3.47002e-06 [matmul_add_comm_reduction]: 1.061e-05 [allreduce_slice_to_reducescatter]: 9.30013e-07 [virtual_shard_identity]: 8.06001e-06 [virtual_dataset]: 5.63997e-06 [get_grad_eliminate_]: 5.84999e-06 [virtual_output]: 5.58002e-06 [merge_forward]: 4.59998e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 1.051e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.468e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.046e-05 [set_forward_comm_id_for_comm_node_pass]: 4.25999e-06 [meta_fg_expand]: 3.13998e-06 [flash_sp_send_recv_attached]: 2.82002e-06 [receive_attached]: 2.76e-06 [after_resolve]: 1.067e-05 [a_after_grad]: 9.06998e-06 [renormalize]: 0.0014411 [add_forward_monad_depend]: 8.57e-06 [auto_monad_grad]: 2.49999e-06 [auto_monad_eliminator]: 1.956e-05 [cse]: 4.111e-05 [a_3]: 5.092e-05 [Cycle 2]: 0.0006131, [45] [expand_dump_flag]: 2.27999e-06 [switch_simplify]: 7.79002e-06 [loop_unroll]: 6.73e-06 [a_1]: 0.00010305 [with_stream_mark]: 1.659e-05 [recompute_prepare]: 5.80002e-06 [updatestate_depend_eliminate]: 3.31001e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 2.53e-06 [parameter_eliminate]: 1.30999e-06 [a_2]: 6.336e-05 [accelerated_algorithm]: 5.67999e-06 [shard]: 1.50999e-06 [meta_shard_fg_expand]: 2.40002e-06 [shard_inline]: 5.15999e-06 [merge_send_recv]: 6.58998e-06 [auto_parallel]: 8.03001e-06 [parallel]: 7.85e-06 [flash_sp]: 3.68999e-06 [merge_comm]: 3.33e-06 [allreduce_fusion]: 3.46999e-06 [matmul_add_comm_reduction]: 7.63999e-06 [allreduce_slice_to_reducescatter]: 8.00006e-07 [virtual_shard_identity]: 6.79001e-06 [virtual_dataset]: 5.23002e-06 [get_grad_eliminate_]: 5.05001e-06 [virtual_output]: 5.01002e-06 [merge_forward]: 3.60998e-06 [cell_reuse_recompute_pass]: 2.21e-06 [offload_activation]: 8.87e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.622e-05 [merge_recompute_call_nodes]: 9.50007e-07 [before_grad]: 8.82e-06 [set_forward_comm_id_for_comm_node_pass]: 3.55e-06 [meta_fg_expand]: 2.52001e-06 [flash_sp_send_recv_attached]: 1.76998e-06 [receive_attached]: 2.26998e-06 [after_resolve]: 1.014e-05 [a_after_grad]: 8.57e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.73002e-06 [auto_monad_grad]: 9.99979e-07 [auto_monad_eliminator]: 5.97001e-06 [cse]: 1.741e-05 [a_3]: 3.208e-05 [py_interpret_to_execute_after_opt_a]: 7.78001e-06 [slice_cell_reuse_recomputed_activation]: 2.10002e-06 [rewriter_after_opt_a]: 2.277e-05 [convert_after_rewriter]: 1.25001e-06 [order_py_execute_after_rewriter]: 1.10001e-06 [mutable_eliminate]: 0.00075695 [opt_b]: 0.00022472, [1] [Cycle 1]: 0.00021678, [7] [b_1]: 0.00012449 [b_2]: 8.18999e-06 [updatestate_depend_eliminate]: 7.60998e-06 [updatestate_assign_eliminate]: 2.76e-06 [updatestate_loads_eliminate]: 2.56998e-06 [renormalize]: 8.89995e-07 [cse]: 3.32e-05 [optimize_parallel_all_gather_comm]: 2.204e-05 [overlap_param_gather]: 2.13002e-06 [cconv]: 3.547e-05 [loop_unroll]: 0.0005086 [opt_after_cconv]: 0.00010633, [1] [Cycle 1]: 0.00010008, [7] [c_1]: 2.54e-05 [parameter_eliminate]: 4.52998e-06 [updatestate_depend_eliminate]: 5.10001e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.51998e-06 [cse]: 2.404e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 4.52e-05 [tuple_transform]: 7.374e-05, [1] [Cycle 1]: 6.895e-05, [4] [d_1]: 4.053e-05 [none_parameter_eliminate]: 2.16003e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 7.23e-06 [partial_unused_args_eliminate]: 1.72999e-06 [add_recomputation]: 4.963e-05 [cse_after_recomputation]: 2.292e-05, [1] [Cycle 1]: 1.813e-05, [1] [cse]: 1.292e-05 [environ_conv]: 1.145e-05 [swap_dp_allreduce_reducescatter]: 5.32001e-06 [bias_add_comm_swap]: 3.71999e-06 [label_micro_interleaved_index]: 5.99999e-06 [label_fine_grained_interleaved_index]: 2.95002e-06 [merge_cast_opt]: 1.64e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.96999e-06 [assign_add_opt]: 1.67001e-06 [ForceFp32Comm]: 9.70002e-07 [remove_cast_before_assign_add]: 1.28002e-06 [full_micro_interleaved_order_control]: 2.51998e-06 [reorder_send_recv_between_fp_bp]: 2.96001e-06 [comm_op_add_attrs]: 1.19e-06 [add_comm_op_reuse_tag]: 1.07e-06 [interleave_split_concat_branches]: 1.18001e-06 [interleave_parallel_branches]: 1.14e-06 [overlap_opt_shard_in_pipeline]: 1.54998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.16e-06 [control_data_broadcast_order]: 1.463e-05 [grouped_pairwise_exchange_alltoall]: 1.77001e-06 [offloading_packed_experts]: 3.90998e-06 [overlap_recompute_and_grad_model_parallel]: 5.52999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.34001e-06 [overlap_grad_ring_attention]: 4.42e-06 [overlap_grad_flash_sp]: 2.419e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 2.38002e-06 [split_layernorm_comm]: 1.82999e-06 [handle_group_info]: 9.89996e-07 [symbol_engine_optimizer]: 9.138e-05, [1] [Cycle 1]: 8.689e-05, [6] [build]: 1.427e-05 [elim_shapecalc]: 1.088e-05 [elim_not_effective]: 1.295e-05 [opt_reshape]: 6.52001e-06 [fold_const_symbol]: 9.65002e-06 [renormalize]: 2.00002e-07 [detach_backward]: 2.68e-06 [pipeline_parallel_scheduler]: 1.84e-06 [auto_monad_reorder]: 1.869e-05 [get_jit_bprop_graph]: 2.98e-06 [rewriter_after_jit_bprop_graph]: 6.63e-06 [opt_after_jit_grad]: 0.00055557 [validate]: 5.542e-05 [backend_pass]: 9.09989e-07 [task_emit]: 0.0505775 [execute]: 1.148e-05 Sums bootstrap : 0.000719s : 0.80% type_inference : 0.032243s : 35.92% event_method : 0.000027s : 0.03% auto_monad : 0.000088s : 0.10% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000030s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000048s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.01% optimize.rewriter_before_opt_a : 0.000296s : 0.33% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000100s : 0.11% optimize.opt_a.loop_unroll : 0.000039s : 0.04% optimize.opt_a.a_1 : 0.000779s : 0.87% optimize.opt_a.with_stream_mark : 0.000038s : 0.04% optimize.opt_a.recompute_prepare : 0.000015s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000136s : 0.15% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.01% optimize.opt_a.merge_send_recv : 0.000016s : 0.02% optimize.opt_a.auto_parallel : 0.000016s : 0.02% optimize.opt_a.parallel : 0.000030s : 0.03% optimize.opt_a.flash_sp : 0.000014s : 0.02% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.02% optimize.opt_a.virtual_dataset : 0.000011s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.01% optimize.opt_a.virtual_output : 0.000011s : 0.01% optimize.opt_a.merge_forward : 0.000008s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000019s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000021s : 0.02% optimize.opt_a.a_after_grad : 0.000018s : 0.02% optimize.opt_a.renormalize : 0.001441s : 1.61% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.03% optimize.opt_a.cse : 0.000059s : 0.07% optimize.opt_a.a_3 : 0.000083s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000023s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000757s : 0.84% optimize.opt_b.b_1 : 0.000124s : 0.14% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000033s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000035s : 0.04% optimize.loop_unroll : 0.000509s : 0.57% optimize.opt_after_cconv.c_1 : 0.000025s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000045s : 0.05% optimize.tuple_transform.d_1 : 0.000041s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000050s : 0.06% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000024s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000014s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.02% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000556s : 0.62% validate : 0.000055s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.050578s : 56.35% execute : 0.000011s : 0.01% Time group info: ------[substitution.] 0.000252 26 0.80% : 0.000002s : 2: substitution.elim_not_effective 0.57% : 0.000001s : 2: substitution.fold_const_symbol 2.50% : 0.000006s : 3: substitution.graph_param_transform 81.05% : 0.000204s : 6: substitution.inline 1.69% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.63% : 0.000007s : 4: substitution.remove_not_recompute_node 2.00% : 0.000005s : 2: substitution.replace_old_param 3.40% : 0.000009s : 1: substitution.switch_simplify 5.36% : 0.000014s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.032159 2 95.24% : 0.030629s : 1: type_inference.infer 4.76% : 0.001531s : 1: type_inference.specialize ------[replace.] 0.000099 9 56.18% : 0.000055s : 6: replace.inline 25.36% : 0.000025s : 1: replace.switch_simplify 18.46% : 0.000018s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000220 9 90.87% : 0.000200s : 6: match.inline 3.54% : 0.000008s : 1: match.switch_simplify 5.59% : 0.000012s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000190 1092 0.87% : 0.000002s : 12: predicate.accumulaten_eliminater 0.91% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 6: predicate.addn_check_dump 0.99% : 0.000002s : 12: predicate.addn_zero_filter 0.80% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.53% : 0.000005s : 18: predicate.arithmetic_simplify 1.19% : 0.000002s : 12: predicate.cast_eliminate 0.51% : 0.000001s : 6: predicate.check_bprop_eliminate 0.50% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.46% : 0.000001s : 6: predicate.depend_value_elim 0.86% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.17% : 0.000002s : 12: predicate.dict_get_item_eliminator 1.03% : 0.000002s : 12: predicate.dict_set_item_eliminator 1.23% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 3: predicate.elim_not_effective 0.48% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 15: predicate.environ_get_depend_swap 1.63% : 0.000003s : 21: predicate.environ_get_eliminate 1.16% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.45% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.70% : 0.000005s : 20: predicate.float_depend_g_call 0.45% : 0.000001s : 6: predicate.float_environ_get_switch 0.69% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 3: predicate.fold_const_symbol 0.54% : 0.000001s : 6: predicate.get_grad_eliminate 0.27% : 0.000001s : 3: predicate.graph_param_transform 0.47% : 0.000001s : 6: predicate.incorporate_call 0.43% : 0.000001s : 6: predicate.incorporate_call_switch 5.63% : 0.000011s : 50: predicate.inline 0.89% : 0.000002s : 6: predicate.inline_without_move 0.23% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.00% : 0.000002s : 6: predicate.less_batch_normalization 1.79% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.52% : 0.000005s : 32: predicate.load_eliminater 0.97% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.96% : 0.000006s : 37: predicate.loop_unroll_before_grad 1.62% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 6: predicate.merge_addn 0.43% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.47% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.84% : 0.000002s : 12: predicate.minmaximum_grad 1.32% : 0.000003s : 3: predicate.mutable_eliminate 0.45% : 0.000001s : 3: predicate.opt_reshape 0.32% : 0.000001s : 3: predicate.parallel_virtual_node 2.15% : 0.000004s : 20: predicate.partial_defer_inline 1.31% : 0.000002s : 17: predicate.partial_eliminate 0.95% : 0.000002s : 12: predicate.print_const_string_wrapper 0.52% : 0.000001s : 6: predicate.reduce_all_const_elim 1.38% : 0.000003s : 12: predicate.reduce_eliminate 2.34% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 6: predicate.remove_not_recompute_node 1.31% : 0.000002s : 20: predicate.replace_applicator 0.58% : 0.000001s : 6: predicate.replace_old_param 0.44% : 0.000001s : 3: predicate.reset_defer_inline 1.13% : 0.000002s : 12: predicate.reshape_eliminate 0.54% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 3: predicate.row_tensor_eliminate 0.77% : 0.000001s : 6: predicate.same_eliminate 0.45% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.62% : 0.000001s : 6: predicate.shard_identity_eliminate 0.84% : 0.000002s : 6: predicate.special_op_eliminate 0.55% : 0.000001s : 6: predicate.specialize_transform 1.09% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.61% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.58% : 0.000003s : 20: predicate.switch_defer_inline 2.26% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.94% : 0.000011s : 68: predicate.switch_simplify 0.96% : 0.000002s : 12: predicate.tile_eliminate 1.11% : 0.000002s : 12: predicate.transpose_eliminate 1.43% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.43% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.53% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.20% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.79% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 3: predicate.value_based_eliminate 0.52% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.50% : 0.000001s : 6: predicate.virtual_output_eliminate 0.24% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.39% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002062 16 73.26% : 0.001511s : 8: func_graph_cloner_run.FuncGraphClonerGraph 26.74% : 0.000551s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.108617 196 0.00% : 0.000004s : 1: ForceFp32Comm 4.02% : 0.004365s : 1: add_attr 4.00% : 0.004349s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000054s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.09% : 0.000094s : 1: auto_monad 0.02% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.70% : 0.000761s : 1: bootstrap 0.04% : 0.000039s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000007s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.03% : 0.000035s : 1: event_method 0.02% : 0.000020s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000007s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.48% : 0.000519s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.71% : 0.000770s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000019s : 1: opt.transform.mutable_eliminate 1.13% : 0.001222s : 78: opt.transform.opt_a 0.02% : 0.000024s : 1: opt.transform.opt_after_cconv 0.02% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.09% : 0.000102s : 28: opt.transform.opt_b 0.04% : 0.000045s : 2: opt.transform.opt_trans_graph 0.03% : 0.000036s : 4: opt.transform.symbol_engine_opt 3.29% : 0.003571s : 1: opt_a 0.10% : 0.000110s : 1: opt_after_cconv 0.52% : 0.000568s : 1: opt_after_jit_grad 0.21% : 0.000229s : 1: opt_b 5.73% : 0.006222s : 1: optimize 0.02% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.03% : 0.000028s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.05% : 0.000052s : 1: pre_auto_parallel 0.01% : 0.000012s : 1: py_interpret_to_execute 0.01% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000050s : 1: remove_dup_value 0.79% : 0.000853s : 1: renormalize.infer 0.53% : 0.000577s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000027s : 1: rewriter_after_opt_a 0.28% : 0.000304s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000094s : 1: symbol_engine_optimizer 46.59% : 0.050608s : 1: task_emit 0.07% : 0.000077s : 1: tuple_transform 29.72% : 0.032277s : 1: type_inference 0.09% : 0.000093s : 1: validate [WARNING] CORE(87352,ffffbf434f30,python3.9):2026-01-29-17:52:03.653.370 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph3 [WARNING] CORE(87365,ffffbf434f30,python3.9):2026-01-29-17:52:03.683.205 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph4 TotalTime = 0.091171, [24] [bootstrap]: 0.00052476 [type_inference]: 0.0294634 [event_method]: 2.544e-05 [auto_monad]: 9.005e-05 [graph_reusing]: 7.06001e-06 [inline]: 3.28e-06 [add_attr]: 0.00383632, [1] [add_attr_with_inline]: 0.00382469, [1] [Cycle 1]: 7.584e-05, [2] [tag_attr]: 2.602e-05 [meta_addattr_fg_expand]: 5.94999e-06 [parallel-infer-symbol]: 4.3e-06 [pre_auto_parallel]: 4.44e-05 [insert-virtual-dataset]: 3.06001e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.80001e-06 [optimize]: 0.01161, [53] [py_interpret_to_execute]: 7.18e-06 [rewriter_before_opt_a]: 0.00028177 [opt_a]: 0.00899739, [2] [Cycle 1]: 0.00835741, [45] [expand_dump_flag]: 3.69002e-06 [switch_simplify]: 0.00534676 [loop_unroll]: 4.698e-05 [a_1]: 0.00070364 [with_stream_mark]: 2.706e-05 [recompute_prepare]: 8.64e-06 [updatestate_depend_eliminate]: 5.02e-06 [updatestate_assign_eliminate]: 3.60998e-06 [updatestate_loads_eliminate]: 2.96999e-06 [parameter_eliminate]: 3.2e-06 [a_2]: 7.26e-05 [accelerated_algorithm]: 7.16001e-06 [shard]: 2.76e-06 [meta_shard_fg_expand]: 3.06001e-06 [shard_inline]: 5.64e-06 [merge_send_recv]: 1.037e-05 [auto_parallel]: 1.141e-05 [parallel]: 2.322e-05 [flash_sp]: 1.219e-05 [merge_comm]: 3.56999e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 1.024e-05 [allreduce_slice_to_reducescatter]: 1.13001e-06 [virtual_shard_identity]: 8.89998e-06 [virtual_dataset]: 6.12001e-06 [get_grad_eliminate_]: 6.19001e-06 [virtual_output]: 5.59998e-06 [merge_forward]: 4.27e-06 [cell_reuse_recompute_pass]: 1.51002e-06 [offload_activation]: 1.095e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.35e-05 [merge_recompute_call_nodes]: 1.57999e-06 [before_grad]: 1.06e-05 [set_forward_comm_id_for_comm_node_pass]: 3.66001e-06 [meta_fg_expand]: 2.78998e-06 [flash_sp_send_recv_attached]: 2.74999e-06 [receive_attached]: 2.75002e-06 [after_resolve]: 1.09e-05 [a_after_grad]: 8.1e-06 [renormalize]: 0.00148985 [add_forward_monad_depend]: 8.78001e-06 [auto_monad_grad]: 3.06001e-06 [auto_monad_eliminator]: 2.054e-05 [cse]: 4.322e-05 [a_3]: 5.133e-05 [Cycle 2]: 0.00062478, [45] [expand_dump_flag]: 2.77002e-06 [switch_simplify]: 8.10999e-06 [loop_unroll]: 5.67999e-06 [a_1]: 0.00010067 [with_stream_mark]: 1.925e-05 [recompute_prepare]: 6.09999e-06 [updatestate_depend_eliminate]: 3.89002e-06 [updatestate_assign_eliminate]: 3.51001e-06 [updatestate_loads_eliminate]: 2.63998e-06 [parameter_eliminate]: 1.17e-06 [a_2]: 6.291e-05 [accelerated_algorithm]: 5.77001e-06 [shard]: 1.66e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 5.04e-06 [merge_send_recv]: 6.55002e-06 [auto_parallel]: 8.82e-06 [parallel]: 8.05e-06 [flash_sp]: 4.14002e-06 [merge_comm]: 4.23001e-06 [allreduce_fusion]: 3.30998e-06 [matmul_add_comm_reduction]: 8.15e-06 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 6.58998e-06 [virtual_dataset]: 5.68002e-06 [get_grad_eliminate_]: 4.80999e-06 [virtual_output]: 5.54e-06 [merge_forward]: 4.59998e-06 [cell_reuse_recompute_pass]: 2.49001e-06 [offload_activation]: 9.47999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.518e-05 [merge_recompute_call_nodes]: 9.5999e-07 [before_grad]: 9.86e-06 [set_forward_comm_id_for_comm_node_pass]: 3.26001e-06 [meta_fg_expand]: 2.29001e-06 [flash_sp_send_recv_attached]: 1.57001e-06 [receive_attached]: 2.16003e-06 [after_resolve]: 9.61e-06 [a_after_grad]: 7.31001e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.50001e-06 [auto_monad_grad]: 1.94999e-06 [auto_monad_eliminator]: 7.71001e-06 [cse]: 1.87e-05 [a_3]: 3.17e-05 [py_interpret_to_execute_after_opt_a]: 9.61e-06 [slice_cell_reuse_recomputed_activation]: 2.12999e-06 [rewriter_after_opt_a]: 2.392e-05 [convert_after_rewriter]: 1.35001e-06 [order_py_execute_after_rewriter]: 1.39998e-06 [mutable_eliminate]: 0.00085224 [opt_b]: 0.00020043, [1] [Cycle 1]: 0.00019148, [7] [b_1]: 0.0001051 [b_2]: 6.88e-06 [updatestate_depend_eliminate]: 8.38001e-06 [updatestate_assign_eliminate]: 2.64001e-06 [updatestate_loads_eliminate]: 2.22001e-06 [renormalize]: 7.79983e-07 [cse]: 3.117e-05 [optimize_parallel_all_gather_comm]: 2.032e-05 [overlap_param_gather]: 2.04999e-06 [cconv]: 3.285e-05 [loop_unroll]: 0.00047843 [opt_after_cconv]: 0.00010233, [1] [Cycle 1]: 9.539e-05, [7] [c_1]: 2.468e-05 [parameter_eliminate]: 4.48001e-06 [updatestate_depend_eliminate]: 5.72999e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.24001e-06 [cse]: 2.157e-05 [renormalize]: 6.59988e-07 [remove_dup_value]: 1.76e-05 [tuple_transform]: 6.773e-05, [1] [Cycle 1]: 6.333e-05, [4] [d_1]: 3.731e-05 [none_parameter_eliminate]: 1.69998e-06 [renormalize]: 1.40019e-07 [switch_simplify]: 6.32001e-06 [partial_unused_args_eliminate]: 2.27001e-06 [add_recomputation]: 5.234e-05 [cse_after_recomputation]: 2.242e-05, [1] [Cycle 1]: 1.824e-05, [1] [cse]: 1.324e-05 [environ_conv]: 1.067e-05 [swap_dp_allreduce_reducescatter]: 5.59998e-06 [bias_add_comm_swap]: 4.55001e-06 [label_micro_interleaved_index]: 5.54998e-06 [label_fine_grained_interleaved_index]: 2.78e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 2.31e-06 [micro_interleaved_order_control]: 2.39999e-06 [assign_add_opt]: 1.62999e-06 [ForceFp32Comm]: 1.04e-06 [remove_cast_before_assign_add]: 1.36002e-06 [full_micro_interleaved_order_control]: 2.37999e-06 [reorder_send_recv_between_fp_bp]: 2.97002e-06 [comm_op_add_attrs]: 1.14e-06 [add_comm_op_reuse_tag]: 1.32e-06 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.18001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92001e-06 [control_data_broadcast_order]: 1.304e-05 [grouped_pairwise_exchange_alltoall]: 1.82999e-06 [offloading_packed_experts]: 3.91999e-06 [overlap_recompute_and_grad_model_parallel]: 5.85002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42999e-06 [overlap_recompute_comm]: 2.82002e-06 [overlap_grad_ring_attention]: 4.38999e-06 [overlap_grad_flash_sp]: 2.239e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.01e-06 [split_layernorm_comm]: 1.89e-06 [handle_group_info]: 1.17e-06 [symbol_engine_optimizer]: 8.037e-05, [1] [Cycle 1]: 7.56e-05, [6] [build]: 1.239e-05 [elim_shapecalc]: 8.1e-06 [elim_not_effective]: 1.23e-05 [opt_reshape]: 6.23e-06 [fold_const_symbol]: 8.77e-06 [renormalize]: 1.50001e-07 [detach_backward]: 2.34001e-06 [pipeline_parallel_scheduler]: 1.51002e-06 [auto_monad_reorder]: 1.944e-05 [get_jit_bprop_graph]: 1.89999e-06 [rewriter_after_jit_bprop_graph]: 5.64e-06 [opt_after_jit_grad]: 0.00049991 [validate]: 5.354e-05 [backend_pass]: 9.79984e-07 [task_emit]: 0.0446904 [execute]: 1.1e-05 Sums bootstrap : 0.000525s : 0.61% type_inference : 0.029463s : 34.19% event_method : 0.000025s : 0.03% auto_monad : 0.000090s : 0.10% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000044s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.01% optimize.rewriter_before_opt_a : 0.000282s : 0.33% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.005355s : 6.21% optimize.opt_a.loop_unroll : 0.000053s : 0.06% optimize.opt_a.a_1 : 0.000804s : 0.93% optimize.opt_a.with_stream_mark : 0.000046s : 0.05% optimize.opt_a.recompute_prepare : 0.000015s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.01% optimize.opt_a.a_2 : 0.000136s : 0.16% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.02% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.01% optimize.opt_a.merge_send_recv : 0.000017s : 0.02% optimize.opt_a.auto_parallel : 0.000020s : 0.02% optimize.opt_a.parallel : 0.000031s : 0.04% optimize.opt_a.flash_sp : 0.000016s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000018s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.02% optimize.opt_a.virtual_dataset : 0.000012s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.01% optimize.opt_a.virtual_output : 0.000011s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000020s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000021s : 0.02% optimize.opt_a.a_after_grad : 0.000015s : 0.02% optimize.opt_a.renormalize : 0.001490s : 1.73% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.03% optimize.opt_a.cse : 0.000062s : 0.07% optimize.opt_a.a_3 : 0.000083s : 0.10% optimize.py_interpret_to_execute_after_opt_a : 0.000010s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000024s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000852s : 0.99% optimize.opt_b.b_1 : 0.000105s : 0.12% optimize.opt_b.b_2 : 0.000007s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000031s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000033s : 0.04% optimize.loop_unroll : 0.000478s : 0.56% optimize.opt_after_cconv.c_1 : 0.000025s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000004s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000018s : 0.02% optimize.tuple_transform.d_1 : 0.000037s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000052s : 0.06% optimize.cse_after_recomputation.cse : 0.000013s : 0.02% optimize.environ_conv : 0.000011s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000005s : 0.01% optimize.label_micro_interleaved_index : 0.000006s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000022s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000008s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000500s : 0.58% validate : 0.000054s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.044690s : 51.86% execute : 0.000011s : 0.01% Time group info: ------[substitution.] 0.005449 26 0.03% : 0.000002s : 2: substitution.elim_not_effective 0.02% : 0.000001s : 2: substitution.fold_const_symbol 0.11% : 0.000006s : 3: substitution.graph_param_transform 3.88% : 0.000212s : 6: substitution.inline 0.09% : 0.000005s : 4: substitution.j_node_and_user_rematch 0.12% : 0.000007s : 4: substitution.remove_not_recompute_node 0.09% : 0.000005s : 2: substitution.replace_old_param 95.36% : 0.005196s : 1: substitution.switch_simplify 0.29% : 0.000016s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.029377 2 95.37% : 0.028017s : 1: type_inference.infer 4.63% : 0.001360s : 1: type_inference.specialize ------[replace.] 0.000135 9 40.54% : 0.000055s : 6: replace.inline 45.80% : 0.000062s : 1: replace.switch_simplify 13.66% : 0.000018s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000256 9 81.41% : 0.000208s : 6: match.inline 12.99% : 0.000033s : 1: match.switch_simplify 5.60% : 0.000014s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000202 1092 1.15% : 0.000002s : 12: predicate.accumulaten_eliminater 0.73% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.50% : 0.000001s : 6: predicate.addn_check_dump 1.30% : 0.000003s : 12: predicate.addn_zero_filter 0.86% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.58% : 0.000005s : 18: predicate.arithmetic_simplify 1.14% : 0.000002s : 12: predicate.cast_eliminate 0.49% : 0.000001s : 6: predicate.check_bprop_eliminate 0.49% : 0.000001s : 6: predicate.compare_switch_simplify 0.14% : 0.000000s : 3: predicate.const_output_eliminate 0.64% : 0.000001s : 6: predicate.depend_value_elim 0.88% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 12: predicate.dict_get_item_eliminator 1.07% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.99% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 3: predicate.elim_not_effective 0.30% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.04% : 0.000002s : 15: predicate.environ_add_const_eliminate 0.98% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.00% : 0.000002s : 15: predicate.environ_get_depend_swap 1.63% : 0.000003s : 21: predicate.environ_get_eliminate 1.10% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.48% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.94% : 0.000006s : 20: predicate.float_depend_g_call 0.45% : 0.000001s : 6: predicate.float_environ_get_switch 0.83% : 0.000002s : 9: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 3: predicate.fold_const_symbol 0.56% : 0.000001s : 6: predicate.get_grad_eliminate 0.16% : 0.000000s : 3: predicate.graph_param_transform 0.42% : 0.000001s : 6: predicate.incorporate_call 0.39% : 0.000001s : 6: predicate.incorporate_call_switch 5.94% : 0.000012s : 50: predicate.inline 0.52% : 0.000001s : 6: predicate.inline_without_move 0.23% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.33% : 0.000003s : 6: predicate.less_batch_normalization 1.90% : 0.000004s : 20: predicate.list_to_tuple_eliminator_ 2.32% : 0.000005s : 32: predicate.load_eliminater 1.28% : 0.000003s : 3: predicate.loop_unroll_after_grad 3.64% : 0.000007s : 37: predicate.loop_unroll_before_grad 1.53% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.61% : 0.000001s : 6: predicate.merge_addn 0.44% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.45% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 12: predicate.minmaximum_grad 1.99% : 0.000004s : 3: predicate.mutable_eliminate 0.33% : 0.000001s : 3: predicate.opt_reshape 0.29% : 0.000001s : 3: predicate.parallel_virtual_node 2.18% : 0.000004s : 20: predicate.partial_defer_inline 1.28% : 0.000003s : 17: predicate.partial_eliminate 0.94% : 0.000002s : 12: predicate.print_const_string_wrapper 0.49% : 0.000001s : 6: predicate.reduce_all_const_elim 1.17% : 0.000002s : 12: predicate.reduce_eliminate 2.35% : 0.000005s : 32: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000001s : 6: predicate.remove_not_recompute_node 1.19% : 0.000002s : 20: predicate.replace_applicator 0.43% : 0.000001s : 6: predicate.replace_old_param 0.20% : 0.000000s : 3: predicate.reset_defer_inline 1.14% : 0.000002s : 12: predicate.reshape_eliminate 0.52% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 3: predicate.row_tensor_eliminate 0.83% : 0.000002s : 6: predicate.same_eliminate 0.34% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 6: predicate.shard_identity_eliminate 0.59% : 0.000001s : 6: predicate.special_op_eliminate 0.51% : 0.000001s : 6: predicate.specialize_transform 0.76% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.66% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.65% : 0.000003s : 20: predicate.switch_defer_inline 2.19% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.32% : 0.000013s : 68: predicate.switch_simplify 0.91% : 0.000002s : 12: predicate.tile_eliminate 1.03% : 0.000002s : 12: predicate.transpose_eliminate 1.41% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.27% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 2.78% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.33% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.00% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.87% : 0.000004s : 20: predicate.tuple_to_list_eliminator_ 2.21% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.64% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 3: predicate.value_based_eliminate 0.73% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.51% : 0.000001s : 6: predicate.virtual_output_eliminate 0.23% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001297 16 61.54% : 0.000798s : 8: func_graph_cloner_run.FuncGraphClonerGraph 38.46% : 0.000499s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.114738 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.35% : 0.003841s : 1: add_attr 3.34% : 0.003828s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000057s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.08% : 0.000097s : 1: auto_monad 0.02% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.49% : 0.000567s : 1: bootstrap 0.03% : 0.000037s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.03% : 0.000033s : 1: event_method 0.02% : 0.000019s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.43% : 0.000489s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.76% : 0.000867s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000021s : 1: opt.transform.mutable_eliminate 5.67% : 0.006504s : 78: opt.transform.opt_a 0.02% : 0.000024s : 1: opt.transform.opt_after_cconv 0.02% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.07% : 0.000084s : 28: opt.transform.opt_b 0.04% : 0.000041s : 2: opt.transform.opt_trans_graph 0.03% : 0.000032s : 4: opt.transform.symbol_engine_opt 7.84% : 0.009001s : 1: opt_a 0.09% : 0.000106s : 1: opt_after_cconv 0.44% : 0.000510s : 1: opt_after_jit_grad 0.18% : 0.000204s : 1: opt_b 10.12% : 0.011615s : 1: optimize 0.02% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000005s : 1: order_py_execute_after_rewriter 0.02% : 0.000025s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000049s : 1: pre_auto_parallel 0.01% : 0.000011s : 1: py_interpret_to_execute 0.01% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000007s : 1: remove_cast_before_assign_add 0.02% : 0.000021s : 1: remove_dup_value 0.78% : 0.000892s : 1: renormalize.infer 0.51% : 0.000586s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000027s : 1: rewriter_after_opt_a 0.25% : 0.000290s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000083s : 1: symbol_engine_optimizer 38.97% : 0.044718s : 1: task_emit 0.06% : 0.000071s : 1: tuple_transform 25.71% : 0.029496s : 1: type_inference 0.08% : 0.000088s : 1: validate TotalTime = 0.135843, [24] [bootstrap]: 0.0006992 [type_inference]: 0.0328193 [event_method]: 2.782e-05 [auto_monad]: 8.948e-05 [graph_reusing]: 6.96001e-06 [inline]: 3.63999e-06 [add_attr]: 0.00479027, [1] [add_attr_with_inline]: 0.00477688, [1] [Cycle 1]: 7.972e-05, [2] [tag_attr]: 2.899e-05 [meta_addattr_fg_expand]: 6.19999e-06 [parallel-infer-symbol]: 3.98999e-06 [pre_auto_parallel]: 4.765e-05 [insert-virtual-dataset]: 2.94999e-06 [parallel-infer-symbol-second]: 8.29983e-07 [dataset_repeat_opt]: 2.02999e-06 [pipeline_split]: 1.96998e-06 [optimize]: 0.0165192, [53] [py_interpret_to_execute]: 7.55e-06 [rewriter_before_opt_a]: 0.00029636 [opt_a]: 0.0116087, [2] [Cycle 1]: 0.0107502, [45] [expand_dump_flag]: 3.65e-06 [switch_simplify]: 9.011e-05 [loop_unroll]: 3.201e-05 [a_1]: 0.00064017 [with_stream_mark]: 2.02e-05 [recompute_prepare]: 8.75999e-06 [updatestate_depend_eliminate]: 4.37e-06 [updatestate_assign_eliminate]: 3.61999e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 2.73e-06 [a_2]: 7.197e-05 [accelerated_algorithm]: 6.56999e-06 [shard]: 2.53e-06 [meta_shard_fg_expand]: 2.41e-06 [shard_inline]: 5.82999e-06 [merge_send_recv]: 9.87001e-06 [auto_parallel]: 6.89999e-06 [parallel]: 2.272e-05 [flash_sp]: 1.017e-05 [merge_comm]: 4.08999e-06 [allreduce_fusion]: 3.66001e-06 [matmul_add_comm_reduction]: 1.022e-05 [allreduce_slice_to_reducescatter]: 1.02e-06 [virtual_shard_identity]: 7.57002e-06 [virtual_dataset]: 6.20002e-06 [get_grad_eliminate_]: 5.59e-06 [virtual_output]: 5.51998e-06 [merge_forward]: 4.44002e-06 [cell_reuse_recompute_pass]: 2.46e-06 [offload_activation]: 1.081e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.384e-05 [merge_recompute_call_nodes]: 1.97999e-06 [before_grad]: 9.94999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.49001e-06 [meta_fg_expand]: 3.09001e-06 [flash_sp_send_recv_attached]: 2.53003e-06 [receive_attached]: 2.86999e-06 [after_resolve]: 1.011e-05 [a_after_grad]: 8.93002e-06 [renormalize]: 0.00148511 [add_forward_monad_depend]: 9.14998e-06 [auto_monad_grad]: 2.48002e-06 [auto_monad_eliminator]: 1.873e-05 [cse]: 4.118e-05 [a_3]: 0.00783491 [Cycle 2]: 0.00084179, [45] [expand_dump_flag]: 5.34998e-06 [switch_simplify]: 1.482e-05 [loop_unroll]: 8.30999e-06 [a_1]: 0.00014244 [with_stream_mark]: 4.219e-05 [recompute_prepare]: 7.08998e-06 [updatestate_depend_eliminate]: 5.67001e-06 [updatestate_assign_eliminate]: 4.05e-06 [updatestate_loads_eliminate]: 3.28e-06 [parameter_eliminate]: 2.23998e-06 [a_2]: 6.707e-05 [accelerated_algorithm]: 6.71e-06 [shard]: 2.78998e-06 [meta_shard_fg_expand]: 3.71001e-06 [shard_inline]: 5.45001e-06 [merge_send_recv]: 1.127e-05 [auto_parallel]: 1.076e-05 [parallel]: 1.326e-05 [flash_sp]: 3.98999e-06 [merge_comm]: 3.40003e-06 [allreduce_fusion]: 3.21001e-06 [matmul_add_comm_reduction]: 1.37e-05 [allreduce_slice_to_reducescatter]: 1.07e-06 [virtual_shard_identity]: 8.39998e-06 [virtual_dataset]: 5.86e-06 [get_grad_eliminate_]: 5.51e-06 [virtual_output]: 5.91e-06 [merge_forward]: 4.82e-06 [cell_reuse_recompute_pass]: 4.83001e-06 [offload_activation]: 1.181e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.373e-05 [merge_recompute_call_nodes]: 1.52999e-06 [before_grad]: 1.081e-05 [set_forward_comm_id_for_comm_node_pass]: 3.45e-06 [meta_fg_expand]: 3.43e-06 [flash_sp_send_recv_attached]: 1.89e-06 [receive_attached]: 3.04999e-06 [after_resolve]: 1.468e-05 [a_after_grad]: 8.74e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 3.75e-06 [auto_monad_grad]: 3.51001e-06 [auto_monad_eliminator]: 2.081e-05 [cse]: 4.982e-05 [a_3]: 3.184e-05 [py_interpret_to_execute_after_opt_a]: 1.549e-05 [slice_cell_reuse_recomputed_activation]: 2.03997e-06 [rewriter_after_opt_a]: 2.675e-05 [convert_after_rewriter]: 1.25001e-06 [order_py_execute_after_rewriter]: 1.42999e-06 [mutable_eliminate]: 0.00277295 [opt_b]: 0.00024111, [1] [Cycle 1]: 0.0002301, [7] [b_1]: 0.00011802 [b_2]: 7.8e-06 [updatestate_depend_eliminate]: 1.254e-05 [updatestate_assign_eliminate]: 3.5e-06 [updatestate_loads_eliminate]: 2.88998e-06 [renormalize]: 9.30013e-07 [cse]: 4.586e-05 [optimize_parallel_all_gather_comm]: 2.771e-05 [overlap_param_gather]: 2.46e-06 [cconv]: 4.354e-05 [loop_unroll]: 0.00061751 [opt_after_cconv]: 0.00011575, [1] [Cycle 1]: 0.00010769, [7] [c_1]: 2.627e-05 [parameter_eliminate]: 6.59999e-06 [updatestate_depend_eliminate]: 7.06001e-06 [updatestate_assign_eliminate]: 2.71999e-06 [updatestate_loads_eliminate]: 2.26998e-06 [cse]: 2.797e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 5.068e-05 [tuple_transform]: 7.765e-05, [1] [Cycle 1]: 7.284e-05, [4] [d_1]: 4.426e-05 [none_parameter_eliminate]: 1.62001e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 6.39999e-06 [partial_unused_args_eliminate]: 1.72001e-06 [add_recomputation]: 0.00010192 [cse_after_recomputation]: 2.712e-05, [1] [Cycle 1]: 2.191e-05, [1] [cse]: 1.561e-05 [environ_conv]: 1.318e-05 [swap_dp_allreduce_reducescatter]: 6.21e-06 [bias_add_comm_swap]: 3.85e-06 [label_micro_interleaved_index]: 6.48998e-06 [label_fine_grained_interleaved_index]: 2.68998e-06 [merge_cast_opt]: 1.55001e-06 [slice_recompute_activation]: 2.24999e-06 [micro_interleaved_order_control]: 3.3e-06 [assign_add_opt]: 1.50999e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.24e-06 [full_micro_interleaved_order_control]: 2.49999e-06 [reorder_send_recv_between_fp_bp]: 2.97002e-06 [comm_op_add_attrs]: 1.09e-06 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.67001e-06 [interleave_parallel_branches]: 1.32e-06 [overlap_opt_shard_in_pipeline]: 1.74e-06 [overlap_opt_shard_grad_in_pipeline]: 1.98002e-06 [control_data_broadcast_order]: 1.518e-05 [grouped_pairwise_exchange_alltoall]: 1.96e-06 [offloading_packed_experts]: 4.38999e-06 [overlap_recompute_and_grad_model_parallel]: 5.20001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.30001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.75001e-06 [overlap_recompute_comm]: 2.73e-06 [overlap_grad_ring_attention]: 4.46002e-06 [overlap_grad_flash_sp]: 2.444e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 2.23998e-06 [split_layernorm_comm]: 1.72001e-06 [handle_group_info]: 1.02998e-06 [symbol_engine_optimizer]: 9.137e-05, [1] [Cycle 1]: 8.675e-05, [6] [build]: 1.413e-05 [elim_shapecalc]: 1.167e-05 [elim_not_effective]: 1.262e-05 [opt_reshape]: 6.51999e-06 [fold_const_symbol]: 9.97001e-06 [renormalize]: 2.00002e-07 [detach_backward]: 2.44999e-06 [pipeline_parallel_scheduler]: 1.79e-06 [auto_monad_reorder]: 1.885e-05 [get_jit_bprop_graph]: 2.67001e-06 [rewriter_after_jit_bprop_graph]: 6.11998e-06 [opt_after_jit_grad]: 0.00057487 [validate]: 5.862e-05 [backend_pass]: 9.80013e-07 [task_emit]: 0.0798548 [execute]: 1.126e-05 Sums bootstrap : 0.000699s : 0.54% type_inference : 0.032819s : 25.28% event_method : 0.000028s : 0.02% auto_monad : 0.000089s : 0.07% graph_reusing : 0.000007s : 0.01% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000048s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.01% optimize.rewriter_before_opt_a : 0.000296s : 0.23% optimize.opt_a.expand_dump_flag : 0.000009s : 0.01% optimize.opt_a.switch_simplify : 0.000105s : 0.08% optimize.opt_a.loop_unroll : 0.000040s : 0.03% optimize.opt_a.a_1 : 0.000783s : 0.60% optimize.opt_a.with_stream_mark : 0.000062s : 0.05% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000139s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.01% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.01% optimize.opt_a.merge_send_recv : 0.000021s : 0.02% optimize.opt_a.auto_parallel : 0.000018s : 0.01% optimize.opt_a.parallel : 0.000036s : 0.03% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.01% optimize.opt_a.virtual_dataset : 0.000012s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.01% optimize.opt_a.virtual_output : 0.000011s : 0.01% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000007s : 0.01% optimize.opt_a.offload_activation : 0.000023s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000025s : 0.02% optimize.opt_a.a_after_grad : 0.000018s : 0.01% optimize.opt_a.renormalize : 0.001485s : 1.14% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.01% optimize.opt_a.auto_monad_grad : 0.000006s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000040s : 0.03% optimize.opt_a.cse : 0.000091s : 0.07% optimize.opt_a.a_3 : 0.007867s : 6.06% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000027s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.002773s : 2.14% optimize.opt_b.b_1 : 0.000118s : 0.09% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000046s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000044s : 0.03% optimize.loop_unroll : 0.000618s : 0.48% optimize.opt_after_cconv.c_1 : 0.000026s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000028s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000051s : 0.04% optimize.tuple_transform.d_1 : 0.000044s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000102s : 0.08% optimize.cse_after_recomputation.cse : 0.000016s : 0.01% optimize.environ_conv : 0.000013s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000004s : 0.00% optimize.label_micro_interleaved_index : 0.000006s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000015s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000024s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000014s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.01% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.00% opt_after_jit_grad : 0.000575s : 0.44% validate : 0.000059s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.079855s : 61.51% execute : 0.000011s : 0.01% Time group info: ------[substitution.] 0.000236 26 0.89% : 0.000002s : 2: substitution.elim_not_effective 0.82% : 0.000002s : 2: substitution.fold_const_symbol 2.77% : 0.000007s : 3: substitution.graph_param_transform 78.78% : 0.000186s : 6: substitution.inline 2.39% : 0.000006s : 4: substitution.j_node_and_user_rematch 2.93% : 0.000007s : 4: substitution.remove_not_recompute_node 2.58% : 0.000006s : 2: substitution.replace_old_param 3.66% : 0.000009s : 1: substitution.switch_simplify 5.18% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.032730 2 93.66% : 0.030656s : 1: type_inference.infer 6.34% : 0.002074s : 1: type_inference.specialize ------[replace.] 0.000094 9 55.51% : 0.000052s : 6: replace.inline 26.64% : 0.000025s : 1: replace.switch_simplify 17.86% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000201 9 90.56% : 0.000182s : 6: match.inline 3.92% : 0.000008s : 1: match.switch_simplify 5.52% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000222 1092 0.89% : 0.000002s : 12: predicate.accumulaten_eliminater 0.95% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 6: predicate.addn_check_dump 1.17% : 0.000003s : 12: predicate.addn_zero_filter 0.74% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.30% : 0.000005s : 18: predicate.arithmetic_simplify 1.17% : 0.000003s : 12: predicate.cast_eliminate 3.00% : 0.000007s : 6: predicate.check_bprop_eliminate 0.41% : 0.000001s : 6: predicate.compare_switch_simplify 0.12% : 0.000000s : 3: predicate.const_output_eliminate 0.63% : 0.000001s : 6: predicate.depend_value_elim 0.91% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.41% : 0.000003s : 12: predicate.dict_get_item_eliminator 1.00% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.99% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 3: predicate.elim_not_effective 0.34% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 15: predicate.environ_add_const_eliminate 1.00% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.04% : 0.000002s : 15: predicate.environ_get_depend_swap 1.44% : 0.000003s : 21: predicate.environ_get_eliminate 1.09% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.30% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.72% : 0.000006s : 20: predicate.float_depend_g_call 0.45% : 0.000001s : 6: predicate.float_environ_get_switch 0.58% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.12% : 0.000000s : 3: predicate.fold_const_symbol 0.55% : 0.000001s : 6: predicate.get_grad_eliminate 0.42% : 0.000001s : 3: predicate.graph_param_transform 0.42% : 0.000001s : 6: predicate.incorporate_call 0.35% : 0.000001s : 6: predicate.incorporate_call_switch 5.37% : 0.000012s : 50: predicate.inline 0.74% : 0.000002s : 6: predicate.inline_without_move 0.21% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.13% : 0.000002s : 6: predicate.less_batch_normalization 1.62% : 0.000004s : 20: predicate.list_to_tuple_eliminator_ 2.19% : 0.000005s : 32: predicate.load_eliminater 0.99% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.48% : 0.000006s : 37: predicate.loop_unroll_before_grad 1.69% : 0.000004s : 18: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 6: predicate.merge_addn 0.53% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.59% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 12: predicate.minmaximum_grad 2.35% : 0.000005s : 3: predicate.mutable_eliminate 0.27% : 0.000001s : 3: predicate.opt_reshape 0.31% : 0.000001s : 3: predicate.parallel_virtual_node 1.99% : 0.000004s : 20: predicate.partial_defer_inline 1.12% : 0.000002s : 17: predicate.partial_eliminate 0.88% : 0.000002s : 12: predicate.print_const_string_wrapper 0.50% : 0.000001s : 6: predicate.reduce_all_const_elim 1.12% : 0.000002s : 12: predicate.reduce_eliminate 2.21% : 0.000005s : 32: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000001s : 6: predicate.remove_not_recompute_node 1.40% : 0.000003s : 20: predicate.replace_applicator 0.52% : 0.000001s : 6: predicate.replace_old_param 0.46% : 0.000001s : 3: predicate.reset_defer_inline 1.00% : 0.000002s : 12: predicate.reshape_eliminate 0.59% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.54% : 0.000001s : 3: predicate.row_tensor_eliminate 0.63% : 0.000001s : 6: predicate.same_eliminate 0.27% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.69% : 0.000002s : 6: predicate.shard_identity_eliminate 0.66% : 0.000001s : 6: predicate.special_op_eliminate 0.55% : 0.000001s : 6: predicate.specialize_transform 1.49% : 0.000003s : 6: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.24% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.60% : 0.000004s : 20: predicate.switch_defer_inline 1.83% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.43% : 0.000012s : 68: predicate.switch_simplify 0.81% : 0.000002s : 12: predicate.tile_eliminate 0.81% : 0.000002s : 12: predicate.transpose_eliminate 1.87% : 0.000004s : 18: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000004s : 18: predicate.tuple_list_get_item_const_eliminator 1.30% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.16% : 0.000007s : 26: predicate.tuple_list_get_item_eliminator 1.56% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.16% : 0.000005s : 24: predicate.tuple_list_set_item_eliminator 1.64% : 0.000004s : 20: predicate.tuple_to_list_eliminator_ 2.03% : 0.000005s : 32: predicate.updatestate_pure_node_eliminater 2.68% : 0.000006s : 38: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 3: predicate.value_based_eliminate 0.72% : 0.000002s : 6: predicate.virtual_dataset_eliminate 0.54% : 0.000001s : 6: predicate.virtual_output_eliminate 0.18% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001471 16 60.61% : 0.000891s : 8: func_graph_cloner_run.FuncGraphClonerGraph 39.39% : 0.000579s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.160079 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.00% : 0.004797s : 1: add_attr 2.99% : 0.004781s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.07% : 0.000107s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.06% : 0.000097s : 1: auto_monad 0.01% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.47% : 0.000757s : 1: bootstrap 0.03% : 0.000047s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000020s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000016s : 1: environ_conv 0.02% : 0.000034s : 1: event_method 0.01% : 0.000019s : 1: execute 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000010s : 1: label_micro_interleaved_index 0.39% : 0.000628s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 1.75% : 0.002797s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000036s : 1: opt.transform.mutable_eliminate 0.80% : 0.001279s : 78: opt.transform.opt_a 0.02% : 0.000025s : 1: opt.transform.opt_after_cconv 0.02% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000091s : 28: opt.transform.opt_b 0.03% : 0.000049s : 2: opt.transform.opt_trans_graph 0.02% : 0.000037s : 4: opt.transform.symbol_engine_opt 7.25% : 0.011613s : 1: opt_a 0.07% : 0.000120s : 1: opt_after_cconv 0.37% : 0.000587s : 1: opt_after_jit_grad 0.15% : 0.000245s : 1: opt_b 10.32% : 0.016525s : 1: optimize 0.02% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000052s : 1: pre_auto_parallel 0.01% : 0.000011s : 1: py_interpret_to_execute 0.01% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000055s : 1: remove_dup_value 0.56% : 0.000891s : 1: renormalize.infer 0.36% : 0.000583s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000010s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000031s : 1: rewriter_after_opt_a 0.19% : 0.000302s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000094s : 1: symbol_engine_optimizer 49.90% : 0.079883s : 1: task_emit 0.05% : 0.000081s : 1: tuple_transform 20.52% : 0.032853s : 1: type_inference 0.06% : 0.000097s : 1: validate TotalTime = 0.104244, [24] [bootstrap]: 0.0007839 [type_inference]: 0.0383751 [event_method]: 0.00012415 [auto_monad]: 0.00019828 [graph_reusing]: 1.235e-05 [inline]: 2.85998e-06 [add_attr]: 0.00423149, [1] [add_attr_with_inline]: 0.00421787, [1] [Cycle 1]: 8.958e-05, [2] [tag_attr]: 3.448e-05 [meta_addattr_fg_expand]: 8.23001e-06 [parallel-infer-symbol]: 3.58999e-06 [pre_auto_parallel]: 5.206e-05 [insert-virtual-dataset]: 2.88e-06 [parallel-infer-symbol-second]: 1.05001e-06 [dataset_repeat_opt]: 1.83002e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00813205, [53] [py_interpret_to_execute]: 8.90001e-06 [rewriter_before_opt_a]: 0.00031099 [opt_a]: 0.00497019, [2] [Cycle 1]: 0.00414359, [45] [expand_dump_flag]: 3.99002e-06 [switch_simplify]: 9.751e-05 [loop_unroll]: 4.022e-05 [a_1]: 0.00097093 [with_stream_mark]: 2.962e-05 [recompute_prepare]: 1.366e-05 [updatestate_depend_eliminate]: 5.89e-06 [updatestate_assign_eliminate]: 4.12e-06 [updatestate_loads_eliminate]: 3.76999e-06 [parameter_eliminate]: 2.46e-06 [a_2]: 0.00010373 [accelerated_algorithm]: 8.55999e-06 [shard]: 1.99e-06 [meta_shard_fg_expand]: 4.57998e-06 [shard_inline]: 6.81999e-06 [merge_send_recv]: 1.008e-05 [auto_parallel]: 9.77001e-06 [parallel]: 7.073e-05 [flash_sp]: 1.24e-05 [merge_comm]: 5.00001e-06 [allreduce_fusion]: 4.85999e-06 [matmul_add_comm_reduction]: 1.061e-05 [allreduce_slice_to_reducescatter]: 7.40023e-07 [virtual_shard_identity]: 1.168e-05 [virtual_dataset]: 8.52e-06 [get_grad_eliminate_]: 7.3e-06 [virtual_output]: 7.91001e-06 [merge_forward]: 4.94e-06 [cell_reuse_recompute_pass]: 1.71e-06 [offload_activation]: 1.144e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.88e-05 [merge_recompute_call_nodes]: 1.77001e-06 [before_grad]: 1.287e-05 [set_forward_comm_id_for_comm_node_pass]: 4.75999e-06 [meta_fg_expand]: 4.38001e-06 [flash_sp_send_recv_attached]: 2.39001e-06 [receive_attached]: 2.35002e-06 [after_resolve]: 1.232e-05 [a_after_grad]: 1.118e-05 [renormalize]: 0.00215436 [add_forward_monad_depend]: 1.048e-05 [auto_monad_grad]: 2.71999e-06 [auto_monad_eliminator]: 2.383e-05 [cse]: 4.636e-05 [a_3]: 7.047e-05 [Cycle 2]: 0.00081254, [45] [expand_dump_flag]: 2.31e-06 [switch_simplify]: 1.053e-05 [loop_unroll]: 7.88999e-06 [a_1]: 0.00019446 [with_stream_mark]: 1.995e-05 [recompute_prepare]: 8.24002e-06 [updatestate_depend_eliminate]: 4.79e-06 [updatestate_assign_eliminate]: 3.92998e-06 [updatestate_loads_eliminate]: 3.69002e-06 [parameter_eliminate]: 2.39001e-06 [a_2]: 9.349e-05 [accelerated_algorithm]: 7.5e-06 [shard]: 2.11998e-06 [meta_shard_fg_expand]: 2.48e-06 [shard_inline]: 6.48e-06 [merge_send_recv]: 9.86e-06 [auto_parallel]: 1.049e-05 [parallel]: 9.09e-06 [flash_sp]: 3.78999e-06 [merge_comm]: 4.1e-06 [allreduce_fusion]: 3.88999e-06 [matmul_add_comm_reduction]: 9.66e-06 [allreduce_slice_to_reducescatter]: 5.39992e-07 [virtual_shard_identity]: 9.02e-06 [virtual_dataset]: 7.63999e-06 [get_grad_eliminate_]: 6.66e-06 [virtual_output]: 7.4e-06 [merge_forward]: 4.94e-06 [cell_reuse_recompute_pass]: 3.56001e-06 [offload_activation]: 1.091e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.403e-05 [merge_recompute_call_nodes]: 1.66e-06 [before_grad]: 1.112e-05 [set_forward_comm_id_for_comm_node_pass]: 4.68999e-06 [meta_fg_expand]: 3.58999e-06 [flash_sp_send_recv_attached]: 1.59e-06 [receive_attached]: 1.92001e-06 [after_resolve]: 1.172e-05 [a_after_grad]: 1.073e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 2.41e-06 [auto_monad_grad]: 1.87001e-06 [auto_monad_eliminator]: 8.45999e-06 [cse]: 2.403e-05 [a_3]: 4.126e-05 [py_interpret_to_execute_after_opt_a]: 9.29998e-06 [slice_cell_reuse_recomputed_activation]: 2.02001e-06 [rewriter_after_opt_a]: 2.765e-05 [convert_after_rewriter]: 1.29e-06 [order_py_execute_after_rewriter]: 1.15001e-06 [mutable_eliminate]: 0.0009142 [opt_b]: 0.00029662, [1] [Cycle 1]: 0.00028773, [7] [b_1]: 0.00018328 [b_2]: 1.137e-05 [updatestate_depend_eliminate]: 1.052e-05 [updatestate_assign_eliminate]: 3.57997e-06 [updatestate_loads_eliminate]: 3.34001e-06 [renormalize]: 1.24e-06 [cse]: 3.642e-05 [optimize_parallel_all_gather_comm]: 2.052e-05 [overlap_param_gather]: 2.17999e-06 [cconv]: 3.369e-05 [loop_unroll]: 0.0005234 [opt_after_cconv]: 0.00012929, [1] [Cycle 1]: 0.0001226, [7] [c_1]: 3.553e-05 [parameter_eliminate]: 5.67001e-06 [updatestate_depend_eliminate]: 7.41001e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 3.16999e-06 [cse]: 3.028e-05 [renormalize]: 5.69999e-07 [remove_dup_value]: 5.095e-05 [tuple_transform]: 0.00013593, [1] [Cycle 1]: 0.00012836, [4] [d_1]: 9.395e-05 [none_parameter_eliminate]: 2.42001e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 1.022e-05 [partial_unused_args_eliminate]: 2.44999e-06 [add_recomputation]: 6.51e-05 [cse_after_recomputation]: 3.69e-05, [1] [Cycle 1]: 3.007e-05, [1] [cse]: 2.157e-05 [environ_conv]: 1.28e-05 [swap_dp_allreduce_reducescatter]: 7.07997e-06 [bias_add_comm_swap]: 2.81e-06 [label_micro_interleaved_index]: 5.29e-06 [label_fine_grained_interleaved_index]: 2.97002e-06 [merge_cast_opt]: 1.51998e-06 [slice_recompute_activation]: 2.02001e-06 [micro_interleaved_order_control]: 2.32001e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 1.20001e-06 [remove_cast_before_assign_add]: 9.29984e-07 [full_micro_interleaved_order_control]: 2.32999e-06 [reorder_send_recv_between_fp_bp]: 3.15998e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.64e-06 [interleave_parallel_branches]: 1.19e-06 [overlap_opt_shard_in_pipeline]: 1.09e-06 [overlap_opt_shard_grad_in_pipeline]: 2.61e-06 [control_data_broadcast_order]: 2.278e-05 [grouped_pairwise_exchange_alltoall]: 1.56998e-06 [offloading_packed_experts]: 5.76e-06 [overlap_recompute_and_grad_model_parallel]: 6.85002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.35999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39998e-06 [overlap_recompute_comm]: 2.36e-06 [overlap_grad_ring_attention]: 6.09999e-06 [overlap_grad_flash_sp]: 2.952e-05 [begin_end_overlap_inline]: 5.29981e-07 [split_matmul_comm_elemetwise]: 2.35002e-06 [split_layernorm_comm]: 1.80001e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 0.00011432, [1] [Cycle 1]: 0.00010798, [6] [build]: 1.344e-05 [elim_shapecalc]: 1.606e-05 [elim_not_effective]: 1.893e-05 [opt_reshape]: 9.76003e-06 [fold_const_symbol]: 1.401e-05 [renormalize]: 4.50003e-07 [detach_backward]: 3.04999e-06 [pipeline_parallel_scheduler]: 1.92001e-06 [auto_monad_reorder]: 2.566e-05 [get_jit_bprop_graph]: 2.71999e-06 [rewriter_after_jit_bprop_graph]: 7.43999e-06 [opt_after_jit_grad]: 0.00071616 [validate]: 5.929e-05 [backend_pass]: 1.14998e-06 [task_emit]: 0.0511567 [execute]: 1.116e-05 Sums bootstrap : 0.000784s : 0.79% type_inference : 0.038375s : 38.88% event_method : 0.000124s : 0.13% auto_monad : 0.000198s : 0.20% graph_reusing : 0.000012s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000034s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000052s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000009s : 0.01% optimize.rewriter_before_opt_a : 0.000311s : 0.32% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000108s : 0.11% optimize.opt_a.loop_unroll : 0.000048s : 0.05% optimize.opt_a.a_1 : 0.001165s : 1.18% optimize.opt_a.with_stream_mark : 0.000050s : 0.05% optimize.opt_a.recompute_prepare : 0.000022s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000197s : 0.20% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.02% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000020s : 0.02% optimize.opt_a.auto_parallel : 0.000020s : 0.02% optimize.opt_a.parallel : 0.000080s : 0.08% optimize.opt_a.flash_sp : 0.000016s : 0.02% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000009s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000021s : 0.02% optimize.opt_a.virtual_dataset : 0.000016s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.01% optimize.opt_a.virtual_output : 0.000015s : 0.02% optimize.opt_a.merge_forward : 0.000010s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.01% optimize.opt_a.offload_activation : 0.000022s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000024s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000008s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.02% optimize.opt_a.a_after_grad : 0.000022s : 0.02% optimize.opt_a.renormalize : 0.002154s : 2.18% optimize.opt_a.add_forward_monad_depend : 0.000013s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.03% optimize.opt_a.cse : 0.000070s : 0.07% optimize.opt_a.a_3 : 0.000112s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000009s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000914s : 0.93% optimize.opt_b.b_1 : 0.000183s : 0.19% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000036s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000034s : 0.03% optimize.loop_unroll : 0.000523s : 0.53% optimize.opt_after_cconv.c_1 : 0.000036s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000030s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000051s : 0.05% optimize.tuple_transform.d_1 : 0.000094s : 0.10% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000065s : 0.07% optimize.cse_after_recomputation.cse : 0.000022s : 0.02% optimize.environ_conv : 0.000013s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000007s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000023s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000006s : 0.01% optimize.overlap_grad_flash_sp : 0.000030s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000003s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000026s : 0.03% get_jit_bprop_graph : 0.000003s : 0.00% rewriter_after_jit_bprop_graph : 0.000007s : 0.01% opt_after_jit_grad : 0.000716s : 0.73% validate : 0.000059s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.051157s : 51.83% execute : 0.000011s : 0.01% Time group info: ------[substitution.] 0.000473 62 0.72% : 0.000003s : 3: substitution.elim_not_effective 1.94% : 0.000009s : 3: substitution.float_tuple_getitem_switch 0.50% : 0.000002s : 3: substitution.fold_const_symbol 1.93% : 0.000009s : 4: substitution.graph_param_transform 54.48% : 0.000258s : 8: substitution.inline 1.20% : 0.000006s : 6: substitution.j_node_and_user_rematch 9.42% : 0.000045s : 2: substitution.minmaximum_grad 1.51% : 0.000007s : 6: substitution.remove_not_recompute_node 1.17% : 0.000006s : 2: substitution.replace_old_param 1.84% : 0.000009s : 1: substitution.switch_simplify 4.48% : 0.000021s : 4: substitution.tuple_list_convert_item_index_to_positive 1.85% : 0.000009s : 4: substitution.tuple_list_get_item_const_eliminator 2.72% : 0.000013s : 4: substitution.tuple_list_get_item_depend_reorder 13.18% : 0.000062s : 8: substitution.tuple_list_get_item_eliminator 3.06% : 0.000014s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.038282 2 93.82% : 0.035916s : 1: type_inference.infer 6.18% : 0.002367s : 1: type_inference.specialize ------[replace.] 0.000124 11 58.05% : 0.000072s : 8: replace.inline 19.50% : 0.000024s : 1: replace.switch_simplify 22.45% : 0.000028s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000266 11 95.08% : 0.000253s : 8: match.inline 2.98% : 0.000008s : 1: match.switch_simplify 1.94% : 0.000005s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000286 1438 1.06% : 0.000003s : 16: predicate.accumulaten_eliminater 1.07% : 0.000003s : 4: predicate.ad_related_special_op_eliminate 0.45% : 0.000001s : 8: predicate.addn_check_dump 1.01% : 0.000003s : 16: predicate.addn_zero_filter 0.78% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.28% : 0.000007s : 24: predicate.arithmetic_simplify 1.21% : 0.000003s : 16: predicate.cast_eliminate 0.59% : 0.000002s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.12% : 0.000000s : 4: predicate.const_output_eliminate 0.42% : 0.000001s : 8: predicate.depend_value_elim 0.94% : 0.000003s : 16: predicate.dict_get_item_const_eliminator 1.17% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.82% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.81% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 4: predicate.elim_not_effective 0.41% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.47% : 0.000004s : 20: predicate.environ_add_const_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.05% : 0.000003s : 20: predicate.environ_get_depend_swap 1.73% : 0.000005s : 28: predicate.environ_get_eliminate 1.06% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.24% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.50% : 0.000007s : 26: predicate.float_depend_g_call 0.53% : 0.000002s : 8: predicate.float_environ_get_switch 0.87% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 4: predicate.fold_const_symbol 0.66% : 0.000002s : 8: predicate.get_grad_eliminate 0.21% : 0.000001s : 4: predicate.graph_param_transform 0.42% : 0.000001s : 8: predicate.incorporate_call 0.35% : 0.000001s : 8: predicate.incorporate_call_switch 5.13% : 0.000015s : 66: predicate.inline 0.53% : 0.000002s : 8: predicate.inline_without_move 0.22% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.00% : 0.000003s : 8: predicate.less_batch_normalization 1.54% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.53% : 0.000007s : 42: predicate.load_eliminater 0.80% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.36% : 0.000007s : 46: predicate.loop_unroll_before_grad 1.93% : 0.000006s : 24: predicate.make_slice_get_slice_eliminator 0.41% : 0.000001s : 8: predicate.merge_addn 0.51% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000002s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 16: predicate.minmaximum_grad 1.50% : 0.000004s : 4: predicate.mutable_eliminate 0.33% : 0.000001s : 4: predicate.opt_reshape 0.44% : 0.000001s : 4: predicate.parallel_virtual_node 2.28% : 0.000007s : 26: predicate.partial_defer_inline 1.20% : 0.000003s : 22: predicate.partial_eliminate 0.88% : 0.000003s : 16: predicate.print_const_string_wrapper 0.70% : 0.000002s : 8: predicate.reduce_all_const_elim 1.42% : 0.000004s : 16: predicate.reduce_eliminate 2.03% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 8: predicate.remove_not_recompute_node 1.03% : 0.000003s : 26: predicate.replace_applicator 0.36% : 0.000001s : 8: predicate.replace_old_param 0.21% : 0.000001s : 4: predicate.reset_defer_inline 0.89% : 0.000003s : 16: predicate.reshape_eliminate 0.71% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.48% : 0.000001s : 4: predicate.row_tensor_eliminate 0.64% : 0.000002s : 8: predicate.same_eliminate 0.37% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.81% : 0.000002s : 8: predicate.shard_identity_eliminate 0.71% : 0.000002s : 8: predicate.special_op_eliminate 0.57% : 0.000002s : 8: predicate.specialize_transform 1.02% : 0.000003s : 8: predicate.split_environ_get_set_with_tuple_value 0.81% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.23% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.59% : 0.000005s : 26: predicate.switch_defer_inline 2.02% : 0.000006s : 34: predicate.switch_layer_defer_inline 5.44% : 0.000016s : 86: predicate.switch_simplify 0.83% : 0.000002s : 16: predicate.tile_eliminate 0.93% : 0.000003s : 16: predicate.transpose_eliminate 1.65% : 0.000005s : 24: predicate.tuple_list_convert_item_index_to_positive 2.14% : 0.000006s : 24: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000004s : 24: predicate.tuple_list_get_item_depend_reorder 4.61% : 0.000013s : 34: predicate.tuple_list_get_item_eliminator 1.82% : 0.000005s : 24: predicate.tuple_list_get_set_item_eliminator 2.38% : 0.000007s : 32: predicate.tuple_list_set_item_eliminator 1.47% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.49% : 0.000007s : 42: predicate.updatestate_pure_node_eliminater 3.15% : 0.000009s : 50: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.85% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.75% : 0.000002s : 8: predicate.virtual_output_eliminate 0.20% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.35% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002019 23 53.82% : 0.001086s : 11: func_graph_cloner_run.FuncGraphClonerGraph 46.18% : 0.000932s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.120791 196 0.00% : 0.000006s : 1: ForceFp32Comm 3.51% : 0.004240s : 1: add_attr 3.50% : 0.004223s : 1: add_attr_with_inline 0.05% : 0.000065s : 1: add_comm_op_reuse_tag 0.06% : 0.000070s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.17% : 0.000208s : 1: auto_monad 0.02% : 0.000030s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.71% : 0.000862s : 1: bootstrap 0.03% : 0.000038s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000028s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000041s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000007s : 1: detach_backward 0.01% : 0.000016s : 1: environ_conv 0.11% : 0.000134s : 1: event_method 0.02% : 0.000020s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000017s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000009s : 1: label_micro_interleaved_index 0.44% : 0.000533s : 1: loop_unroll 0.00% : 0.000006s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.77% : 0.000927s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.02% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000026s : 1: opt.transform.mutable_eliminate 1.44% : 0.001741s : 78: opt.transform.opt_a 0.03% : 0.000034s : 1: opt.transform.opt_after_cconv 0.03% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.13% : 0.000158s : 28: opt.transform.opt_b 0.08% : 0.000101s : 2: opt.transform.opt_trans_graph 0.04% : 0.000053s : 4: opt.transform.symbol_engine_opt 4.12% : 0.004975s : 1: opt_a 0.11% : 0.000133s : 1: opt_after_cconv 0.60% : 0.000729s : 1: opt_after_jit_grad 0.25% : 0.000302s : 1: opt_b 6.74% : 0.008141s : 1: optimize 0.02% : 0.000024s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.03% : 0.000035s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000007s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000007s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000057s : 1: pre_auto_parallel 0.01% : 0.000013s : 1: py_interpret_to_execute 0.01% : 0.000013s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000058s : 1: remove_dup_value 0.99% : 0.001196s : 1: renormalize.infer 0.78% : 0.000943s : 1: renormalize.specialize 0.01% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000011s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000031s : 1: rewriter_after_opt_a 0.26% : 0.000318s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000010s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000118s : 1: symbol_engine_optimizer 42.38% : 0.051186s : 1: task_emit 0.12% : 0.000139s : 1: tuple_transform 31.79% : 0.038398s : 1: type_inference 0.08% : 0.000099s : 1: validate TotalTime = 0.100783, [24] [bootstrap]: 0.0009748 [type_inference]: 0.0358688 [event_method]: 0.00012095 [auto_monad]: 0.00020497 [graph_reusing]: 1.262e-05 [inline]: 3.10002e-06 [add_attr]: 0.0045302, [1] [add_attr_with_inline]: 0.00449864, [1] [Cycle 1]: 8.311e-05, [2] [tag_attr]: 3.333e-05 [meta_addattr_fg_expand]: 7.66999e-06 [parallel-infer-symbol]: 3.10002e-06 [pre_auto_parallel]: 5.051e-05 [insert-virtual-dataset]: 3.33e-06 [parallel-infer-symbol-second]: 6.60017e-07 [dataset_repeat_opt]: 1.77999e-06 [pipeline_split]: 2.10002e-06 [optimize]: 0.00732988, [53] [py_interpret_to_execute]: 7.35e-06 [rewriter_before_opt_a]: 0.00029623 [opt_a]: 0.00437233, [2] [Cycle 1]: 0.00357121, [45] [expand_dump_flag]: 4.38999e-06 [switch_simplify]: 9.473e-05 [loop_unroll]: 3.941e-05 [a_1]: 0.00084566 [with_stream_mark]: 2.075e-05 [recompute_prepare]: 1.203e-05 [updatestate_depend_eliminate]: 5.02999e-06 [updatestate_assign_eliminate]: 4.38001e-06 [updatestate_loads_eliminate]: 4.05e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 9.911e-05 [accelerated_algorithm]: 8.28001e-06 [shard]: 1.96003e-06 [meta_shard_fg_expand]: 2.69999e-06 [shard_inline]: 7.13e-06 [merge_send_recv]: 9.69e-06 [auto_parallel]: 8.32998e-06 [parallel]: 2.015e-05 [flash_sp]: 9.71e-06 [merge_comm]: 4.35e-06 [allreduce_fusion]: 4.12e-06 [matmul_add_comm_reduction]: 1.065e-05 [allreduce_slice_to_reducescatter]: 6.29982e-07 [virtual_shard_identity]: 8.93002e-06 [virtual_dataset]: 7.1e-06 [get_grad_eliminate_]: 6.54999e-06 [virtual_output]: 7.20003e-06 [merge_forward]: 4.92e-06 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 1.169e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.375e-05 [merge_recompute_call_nodes]: 1.72999e-06 [before_grad]: 1.161e-05 [set_forward_comm_id_for_comm_node_pass]: 4.35999e-06 [meta_fg_expand]: 3.62002e-06 [flash_sp_send_recv_attached]: 2.54001e-06 [receive_attached]: 2.55997e-06 [after_resolve]: 1.165e-05 [a_after_grad]: 1.018e-05 [renormalize]: 0.00185502 [add_forward_monad_depend]: 7.37002e-06 [auto_monad_grad]: 3.06001e-06 [auto_monad_eliminator]: 2.075e-05 [cse]: 4.092e-05 [a_3]: 5.812e-05 [Cycle 2]: 0.00078781, [45] [expand_dump_flag]: 1.95001e-06 [switch_simplify]: 9.82999e-06 [loop_unroll]: 7.02002e-06 [a_1]: 0.00017192 [with_stream_mark]: 1.766e-05 [recompute_prepare]: 8.35001e-06 [updatestate_depend_eliminate]: 4.68999e-06 [updatestate_assign_eliminate]: 3.32002e-06 [updatestate_loads_eliminate]: 3.92998e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 9.035e-05 [accelerated_algorithm]: 6.91999e-06 [shard]: 1.49e-06 [meta_shard_fg_expand]: 2.09999e-06 [shard_inline]: 7.15e-06 [merge_send_recv]: 8.30999e-06 [auto_parallel]: 9.42999e-06 [parallel]: 7.15e-06 [flash_sp]: 3.19001e-06 [merge_comm]: 4.67998e-06 [allreduce_fusion]: 4.02e-06 [matmul_add_comm_reduction]: 9.07001e-06 [allreduce_slice_to_reducescatter]: 6.69999e-07 [virtual_shard_identity]: 1.008e-05 [virtual_dataset]: 6.98e-06 [get_grad_eliminate_]: 6.29999e-06 [virtual_output]: 6.49999e-06 [merge_forward]: 6.04001e-06 [cell_reuse_recompute_pass]: 2.16e-06 [offload_activation]: 1.01e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.516e-05 [merge_recompute_call_nodes]: 1.67999e-06 [before_grad]: 1.136e-05 [set_forward_comm_id_for_comm_node_pass]: 4.94998e-06 [meta_fg_expand]: 3.49001e-06 [flash_sp_send_recv_attached]: 1.32999e-06 [receive_attached]: 2.04999e-06 [after_resolve]: 1.225e-05 [a_after_grad]: 9.64999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 2.96999e-06 [auto_monad_grad]: 1.50999e-06 [auto_monad_eliminator]: 1.03e-05 [cse]: 2.689e-05 [a_3]: 4.185e-05 [py_interpret_to_execute_after_opt_a]: 8.1e-06 [slice_cell_reuse_recomputed_activation]: 1.81e-06 [rewriter_after_opt_a]: 2.768e-05 [convert_after_rewriter]: 1.18001e-06 [order_py_execute_after_rewriter]: 1.11002e-06 [mutable_eliminate]: 0.00079323 [opt_b]: 0.00030052, [1] [Cycle 1]: 0.00029312, [7] [b_1]: 0.00018339 [b_2]: 9.25999e-06 [updatestate_depend_eliminate]: 1.102e-05 [updatestate_assign_eliminate]: 4.10998e-06 [updatestate_loads_eliminate]: 4.32e-06 [renormalize]: 7.59988e-07 [cse]: 4.018e-05 [optimize_parallel_all_gather_comm]: 5.043e-05 [overlap_param_gather]: 2.24001e-06 [cconv]: 5.481e-05 [loop_unroll]: 0.00052728 [opt_after_cconv]: 0.00012571, [1] [Cycle 1]: 0.00011922, [7] [c_1]: 3.365e-05 [parameter_eliminate]: 5.37999e-06 [updatestate_depend_eliminate]: 8.70999e-06 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 3.04001e-06 [cse]: 2.998e-05 [renormalize]: 6.80011e-07 [remove_dup_value]: 1.926e-05 [tuple_transform]: 0.00010321, [1] [Cycle 1]: 9.873e-05, [4] [d_1]: 7.025e-05 [none_parameter_eliminate]: 1.79e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 8.69e-06 [partial_unused_args_eliminate]: 1.93002e-06 [add_recomputation]: 6.124e-05 [cse_after_recomputation]: 2.962e-05, [1] [Cycle 1]: 2.484e-05, [1] [cse]: 1.928e-05 [environ_conv]: 1.168e-05 [swap_dp_allreduce_reducescatter]: 6.36e-06 [bias_add_comm_swap]: 2.64001e-06 [label_micro_interleaved_index]: 4.61002e-06 [label_fine_grained_interleaved_index]: 2.37001e-06 [merge_cast_opt]: 1.71998e-06 [slice_recompute_activation]: 2.17999e-06 [micro_interleaved_order_control]: 2.27999e-06 [assign_add_opt]: 1.22999e-06 [ForceFp32Comm]: 1.14e-06 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 2.53998e-06 [comm_op_add_attrs]: 1.31998e-06 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.23002e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.00001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.93997e-06 [control_data_broadcast_order]: 1.743e-05 [grouped_pairwise_exchange_alltoall]: 1.62001e-06 [offloading_packed_experts]: 5.33002e-06 [overlap_recompute_and_grad_model_parallel]: 6.64001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40001e-06 [overlap_recompute_comm]: 2.70002e-06 [overlap_grad_ring_attention]: 5.47001e-06 [overlap_grad_flash_sp]: 2.57e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.43002e-06 [split_layernorm_comm]: 2.14e-06 [handle_group_info]: 1.18001e-06 [symbol_engine_optimizer]: 0.00010696, [1] [Cycle 1]: 0.00010077, [6] [build]: 1.191e-05 [elim_shapecalc]: 1.381e-05 [elim_not_effective]: 1.791e-05 [opt_reshape]: 1.057e-05 [fold_const_symbol]: 1.339e-05 [renormalize]: 1.80007e-07 [detach_backward]: 2.41998e-06 [pipeline_parallel_scheduler]: 1.39998e-06 [auto_monad_reorder]: 2.282e-05 [get_jit_bprop_graph]: 2.04e-06 [rewriter_after_jit_bprop_graph]: 6.12999e-06 [opt_after_jit_grad]: 0.00059498 [validate]: 5.348e-05 [backend_pass]: 9.10019e-07 [task_emit]: 0.0507019 [execute]: 9.86e-06 Sums bootstrap : 0.000975s : 1.03% type_inference : 0.035869s : 37.73% event_method : 0.000121s : 0.13% auto_monad : 0.000205s : 0.22% graph_reusing : 0.000013s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000033s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000051s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.01% optimize.rewriter_before_opt_a : 0.000296s : 0.31% optimize.opt_a.expand_dump_flag : 0.000006s : 0.01% optimize.opt_a.switch_simplify : 0.000105s : 0.11% optimize.opt_a.loop_unroll : 0.000046s : 0.05% optimize.opt_a.a_1 : 0.001018s : 1.07% optimize.opt_a.with_stream_mark : 0.000038s : 0.04% optimize.opt_a.recompute_prepare : 0.000020s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000189s : 0.20% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.01% optimize.opt_a.shard_inline : 0.000014s : 0.02% optimize.opt_a.merge_send_recv : 0.000018s : 0.02% optimize.opt_a.auto_parallel : 0.000018s : 0.02% optimize.opt_a.parallel : 0.000027s : 0.03% optimize.opt_a.flash_sp : 0.000013s : 0.01% optimize.opt_a.merge_comm : 0.000009s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000020s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000014s : 0.01% optimize.opt_a.merge_forward : 0.000011s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000022s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000029s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000009s : 0.01% optimize.opt_a.meta_fg_expand : 0.000007s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.03% optimize.opt_a.a_after_grad : 0.000020s : 0.02% optimize.opt_a.renormalize : 0.001855s : 1.95% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.01% optimize.opt_a.auto_monad_grad : 0.000005s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000031s : 0.03% optimize.opt_a.cse : 0.000068s : 0.07% optimize.opt_a.a_3 : 0.000100s : 0.11% optimize.py_interpret_to_execute_after_opt_a : 0.000008s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000793s : 0.83% optimize.opt_b.b_1 : 0.000183s : 0.19% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000011s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000040s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000050s : 0.05% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000055s : 0.06% optimize.loop_unroll : 0.000527s : 0.55% optimize.opt_after_cconv.c_1 : 0.000034s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000030s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000019s : 0.02% optimize.tuple_transform.d_1 : 0.000070s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000061s : 0.06% optimize.cse_after_recomputation.cse : 0.000019s : 0.02% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000026s : 0.03% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000023s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000595s : 0.63% validate : 0.000053s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.050702s : 53.34% execute : 0.000010s : 0.01% Time group info: ------[substitution.] 0.000384 62 0.68% : 0.000003s : 3: substitution.elim_not_effective 2.28% : 0.000009s : 3: substitution.float_tuple_getitem_switch 0.60% : 0.000002s : 3: substitution.fold_const_symbol 1.73% : 0.000007s : 4: substitution.graph_param_transform 55.85% : 0.000215s : 8: substitution.inline 1.23% : 0.000005s : 6: substitution.j_node_and_user_rematch 7.46% : 0.000029s : 2: substitution.minmaximum_grad 1.66% : 0.000006s : 6: substitution.remove_not_recompute_node 1.45% : 0.000006s : 2: substitution.replace_old_param 2.44% : 0.000009s : 1: substitution.switch_simplify 4.92% : 0.000019s : 4: substitution.tuple_list_convert_item_index_to_positive 2.15% : 0.000008s : 4: substitution.tuple_list_get_item_const_eliminator 3.35% : 0.000013s : 4: substitution.tuple_list_get_item_depend_reorder 10.95% : 0.000042s : 8: substitution.tuple_list_get_item_eliminator 3.24% : 0.000012s : 4: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.035750 2 93.39% : 0.033387s : 1: type_inference.infer 6.61% : 0.002364s : 1: type_inference.specialize ------[replace.] 0.000106 11 58.83% : 0.000062s : 8: replace.inline 21.25% : 0.000023s : 1: replace.switch_simplify 19.92% : 0.000021s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000223 11 94.17% : 0.000210s : 8: match.inline 3.60% : 0.000008s : 1: match.switch_simplify 2.24% : 0.000005s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000251 1438 1.04% : 0.000003s : 16: predicate.accumulaten_eliminater 0.99% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 1.09% : 0.000003s : 16: predicate.addn_zero_filter 0.83% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.01% : 0.000005s : 24: predicate.arithmetic_simplify 1.01% : 0.000003s : 16: predicate.cast_eliminate 0.48% : 0.000001s : 8: predicate.check_bprop_eliminate 0.52% : 0.000001s : 8: predicate.compare_switch_simplify 0.14% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000002s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.09% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.69% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 4: predicate.elim_not_effective 0.34% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.07% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.08% : 0.000003s : 20: predicate.environ_get_add_eliminate 1.03% : 0.000003s : 20: predicate.environ_get_depend_swap 1.74% : 0.000004s : 28: predicate.environ_get_eliminate 1.26% : 0.000003s : 20: predicate.environ_get_set_eliminate 1.50% : 0.000004s : 26: predicate.exchange_switch_depend_value 2.31% : 0.000006s : 26: predicate.float_depend_g_call 0.53% : 0.000001s : 8: predicate.float_environ_get_switch 0.72% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.54% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.47% : 0.000001s : 8: predicate.incorporate_call 0.38% : 0.000001s : 8: predicate.incorporate_call_switch 5.85% : 0.000015s : 66: predicate.inline 0.71% : 0.000002s : 8: predicate.inline_without_move 0.27% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.70% : 0.000002s : 8: predicate.less_batch_normalization 1.79% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.26% : 0.000006s : 42: predicate.load_eliminater 0.97% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.57% : 0.000006s : 46: predicate.loop_unroll_before_grad 2.05% : 0.000005s : 24: predicate.make_slice_get_slice_eliminator 0.52% : 0.000001s : 8: predicate.merge_addn 0.46% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.93% : 0.000002s : 16: predicate.minmaximum_grad 1.58% : 0.000004s : 4: predicate.mutable_eliminate 0.29% : 0.000001s : 4: predicate.opt_reshape 0.43% : 0.000001s : 4: predicate.parallel_virtual_node 2.27% : 0.000006s : 26: predicate.partial_defer_inline 1.30% : 0.000003s : 22: predicate.partial_eliminate 0.92% : 0.000002s : 16: predicate.print_const_string_wrapper 0.62% : 0.000002s : 8: predicate.reduce_all_const_elim 1.38% : 0.000003s : 16: predicate.reduce_eliminate 2.26% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.52% : 0.000001s : 8: predicate.remove_not_recompute_node 1.25% : 0.000003s : 26: predicate.replace_applicator 0.37% : 0.000001s : 8: predicate.replace_old_param 0.18% : 0.000000s : 4: predicate.reset_defer_inline 0.87% : 0.000002s : 16: predicate.reshape_eliminate 0.62% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.39% : 0.000001s : 4: predicate.row_tensor_eliminate 0.87% : 0.000002s : 8: predicate.same_eliminate 0.33% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.52% : 0.000001s : 8: predicate.special_op_eliminate 0.59% : 0.000001s : 8: predicate.specialize_transform 0.84% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.68% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.60% : 0.000004s : 26: predicate.switch_defer_inline 1.98% : 0.000005s : 34: predicate.switch_layer_defer_inline 5.76% : 0.000014s : 86: predicate.switch_simplify 0.97% : 0.000002s : 16: predicate.tile_eliminate 1.02% : 0.000003s : 16: predicate.transpose_eliminate 2.26% : 0.000006s : 24: predicate.tuple_list_convert_item_index_to_positive 2.12% : 0.000005s : 24: predicate.tuple_list_get_item_const_eliminator 1.82% : 0.000005s : 24: predicate.tuple_list_get_item_depend_reorder 3.44% : 0.000009s : 34: predicate.tuple_list_get_item_eliminator 1.96% : 0.000005s : 24: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000006s : 32: predicate.tuple_list_set_item_eliminator 1.59% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.19% : 0.000006s : 42: predicate.updatestate_pure_node_eliminater 2.72% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.53% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001978 23 57.06% : 0.001129s : 11: func_graph_cloner_run.FuncGraphClonerGraph 42.94% : 0.000850s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.116299 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.90% : 0.004538s : 1: add_attr 3.87% : 0.004502s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000065s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.18% : 0.000214s : 1: auto_monad 0.02% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000005s : 1: begin_end_overlap_inline 0.01% : 0.000007s : 1: bias_add_comm_swap 0.87% : 0.001010s : 1: bootstrap 0.05% : 0.000059s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000034s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.11% : 0.000131s : 1: event_method 0.02% : 0.000018s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.02% : 0.000018s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000006s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000006s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.46% : 0.000538s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.69% : 0.000808s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000029s : 1: opt.transform.mutable_eliminate 1.35% : 0.001567s : 78: opt.transform.opt_a 0.03% : 0.000032s : 1: opt.transform.opt_after_cconv 0.03% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000158s : 28: opt.transform.opt_b 0.07% : 0.000077s : 2: opt.transform.opt_trans_graph 0.04% : 0.000049s : 4: opt.transform.symbol_engine_opt 3.76% : 0.004377s : 1: opt_a 0.11% : 0.000129s : 1: opt_after_cconv 0.52% : 0.000607s : 1: opt_after_jit_grad 0.26% : 0.000306s : 1: opt_b 6.31% : 0.007336s : 1: optimize 0.05% : 0.000055s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.03% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.04% : 0.000045s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.05% : 0.000055s : 1: pre_auto_parallel 0.01% : 0.000012s : 1: py_interpret_to_execute 0.01% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000024s : 1: remove_dup_value 0.91% : 0.001058s : 1: renormalize.infer 0.67% : 0.000785s : 1: renormalize.specialize 0.01% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000031s : 1: rewriter_after_opt_a 0.26% : 0.000303s : 1: rewriter_before_opt_a 0.00% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000110s : 1: symbol_engine_optimizer 43.62% : 0.050730s : 1: task_emit 0.09% : 0.000107s : 1: tuple_transform 30.86% : 0.035894s : 1: type_inference 0.08% : 0.000090s : 1: validate [WARNING] CORE(87352,ffffbf434f30,python3.9):2026-01-29-17:52:05.800.921 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph6 TotalTime = 0.122421, [24] [bootstrap]: 0.00078107 [type_inference]: 0.0380827 [event_method]: 2.959e-05 [auto_monad]: 0.00010545 [graph_reusing]: 6.37001e-06 [inline]: 2.89999e-06 [add_attr]: 0.00590742, [1] [add_attr_with_inline]: 0.00589638, [1] [Cycle 1]: 6.965e-05, [2] [tag_attr]: 2.425e-05 [meta_addattr_fg_expand]: 6.54999e-06 [parallel-infer-symbol]: 4.42e-06 [pre_auto_parallel]: 4.283e-05 [insert-virtual-dataset]: 2.93e-06 [parallel-infer-symbol-second]: 7.30011e-07 [dataset_repeat_opt]: 1.83997e-06 [pipeline_split]: 1.55001e-06 [optimize]: 0.0062629, [53] [py_interpret_to_execute]: 7.56001e-06 [rewriter_before_opt_a]: 0.00028347 [opt_a]: 0.00402261, [2] [Cycle 1]: 0.00344641, [45] [expand_dump_flag]: 3.80998e-06 [switch_simplify]: 8.477e-05 [loop_unroll]: 3.253e-05 [a_1]: 0.00066063 [with_stream_mark]: 1.698e-05 [recompute_prepare]: 7.08e-06 [updatestate_depend_eliminate]: 3.9e-06 [updatestate_assign_eliminate]: 3.26001e-06 [updatestate_loads_eliminate]: 3.19001e-06 [parameter_eliminate]: 1.97001e-06 [a_2]: 6.914e-05 [accelerated_algorithm]: 6.02999e-06 [shard]: 1.60001e-06 [meta_shard_fg_expand]: 2.34999e-06 [shard_inline]: 5.83002e-06 [merge_send_recv]: 8.32998e-06 [auto_parallel]: 6.24001e-06 [parallel]: 1.818e-05 [flash_sp]: 8.48999e-06 [merge_comm]: 3.63999e-06 [allreduce_fusion]: 3.29001e-06 [matmul_add_comm_reduction]: 2.786e-05 [allreduce_slice_to_reducescatter]: 5.60016e-07 [virtual_shard_identity]: 7.53999e-06 [virtual_dataset]: 5.79999e-06 [get_grad_eliminate_]: 5.40999e-06 [virtual_output]: 5.46e-06 [merge_forward]: 3.87002e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 1.011e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.171e-05 [merge_recompute_call_nodes]: 1.54e-06 [before_grad]: 9.91e-06 [set_forward_comm_id_for_comm_node_pass]: 3.97e-06 [meta_fg_expand]: 3.14999e-06 [flash_sp_send_recv_attached]: 2.58e-06 [receive_attached]: 2.35002e-06 [after_resolve]: 9.30001e-06 [a_after_grad]: 8.47e-06 [renormalize]: 0.00203431 [add_forward_monad_depend]: 5.72001e-06 [auto_monad_grad]: 2.46e-06 [auto_monad_eliminator]: 1.65e-05 [cse]: 3.656e-05 [a_3]: 4.348e-05 [Cycle 2]: 0.00056619, [45] [expand_dump_flag]: 1.13001e-06 [switch_simplify]: 7.18e-06 [loop_unroll]: 5.74999e-06 [a_1]: 9.756e-05 [with_stream_mark]: 1.214e-05 [recompute_prepare]: 5.52001e-06 [updatestate_depend_eliminate]: 3.09001e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.83e-06 [parameter_eliminate]: 1.41998e-06 [a_2]: 6.094e-05 [accelerated_algorithm]: 5.61e-06 [shard]: 1.22999e-06 [meta_shard_fg_expand]: 1.54998e-06 [shard_inline]: 5.32001e-06 [merge_send_recv]: 5.15999e-06 [auto_parallel]: 5.56e-06 [parallel]: 6.04001e-06 [flash_sp]: 2.99999e-06 [merge_comm]: 3.23e-06 [allreduce_fusion]: 2.95998e-06 [matmul_add_comm_reduction]: 4.95001e-06 [allreduce_slice_to_reducescatter]: 4.80009e-07 [virtual_shard_identity]: 6.23998e-06 [virtual_dataset]: 5.18002e-06 [get_grad_eliminate_]: 5.10001e-06 [virtual_output]: 4.83001e-06 [merge_forward]: 3.31999e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 6.56e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.282e-05 [merge_recompute_call_nodes]: 7.60017e-07 [before_grad]: 9.07001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.13998e-06 [meta_fg_expand]: 1.82001e-06 [flash_sp_send_recv_attached]: 1.30001e-06 [receive_attached]: 1.24e-06 [after_resolve]: 7.94002e-06 [a_after_grad]: 7.80998e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.22999e-06 [auto_monad_grad]: 1.07998e-06 [auto_monad_eliminator]: 6.64001e-06 [cse]: 1.602e-05 [a_3]: 3.113e-05 [py_interpret_to_execute_after_opt_a]: 5.41998e-06 [slice_cell_reuse_recomputed_activation]: 1.81998e-06 [rewriter_after_opt_a]: 1.755e-05 [convert_after_rewriter]: 1.20999e-06 [order_py_execute_after_rewriter]: 1.12e-06 [mutable_eliminate]: 0.00057459 [opt_b]: 0.00018194, [1] [Cycle 1]: 0.00017529, [7] [b_1]: 0.00010474 [b_2]: 7.20998e-06 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.39001e-06 [renormalize]: 9.39996e-07 [cse]: 2.028e-05 [optimize_parallel_all_gather_comm]: 1.576e-05 [overlap_param_gather]: 2.33002e-06 [cconv]: 2.506e-05 [loop_unroll]: 0.00043894 [opt_after_cconv]: 9.351e-05, [1] [Cycle 1]: 8.819e-05, [7] [c_1]: 2.398e-05 [parameter_eliminate]: 2.64001e-06 [updatestate_depend_eliminate]: 5.36998e-06 [updatestate_assign_eliminate]: 2.45002e-06 [updatestate_loads_eliminate]: 2.27001e-06 [cse]: 1.888e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 1.635e-05 [tuple_transform]: 6.483e-05, [1] [Cycle 1]: 6.076e-05, [4] [d_1]: 3.6e-05 [none_parameter_eliminate]: 1.65001e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 5.94e-06 [partial_unused_args_eliminate]: 2.02999e-06 [add_recomputation]: 7.034e-05 [cse_after_recomputation]: 2.407e-05, [1] [Cycle 1]: 1.984e-05, [1] [cse]: 1.471e-05 [environ_conv]: 9.56e-06 [swap_dp_allreduce_reducescatter]: 5.62001e-06 [bias_add_comm_swap]: 2.66999e-06 [label_micro_interleaved_index]: 4.33001e-06 [label_fine_grained_interleaved_index]: 2.48e-06 [merge_cast_opt]: 1.50001e-06 [slice_recompute_activation]: 1.92999e-06 [micro_interleaved_order_control]: 2.04e-06 [assign_add_opt]: 1.52999e-06 [ForceFp32Comm]: 9.00007e-07 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.64001e-06 [reorder_send_recv_between_fp_bp]: 2.64001e-06 [comm_op_add_attrs]: 1.13001e-06 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.47001e-06 [interleave_parallel_branches]: 1.02998e-06 [overlap_opt_shard_in_pipeline]: 9.5999e-07 [overlap_opt_shard_grad_in_pipeline]: 1.71002e-06 [control_data_broadcast_order]: 1.307e-05 [grouped_pairwise_exchange_alltoall]: 1.49e-06 [offloading_packed_experts]: 3.50998e-06 [overlap_recompute_and_grad_model_parallel]: 4.44002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.09e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.31998e-06 [overlap_grad_ring_attention]: 4.03999e-06 [overlap_grad_flash_sp]: 1.828e-05 [begin_end_overlap_inline]: 7.7e-07 [split_matmul_comm_elemetwise]: 2.34999e-06 [split_layernorm_comm]: 1.58002e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 7.896e-05, [1] [Cycle 1]: 7.49e-05, [6] [build]: 1.078e-05 [elim_shapecalc]: 9.60001e-06 [elim_not_effective]: 1.204e-05 [opt_reshape]: 6.39001e-06 [fold_const_symbol]: 8.99e-06 [renormalize]: 2.00002e-07 [detach_backward]: 1.92999e-06 [pipeline_parallel_scheduler]: 1.96e-06 [auto_monad_reorder]: 1.73e-05 [get_jit_bprop_graph]: 1.91e-06 [rewriter_after_jit_bprop_graph]: 3.75e-06 [opt_after_jit_grad]: 0.00045989 [validate]: 7.832e-05 [backend_pass]: 9.60019e-07 [task_emit]: 0.0703101 [execute]: 9.86003e-06 Sums bootstrap : 0.000781s : 0.68% type_inference : 0.038083s : 32.99% event_method : 0.000030s : 0.03% auto_monad : 0.000105s : 0.09% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000043s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000008s : 0.01% optimize.rewriter_before_opt_a : 0.000283s : 0.25% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000092s : 0.08% optimize.opt_a.loop_unroll : 0.000038s : 0.03% optimize.opt_a.a_1 : 0.000758s : 0.66% optimize.opt_a.with_stream_mark : 0.000029s : 0.03% optimize.opt_a.recompute_prepare : 0.000013s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000130s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.01% optimize.opt_a.merge_send_recv : 0.000013s : 0.01% optimize.opt_a.auto_parallel : 0.000012s : 0.01% optimize.opt_a.parallel : 0.000024s : 0.02% optimize.opt_a.flash_sp : 0.000011s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000033s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.01% optimize.opt_a.virtual_dataset : 0.000011s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.01% optimize.opt_a.virtual_output : 0.000010s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000017s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000017s : 0.01% optimize.opt_a.a_after_grad : 0.000016s : 0.01% optimize.opt_a.renormalize : 0.002034s : 1.76% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.02% optimize.opt_a.cse : 0.000053s : 0.05% optimize.opt_a.a_3 : 0.000075s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000018s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000575s : 0.50% optimize.opt_b.b_1 : 0.000105s : 0.09% optimize.opt_b.b_2 : 0.000007s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000020s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000025s : 0.02% optimize.loop_unroll : 0.000439s : 0.38% optimize.opt_after_cconv.c_1 : 0.000024s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.01% optimize.tuple_transform.d_1 : 0.000036s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000070s : 0.06% optimize.cse_after_recomputation.cse : 0.000015s : 0.01% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000018s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000460s : 0.40% validate : 0.000078s : 0.07% backend_pass : 0.000001s : 0.00% task_emit : 0.070310s : 60.91% execute : 0.000010s : 0.01% Time group info: ------[substitution.] 0.000252 26 0.88% : 0.000002s : 2: substitution.elim_not_effective 0.52% : 0.000001s : 2: substitution.fold_const_symbol 2.35% : 0.000006s : 3: substitution.graph_param_transform 84.25% : 0.000212s : 6: substitution.inline 1.61% : 0.000004s : 4: substitution.j_node_and_user_rematch 1.84% : 0.000005s : 4: substitution.remove_not_recompute_node 1.32% : 0.000003s : 2: substitution.replace_old_param 2.85% : 0.000007s : 1: substitution.switch_simplify 4.38% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.037943 2 85.35% : 0.032384s : 1: type_inference.infer 14.65% : 0.005560s : 1: type_inference.specialize ------[replace.] 0.000087 9 56.65% : 0.000049s : 6: replace.inline 25.82% : 0.000022s : 1: replace.switch_simplify 17.54% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000225 9 92.73% : 0.000208s : 6: match.inline 2.88% : 0.000006s : 1: match.switch_simplify 4.38% : 0.000010s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000173 1092 0.94% : 0.000002s : 12: predicate.accumulaten_eliminater 0.80% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.45% : 0.000001s : 6: predicate.addn_check_dump 0.95% : 0.000002s : 12: predicate.addn_zero_filter 0.93% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.20% : 0.000004s : 18: predicate.arithmetic_simplify 1.09% : 0.000002s : 12: predicate.cast_eliminate 0.59% : 0.000001s : 6: predicate.check_bprop_eliminate 0.48% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.47% : 0.000001s : 6: predicate.depend_value_elim 0.95% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.16% : 0.000002s : 12: predicate.dict_get_item_eliminator 1.14% : 0.000002s : 12: predicate.dict_set_item_eliminator 1.06% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 3: predicate.elim_not_effective 0.38% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.15% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.10% : 0.000002s : 15: predicate.environ_get_depend_swap 1.64% : 0.000003s : 21: predicate.environ_get_eliminate 1.20% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.54% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.64% : 0.000005s : 20: predicate.float_depend_g_call 0.46% : 0.000001s : 6: predicate.float_environ_get_switch 0.67% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.59% : 0.000001s : 6: predicate.get_grad_eliminate 0.26% : 0.000000s : 3: predicate.graph_param_transform 0.51% : 0.000001s : 6: predicate.incorporate_call 0.43% : 0.000001s : 6: predicate.incorporate_call_switch 6.15% : 0.000011s : 50: predicate.inline 0.70% : 0.000001s : 6: predicate.inline_without_move 0.27% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.78% : 0.000001s : 6: predicate.less_batch_normalization 1.70% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.34% : 0.000004s : 32: predicate.load_eliminater 0.87% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.88% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.64% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 6: predicate.merge_addn 0.53% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.86% : 0.000001s : 12: predicate.minmaximum_grad 1.05% : 0.000002s : 3: predicate.mutable_eliminate 0.35% : 0.000001s : 3: predicate.opt_reshape 0.44% : 0.000001s : 3: predicate.parallel_virtual_node 2.06% : 0.000004s : 20: predicate.partial_defer_inline 1.41% : 0.000002s : 17: predicate.partial_eliminate 0.98% : 0.000002s : 12: predicate.print_const_string_wrapper 0.54% : 0.000001s : 6: predicate.reduce_all_const_elim 1.38% : 0.000002s : 12: predicate.reduce_eliminate 2.37% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000001s : 6: predicate.remove_not_recompute_node 1.30% : 0.000002s : 20: predicate.replace_applicator 0.51% : 0.000001s : 6: predicate.replace_old_param 0.24% : 0.000000s : 3: predicate.reset_defer_inline 1.05% : 0.000002s : 12: predicate.reshape_eliminate 0.56% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 3: predicate.row_tensor_eliminate 0.66% : 0.000001s : 6: predicate.same_eliminate 0.34% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.64% : 0.000001s : 6: predicate.shard_identity_eliminate 0.86% : 0.000001s : 6: predicate.special_op_eliminate 0.62% : 0.000001s : 6: predicate.specialize_transform 0.96% : 0.000002s : 6: predicate.split_environ_get_set_with_tuple_value 0.71% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.26% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.80% : 0.000003s : 20: predicate.switch_defer_inline 2.18% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.34% : 0.000011s : 68: predicate.switch_simplify 0.95% : 0.000002s : 12: predicate.tile_eliminate 0.90% : 0.000002s : 12: predicate.transpose_eliminate 1.65% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000002s : 18: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.50% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.69% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.28% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.93% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 3: predicate.value_based_eliminate 0.61% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.56% : 0.000001s : 6: predicate.virtual_output_eliminate 0.25% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.38% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002946 16 33.05% : 0.000974s : 8: func_graph_cloner_run.FuncGraphClonerGraph 66.95% : 0.001972s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.137881 196 0.00% : 0.000004s : 1: ForceFp32Comm 4.29% : 0.005913s : 1: add_attr 4.28% : 0.005901s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000075s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.08% : 0.000111s : 1: auto_monad 0.02% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.59% : 0.000813s : 1: bootstrap 0.02% : 0.000028s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.03% : 0.000037s : 1: event_method 0.01% : 0.000016s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.32% : 0.000447s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000019s : 1: micro_interleaved_order_control 0.42% : 0.000583s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.01% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000013s : 1: opt.transform.mutable_eliminate 0.85% : 0.001169s : 78: opt.transform.opt_a 0.02% : 0.000023s : 1: opt.transform.opt_after_cconv 0.02% : 0.000021s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000085s : 28: opt.transform.opt_b 0.03% : 0.000040s : 2: opt.transform.opt_trans_graph 0.02% : 0.000034s : 4: opt.transform.symbol_engine_opt 2.92% : 0.004026s : 1: opt_a 0.07% : 0.000097s : 1: opt_after_cconv 0.34% : 0.000469s : 1: opt_after_jit_grad 0.13% : 0.000186s : 1: opt_b 4.55% : 0.006268s : 1: optimize 0.01% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000022s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000047s : 1: pre_auto_parallel 0.01% : 0.000011s : 1: py_interpret_to_execute 0.01% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.94% : 0.001291s : 1: renormalize.infer 0.53% : 0.000735s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000021s : 1: rewriter_after_opt_a 0.21% : 0.000290s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000082s : 1: symbol_engine_optimizer 51.01% : 0.070337s : 1: task_emit 0.05% : 0.000068s : 1: tuple_transform 27.65% : 0.038125s : 1: type_inference 0.08% : 0.000111s : 1: validate [WARNING] CORE(87352,ffffbf434f30,python3.9):2026-01-29-17:52:06.368.869 [mindspore/core/ir/manager.cc:936] Replace] Cannot replace the return node of a func graph kernel_graph7 TotalTime = 0.0827653, [24] [bootstrap]: 0.0010282 [type_inference]: 0.0275046 [event_method]: 2.258e-05 [auto_monad]: 7.906e-05 [graph_reusing]: 7.03e-06 [inline]: 2.52001e-06 [add_attr]: 0.00325654, [1] [add_attr_with_inline]: 0.00324882, [1] [Cycle 1]: 5.676e-05, [2] [tag_attr]: 2.148e-05 [meta_addattr_fg_expand]: 6.59999e-06 [parallel-infer-symbol]: 2.79001e-06 [pre_auto_parallel]: 3.432e-05 [insert-virtual-dataset]: 2.26e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 1.94e-06 [pipeline_split]: 1.49e-06 [optimize]: 0.00495088, [53] [py_interpret_to_execute]: 4.55999e-06 [rewriter_before_opt_a]: 0.00024228 [opt_a]: 0.00287325, [2] [Cycle 1]: 0.0023101, [45] [expand_dump_flag]: 3.59002e-06 [switch_simplify]: 7.712e-05 [loop_unroll]: 3.203e-05 [a_1]: 0.00058911 [with_stream_mark]: 1.465e-05 [recompute_prepare]: 7e-06 [updatestate_depend_eliminate]: 3.68999e-06 [updatestate_assign_eliminate]: 3.94002e-06 [updatestate_loads_eliminate]: 3.14999e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 6.972e-05 [accelerated_algorithm]: 6.39001e-06 [shard]: 1.59998e-06 [meta_shard_fg_expand]: 1.89e-06 [shard_inline]: 6.08002e-06 [merge_send_recv]: 7.75998e-06 [auto_parallel]: 6.14001e-06 [parallel]: 2.039e-05 [flash_sp]: 7.30998e-06 [merge_comm]: 3.91999e-06 [allreduce_fusion]: 3.18e-06 [matmul_add_comm_reduction]: 8.74e-06 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 7.14001e-06 [virtual_dataset]: 5.75001e-06 [get_grad_eliminate_]: 5.64e-06 [virtual_output]: 5.51e-06 [merge_forward]: 3.86001e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 8.28999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.242e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 9.72999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.3e-06 [meta_fg_expand]: 3.36999e-06 [flash_sp_send_recv_attached]: 2.17999e-06 [receive_attached]: 1.92999e-06 [after_resolve]: 8.97e-06 [a_after_grad]: 8.28999e-06 [renormalize]: 0.00099667 [add_forward_monad_depend]: 5.14e-06 [auto_monad_grad]: 1.79e-06 [auto_monad_eliminator]: 1.419e-05 [cse]: 3.621e-05 [a_3]: 4.108e-05 [Cycle 2]: 0.00055364, [45] [expand_dump_flag]: 1.24e-06 [switch_simplify]: 6.91001e-06 [loop_unroll]: 5.87999e-06 [a_1]: 9.618e-05 [with_stream_mark]: 1.047e-05 [recompute_prepare]: 5.39e-06 [updatestate_depend_eliminate]: 2.95002e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.11e-06 [parameter_eliminate]: 1.12e-06 [a_2]: 6.068e-05 [accelerated_algorithm]: 5.57999e-06 [shard]: 1.01997e-06 [meta_shard_fg_expand]: 1.27e-06 [shard_inline]: 5.20999e-06 [merge_send_recv]: 4.78001e-06 [auto_parallel]: 5.15999e-06 [parallel]: 4.58999e-06 [flash_sp]: 3.06001e-06 [merge_comm]: 3.01001e-06 [allreduce_fusion]: 3.11001e-06 [matmul_add_comm_reduction]: 4.80001e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 6.29001e-06 [virtual_dataset]: 5.30001e-06 [get_grad_eliminate_]: 5.05999e-06 [virtual_output]: 4.88001e-06 [merge_forward]: 2.79999e-06 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 5.74e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.189e-05 [merge_recompute_call_nodes]: 8.70001e-07 [before_grad]: 8.12e-06 [set_forward_comm_id_for_comm_node_pass]: 3.31999e-06 [meta_fg_expand]: 1.94e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 1.08001e-06 [after_resolve]: 7.95e-06 [a_after_grad]: 8.17998e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.21997e-06 [auto_monad_grad]: 8.79983e-07 [auto_monad_eliminator]: 5.94999e-06 [cse]: 1.513e-05 [a_3]: 3.098e-05 [py_interpret_to_execute_after_opt_a]: 4.43999e-06 [slice_cell_reuse_recomputed_activation]: 1.76e-06 [rewriter_after_opt_a]: 1.577e-05 [convert_after_rewriter]: 1.44998e-06 [order_py_execute_after_rewriter]: 1.10999e-06 [mutable_eliminate]: 0.00046995 [opt_b]: 0.00020723, [1] [Cycle 1]: 0.00020043, [7] [b_1]: 0.00012372 [b_2]: 7.63001e-06 [updatestate_depend_eliminate]: 5.81e-06 [updatestate_assign_eliminate]: 3.06999e-06 [updatestate_loads_eliminate]: 2.64999e-06 [renormalize]: 4.69998e-07 [cse]: 2.115e-05 [optimize_parallel_all_gather_comm]: 1.564e-05 [overlap_param_gather]: 2.40002e-06 [cconv]: 2.364e-05 [loop_unroll]: 0.00041929 [opt_after_cconv]: 9.499e-05, [1] [Cycle 1]: 8.998e-05, [7] [c_1]: 2.476e-05 [parameter_eliminate]: 2.40002e-06 [updatestate_depend_eliminate]: 5.08002e-06 [updatestate_assign_eliminate]: 2.49001e-06 [updatestate_loads_eliminate]: 2.26998e-06 [cse]: 2.046e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 2.914e-05 [tuple_transform]: 6.579e-05, [1] [Cycle 1]: 6.1e-05, [4] [d_1]: 3.627e-05 [none_parameter_eliminate]: 1.67999e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 5.91e-06 [partial_unused_args_eliminate]: 2.16e-06 [add_recomputation]: 4.549e-05 [cse_after_recomputation]: 2.187e-05, [1] [Cycle 1]: 1.778e-05, [1] [cse]: 1.252e-05 [environ_conv]: 7.64002e-06 [swap_dp_allreduce_reducescatter]: 5.00999e-06 [bias_add_comm_swap]: 2.34999e-06 [label_micro_interleaved_index]: 4.28001e-06 [label_fine_grained_interleaved_index]: 2.88e-06 [merge_cast_opt]: 1.49e-06 [slice_recompute_activation]: 2.38998e-06 [micro_interleaved_order_control]: 2.49001e-06 [assign_add_opt]: 1.21997e-06 [ForceFp32Comm]: 7.30011e-07 [remove_cast_before_assign_add]: 9.99979e-07 [full_micro_interleaved_order_control]: 2.09e-06 [reorder_send_recv_between_fp_bp]: 2.71999e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 1.06002e-06 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.27e-06 [overlap_opt_shard_in_pipeline]: 1.22e-06 [overlap_opt_shard_grad_in_pipeline]: 1.74e-06 [control_data_broadcast_order]: 1.24e-05 [grouped_pairwise_exchange_alltoall]: 1.77001e-06 [offloading_packed_experts]: 3.49001e-06 [overlap_recompute_and_grad_model_parallel]: 4.85001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.34e-06 [overlap_recompute_allgather_and_fa_grad]: 1.24e-06 [overlap_recompute_comm]: 2.53998e-06 [overlap_grad_ring_attention]: 4.70001e-06 [overlap_grad_flash_sp]: 1.931e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 1.81e-06 [split_layernorm_comm]: 1.59998e-06 [handle_group_info]: 1.00999e-06 [symbol_engine_optimizer]: 7.995e-05, [1] [Cycle 1]: 7.573e-05, [6] [build]: 1.069e-05 [elim_shapecalc]: 9.47999e-06 [elim_not_effective]: 1.291e-05 [opt_reshape]: 6.32001e-06 [fold_const_symbol]: 8.92999e-06 [renormalize]: 1.8999e-07 [detach_backward]: 1.62001e-06 [pipeline_parallel_scheduler]: 1.68002e-06 [auto_monad_reorder]: 1.589e-05 [get_jit_bprop_graph]: 1.34e-06 [rewriter_after_jit_bprop_graph]: 3.53999e-06 [opt_after_jit_grad]: 0.00045387 [validate]: 4.208e-05 [backend_pass]: 9.10019e-07 [task_emit]: 0.045078 [execute]: 9.39e-06 Sums bootstrap : 0.001028s : 1.31% type_inference : 0.027505s : 35.05% event_method : 0.000023s : 0.03% auto_monad : 0.000079s : 0.10% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000034s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000242s : 0.31% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000084s : 0.11% optimize.opt_a.loop_unroll : 0.000038s : 0.05% optimize.opt_a.a_1 : 0.000685s : 0.87% optimize.opt_a.with_stream_mark : 0.000025s : 0.03% optimize.opt_a.recompute_prepare : 0.000012s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000130s : 0.17% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.01% optimize.opt_a.merge_send_recv : 0.000013s : 0.02% optimize.opt_a.auto_parallel : 0.000011s : 0.01% optimize.opt_a.parallel : 0.000025s : 0.03% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.02% optimize.opt_a.virtual_dataset : 0.000011s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.01% optimize.opt_a.virtual_output : 0.000010s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000014s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000018s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000017s : 0.02% optimize.opt_a.a_after_grad : 0.000016s : 0.02% optimize.opt_a.renormalize : 0.000997s : 1.27% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000020s : 0.03% optimize.opt_a.cse : 0.000051s : 0.07% optimize.opt_a.a_3 : 0.000072s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000016s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000470s : 0.60% optimize.opt_b.b_1 : 0.000124s : 0.16% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000024s : 0.03% optimize.loop_unroll : 0.000419s : 0.53% optimize.opt_after_cconv.c_1 : 0.000025s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000029s : 0.04% optimize.tuple_transform.d_1 : 0.000036s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000045s : 0.06% optimize.cse_after_recomputation.cse : 0.000013s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000019s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000016s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000454s : 0.58% validate : 0.000042s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.045078s : 57.45% execute : 0.000009s : 0.01% Time group info: ------[substitution.] 0.000183 26 1.11% : 0.000002s : 2: substitution.elim_not_effective 0.79% : 0.000001s : 2: substitution.fold_const_symbol 3.04% : 0.000006s : 3: substitution.graph_param_transform 79.56% : 0.000146s : 6: substitution.inline 1.75% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.52% : 0.000005s : 4: substitution.remove_not_recompute_node 1.76% : 0.000003s : 2: substitution.replace_old_param 3.69% : 0.000007s : 1: substitution.switch_simplify 5.77% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.027438 2 95.06% : 0.026084s : 1: type_inference.infer 4.94% : 0.001354s : 1: type_inference.specialize ------[replace.] 0.000079 9 58.92% : 0.000046s : 6: replace.inline 22.17% : 0.000017s : 1: replace.switch_simplify 18.91% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000157 9 90.31% : 0.000142s : 6: match.inline 3.84% : 0.000006s : 1: match.switch_simplify 5.85% : 0.000009s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000174 1092 0.98% : 0.000002s : 12: predicate.accumulaten_eliminater 0.84% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 6: predicate.addn_check_dump 0.95% : 0.000002s : 12: predicate.addn_zero_filter 0.88% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.27% : 0.000004s : 18: predicate.arithmetic_simplify 1.05% : 0.000002s : 12: predicate.cast_eliminate 0.58% : 0.000001s : 6: predicate.check_bprop_eliminate 0.50% : 0.000001s : 6: predicate.compare_switch_simplify 0.16% : 0.000000s : 3: predicate.const_output_eliminate 0.42% : 0.000001s : 6: predicate.depend_value_elim 0.99% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.03% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 3: predicate.elim_not_effective 0.39% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 15: predicate.environ_get_depend_swap 1.82% : 0.000003s : 21: predicate.environ_get_eliminate 1.12% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.54% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.62% : 0.000005s : 20: predicate.float_depend_g_call 0.48% : 0.000001s : 6: predicate.float_environ_get_switch 0.67% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 3: predicate.fold_const_symbol 0.62% : 0.000001s : 6: predicate.get_grad_eliminate 0.22% : 0.000000s : 3: predicate.graph_param_transform 0.50% : 0.000001s : 6: predicate.incorporate_call 0.44% : 0.000001s : 6: predicate.incorporate_call_switch 5.91% : 0.000010s : 50: predicate.inline 0.67% : 0.000001s : 6: predicate.inline_without_move 0.27% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 6: predicate.less_batch_normalization 1.85% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.39% : 0.000004s : 32: predicate.load_eliminater 0.89% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.84% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.66% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 6: predicate.merge_addn 0.50% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.92% : 0.000002s : 12: predicate.minmaximum_grad 1.05% : 0.000002s : 3: predicate.mutable_eliminate 0.36% : 0.000001s : 3: predicate.opt_reshape 0.34% : 0.000001s : 3: predicate.parallel_virtual_node 2.28% : 0.000004s : 20: predicate.partial_defer_inline 1.43% : 0.000002s : 17: predicate.partial_eliminate 0.93% : 0.000002s : 12: predicate.print_const_string_wrapper 0.51% : 0.000001s : 6: predicate.reduce_all_const_elim 1.39% : 0.000002s : 12: predicate.reduce_eliminate 2.46% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 6: predicate.remove_not_recompute_node 1.29% : 0.000002s : 20: predicate.replace_applicator 0.44% : 0.000001s : 6: predicate.replace_old_param 0.28% : 0.000000s : 3: predicate.reset_defer_inline 1.02% : 0.000002s : 12: predicate.reshape_eliminate 0.58% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 3: predicate.row_tensor_eliminate 0.74% : 0.000001s : 6: predicate.same_eliminate 0.39% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.68% : 0.000001s : 6: predicate.shard_identity_eliminate 0.68% : 0.000001s : 6: predicate.special_op_eliminate 0.58% : 0.000001s : 6: predicate.specialize_transform 0.72% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.77% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.28% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.73% : 0.000003s : 20: predicate.switch_defer_inline 2.22% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.11% : 0.000011s : 68: predicate.switch_simplify 1.10% : 0.000002s : 12: predicate.tile_eliminate 0.94% : 0.000002s : 12: predicate.transpose_eliminate 1.65% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000002s : 18: predicate.tuple_list_get_item_depend_reorder 3.24% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.54% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.63% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.27% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.89% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.32% : 0.000001s : 3: predicate.value_based_eliminate 0.68% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 6: predicate.virtual_output_eliminate 0.23% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001557 16 68.98% : 0.001074s : 8: func_graph_cloner_run.FuncGraphClonerGraph 31.02% : 0.000483s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.093178 196 0.00% : 0.000003s : 1: ForceFp32Comm 3.50% : 0.003261s : 1: add_attr 3.49% : 0.003252s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000049s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.09% : 0.000084s : 1: auto_monad 0.02% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.17% : 0.001093s : 1: bootstrap 0.03% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000010s : 1: environ_conv 0.03% : 0.000029s : 1: event_method 0.02% : 0.000017s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.46% : 0.000427s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.51% : 0.000478s : 1: mutable_eliminate 0.01% : 0.000006s : 1: offloading_packed_experts 0.01% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000013s : 1: opt.transform.mutable_eliminate 1.16% : 0.001085s : 78: opt.transform.opt_a 0.03% : 0.000024s : 1: opt.transform.opt_after_cconv 0.02% : 0.000021s : 1: opt.transform.opt_after_jit_grad 0.09% : 0.000087s : 28: opt.transform.opt_b 0.04% : 0.000040s : 2: opt.transform.opt_trans_graph 0.04% : 0.000034s : 4: opt.transform.symbol_engine_opt 3.09% : 0.002876s : 1: opt_a 0.11% : 0.000098s : 1: opt_after_cconv 0.50% : 0.000463s : 1: opt_after_jit_grad 0.23% : 0.000211s : 1: opt_b 5.32% : 0.004955s : 1: optimize 0.02% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000023s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.04% : 0.000039s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000034s : 1: remove_dup_value 0.59% : 0.000553s : 1: renormalize.infer 0.47% : 0.000436s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000019s : 1: rewriter_after_opt_a 0.27% : 0.000248s : 1: rewriter_before_opt_a 0.00% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000083s : 1: symbol_engine_optimizer 48.41% : 0.045105s : 1: task_emit 0.07% : 0.000069s : 1: tuple_transform 29.54% : 0.027523s : 1: type_inference 0.08% : 0.000072s : 1: validate TotalTime = 0.0970546, [24] [bootstrap]: 0.00123249 [type_inference]: 0.0723579 [event_method]: 2.419e-05 [auto_monad]: 9.057e-05 [graph_reusing]: 7.06001e-06 [inline]: 2.10002e-06 [add_attr]: 0.00554023, [1] [add_attr_with_inline]: 0.00552754, [1] [Cycle 1]: 7.14e-05, [2] [tag_attr]: 2.681e-05 [meta_addattr_fg_expand]: 6.46e-06 [parallel-infer-symbol]: 4.42e-06 [pre_auto_parallel]: 3.937e-05 [insert-virtual-dataset]: 2.66e-06 [parallel-infer-symbol-second]: 7.50006e-07 [dataset_repeat_opt]: 1.89999e-06 [pipeline_split]: 1.82001e-06 [optimize]: 0.00680581, [53] [py_interpret_to_execute]: 6.48e-06 [rewriter_before_opt_a]: 0.00028332 [opt_a]: 0.00416033, [2] [Cycle 1]: 0.00354486, [45] [expand_dump_flag]: 3.98001e-06 [switch_simplify]: 8.978e-05 [loop_unroll]: 3.893e-05 [a_1]: 0.00103364 [with_stream_mark]: 2.089e-05 [recompute_prepare]: 1.058e-05 [updatestate_depend_eliminate]: 4.48001e-06 [updatestate_assign_eliminate]: 4.52e-06 [updatestate_loads_eliminate]: 1.93e-05 [parameter_eliminate]: 2.37001e-06 [a_2]: 0.00010327 [accelerated_algorithm]: 7.14001e-06 [shard]: 1.72999e-06 [meta_shard_fg_expand]: 2.69999e-06 [shard_inline]: 5.90002e-06 [merge_send_recv]: 8.42998e-06 [auto_parallel]: 7.29001e-06 [parallel]: 3.527e-05 [flash_sp]: 8.28001e-06 [merge_comm]: 3.76999e-06 [allreduce_fusion]: 3.53e-06 [matmul_add_comm_reduction]: 9.36e-06 [allreduce_slice_to_reducescatter]: 9.39996e-07 [virtual_shard_identity]: 8.24002e-06 [virtual_dataset]: 7.31999e-06 [get_grad_eliminate_]: 6.91001e-06 [virtual_output]: 6.39001e-06 [merge_forward]: 3.95e-06 [cell_reuse_recompute_pass]: 2.02001e-06 [offload_activation]: 9.81e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.428e-05 [merge_recompute_call_nodes]: 1.48002e-06 [before_grad]: 1.093e-05 [set_forward_comm_id_for_comm_node_pass]: 3.85e-06 [meta_fg_expand]: 2.89999e-06 [flash_sp_send_recv_attached]: 2.54999e-06 [receive_attached]: 2.08998e-06 [after_resolve]: 1.038e-05 [a_after_grad]: 2.679e-05 [renormalize]: 0.00162628 [add_forward_monad_depend]: 6.61999e-06 [auto_monad_grad]: 2.61e-06 [auto_monad_eliminator]: 1.685e-05 [cse]: 3.726e-05 [a_3]: 4.709e-05 [Cycle 2]: 0.00060433, [45] [expand_dump_flag]: 1.30001e-06 [switch_simplify]: 8.01001e-06 [loop_unroll]: 5.86998e-06 [a_1]: 0.00010184 [with_stream_mark]: 1.298e-05 [recompute_prepare]: 6.21e-06 [updatestate_depend_eliminate]: 3.14001e-06 [updatestate_assign_eliminate]: 2.63998e-06 [updatestate_loads_eliminate]: 2.37999e-06 [parameter_eliminate]: 1.13001e-06 [a_2]: 6.46e-05 [accelerated_algorithm]: 5.54e-06 [shard]: 1.32e-06 [meta_shard_fg_expand]: 1.48002e-06 [shard_inline]: 5.32001e-06 [merge_send_recv]: 5.34e-06 [auto_parallel]: 6.21998e-06 [parallel]: 5.64998e-06 [flash_sp]: 3.2e-06 [merge_comm]: 2.98e-06 [allreduce_fusion]: 2.91e-06 [matmul_add_comm_reduction]: 6.05002e-06 [allreduce_slice_to_reducescatter]: 4.59986e-07 [virtual_shard_identity]: 6.27001e-06 [virtual_dataset]: 5.09998e-06 [get_grad_eliminate_]: 5.92001e-06 [virtual_output]: 5.09e-06 [merge_forward]: 3.07002e-06 [cell_reuse_recompute_pass]: 1.80001e-06 [offload_activation]: 6.31e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.363e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 8.60999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.22997e-06 [meta_fg_expand]: 1.92999e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.09e-06 [after_resolve]: 8.48999e-06 [a_after_grad]: 8.59e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.32e-06 [auto_monad_grad]: 1.12999e-06 [auto_monad_eliminator]: 6.96999e-06 [cse]: 1.627e-05 [a_3]: 3.106e-05 [py_interpret_to_execute_after_opt_a]: 4.87998e-06 [slice_cell_reuse_recomputed_activation]: 2.32001e-06 [rewriter_after_opt_a]: 1.864e-05 [convert_after_rewriter]: 1.27e-06 [order_py_execute_after_rewriter]: 1.12e-06 [mutable_eliminate]: 0.00065535 [opt_b]: 0.00033891, [1] [Cycle 1]: 0.00033224, [7] [b_1]: 0.00023859 [b_2]: 7.63999e-06 [updatestate_depend_eliminate]: 5.86e-06 [updatestate_assign_eliminate]: 2.83998e-06 [updatestate_loads_eliminate]: 1.791e-05 [renormalize]: 6.19999e-07 [cse]: 2.078e-05 [optimize_parallel_all_gather_comm]: 3.114e-05 [overlap_param_gather]: 2.12001e-06 [cconv]: 2.689e-05 [loop_unroll]: 0.00048136 [opt_after_cconv]: 0.00012245, [1] [Cycle 1]: 0.00011634, [7] [c_1]: 2.833e-05 [parameter_eliminate]: 2.46e-06 [updatestate_depend_eliminate]: 5.67001e-06 [updatestate_assign_eliminate]: 2.239e-05 [updatestate_loads_eliminate]: 2.51e-06 [cse]: 1.942e-05 [renormalize]: 4.10015e-07 [remove_dup_value]: 1.566e-05 [tuple_transform]: 9.395e-05, [1] [Cycle 1]: 8.888e-05, [4] [d_1]: 6.052e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 1.99972e-07 [switch_simplify]: 7.5e-06 [partial_unused_args_eliminate]: 1.80001e-06 [add_recomputation]: 4.82e-05 [cse_after_recomputation]: 2.458e-05, [1] [Cycle 1]: 1.97e-05, [1] [cse]: 1.424e-05 [environ_conv]: 9.35001e-06 [swap_dp_allreduce_reducescatter]: 5.46998e-06 [bias_add_comm_swap]: 3.08e-06 [label_micro_interleaved_index]: 4.52e-06 [label_fine_grained_interleaved_index]: 2.71e-06 [merge_cast_opt]: 1.34998e-06 [slice_recompute_activation]: 2.11e-06 [micro_interleaved_order_control]: 2.43e-06 [assign_add_opt]: 1.21997e-06 [ForceFp32Comm]: 7.59988e-07 [remove_cast_before_assign_add]: 1.32e-06 [full_micro_interleaved_order_control]: 3.20998e-06 [reorder_send_recv_between_fp_bp]: 2.74001e-06 [comm_op_add_attrs]: 1.12e-06 [add_comm_op_reuse_tag]: 1.00999e-06 [interleave_split_concat_branches]: 1.55001e-06 [interleave_parallel_branches]: 1.13001e-06 [overlap_opt_shard_in_pipeline]: 1.19e-06 [overlap_opt_shard_grad_in_pipeline]: 1.71998e-06 [control_data_broadcast_order]: 1.251e-05 [grouped_pairwise_exchange_alltoall]: 1.57999e-06 [offloading_packed_experts]: 3.61001e-06 [overlap_recompute_and_grad_model_parallel]: 5.27999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.50999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.36998e-06 [overlap_recompute_comm]: 2.48e-06 [overlap_grad_ring_attention]: 4.37e-06 [overlap_grad_flash_sp]: 1.875e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.21e-06 [split_layernorm_comm]: 1.82999e-06 [handle_group_info]: 1.10999e-06 [symbol_engine_optimizer]: 8.894e-05, [1] [Cycle 1]: 8.44e-05, [6] [build]: 1.375e-05 [elim_shapecalc]: 1.071e-05 [elim_not_effective]: 1.226e-05 [opt_reshape]: 6.92002e-06 [fold_const_symbol]: 1.037e-05 [renormalize]: 2.69996e-07 [detach_backward]: 1.91e-06 [pipeline_parallel_scheduler]: 1.44998e-06 [auto_monad_reorder]: 2.032e-05 [get_jit_bprop_graph]: 2.46e-06 [rewriter_after_jit_bprop_graph]: 3.83001e-06 [opt_after_jit_grad]: 0.0005479 [validate]: 8.752e-05 [backend_pass]: 1.12e-06 [task_emit]: 0.00995062 [execute]: 8.23999e-06 Sums bootstrap : 0.001232s : 1.36% type_inference : 0.072358s : 80.13% event_method : 0.000024s : 0.03% auto_monad : 0.000091s : 0.10% graph_reusing : 0.000007s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000039s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.01% optimize.rewriter_before_opt_a : 0.000283s : 0.31% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000098s : 0.11% optimize.opt_a.loop_unroll : 0.000045s : 0.05% optimize.opt_a.a_1 : 0.001135s : 1.26% optimize.opt_a.with_stream_mark : 0.000034s : 0.04% optimize.opt_a.recompute_prepare : 0.000017s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000022s : 0.02% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000168s : 0.19% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.01% optimize.opt_a.merge_send_recv : 0.000014s : 0.02% optimize.opt_a.auto_parallel : 0.000014s : 0.01% optimize.opt_a.parallel : 0.000041s : 0.05% optimize.opt_a.flash_sp : 0.000011s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.02% optimize.opt_a.virtual_dataset : 0.000012s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000011s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000016s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000028s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.02% optimize.opt_a.a_after_grad : 0.000035s : 0.04% optimize.opt_a.renormalize : 0.001626s : 1.80% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.01% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.03% optimize.opt_a.cse : 0.000054s : 0.06% optimize.opt_a.a_3 : 0.000078s : 0.09% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000019s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000655s : 0.73% optimize.opt_b.b_1 : 0.000239s : 0.26% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000018s : 0.02% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000027s : 0.03% optimize.loop_unroll : 0.000481s : 0.53% optimize.opt_after_cconv.c_1 : 0.000028s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000022s : 0.02% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.02% optimize.tuple_transform.d_1 : 0.000061s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000048s : 0.05% optimize.cse_after_recomputation.cse : 0.000014s : 0.02% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000002s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000019s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000014s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000020s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000548s : 0.61% validate : 0.000088s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.009951s : 11.02% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.000252 26 0.83% : 0.000002s : 2: substitution.elim_not_effective 0.74% : 0.000002s : 2: substitution.fold_const_symbol 2.13% : 0.000005s : 3: substitution.graph_param_transform 78.78% : 0.000199s : 6: substitution.inline 1.54% : 0.000004s : 4: substitution.j_node_and_user_rematch 2.49% : 0.000006s : 4: substitution.remove_not_recompute_node 1.60% : 0.000004s : 2: substitution.replace_old_param 3.07% : 0.000008s : 1: substitution.switch_simplify 8.81% : 0.000022s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.072249 2 96.76% : 0.069907s : 1: type_inference.infer 3.24% : 0.002342s : 1: type_inference.specialize ------[replace.] 0.000104 9 59.68% : 0.000062s : 6: replace.inline 19.58% : 0.000020s : 1: replace.switch_simplify 20.74% : 0.000022s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000221 9 87.86% : 0.000195s : 6: match.inline 2.88% : 0.000006s : 1: match.switch_simplify 9.26% : 0.000021s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000183 1092 1.21% : 0.000002s : 12: predicate.accumulaten_eliminater 0.79% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.42% : 0.000001s : 6: predicate.addn_check_dump 0.99% : 0.000002s : 12: predicate.addn_zero_filter 0.95% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.41% : 0.000004s : 18: predicate.arithmetic_simplify 1.00% : 0.000002s : 12: predicate.cast_eliminate 0.49% : 0.000001s : 6: predicate.check_bprop_eliminate 0.49% : 0.000001s : 6: predicate.compare_switch_simplify 0.17% : 0.000000s : 3: predicate.const_output_eliminate 0.51% : 0.000001s : 6: predicate.depend_value_elim 0.99% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.39% : 0.000003s : 12: predicate.dict_get_item_eliminator 1.00% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.94% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 3: predicate.elim_not_effective 0.40% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.20% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 15: predicate.environ_get_depend_swap 1.79% : 0.000003s : 21: predicate.environ_get_eliminate 1.04% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.49% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.64% : 0.000005s : 20: predicate.float_depend_g_call 0.45% : 0.000001s : 6: predicate.float_environ_get_switch 0.70% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.17% : 0.000000s : 3: predicate.fold_const_symbol 0.60% : 0.000001s : 6: predicate.get_grad_eliminate 0.18% : 0.000000s : 3: predicate.graph_param_transform 0.47% : 0.000001s : 6: predicate.incorporate_call 0.41% : 0.000001s : 6: predicate.incorporate_call_switch 6.03% : 0.000011s : 50: predicate.inline 0.61% : 0.000001s : 6: predicate.inline_without_move 0.25% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.67% : 0.000001s : 6: predicate.less_batch_normalization 2.00% : 0.000004s : 20: predicate.list_to_tuple_eliminator_ 2.52% : 0.000005s : 32: predicate.load_eliminater 0.94% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.78% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.76% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 6: predicate.merge_addn 0.45% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.56% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.82% : 0.000001s : 12: predicate.minmaximum_grad 1.10% : 0.000002s : 3: predicate.mutable_eliminate 0.34% : 0.000001s : 3: predicate.opt_reshape 0.47% : 0.000001s : 3: predicate.parallel_virtual_node 2.35% : 0.000004s : 20: predicate.partial_defer_inline 1.37% : 0.000003s : 17: predicate.partial_eliminate 1.05% : 0.000002s : 12: predicate.print_const_string_wrapper 0.44% : 0.000001s : 6: predicate.reduce_all_const_elim 1.27% : 0.000002s : 12: predicate.reduce_eliminate 2.34% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.32% : 0.000001s : 6: predicate.remove_not_recompute_node 1.21% : 0.000002s : 20: predicate.replace_applicator 0.47% : 0.000001s : 6: predicate.replace_old_param 0.20% : 0.000000s : 3: predicate.reset_defer_inline 1.01% : 0.000002s : 12: predicate.reshape_eliminate 0.56% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 3: predicate.row_tensor_eliminate 0.91% : 0.000002s : 6: predicate.same_eliminate 0.35% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.68% : 0.000001s : 6: predicate.shard_identity_eliminate 0.75% : 0.000001s : 6: predicate.special_op_eliminate 0.63% : 0.000001s : 6: predicate.specialize_transform 0.68% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.98% : 0.000002s : 6: predicate.stack_unstack_eliminate 0.26% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.62% : 0.000003s : 20: predicate.switch_defer_inline 2.33% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.89% : 0.000011s : 68: predicate.switch_simplify 1.04% : 0.000002s : 12: predicate.tile_eliminate 0.98% : 0.000002s : 12: predicate.transpose_eliminate 1.55% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.52% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.38% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.08% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.55% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.13% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.83% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.26% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 3.08% : 0.000006s : 38: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 3: predicate.value_based_eliminate 0.57% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 6: predicate.virtual_output_eliminate 0.23% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004067 16 69.44% : 0.002824s : 8: func_graph_cloner_run.FuncGraphClonerGraph 30.56% : 0.001243s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.112963 196 0.00% : 0.000003s : 1: ForceFp32Comm 4.91% : 0.005547s : 1: add_attr 4.90% : 0.005531s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000052s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.09% : 0.000097s : 1: auto_monad 0.02% : 0.000025s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.16% : 0.001306s : 1: bootstrap 0.03% : 0.000030s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.03% : 0.000031s : 1: event_method 0.01% : 0.000014s : 1: execute 0.01% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.43% : 0.000490s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.59% : 0.000664s : 1: mutable_eliminate 0.01% : 0.000006s : 1: offloading_packed_experts 0.03% : 0.000037s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000014s : 1: opt.transform.mutable_eliminate 1.44% : 0.001627s : 78: opt.transform.opt_a 0.02% : 0.000027s : 1: opt.transform.opt_after_cconv 0.06% : 0.000063s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000186s : 28: opt.transform.opt_b 0.06% : 0.000066s : 2: opt.transform.opt_trans_graph 0.03% : 0.000037s : 4: opt.transform.symbol_engine_opt 3.69% : 0.004164s : 1: opt_a 0.11% : 0.000127s : 1: opt_after_cconv 0.49% : 0.000558s : 1: opt_after_jit_grad 0.30% : 0.000343s : 1: opt_b 6.03% : 0.006811s : 1: optimize 0.03% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000022s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.02% : 0.000020s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000044s : 1: pre_auto_parallel 0.01% : 0.000010s : 1: py_interpret_to_execute 0.01% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.05% : 0.000052s : 1: remove_cast_before_assign_add 0.02% : 0.000019s : 1: remove_dup_value 0.70% : 0.000793s : 1: renormalize.infer 0.73% : 0.000824s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000022s : 1: rewriter_after_opt_a 0.26% : 0.000290s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000091s : 1: symbol_engine_optimizer 8.82% : 0.009966s : 1: task_emit 0.09% : 0.000097s : 1: tuple_transform 64.09% : 0.072398s : 1: type_inference 0.12% : 0.000135s : 1: validate TotalTime = 0.0449265, [24] [bootstrap]: 0.00050173 [type_inference]: 0.0263092 [event_method]: 2.253e-05 [auto_monad]: 8.201e-05 [graph_reusing]: 6.51e-06 [inline]: 2.11e-06 [add_attr]: 0.00329455, [1] [add_attr_with_inline]: 0.00328624, [1] [Cycle 1]: 5.789e-05, [2] [tag_attr]: 2.212e-05 [meta_addattr_fg_expand]: 6.69001e-06 [parallel-infer-symbol]: 3.10998e-06 [pre_auto_parallel]: 3.65e-05 [insert-virtual-dataset]: 3.06999e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.81e-06 [pipeline_split]: 2.12001e-06 [optimize]: 0.00537003, [53] [py_interpret_to_execute]: 4.15999e-06 [rewriter_before_opt_a]: 0.00023677 [opt_a]: 0.0032819, [2] [Cycle 1]: 0.00264546, [45] [expand_dump_flag]: 3.86001e-06 [switch_simplify]: 8.266e-05 [loop_unroll]: 3.877e-05 [a_1]: 0.00071819 [with_stream_mark]: 1.547e-05 [recompute_prepare]: 9.36e-06 [updatestate_depend_eliminate]: 4.45e-06 [updatestate_assign_eliminate]: 3.95e-06 [updatestate_loads_eliminate]: 3.36001e-06 [parameter_eliminate]: 1.87999e-06 [a_2]: 7.973e-05 [accelerated_algorithm]: 6.69001e-06 [shard]: 1.59e-06 [meta_shard_fg_expand]: 2.37999e-06 [shard_inline]: 6.12001e-06 [merge_send_recv]: 8.12e-06 [auto_parallel]: 6.09001e-06 [parallel]: 2.149e-05 [flash_sp]: 7.16001e-06 [merge_comm]: 3.85998e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 9.25999e-06 [allreduce_slice_to_reducescatter]: 6.10016e-07 [virtual_shard_identity]: 7.43999e-06 [virtual_dataset]: 6.68998e-06 [get_grad_eliminate_]: 6.88e-06 [virtual_output]: 5.97999e-06 [merge_forward]: 4.23001e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 8.95999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.393e-05 [merge_recompute_call_nodes]: 1.83002e-06 [before_grad]: 9.84999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.88001e-06 [meta_fg_expand]: 3.16999e-06 [flash_sp_send_recv_attached]: 2.45002e-06 [receive_attached]: 2.01e-06 [after_resolve]: 9.99999e-06 [a_after_grad]: 9.63002e-06 [renormalize]: 0.00115531 [add_forward_monad_depend]: 5.46e-06 [auto_monad_grad]: 1.76e-06 [auto_monad_eliminator]: 1.601e-05 [cse]: 3.591e-05 [a_3]: 4.62e-05 [Cycle 2]: 0.00062713, [45] [expand_dump_flag]: 1.27999e-06 [switch_simplify]: 7.82e-06 [loop_unroll]: 5.76998e-06 [a_1]: 0.00010081 [with_stream_mark]: 1.117e-05 [recompute_prepare]: 5.57001e-06 [updatestate_depend_eliminate]: 2.97002e-06 [updatestate_assign_eliminate]: 2.97002e-06 [updatestate_loads_eliminate]: 2.22999e-06 [parameter_eliminate]: 9.50007e-07 [a_2]: 6.608e-05 [accelerated_algorithm]: 5.81e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 1.35001e-06 [shard_inline]: 5.40001e-06 [merge_send_recv]: 4.30999e-06 [auto_parallel]: 4.90999e-06 [parallel]: 3.86999e-06 [flash_sp]: 3.04001e-06 [merge_comm]: 3.07002e-06 [allreduce_fusion]: 2.86e-06 [matmul_add_comm_reduction]: 5.17e-06 [allreduce_slice_to_reducescatter]: 4.39992e-07 [virtual_shard_identity]: 6.33e-06 [virtual_dataset]: 5.44998e-06 [get_grad_eliminate_]: 5.84e-06 [virtual_output]: 4.566e-05 [merge_forward]: 2.96001e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 6.08002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.288e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 8.48999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.36001e-06 [meta_fg_expand]: 1.87999e-06 [flash_sp_send_recv_attached]: 7.99977e-07 [receive_attached]: 1.02998e-06 [after_resolve]: 7.8e-06 [a_after_grad]: 8.11002e-06 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.30999e-06 [auto_monad_grad]: 9.70002e-07 [auto_monad_eliminator]: 6.80998e-06 [cse]: 1.837e-05 [a_3]: 3.295e-05 [py_interpret_to_execute_after_opt_a]: 3.91999e-06 [slice_cell_reuse_recomputed_activation]: 1.94e-06 [rewriter_after_opt_a]: 1.816e-05 [convert_after_rewriter]: 1.26997e-06 [order_py_execute_after_rewriter]: 1.14e-06 [mutable_eliminate]: 0.00047355 [opt_b]: 0.00020799, [1] [Cycle 1]: 0.00020187, [7] [b_1]: 0.0001266 [b_2]: 7.28999e-06 [updatestate_depend_eliminate]: 5.57999e-06 [updatestate_assign_eliminate]: 2.87002e-06 [updatestate_loads_eliminate]: 2.63998e-06 [renormalize]: 5.50004e-07 [cse]: 2.087e-05 [optimize_parallel_all_gather_comm]: 1.649e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 2.492e-05 [loop_unroll]: 0.00042593 [opt_after_cconv]: 0.00010113, [1] [Cycle 1]: 9.577e-05, [7] [c_1]: 2.836e-05 [parameter_eliminate]: 2.43e-06 [updatestate_depend_eliminate]: 5.47001e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.72001e-06 [cse]: 2.045e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 2.681e-05 [tuple_transform]: 7.162e-05, [1] [Cycle 1]: 6.68e-05, [4] [d_1]: 4.032e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 1.40019e-07 [switch_simplify]: 6.73e-06 [partial_unused_args_eliminate]: 1.77999e-06 [add_recomputation]: 4.527e-05 [cse_after_recomputation]: 2.195e-05, [1] [Cycle 1]: 1.767e-05, [1] [cse]: 1.201e-05 [environ_conv]: 8.05999e-06 [swap_dp_allreduce_reducescatter]: 6.08998e-06 [bias_add_comm_swap]: 2.81e-06 [label_micro_interleaved_index]: 4.20999e-06 [label_fine_grained_interleaved_index]: 2.88e-06 [merge_cast_opt]: 1.19e-06 [slice_recompute_activation]: 2.01e-06 [micro_interleaved_order_control]: 2.56e-06 [assign_add_opt]: 1.25001e-06 [ForceFp32Comm]: 7.50006e-07 [remove_cast_before_assign_add]: 1.39e-06 [full_micro_interleaved_order_control]: 2.42001e-06 [reorder_send_recv_between_fp_bp]: 2.76e-06 [comm_op_add_attrs]: 1.04003e-06 [add_comm_op_reuse_tag]: 1.35999e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.29e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04999e-06 [control_data_broadcast_order]: 1.292e-05 [grouped_pairwise_exchange_alltoall]: 1.82001e-06 [offloading_packed_experts]: 3.95e-06 [overlap_recompute_and_grad_model_parallel]: 5.08002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.34e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.22999e-06 [overlap_grad_ring_attention]: 4.16001e-06 [overlap_grad_flash_sp]: 1.743e-05 [begin_end_overlap_inline]: 6.10016e-07 [split_matmul_comm_elemetwise]: 2.12999e-06 [split_layernorm_comm]: 1.64e-06 [handle_group_info]: 1.24998e-06 [symbol_engine_optimizer]: 8.023e-05, [1] [Cycle 1]: 7.629e-05, [6] [build]: 1.067e-05 [elim_shapecalc]: 1.059e-05 [elim_not_effective]: 1.169e-05 [opt_reshape]: 6.36e-06 [fold_const_symbol]: 9.67999e-06 [renormalize]: 2.30008e-07 [detach_backward]: 1.64e-06 [pipeline_parallel_scheduler]: 1.73002e-06 [auto_monad_reorder]: 1.688e-05 [get_jit_bprop_graph]: 9.89996e-07 [rewriter_after_jit_bprop_graph]: 3.28e-06 [opt_after_jit_grad]: 0.00046698 [validate]: 3.905e-05 [backend_pass]: 9.89996e-07 [task_emit]: 0.00852945 [execute]: 7e-06 Sums bootstrap : 0.000502s : 1.24% type_inference : 0.026309s : 64.77% event_method : 0.000023s : 0.06% auto_monad : 0.000082s : 0.20% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000022s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000037s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000237s : 0.58% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000090s : 0.22% optimize.opt_a.loop_unroll : 0.000045s : 0.11% optimize.opt_a.a_1 : 0.000819s : 2.02% optimize.opt_a.with_stream_mark : 0.000027s : 0.07% optimize.opt_a.recompute_prepare : 0.000015s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000146s : 0.36% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000012s : 0.03% optimize.opt_a.merge_send_recv : 0.000012s : 0.03% optimize.opt_a.auto_parallel : 0.000011s : 0.03% optimize.opt_a.parallel : 0.000025s : 0.06% optimize.opt_a.flash_sp : 0.000010s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.03% optimize.opt_a.virtual_dataset : 0.000012s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.03% optimize.opt_a.virtual_output : 0.000052s : 0.13% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000015s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.07% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000018s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000018s : 0.04% optimize.opt_a.a_after_grad : 0.000018s : 0.04% optimize.opt_a.renormalize : 0.001155s : 2.84% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.06% optimize.opt_a.cse : 0.000054s : 0.13% optimize.opt_a.a_3 : 0.000079s : 0.19% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000018s : 0.04% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000474s : 1.17% optimize.opt_b.b_1 : 0.000127s : 0.31% optimize.opt_b.b_2 : 0.000007s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000021s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000025s : 0.06% optimize.loop_unroll : 0.000426s : 1.05% optimize.opt_after_cconv.c_1 : 0.000028s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000027s : 0.07% optimize.tuple_transform.d_1 : 0.000040s : 0.10% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000045s : 0.11% optimize.cse_after_recomputation.cse : 0.000012s : 0.03% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000017s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000017s : 0.04% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000467s : 1.15% validate : 0.000039s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.008529s : 21.00% execute : 0.000007s : 0.02% Time group info: ------[substitution.] 0.000198 26 1.12% : 0.000002s : 2: substitution.elim_not_effective 0.62% : 0.000001s : 2: substitution.fold_const_symbol 2.57% : 0.000005s : 3: substitution.graph_param_transform 79.13% : 0.000156s : 6: substitution.inline 1.55% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.74% : 0.000005s : 4: substitution.remove_not_recompute_node 1.72% : 0.000003s : 2: substitution.replace_old_param 4.23% : 0.000008s : 1: substitution.switch_simplify 6.33% : 0.000013s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.026247 2 94.92% : 0.024912s : 1: type_inference.infer 5.08% : 0.001334s : 1: type_inference.specialize ------[replace.] 0.000086 9 59.52% : 0.000051s : 6: replace.inline 19.48% : 0.000017s : 1: replace.switch_simplify 21.00% : 0.000018s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000170 9 89.55% : 0.000152s : 6: match.inline 4.03% : 0.000007s : 1: match.switch_simplify 6.42% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000178 1092 0.95% : 0.000002s : 12: predicate.accumulaten_eliminater 0.98% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 6: predicate.addn_check_dump 0.95% : 0.000002s : 12: predicate.addn_zero_filter 0.86% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.34% : 0.000004s : 18: predicate.arithmetic_simplify 0.98% : 0.000002s : 12: predicate.cast_eliminate 0.52% : 0.000001s : 6: predicate.check_bprop_eliminate 0.48% : 0.000001s : 6: predicate.compare_switch_simplify 0.17% : 0.000000s : 3: predicate.const_output_eliminate 0.52% : 0.000001s : 6: predicate.depend_value_elim 0.99% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.16% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.21% : 0.000000s : 3: predicate.elim_not_effective 0.47% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.15% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.22% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.17% : 0.000002s : 15: predicate.environ_get_depend_swap 1.81% : 0.000003s : 21: predicate.environ_get_eliminate 1.17% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.71% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.66% : 0.000005s : 20: predicate.float_depend_g_call 0.42% : 0.000001s : 6: predicate.float_environ_get_switch 0.63% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.57% : 0.000001s : 6: predicate.get_grad_eliminate 0.20% : 0.000000s : 3: predicate.graph_param_transform 0.49% : 0.000001s : 6: predicate.incorporate_call 0.43% : 0.000001s : 6: predicate.incorporate_call_switch 5.96% : 0.000011s : 50: predicate.inline 0.65% : 0.000001s : 6: predicate.inline_without_move 0.28% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.70% : 0.000001s : 6: predicate.less_batch_normalization 1.72% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.34% : 0.000004s : 32: predicate.load_eliminater 1.13% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.87% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.67% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.51% : 0.000001s : 6: predicate.merge_addn 0.47% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 12: predicate.minmaximum_grad 1.05% : 0.000002s : 3: predicate.mutable_eliminate 0.34% : 0.000001s : 3: predicate.opt_reshape 0.51% : 0.000001s : 3: predicate.parallel_virtual_node 2.20% : 0.000004s : 20: predicate.partial_defer_inline 1.41% : 0.000003s : 17: predicate.partial_eliminate 1.02% : 0.000002s : 12: predicate.print_const_string_wrapper 0.48% : 0.000001s : 6: predicate.reduce_all_const_elim 1.31% : 0.000002s : 12: predicate.reduce_eliminate 2.34% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000001s : 6: predicate.remove_not_recompute_node 1.23% : 0.000002s : 20: predicate.replace_applicator 0.36% : 0.000001s : 6: predicate.replace_old_param 0.25% : 0.000000s : 3: predicate.reset_defer_inline 1.06% : 0.000002s : 12: predicate.reshape_eliminate 0.57% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.33% : 0.000001s : 3: predicate.row_tensor_eliminate 0.70% : 0.000001s : 6: predicate.same_eliminate 0.34% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.64% : 0.000001s : 6: predicate.shard_identity_eliminate 0.87% : 0.000002s : 6: predicate.special_op_eliminate 0.58% : 0.000001s : 6: predicate.specialize_transform 0.76% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.28% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.72% : 0.000003s : 20: predicate.switch_defer_inline 2.18% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.10% : 0.000011s : 68: predicate.switch_simplify 0.99% : 0.000002s : 12: predicate.tile_eliminate 0.90% : 0.000002s : 12: predicate.transpose_eliminate 1.54% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.35% : 0.000006s : 26: predicate.tuple_list_get_item_eliminator 1.56% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.24% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.70% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.32% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 3.02% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 3: predicate.value_based_eliminate 0.55% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.63% : 0.000001s : 6: predicate.virtual_output_eliminate 0.22% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.45% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001226 16 50.72% : 0.000622s : 8: func_graph_cloner_run.FuncGraphClonerGraph 49.28% : 0.000604s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.056217 196 0.01% : 0.000003s : 1: ForceFp32Comm 5.87% : 0.003299s : 1: add_attr 5.85% : 0.003290s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000049s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.16% : 0.000087s : 1: auto_monad 0.04% : 0.000020s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.99% : 0.000556s : 1: bootstrap 0.05% : 0.000028s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.04% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.05% : 0.000028s : 1: event_method 0.02% : 0.000012s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000007s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.77% : 0.000434s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.86% : 0.000482s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000014s : 1: opt.transform.mutable_eliminate 2.31% : 0.001300s : 78: opt.transform.opt_a 0.05% : 0.000027s : 1: opt.transform.opt_after_cconv 0.04% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.18% : 0.000102s : 28: opt.transform.opt_b 0.08% : 0.000045s : 2: opt.transform.opt_trans_graph 0.06% : 0.000035s : 4: opt.transform.symbol_engine_opt 5.84% : 0.003285s : 1: opt_a 0.19% : 0.000105s : 1: opt_after_cconv 0.85% : 0.000476s : 1: opt_after_jit_grad 0.38% : 0.000211s : 1: opt_b 9.56% : 0.005375s : 1: optimize 0.04% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000021s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000041s : 1: pre_auto_parallel 0.01% : 0.000007s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000031s : 1: remove_dup_value 0.93% : 0.000524s : 1: renormalize.infer 1.11% : 0.000623s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000022s : 1: rewriter_after_opt_a 0.43% : 0.000243s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000004s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000083s : 1: symbol_engine_optimizer 15.19% : 0.008541s : 1: task_emit 0.13% : 0.000074s : 1: tuple_transform 46.83% : 0.026324s : 1: type_inference 0.12% : 0.000068s : 1: validate TotalTime = 0.0462501, [24] [bootstrap]: 0.00045052 [type_inference]: 0.028118 [event_method]: 2.265e-05 [auto_monad]: 8.494e-05 [graph_reusing]: 5.85002e-06 [inline]: 3.27002e-06 [add_attr]: 0.00354725, [1] [add_attr_with_inline]: 0.00353828, [1] [Cycle 1]: 5.516e-05, [2] [tag_attr]: 2.1e-05 [meta_addattr_fg_expand]: 6.24999e-06 [parallel-infer-symbol]: 3.53e-06 [pre_auto_parallel]: 3.534e-05 [insert-virtual-dataset]: 2.84999e-06 [parallel-infer-symbol-second]: 1.01002e-06 [dataset_repeat_opt]: 1.79e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.0051166, [53] [py_interpret_to_execute]: 3.94002e-06 [rewriter_before_opt_a]: 0.0002365 [opt_a]: 0.00300979, [2] [Cycle 1]: 0.00244889, [45] [expand_dump_flag]: 3.43999e-06 [switch_simplify]: 7.524e-05 [loop_unroll]: 3.203e-05 [a_1]: 0.0005864 [with_stream_mark]: 1.391e-05 [recompute_prepare]: 7.01001e-06 [updatestate_depend_eliminate]: 4.322e-05 [updatestate_assign_eliminate]: 3.51001e-06 [updatestate_loads_eliminate]: 3.09999e-06 [parameter_eliminate]: 2.06e-06 [a_2]: 7.058e-05 [accelerated_algorithm]: 6.16e-06 [shard]: 1.87001e-06 [meta_shard_fg_expand]: 1.92999e-06 [shard_inline]: 5.50001e-06 [merge_send_recv]: 8.84e-06 [auto_parallel]: 5.66e-06 [parallel]: 1.876e-05 [flash_sp]: 8.06001e-06 [merge_comm]: 3.64002e-06 [allreduce_fusion]: 3.38e-06 [matmul_add_comm_reduction]: 8.35001e-06 [allreduce_slice_to_reducescatter]: 7.30011e-07 [virtual_shard_identity]: 7.23e-06 [virtual_dataset]: 5.71e-06 [get_grad_eliminate_]: 5.61e-06 [virtual_output]: 5.44e-06 [merge_forward]: 3.76001e-06 [cell_reuse_recompute_pass]: 1.13001e-06 [offload_activation]: 9.24e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.199e-05 [merge_recompute_call_nodes]: 1.69e-06 [before_grad]: 9.40001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.45e-06 [meta_fg_expand]: 2.97002e-06 [flash_sp_send_recv_attached]: 2.82002e-06 [receive_attached]: 2.94001e-06 [after_resolve]: 9.04e-06 [a_after_grad]: 8.37998e-06 [renormalize]: 0.00111603 [add_forward_monad_depend]: 5.91e-06 [auto_monad_grad]: 1.97999e-06 [auto_monad_eliminator]: 1.489e-05 [cse]: 3.582e-05 [a_3]: 3.987e-05 [Cycle 2]: 0.00055155, [45] [expand_dump_flag]: 1.10999e-06 [switch_simplify]: 6.56e-06 [loop_unroll]: 6.04999e-06 [a_1]: 9.518e-05 [with_stream_mark]: 1.078e-05 [recompute_prepare]: 5.72999e-06 [updatestate_depend_eliminate]: 2.93e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.17999e-06 [parameter_eliminate]: 9.09989e-07 [a_2]: 6.103e-05 [accelerated_algorithm]: 5.29998e-06 [shard]: 1.04e-06 [meta_shard_fg_expand]: 1.44998e-06 [shard_inline]: 5.07999e-06 [merge_send_recv]: 4.13001e-06 [auto_parallel]: 5.04e-06 [parallel]: 4.15e-06 [flash_sp]: 3.04001e-06 [merge_comm]: 3.00998e-06 [allreduce_fusion]: 2.77002e-06 [matmul_add_comm_reduction]: 4.68999e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 5.91998e-06 [virtual_dataset]: 5.14e-06 [get_grad_eliminate_]: 5.09e-06 [virtual_output]: 4.99e-06 [merge_forward]: 2.64001e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 5.63002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.248e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 7.78999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.06001e-06 [meta_fg_expand]: 1.93002e-06 [flash_sp_send_recv_attached]: 8.40024e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 7.33999e-06 [a_after_grad]: 7.28e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.13001e-06 [auto_monad_grad]: 8.59989e-07 [auto_monad_eliminator]: 6.01e-06 [cse]: 1.537e-05 [a_3]: 3.078e-05 [py_interpret_to_execute_after_opt_a]: 3.97002e-06 [slice_cell_reuse_recomputed_activation]: 2.53e-06 [rewriter_after_opt_a]: 1.659e-05 [convert_after_rewriter]: 1.24998e-06 [order_py_execute_after_rewriter]: 1.10999e-06 [mutable_eliminate]: 0.00049888 [opt_b]: 0.00018512, [1] [Cycle 1]: 0.00017886, [7] [b_1]: 0.0001074 [b_2]: 6.78998e-06 [updatestate_depend_eliminate]: 5.40001e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.32999e-06 [renormalize]: 3.50003e-07 [cse]: 1.98e-05 [optimize_parallel_all_gather_comm]: 1.624e-05 [overlap_param_gather]: 2.69001e-06 [cconv]: 2.3e-05 [loop_unroll]: 0.00044235 [opt_after_cconv]: 9.471e-05, [1] [Cycle 1]: 8.893e-05, [7] [c_1]: 2.483e-05 [parameter_eliminate]: 2.58e-06 [updatestate_depend_eliminate]: 5.00001e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.24001e-06 [cse]: 1.931e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 2.909e-05 [tuple_transform]: 6.67e-05, [1] [Cycle 1]: 6.162e-05, [4] [d_1]: 3.581e-05 [none_parameter_eliminate]: 1.76998e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 5.96998e-06 [partial_unused_args_eliminate]: 1.91e-06 [add_recomputation]: 4.619e-05 [cse_after_recomputation]: 2.136e-05, [1] [Cycle 1]: 1.706e-05, [1] [cse]: 1.175e-05 [environ_conv]: 7.56999e-06 [swap_dp_allreduce_reducescatter]: 5.27999e-06 [bias_add_comm_swap]: 3.23e-06 [label_micro_interleaved_index]: 4.65999e-06 [label_fine_grained_interleaved_index]: 2.64999e-06 [merge_cast_opt]: 1.30001e-06 [slice_recompute_activation]: 1.99e-06 [micro_interleaved_order_control]: 2.37999e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 7.30011e-07 [remove_cast_before_assign_add]: 1.17999e-06 [full_micro_interleaved_order_control]: 2.26e-06 [reorder_send_recv_between_fp_bp]: 2.63e-06 [comm_op_add_attrs]: 1.30999e-06 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.42e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.44998e-06 [overlap_opt_shard_grad_in_pipeline]: 2.06e-06 [control_data_broadcast_order]: 1.176e-05 [grouped_pairwise_exchange_alltoall]: 1.54998e-06 [offloading_packed_experts]: 4.23001e-06 [overlap_recompute_and_grad_model_parallel]: 4.87e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.16002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 1.97001e-06 [overlap_grad_ring_attention]: 4.09997e-06 [overlap_grad_flash_sp]: 1.738e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.02999e-06 [split_layernorm_comm]: 1.96003e-06 [handle_group_info]: 1.04998e-06 [symbol_engine_optimizer]: 9.524e-05, [1] [Cycle 1]: 9.094e-05, [6] [build]: 2.553e-05 [elim_shapecalc]: 9.09e-06 [elim_not_effective]: 1.097e-05 [opt_reshape]: 6.17001e-06 [fold_const_symbol]: 9.02999e-06 [renormalize]: 2.00002e-07 [detach_backward]: 1.66e-06 [pipeline_parallel_scheduler]: 1.38002e-06 [auto_monad_reorder]: 1.709e-05 [get_jit_bprop_graph]: 1.12999e-06 [rewriter_after_jit_bprop_graph]: 3.46999e-06 [opt_after_jit_grad]: 0.00048263 [validate]: 4.321e-05 [backend_pass]: 8.89995e-07 [task_emit]: 0.00808664 [execute]: 6.99001e-06 Sums bootstrap : 0.000451s : 1.08% type_inference : 0.028118s : 67.38% event_method : 0.000023s : 0.05% auto_monad : 0.000085s : 0.20% graph_reusing : 0.000006s : 0.01% inline : 0.000003s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.05% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000035s : 0.08% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000236s : 0.57% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000082s : 0.20% optimize.opt_a.loop_unroll : 0.000038s : 0.09% optimize.opt_a.a_1 : 0.000682s : 1.63% optimize.opt_a.with_stream_mark : 0.000025s : 0.06% optimize.opt_a.recompute_prepare : 0.000013s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000046s : 0.11% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000132s : 0.32% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.03% optimize.opt_a.merge_send_recv : 0.000013s : 0.03% optimize.opt_a.auto_parallel : 0.000011s : 0.03% optimize.opt_a.parallel : 0.000023s : 0.05% optimize.opt_a.flash_sp : 0.000011s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000013s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.03% optimize.opt_a.virtual_dataset : 0.000011s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.03% optimize.opt_a.virtual_output : 0.000010s : 0.02% optimize.opt_a.merge_forward : 0.000006s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.01% optimize.opt_a.offload_activation : 0.000015s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000017s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000016s : 0.04% optimize.opt_a.a_after_grad : 0.000016s : 0.04% optimize.opt_a.renormalize : 0.001116s : 2.67% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.05% optimize.opt_a.cse : 0.000051s : 0.12% optimize.opt_a.a_3 : 0.000071s : 0.17% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000017s : 0.04% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000499s : 1.20% optimize.opt_b.b_1 : 0.000107s : 0.26% optimize.opt_b.b_2 : 0.000007s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.04% optimize.overlap_param_gather : 0.000003s : 0.01% optimize.cconv : 0.000023s : 0.06% optimize.loop_unroll : 0.000442s : 1.06% optimize.opt_after_cconv.c_1 : 0.000025s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000019s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000029s : 0.07% optimize.tuple_transform.d_1 : 0.000036s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000046s : 0.11% optimize.cse_after_recomputation.cse : 0.000012s : 0.03% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000017s : 0.04% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000026s : 0.06% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000011s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000017s : 0.04% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000483s : 1.16% validate : 0.000043s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.008087s : 19.38% execute : 0.000007s : 0.02% Time group info: ------[substitution.] 0.000184 26 0.97% : 0.000002s : 2: substitution.elim_not_effective 0.73% : 0.000001s : 2: substitution.fold_const_symbol 2.79% : 0.000005s : 3: substitution.graph_param_transform 79.15% : 0.000146s : 6: substitution.inline 1.81% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.03% : 0.000006s : 4: substitution.remove_not_recompute_node 1.51% : 0.000003s : 2: substitution.replace_old_param 3.94% : 0.000007s : 1: substitution.switch_simplify 6.07% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.028039 2 94.50% : 0.026496s : 1: type_inference.infer 5.50% : 0.001543s : 1: type_inference.specialize ------[replace.] 0.000079 9 59.80% : 0.000047s : 6: replace.inline 20.49% : 0.000016s : 1: replace.switch_simplify 19.71% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000159 9 89.62% : 0.000143s : 6: match.inline 4.10% : 0.000007s : 1: match.switch_simplify 6.29% : 0.000010s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000178 1092 1.04% : 0.000002s : 12: predicate.accumulaten_eliminater 0.87% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 6: predicate.addn_check_dump 1.09% : 0.000002s : 12: predicate.addn_zero_filter 0.86% : 0.000002s : 12: predicate.adjust_all_reduce_mul_add 2.29% : 0.000004s : 18: predicate.arithmetic_simplify 1.09% : 0.000002s : 12: predicate.cast_eliminate 0.50% : 0.000001s : 6: predicate.check_bprop_eliminate 0.47% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.51% : 0.000001s : 6: predicate.depend_value_elim 0.99% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.13% : 0.000002s : 12: predicate.dict_get_item_eliminator 1.02% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.84% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.17% : 0.000000s : 3: predicate.elim_not_effective 0.38% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.06% : 0.000002s : 15: predicate.environ_get_depend_swap 1.64% : 0.000003s : 21: predicate.environ_get_eliminate 1.14% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.56% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.58% : 0.000005s : 20: predicate.float_depend_g_call 0.45% : 0.000001s : 6: predicate.float_environ_get_switch 0.68% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.67% : 0.000001s : 6: predicate.get_grad_eliminate 0.15% : 0.000000s : 3: predicate.graph_param_transform 0.52% : 0.000001s : 6: predicate.incorporate_call 0.43% : 0.000001s : 6: predicate.incorporate_call_switch 5.72% : 0.000010s : 50: predicate.inline 0.61% : 0.000001s : 6: predicate.inline_without_move 0.27% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.82% : 0.000001s : 6: predicate.less_batch_normalization 1.78% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.42% : 0.000004s : 32: predicate.load_eliminater 0.96% : 0.000002s : 3: predicate.loop_unroll_after_grad 3.15% : 0.000006s : 37: predicate.loop_unroll_before_grad 1.65% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 6: predicate.merge_addn 0.46% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.47% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.89% : 0.000002s : 12: predicate.minmaximum_grad 1.19% : 0.000002s : 3: predicate.mutable_eliminate 0.35% : 0.000001s : 3: predicate.opt_reshape 0.41% : 0.000001s : 3: predicate.parallel_virtual_node 2.17% : 0.000004s : 20: predicate.partial_defer_inline 1.38% : 0.000002s : 17: predicate.partial_eliminate 0.96% : 0.000002s : 12: predicate.print_const_string_wrapper 0.52% : 0.000001s : 6: predicate.reduce_all_const_elim 1.42% : 0.000003s : 12: predicate.reduce_eliminate 2.40% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 6: predicate.remove_not_recompute_node 1.19% : 0.000002s : 20: predicate.replace_applicator 0.49% : 0.000001s : 6: predicate.replace_old_param 0.24% : 0.000000s : 3: predicate.reset_defer_inline 0.99% : 0.000002s : 12: predicate.reshape_eliminate 0.54% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 3: predicate.row_tensor_eliminate 0.65% : 0.000001s : 6: predicate.same_eliminate 0.35% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.75% : 0.000001s : 6: predicate.shard_identity_eliminate 0.80% : 0.000001s : 6: predicate.special_op_eliminate 0.64% : 0.000001s : 6: predicate.specialize_transform 0.75% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.66% : 0.000003s : 20: predicate.switch_defer_inline 2.12% : 0.000004s : 26: predicate.switch_layer_defer_inline 6.07% : 0.000011s : 68: predicate.switch_simplify 0.97% : 0.000002s : 12: predicate.tile_eliminate 1.04% : 0.000002s : 12: predicate.transpose_eliminate 1.54% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.63% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000003s : 18: predicate.tuple_list_get_item_depend_reorder 3.05% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.75% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.56% : 0.000005s : 24: predicate.tuple_list_set_item_eliminator 1.72% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.32% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.87% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 3: predicate.value_based_eliminate 0.61% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.57% : 0.000001s : 6: predicate.virtual_output_eliminate 0.24% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.56% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001173 16 53.40% : 0.000626s : 8: func_graph_cloner_run.FuncGraphClonerGraph 46.60% : 0.000546s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.057243 196 0.01% : 0.000003s : 1: ForceFp32Comm 6.20% : 0.003552s : 1: add_attr 6.19% : 0.003542s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.09% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.16% : 0.000090s : 1: auto_monad 0.04% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000005s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.84% : 0.000483s : 1: bootstrap 0.05% : 0.000026s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000015s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.04% : 0.000024s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.05% : 0.000028s : 1: event_method 0.02% : 0.000012s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000005s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.79% : 0.000451s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.89% : 0.000508s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000014s : 1: opt.transform.mutable_eliminate 1.88% : 0.001077s : 78: opt.transform.opt_a 0.04% : 0.000024s : 1: opt.transform.opt_after_cconv 0.04% : 0.000021s : 1: opt.transform.opt_after_jit_grad 0.15% : 0.000085s : 28: opt.transform.opt_b 0.07% : 0.000040s : 2: opt.transform.opt_trans_graph 0.06% : 0.000032s : 4: opt.transform.symbol_engine_opt 5.26% : 0.003013s : 1: opt_a 0.17% : 0.000098s : 1: opt_after_cconv 0.86% : 0.000492s : 1: opt_after_jit_grad 0.33% : 0.000189s : 1: opt_b 8.95% : 0.005121s : 1: optimize 0.03% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000021s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.07% : 0.000039s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.06% : 0.000033s : 1: remove_dup_value 1.10% : 0.000628s : 1: renormalize.infer 0.84% : 0.000480s : 1: renormalize.specialize 0.01% : 0.000007s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000020s : 1: rewriter_after_opt_a 0.42% : 0.000242s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.17% : 0.000098s : 1: symbol_engine_optimizer 14.14% : 0.008097s : 1: task_emit 0.12% : 0.000070s : 1: tuple_transform 49.16% : 0.028139s : 1: type_inference 0.12% : 0.000071s : 1: validate TotalTime = 0.0415543, [24] [bootstrap]: 0.0005159 [type_inference]: 0.0244079 [event_method]: 2.105e-05 [auto_monad]: 8.386e-05 [graph_reusing]: 6.80002e-06 [inline]: 2.22001e-06 [add_attr]: 0.00329474, [1] [add_attr_with_inline]: 0.00328537, [1] [Cycle 1]: 5.36e-05, [2] [tag_attr]: 2.12e-05 [meta_addattr_fg_expand]: 5.77001e-06 [parallel-infer-symbol]: 2.78e-06 [pre_auto_parallel]: 3.283e-05 [insert-virtual-dataset]: 2.93e-06 [parallel-infer-symbol-second]: 6.79982e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.57999e-06 [optimize]: 0.0045648, [53] [py_interpret_to_execute]: 4.56002e-06 [rewriter_before_opt_a]: 0.0002294 [opt_a]: 0.00259731, [2] [Cycle 1]: 0.00204038, [45] [expand_dump_flag]: 3.43e-06 [switch_simplify]: 7.481e-05 [loop_unroll]: 3.116e-05 [a_1]: 0.00055804 [with_stream_mark]: 1.562e-05 [recompute_prepare]: 7.1e-06 [updatestate_depend_eliminate]: 4e-06 [updatestate_assign_eliminate]: 3.63e-06 [updatestate_loads_eliminate]: 2.83998e-06 [parameter_eliminate]: 1.79998e-06 [a_2]: 6.728e-05 [accelerated_algorithm]: 6.07999e-06 [shard]: 1.62001e-06 [meta_shard_fg_expand]: 2.17001e-06 [shard_inline]: 5.57999e-06 [merge_send_recv]: 8.55999e-06 [auto_parallel]: 5.32001e-06 [parallel]: 1.941e-05 [flash_sp]: 7.15e-06 [merge_comm]: 3.47002e-06 [allreduce_fusion]: 3.16999e-06 [matmul_add_comm_reduction]: 9.14e-06 [allreduce_slice_to_reducescatter]: 6.90023e-07 [virtual_shard_identity]: 7.37997e-06 [virtual_dataset]: 5.72999e-06 [get_grad_eliminate_]: 5.32999e-06 [virtual_output]: 5.47001e-06 [merge_forward]: 3.81999e-06 [cell_reuse_recompute_pass]: 1.14e-06 [offload_activation]: 9.05999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.065e-05 [merge_recompute_call_nodes]: 1.62999e-06 [before_grad]: 9.69e-06 [set_forward_comm_id_for_comm_node_pass]: 3.2e-06 [meta_fg_expand]: 2.46e-06 [flash_sp_send_recv_attached]: 2.44001e-06 [receive_attached]: 2.10002e-06 [after_resolve]: 8.82e-06 [a_after_grad]: 7.9e-06 [renormalize]: 0.00079328 [add_forward_monad_depend]: 5.07e-06 [auto_monad_grad]: 1.67999e-06 [auto_monad_eliminator]: 1.463e-05 [cse]: 3.293e-05 [a_3]: 3.92e-05 [Cycle 2]: 0.00054774, [45] [expand_dump_flag]: 9.70002e-07 [switch_simplify]: 6.46e-06 [loop_unroll]: 5.46e-06 [a_1]: 9.418e-05 [with_stream_mark]: 1.029e-05 [recompute_prepare]: 5.12e-06 [updatestate_depend_eliminate]: 2.73998e-06 [updatestate_assign_eliminate]: 2.36998e-06 [updatestate_loads_eliminate]: 2.17999e-06 [parameter_eliminate]: 9.89996e-07 [a_2]: 7.174e-05 [accelerated_algorithm]: 5.13002e-06 [shard]: 9.89996e-07 [meta_shard_fg_expand]: 1.12e-06 [shard_inline]: 5.00999e-06 [merge_send_recv]: 4.17e-06 [auto_parallel]: 5.42999e-06 [parallel]: 4.22e-06 [flash_sp]: 2.98e-06 [merge_comm]: 2.96001e-06 [allreduce_fusion]: 2.63e-06 [matmul_add_comm_reduction]: 5.03002e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 5.77001e-06 [virtual_dataset]: 5.07e-06 [get_grad_eliminate_]: 4.82998e-06 [virtual_output]: 4.84e-06 [merge_forward]: 2.54999e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 5.47999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.087e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 7.69002e-06 [set_forward_comm_id_for_comm_node_pass]: 2.94999e-06 [meta_fg_expand]: 1.49998e-06 [flash_sp_send_recv_attached]: 7.80012e-07 [receive_attached]: 8.90024e-07 [after_resolve]: 7.58001e-06 [a_after_grad]: 7.01999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.20001e-06 [auto_monad_grad]: 9.00007e-07 [auto_monad_eliminator]: 5.56e-06 [cse]: 1.475e-05 [a_3]: 2.901e-05 [py_interpret_to_execute_after_opt_a]: 3.97e-06 [slice_cell_reuse_recomputed_activation]: 2.16998e-06 [rewriter_after_opt_a]: 2.434e-05 [convert_after_rewriter]: 1.35999e-06 [order_py_execute_after_rewriter]: 1.20001e-06 [mutable_eliminate]: 0.00043633 [opt_b]: 0.00017495, [1] [Cycle 1]: 0.00016893, [7] [b_1]: 0.00010087 [b_2]: 6.59999e-06 [updatestate_depend_eliminate]: 5.21002e-06 [updatestate_assign_eliminate]: 2.49001e-06 [updatestate_loads_eliminate]: 2.27999e-06 [renormalize]: 4.19997e-07 [cse]: 1.826e-05 [optimize_parallel_all_gather_comm]: 1.619e-05 [overlap_param_gather]: 1.99999e-06 [cconv]: 2.282e-05 [loop_unroll]: 0.00039968 [opt_after_cconv]: 9.131e-05, [1] [Cycle 1]: 8.588e-05, [7] [c_1]: 2.416e-05 [parameter_eliminate]: 2.09999e-06 [updatestate_depend_eliminate]: 4.90999e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.07001e-06 [cse]: 1.789e-05 [renormalize]: 4.2998e-07 [remove_dup_value]: 1.561e-05 [tuple_transform]: 6.232e-05, [1] [Cycle 1]: 5.823e-05, [4] [d_1]: 3.391e-05 [none_parameter_eliminate]: 1.50999e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 5.71e-06 [partial_unused_args_eliminate]: 1.67999e-06 [add_recomputation]: 7.384e-05 [cse_after_recomputation]: 2.303e-05, [1] [Cycle 1]: 1.885e-05, [1] [cse]: 1.353e-05 [environ_conv]: 7.73999e-06 [swap_dp_allreduce_reducescatter]: 5.65001e-06 [bias_add_comm_swap]: 2.52001e-06 [label_micro_interleaved_index]: 4.23999e-06 [label_fine_grained_interleaved_index]: 3.29001e-06 [merge_cast_opt]: 1.35999e-06 [slice_recompute_activation]: 1.96998e-06 [micro_interleaved_order_control]: 2.22999e-06 [assign_add_opt]: 1.17e-06 [ForceFp32Comm]: 7.50006e-07 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.52001e-06 [reorder_send_recv_between_fp_bp]: 3.03e-06 [comm_op_add_attrs]: 1.34e-06 [add_comm_op_reuse_tag]: 1.20001e-06 [interleave_split_concat_branches]: 1.37e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.27e-06 [overlap_opt_shard_grad_in_pipeline]: 1.92001e-06 [control_data_broadcast_order]: 1.234e-05 [grouped_pairwise_exchange_alltoall]: 1.62999e-06 [offloading_packed_experts]: 3.44001e-06 [overlap_recompute_and_grad_model_parallel]: 4.40999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.16997e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.06e-06 [overlap_grad_ring_attention]: 4.48999e-06 [overlap_grad_flash_sp]: 1.713e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 1.99999e-06 [split_layernorm_comm]: 1.91e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 7.326e-05, [1] [Cycle 1]: 6.938e-05, [6] [build]: 8.86002e-06 [elim_shapecalc]: 8.18001e-06 [elim_not_effective]: 1.107e-05 [opt_reshape]: 5.76003e-06 [fold_const_symbol]: 8.59e-06 [renormalize]: 1.50001e-07 [detach_backward]: 1.62001e-06 [pipeline_parallel_scheduler]: 1.47001e-06 [auto_monad_reorder]: 1.824e-05 [get_jit_bprop_graph]: 1.10001e-06 [rewriter_after_jit_bprop_graph]: 3.4e-06 [opt_after_jit_grad]: 0.00043156 [validate]: 3.761e-05 [backend_pass]: 1.02998e-06 [task_emit]: 0.00791809 [execute]: 6.59001e-06 Sums bootstrap : 0.000516s : 1.38% type_inference : 0.024408s : 65.38% event_method : 0.000021s : 0.06% auto_monad : 0.000084s : 0.22% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000021s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000033s : 0.09% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000229s : 0.61% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000081s : 0.22% optimize.opt_a.loop_unroll : 0.000037s : 0.10% optimize.opt_a.a_1 : 0.000652s : 1.75% optimize.opt_a.with_stream_mark : 0.000026s : 0.07% optimize.opt_a.recompute_prepare : 0.000012s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000139s : 0.37% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.03% optimize.opt_a.merge_send_recv : 0.000013s : 0.03% optimize.opt_a.auto_parallel : 0.000011s : 0.03% optimize.opt_a.parallel : 0.000024s : 0.06% optimize.opt_a.flash_sp : 0.000010s : 0.03% optimize.opt_a.merge_comm : 0.000006s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.04% optimize.opt_a.virtual_dataset : 0.000011s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000010s : 0.03% optimize.opt_a.virtual_output : 0.000010s : 0.03% optimize.opt_a.merge_forward : 0.000006s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.01% optimize.opt_a.offload_activation : 0.000015s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000022s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000017s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000006s : 0.02% optimize.opt_a.meta_fg_expand : 0.000004s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000016s : 0.04% optimize.opt_a.a_after_grad : 0.000015s : 0.04% optimize.opt_a.renormalize : 0.000793s : 2.13% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000020s : 0.05% optimize.opt_a.cse : 0.000048s : 0.13% optimize.opt_a.a_3 : 0.000068s : 0.18% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000024s : 0.07% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000436s : 1.17% optimize.opt_b.b_1 : 0.000101s : 0.27% optimize.opt_b.b_2 : 0.000007s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000023s : 0.06% optimize.loop_unroll : 0.000400s : 1.07% optimize.opt_after_cconv.c_1 : 0.000024s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000018s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.04% optimize.tuple_transform.d_1 : 0.000034s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000074s : 0.20% optimize.cse_after_recomputation.cse : 0.000014s : 0.04% optimize.environ_conv : 0.000008s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000012s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000017s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000008s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000011s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000018s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000432s : 1.16% validate : 0.000038s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.007918s : 21.21% execute : 0.000007s : 0.02% Time group info: ------[substitution.] 0.000170 26 1.06% : 0.000002s : 2: substitution.elim_not_effective 0.79% : 0.000001s : 2: substitution.fold_const_symbol 3.08% : 0.000005s : 3: substitution.graph_param_transform 78.36% : 0.000133s : 6: substitution.inline 1.88% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.61% : 0.000004s : 4: substitution.remove_not_recompute_node 1.65% : 0.000003s : 2: substitution.replace_old_param 4.30% : 0.000007s : 1: substitution.switch_simplify 6.26% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.024348 2 95.36% : 0.023219s : 1: type_inference.infer 4.64% : 0.001129s : 1: type_inference.specialize ------[replace.] 0.000075 9 59.71% : 0.000045s : 6: replace.inline 21.63% : 0.000016s : 1: replace.switch_simplify 18.66% : 0.000014s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000146 9 89.09% : 0.000130s : 6: match.inline 4.40% : 0.000006s : 1: match.switch_simplify 6.50% : 0.000009s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000177 1092 0.98% : 0.000002s : 12: predicate.accumulaten_eliminater 0.75% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.45% : 0.000001s : 6: predicate.addn_check_dump 0.98% : 0.000002s : 12: predicate.addn_zero_filter 0.84% : 0.000001s : 12: predicate.adjust_all_reduce_mul_add 8.37% : 0.000015s : 18: predicate.arithmetic_simplify 0.97% : 0.000002s : 12: predicate.cast_eliminate 0.44% : 0.000001s : 6: predicate.check_bprop_eliminate 0.45% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.46% : 0.000001s : 6: predicate.depend_value_elim 0.95% : 0.000002s : 12: predicate.dict_get_item_const_eliminator 1.08% : 0.000002s : 12: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 12: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.29% : 0.000001s : 3: predicate.elim_not_effective 0.32% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000002s : 15: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 15: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 15: predicate.environ_get_depend_swap 1.77% : 0.000003s : 21: predicate.environ_get_eliminate 1.11% : 0.000002s : 15: predicate.environ_get_set_eliminate 1.52% : 0.000003s : 20: predicate.exchange_switch_depend_value 2.41% : 0.000004s : 20: predicate.float_depend_g_call 0.42% : 0.000001s : 6: predicate.float_environ_get_switch 0.63% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.14% : 0.000000s : 3: predicate.fold_const_symbol 0.64% : 0.000001s : 6: predicate.get_grad_eliminate 0.17% : 0.000000s : 3: predicate.graph_param_transform 0.51% : 0.000001s : 6: predicate.incorporate_call 0.41% : 0.000001s : 6: predicate.incorporate_call_switch 5.53% : 0.000010s : 50: predicate.inline 0.61% : 0.000001s : 6: predicate.inline_without_move 0.25% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.76% : 0.000001s : 6: predicate.less_batch_normalization 1.64% : 0.000003s : 20: predicate.list_to_tuple_eliminator_ 2.33% : 0.000004s : 32: predicate.load_eliminater 0.86% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.73% : 0.000005s : 37: predicate.loop_unroll_before_grad 1.60% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 6: predicate.merge_addn 0.39% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.42% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 12: predicate.minmaximum_grad 1.03% : 0.000002s : 3: predicate.mutable_eliminate 0.32% : 0.000001s : 3: predicate.opt_reshape 0.30% : 0.000001s : 3: predicate.parallel_virtual_node 1.91% : 0.000003s : 20: predicate.partial_defer_inline 1.34% : 0.000002s : 17: predicate.partial_eliminate 0.95% : 0.000002s : 12: predicate.print_const_string_wrapper 0.47% : 0.000001s : 6: predicate.reduce_all_const_elim 1.42% : 0.000003s : 12: predicate.reduce_eliminate 2.31% : 0.000004s : 32: predicate.redundant_stop_gradient_eliminater 0.29% : 0.000001s : 6: predicate.remove_not_recompute_node 1.22% : 0.000002s : 20: predicate.replace_applicator 0.34% : 0.000001s : 6: predicate.replace_old_param 0.24% : 0.000000s : 3: predicate.reset_defer_inline 0.92% : 0.000002s : 12: predicate.reshape_eliminate 0.76% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.32% : 0.000001s : 3: predicate.row_tensor_eliminate 0.70% : 0.000001s : 6: predicate.same_eliminate 0.37% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.66% : 0.000001s : 6: predicate.shard_identity_eliminate 0.61% : 0.000001s : 6: predicate.special_op_eliminate 0.56% : 0.000001s : 6: predicate.specialize_transform 0.69% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.55% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.27% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.64% : 0.000003s : 20: predicate.switch_defer_inline 2.05% : 0.000004s : 26: predicate.switch_layer_defer_inline 5.77% : 0.000010s : 68: predicate.switch_simplify 0.92% : 0.000002s : 12: predicate.tile_eliminate 0.89% : 0.000002s : 12: predicate.transpose_eliminate 1.34% : 0.000002s : 18: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000003s : 18: predicate.tuple_list_get_item_const_eliminator 1.37% : 0.000002s : 18: predicate.tuple_list_get_item_depend_reorder 2.88% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.07% : 0.000004s : 24: predicate.tuple_list_set_item_eliminator 1.60% : 0.000003s : 20: predicate.tuple_to_list_eliminator_ 2.35% : 0.000004s : 32: predicate.updatestate_pure_node_eliminater 2.79% : 0.000005s : 38: predicate.updatestate_useless_node_eliminater 0.28% : 0.000000s : 3: predicate.value_based_eliminate 0.57% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.55% : 0.000001s : 6: predicate.virtual_output_eliminate 0.23% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.36% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000905 16 58.30% : 0.000528s : 8: func_graph_cloner_run.FuncGraphClonerGraph 41.70% : 0.000377s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.051389 196 0.01% : 0.000003s : 1: ForceFp32Comm 6.42% : 0.003299s : 1: add_attr 6.40% : 0.003289s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.15% : 0.000079s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.17% : 0.000089s : 1: auto_monad 0.04% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.07% : 0.000547s : 1: bootstrap 0.05% : 0.000026s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000015s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.05% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000011s : 1: environ_conv 0.05% : 0.000026s : 1: event_method 0.02% : 0.000011s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.79% : 0.000407s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.86% : 0.000444s : 1: mutable_eliminate 0.01% : 0.000006s : 1: offloading_packed_experts 0.02% : 0.000012s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000013s : 1: opt.transform.mutable_eliminate 2.04% : 0.001049s : 78: opt.transform.opt_a 0.05% : 0.000023s : 1: opt.transform.opt_after_cconv 0.04% : 0.000020s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000080s : 28: opt.transform.opt_b 0.07% : 0.000038s : 2: opt.transform.opt_trans_graph 0.06% : 0.000030s : 4: opt.transform.symbol_engine_opt 5.06% : 0.002600s : 1: opt_a 0.18% : 0.000095s : 1: opt_after_cconv 0.86% : 0.000440s : 1: opt_after_jit_grad 0.35% : 0.000178s : 1: opt_b 8.89% : 0.004569s : 1: optimize 0.04% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000020s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000004s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000004s : 1: pipeline_split 0.07% : 0.000037s : 1: pre_auto_parallel 0.02% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000005s : 1: remove_cast_before_assign_add 0.04% : 0.000019s : 1: remove_dup_value 0.87% : 0.000447s : 1: renormalize.infer 0.66% : 0.000339s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000028s : 1: rewriter_after_opt_a 0.46% : 0.000235s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000076s : 1: symbol_engine_optimizer 15.43% : 0.007929s : 1: task_emit 0.13% : 0.000065s : 1: tuple_transform 47.52% : 0.024422s : 1: type_inference 0.13% : 0.000069s : 1: validate random_generator: generate a numpy.ndarray(shape=(3, 2), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' random_generator: generate a numpy.ndarray(shape=(5, 3, 3), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' random_generator: generate a numpy.ndarray(shape=(5, 5, 3, 2), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' random_generator: generate a numpy.ndarray(shape=(5, 5, 3, 2), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' random_generator: generate a numpy.ndarray(shape=(3, 2), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' random_generator: generate a numpy.ndarray(shape=(5, 3, 3), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' random_generator: generate a numpy.ndarray(shape=(5, 5, 3, 2), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' random_generator: generate a numpy.ndarray(shape=(5, 5, 3, 2), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' random_generator: generate a numpy.ndarray(shape=(3, 2), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' random_generator: generate a numpy.ndarray(shape=(5, 3, 3), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' random_generator: generate a numpy.ndarray(shape=(5, 5, 3, 2), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' random_generator: generate a numpy.ndarray(shape=(5, 5, 3, 2), dtype=, seed=1967515154) by numpy.random.randn, will be used as svd 'x' group_cases_17 have all been run, results of sub cases are below: case: (0, 0) {} pass. case: (0, 2) {} pass. case: (0, -1) {} pass. case: ('ge', ) {} pass. case: ('ge', ) {} pass. case: ('pynative', ) {} pass. case: ('ge', ) {} pass. case: ('kbk', ) {} pass. ops group_cases_18 with 8 cases start to running, all cases are below: case: (, 1, 0) case: (, 1, 2) case: (, 1, -1) case: (, 0) case: (, 1) case: (, 'pynative') case: (, 'pynative') case: (, 0) ops group_cases_18 total running memory: 35M, memory threshold: 51200M TotalTime = 1.99041, [24] [bootstrap]: 0.00085335 [type_inference]: 0.0486582 [event_method]: 2.063e-05 [auto_monad]: 0.00016669 [graph_reusing]: 6.69001e-06 [inline]: 2.68998e-06 [add_attr]: 0.00728653, [1] [add_attr_with_inline]: 0.00727499, [1] [Cycle 1]: 0.00012459, [2] [tag_attr]: 3.517e-05 [meta_addattr_fg_expand]: 1.522e-05 [parallel-infer-symbol]: 3.04999e-06 [pre_auto_parallel]: 5.005e-05 [insert-virtual-dataset]: 2.64001e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.89999e-06 [pipeline_split]: 1.76e-06 [optimize]: 0.0059126, [53] [py_interpret_to_execute]: 4.23999e-06 [rewriter_before_opt_a]: 0.00022809 [opt_a]: 0.00350137, [2] [Cycle 1]: 0.00273316, [45] [expand_dump_flag]: 3.83001e-06 [switch_simplify]: 7.598e-05 [loop_unroll]: 3.478e-05 [a_1]: 0.00067925 [with_stream_mark]: 2.531e-05 [recompute_prepare]: 1.002e-05 [updatestate_depend_eliminate]: 1.395e-05 [updatestate_assign_eliminate]: 5.66e-06 [updatestate_loads_eliminate]: 4.68999e-06 [parameter_eliminate]: 2.05002e-06 [a_2]: 0.00011955 [accelerated_algorithm]: 8.95999e-06 [shard]: 2.04999e-06 [meta_shard_fg_expand]: 2.30002e-06 [shard_inline]: 8.18001e-06 [merge_send_recv]: 4.243e-05 [auto_parallel]: 7.83001e-06 [parallel]: 8.183e-05 [flash_sp]: 3.242e-05 [merge_comm]: 5.84e-06 [allreduce_fusion]: 1.253e-05 [matmul_add_comm_reduction]: 1.785e-05 [allreduce_slice_to_reducescatter]: 7.66001e-06 [virtual_shard_identity]: 1.044e-05 [virtual_dataset]: 8.43999e-06 [get_grad_eliminate_]: 8.08999e-06 [virtual_output]: 8.13999e-06 [merge_forward]: 5.13002e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.922e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.265e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.371e-05 [set_forward_comm_id_for_comm_node_pass]: 1.248e-05 [meta_fg_expand]: 4.67e-06 [flash_sp_send_recv_attached]: 2.61999e-06 [receive_attached]: 1.657e-05 [after_resolve]: 1.148e-05 [a_after_grad]: 1.224e-05 [renormalize]: 0.00097384 [add_forward_monad_depend]: 5.59e-06 [auto_monad_grad]: 1.74e-06 [auto_monad_eliminator]: 3.068e-05 [cse]: 6.274e-05 [a_3]: 5.996e-05 [Cycle 2]: 0.000759, [45] [expand_dump_flag]: 1.19998e-06 [switch_simplify]: 8.97999e-06 [loop_unroll]: 7.92e-06 [a_1]: 0.00016578 [with_stream_mark]: 1.231e-05 [recompute_prepare]: 8.11002e-06 [updatestate_depend_eliminate]: 4.74998e-06 [updatestate_assign_eliminate]: 3.94002e-06 [updatestate_loads_eliminate]: 3.9e-06 [parameter_eliminate]: 1.06002e-06 [a_2]: 0.00010506 [accelerated_algorithm]: 7.87998e-06 [shard]: 1.43002e-06 [meta_shard_fg_expand]: 1.71002e-06 [shard_inline]: 7.77998e-06 [merge_send_recv]: 5.84e-06 [auto_parallel]: 6.69999e-06 [parallel]: 3.78001e-06 [flash_sp]: 3.3e-06 [merge_comm]: 4.45999e-06 [allreduce_fusion]: 4.22e-06 [matmul_add_comm_reduction]: 6.80002e-06 [allreduce_slice_to_reducescatter]: 4.60015e-07 [virtual_shard_identity]: 8.50001e-06 [virtual_dataset]: 7.63001e-06 [get_grad_eliminate_]: 8e-06 [virtual_output]: 7.31001e-06 [merge_forward]: 3.71001e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 7.51001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.378e-05 [merge_recompute_call_nodes]: 8.00006e-07 [before_grad]: 2.381e-05 [set_forward_comm_id_for_comm_node_pass]: 5.16002e-06 [meta_fg_expand]: 2.79001e-06 [flash_sp_send_recv_attached]: 9.60019e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 1.09e-05 [a_after_grad]: 1.131e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.39e-06 [auto_monad_grad]: 7.99977e-07 [auto_monad_eliminator]: 1.009e-05 [cse]: 1.736e-05 [a_3]: 4.822e-05 [py_interpret_to_execute_after_opt_a]: 4.39002e-06 [slice_cell_reuse_recomputed_activation]: 1.82001e-06 [rewriter_after_opt_a]: 3.151e-05 [convert_after_rewriter]: 1.31002e-06 [order_py_execute_after_rewriter]: 1.17e-06 [mutable_eliminate]: 0.00052623 [opt_b]: 0.00026057, [1] [Cycle 1]: 0.00025486, [7] [b_1]: 0.00017421 [b_2]: 1.013e-05 [updatestate_depend_eliminate]: 7.1e-06 [updatestate_assign_eliminate]: 4.03999e-06 [updatestate_loads_eliminate]: 3.76001e-06 [renormalize]: 4.89992e-07 [cse]: 2.14e-05 [optimize_parallel_all_gather_comm]: 2.8e-05 [overlap_param_gather]: 1.055e-05 [cconv]: 2.403e-05 [loop_unroll]: 0.00042169 [opt_after_cconv]: 0.00012299, [1] [Cycle 1]: 0.00011758, [7] [c_1]: 4.553e-05 [parameter_eliminate]: 2.32001e-06 [updatestate_depend_eliminate]: 7.2e-06 [updatestate_assign_eliminate]: 4.16001e-06 [updatestate_loads_eliminate]: 4.01001e-06 [cse]: 2.108e-05 [renormalize]: 4.2998e-07 [remove_dup_value]: 1.779e-05 [tuple_transform]: 7.938e-05, [1] [Cycle 1]: 7.475e-05, [4] [d_1]: 4.737e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 8.69998e-06 [partial_unused_args_eliminate]: 1.72001e-06 [add_recomputation]: 7.396e-05 [cse_after_recomputation]: 2.562e-05, [1] [Cycle 1]: 2.1e-05, [1] [cse]: 1.576e-05 [environ_conv]: 1.713e-05 [swap_dp_allreduce_reducescatter]: 2.348e-05 [bias_add_comm_swap]: 1.024e-05 [label_micro_interleaved_index]: 1.161e-05 [label_fine_grained_interleaved_index]: 2.54999e-06 [merge_cast_opt]: 1.57999e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.59001e-06 [assign_add_opt]: 1.20001e-06 [ForceFp32Comm]: 1.00001e-06 [remove_cast_before_assign_add]: 8.85999e-06 [full_micro_interleaved_order_control]: 9.37999e-06 [reorder_send_recv_between_fp_bp]: 2.53e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 8.37e-06 [overlap_opt_shard_in_pipeline]: 1.403e-05 [overlap_opt_shard_grad_in_pipeline]: 2.12999e-06 [control_data_broadcast_order]: 1.592e-05 [grouped_pairwise_exchange_alltoall]: 1.49e-06 [offloading_packed_experts]: 4.84e-06 [overlap_recompute_and_grad_model_parallel]: 1.238e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.59e-06 [overlap_recompute_allgather_and_fa_grad]: 1.55001e-06 [overlap_recompute_comm]: 2.44999e-06 [overlap_grad_ring_attention]: 1.927e-05 [overlap_grad_flash_sp]: 4.464e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 9.63002e-06 [split_layernorm_comm]: 1.94e-06 [handle_group_info]: 1.05999e-06 [symbol_engine_optimizer]: 8.539e-05, [1] [Cycle 1]: 8.103e-05, [6] [build]: 4.17e-06 [elim_shapecalc]: 1.201e-05 [elim_not_effective]: 1.54e-05 [opt_reshape]: 8.77e-06 [fold_const_symbol]: 1.343e-05 [renormalize]: 2.40019e-07 [detach_backward]: 1.52999e-06 [pipeline_parallel_scheduler]: 1.45001e-06 [auto_monad_reorder]: 4.285e-05 [get_jit_bprop_graph]: 1.08001e-06 [rewriter_after_jit_bprop_graph]: 2.94999e-06 [opt_after_jit_grad]: 0.00047913 [validate]: 6.018e-05 [backend_pass]: 9.10019e-07 [task_emit]: 1.92659 [execute]: 8.64998e-06 Sums bootstrap : 0.000853s : 0.04% type_inference : 0.048658s : 2.45% event_method : 0.000021s : 0.00% auto_monad : 0.000167s : 0.01% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000035s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000015s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000050s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000228s : 0.01% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000085s : 0.00% optimize.opt_a.loop_unroll : 0.000043s : 0.00% optimize.opt_a.a_1 : 0.000845s : 0.04% optimize.opt_a.with_stream_mark : 0.000038s : 0.00% optimize.opt_a.recompute_prepare : 0.000018s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000225s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.00% optimize.opt_a.merge_send_recv : 0.000048s : 0.00% optimize.opt_a.auto_parallel : 0.000015s : 0.00% optimize.opt_a.parallel : 0.000086s : 0.00% optimize.opt_a.flash_sp : 0.000036s : 0.00% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000017s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000008s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000019s : 0.00% optimize.opt_a.virtual_dataset : 0.000016s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.00% optimize.opt_a.virtual_output : 0.000015s : 0.00% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000027s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000036s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000038s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000018s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000018s : 0.00% optimize.opt_a.after_resolve : 0.000022s : 0.00% optimize.opt_a.a_after_grad : 0.000024s : 0.00% optimize.opt_a.renormalize : 0.000974s : 0.05% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000041s : 0.00% optimize.opt_a.cse : 0.000080s : 0.00% optimize.opt_a.a_3 : 0.000108s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000032s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000526s : 0.03% optimize.opt_b.b_1 : 0.000174s : 0.01% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.00% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000024s : 0.00% optimize.loop_unroll : 0.000422s : 0.02% optimize.opt_after_cconv.c_1 : 0.000046s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.00% optimize.tuple_transform.d_1 : 0.000047s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000074s : 0.00% optimize.cse_after_recomputation.cse : 0.000016s : 0.00% optimize.environ_conv : 0.000017s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000023s : 0.00% optimize.bias_add_comm_swap : 0.000010s : 0.00% optimize.label_micro_interleaved_index : 0.000012s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000009s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000014s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000016s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.00% optimize.overlap_grad_flash_sp : 0.000045s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000043s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000479s : 0.02% validate : 0.000060s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 1.926593s : 97.20% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000242 54 2.70% : 0.000007s : 2: substitution.depend_value_elim 0.96% : 0.000002s : 4: substitution.elim_not_effective 0.87% : 0.000002s : 4: substitution.fold_const_symbol 2.52% : 0.000006s : 5: substitution.graph_param_transform 65.13% : 0.000158s : 5: substitution.inline 1.98% : 0.000005s : 8: substitution.j_node_and_user_rematch 5.52% : 0.000013s : 8: substitution.remove_not_recompute_node 1.36% : 0.000003s : 2: substitution.replace_old_param 6.03% : 0.000015s : 2: substitution.tuple_list_get_item_eliminator 4.77% : 0.000012s : 6: substitution.updatestate_pure_node_eliminater 8.15% : 0.000020s : 8: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.048575 2 97.43% : 0.047327s : 1: type_inference.infer 2.57% : 0.001248s : 1: type_inference.specialize ------[replace.] 0.000064 7 74.19% : 0.000048s : 5: replace.inline 25.81% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000168 7 91.96% : 0.000155s : 5: match.inline 8.04% : 0.000014s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000237 1539 0.94% : 0.000002s : 16: predicate.accumulaten_eliminater 0.73% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.58% : 0.000001s : 10: predicate.addn_check_dump 0.97% : 0.000002s : 16: predicate.addn_zero_filter 0.88% : 0.000002s : 16: predicate.adjust_all_reduce_mul_add 2.25% : 0.000005s : 26: predicate.arithmetic_simplify 0.97% : 0.000002s : 16: predicate.cast_eliminate 0.75% : 0.000002s : 10: predicate.check_bprop_eliminate 0.59% : 0.000001s : 10: predicate.compare_switch_simplify 0.18% : 0.000000s : 5: predicate.const_output_eliminate 0.67% : 0.000002s : 10: predicate.depend_value_elim 0.95% : 0.000002s : 16: predicate.dict_get_item_const_eliminator 1.16% : 0.000003s : 16: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 16: predicate.dict_set_item_eliminator 0.83% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.35% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000003s : 21: predicate.environ_add_const_eliminate 1.20% : 0.000003s : 21: predicate.environ_get_add_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_depend_swap 1.80% : 0.000004s : 31: predicate.environ_get_eliminate 1.13% : 0.000003s : 21: predicate.environ_get_set_eliminate 1.34% : 0.000003s : 23: predicate.exchange_switch_depend_value 2.13% : 0.000005s : 23: predicate.float_depend_g_call 0.57% : 0.000001s : 10: predicate.float_environ_get_switch 0.85% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 5: predicate.fold_const_symbol 0.76% : 0.000002s : 10: predicate.get_grad_eliminate 0.21% : 0.000001s : 5: predicate.graph_param_transform 0.62% : 0.000001s : 10: predicate.incorporate_call 0.52% : 0.000001s : 10: predicate.incorporate_call_switch 5.58% : 0.000013s : 69: predicate.inline 0.76% : 0.000002s : 10: predicate.inline_without_move 0.29% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.82% : 0.000002s : 10: predicate.less_batch_normalization 1.77% : 0.000004s : 28: predicate.list_to_tuple_eliminator_ 2.52% : 0.000006s : 44: predicate.load_eliminater 0.86% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.44% : 0.000006s : 43: predicate.loop_unroll_before_grad 1.63% : 0.000004s : 26: predicate.make_slice_get_slice_eliminator 0.66% : 0.000002s : 10: predicate.merge_addn 0.59% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.58% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 16: predicate.minmaximum_grad 0.94% : 0.000002s : 5: predicate.mutable_eliminate 0.40% : 0.000001s : 5: predicate.opt_reshape 0.36% : 0.000001s : 5: predicate.parallel_virtual_node 1.67% : 0.000004s : 23: predicate.partial_defer_inline 1.46% : 0.000003s : 23: predicate.partial_eliminate 0.99% : 0.000002s : 16: predicate.print_const_string_wrapper 0.63% : 0.000001s : 10: predicate.reduce_all_const_elim 1.25% : 0.000003s : 16: predicate.reduce_eliminate 2.50% : 0.000006s : 44: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 10: predicate.remove_not_recompute_node 1.25% : 0.000003s : 28: predicate.replace_applicator 0.42% : 0.000001s : 10: predicate.replace_old_param 0.23% : 0.000001s : 5: predicate.reset_defer_inline 1.01% : 0.000002s : 16: predicate.reshape_eliminate 0.72% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.78% : 0.000002s : 10: predicate.same_eliminate 0.40% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.84% : 0.000002s : 10: predicate.shard_identity_eliminate 0.93% : 0.000002s : 10: predicate.special_op_eliminate 0.82% : 0.000002s : 10: predicate.specialize_transform 0.82% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.80% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.45% : 0.000003s : 23: predicate.switch_defer_inline 2.05% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.16% : 0.000012s : 81: predicate.switch_simplify 0.99% : 0.000002s : 16: predicate.tile_eliminate 0.99% : 0.000002s : 16: predicate.transpose_eliminate 1.62% : 0.000004s : 26: predicate.tuple_list_convert_item_index_to_positive 1.70% : 0.000004s : 26: predicate.tuple_list_get_item_const_eliminator 1.61% : 0.000004s : 26: predicate.tuple_list_get_item_depend_reorder 3.06% : 0.000007s : 38: predicate.tuple_list_get_item_eliminator 1.49% : 0.000004s : 26: predicate.tuple_list_get_set_item_eliminator 2.49% : 0.000006s : 36: predicate.tuple_list_set_item_eliminator 1.78% : 0.000004s : 28: predicate.tuple_to_list_eliminator_ 2.54% : 0.000006s : 44: predicate.updatestate_pure_node_eliminater 3.30% : 0.000008s : 54: predicate.updatestate_useless_node_eliminater 0.37% : 0.000001s : 5: predicate.value_based_eliminate 0.68% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.66% : 0.000002s : 10: predicate.virtual_output_eliminate 0.28% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000718 12 45.02% : 0.000323s : 5: func_graph_cloner_run.FuncGraphClonerGraph 54.98% : 0.000395s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.006300 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.36% : 0.007291s : 1: add_attr 0.36% : 0.007279s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000078s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000173s : 1: auto_monad 0.00% : 0.000047s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000013s : 1: bias_add_comm_swap 0.04% : 0.000900s : 1: bootstrap 0.00% : 0.000028s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000019s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000021s : 1: environ_conv 0.00% : 0.000027s : 1: event_method 0.00% : 0.000017s : 1: execute 0.00% : 0.000012s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000014s : 1: label_micro_interleaved_index 0.02% : 0.000430s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.03% : 0.000534s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000017s : 1: opt.transform.mutable_eliminate 0.07% : 0.001458s : 78: opt.transform.opt_a 0.00% : 0.000044s : 1: opt.transform.opt_after_cconv 0.00% : 0.000035s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000157s : 28: opt.transform.opt_b 0.00% : 0.000054s : 2: opt.transform.opt_trans_graph 0.00% : 0.000046s : 4: opt.transform.symbol_engine_opt 0.17% : 0.003504s : 1: opt_a 0.01% : 0.000126s : 1: opt_after_cconv 0.02% : 0.000487s : 1: opt_after_jit_grad 0.01% : 0.000264s : 1: opt_b 0.29% : 0.005916s : 1: optimize 0.00% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000048s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000023s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000017s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000054s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000021s : 1: remove_dup_value 0.02% : 0.000499s : 1: renormalize.infer 0.02% : 0.000466s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000035s : 1: rewriter_after_opt_a 0.01% : 0.000233s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000012s : 1: split_matmul_comm_elemetwise 0.00% : 0.000027s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000088s : 1: symbol_engine_optimizer 96.03% : 1.926619s : 1: task_emit 0.00% : 0.000082s : 1: tuple_transform 2.43% : 0.048676s : 1: type_inference 0.00% : 0.000090s : 1: validate TotalTime = 2.37544, [24] [bootstrap]: 0.00085341 [type_inference]: 0.0550743 [event_method]: 6.387e-05 [auto_monad]: 0.00010321 [graph_reusing]: 4.42998e-06 [inline]: 1.66e-06 [add_attr]: 0.00681334, [1] [add_attr_with_inline]: 0.00679396, [1] [Cycle 1]: 7.795e-05, [2] [tag_attr]: 2.475e-05 [meta_addattr_fg_expand]: 8.94e-06 [parallel-infer-symbol]: 1.69e-06 [pre_auto_parallel]: 3.613e-05 [insert-virtual-dataset]: 1.09e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 1.02e-06 [pipeline_split]: 9.20001e-07 [optimize]: 0.00565454, [53] [py_interpret_to_execute]: 3.27002e-06 [rewriter_before_opt_a]: 0.00022093 [opt_a]: 0.00333578, [2] [Cycle 1]: 0.00255105, [45] [expand_dump_flag]: 1.87999e-06 [switch_simplify]: 5.602e-05 [loop_unroll]: 3.365e-05 [a_1]: 0.00063615 [with_stream_mark]: 1.009e-05 [recompute_prepare]: 1.008e-05 [updatestate_depend_eliminate]: 7.94002e-06 [updatestate_assign_eliminate]: 5.94e-06 [updatestate_loads_eliminate]: 2.58e-06 [parameter_eliminate]: 1.05001e-06 [a_2]: 0.00010822 [accelerated_algorithm]: 8.90001e-06 [shard]: 1.14e-06 [meta_shard_fg_expand]: 1.71e-06 [shard_inline]: 8.17e-06 [merge_send_recv]: 2.166e-05 [auto_parallel]: 6.29999e-06 [parallel]: 4.693e-05 [flash_sp]: 1.78e-05 [merge_comm]: 3.96001e-06 [allreduce_fusion]: 7.93001e-06 [matmul_add_comm_reduction]: 8.48999e-06 [allreduce_slice_to_reducescatter]: 3.4e-06 [virtual_shard_identity]: 1.062e-05 [virtual_dataset]: 8.62e-06 [get_grad_eliminate_]: 8.03999e-06 [virtual_output]: 8.17e-06 [merge_forward]: 3.55998e-06 [cell_reuse_recompute_pass]: 1.02e-06 [offload_activation]: 1.04e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.848e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 1.16e-05 [set_forward_comm_id_for_comm_node_pass]: 7.41999e-06 [meta_fg_expand]: 3.21001e-06 [flash_sp_send_recv_attached]: 1.55999e-06 [receive_attached]: 7.89002e-06 [after_resolve]: 1.127e-05 [a_after_grad]: 1.287e-05 [renormalize]: 0.00104431 [add_forward_monad_depend]: 4.43001e-06 [auto_monad_grad]: 1.49998e-06 [auto_monad_eliminator]: 1.886e-05 [cse]: 5.489e-05 [a_3]: 5.984e-05 [Cycle 2]: 0.00077481, [45] [expand_dump_flag]: 1.14e-06 [switch_simplify]: 9.49999e-06 [loop_unroll]: 8.23999e-06 [a_1]: 0.00020101 [with_stream_mark]: 1.109e-05 [recompute_prepare]: 8.03001e-06 [updatestate_depend_eliminate]: 3.14999e-06 [updatestate_assign_eliminate]: 2.63998e-06 [updatestate_loads_eliminate]: 2.86999e-06 [parameter_eliminate]: 1.05001e-06 [a_2]: 9.912e-05 [accelerated_algorithm]: 7.93999e-06 [shard]: 1.24998e-06 [meta_shard_fg_expand]: 1.47001e-06 [shard_inline]: 7.87e-06 [merge_send_recv]: 4.73001e-06 [auto_parallel]: 5.25001e-06 [parallel]: 4.3e-06 [flash_sp]: 2.27999e-06 [merge_comm]: 3.14999e-06 [allreduce_fusion]: 2.78e-06 [matmul_add_comm_reduction]: 5.54e-06 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 8.89003e-06 [virtual_dataset]: 7.7e-06 [get_grad_eliminate_]: 7.92e-06 [virtual_output]: 7.71001e-06 [merge_forward]: 2.99001e-06 [cell_reuse_recompute_pass]: 1.64e-06 [offload_activation]: 6.34999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.475e-05 [merge_recompute_call_nodes]: 7.10017e-07 [before_grad]: 1.072e-05 [set_forward_comm_id_for_comm_node_pass]: 3.28998e-06 [meta_fg_expand]: 2.19999e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 1.04998e-06 [after_resolve]: 1.047e-05 [a_after_grad]: 1.253e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 8.2e-07 [auto_monad_eliminator]: 6.43e-06 [cse]: 1.855e-05 [a_3]: 4.855e-05 [py_interpret_to_execute_after_opt_a]: 5.37001e-06 [slice_cell_reuse_recomputed_activation]: 1.20001e-06 [rewriter_after_opt_a]: 2.235e-05 [convert_after_rewriter]: 1.09003e-06 [order_py_execute_after_rewriter]: 1.20999e-06 [mutable_eliminate]: 0.00054038 [opt_b]: 0.0002488, [1] [Cycle 1]: 0.00024308, [7] [b_1]: 0.00016408 [b_2]: 9.71e-06 [updatestate_depend_eliminate]: 5.89e-06 [updatestate_assign_eliminate]: 2.71e-06 [updatestate_loads_eliminate]: 2.53e-06 [renormalize]: 5.09986e-07 [cse]: 2.415e-05 [optimize_parallel_all_gather_comm]: 1.937e-05 [overlap_param_gather]: 6.14001e-06 [cconv]: 1.89e-05 [loop_unroll]: 0.00045509 [opt_after_cconv]: 0.00011961, [1] [Cycle 1]: 0.00011407, [7] [c_1]: 4.282e-05 [parameter_eliminate]: 2.80002e-06 [updatestate_depend_eliminate]: 5.64998e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.63e-06 [cse]: 2.405e-05 [renormalize]: 5.09986e-07 [remove_dup_value]: 2.921e-05 [tuple_transform]: 8.411e-05, [1] [Cycle 1]: 8.01e-05, [4] [d_1]: 5.282e-05 [none_parameter_eliminate]: 1.04e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 8.48999e-06 [partial_unused_args_eliminate]: 1.15999e-06 [add_recomputation]: 4.549e-05 [cse_after_recomputation]: 2.384e-05, [1] [Cycle 1]: 1.962e-05, [1] [cse]: 1.418e-05 [environ_conv]: 1.97e-05 [swap_dp_allreduce_reducescatter]: 1.43e-05 [bias_add_comm_swap]: 5.92001e-06 [label_micro_interleaved_index]: 8.52998e-06 [label_fine_grained_interleaved_index]: 1.42e-06 [merge_cast_opt]: 8.80013e-07 [slice_recompute_activation]: 1.15999e-06 [micro_interleaved_order_control]: 1.97001e-06 [assign_add_opt]: 8.2e-07 [ForceFp32Comm]: 6.89994e-07 [remove_cast_before_assign_add]: 4.67998e-06 [full_micro_interleaved_order_control]: 5.76998e-06 [reorder_send_recv_between_fp_bp]: 1.59e-06 [comm_op_add_attrs]: 6.89994e-07 [add_comm_op_reuse_tag]: 6.10016e-07 [interleave_split_concat_branches]: 9.50007e-07 [interleave_parallel_branches]: 5.42001e-06 [overlap_opt_shard_in_pipeline]: 1.482e-05 [overlap_opt_shard_grad_in_pipeline]: 1.27e-06 [control_data_broadcast_order]: 1.1e-05 [grouped_pairwise_exchange_alltoall]: 9.20001e-07 [offloading_packed_experts]: 3.31999e-06 [overlap_recompute_and_grad_model_parallel]: 9.05999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 8.00006e-07 [overlap_recompute_comm]: 1.71e-06 [overlap_grad_ring_attention]: 1.202e-05 [overlap_grad_flash_sp]: 2.96e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 5.84e-06 [split_layernorm_comm]: 1.35999e-06 [handle_group_info]: 7.50006e-07 [symbol_engine_optimizer]: 7.963e-05, [1] [Cycle 1]: 7.522e-05, [6] [build]: 2.21998e-06 [elim_shapecalc]: 1.151e-05 [elim_not_effective]: 1.393e-05 [opt_reshape]: 8.86997e-06 [fold_const_symbol]: 1.07e-05 [renormalize]: 3.00002e-07 [detach_backward]: 1.25999e-06 [pipeline_parallel_scheduler]: 1.32999e-06 [auto_monad_reorder]: 1.738e-05 [get_jit_bprop_graph]: 1.54e-06 [rewriter_after_jit_bprop_graph]: 2.96001e-06 [opt_after_jit_grad]: 0.00048225 [validate]: 7.287e-05 [backend_pass]: 6.59988e-07 [task_emit]: 2.30595 [execute]: 1.178e-05 Sums bootstrap : 0.000853s : 0.04% type_inference : 0.055074s : 2.33% event_method : 0.000064s : 0.00% auto_monad : 0.000103s : 0.00% graph_reusing : 0.000004s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000009s : 0.00% parallel-infer-symbol : 0.000002s : 0.00% pre_auto_parallel : 0.000036s : 0.00% insert-virtual-dataset : 0.000001s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000001s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000003s : 0.00% optimize.rewriter_before_opt_a : 0.000221s : 0.01% optimize.opt_a.expand_dump_flag : 0.000003s : 0.00% optimize.opt_a.switch_simplify : 0.000066s : 0.00% optimize.opt_a.loop_unroll : 0.000042s : 0.00% optimize.opt_a.a_1 : 0.000837s : 0.04% optimize.opt_a.with_stream_mark : 0.000021s : 0.00% optimize.opt_a.recompute_prepare : 0.000018s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000011s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000207s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.00% optimize.opt_a.shard : 0.000002s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.00% optimize.opt_a.merge_send_recv : 0.000026s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000051s : 0.00% optimize.opt_a.flash_sp : 0.000020s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000011s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.00% optimize.opt_a.virtual_dataset : 0.000016s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.00% optimize.opt_a.virtual_output : 0.000016s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000017s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000033s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000001s : 0.00% optimize.opt_a.before_grad : 0.000022s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000002s : 0.00% optimize.opt_a.receive_attached : 0.000009s : 0.00% optimize.opt_a.after_resolve : 0.000022s : 0.00% optimize.opt_a.a_after_grad : 0.000025s : 0.00% optimize.opt_a.renormalize : 0.001044s : 0.04% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000025s : 0.00% optimize.opt_a.cse : 0.000073s : 0.00% optimize.opt_a.a_3 : 0.000108s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000001s : 0.00% optimize.rewriter_after_opt_a : 0.000022s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000540s : 0.02% optimize.opt_b.b_1 : 0.000164s : 0.01% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.00% optimize.overlap_param_gather : 0.000006s : 0.00% optimize.cconv : 0.000019s : 0.00% optimize.loop_unroll : 0.000455s : 0.02% optimize.opt_after_cconv.c_1 : 0.000043s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.00% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000029s : 0.00% optimize.tuple_transform.d_1 : 0.000053s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000045s : 0.00% optimize.cse_after_recomputation.cse : 0.000014s : 0.00% optimize.environ_conv : 0.000020s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000014s : 0.00% optimize.bias_add_comm_swap : 0.000006s : 0.00% optimize.label_micro_interleaved_index : 0.000009s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000001s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000001s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000005s : 0.00% optimize.full_micro_interleaved_order_control : 0.000006s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000005s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000015s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000001s : 0.00% optimize.control_data_broadcast_order : 0.000011s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000009s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000012s : 0.00% optimize.overlap_grad_flash_sp : 0.000030s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000017s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000482s : 0.02% validate : 0.000073s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.305947s : 97.40% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 0.000152 27 1.09% : 0.000002s : 2: substitution.elim_not_effective 0.79% : 0.000001s : 2: substitution.fold_const_symbol 3.40% : 0.000005s : 6: substitution.graph_param_transform 80.17% : 0.000122s : 5: substitution.inline 1.93% : 0.000003s : 4: substitution.j_node_and_user_rematch 5.95% : 0.000009s : 4: substitution.remove_not_recompute_node 1.75% : 0.000003s : 2: substitution.replace_old_param 4.92% : 0.000007s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.055012 2 97.60% : 0.053692s : 1: type_inference.infer 2.40% : 0.001320s : 1: type_inference.specialize ------[replace.] 0.000066 7 71.67% : 0.000047s : 5: replace.inline 28.33% : 0.000019s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000125 7 95.04% : 0.000119s : 5: match.inline 4.96% : 0.000006s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000235 1917 0.92% : 0.000002s : 19: predicate.accumulaten_eliminater 0.74% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 14: predicate.addn_check_dump 0.95% : 0.000002s : 19: predicate.addn_zero_filter 0.86% : 0.000002s : 19: predicate.adjust_all_reduce_mul_add 2.07% : 0.000005s : 33: predicate.arithmetic_simplify 1.00% : 0.000002s : 19: predicate.cast_eliminate 0.68% : 0.000002s : 14: predicate.check_bprop_eliminate 0.63% : 0.000001s : 14: predicate.compare_switch_simplify 0.29% : 0.000001s : 7: predicate.const_output_eliminate 0.61% : 0.000001s : 14: predicate.depend_value_elim 0.97% : 0.000002s : 19: predicate.dict_get_item_const_eliminator 1.12% : 0.000003s : 19: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 19: predicate.dict_set_item_eliminator 1.07% : 0.000002s : 13: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 6: predicate.elim_not_effective 0.41% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.15% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.15% : 0.000003s : 26: predicate.environ_get_depend_swap 1.99% : 0.000005s : 40: predicate.environ_get_eliminate 1.14% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.29% : 0.000003s : 26: predicate.exchange_switch_depend_value 1.87% : 0.000004s : 26: predicate.float_depend_g_call 0.63% : 0.000001s : 14: predicate.float_environ_get_switch 0.97% : 0.000002s : 21: predicate.float_tuple_getitem_switch 0.23% : 0.000001s : 6: predicate.fold_const_symbol 0.73% : 0.000002s : 14: predicate.get_grad_eliminate 0.25% : 0.000001s : 6: predicate.graph_param_transform 0.65% : 0.000002s : 14: predicate.incorporate_call 0.57% : 0.000001s : 14: predicate.incorporate_call_switch 5.57% : 0.000013s : 87: predicate.inline 0.81% : 0.000002s : 14: predicate.inline_without_move 0.43% : 0.000001s : 14: predicate.j_node_and_user_rematch 0.92% : 0.000002s : 14: predicate.less_batch_normalization 1.75% : 0.000004s : 34: predicate.list_to_tuple_eliminator_ 2.50% : 0.000006s : 54: predicate.load_eliminater 0.94% : 0.000002s : 7: predicate.loop_unroll_after_grad 2.28% : 0.000005s : 42: predicate.loop_unroll_before_grad 1.71% : 0.000004s : 33: predicate.make_slice_get_slice_eliminator 0.68% : 0.000002s : 14: predicate.merge_addn 0.63% : 0.000001s : 14: predicate.micro_step_allgather_replace 0.66% : 0.000002s : 14: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 19: predicate.minmaximum_grad 0.99% : 0.000002s : 7: predicate.mutable_eliminate 0.41% : 0.000001s : 6: predicate.opt_reshape 0.35% : 0.000001s : 7: predicate.parallel_virtual_node 1.58% : 0.000004s : 26: predicate.partial_defer_inline 1.45% : 0.000003s : 28: predicate.partial_eliminate 0.94% : 0.000002s : 19: predicate.print_const_string_wrapper 0.65% : 0.000002s : 14: predicate.reduce_all_const_elim 1.15% : 0.000003s : 19: predicate.reduce_eliminate 2.53% : 0.000006s : 54: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000001s : 14: predicate.remove_not_recompute_node 1.67% : 0.000004s : 35: predicate.replace_applicator 0.55% : 0.000001s : 14: predicate.replace_old_param 0.35% : 0.000001s : 7: predicate.reset_defer_inline 0.90% : 0.000002s : 19: predicate.reshape_eliminate 0.73% : 0.000002s : 14: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 7: predicate.row_tensor_eliminate 0.78% : 0.000002s : 14: predicate.same_eliminate 0.58% : 0.000001s : 14: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 14: predicate.shard_identity_eliminate 0.68% : 0.000002s : 13: predicate.special_op_eliminate 0.78% : 0.000002s : 14: predicate.specialize_transform 0.82% : 0.000002s : 14: predicate.split_environ_get_set_with_tuple_value 0.91% : 0.000002s : 14: predicate.stack_unstack_eliminate 0.36% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.43% : 0.000003s : 26: predicate.switch_defer_inline 2.05% : 0.000005s : 40: predicate.switch_layer_defer_inline 4.85% : 0.000011s : 88: predicate.switch_simplify 0.90% : 0.000002s : 19: predicate.tile_eliminate 0.89% : 0.000002s : 19: predicate.transpose_eliminate 1.59% : 0.000004s : 32: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000004s : 32: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000004s : 32: predicate.tuple_list_get_item_depend_reorder 2.80% : 0.000007s : 48: predicate.tuple_list_get_item_eliminator 1.58% : 0.000004s : 32: predicate.tuple_list_get_set_item_eliminator 2.37% : 0.000006s : 46: predicate.tuple_list_set_item_eliminator 1.71% : 0.000004s : 34: predicate.tuple_to_list_eliminator_ 2.37% : 0.000006s : 54: predicate.updatestate_pure_node_eliminater 3.17% : 0.000007s : 68: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 7: predicate.value_based_eliminate 0.79% : 0.000002s : 14: predicate.virtual_dataset_eliminate 0.74% : 0.000002s : 14: predicate.virtual_output_eliminate 0.30% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000905 14 50.65% : 0.000459s : 7: func_graph_cloner_run.FuncGraphClonerGraph 49.35% : 0.000447s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.390566 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.29% : 0.006818s : 1: add_attr 0.28% : 0.006797s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000050s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000111s : 1: auto_monad 0.00% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000009s : 1: bias_add_comm_swap 0.04% : 0.000900s : 1: bootstrap 0.00% : 0.000023s : 1: cconv 0.00% : 0.000003s : 1: comm_op_add_attrs 0.00% : 0.000014s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000004s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000023s : 1: environ_conv 0.00% : 0.000071s : 1: event_method 0.00% : 0.000025s : 1: execute 0.00% : 0.000009s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000008s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000004s : 1: insert-virtual-dataset 0.00% : 0.000009s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000004s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.02% : 0.000463s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000549s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000017s : 1: opt.transform.mutable_eliminate 0.06% : 0.001402s : 78: opt.transform.opt_a 0.00% : 0.000042s : 1: opt.transform.opt_after_cconv 0.00% : 0.000030s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000148s : 28: opt.transform.opt_b 0.00% : 0.000059s : 2: opt.transform.opt_trans_graph 0.00% : 0.000041s : 4: opt.transform.symbol_engine_opt 0.14% : 0.003339s : 1: opt_a 0.01% : 0.000123s : 1: opt_after_cconv 0.02% : 0.000491s : 1: opt_after_jit_grad 0.01% : 0.000252s : 1: opt_b 0.24% : 0.005659s : 1: optimize 0.00% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000033s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000015s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000018s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000012s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000005s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000040s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.00% : 0.000033s : 1: remove_dup_value 0.02% : 0.000560s : 1: renormalize.infer 0.02% : 0.000477s : 1: renormalize.specialize 0.00% : 0.000004s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000026s : 1: rewriter_after_opt_a 0.01% : 0.000226s : 1: rewriter_before_opt_a 0.00% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000004s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000009s : 1: split_matmul_comm_elemetwise 0.00% : 0.000018s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000082s : 1: symbol_engine_optimizer 96.46% : 2.305984s : 1: task_emit 0.00% : 0.000087s : 1: tuple_transform 2.30% : 0.055089s : 1: type_inference 0.00% : 0.000102s : 1: validate TotalTime = 0.136841, [24] [bootstrap]: 0.00075077 [type_inference]: 0.0669746 [event_method]: 0.00035721 [auto_monad]: 0.00031479 [graph_reusing]: 1.099e-05 [inline]: 2.61e-06 [add_attr]: 0.0039905, [1] [add_attr_with_inline]: 0.00397875, [1] [Cycle 1]: 8.56e-05, [2] [tag_attr]: 4.614e-05 [meta_addattr_fg_expand]: 1.236e-05 [parallel-infer-symbol]: 3.46001e-06 [pre_auto_parallel]: 6.104e-05 [insert-virtual-dataset]: 2.76999e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 2.19001e-06 [pipeline_split]: 1.57001e-06 [optimize]: 0.053843, [53] [py_interpret_to_execute]: 4.84e-06 [rewriter_before_opt_a]: 0.00034611 [opt_a]: 0.0513033, [4] [Cycle 1]: 0.0431416, [45] [expand_dump_flag]: 4.97e-06 [switch_simplify]: 0.00019082 [loop_unroll]: 0.00010061 [a_1]: 0.00206302 [with_stream_mark]: 2.675e-05 [recompute_prepare]: 3.21e-05 [updatestate_depend_eliminate]: 1.167e-05 [updatestate_assign_eliminate]: 1.061e-05 [updatestate_loads_eliminate]: 1.011e-05 [parameter_eliminate]: 3.16999e-06 [a_2]: 0.00033258 [accelerated_algorithm]: 2.521e-05 [shard]: 2.16e-06 [meta_shard_fg_expand]: 5.04e-06 [shard_inline]: 1.861e-05 [merge_send_recv]: 1.972e-05 [auto_parallel]: 1.286e-05 [parallel]: 2.78e-05 [flash_sp]: 1.069e-05 [merge_comm]: 1.185e-05 [allreduce_fusion]: 1.139e-05 [matmul_add_comm_reduction]: 2.979e-05 [allreduce_slice_to_reducescatter]: 7.29982e-07 [virtual_shard_identity]: 2.513e-05 [virtual_dataset]: 2.342e-05 [get_grad_eliminate_]: 2.245e-05 [virtual_output]: 2.123e-05 [merge_forward]: 1.159e-05 [cell_reuse_recompute_pass]: 1.23002e-06 [offload_activation]: 2.012e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.361e-05 [merge_recompute_call_nodes]: 1.83002e-06 [before_grad]: 3.99e-05 [set_forward_comm_id_for_comm_node_pass]: 1.216e-05 [meta_fg_expand]: 0.0215234 [flash_sp_send_recv_attached]: 4.18999e-06 [receive_attached]: 2.81e-06 [after_resolve]: 0.00011147 [a_after_grad]: 0.00016036 [renormalize]: 0.0160973 [add_forward_monad_depend]: 1.483e-05 [auto_monad_grad]: 1.234e-05 [auto_monad_eliminator]: 0.00010986 [cse]: 0.00029204 [a_3]: 0.00132257 [Cycle 2]: 0.00579356, [45] [expand_dump_flag]: 2.31e-06 [switch_simplify]: 8.367e-05 [loop_unroll]: 8.077e-05 [a_1]: 0.00324987 [with_stream_mark]: 2.5e-05 [recompute_prepare]: 2.335e-05 [updatestate_depend_eliminate]: 1.121e-05 [updatestate_assign_eliminate]: 1.119e-05 [updatestate_loads_eliminate]: 1.132e-05 [parameter_eliminate]: 3.36999e-06 [a_2]: 0.00040506 [accelerated_algorithm]: 1.257e-05 [shard]: 1.15999e-06 [meta_shard_fg_expand]: 3.63e-06 [shard_inline]: 1.16e-05 [merge_send_recv]: 9.67001e-06 [auto_parallel]: 1.001e-05 [parallel]: 5.12e-06 [flash_sp]: 3.48e-06 [merge_comm]: 7.25e-06 [allreduce_fusion]: 6.53e-06 [matmul_add_comm_reduction]: 1.019e-05 [allreduce_slice_to_reducescatter]: 5.40022e-07 [virtual_shard_identity]: 1.246e-05 [virtual_dataset]: 1.112e-05 [get_grad_eliminate_]: 1.071e-05 [virtual_output]: 1.077e-05 [merge_forward]: 6.16e-06 [cell_reuse_recompute_pass]: 1.07e-06 [offload_activation]: 1.068e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.124e-05 [merge_recompute_call_nodes]: 8.59989e-07 [before_grad]: 1.86e-05 [set_forward_comm_id_for_comm_node_pass]: 7.5e-06 [meta_fg_expand]: 7.589e-05 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 1.04e-06 [after_resolve]: 1.688e-05 [a_after_grad]: 1.699e-05 [renormalize]: 0.00110738 [add_forward_monad_depend]: 4.45e-06 [auto_monad_grad]: 1.19e-06 [auto_monad_eliminator]: 2.043e-05 [cse]: 0.00010514 [a_3]: 8.589e-05 [Cycle 3]: 0.00149614, [45] [expand_dump_flag]: 1.07e-06 [switch_simplify]: 1.371e-05 [loop_unroll]: 1.16e-05 [a_1]: 0.00026894 [with_stream_mark]: 1.243e-05 [recompute_prepare]: 1.126e-05 [updatestate_depend_eliminate]: 2.943e-05 [updatestate_assign_eliminate]: 5.28002e-06 [updatestate_loads_eliminate]: 5.28002e-06 [parameter_eliminate]: 1.02e-06 [a_2]: 0.0001303 [accelerated_algorithm]: 9.82999e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 1.89e-06 [shard_inline]: 9.34e-06 [merge_send_recv]: 7.08998e-06 [auto_parallel]: 7.68999e-06 [parallel]: 4.04002e-06 [flash_sp]: 1.00001e-06 [merge_comm]: 5.35001e-06 [allreduce_fusion]: 4.97999e-06 [matmul_add_comm_reduction]: 7.28999e-06 [allreduce_slice_to_reducescatter]: 3.29979e-07 [virtual_shard_identity]: 1.009e-05 [virtual_dataset]: 9.41003e-06 [get_grad_eliminate_]: 9.05001e-06 [virtual_output]: 8.90001e-06 [merge_forward]: 4.77e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 8.85999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.844e-05 [merge_recompute_call_nodes]: 7.50006e-07 [before_grad]: 1.484e-05 [set_forward_comm_id_for_comm_node_pass]: 5.32001e-06 [meta_fg_expand]: 3.65998e-06 [flash_sp_send_recv_attached]: 9.39996e-07 [receive_attached]: 9.79984e-07 [after_resolve]: 1.209e-05 [a_after_grad]: 1.351e-05 [renormalize]: 0.00043455 [add_forward_monad_depend]: 3.72002e-06 [auto_monad_grad]: 1.14998e-06 [auto_monad_eliminator]: 1.678e-05 [cse]: 7.807e-05 [a_3]: 7.033e-05 [Cycle 4]: 0.0008526, [45] [expand_dump_flag]: 1.05001e-06 [switch_simplify]: 1.074e-05 [loop_unroll]: 9.25001e-06 [a_1]: 0.00020058 [with_stream_mark]: 1.069e-05 [recompute_prepare]: 9.32999e-06 [updatestate_depend_eliminate]: 4.97999e-06 [updatestate_assign_eliminate]: 4.47e-06 [updatestate_loads_eliminate]: 4.45e-06 [parameter_eliminate]: 9.70002e-07 [a_2]: 0.00012762 [accelerated_algorithm]: 9.76e-06 [shard]: 1.10999e-06 [meta_shard_fg_expand]: 1.88002e-06 [shard_inline]: 9.54e-06 [merge_send_recv]: 6.71999e-06 [auto_parallel]: 7.38e-06 [parallel]: 3.81999e-06 [flash_sp]: 9.70002e-07 [merge_comm]: 6.61999e-06 [allreduce_fusion]: 4.81002e-06 [matmul_add_comm_reduction]: 7.3e-06 [allreduce_slice_to_reducescatter]: 3.10014e-07 [virtual_shard_identity]: 9.74e-06 [virtual_dataset]: 1.007e-05 [get_grad_eliminate_]: 8.48001e-06 [virtual_output]: 8.74e-06 [merge_forward]: 4.48999e-06 [cell_reuse_recompute_pass]: 1.49998e-06 [offload_activation]: 8.60999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.692e-05 [merge_recompute_call_nodes]: 6.50005e-07 [before_grad]: 1.457e-05 [set_forward_comm_id_for_comm_node_pass]: 5.91998e-06 [meta_fg_expand]: 3.5e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 1.215e-05 [a_after_grad]: 1.342e-05 [renormalize]: 5.00004e-08 [add_forward_monad_depend]: 1.20999e-06 [auto_monad_grad]: 9.70002e-07 [auto_monad_eliminator]: 1.257e-05 [cse]: 2.222e-05 [a_3]: 5.867e-05 [py_interpret_to_execute_after_opt_a]: 4.17e-06 [slice_cell_reuse_recomputed_activation]: 1.86998e-06 [rewriter_after_opt_a]: 2.82e-05 [convert_after_rewriter]: 1.30001e-06 [order_py_execute_after_rewriter]: 1.14e-06 [mutable_eliminate]: 0.00052301 [opt_b]: 0.00030802, [1] [Cycle 1]: 0.00030197, [7] [b_1]: 0.00021219 [b_2]: 1.132e-05 [updatestate_depend_eliminate]: 7.46001e-06 [updatestate_assign_eliminate]: 4.67e-06 [updatestate_loads_eliminate]: 4.53999e-06 [renormalize]: 4.39992e-07 [cse]: 2.643e-05 [optimize_parallel_all_gather_comm]: 2.125e-05 [overlap_param_gather]: 2.06998e-06 [cconv]: 2.093e-05 [loop_unroll]: 0.00045898 [opt_after_cconv]: 0.00013746, [1] [Cycle 1]: 0.00013193, [7] [c_1]: 5.333e-05 [parameter_eliminate]: 2.25002e-06 [updatestate_depend_eliminate]: 7.51999e-06 [updatestate_assign_eliminate]: 5.40999e-06 [updatestate_loads_eliminate]: 4.4e-06 [cse]: 2.573e-05 [renormalize]: 2.3999e-07 [remove_dup_value]: 3.782e-05 [tuple_transform]: 9.086e-05, [1] [Cycle 1]: 8.624e-05, [4] [d_1]: 5.716e-05 [none_parameter_eliminate]: 1.79998e-06 [renormalize]: 1.90019e-07 [switch_simplify]: 1e-05 [partial_unused_args_eliminate]: 1.66998e-06 [add_recomputation]: 6.767e-05 [cse_after_recomputation]: 2.899e-05, [1] [Cycle 1]: 2.453e-05, [1] [cse]: 1.902e-05 [environ_conv]: 1.162e-05 [swap_dp_allreduce_reducescatter]: 8.08999e-06 [bias_add_comm_swap]: 2.72001e-06 [label_micro_interleaved_index]: 4.76997e-06 [label_fine_grained_interleaved_index]: 2.66999e-06 [merge_cast_opt]: 1.62999e-06 [slice_recompute_activation]: 2.43e-06 [micro_interleaved_order_control]: 2.51e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 8.40024e-07 [remove_cast_before_assign_add]: 1.29e-06 [full_micro_interleaved_order_control]: 2.74999e-06 [reorder_send_recv_between_fp_bp]: 2.94001e-06 [comm_op_add_attrs]: 1.12e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.11002e-06 [interleave_parallel_branches]: 1.30001e-06 [overlap_opt_shard_in_pipeline]: 1.44e-06 [overlap_opt_shard_grad_in_pipeline]: 1.87001e-06 [control_data_broadcast_order]: 1.746e-05 [grouped_pairwise_exchange_alltoall]: 1.65001e-06 [offloading_packed_experts]: 6.01e-06 [overlap_recompute_and_grad_model_parallel]: 6.04001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.08001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.43998e-06 [overlap_grad_ring_attention]: 6.48003e-06 [overlap_grad_flash_sp]: 2.624e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.32999e-06 [split_layernorm_comm]: 1.74e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 9.929e-05, [1] [Cycle 1]: 9.516e-05, [6] [build]: 3.79002e-06 [elim_shapecalc]: 1.541e-05 [elim_not_effective]: 1.974e-05 [opt_reshape]: 1.096e-05 [fold_const_symbol]: 1.625e-05 [renormalize]: 1.60013e-07 [detach_backward]: 1.35001e-06 [pipeline_parallel_scheduler]: 1.39e-06 [auto_monad_reorder]: 3.351e-05 [get_jit_bprop_graph]: 1.00001e-06 [rewriter_after_jit_bprop_graph]: 3.66999e-06 [opt_after_jit_grad]: 0.00050985 [validate]: 4.693e-05 [backend_pass]: 1.02998e-06 [task_emit]: 0.00968155 [execute]: 7.85e-06 Sums bootstrap : 0.000751s : 0.57% type_inference : 0.066975s : 51.01% event_method : 0.000357s : 0.27% auto_monad : 0.000315s : 0.24% graph_reusing : 0.000011s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000046s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000061s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000346s : 0.26% optimize.opt_a.expand_dump_flag : 0.000009s : 0.01% optimize.opt_a.switch_simplify : 0.000299s : 0.23% optimize.opt_a.loop_unroll : 0.000202s : 0.15% optimize.opt_a.a_1 : 0.005782s : 4.40% optimize.opt_a.with_stream_mark : 0.000075s : 0.06% optimize.opt_a.recompute_prepare : 0.000076s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000057s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000032s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000031s : 0.02% optimize.opt_a.parameter_eliminate : 0.000009s : 0.01% optimize.opt_a.a_2 : 0.000996s : 0.76% optimize.opt_a.accelerated_algorithm : 0.000057s : 0.04% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000012s : 0.01% optimize.opt_a.shard_inline : 0.000049s : 0.04% optimize.opt_a.merge_send_recv : 0.000043s : 0.03% optimize.opt_a.auto_parallel : 0.000038s : 0.03% optimize.opt_a.parallel : 0.000041s : 0.03% optimize.opt_a.flash_sp : 0.000016s : 0.01% optimize.opt_a.merge_comm : 0.000031s : 0.02% optimize.opt_a.allreduce_fusion : 0.000028s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000055s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000057s : 0.04% optimize.opt_a.virtual_dataset : 0.000054s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000051s : 0.04% optimize.opt_a.virtual_output : 0.000050s : 0.04% optimize.opt_a.merge_forward : 0.000027s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000048s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000100s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000088s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000031s : 0.02% optimize.opt_a.meta_fg_expand : 0.021606s : 16.45% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.01% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000153s : 0.12% optimize.opt_a.a_after_grad : 0.000204s : 0.16% optimize.opt_a.renormalize : 0.017639s : 13.43% optimize.opt_a.add_forward_monad_depend : 0.000024s : 0.02% optimize.opt_a.auto_monad_grad : 0.000016s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000160s : 0.12% optimize.opt_a.cse : 0.000497s : 0.38% optimize.opt_a.a_3 : 0.001537s : 1.17% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000523s : 0.40% optimize.opt_b.b_1 : 0.000212s : 0.16% optimize.opt_b.b_2 : 0.000011s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000026s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000021s : 0.02% optimize.loop_unroll : 0.000459s : 0.35% optimize.opt_after_cconv.c_1 : 0.000053s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000026s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000038s : 0.03% optimize.tuple_transform.d_1 : 0.000057s : 0.04% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000068s : 0.05% optimize.cse_after_recomputation.cse : 0.000019s : 0.01% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000008s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000006s : 0.00% optimize.overlap_grad_flash_sp : 0.000026s : 0.02% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000020s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000011s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000016s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000034s : 0.03% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000510s : 0.39% validate : 0.000047s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.009682s : 7.37% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.002325 351 0.66% : 0.000015s : 8: substitution.depend_value_elim 0.12% : 0.000003s : 5: substitution.elim_not_effective 0.41% : 0.000010s : 14: substitution.float_depend_g_call 0.21% : 0.000005s : 2: substitution.float_tuple_getitem_switch 0.09% : 0.000002s : 5: substitution.fold_const_symbol 31.78% : 0.000739s : 4: substitution.getattr_setattr_resolve 0.30% : 0.000007s : 6: substitution.graph_param_transform 0.11% : 0.000003s : 2: substitution.incorporate_call 0.08% : 0.000002s : 2: substitution.incorporate_call_switch 45.39% : 0.001055s : 31: substitution.inline 1.17% : 0.000027s : 4: substitution.inline_without_move 0.63% : 0.000015s : 29: substitution.j_node_and_user_rematch 0.76% : 0.000018s : 13: substitution.minmaximum_grad 0.92% : 0.000021s : 14: substitution.partial_eliminate 0.88% : 0.000020s : 29: substitution.remove_not_recompute_node 2.71% : 0.000063s : 28: substitution.replace_applicator 0.59% : 0.000014s : 19: substitution.replace_old_param 0.23% : 0.000005s : 2: substitution.set_cell_output_no_recompute 1.04% : 0.000024s : 3: substitution.switch_simplify 1.55% : 0.000036s : 13: substitution.tuple_list_convert_item_index_to_positive 0.86% : 0.000020s : 13: substitution.tuple_list_get_item_const_eliminator 1.17% : 0.000027s : 13: substitution.tuple_list_get_item_depend_reorder 4.07% : 0.000095s : 34: substitution.tuple_list_get_item_eliminator 1.11% : 0.000026s : 13: substitution.tuple_list_get_set_item_eliminator 1.11% : 0.000026s : 18: substitution.updatestate_pure_node_eliminater 2.07% : 0.000048s : 27: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.066878 2 94.64% : 0.063290s : 1: type_inference.infer 5.36% : 0.003587s : 1: type_inference.specialize ------[replace.] 0.000661 58 7.25% : 0.000048s : 3: replace.getattr_setattr_resolve 53.09% : 0.000351s : 31: replace.inline 3.96% : 0.000026s : 1: replace.replace_applicator 5.36% : 0.000035s : 3: replace.switch_simplify 26.13% : 0.000173s : 19: replace.tuple_list_get_item_eliminator 4.21% : 0.000028s : 1: replace.updatestate_useless_node_eliminater ------[match.] 0.001810 58 38.21% : 0.000691s : 3: match.getattr_setattr_resolve 57.25% : 0.001036s : 31: match.inline 0.50% : 0.000009s : 1: match.replace_applicator 1.20% : 0.000022s : 3: match.switch_simplify 2.50% : 0.000045s : 19: match.tuple_list_get_item_eliminator 0.34% : 0.000006s : 1: match.updatestate_useless_node_eliminater ------[predicate.] 0.001230 8579 1.05% : 0.000013s : 93: predicate.accumulaten_eliminater 0.20% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.55% : 0.000007s : 50: predicate.addn_check_dump 1.00% : 0.000012s : 93: predicate.addn_zero_filter 0.95% : 0.000012s : 93: predicate.adjust_all_reduce_mul_add 2.08% : 0.000026s : 137: predicate.arithmetic_simplify 1.03% : 0.000013s : 93: predicate.cast_eliminate 2.18% : 0.000027s : 204: predicate.check_bprop_eliminate 0.55% : 0.000007s : 50: predicate.compare_switch_simplify 0.04% : 0.000000s : 6: predicate.const_output_eliminate 0.54% : 0.000007s : 44: predicate.depend_value_elim 1.09% : 0.000013s : 93: predicate.dict_get_item_const_eliminator 1.17% : 0.000014s : 93: predicate.dict_get_item_eliminator 1.02% : 0.000013s : 93: predicate.dict_set_item_eliminator 0.19% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.05% : 0.000001s : 6: predicate.elim_not_effective 0.09% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.05% : 0.000013s : 99: predicate.environ_add_const_eliminate 1.03% : 0.000013s : 99: predicate.environ_get_add_eliminate 1.08% : 0.000013s : 99: predicate.environ_get_depend_swap 1.63% : 0.000020s : 143: predicate.environ_get_eliminate 1.05% : 0.000013s : 99: predicate.environ_get_set_eliminate 1.59% : 0.000020s : 143: predicate.exchange_switch_depend_value 2.10% : 0.000026s : 143: predicate.float_depend_g_call 0.55% : 0.000007s : 50: predicate.float_environ_get_switch 0.61% : 0.000008s : 56: predicate.float_tuple_getitem_switch 0.04% : 0.000001s : 6: predicate.fold_const_symbol 0.45% : 0.000006s : 36: predicate.get_grad_eliminate 0.44% : 0.000005s : 20: predicate.getattr_setattr_resolve 0.06% : 0.000001s : 6: predicate.graph_param_transform 0.51% : 0.000006s : 44: predicate.incorporate_call 0.46% : 0.000006s : 44: predicate.incorporate_call_switch 4.92% : 0.000060s : 334: predicate.inline 1.47% : 0.000018s : 100: predicate.inline_without_move 0.21% : 0.000003s : 36: predicate.j_node_and_user_rematch 0.53% : 0.000007s : 36: predicate.less_batch_normalization 1.45% : 0.000018s : 124: predicate.list_to_tuple_eliminator_ 2.39% : 0.000029s : 217: predicate.load_eliminater 0.23% : 0.000003s : 6: predicate.loop_unroll_after_grad 2.09% : 0.000026s : 187: predicate.loop_unroll_before_grad 1.20% : 0.000015s : 105: predicate.make_slice_get_slice_eliminator 0.57% : 0.000007s : 50: predicate.merge_addn 2.11% : 0.000026s : 200: predicate.micro_step_allgather_replace 2.13% : 0.000026s : 200: predicate.mini_step_allgather_replace 0.99% : 0.000012s : 93: predicate.minmaximum_grad 0.22% : 0.000003s : 6: predicate.mutable_eliminate 0.11% : 0.000001s : 6: predicate.opt_reshape 0.11% : 0.000001s : 6: predicate.parallel_virtual_node 2.14% : 0.000026s : 143: predicate.partial_defer_inline 1.48% : 0.000018s : 118: predicate.partial_eliminate 1.05% : 0.000013s : 93: predicate.print_const_string_wrapper 0.51% : 0.000006s : 44: predicate.reduce_all_const_elim 1.37% : 0.000017s : 93: predicate.reduce_eliminate 2.36% : 0.000029s : 217: predicate.redundant_stop_gradient_eliminater 0.25% : 0.000003s : 36: predicate.remove_not_recompute_node 2.25% : 0.000028s : 314: predicate.replace_applicator 0.67% : 0.000008s : 100: predicate.replace_old_param 0.05% : 0.000001s : 6: predicate.reset_defer_inline 1.04% : 0.000013s : 93: predicate.reshape_eliminate 2.17% : 0.000027s : 200: predicate.row_tensor_add_zeros_like 0.08% : 0.000001s : 6: predicate.row_tensor_eliminate 2.33% : 0.000029s : 204: predicate.same_eliminate 0.32% : 0.000004s : 44: predicate.set_cell_output_no_recompute 0.51% : 0.000006s : 36: predicate.shard_identity_eliminate 0.18% : 0.000002s : 12: predicate.special_op_eliminate 0.64% : 0.000008s : 50: predicate.specialize_transform 2.20% : 0.000027s : 200: predicate.split_environ_get_set_with_tuple_value 1.33% : 0.000016s : 100: predicate.stack_unstack_eliminate 0.08% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.75% : 0.000022s : 143: predicate.switch_defer_inline 3.88% : 0.000048s : 347: predicate.switch_layer_defer_inline 4.69% : 0.000058s : 392: predicate.switch_simplify 1.04% : 0.000013s : 93: predicate.tile_eliminate 1.02% : 0.000013s : 93: predicate.transpose_eliminate 1.29% : 0.000016s : 105: predicate.tuple_list_convert_item_index_to_positive 1.36% : 0.000017s : 105: predicate.tuple_list_get_item_const_eliminator 1.25% : 0.000015s : 105: predicate.tuple_list_get_item_depend_reorder 2.44% : 0.000030s : 168: predicate.tuple_list_get_item_eliminator 1.34% : 0.000017s : 105: predicate.tuple_list_get_set_item_eliminator 1.85% : 0.000023s : 149: predicate.tuple_list_set_item_eliminator 1.40% : 0.000017s : 124: predicate.tuple_to_list_eliminator_ 2.33% : 0.000029s : 217: predicate.updatestate_pure_node_eliminater 2.94% : 0.000036s : 263: predicate.updatestate_useless_node_eliminater 0.08% : 0.000001s : 6: predicate.value_based_eliminate 0.48% : 0.000006s : 36: predicate.virtual_dataset_eliminate 0.44% : 0.000005s : 36: predicate.virtual_output_eliminate 0.07% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.12% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005271 65 55.84% : 0.002943s : 26: func_graph_cloner_run.FuncGraphClonerGraph 4.95% : 0.000261s : 7: func_graph_cloner_run.FuncGraphClonerNode 39.21% : 0.002067s : 32: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.223039 305 0.00% : 0.000003s : 1: ForceFp32Comm 1.79% : 0.003995s : 1: add_attr 1.79% : 0.003982s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000072s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.15% : 0.000328s : 1: auto_monad 0.02% : 0.000038s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.35% : 0.000791s : 1: bootstrap 0.01% : 0.000024s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000004s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.17% : 0.000370s : 1: event_method 0.01% : 0.000013s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000016s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.21% : 0.000468s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.24% : 0.000532s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.01% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000019s : 1: opt.transform.mutable_eliminate 4.28% : 0.009553s : 181: opt.transform.opt_a 0.02% : 0.000052s : 1: opt.transform.opt_after_cconv 0.02% : 0.000040s : 1: opt.transform.opt_after_jit_grad 0.09% : 0.000193s : 28: opt.transform.opt_b 0.38% : 0.000843s : 2: opt.transform.opt_resolve 0.03% : 0.000065s : 2: opt.transform.opt_trans_graph 0.03% : 0.000059s : 4: opt.transform.symbol_engine_opt 23.00% : 0.051307s : 1: opt_a 0.06% : 0.000141s : 1: opt_after_cconv 0.23% : 0.000520s : 1: opt_after_jit_grad 0.14% : 0.000311s : 1: opt_b 24.14% : 0.053847s : 1: optimize 0.01% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000066s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000042s : 1: remove_dup_value 5.97% : 0.013316s : 3: renormalize.infer 1.93% : 0.004301s : 3: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000031s : 1: rewriter_after_opt_a 0.16% : 0.000353s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000006s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000011s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000102s : 1: symbol_engine_optimizer 4.35% : 0.009693s : 1: task_emit 0.04% : 0.000094s : 1: tuple_transform 30.04% : 0.066991s : 1: type_inference 0.04% : 0.000079s : 1: validate TotalTime = 0.116487, [24] [bootstrap]: 0.00069993 [type_inference]: 0.0619117 [event_method]: 0.00023389 [auto_monad]: 0.00017624 [graph_reusing]: 1.011e-05 [inline]: 2.43e-06 [add_attr]: 0.00382082, [1] [add_attr_with_inline]: 0.00381179, [1] [Cycle 1]: 8.303e-05, [2] [tag_attr]: 4.09e-05 [meta_addattr_fg_expand]: 1.207e-05 [parallel-infer-symbol]: 3.13998e-06 [pre_auto_parallel]: 6.391e-05 [insert-virtual-dataset]: 2.49001e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.04999e-06 [pipeline_split]: 1.63002e-06 [optimize]: 0.026055, [53] [py_interpret_to_execute]: 4.61002e-06 [rewriter_before_opt_a]: 0.00038968 [opt_a]: 0.0234833, [3] [Cycle 1]: 0.0194553, [45] [expand_dump_flag]: 5.04e-06 [switch_simplify]: 0.00020761 [loop_unroll]: 6.943e-05 [a_1]: 0.00147442 [with_stream_mark]: 2.452e-05 [recompute_prepare]: 2.33e-05 [updatestate_depend_eliminate]: 8.87999e-06 [updatestate_assign_eliminate]: 7.9e-06 [updatestate_loads_eliminate]: 7.55998e-06 [parameter_eliminate]: 2.96999e-06 [a_2]: 0.00024957 [accelerated_algorithm]: 1.674e-05 [shard]: 1.60001e-06 [meta_shard_fg_expand]: 5.35001e-06 [shard_inline]: 1.615e-05 [merge_send_recv]: 1.677e-05 [auto_parallel]: 1.075e-05 [parallel]: 2.437e-05 [flash_sp]: 9.82999e-06 [merge_comm]: 9.52999e-06 [allreduce_fusion]: 9.15999e-06 [matmul_add_comm_reduction]: 2.655e-05 [allreduce_slice_to_reducescatter]: 8.49977e-07 [virtual_shard_identity]: 1.923e-05 [virtual_dataset]: 1.653e-05 [get_grad_eliminate_]: 1.635e-05 [virtual_output]: 1.627e-05 [merge_forward]: 9.42001e-06 [cell_reuse_recompute_pass]: 1.21997e-06 [offload_activation]: 1.812e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.689e-05 [merge_recompute_call_nodes]: 1.40001e-06 [before_grad]: 2.685e-05 [set_forward_comm_id_for_comm_node_pass]: 9.49e-06 [meta_fg_expand]: 0.00202129 [flash_sp_send_recv_attached]: 3.88999e-06 [receive_attached]: 2.46998e-06 [after_resolve]: 7.584e-05 [a_after_grad]: 9.922e-05 [renormalize]: 0.0137097 [add_forward_monad_depend]: 1.02e-05 [auto_monad_grad]: 6.69001e-06 [auto_monad_eliminator]: 5.527e-05 [cse]: 0.00038556 [a_3]: 0.00040297 [Cycle 2]: 0.00324126, [45] [expand_dump_flag]: 1.77001e-06 [switch_simplify]: 5.686e-05 [loop_unroll]: 5.277e-05 [a_1]: 0.00144453 [with_stream_mark]: 1.48e-05 [recompute_prepare]: 1.086e-05 [updatestate_depend_eliminate]: 3.73001e-06 [updatestate_assign_eliminate]: 2.83998e-06 [updatestate_loads_eliminate]: 2.69999e-06 [parameter_eliminate]: 1.14003e-06 [a_2]: 0.00010692 [accelerated_algorithm]: 8.57e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 2.02001e-06 [shard_inline]: 8.27e-06 [merge_send_recv]: 5.04e-06 [auto_parallel]: 5.79999e-06 [parallel]: 4.31002e-06 [flash_sp]: 3.50998e-06 [merge_comm]: 3.38999e-06 [allreduce_fusion]: 3.08998e-06 [matmul_add_comm_reduction]: 5.09e-06 [allreduce_slice_to_reducescatter]: 3.7998e-07 [virtual_shard_identity]: 9.70002e-06 [virtual_dataset]: 8.15e-06 [get_grad_eliminate_]: 8.14997e-06 [virtual_output]: 8.32e-06 [merge_forward]: 3.11999e-06 [cell_reuse_recompute_pass]: 1.47999e-06 [offload_activation]: 8.42998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.375e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 1.106e-05 [set_forward_comm_id_for_comm_node_pass]: 3.51999e-06 [meta_fg_expand]: 9.199e-05 [flash_sp_send_recv_attached]: 1.03001e-06 [receive_attached]: 1.05001e-06 [after_resolve]: 1.065e-05 [a_after_grad]: 1.313e-05 [renormalize]: 0.00095898 [add_forward_monad_depend]: 4.59002e-06 [auto_monad_grad]: 1.17e-06 [auto_monad_eliminator]: 1.134e-05 [cse]: 2.309e-05 [a_3]: 6.24e-05 [Cycle 3]: 0.00077271, [45] [expand_dump_flag]: 1.14e-06 [switch_simplify]: 9.99999e-06 [loop_unroll]: 8.45001e-06 [a_1]: 0.00020281 [with_stream_mark]: 8.84e-06 [recompute_prepare]: 8.70001e-06 [updatestate_depend_eliminate]: 3.31001e-06 [updatestate_assign_eliminate]: 2.85998e-06 [updatestate_loads_eliminate]: 2.58003e-06 [parameter_eliminate]: 8.70001e-07 [a_2]: 0.00010549 [accelerated_algorithm]: 8.37998e-06 [shard]: 1.06002e-06 [meta_shard_fg_expand]: 1.49e-06 [shard_inline]: 8.48001e-06 [merge_send_recv]: 4.95999e-06 [auto_parallel]: 5.64e-06 [parallel]: 3.85e-06 [flash_sp]: 9.39996e-07 [merge_comm]: 3.26999e-06 [allreduce_fusion]: 2.96001e-06 [matmul_add_comm_reduction]: 7.11999e-06 [allreduce_slice_to_reducescatter]: 4.80009e-07 [virtual_shard_identity]: 9.52001e-06 [virtual_dataset]: 8.27e-06 [get_grad_eliminate_]: 8.21002e-06 [virtual_output]: 8.23999e-06 [merge_forward]: 2.88e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 8.23001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.392e-05 [merge_recompute_call_nodes]: 7.39994e-07 [before_grad]: 1.138e-05 [set_forward_comm_id_for_comm_node_pass]: 3.36001e-06 [meta_fg_expand]: 2.43e-06 [flash_sp_send_recv_attached]: 9.20001e-07 [receive_attached]: 1.03001e-06 [after_resolve]: 8.40001e-06 [a_after_grad]: 1.295e-05 [renormalize]: 4.99713e-08 [add_forward_monad_depend]: 1.09998e-06 [auto_monad_grad]: 8.2e-07 [auto_monad_eliminator]: 6.23e-06 [cse]: 2.046e-05 [a_3]: 5.191e-05 [py_interpret_to_execute_after_opt_a]: 4.15e-06 [slice_cell_reuse_recomputed_activation]: 2.26e-06 [rewriter_after_opt_a]: 2.1e-05 [convert_after_rewriter]: 1.14e-06 [order_py_execute_after_rewriter]: 1.13001e-06 [mutable_eliminate]: 0.00049376 [opt_b]: 0.0002883, [1] [Cycle 1]: 0.00028204, [7] [b_1]: 0.00019852 [b_2]: 1.007e-05 [updatestate_depend_eliminate]: 5.66998e-06 [updatestate_assign_eliminate]: 2.94001e-06 [updatestate_loads_eliminate]: 2.71999e-06 [renormalize]: 3.7998e-07 [cse]: 2.653e-05 [optimize_parallel_all_gather_comm]: 0.0001028 [overlap_param_gather]: 2.72001e-06 [cconv]: 2.069e-05 [loop_unroll]: 0.00045779 [opt_after_cconv]: 0.00013878, [1] [Cycle 1]: 0.00013304, [7] [c_1]: 4.385e-05 [parameter_eliminate]: 2.39001e-06 [updatestate_depend_eliminate]: 5.62001e-06 [updatestate_assign_eliminate]: 2.91999e-06 [updatestate_loads_eliminate]: 3.39001e-06 [cse]: 2.641e-05 [renormalize]: 3.30008e-07 [remove_dup_value]: 3.484e-05 [tuple_transform]: 9.275e-05, [1] [Cycle 1]: 8.779e-05, [4] [d_1]: 5.683e-05 [none_parameter_eliminate]: 2.05002e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 1.007e-05 [partial_unused_args_eliminate]: 1.74e-06 [add_recomputation]: 6.593e-05 [cse_after_recomputation]: 2.927e-05, [1] [Cycle 1]: 2.46e-05, [1] [cse]: 1.873e-05 [environ_conv]: 8.72e-06 [swap_dp_allreduce_reducescatter]: 5.84999e-06 [bias_add_comm_swap]: 2.93998e-06 [label_micro_interleaved_index]: 5.07999e-06 [label_fine_grained_interleaved_index]: 2.54999e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 2.10002e-06 [micro_interleaved_order_control]: 2.68998e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 8.09989e-07 [remove_cast_before_assign_add]: 1.58002e-06 [full_micro_interleaved_order_control]: 2.36e-06 [reorder_send_recv_between_fp_bp]: 2.56998e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.19e-06 [interleave_parallel_branches]: 1.11997e-06 [overlap_opt_shard_in_pipeline]: 1.52999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.87001e-06 [control_data_broadcast_order]: 1.351e-05 [grouped_pairwise_exchange_alltoall]: 1.55001e-06 [offloading_packed_experts]: 3.76001e-06 [overlap_recompute_and_grad_model_parallel]: 5.42001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.27e-06 [overlap_recompute_allgather_and_fa_grad]: 1.45999e-06 [overlap_recompute_comm]: 2.41e-06 [overlap_grad_ring_attention]: 4.52e-06 [overlap_grad_flash_sp]: 1.853e-05 [begin_end_overlap_inline]: 5.29981e-07 [split_matmul_comm_elemetwise]: 2.30002e-06 [split_layernorm_comm]: 2.21e-06 [handle_group_info]: 1.36002e-06 [symbol_engine_optimizer]: 8.55e-05, [1] [Cycle 1]: 8.107e-05, [6] [build]: 2.54999e-06 [elim_shapecalc]: 1.204e-05 [elim_not_effective]: 1.486e-05 [opt_reshape]: 9.50001e-06 [fold_const_symbol]: 1.276e-05 [renormalize]: 1.8999e-07 [detach_backward]: 1.64e-06 [pipeline_parallel_scheduler]: 1.46002e-06 [auto_monad_reorder]: 1.805e-05 [get_jit_bprop_graph]: 1.03001e-06 [rewriter_after_jit_bprop_graph]: 3.5e-06 [opt_after_jit_grad]: 0.00050018 [validate]: 4.44e-05 [backend_pass]: 1.13001e-06 [task_emit]: 0.0226935 [execute]: 6.26e-06 Sums bootstrap : 0.000700s : 0.63% type_inference : 0.061912s : 55.59% event_method : 0.000234s : 0.21% auto_monad : 0.000176s : 0.16% graph_reusing : 0.000010s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000041s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000064s : 0.06% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000390s : 0.35% optimize.opt_a.expand_dump_flag : 0.000008s : 0.01% optimize.opt_a.switch_simplify : 0.000274s : 0.25% optimize.opt_a.loop_unroll : 0.000131s : 0.12% optimize.opt_a.a_1 : 0.003122s : 2.80% optimize.opt_a.with_stream_mark : 0.000048s : 0.04% optimize.opt_a.recompute_prepare : 0.000043s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000016s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000013s : 0.01% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000462s : 0.41% optimize.opt_a.accelerated_algorithm : 0.000034s : 0.03% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000009s : 0.01% optimize.opt_a.shard_inline : 0.000033s : 0.03% optimize.opt_a.merge_send_recv : 0.000027s : 0.02% optimize.opt_a.auto_parallel : 0.000022s : 0.02% optimize.opt_a.parallel : 0.000033s : 0.03% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000016s : 0.01% optimize.opt_a.allreduce_fusion : 0.000015s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000039s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000038s : 0.03% optimize.opt_a.virtual_dataset : 0.000033s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000033s : 0.03% optimize.opt_a.virtual_output : 0.000033s : 0.03% optimize.opt_a.merge_forward : 0.000015s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000035s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000055s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000049s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.01% optimize.opt_a.meta_fg_expand : 0.002116s : 1.90% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000095s : 0.09% optimize.opt_a.a_after_grad : 0.000125s : 0.11% optimize.opt_a.renormalize : 0.014669s : 13.17% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.01% optimize.opt_a.auto_monad_grad : 0.000009s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000073s : 0.07% optimize.opt_a.cse : 0.000429s : 0.39% optimize.opt_a.a_3 : 0.000517s : 0.46% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000021s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000494s : 0.44% optimize.opt_b.b_1 : 0.000199s : 0.18% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000027s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000103s : 0.09% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000021s : 0.02% optimize.loop_unroll : 0.000458s : 0.41% optimize.opt_after_cconv.c_1 : 0.000044s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000026s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000035s : 0.03% optimize.tuple_transform.d_1 : 0.000057s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000066s : 0.06% optimize.cse_after_recomputation.cse : 0.000019s : 0.02% optimize.environ_conv : 0.000009s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000019s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000018s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000500s : 0.45% validate : 0.000044s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.022694s : 20.38% execute : 0.000006s : 0.01% Time group info: ------[substitution.] 0.000710 161 0.25% : 0.000002s : 2: substitution.elim_not_effective 0.96% : 0.000007s : 11: substitution.float_depend_g_call 0.45% : 0.000003s : 2: substitution.float_tuple_getitem_switch 0.20% : 0.000001s : 2: substitution.fold_const_symbol 0.94% : 0.000007s : 6: substitution.graph_param_transform 0.35% : 0.000002s : 2: substitution.incorporate_call 0.27% : 0.000002s : 2: substitution.incorporate_call_switch 64.40% : 0.000457s : 20: substitution.inline 2.33% : 0.000017s : 2: substitution.inline_without_move 1.09% : 0.000008s : 12: substitution.j_node_and_user_rematch 1.29% : 0.000009s : 7: substitution.minmaximum_grad 3.65% : 0.000026s : 11: substitution.partial_eliminate 1.40% : 0.000010s : 12: substitution.remove_not_recompute_node 3.04% : 0.000022s : 9: substitution.replace_applicator 1.05% : 0.000007s : 13: substitution.replace_old_param 0.37% : 0.000003s : 1: substitution.set_cell_output_no_recompute 4.45% : 0.000032s : 3: substitution.switch_simplify 2.71% : 0.000019s : 7: substitution.tuple_list_convert_item_index_to_positive 1.27% : 0.000009s : 7: substitution.tuple_list_get_item_const_eliminator 1.73% : 0.000012s : 7: substitution.tuple_list_get_item_depend_reorder 6.06% : 0.000043s : 16: substitution.tuple_list_get_item_eliminator 1.70% : 0.000012s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.061813 2 95.07% : 0.058768s : 1: type_inference.infer 4.93% : 0.003045s : 1: type_inference.specialize ------[replace.] 0.000275 30 60.98% : 0.000168s : 20: replace.inline 13.46% : 0.000037s : 3: replace.switch_simplify 25.57% : 0.000070s : 7: replace.tuple_list_get_item_eliminator ------[match.] 0.000497 30 90.04% : 0.000447s : 20: match.inline 5.89% : 0.000029s : 3: match.switch_simplify 4.07% : 0.000020s : 7: match.tuple_list_get_item_eliminator ------[predicate.] 0.000677 5422 1.11% : 0.000008s : 65: predicate.accumulaten_eliminater 0.26% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.47% : 0.000003s : 29: predicate.addn_check_dump 1.10% : 0.000007s : 65: predicate.addn_zero_filter 1.02% : 0.000007s : 65: predicate.adjust_all_reduce_mul_add 1.95% : 0.000013s : 94: predicate.arithmetic_simplify 1.12% : 0.000008s : 65: predicate.cast_eliminate 1.29% : 0.000009s : 72: predicate.check_bprop_eliminate 0.48% : 0.000003s : 29: predicate.compare_switch_simplify 0.10% : 0.000001s : 7: predicate.const_output_eliminate 0.50% : 0.000003s : 29: predicate.depend_value_elim 1.22% : 0.000008s : 65: predicate.dict_get_item_const_eliminator 1.23% : 0.000008s : 65: predicate.dict_get_item_eliminator 1.07% : 0.000007s : 65: predicate.dict_set_item_eliminator 0.39% : 0.000003s : 13: predicate.dumpgradient_eliminate 0.08% : 0.000001s : 6: predicate.elim_not_effective 0.12% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000008s : 72: predicate.environ_add_const_eliminate 1.17% : 0.000008s : 72: predicate.environ_get_add_eliminate 1.16% : 0.000008s : 72: predicate.environ_get_depend_swap 1.73% : 0.000012s : 101: predicate.environ_get_eliminate 1.19% : 0.000008s : 72: predicate.environ_get_set_eliminate 1.65% : 0.000011s : 92: predicate.exchange_switch_depend_value 2.25% : 0.000015s : 92: predicate.float_depend_g_call 0.49% : 0.000003s : 29: predicate.float_environ_get_switch 0.60% : 0.000004s : 36: predicate.float_tuple_getitem_switch 0.08% : 0.000001s : 6: predicate.fold_const_symbol 0.59% : 0.000004s : 29: predicate.get_grad_eliminate 0.08% : 0.000001s : 6: predicate.graph_param_transform 0.50% : 0.000003s : 29: predicate.incorporate_call 0.46% : 0.000003s : 29: predicate.incorporate_call_switch 5.30% : 0.000036s : 229: predicate.inline 1.48% : 0.000010s : 62: predicate.inline_without_move 0.30% : 0.000002s : 29: predicate.j_node_and_user_rematch 0.61% : 0.000004s : 29: predicate.less_batch_normalization 1.49% : 0.000010s : 85: predicate.list_to_tuple_eliminator_ 2.45% : 0.000017s : 151: predicate.load_eliminater 0.31% : 0.000002s : 7: predicate.loop_unroll_after_grad 2.76% : 0.000019s : 142: predicate.loop_unroll_before_grad 1.36% : 0.000009s : 79: predicate.make_slice_get_slice_eliminator 0.52% : 0.000004s : 29: predicate.merge_addn 1.23% : 0.000008s : 72: predicate.micro_step_allgather_replace 1.28% : 0.000009s : 72: predicate.mini_step_allgather_replace 1.04% : 0.000007s : 65: predicate.minmaximum_grad 0.33% : 0.000002s : 7: predicate.mutable_eliminate 0.13% : 0.000001s : 6: predicate.opt_reshape 0.15% : 0.000001s : 7: predicate.parallel_virtual_node 2.12% : 0.000014s : 92: predicate.partial_defer_inline 1.54% : 0.000010s : 79: predicate.partial_eliminate 1.07% : 0.000007s : 65: predicate.print_const_string_wrapper 0.51% : 0.000003s : 29: predicate.reduce_all_const_elim 1.39% : 0.000009s : 65: predicate.reduce_eliminate 2.46% : 0.000017s : 151: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000002s : 29: predicate.remove_not_recompute_node 2.14% : 0.000014s : 144: predicate.replace_applicator 0.76% : 0.000005s : 62: predicate.replace_old_param 0.13% : 0.000001s : 7: predicate.reset_defer_inline 1.08% : 0.000007s : 65: predicate.reshape_eliminate 1.30% : 0.000009s : 72: predicate.row_tensor_add_zeros_like 0.15% : 0.000001s : 7: predicate.row_tensor_eliminate 1.48% : 0.000010s : 72: predicate.same_eliminate 0.43% : 0.000003s : 29: predicate.set_cell_output_no_recompute 0.60% : 0.000004s : 29: predicate.shard_identity_eliminate 0.28% : 0.000002s : 13: predicate.special_op_eliminate 0.56% : 0.000004s : 29: predicate.specialize_transform 1.36% : 0.000009s : 72: predicate.split_environ_get_set_with_tuple_value 1.42% : 0.000010s : 62: predicate.stack_unstack_eliminate 0.12% : 0.000001s : 7: predicate.switch_call_monad_eliminater 1.86% : 0.000013s : 92: predicate.switch_defer_inline 3.04% : 0.000021s : 164: predicate.switch_layer_defer_inline 5.60% : 0.000038s : 275: predicate.switch_simplify 1.12% : 0.000008s : 65: predicate.tile_eliminate 1.15% : 0.000008s : 65: predicate.transpose_eliminate 1.45% : 0.000010s : 78: predicate.tuple_list_convert_item_index_to_positive 1.43% : 0.000010s : 78: predicate.tuple_list_get_item_const_eliminator 1.36% : 0.000009s : 78: predicate.tuple_list_get_item_depend_reorder 2.57% : 0.000017s : 114: predicate.tuple_list_get_item_eliminator 1.42% : 0.000010s : 78: predicate.tuple_list_get_set_item_eliminator 2.01% : 0.000014s : 107: predicate.tuple_list_set_item_eliminator 1.52% : 0.000010s : 85: predicate.tuple_to_list_eliminator_ 2.38% : 0.000016s : 151: predicate.updatestate_pure_node_eliminater 2.98% : 0.000020s : 180: predicate.updatestate_useless_node_eliminater 0.13% : 0.000001s : 7: predicate.value_based_eliminate 0.56% : 0.000004s : 29: predicate.virtual_dataset_eliminate 0.53% : 0.000004s : 29: predicate.virtual_output_eliminate 0.09% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.18% : 0.000001s : 7: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003914 49 65.71% : 0.002572s : 25: func_graph_cloner_run.FuncGraphClonerGraph 34.29% : 0.001342s : 24: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.166276 237 0.00% : 0.000004s : 1: ForceFp32Comm 2.30% : 0.003826s : 1: add_attr 2.29% : 0.003815s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000071s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.11% : 0.000188s : 1: auto_monad 0.01% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.44% : 0.000736s : 1: bootstrap 0.01% : 0.000024s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000032s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.15% : 0.000246s : 1: event_method 0.01% : 0.000012s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.28% : 0.000466s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.30% : 0.000502s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000017s : 1: opt.transform.mutable_eliminate 2.98% : 0.004951s : 117: opt.transform.opt_a 0.03% : 0.000043s : 1: opt.transform.opt_after_cconv 0.02% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.11% : 0.000181s : 28: opt.transform.opt_b 0.04% : 0.000065s : 2: opt.transform.opt_trans_graph 0.03% : 0.000045s : 4: opt.transform.symbol_engine_opt 14.13% : 0.023487s : 1: opt_a 0.09% : 0.000142s : 1: opt_after_cconv 0.31% : 0.000510s : 1: opt_after_jit_grad 0.18% : 0.000292s : 1: opt_b 15.67% : 0.026059s : 1: optimize 0.06% : 0.000108s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000022s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000006s : 1: pipeline_split 0.04% : 0.000069s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.02% : 0.000039s : 1: remove_dup_value 7.21% : 0.011988s : 2: renormalize.infer 1.60% : 0.002665s : 2: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000024s : 1: rewriter_after_opt_a 0.24% : 0.000396s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000088s : 1: symbol_engine_optimizer 13.66% : 0.022710s : 1: task_emit 0.06% : 0.000096s : 1: tuple_transform 37.24% : 0.061926s : 1: type_inference 0.04% : 0.000071s : 1: validate group_cases_18 have all been run, results of sub cases are below: case: (1,) {} pass. case: ('pynative',) {} pass. case: ('pynative',) {} pass. case: (1, 2) {} pass. case: (1, 0) {} pass. case: (1, -1) {} pass. case: (0,) {} pass. case: (0,) {} pass. ops group_cases_19 with 8 cases start to running, all cases are below: case: (, 1) case: (, 'GRAPH_MODE_O0') case: (, 'PYNATIVE_MODE') case: (, 'pynative') case: (, 'KBK') case: (, 0) case: (, 1) case: (, 0, ) ops group_cases_19 total running memory: 32M, memory threshold: 51200M TotalTime = 2.51146, [24] [bootstrap]: 0.00083251 [type_inference]: 0.145476 [event_method]: 6.18e-05 [auto_monad]: 0.00021507 [graph_reusing]: 1.034e-05 [inline]: 2.78e-06 [add_attr]: 0.00727793, [1] [add_attr_with_inline]: 0.00726635, [1] [Cycle 1]: 0.00016047, [2] [tag_attr]: 5.483e-05 [meta_addattr_fg_expand]: 2.261e-05 [parallel-infer-symbol]: 3.05002e-06 [pre_auto_parallel]: 8.059e-05 [insert-virtual-dataset]: 2.39999e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.22999e-06 [pipeline_split]: 1.60001e-06 [optimize]: 0.0086497, [53] [py_interpret_to_execute]: 4.07e-06 [rewriter_before_opt_a]: 0.00032297 [opt_a]: 0.00604311, [2] [Cycle 1]: 0.00523861, [45] [expand_dump_flag]: 4.84e-06 [switch_simplify]: 0.00021294 [loop_unroll]: 6.967e-05 [a_1]: 0.00147778 [with_stream_mark]: 1.474e-05 [recompute_prepare]: 1.049e-05 [updatestate_depend_eliminate]: 1.485e-05 [updatestate_assign_eliminate]: 1.304e-05 [updatestate_loads_eliminate]: 4.1e-06 [parameter_eliminate]: 2.04e-06 [a_2]: 0.00011742 [accelerated_algorithm]: 8.86002e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 3.01999e-06 [shard_inline]: 8.25e-06 [merge_send_recv]: 4.634e-05 [auto_parallel]: 7.66001e-06 [parallel]: 8.743e-05 [flash_sp]: 3.498e-05 [merge_comm]: 5.52001e-06 [allreduce_fusion]: 1.362e-05 [matmul_add_comm_reduction]: 1.876e-05 [allreduce_slice_to_reducescatter]: 9.12001e-06 [virtual_shard_identity]: 1.096e-05 [virtual_dataset]: 8.45001e-06 [get_grad_eliminate_]: 8.3e-06 [virtual_output]: 7.95998e-06 [merge_forward]: 5.20999e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 1.947e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.482e-05 [merge_recompute_call_nodes]: 1.35001e-06 [before_grad]: 1.338e-05 [set_forward_comm_id_for_comm_node_pass]: 1.394e-05 [meta_fg_expand]: 5.12e-06 [flash_sp_send_recv_attached]: 2.46998e-06 [receive_attached]: 1.873e-05 [after_resolve]: 1.37e-05 [a_after_grad]: 1.329e-05 [renormalize]: 0.00245698 [add_forward_monad_depend]: 5.34e-06 [auto_monad_grad]: 1.96998e-06 [auto_monad_eliminator]: 2.989e-05 [cse]: 7.64e-05 [a_3]: 6.049e-05 [Cycle 2]: 0.00079457, [45] [expand_dump_flag]: 9.79984e-07 [switch_simplify]: 9.57999e-06 [loop_unroll]: 8.28999e-06 [a_1]: 0.00020651 [with_stream_mark]: 1.218e-05 [recompute_prepare]: 8.37e-06 [updatestate_depend_eliminate]: 4.12e-06 [updatestate_assign_eliminate]: 3.43e-06 [updatestate_loads_eliminate]: 3.7e-06 [parameter_eliminate]: 9.20001e-07 [a_2]: 0.00010562 [accelerated_algorithm]: 8.39998e-06 [shard]: 1.07e-06 [meta_shard_fg_expand]: 1.75001e-06 [shard_inline]: 8.32e-06 [merge_send_recv]: 5.66998e-06 [auto_parallel]: 6.46999e-06 [parallel]: 4.27998e-06 [flash_sp]: 3.45e-06 [merge_comm]: 4.20999e-06 [allreduce_fusion]: 3.93999e-06 [matmul_add_comm_reduction]: 6.76e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 8.80999e-06 [virtual_dataset]: 7.81001e-06 [get_grad_eliminate_]: 7.7e-06 [virtual_output]: 7.38999e-06 [merge_forward]: 3.46999e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 8.13001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.426e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 1.256e-05 [set_forward_comm_id_for_comm_node_pass]: 4.58999e-06 [meta_fg_expand]: 2.79001e-06 [flash_sp_send_recv_attached]: 8.99978e-07 [receive_attached]: 9.90025e-07 [after_resolve]: 1.103e-05 [a_after_grad]: 1.209e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.11002e-06 [auto_monad_grad]: 9.50007e-07 [auto_monad_eliminator]: 8.82e-06 [cse]: 2.124e-05 [a_3]: 4.98e-05 [py_interpret_to_execute_after_opt_a]: 4.09002e-06 [slice_cell_reuse_recomputed_activation]: 2.27001e-06 [rewriter_after_opt_a]: 3.273e-05 [convert_after_rewriter]: 1.22999e-06 [order_py_execute_after_rewriter]: 1.08001e-06 [mutable_eliminate]: 0.00049583 [opt_b]: 0.00027749, [1] [Cycle 1]: 0.00027157, [7] [b_1]: 0.0001885 [b_2]: 1.002e-05 [updatestate_depend_eliminate]: 6.54999e-06 [updatestate_assign_eliminate]: 3.50998e-06 [updatestate_loads_eliminate]: 3.38999e-06 [renormalize]: 3.39991e-07 [cse]: 2.529e-05 [optimize_parallel_all_gather_comm]: 2.892e-05 [overlap_param_gather]: 1.18e-05 [cconv]: 2.274e-05 [loop_unroll]: 0.00042221 [opt_after_cconv]: 0.00012169, [1] [Cycle 1]: 0.0001167, [7] [c_1]: 4.13e-05 [parameter_eliminate]: 2.30997e-06 [updatestate_depend_eliminate]: 6.71e-06 [updatestate_assign_eliminate]: 3.52997e-06 [updatestate_loads_eliminate]: 3.51999e-06 [cse]: 2.534e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 3.26e-05 [tuple_transform]: 8.771e-05, [1] [Cycle 1]: 8.364e-05, [4] [d_1]: 5.618e-05 [none_parameter_eliminate]: 1.66998e-06 [renormalize]: 1.49972e-07 [switch_simplify]: 8.38999e-06 [partial_unused_args_eliminate]: 1.63002e-06 [add_recomputation]: 7.161e-05 [cse_after_recomputation]: 2.643e-05, [1] [Cycle 1]: 2.182e-05, [1] [cse]: 1.663e-05 [environ_conv]: 1.513e-05 [swap_dp_allreduce_reducescatter]: 2.658e-05 [bias_add_comm_swap]: 1.162e-05 [label_micro_interleaved_index]: 1.266e-05 [label_fine_grained_interleaved_index]: 2.46e-06 [merge_cast_opt]: 1.27999e-06 [slice_recompute_activation]: 1.89e-06 [micro_interleaved_order_control]: 2.11e-06 [assign_add_opt]: 1.17999e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.01e-05 [full_micro_interleaved_order_control]: 1.076e-05 [reorder_send_recv_between_fp_bp]: 3.5e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.10001e-06 [interleave_parallel_branches]: 9.82001e-06 [overlap_opt_shard_in_pipeline]: 1.423e-05 [overlap_opt_shard_grad_in_pipeline]: 1.87999e-06 [control_data_broadcast_order]: 1.84e-05 [grouped_pairwise_exchange_alltoall]: 1.47001e-06 [offloading_packed_experts]: 4.82e-06 [overlap_recompute_and_grad_model_parallel]: 1.452e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.15999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.09e-06 [overlap_grad_ring_attention]: 2.177e-05 [overlap_grad_flash_sp]: 4.988e-05 [begin_end_overlap_inline]: 5.29981e-07 [split_matmul_comm_elemetwise]: 1.107e-05 [split_layernorm_comm]: 1.83002e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 9.064e-05, [1] [Cycle 1]: 8.535e-05, [6] [build]: 3.34001e-06 [elim_shapecalc]: 1.373e-05 [elim_not_effective]: 1.702e-05 [opt_reshape]: 8.87e-06 [fold_const_symbol]: 1.302e-05 [renormalize]: 2.79979e-07 [detach_backward]: 1.67001e-06 [pipeline_parallel_scheduler]: 1.42e-06 [auto_monad_reorder]: 2.618e-05 [get_jit_bprop_graph]: 1.25999e-06 [rewriter_after_jit_bprop_graph]: 3.28e-06 [opt_after_jit_grad]: 0.0004651 [validate]: 5.773e-05 [backend_pass]: 1.20999e-06 [task_emit]: 2.34773 [execute]: 1.047e-05 Sums bootstrap : 0.000833s : 0.03% type_inference : 0.145476s : 5.81% event_method : 0.000062s : 0.00% auto_monad : 0.000215s : 0.01% graph_reusing : 0.000010s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000055s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000023s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000081s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000323s : 0.01% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000223s : 0.01% optimize.opt_a.loop_unroll : 0.000078s : 0.00% optimize.opt_a.a_1 : 0.001684s : 0.07% optimize.opt_a.with_stream_mark : 0.000027s : 0.00% optimize.opt_a.recompute_prepare : 0.000019s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000223s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000017s : 0.00% optimize.opt_a.merge_send_recv : 0.000052s : 0.00% optimize.opt_a.auto_parallel : 0.000014s : 0.00% optimize.opt_a.parallel : 0.000092s : 0.00% optimize.opt_a.flash_sp : 0.000038s : 0.00% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000018s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.00% optimize.opt_a.virtual_dataset : 0.000016s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.00% optimize.opt_a.virtual_output : 0.000015s : 0.00% optimize.opt_a.merge_forward : 0.000009s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000028s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000026s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000019s : 0.00% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000020s : 0.00% optimize.opt_a.after_resolve : 0.000025s : 0.00% optimize.opt_a.a_after_grad : 0.000025s : 0.00% optimize.opt_a.renormalize : 0.002457s : 0.10% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000039s : 0.00% optimize.opt_a.cse : 0.000098s : 0.00% optimize.opt_a.a_3 : 0.000110s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000033s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000496s : 0.02% optimize.opt_b.b_1 : 0.000189s : 0.01% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000025s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000023s : 0.00% optimize.loop_unroll : 0.000422s : 0.02% optimize.opt_after_cconv.c_1 : 0.000041s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000025s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000033s : 0.00% optimize.tuple_transform.d_1 : 0.000056s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000072s : 0.00% optimize.cse_after_recomputation.cse : 0.000017s : 0.00% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000027s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000010s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000014s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000015s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000022s : 0.00% optimize.overlap_grad_flash_sp : 0.000050s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000017s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000013s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000026s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000465s : 0.02% validate : 0.000058s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.347726s : 93.80% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000455 73 5.91% : 0.000027s : 5: substitution.arithmetic_simplify 8.82% : 0.000040s : 7: substitution.cast_eliminate 0.60% : 0.000003s : 4: substitution.elim_not_effective 0.43% : 0.000002s : 4: substitution.fold_const_symbol 1.43% : 0.000006s : 6: substitution.graph_param_transform 69.72% : 0.000317s : 18: substitution.inline 1.03% : 0.000005s : 8: substitution.j_node_and_user_rematch 3.38% : 0.000015s : 8: substitution.remove_not_recompute_node 0.94% : 0.000004s : 4: substitution.replace_old_param 4.91% : 0.000022s : 5: substitution.switch_simplify 2.84% : 0.000013s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.145368 2 97.27% : 0.141393s : 1: type_inference.infer 2.73% : 0.003975s : 1: type_inference.specialize ------[replace.] 0.000180 27 58.67% : 0.000106s : 18: replace.inline 27.70% : 0.000050s : 5: replace.switch_simplify 13.64% : 0.000025s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000339 27 91.02% : 0.000308s : 18: match.inline 5.75% : 0.000019s : 5: match.switch_simplify 3.23% : 0.000011s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000385 2586 1.07% : 0.000004s : 31: predicate.accumulaten_eliminater 0.51% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.40% : 0.000002s : 12: predicate.addn_check_dump 1.07% : 0.000004s : 31: predicate.addn_zero_filter 0.99% : 0.000004s : 31: predicate.adjust_all_reduce_mul_add 3.41% : 0.000013s : 43: predicate.arithmetic_simplify 1.34% : 0.000005s : 31: predicate.cast_eliminate 0.47% : 0.000002s : 12: predicate.check_bprop_eliminate 0.43% : 0.000002s : 12: predicate.compare_switch_simplify 0.13% : 0.000001s : 6: predicate.const_output_eliminate 0.40% : 0.000002s : 12: predicate.depend_value_elim 1.14% : 0.000004s : 31: predicate.dict_get_item_const_eliminator 1.27% : 0.000005s : 31: predicate.dict_get_item_eliminator 1.10% : 0.000004s : 31: predicate.dict_set_item_eliminator 0.63% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.17% : 0.000001s : 6: predicate.elim_not_effective 0.24% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000005s : 37: predicate.environ_add_const_eliminate 1.21% : 0.000005s : 37: predicate.environ_get_add_eliminate 1.22% : 0.000005s : 37: predicate.environ_get_depend_swap 1.80% : 0.000007s : 49: predicate.environ_get_eliminate 1.22% : 0.000005s : 37: predicate.environ_get_set_eliminate 1.89% : 0.000007s : 53: predicate.exchange_switch_depend_value 2.67% : 0.000010s : 53: predicate.float_depend_g_call 0.40% : 0.000002s : 12: predicate.float_environ_get_switch 0.60% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 6: predicate.fold_const_symbol 0.48% : 0.000002s : 12: predicate.get_grad_eliminate 0.14% : 0.000001s : 6: predicate.graph_param_transform 0.44% : 0.000002s : 12: predicate.incorporate_call 0.39% : 0.000001s : 12: predicate.incorporate_call_switch 6.09% : 0.000023s : 120: predicate.inline 0.53% : 0.000002s : 12: predicate.inline_without_move 0.22% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.57% : 0.000002s : 12: predicate.less_batch_normalization 1.70% : 0.000007s : 47: predicate.list_to_tuple_eliminator_ 2.65% : 0.000010s : 78: predicate.load_eliminater 0.64% : 0.000002s : 6: predicate.loop_unroll_after_grad 3.25% : 0.000012s : 90: predicate.loop_unroll_before_grad 1.54% : 0.000006s : 43: predicate.make_slice_get_slice_eliminator 0.44% : 0.000002s : 12: predicate.merge_addn 0.40% : 0.000002s : 12: predicate.micro_step_allgather_replace 0.44% : 0.000002s : 12: predicate.mini_step_allgather_replace 1.05% : 0.000004s : 31: predicate.minmaximum_grad 0.68% : 0.000003s : 6: predicate.mutable_eliminate 0.33% : 0.000001s : 6: predicate.opt_reshape 0.32% : 0.000001s : 6: predicate.parallel_virtual_node 2.48% : 0.000010s : 53: predicate.partial_defer_inline 1.54% : 0.000006s : 41: predicate.partial_eliminate 1.12% : 0.000004s : 31: predicate.print_const_string_wrapper 0.42% : 0.000002s : 12: predicate.reduce_all_const_elim 1.47% : 0.000006s : 31: predicate.reduce_eliminate 2.58% : 0.000010s : 78: predicate.redundant_stop_gradient_eliminater 0.25% : 0.000001s : 12: predicate.remove_not_recompute_node 1.23% : 0.000005s : 47: predicate.replace_applicator 0.34% : 0.000001s : 12: predicate.replace_old_param 0.15% : 0.000001s : 6: predicate.reset_defer_inline 1.09% : 0.000004s : 31: predicate.reshape_eliminate 0.46% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.25% : 0.000001s : 6: predicate.row_tensor_eliminate 0.53% : 0.000002s : 12: predicate.same_eliminate 0.29% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.49% : 0.000002s : 12: predicate.shard_identity_eliminate 0.50% : 0.000002s : 12: predicate.special_op_eliminate 0.55% : 0.000002s : 12: predicate.specialize_transform 0.55% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.54% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.23% : 0.000001s : 6: predicate.switch_call_monad_eliminater 2.04% : 0.000008s : 53: predicate.switch_defer_inline 2.41% : 0.000009s : 65: predicate.switch_layer_defer_inline 6.77% : 0.000026s : 171: predicate.switch_simplify 1.12% : 0.000004s : 31: predicate.tile_eliminate 1.06% : 0.000004s : 31: predicate.transpose_eliminate 1.63% : 0.000006s : 43: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000006s : 43: predicate.tuple_list_get_item_const_eliminator 1.57% : 0.000006s : 43: predicate.tuple_list_get_item_depend_reorder 2.75% : 0.000011s : 59: predicate.tuple_list_get_item_eliminator 1.51% : 0.000006s : 43: predicate.tuple_list_get_set_item_eliminator 2.12% : 0.000008s : 55: predicate.tuple_list_set_item_eliminator 1.63% : 0.000006s : 47: predicate.tuple_to_list_eliminator_ 2.54% : 0.000010s : 78: predicate.updatestate_pure_node_eliminater 3.10% : 0.000012s : 90: predicate.updatestate_useless_node_eliminater 0.26% : 0.000001s : 6: predicate.value_based_eliminate 0.50% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.45% : 0.000002s : 12: predicate.virtual_output_eliminate 0.19% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.29% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002125 34 46.69% : 0.000992s : 14: func_graph_cloner_run.FuncGraphClonerGraph 53.31% : 0.001133s : 20: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.532478 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.29% : 0.007282s : 1: add_attr 0.29% : 0.007270s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000076s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000223s : 1: auto_monad 0.00% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000010s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.03% : 0.000876s : 1: bootstrap 0.00% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000022s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000029s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000019s : 1: environ_conv 0.00% : 0.000070s : 1: event_method 0.00% : 0.000024s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.02% : 0.000431s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000504s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000017s : 1: opt.transform.mutable_eliminate 0.10% : 0.002467s : 78: opt.transform.opt_a 0.00% : 0.000040s : 1: opt.transform.opt_after_cconv 0.00% : 0.000031s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000151s : 28: opt.transform.opt_b 0.00% : 0.000063s : 2: opt.transform.opt_trans_graph 0.00% : 0.000049s : 4: opt.transform.symbol_engine_opt 0.24% : 0.006046s : 1: opt_a 0.00% : 0.000125s : 1: opt_after_cconv 0.02% : 0.000475s : 1: opt_after_jit_grad 0.01% : 0.000281s : 1: opt_b 0.34% : 0.008654s : 1: optimize 0.00% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000054s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000025s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000018s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000086s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.00% : 0.000037s : 1: remove_dup_value 0.05% : 0.001373s : 1: renormalize.infer 0.04% : 0.001075s : 1: renormalize.specialize 0.00% : 0.000008s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000036s : 1: rewriter_after_opt_a 0.01% : 0.000330s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000004s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000014s : 1: split_matmul_comm_elemetwise 0.00% : 0.000030s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000094s : 1: symbol_engine_optimizer 92.71% : 2.347763s : 1: task_emit 0.00% : 0.000091s : 1: tuple_transform 5.75% : 0.145496s : 1: type_inference 0.01% : 0.000324s : 1: validate TotalTime = 2.58584, [24] [bootstrap]: 0.00081843 [type_inference]: 0.265973 [event_method]: 0.00081244 [auto_monad]: 0.0002778 [graph_reusing]: 1.165e-05 [inline]: 2.59001e-06 [add_attr]: 0.00793698, [1] [add_attr_with_inline]: 0.00792191, [1] [Cycle 1]: 0.00018106, [2] [tag_attr]: 7.041e-05 [meta_addattr_fg_expand]: 2.441e-05 [parallel-infer-symbol]: 3.28e-06 [pre_auto_parallel]: 9.914e-05 [insert-virtual-dataset]: 2.39001e-06 [parallel-infer-symbol-second]: 1.04e-06 [dataset_repeat_opt]: 2.37001e-06 [pipeline_split]: 2.02001e-06 [optimize]: 0.010143, [53] [py_interpret_to_execute]: 4.37e-06 [rewriter_before_opt_a]: 0.00053689 [opt_a]: 0.00738036, [2] [Cycle 1]: 0.00674609, [45] [expand_dump_flag]: 6.32001e-06 [switch_simplify]: 0.00025425 [loop_unroll]: 0.00013578 [a_1]: 0.00167118 [with_stream_mark]: 2.572e-05 [recompute_prepare]: 9.21998e-06 [updatestate_depend_eliminate]: 1.218e-05 [updatestate_assign_eliminate]: 1.179e-05 [updatestate_loads_eliminate]: 3.31999e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 9.15e-05 [accelerated_algorithm]: 7.76001e-06 [shard]: 1.84e-06 [meta_shard_fg_expand]: 3.04999e-06 [shard_inline]: 7.18e-06 [merge_send_recv]: 4.471e-05 [auto_parallel]: 7.35e-06 [parallel]: 7.27e-05 [flash_sp]: 3.342e-05 [merge_comm]: 4.25999e-06 [allreduce_fusion]: 1.185e-05 [matmul_add_comm_reduction]: 1.747e-05 [allreduce_slice_to_reducescatter]: 9.57999e-06 [virtual_shard_identity]: 9.38002e-06 [virtual_dataset]: 7.25e-06 [get_grad_eliminate_]: 6.71e-06 [virtual_output]: 7.06999e-06 [merge_forward]: 4.02e-06 [cell_reuse_recompute_pass]: 9.70002e-07 [offload_activation]: 1.754e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.387e-05 [merge_recompute_call_nodes]: 1.34e-06 [before_grad]: 1.106e-05 [set_forward_comm_id_for_comm_node_pass]: 1.31e-05 [meta_fg_expand]: 4.67998e-06 [flash_sp_send_recv_attached]: 2.51998e-06 [receive_attached]: 1.967e-05 [after_resolve]: 1.022e-05 [a_after_grad]: 1.037e-05 [renormalize]: 0.00376035 [add_forward_monad_depend]: 5.76998e-06 [auto_monad_grad]: 2.38002e-06 [auto_monad_eliminator]: 2.933e-05 [cse]: 5.523e-05 [a_3]: 4.972e-05 [Cycle 2]: 0.00062383, [45] [expand_dump_flag]: 1.14e-06 [switch_simplify]: 7.92998e-06 [loop_unroll]: 6.84999e-06 [a_1]: 0.00013021 [with_stream_mark]: 1.183e-05 [recompute_prepare]: 6.74999e-06 [updatestate_depend_eliminate]: 3.23e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.14999e-06 [parameter_eliminate]: 9.20001e-07 [a_2]: 7.45e-05 [accelerated_algorithm]: 6.56e-06 [shard]: 1.09998e-06 [meta_shard_fg_expand]: 1.38002e-06 [shard_inline]: 6.02999e-06 [merge_send_recv]: 5.01002e-06 [auto_parallel]: 5.34e-06 [parallel]: 4.83001e-06 [flash_sp]: 2.76e-06 [merge_comm]: 3.08e-06 [allreduce_fusion]: 2.73e-06 [matmul_add_comm_reduction]: 6.07001e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 7.11001e-06 [virtual_dataset]: 6.16e-06 [get_grad_eliminate_]: 5.91003e-06 [virtual_output]: 6.09001e-06 [merge_forward]: 2.63998e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 6.34999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.276e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 8.85999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.08e-06 [meta_fg_expand]: 1.73002e-06 [flash_sp_send_recv_attached]: 9.39996e-07 [receive_attached]: 1.01002e-06 [after_resolve]: 8.72e-06 [a_after_grad]: 9.24e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.09998e-06 [auto_monad_grad]: 8.30012e-07 [auto_monad_eliminator]: 5.90002e-06 [cse]: 1.484e-05 [a_3]: 3.613e-05 [py_interpret_to_execute_after_opt_a]: 5.04e-06 [slice_cell_reuse_recomputed_activation]: 2.40002e-06 [rewriter_after_opt_a]: 2.721e-05 [convert_after_rewriter]: 1.71e-06 [order_py_execute_after_rewriter]: 1.05999e-06 [mutable_eliminate]: 0.00057277 [opt_b]: 0.00020502, [1] [Cycle 1]: 0.00019861, [7] [b_1]: 0.00012675 [b_2]: 7.97003e-06 [updatestate_depend_eliminate]: 5.09998e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.21e-06 [renormalize]: 4.00003e-07 [cse]: 1.968e-05 [optimize_parallel_all_gather_comm]: 2.527e-05 [overlap_param_gather]: 1.171e-05 [cconv]: 2.354e-05 [loop_unroll]: 0.00045344 [opt_after_cconv]: 9.816e-05, [1] [Cycle 1]: 9.316e-05, [7] [c_1]: 3.026e-05 [parameter_eliminate]: 2.46e-06 [updatestate_depend_eliminate]: 5.07999e-06 [updatestate_assign_eliminate]: 2.32001e-06 [updatestate_loads_eliminate]: 2.12001e-06 [cse]: 1.894e-05 [renormalize]: 4.80009e-07 [remove_dup_value]: 1.549e-05 [tuple_transform]: 7.238e-05, [1] [Cycle 1]: 6.82e-05, [4] [d_1]: 4.176e-05 [none_parameter_eliminate]: 1.52999e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 7.01999e-06 [partial_unused_args_eliminate]: 1.69e-06 [add_recomputation]: 5.841e-05 [cse_after_recomputation]: 2.281e-05, [1] [Cycle 1]: 1.835e-05, [1] [cse]: 1.313e-05 [environ_conv]: 1.937e-05 [swap_dp_allreduce_reducescatter]: 2.494e-05 [bias_add_comm_swap]: 1.132e-05 [label_micro_interleaved_index]: 1.293e-05 [label_fine_grained_interleaved_index]: 2.33998e-06 [merge_cast_opt]: 1.44e-06 [slice_recompute_activation]: 2.19999e-06 [micro_interleaved_order_control]: 2.35002e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 1.04e-06 [remove_cast_before_assign_add]: 9.92999e-06 [full_micro_interleaved_order_control]: 1.074e-05 [reorder_send_recv_between_fp_bp]: 2.59999e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 9.16002e-06 [overlap_opt_shard_in_pipeline]: 2.461e-05 [overlap_opt_shard_grad_in_pipeline]: 1.64998e-06 [control_data_broadcast_order]: 1.288e-05 [grouped_pairwise_exchange_alltoall]: 1.29003e-06 [offloading_packed_experts]: 3.78001e-06 [overlap_recompute_and_grad_model_parallel]: 1.324e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.23002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.15999e-06 [overlap_recompute_comm]: 1.99e-06 [overlap_grad_ring_attention]: 2.124e-05 [overlap_grad_flash_sp]: 4.384e-05 [begin_end_overlap_inline]: 6.19999e-07 [split_matmul_comm_elemetwise]: 1.192e-05 [split_layernorm_comm]: 1.57001e-06 [handle_group_info]: 9.5999e-07 [symbol_engine_optimizer]: 8.208e-05, [1] [Cycle 1]: 7.734e-05, [6] [build]: 3.23e-06 [elim_shapecalc]: 1.229e-05 [elim_not_effective]: 1.4e-05 [opt_reshape]: 7.38999e-06 [fold_const_symbol]: 1.048e-05 [renormalize]: 3.89991e-07 [detach_backward]: 1.65001e-06 [pipeline_parallel_scheduler]: 1.41002e-06 [auto_monad_reorder]: 2.379e-05 [get_jit_bprop_graph]: 1.42e-06 [rewriter_after_jit_bprop_graph]: 3.56001e-06 [opt_after_jit_grad]: 0.00045697 [validate]: 5.765e-05 [backend_pass]: 9.10019e-07 [task_emit]: 2.29889 [execute]: 1.069e-05 Sums bootstrap : 0.000818s : 0.03% type_inference : 0.265973s : 10.32% event_method : 0.000812s : 0.03% auto_monad : 0.000278s : 0.01% graph_reusing : 0.000012s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000070s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000024s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000099s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000537s : 0.02% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000262s : 0.01% optimize.opt_a.loop_unroll : 0.000143s : 0.01% optimize.opt_a.a_1 : 0.001801s : 0.07% optimize.opt_a.with_stream_mark : 0.000038s : 0.00% optimize.opt_a.recompute_prepare : 0.000016s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000166s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000050s : 0.00% optimize.opt_a.auto_parallel : 0.000013s : 0.00% optimize.opt_a.parallel : 0.000078s : 0.00% optimize.opt_a.flash_sp : 0.000036s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000015s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000024s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000010s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.00% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000021s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.00% optimize.opt_a.a_after_grad : 0.000020s : 0.00% optimize.opt_a.renormalize : 0.003760s : 0.15% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.00% optimize.opt_a.cse : 0.000070s : 0.00% optimize.opt_a.a_3 : 0.000086s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000027s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000573s : 0.02% optimize.opt_b.b_1 : 0.000127s : 0.00% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000025s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000024s : 0.00% optimize.loop_unroll : 0.000453s : 0.02% optimize.opt_after_cconv.c_1 : 0.000030s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000019s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.00% optimize.tuple_transform.d_1 : 0.000042s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000058s : 0.00% optimize.cse_after_recomputation.cse : 0.000013s : 0.00% optimize.environ_conv : 0.000019s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000025s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000025s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000021s : 0.00% optimize.overlap_grad_flash_sp : 0.000044s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000012s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000024s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000457s : 0.02% validate : 0.000058s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.298894s : 89.21% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000504 55 0.37% : 0.000002s : 2: substitution.elim_not_effective 0.68% : 0.000003s : 4: substitution.float_depend_g_call 0.26% : 0.000001s : 2: substitution.fold_const_symbol 1.09% : 0.000005s : 4: substitution.graph_param_transform 83.16% : 0.000419s : 21: substitution.inline 0.62% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.60% : 0.000023s : 4: substitution.partial_eliminate 1.06% : 0.000005s : 4: substitution.remove_not_recompute_node 0.59% : 0.000003s : 2: substitution.replace_old_param 4.58% : 0.000023s : 5: substitution.switch_simplify 3.00% : 0.000015s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.265821 2 97.57% : 0.259356s : 1: type_inference.infer 2.43% : 0.006466s : 1: type_inference.specialize ------[replace.] 0.000228 29 59.61% : 0.000136s : 21: replace.inline 26.72% : 0.000061s : 5: replace.switch_simplify 13.68% : 0.000031s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000442 29 92.50% : 0.000409s : 21: match.inline 4.45% : 0.000020s : 5: match.switch_simplify 3.04% : 0.000013s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000341 2299 1.19% : 0.000004s : 30: predicate.accumulaten_eliminater 0.55% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.28% : 0.000001s : 8: predicate.addn_check_dump 1.10% : 0.000004s : 30: predicate.addn_zero_filter 1.12% : 0.000004s : 30: predicate.adjust_all_reduce_mul_add 2.33% : 0.000008s : 38: predicate.arithmetic_simplify 1.12% : 0.000004s : 30: predicate.cast_eliminate 0.35% : 0.000001s : 8: predicate.check_bprop_eliminate 0.29% : 0.000001s : 8: predicate.compare_switch_simplify 0.12% : 0.000000s : 4: predicate.const_output_eliminate 0.28% : 0.000001s : 8: predicate.depend_value_elim 1.24% : 0.000004s : 30: predicate.dict_get_item_const_eliminator 1.34% : 0.000005s : 30: predicate.dict_get_item_eliminator 1.15% : 0.000004s : 30: predicate.dict_set_item_eliminator 0.54% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.14% : 0.000000s : 4: predicate.elim_not_effective 0.24% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.34% : 0.000005s : 34: predicate.environ_add_const_eliminate 1.22% : 0.000004s : 34: predicate.environ_get_add_eliminate 1.23% : 0.000004s : 34: predicate.environ_get_depend_swap 1.59% : 0.000005s : 42: predicate.environ_get_eliminate 1.20% : 0.000004s : 34: predicate.environ_get_set_eliminate 2.16% : 0.000007s : 54: predicate.exchange_switch_depend_value 3.07% : 0.000010s : 54: predicate.float_depend_g_call 0.29% : 0.000001s : 8: predicate.float_environ_get_switch 0.43% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.11% : 0.000000s : 4: predicate.fold_const_symbol 0.40% : 0.000001s : 8: predicate.get_grad_eliminate 0.13% : 0.000000s : 4: predicate.graph_param_transform 0.32% : 0.000001s : 8: predicate.incorporate_call 0.28% : 0.000001s : 8: predicate.incorporate_call_switch 6.08% : 0.000021s : 108: predicate.inline 0.44% : 0.000001s : 8: predicate.inline_without_move 0.19% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.47% : 0.000002s : 8: predicate.less_batch_normalization 1.59% : 0.000005s : 41: predicate.list_to_tuple_eliminator_ 2.56% : 0.000009s : 71: predicate.load_eliminater 0.56% : 0.000002s : 4: predicate.loop_unroll_after_grad 4.20% : 0.000014s : 97: predicate.loop_unroll_before_grad 1.54% : 0.000005s : 38: predicate.make_slice_get_slice_eliminator 0.33% : 0.000001s : 8: predicate.merge_addn 0.32% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.33% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.06% : 0.000004s : 30: predicate.minmaximum_grad 0.64% : 0.000002s : 4: predicate.mutable_eliminate 0.21% : 0.000001s : 4: predicate.opt_reshape 0.23% : 0.000001s : 4: predicate.parallel_virtual_node 3.08% : 0.000011s : 54: predicate.partial_defer_inline 1.55% : 0.000005s : 37: predicate.partial_eliminate 1.15% : 0.000004s : 30: predicate.print_const_string_wrapper 0.36% : 0.000001s : 8: predicate.reduce_all_const_elim 1.52% : 0.000005s : 30: predicate.reduce_eliminate 2.55% : 0.000009s : 71: predicate.redundant_stop_gradient_eliminater 0.24% : 0.000001s : 8: predicate.remove_not_recompute_node 1.30% : 0.000004s : 41: predicate.replace_applicator 0.24% : 0.000001s : 8: predicate.replace_old_param 0.15% : 0.000001s : 4: predicate.reset_defer_inline 1.19% : 0.000004s : 30: predicate.reshape_eliminate 0.36% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.22% : 0.000001s : 4: predicate.row_tensor_eliminate 0.43% : 0.000001s : 8: predicate.same_eliminate 0.27% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.45% : 0.000002s : 8: predicate.shard_identity_eliminate 0.47% : 0.000002s : 8: predicate.special_op_eliminate 0.42% : 0.000001s : 8: predicate.specialize_transform 0.45% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.44% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.18% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.50% : 0.000009s : 54: predicate.switch_defer_inline 2.81% : 0.000010s : 62: predicate.switch_layer_defer_inline 7.99% : 0.000027s : 173: predicate.switch_simplify 1.16% : 0.000004s : 30: predicate.tile_eliminate 1.15% : 0.000004s : 30: predicate.transpose_eliminate 1.58% : 0.000005s : 38: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000006s : 38: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000005s : 38: predicate.tuple_list_get_item_depend_reorder 2.66% : 0.000009s : 49: predicate.tuple_list_get_item_eliminator 1.59% : 0.000005s : 38: predicate.tuple_list_get_set_item_eliminator 2.05% : 0.000007s : 46: predicate.tuple_list_set_item_eliminator 1.62% : 0.000006s : 41: predicate.tuple_to_list_eliminator_ 2.46% : 0.000008s : 71: predicate.updatestate_pure_node_eliminater 2.97% : 0.000010s : 79: predicate.updatestate_useless_node_eliminater 0.20% : 0.000001s : 4: predicate.value_based_eliminate 0.41% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.43% : 0.000001s : 8: predicate.virtual_output_eliminate 0.15% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.24% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005528 49 70.49% : 0.003897s : 26: func_graph_cloner_run.FuncGraphClonerGraph 29.51% : 0.001631s : 23: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.610332 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.30% : 0.007942s : 1: add_attr 0.30% : 0.007927s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000063s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000291s : 1: auto_monad 0.00% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.03% : 0.000869s : 1: bootstrap 0.00% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000023s : 1: environ_conv 0.03% : 0.000829s : 1: event_method 0.00% : 0.000022s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000016s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000012s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.02% : 0.000462s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000581s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000015s : 1: opt.transform.mutable_eliminate 0.10% : 0.002552s : 78: opt.transform.opt_a 0.00% : 0.000029s : 1: opt.transform.opt_after_cconv 0.00% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000108s : 28: opt.transform.opt_b 0.00% : 0.000047s : 2: opt.transform.opt_trans_graph 0.00% : 0.000040s : 4: opt.transform.symbol_engine_opt 0.28% : 0.007384s : 1: opt_a 0.00% : 0.000101s : 1: opt_after_cconv 0.02% : 0.000466s : 1: opt_after_jit_grad 0.01% : 0.000208s : 1: opt_b 0.39% : 0.010148s : 1: optimize 0.00% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000047s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000024s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000028s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000104s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.00% : 0.000019s : 1: remove_dup_value 0.09% : 0.002259s : 1: renormalize.infer 0.06% : 0.001492s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000031s : 1: rewriter_after_opt_a 0.02% : 0.000544s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000016s : 1: split_matmul_comm_elemetwise 0.00% : 0.000028s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000085s : 1: symbol_engine_optimizer 88.07% : 2.298929s : 1: task_emit 0.00% : 0.000075s : 1: tuple_transform 10.19% : 0.266000s : 1: type_inference 0.00% : 0.000086s : 1: validate TotalTime = 2.63849, [24] [bootstrap]: 0.00085883 [type_inference]: 0.0601305 [event_method]: 0.00018606 [auto_monad]: 0.00016061 [graph_reusing]: 7.01999e-06 [inline]: 2.86e-06 [add_attr]: 0.00883227, [1] [add_attr_with_inline]: 0.00881676, [1] [Cycle 1]: 0.00014717, [2] [tag_attr]: 4.061e-05 [meta_addattr_fg_expand]: 1.637e-05 [parallel-infer-symbol]: 3.73001e-06 [pre_auto_parallel]: 6.137e-05 [insert-virtual-dataset]: 2.89001e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.22999e-06 [pipeline_split]: 1.87999e-06 [optimize]: 0.00580124, [53] [py_interpret_to_execute]: 4.53001e-06 [rewriter_before_opt_a]: 0.00029347 [opt_a]: 0.00339691, [2] [Cycle 1]: 0.00282034, [45] [expand_dump_flag]: 3.85e-06 [switch_simplify]: 8.356e-05 [loop_unroll]: 3.935e-05 [a_1]: 0.00067352 [with_stream_mark]: 1.454e-05 [recompute_prepare]: 7.49002e-06 [updatestate_depend_eliminate]: 1.304e-05 [updatestate_assign_eliminate]: 1.148e-05 [updatestate_loads_eliminate]: 3.43e-06 [parameter_eliminate]: 2.24001e-06 [a_2]: 7.984e-05 [accelerated_algorithm]: 6.59001e-06 [shard]: 2.07001e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 5.71e-06 [merge_send_recv]: 4.486e-05 [auto_parallel]: 7.23999e-06 [parallel]: 7.934e-05 [flash_sp]: 3.362e-05 [merge_comm]: 4.13999e-06 [allreduce_fusion]: 1.157e-05 [matmul_add_comm_reduction]: 1.832e-05 [allreduce_slice_to_reducescatter]: 8.84003e-06 [virtual_shard_identity]: 8.48001e-06 [virtual_dataset]: 6.38e-06 [get_grad_eliminate_]: 6.00002e-06 [virtual_output]: 6.49001e-06 [merge_forward]: 4.66002e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 1.76e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.062e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.016e-05 [set_forward_comm_id_for_comm_node_pass]: 1.244e-05 [meta_fg_expand]: 3.53999e-06 [flash_sp_send_recv_attached]: 2.58e-06 [receive_attached]: 1.83e-05 [after_resolve]: 9.93998e-06 [a_after_grad]: 8.69e-06 [renormalize]: 0.0011476 [add_forward_monad_depend]: 6.04999e-06 [auto_monad_grad]: 3.03e-06 [auto_monad_eliminator]: 2.611e-05 [cse]: 5.538e-05 [a_3]: 4.538e-05 [Cycle 2]: 0.00056637, [45] [expand_dump_flag]: 1.19e-06 [switch_simplify]: 6.86999e-06 [loop_unroll]: 6.09001e-06 [a_1]: 9.679e-05 [with_stream_mark]: 1.098e-05 [recompute_prepare]: 5.89999e-06 [updatestate_depend_eliminate]: 3.13e-06 [updatestate_assign_eliminate]: 2.75002e-06 [updatestate_loads_eliminate]: 2.86e-06 [parameter_eliminate]: 1.10999e-06 [a_2]: 6.607e-05 [accelerated_algorithm]: 5.60001e-06 [shard]: 1.22e-06 [meta_shard_fg_expand]: 1.23002e-06 [shard_inline]: 5.66e-06 [merge_send_recv]: 4.57998e-06 [auto_parallel]: 5.22999e-06 [parallel]: 4.13001e-06 [flash_sp]: 3.35e-06 [merge_comm]: 3.10998e-06 [allreduce_fusion]: 2.94001e-06 [matmul_add_comm_reduction]: 5.17e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 6.10002e-06 [virtual_dataset]: 5.64e-06 [get_grad_eliminate_]: 5.36998e-06 [virtual_output]: 5.29e-06 [merge_forward]: 2.86e-06 [cell_reuse_recompute_pass]: 1.34998e-06 [offload_activation]: 6.47001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.178e-05 [merge_recompute_call_nodes]: 7.10017e-07 [before_grad]: 8.22e-06 [set_forward_comm_id_for_comm_node_pass]: 3.36001e-06 [meta_fg_expand]: 1.97999e-06 [flash_sp_send_recv_attached]: 8.59989e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 8.22e-06 [a_after_grad]: 7.65e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.17e-06 [auto_monad_grad]: 1.10999e-06 [auto_monad_eliminator]: 6.48e-06 [cse]: 1.562e-05 [a_3]: 3.278e-05 [py_interpret_to_execute_after_opt_a]: 3.98001e-06 [slice_cell_reuse_recomputed_activation]: 2.12001e-06 [rewriter_after_opt_a]: 2.773e-05 [convert_after_rewriter]: 1.47001e-06 [order_py_execute_after_rewriter]: 1.19e-06 [mutable_eliminate]: 0.00058263 [opt_b]: 0.00019379, [1] [Cycle 1]: 0.00018772, [7] [b_1]: 0.00011574 [b_2]: 7.03998e-06 [updatestate_depend_eliminate]: 5.29e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.43002e-06 [renormalize]: 4.89992e-07 [cse]: 2.045e-05 [optimize_parallel_all_gather_comm]: 2.721e-05 [overlap_param_gather]: 1.173e-05 [cconv]: 2.335e-05 [loop_unroll]: 0.00043031 [opt_after_cconv]: 9.578e-05, [1] [Cycle 1]: 9.025e-05, [7] [c_1]: 2.447e-05 [parameter_eliminate]: 2.45002e-06 [updatestate_depend_eliminate]: 5.45001e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 2.34001e-06 [cse]: 2.018e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 1.735e-05 [tuple_transform]: 6.389e-05, [1] [Cycle 1]: 5.967e-05, [4] [d_1]: 3.463e-05 [none_parameter_eliminate]: 1.67001e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 6.16998e-06 [partial_unused_args_eliminate]: 1.82001e-06 [add_recomputation]: 6.023e-05 [cse_after_recomputation]: 2.276e-05, [1] [Cycle 1]: 1.868e-05, [1] [cse]: 1.344e-05 [environ_conv]: 1.499e-05 [swap_dp_allreduce_reducescatter]: 2.436e-05 [bias_add_comm_swap]: 1.066e-05 [label_micro_interleaved_index]: 1.335e-05 [label_fine_grained_interleaved_index]: 2.56e-06 [merge_cast_opt]: 1.65001e-06 [slice_recompute_activation]: 2.10002e-06 [micro_interleaved_order_control]: 2.55002e-06 [assign_add_opt]: 1.52001e-06 [ForceFp32Comm]: 1.05001e-06 [remove_cast_before_assign_add]: 9.94999e-06 [full_micro_interleaved_order_control]: 1.091e-05 [reorder_send_recv_between_fp_bp]: 2.62001e-06 [comm_op_add_attrs]: 1.52001e-06 [add_comm_op_reuse_tag]: 1.05001e-06 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 8.55001e-06 [overlap_opt_shard_in_pipeline]: 1.449e-05 [overlap_opt_shard_grad_in_pipeline]: 2.16e-06 [control_data_broadcast_order]: 1.295e-05 [grouped_pairwise_exchange_alltoall]: 1.58002e-06 [offloading_packed_experts]: 4.10998e-06 [overlap_recompute_and_grad_model_parallel]: 1.35e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.45001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.29999e-06 [overlap_grad_ring_attention]: 1.952e-05 [overlap_grad_flash_sp]: 4.303e-05 [begin_end_overlap_inline]: 6.40022e-07 [split_matmul_comm_elemetwise]: 1.061e-05 [split_layernorm_comm]: 1.89e-06 [handle_group_info]: 1.34e-06 [symbol_engine_optimizer]: 7.237e-05, [1] [Cycle 1]: 6.829e-05, [6] [build]: 2.66e-06 [elim_shapecalc]: 9.69999e-06 [elim_not_effective]: 1.231e-05 [opt_reshape]: 6.38e-06 [fold_const_symbol]: 9.48002e-06 [renormalize]: 1.69995e-07 [detach_backward]: 1.87001e-06 [pipeline_parallel_scheduler]: 1.57001e-06 [auto_monad_reorder]: 2.407e-05 [get_jit_bprop_graph]: 1.49998e-06 [rewriter_after_jit_bprop_graph]: 3.26001e-06 [opt_after_jit_grad]: 0.00046919 [validate]: 5.769e-05 [backend_pass]: 9.89996e-07 [task_emit]: 2.56157 [execute]: 1.063e-05 Sums bootstrap : 0.000859s : 0.03% type_inference : 0.060130s : 2.29% event_method : 0.000186s : 0.01% auto_monad : 0.000161s : 0.01% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000041s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000016s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000061s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000293s : 0.01% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000090s : 0.00% optimize.opt_a.loop_unroll : 0.000045s : 0.00% optimize.opt_a.a_1 : 0.000770s : 0.03% optimize.opt_a.with_stream_mark : 0.000026s : 0.00% optimize.opt_a.recompute_prepare : 0.000013s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000146s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.00% optimize.opt_a.merge_send_recv : 0.000049s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000083s : 0.00% optimize.opt_a.flash_sp : 0.000037s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000015s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.00% optimize.opt_a.virtual_dataset : 0.000012s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000018s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000019s : 0.00% optimize.opt_a.after_resolve : 0.000018s : 0.00% optimize.opt_a.a_after_grad : 0.000016s : 0.00% optimize.opt_a.renormalize : 0.001148s : 0.04% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.00% optimize.opt_a.cse : 0.000071s : 0.00% optimize.opt_a.a_3 : 0.000078s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000583s : 0.02% optimize.opt_b.b_1 : 0.000116s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000023s : 0.00% optimize.loop_unroll : 0.000430s : 0.02% optimize.opt_after_cconv.c_1 : 0.000024s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.00% optimize.tuple_transform.d_1 : 0.000035s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000060s : 0.00% optimize.cse_after_recomputation.cse : 0.000013s : 0.00% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000024s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000002s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000014s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000014s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000020s : 0.00% optimize.overlap_grad_flash_sp : 0.000043s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000469s : 0.02% validate : 0.000058s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.561570s : 97.45% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000205 26 0.89% : 0.000002s : 2: substitution.elim_not_effective 0.64% : 0.000001s : 2: substitution.fold_const_symbol 2.39% : 0.000005s : 3: substitution.graph_param_transform 79.95% : 0.000164s : 6: substitution.inline 1.66% : 0.000003s : 4: substitution.j_node_and_user_rematch 6.32% : 0.000013s : 4: substitution.remove_not_recompute_node 1.54% : 0.000003s : 2: substitution.replace_old_param 6.60% : 0.000014s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.060016 2 96.38% : 0.057846s : 1: type_inference.infer 3.62% : 0.002170s : 1: type_inference.specialize ------[replace.] 0.000081 9 71.43% : 0.000058s : 6: replace.inline 28.57% : 0.000023s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000172 9 93.05% : 0.000160s : 6: match.inline 6.95% : 0.000012s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000183 1162 1.05% : 0.000002s : 13: predicate.accumulaten_eliminater 0.82% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.48% : 0.000001s : 6: predicate.addn_check_dump 0.96% : 0.000002s : 13: predicate.addn_zero_filter 0.92% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.17% : 0.000004s : 19: predicate.arithmetic_simplify 0.99% : 0.000002s : 13: predicate.cast_eliminate 0.49% : 0.000001s : 6: predicate.check_bprop_eliminate 0.47% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.46% : 0.000001s : 6: predicate.depend_value_elim 1.04% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.20% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.13% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.95% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 3: predicate.elim_not_effective 0.28% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000002s : 16: predicate.environ_add_const_eliminate 1.10% : 0.000002s : 16: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 16: predicate.environ_get_depend_swap 1.63% : 0.000003s : 22: predicate.environ_get_eliminate 1.15% : 0.000002s : 16: predicate.environ_get_set_eliminate 1.67% : 0.000003s : 22: predicate.exchange_switch_depend_value 2.60% : 0.000005s : 22: predicate.float_depend_g_call 0.43% : 0.000001s : 6: predicate.float_environ_get_switch 0.67% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.63% : 0.000001s : 6: predicate.get_grad_eliminate 0.16% : 0.000000s : 3: predicate.graph_param_transform 0.49% : 0.000001s : 6: predicate.incorporate_call 0.44% : 0.000001s : 6: predicate.incorporate_call_switch 5.99% : 0.000011s : 53: predicate.inline 0.65% : 0.000001s : 6: predicate.inline_without_move 0.26% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.65% : 0.000001s : 6: predicate.less_batch_normalization 1.77% : 0.000003s : 22: predicate.list_to_tuple_eliminator_ 2.52% : 0.000005s : 35: predicate.load_eliminater 0.90% : 0.000002s : 3: predicate.loop_unroll_after_grad 3.13% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.58% : 0.000003s : 19: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 6: predicate.merge_addn 0.41% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.46% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.94% : 0.000002s : 13: predicate.minmaximum_grad 1.00% : 0.000002s : 3: predicate.mutable_eliminate 0.27% : 0.000000s : 3: predicate.opt_reshape 0.29% : 0.000001s : 3: predicate.parallel_virtual_node 2.26% : 0.000004s : 22: predicate.partial_defer_inline 1.50% : 0.000003s : 19: predicate.partial_eliminate 1.00% : 0.000002s : 13: predicate.print_const_string_wrapper 0.50% : 0.000001s : 6: predicate.reduce_all_const_elim 1.24% : 0.000002s : 13: predicate.reduce_eliminate 2.70% : 0.000005s : 35: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 6: predicate.remove_not_recompute_node 1.27% : 0.000002s : 22: predicate.replace_applicator 0.47% : 0.000001s : 6: predicate.replace_old_param 0.26% : 0.000000s : 3: predicate.reset_defer_inline 1.02% : 0.000002s : 13: predicate.reshape_eliminate 0.55% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.31% : 0.000001s : 3: predicate.row_tensor_eliminate 0.58% : 0.000001s : 6: predicate.same_eliminate 0.36% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.64% : 0.000001s : 6: predicate.shard_identity_eliminate 0.61% : 0.000001s : 6: predicate.special_op_eliminate 0.67% : 0.000001s : 6: predicate.specialize_transform 0.63% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.60% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.28% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.75% : 0.000003s : 22: predicate.switch_defer_inline 2.38% : 0.000004s : 28: predicate.switch_layer_defer_inline 6.23% : 0.000011s : 73: predicate.switch_simplify 0.98% : 0.000002s : 13: predicate.tile_eliminate 1.15% : 0.000002s : 13: predicate.transpose_eliminate 1.65% : 0.000003s : 19: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000003s : 19: predicate.tuple_list_get_item_const_eliminator 1.60% : 0.000003s : 19: predicate.tuple_list_get_item_depend_reorder 3.23% : 0.000006s : 28: predicate.tuple_list_get_item_eliminator 1.60% : 0.000003s : 19: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.66% : 0.000003s : 22: predicate.tuple_to_list_eliminator_ 2.40% : 0.000004s : 35: predicate.updatestate_pure_node_eliminater 3.06% : 0.000006s : 41: predicate.updatestate_useless_node_eliminater 0.29% : 0.000001s : 3: predicate.value_based_eliminate 0.58% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 6: predicate.virtual_output_eliminate 0.21% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001221 16 53.20% : 0.000650s : 8: func_graph_cloner_run.FuncGraphClonerGraph 46.80% : 0.000572s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.655610 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.33% : 0.008838s : 1: add_attr 0.33% : 0.008821s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000065s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000172s : 1: auto_monad 0.00% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.03% : 0.000903s : 1: bootstrap 0.00% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000019s : 1: environ_conv 0.01% : 0.000202s : 1: event_method 0.00% : 0.000024s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.02% : 0.000440s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000592s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000015s : 1: opt.transform.mutable_eliminate 0.05% : 0.001217s : 78: opt.transform.opt_a 0.00% : 0.000023s : 1: opt.transform.opt_after_cconv 0.00% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000093s : 28: opt.transform.opt_b 0.00% : 0.000039s : 2: opt.transform.opt_trans_graph 0.00% : 0.000034s : 4: opt.transform.symbol_engine_opt 0.13% : 0.003400s : 1: opt_a 0.00% : 0.000099s : 1: opt_after_cconv 0.02% : 0.000479s : 1: opt_after_jit_grad 0.01% : 0.000197s : 1: opt_b 0.22% : 0.005805s : 1: optimize 0.00% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000046s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000022s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000018s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000066s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.00% : 0.000021s : 1: remove_dup_value 0.02% : 0.000562s : 1: renormalize.infer 0.02% : 0.000576s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000031s : 1: rewriter_after_opt_a 0.01% : 0.000299s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000028s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000075s : 1: symbol_engine_optimizer 96.46% : 2.561607s : 1: task_emit 0.00% : 0.000067s : 1: tuple_transform 2.27% : 0.060158s : 1: type_inference 0.00% : 0.000086s : 1: validate TotalTime = 2.7113, [24] [bootstrap]: 0.00083417 [type_inference]: 0.186704 [event_method]: 0.0003659 [auto_monad]: 0.00030414 [graph_reusing]: 1.175e-05 [inline]: 2.56e-06 [add_attr]: 0.00791937, [1] [add_attr_with_inline]: 0.00790489, [1] [Cycle 1]: 0.00019699, [2] [tag_attr]: 7.123e-05 [meta_addattr_fg_expand]: 2.765e-05 [parallel-infer-symbol]: 3.67002e-06 [pre_auto_parallel]: 9.841e-05 [insert-virtual-dataset]: 2.52001e-06 [parallel-infer-symbol-second]: 7.40023e-07 [dataset_repeat_opt]: 2.16998e-06 [pipeline_split]: 1.74998e-06 [optimize]: 0.0109405, [53] [py_interpret_to_execute]: 5.43002e-06 [rewriter_before_opt_a]: 0.00041399 [opt_a]: 0.00780215, [2] [Cycle 1]: 0.00695557, [45] [expand_dump_flag]: 6.38998e-06 [switch_simplify]: 0.00027458 [loop_unroll]: 7.575e-05 [a_1]: 0.00165223 [with_stream_mark]: 2.775e-05 [recompute_prepare]: 1.659e-05 [updatestate_depend_eliminate]: 2.316e-05 [updatestate_assign_eliminate]: 1.736e-05 [updatestate_loads_eliminate]: 5.80002e-06 [parameter_eliminate]: 2.37999e-06 [a_2]: 0.00026756 [accelerated_algorithm]: 9.58002e-06 [shard]: 2.16e-06 [meta_shard_fg_expand]: 3.82002e-06 [shard_inline]: 8.33999e-06 [merge_send_recv]: 6.272e-05 [auto_parallel]: 1.048e-05 [parallel]: 0.00011522 [flash_sp]: 4.92e-05 [merge_comm]: 7.07997e-06 [allreduce_fusion]: 1.756e-05 [matmul_add_comm_reduction]: 2.511e-05 [allreduce_slice_to_reducescatter]: 1.311e-05 [virtual_shard_identity]: 1.445e-05 [virtual_dataset]: 9.16998e-06 [get_grad_eliminate_]: 8.42e-06 [virtual_output]: 8.77e-06 [merge_forward]: 6.77002e-06 [cell_reuse_recompute_pass]: 1.50999e-06 [offload_activation]: 2.42e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.243e-05 [merge_recompute_call_nodes]: 1.69998e-06 [before_grad]: 1.623e-05 [set_forward_comm_id_for_comm_node_pass]: 1.855e-05 [meta_fg_expand]: 5.77999e-06 [flash_sp_send_recv_attached]: 3.65e-06 [receive_attached]: 2.707e-05 [after_resolve]: 1.241e-05 [a_after_grad]: 1.264e-05 [renormalize]: 0.00335849 [add_forward_monad_depend]: 8.02998e-06 [auto_monad_grad]: 2.66e-06 [auto_monad_eliminator]: 3.927e-05 [cse]: 0.0002738 [a_3]: 7.013e-05 [Cycle 2]: 0.00083349, [45] [expand_dump_flag]: 2.93e-06 [switch_simplify]: 1.018e-05 [loop_unroll]: 8.82999e-06 [a_1]: 0.00016278 [with_stream_mark]: 2.109e-05 [recompute_prepare]: 8.59998e-06 [updatestate_depend_eliminate]: 5.52001e-06 [updatestate_assign_eliminate]: 4.22e-06 [updatestate_loads_eliminate]: 3.61999e-06 [parameter_eliminate]: 1.17999e-06 [a_2]: 0.00010148 [accelerated_algorithm]: 8.96002e-06 [shard]: 1.81e-06 [meta_shard_fg_expand]: 2.24999e-06 [shard_inline]: 7.66999e-06 [merge_send_recv]: 7.58999e-06 [auto_parallel]: 7.97e-06 [parallel]: 7.11999e-06 [flash_sp]: 3.26999e-06 [merge_comm]: 5.17e-06 [allreduce_fusion]: 5.07e-06 [matmul_add_comm_reduction]: 8.47e-06 [allreduce_slice_to_reducescatter]: 9.79984e-07 [virtual_shard_identity]: 9.46998e-06 [virtual_dataset]: 7.63999e-06 [get_grad_eliminate_]: 7.48e-06 [virtual_output]: 8.00999e-06 [merge_forward]: 4.87e-06 [cell_reuse_recompute_pass]: 1.82001e-06 [offload_activation]: 1.05e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.012e-05 [merge_recompute_call_nodes]: 1.27e-06 [before_grad]: 1.422e-05 [set_forward_comm_id_for_comm_node_pass]: 5.00999e-06 [meta_fg_expand]: 3.35e-06 [flash_sp_send_recv_attached]: 7.59988e-07 [receive_attached]: 1.60999e-06 [after_resolve]: 1.147e-05 [a_after_grad]: 1.197e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.97999e-06 [auto_monad_grad]: 1.23002e-06 [auto_monad_eliminator]: 1.259e-05 [cse]: 4.462e-05 [a_3]: 5.245e-05 [py_interpret_to_execute_after_opt_a]: 6.09999e-06 [slice_cell_reuse_recomputed_activation]: 2.37999e-06 [rewriter_after_opt_a]: 4.265e-05 [convert_after_rewriter]: 1.52999e-06 [order_py_execute_after_rewriter]: 1.99999e-06 [mutable_eliminate]: 0.00072276 [opt_b]: 0.00028887, [1] [Cycle 1]: 0.0002811, [7] [b_1]: 0.00017583 [b_2]: 1.038e-05 [updatestate_depend_eliminate]: 8.92e-06 [updatestate_assign_eliminate]: 4.3e-06 [updatestate_loads_eliminate]: 3.83999e-06 [renormalize]: 5.89993e-07 [cse]: 3.926e-05 [optimize_parallel_all_gather_comm]: 3.726e-05 [overlap_param_gather]: 1.33e-05 [cconv]: 2.571e-05 [loop_unroll]: 0.00049327 [opt_after_cconv]: 0.0001379, [1] [Cycle 1]: 0.00013099, [7] [c_1]: 3.785e-05 [parameter_eliminate]: 3.43e-06 [updatestate_depend_eliminate]: 8.27998e-06 [updatestate_assign_eliminate]: 4.32e-06 [updatestate_loads_eliminate]: 3.85e-06 [cse]: 3.729e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 4.512e-05 [tuple_transform]: 9.06e-05, [1] [Cycle 1]: 8.578e-05, [4] [d_1]: 5.46e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 9.72999e-06 [partial_unused_args_eliminate]: 1.76003e-06 [add_recomputation]: 8.432e-05 [cse_after_recomputation]: 3.021e-05, [1] [Cycle 1]: 2.568e-05, [1] [cse]: 1.951e-05 [environ_conv]: 1.854e-05 [swap_dp_allreduce_reducescatter]: 3.084e-05 [bias_add_comm_swap]: 1.428e-05 [label_micro_interleaved_index]: 1.672e-05 [label_fine_grained_interleaved_index]: 2.54999e-06 [merge_cast_opt]: 1.40999e-06 [slice_recompute_activation]: 2.11998e-06 [micro_interleaved_order_control]: 2.32999e-06 [assign_add_opt]: 1.33002e-06 [ForceFp32Comm]: 7.60017e-07 [remove_cast_before_assign_add]: 1.17e-05 [full_micro_interleaved_order_control]: 1.278e-05 [reorder_send_recv_between_fp_bp]: 3.06001e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.03001e-06 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.264e-05 [overlap_opt_shard_in_pipeline]: 2.446e-05 [overlap_opt_shard_grad_in_pipeline]: 2.01e-06 [control_data_broadcast_order]: 1.917e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 5.46e-06 [overlap_recompute_and_grad_model_parallel]: 1.726e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.35999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42e-06 [overlap_recompute_comm]: 2.76e-06 [overlap_grad_ring_attention]: 2.747e-05 [overlap_grad_flash_sp]: 5.833e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 1.306e-05 [split_layernorm_comm]: 1.87999e-06 [handle_group_info]: 9.5999e-07 [symbol_engine_optimizer]: 0.00010034, [1] [Cycle 1]: 9.449e-05, [6] [build]: 4.25e-06 [elim_shapecalc]: 1.538e-05 [elim_not_effective]: 1.852e-05 [opt_reshape]: 9.90002e-06 [fold_const_symbol]: 1.422e-05 [renormalize]: 6.00005e-07 [detach_backward]: 2.31e-06 [pipeline_parallel_scheduler]: 1.71e-06 [auto_monad_reorder]: 3.944e-05 [get_jit_bprop_graph]: 2.04e-06 [rewriter_after_jit_bprop_graph]: 4.17e-06 [opt_after_jit_grad]: 0.0005462 [validate]: 7.108e-05 [backend_pass]: 1.07e-06 [task_emit]: 2.50284 [execute]: 1.177e-05 Sums bootstrap : 0.000834s : 0.03% type_inference : 0.186704s : 6.91% event_method : 0.000366s : 0.01% auto_monad : 0.000304s : 0.01% graph_reusing : 0.000012s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000071s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000028s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000098s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000414s : 0.02% optimize.opt_a.expand_dump_flag : 0.000009s : 0.00% optimize.opt_a.switch_simplify : 0.000285s : 0.01% optimize.opt_a.loop_unroll : 0.000085s : 0.00% optimize.opt_a.a_1 : 0.001815s : 0.07% optimize.opt_a.with_stream_mark : 0.000049s : 0.00% optimize.opt_a.recompute_prepare : 0.000025s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000029s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000022s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000369s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000019s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000006s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.00% optimize.opt_a.merge_send_recv : 0.000070s : 0.00% optimize.opt_a.auto_parallel : 0.000018s : 0.00% optimize.opt_a.parallel : 0.000122s : 0.00% optimize.opt_a.flash_sp : 0.000052s : 0.00% optimize.opt_a.merge_comm : 0.000012s : 0.00% optimize.opt_a.allreduce_fusion : 0.000023s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000034s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000014s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000024s : 0.00% optimize.opt_a.virtual_dataset : 0.000017s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.00% optimize.opt_a.virtual_output : 0.000017s : 0.00% optimize.opt_a.merge_forward : 0.000012s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000035s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000053s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000030s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000024s : 0.00% optimize.opt_a.meta_fg_expand : 0.000009s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000029s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.00% optimize.opt_a.a_after_grad : 0.000025s : 0.00% optimize.opt_a.renormalize : 0.003359s : 0.12% optimize.opt_a.add_forward_monad_depend : 0.000010s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000052s : 0.00% optimize.opt_a.cse : 0.000318s : 0.01% optimize.opt_a.a_3 : 0.000123s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000043s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000723s : 0.03% optimize.opt_b.b_1 : 0.000176s : 0.01% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000039s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000037s : 0.00% optimize.overlap_param_gather : 0.000013s : 0.00% optimize.cconv : 0.000026s : 0.00% optimize.loop_unroll : 0.000493s : 0.02% optimize.opt_after_cconv.c_1 : 0.000038s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000037s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000045s : 0.00% optimize.tuple_transform.d_1 : 0.000055s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000084s : 0.00% optimize.cse_after_recomputation.cse : 0.000020s : 0.00% optimize.environ_conv : 0.000019s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000031s : 0.00% optimize.bias_add_comm_swap : 0.000014s : 0.00% optimize.label_micro_interleaved_index : 0.000017s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000012s : 0.00% optimize.full_micro_interleaved_order_control : 0.000013s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000013s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000024s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000017s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000027s : 0.00% optimize.overlap_grad_flash_sp : 0.000058s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000013s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000015s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000014s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000001s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000039s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000546s : 0.02% validate : 0.000071s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.502840s : 92.63% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 0.000526 58 0.72% : 0.000004s : 1: substitution.depend_value_elim 0.47% : 0.000002s : 4: substitution.elim_not_effective 0.46% : 0.000002s : 4: substitution.fold_const_symbol 1.35% : 0.000007s : 5: substitution.graph_param_transform 83.09% : 0.000437s : 19: substitution.inline 1.12% : 0.000006s : 8: substitution.j_node_and_user_rematch 3.92% : 0.000021s : 8: substitution.remove_not_recompute_node 0.66% : 0.000003s : 2: substitution.replace_old_param 5.64% : 0.000030s : 5: substitution.switch_simplify 2.57% : 0.000014s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.186574 2 97.52% : 0.181939s : 1: type_inference.infer 2.48% : 0.004635s : 1: type_inference.specialize ------[replace.] 0.000246 27 5.91% : 0.000015s : 1: replace.depend_value_elim 54.53% : 0.000134s : 19: replace.inline 31.69% : 0.000078s : 5: replace.switch_simplify 7.87% : 0.000019s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000469 27 0.65% : 0.000003s : 1: match.depend_value_elim 91.09% : 0.000427s : 19: match.inline 5.69% : 0.000027s : 5: match.switch_simplify 2.56% : 0.000012s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000410 2598 1.11% : 0.000005s : 32: predicate.accumulaten_eliminater 0.50% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.54% : 0.000002s : 17: predicate.addn_check_dump 1.09% : 0.000004s : 32: predicate.addn_zero_filter 1.03% : 0.000004s : 32: predicate.adjust_all_reduce_mul_add 2.32% : 0.000009s : 49: predicate.arithmetic_simplify 1.13% : 0.000005s : 32: predicate.cast_eliminate 0.40% : 0.000002s : 10: predicate.check_bprop_eliminate 0.53% : 0.000002s : 17: predicate.compare_switch_simplify 0.11% : 0.000000s : 5: predicate.const_output_eliminate 0.61% : 0.000002s : 17: predicate.depend_value_elim 1.15% : 0.000005s : 32: predicate.dict_get_item_const_eliminator 1.34% : 0.000005s : 32: predicate.dict_get_item_eliminator 1.07% : 0.000004s : 32: predicate.dict_set_item_eliminator 0.71% : 0.000003s : 10: predicate.dumpgradient_eliminate 0.20% : 0.000001s : 5: predicate.elim_not_effective 0.26% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000005s : 37: predicate.environ_add_const_eliminate 1.23% : 0.000005s : 37: predicate.environ_get_add_eliminate 1.22% : 0.000005s : 37: predicate.environ_get_depend_swap 1.89% : 0.000008s : 54: predicate.environ_get_eliminate 1.19% : 0.000005s : 37: predicate.environ_get_set_eliminate 1.80% : 0.000007s : 53: predicate.exchange_switch_depend_value 2.78% : 0.000011s : 53: predicate.float_depend_g_call 0.54% : 0.000002s : 17: predicate.float_environ_get_switch 0.69% : 0.000003s : 22: predicate.float_tuple_getitem_switch 0.10% : 0.000000s : 5: predicate.fold_const_symbol 0.39% : 0.000002s : 10: predicate.get_grad_eliminate 0.16% : 0.000001s : 5: predicate.graph_param_transform 0.58% : 0.000002s : 17: predicate.incorporate_call 0.52% : 0.000002s : 17: predicate.incorporate_call_switch 6.05% : 0.000025s : 122: predicate.inline 0.60% : 0.000002s : 10: predicate.inline_without_move 0.18% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.50% : 0.000002s : 10: predicate.less_batch_normalization 1.56% : 0.000006s : 44: predicate.list_to_tuple_eliminator_ 2.62% : 0.000011s : 76: predicate.load_eliminater 0.77% : 0.000003s : 5: predicate.loop_unroll_after_grad 3.19% : 0.000013s : 91: predicate.loop_unroll_before_grad 1.49% : 0.000006s : 42: predicate.make_slice_get_slice_eliminator 0.59% : 0.000002s : 17: predicate.merge_addn 0.35% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.35% : 0.000001s : 10: predicate.mini_step_allgather_replace 1.04% : 0.000004s : 32: predicate.minmaximum_grad 0.85% : 0.000003s : 5: predicate.mutable_eliminate 0.25% : 0.000001s : 5: predicate.opt_reshape 0.22% : 0.000001s : 5: predicate.parallel_virtual_node 2.47% : 0.000010s : 53: predicate.partial_defer_inline 1.50% : 0.000006s : 39: predicate.partial_eliminate 1.06% : 0.000004s : 32: predicate.print_const_string_wrapper 0.51% : 0.000002s : 15: predicate.reduce_all_const_elim 1.41% : 0.000006s : 32: predicate.reduce_eliminate 2.52% : 0.000010s : 76: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000002s : 10: predicate.remove_not_recompute_node 1.16% : 0.000005s : 44: predicate.replace_applicator 0.24% : 0.000001s : 10: predicate.replace_old_param 0.25% : 0.000001s : 5: predicate.reset_defer_inline 1.13% : 0.000005s : 32: predicate.reshape_eliminate 0.41% : 0.000002s : 10: predicate.row_tensor_add_zeros_like 0.22% : 0.000001s : 5: predicate.row_tensor_eliminate 0.60% : 0.000002s : 10: predicate.same_eliminate 0.39% : 0.000002s : 12: predicate.set_cell_output_no_recompute 0.58% : 0.000002s : 10: predicate.shard_identity_eliminate 0.40% : 0.000002s : 10: predicate.special_op_eliminate 0.74% : 0.000003s : 17: predicate.specialize_transform 0.52% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.48% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.24% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.95% : 0.000008s : 53: predicate.switch_defer_inline 2.32% : 0.000009s : 63: predicate.switch_layer_defer_inline 6.94% : 0.000028s : 176: predicate.switch_simplify 1.10% : 0.000005s : 32: predicate.tile_eliminate 1.15% : 0.000005s : 32: predicate.transpose_eliminate 1.68% : 0.000007s : 42: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000006s : 42: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000006s : 42: predicate.tuple_list_get_item_depend_reorder 2.89% : 0.000012s : 61: predicate.tuple_list_get_item_eliminator 1.59% : 0.000007s : 42: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000009s : 59: predicate.tuple_list_set_item_eliminator 1.55% : 0.000006s : 44: predicate.tuple_to_list_eliminator_ 2.41% : 0.000010s : 76: predicate.updatestate_pure_node_eliminater 3.07% : 0.000013s : 93: predicate.updatestate_useless_node_eliminater 0.22% : 0.000001s : 5: predicate.value_based_eliminate 0.49% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.43% : 0.000002s : 10: predicate.virtual_output_eliminate 0.18% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.27% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003295 48 57.01% : 0.001879s : 23: func_graph_cloner_run.FuncGraphClonerGraph 42.99% : 0.001417s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.736459 213 0.00% : 0.000004s : 1: ForceFp32Comm 0.29% : 0.007925s : 1: add_attr 0.29% : 0.007910s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000089s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000316s : 1: auto_monad 0.00% : 0.000044s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000017s : 1: bias_add_comm_swap 0.03% : 0.000885s : 1: bootstrap 0.00% : 0.000030s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000023s : 1: control_data_broadcast_order 0.00% : 0.000042s : 1: convert_after_rewriter 0.00% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000006s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000023s : 1: environ_conv 0.01% : 0.000382s : 1: event_method 0.00% : 0.000054s : 1: execute 0.00% : 0.000016s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000016s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000015s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000020s : 1: label_micro_interleaved_index 0.02% : 0.000503s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.03% : 0.000733s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000021s : 1: opt.transform.mutable_eliminate 0.10% : 0.002825s : 95: opt.transform.opt_a 0.00% : 0.000037s : 1: opt.transform.opt_after_cconv 0.00% : 0.000032s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000156s : 28: opt.transform.opt_b 0.00% : 0.000062s : 2: opt.transform.opt_trans_graph 0.00% : 0.000054s : 4: opt.transform.symbol_engine_opt 0.29% : 0.007806s : 1: opt_a 0.01% : 0.000141s : 1: opt_after_cconv 0.02% : 0.000556s : 1: opt_after_jit_grad 0.01% : 0.000293s : 1: opt_b 0.40% : 0.010946s : 1: optimize 0.00% : 0.000041s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000005s : 1: order_py_execute_after_rewriter 0.00% : 0.000062s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000031s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000028s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000017s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000020s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000006s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000104s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000015s : 1: remove_cast_before_assign_add 0.00% : 0.000050s : 1: remove_dup_value 0.07% : 0.001990s : 1: renormalize.infer 0.05% : 0.001357s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000047s : 1: rewriter_after_opt_a 0.02% : 0.000421s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000016s : 1: split_matmul_comm_elemetwise 0.00% : 0.000034s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000103s : 1: symbol_engine_optimizer 91.47% : 2.502946s : 1: task_emit 0.00% : 0.000094s : 1: tuple_transform 6.82% : 0.186729s : 1: type_inference 0.01% : 0.000145s : 1: validate TotalTime = 0.152158, [24] [bootstrap]: 0.00051525 [type_inference]: 0.131178 [event_method]: 0.00083626 [auto_monad]: 0.00017669 [graph_reusing]: 1.137e-05 [inline]: 2.06e-06 [add_attr]: 0.00359785, [1] [add_attr_with_inline]: 0.00358912, [1] [Cycle 1]: 9.435e-05, [2] [tag_attr]: 4.964e-05 [meta_addattr_fg_expand]: 1.545e-05 [parallel-infer-symbol]: 3.17002e-06 [pre_auto_parallel]: 6.836e-05 [insert-virtual-dataset]: 2.65002e-06 [parallel-infer-symbol-second]: 7.00005e-07 [dataset_repeat_opt]: 2.14999e-06 [pipeline_split]: 1.47999e-06 [optimize]: 0.00805784, [53] [py_interpret_to_execute]: 4.68999e-06 [rewriter_before_opt_a]: 0.00044465 [opt_a]: 0.00569996, [2] [Cycle 1]: 0.0050705, [45] [expand_dump_flag]: 6.29001e-06 [switch_simplify]: 0.00017932 [loop_unroll]: 8.051e-05 [a_1]: 0.00156865 [with_stream_mark]: 1.537e-05 [recompute_prepare]: 9.56e-06 [updatestate_depend_eliminate]: 3.58e-06 [updatestate_assign_eliminate]: 3.76999e-06 [updatestate_loads_eliminate]: 3.35e-06 [parameter_eliminate]: 1.71002e-06 [a_2]: 8.382e-05 [accelerated_algorithm]: 7.5e-06 [shard]: 1.67001e-06 [meta_shard_fg_expand]: 3.38e-06 [shard_inline]: 7.45998e-06 [merge_send_recv]: 8.08999e-06 [auto_parallel]: 6.09001e-06 [parallel]: 2.341e-05 [flash_sp]: 7.21001e-06 [merge_comm]: 3.53999e-06 [allreduce_fusion]: 3.15002e-06 [matmul_add_comm_reduction]: 8.70001e-06 [allreduce_slice_to_reducescatter]: 5.90022e-07 [virtual_shard_identity]: 8.50001e-06 [virtual_dataset]: 6.94001e-06 [get_grad_eliminate_]: 6.49001e-06 [virtual_output]: 7.03e-06 [merge_forward]: 4.07e-06 [cell_reuse_recompute_pass]: 1.15001e-06 [offload_activation]: 9.04998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.341e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.158e-05 [set_forward_comm_id_for_comm_node_pass]: 3.73001e-06 [meta_fg_expand]: 4.57998e-06 [flash_sp_send_recv_attached]: 2.74001e-06 [receive_attached]: 2.53998e-06 [after_resolve]: 1.109e-05 [a_after_grad]: 1.107e-05 [renormalize]: 0.00256936 [add_forward_monad_depend]: 5.57999e-06 [auto_monad_grad]: 1.78002e-06 [auto_monad_eliminator]: 1.518e-05 [cse]: 3.366e-05 [a_3]: 4.915e-05 [Cycle 2]: 0.00062009, [45] [expand_dump_flag]: 1.10001e-06 [switch_simplify]: 7.76001e-06 [loop_unroll]: 7.21001e-06 [a_1]: 0.00012857 [with_stream_mark]: 1.125e-05 [recompute_prepare]: 6.56999e-06 [updatestate_depend_eliminate]: 2.86e-06 [updatestate_assign_eliminate]: 2.31e-06 [updatestate_loads_eliminate]: 2.30002e-06 [parameter_eliminate]: 9.00007e-07 [a_2]: 7.328e-05 [accelerated_algorithm]: 6.39001e-06 [shard]: 1.02998e-06 [meta_shard_fg_expand]: 1.35001e-06 [shard_inline]: 5.96e-06 [merge_send_recv]: 4.38999e-06 [auto_parallel]: 5.43002e-06 [parallel]: 4.05998e-06 [flash_sp]: 3.26001e-06 [merge_comm]: 2.91e-06 [allreduce_fusion]: 2.68e-06 [matmul_add_comm_reduction]: 5.15999e-06 [allreduce_slice_to_reducescatter]: 4.69998e-07 [virtual_shard_identity]: 7.18998e-06 [virtual_dataset]: 6.07001e-06 [get_grad_eliminate_]: 5.90002e-06 [virtual_output]: 6.11e-06 [merge_forward]: 2.57001e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 6.01998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.24e-05 [merge_recompute_call_nodes]: 7.40023e-07 [before_grad]: 8.97e-06 [set_forward_comm_id_for_comm_node_pass]: 2.99001e-06 [meta_fg_expand]: 2.02001e-06 [flash_sp_send_recv_attached]: 8.29983e-07 [receive_attached]: 1.15999e-06 [after_resolve]: 8.68001e-06 [a_after_grad]: 9.44e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.19e-06 [auto_monad_grad]: 9.30013e-07 [auto_monad_eliminator]: 6.31e-06 [cse]: 1.422e-05 [a_3]: 3.743e-05 [py_interpret_to_execute_after_opt_a]: 4.54002e-06 [slice_cell_reuse_recomputed_activation]: 1.87001e-06 [rewriter_after_opt_a]: 1.625e-05 [convert_after_rewriter]: 1.85001e-06 [order_py_execute_after_rewriter]: 1.15999e-06 [mutable_eliminate]: 0.00046864 [opt_b]: 0.00020674, [1] [Cycle 1]: 0.00020042, [7] [b_1]: 0.00012831 [b_2]: 7.87e-06 [updatestate_depend_eliminate]: 5.37001e-06 [updatestate_assign_eliminate]: 2.63e-06 [updatestate_loads_eliminate]: 2.31998e-06 [renormalize]: 4.30009e-07 [cse]: 2.024e-05 [optimize_parallel_all_gather_comm]: 1.655e-05 [overlap_param_gather]: 2.16e-06 [cconv]: 3.59e-05 [loop_unroll]: 0.00043186 [opt_after_cconv]: 0.0001011, [1] [Cycle 1]: 9.557e-05, [7] [c_1]: 3.135e-05 [parameter_eliminate]: 2.22001e-06 [updatestate_depend_eliminate]: 5.44e-06 [updatestate_assign_eliminate]: 2.31e-06 [updatestate_loads_eliminate]: 2.22999e-06 [cse]: 1.959e-05 [renormalize]: 4.70027e-07 [remove_dup_value]: 1.513e-05 [tuple_transform]: 7.473e-05, [1] [Cycle 1]: 7.028e-05, [4] [d_1]: 4.363e-05 [none_parameter_eliminate]: 1.69998e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 6.84001e-06 [partial_unused_args_eliminate]: 1.66e-06 [add_recomputation]: 4.233e-05 [cse_after_recomputation]: 2.212e-05, [1] [Cycle 1]: 1.804e-05, [1] [cse]: 1.313e-05 [environ_conv]: 8.30999e-06 [swap_dp_allreduce_reducescatter]: 5.07e-06 [bias_add_comm_swap]: 2.13998e-06 [label_micro_interleaved_index]: 3.93999e-06 [label_fine_grained_interleaved_index]: 2.34999e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.00002e-06 [micro_interleaved_order_control]: 2.32999e-06 [assign_add_opt]: 1.17999e-06 [ForceFp32Comm]: 9.29984e-07 [remove_cast_before_assign_add]: 9.79984e-07 [full_micro_interleaved_order_control]: 2.06e-06 [reorder_send_recv_between_fp_bp]: 2.70997e-06 [comm_op_add_attrs]: 9.89996e-07 [add_comm_op_reuse_tag]: 9.00007e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.31998e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 1.80001e-06 [control_data_broadcast_order]: 1.071e-05 [grouped_pairwise_exchange_alltoall]: 1.42e-06 [offloading_packed_experts]: 3.86999e-06 [overlap_recompute_and_grad_model_parallel]: 4.41002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.15001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 2.07999e-06 [overlap_grad_ring_attention]: 3.91999e-06 [overlap_grad_flash_sp]: 1.619e-05 [begin_end_overlap_inline]: 4.59986e-07 [split_matmul_comm_elemetwise]: 2.19999e-06 [split_layernorm_comm]: 1.92001e-06 [handle_group_info]: 1.07998e-06 [symbol_engine_optimizer]: 0.00015382, [1] [Cycle 1]: 0.00014976, [6] [build]: 2.81999e-06 [elim_shapecalc]: 9.32001e-06 [elim_not_effective]: 8.786e-05 [opt_reshape]: 8.34002e-06 [fold_const_symbol]: 1.108e-05 [renormalize]: 3.39991e-07 [detach_backward]: 1.82001e-06 [pipeline_parallel_scheduler]: 1.44e-06 [auto_monad_reorder]: 1.88e-05 [get_jit_bprop_graph]: 1.15001e-06 [rewriter_after_jit_bprop_graph]: 3.63e-06 [opt_after_jit_grad]: 0.00046219 [validate]: 3.977e-05 [backend_pass]: 1.29003e-06 [task_emit]: 0.00695543 [execute]: 7.43e-06 Sums bootstrap : 0.000515s : 0.35% type_inference : 0.131178s : 88.87% event_method : 0.000836s : 0.57% auto_monad : 0.000177s : 0.12% graph_reusing : 0.000011s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000050s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000015s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000068s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000445s : 0.30% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000187s : 0.13% optimize.opt_a.loop_unroll : 0.000088s : 0.06% optimize.opt_a.a_1 : 0.001697s : 1.15% optimize.opt_a.with_stream_mark : 0.000027s : 0.02% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000157s : 0.11% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000012s : 0.01% optimize.opt_a.auto_parallel : 0.000012s : 0.01% optimize.opt_a.parallel : 0.000027s : 0.02% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000006s : 0.00% optimize.opt_a.allreduce_fusion : 0.000006s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000015s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000020s : 0.01% optimize.opt_a.a_after_grad : 0.000021s : 0.01% optimize.opt_a.renormalize : 0.002569s : 1.74% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.01% optimize.opt_a.cse : 0.000048s : 0.03% optimize.opt_a.a_3 : 0.000087s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000016s : 0.01% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000469s : 0.32% optimize.opt_b.b_1 : 0.000128s : 0.09% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000036s : 0.02% optimize.loop_unroll : 0.000432s : 0.29% optimize.opt_after_cconv.c_1 : 0.000031s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.01% optimize.tuple_transform.d_1 : 0.000044s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000042s : 0.03% optimize.cse_after_recomputation.cse : 0.000013s : 0.01% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000011s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000016s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000088s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000019s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000462s : 0.31% validate : 0.000040s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.006955s : 4.71% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.000408 55 0.44% : 0.000002s : 2: substitution.elim_not_effective 0.70% : 0.000003s : 4: substitution.float_depend_g_call 0.35% : 0.000001s : 2: substitution.fold_const_symbol 1.40% : 0.000006s : 4: substitution.graph_param_transform 86.88% : 0.000354s : 21: substitution.inline 0.82% : 0.000003s : 4: substitution.j_node_and_user_rematch 0.97% : 0.000004s : 4: substitution.partial_eliminate 1.14% : 0.000005s : 4: substitution.remove_not_recompute_node 0.74% : 0.000003s : 2: substitution.replace_old_param 3.34% : 0.000014s : 5: substitution.switch_simplify 3.21% : 0.000013s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.131076 2 95.85% : 0.125641s : 1: type_inference.infer 4.15% : 0.005435s : 1: type_inference.specialize ------[replace.] 0.000207 29 62.78% : 0.000130s : 21: replace.inline 22.73% : 0.000047s : 5: replace.switch_simplify 14.49% : 0.000030s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000366 29 93.82% : 0.000344s : 21: match.inline 3.04% : 0.000011s : 5: match.switch_simplify 3.14% : 0.000011s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000336 2299 1.23% : 0.000004s : 30: predicate.accumulaten_eliminater 0.45% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.30% : 0.000001s : 8: predicate.addn_check_dump 1.12% : 0.000004s : 30: predicate.addn_zero_filter 1.08% : 0.000004s : 30: predicate.adjust_all_reduce_mul_add 2.18% : 0.000007s : 38: predicate.arithmetic_simplify 1.18% : 0.000004s : 30: predicate.cast_eliminate 0.36% : 0.000001s : 8: predicate.check_bprop_eliminate 0.29% : 0.000001s : 8: predicate.compare_switch_simplify 0.12% : 0.000000s : 4: predicate.const_output_eliminate 0.31% : 0.000001s : 8: predicate.depend_value_elim 1.20% : 0.000004s : 30: predicate.dict_get_item_const_eliminator 1.31% : 0.000004s : 30: predicate.dict_get_item_eliminator 1.10% : 0.000004s : 30: predicate.dict_set_item_eliminator 0.55% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.18% : 0.000001s : 4: predicate.elim_not_effective 0.21% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.30% : 0.000004s : 34: predicate.environ_add_const_eliminate 1.20% : 0.000004s : 34: predicate.environ_get_add_eliminate 1.33% : 0.000004s : 34: predicate.environ_get_depend_swap 1.66% : 0.000006s : 42: predicate.environ_get_eliminate 1.24% : 0.000004s : 34: predicate.environ_get_set_eliminate 2.14% : 0.000007s : 54: predicate.exchange_switch_depend_value 3.07% : 0.000010s : 54: predicate.float_depend_g_call 0.30% : 0.000001s : 8: predicate.float_environ_get_switch 0.47% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.12% : 0.000000s : 4: predicate.fold_const_symbol 0.39% : 0.000001s : 8: predicate.get_grad_eliminate 0.12% : 0.000000s : 4: predicate.graph_param_transform 0.34% : 0.000001s : 8: predicate.incorporate_call 0.28% : 0.000001s : 8: predicate.incorporate_call_switch 6.17% : 0.000021s : 108: predicate.inline 0.40% : 0.000001s : 8: predicate.inline_without_move 0.19% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.46% : 0.000002s : 8: predicate.less_batch_normalization 1.68% : 0.000006s : 41: predicate.list_to_tuple_eliminator_ 2.78% : 0.000009s : 71: predicate.load_eliminater 0.57% : 0.000002s : 4: predicate.loop_unroll_after_grad 4.02% : 0.000014s : 97: predicate.loop_unroll_before_grad 1.49% : 0.000005s : 38: predicate.make_slice_get_slice_eliminator 0.35% : 0.000001s : 8: predicate.merge_addn 0.35% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.31% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.15% : 0.000004s : 30: predicate.minmaximum_grad 0.63% : 0.000002s : 4: predicate.mutable_eliminate 0.21% : 0.000001s : 4: predicate.opt_reshape 0.24% : 0.000001s : 4: predicate.parallel_virtual_node 2.97% : 0.000010s : 54: predicate.partial_defer_inline 1.57% : 0.000005s : 37: predicate.partial_eliminate 1.12% : 0.000004s : 30: predicate.print_const_string_wrapper 0.36% : 0.000001s : 8: predicate.reduce_all_const_elim 1.54% : 0.000005s : 30: predicate.reduce_eliminate 2.55% : 0.000009s : 71: predicate.redundant_stop_gradient_eliminater 0.25% : 0.000001s : 8: predicate.remove_not_recompute_node 1.32% : 0.000004s : 41: predicate.replace_applicator 0.24% : 0.000001s : 8: predicate.replace_old_param 0.15% : 0.000001s : 4: predicate.reset_defer_inline 1.13% : 0.000004s : 30: predicate.reshape_eliminate 0.46% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.22% : 0.000001s : 4: predicate.row_tensor_eliminate 0.43% : 0.000001s : 8: predicate.same_eliminate 0.27% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.42% : 0.000001s : 8: predicate.shard_identity_eliminate 0.40% : 0.000001s : 8: predicate.special_op_eliminate 0.38% : 0.000001s : 8: predicate.specialize_transform 0.47% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.43% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.17% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.51% : 0.000008s : 54: predicate.switch_defer_inline 2.73% : 0.000009s : 62: predicate.switch_layer_defer_inline 7.78% : 0.000026s : 173: predicate.switch_simplify 1.28% : 0.000004s : 30: predicate.tile_eliminate 1.20% : 0.000004s : 30: predicate.transpose_eliminate 1.56% : 0.000005s : 38: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000005s : 38: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000005s : 38: predicate.tuple_list_get_item_depend_reorder 2.66% : 0.000009s : 49: predicate.tuple_list_get_item_eliminator 1.67% : 0.000006s : 38: predicate.tuple_list_get_set_item_eliminator 2.04% : 0.000007s : 46: predicate.tuple_list_set_item_eliminator 1.62% : 0.000005s : 41: predicate.tuple_to_list_eliminator_ 2.53% : 0.000008s : 71: predicate.updatestate_pure_node_eliminater 2.96% : 0.000010s : 79: predicate.updatestate_useless_node_eliminater 0.20% : 0.000001s : 4: predicate.value_based_eliminate 0.43% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.39% : 0.000001s : 8: predicate.virtual_output_eliminate 0.14% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.30% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004235 40 64.59% : 0.002735s : 17: func_graph_cloner_run.FuncGraphClonerGraph 35.41% : 0.001500s : 23: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.168970 196 0.00% : 0.000004s : 1: ForceFp32Comm 2.13% : 0.003602s : 1: add_attr 2.13% : 0.003593s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.03% : 0.000046s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.11% : 0.000187s : 1: auto_monad 0.01% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.32% : 0.000544s : 1: bootstrap 0.02% : 0.000040s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000014s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.01% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000012s : 1: environ_conv 0.50% : 0.000848s : 1: event_method 0.01% : 0.000012s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000015s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.26% : 0.000440s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.28% : 0.000477s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 1.37% : 0.002318s : 78: opt.transform.opt_a 0.02% : 0.000030s : 1: opt.transform.opt_after_cconv 0.02% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000109s : 28: opt.transform.opt_b 0.03% : 0.000048s : 2: opt.transform.opt_trans_graph 0.07% : 0.000113s : 4: opt.transform.symbol_engine_opt 3.38% : 0.005703s : 1: opt_a 0.06% : 0.000104s : 1: opt_after_cconv 0.28% : 0.000471s : 1: opt_after_jit_grad 0.12% : 0.000210s : 1: opt_b 4.77% : 0.008062s : 1: optimize 0.01% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000019s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.04% : 0.000073s : 1: pre_auto_parallel 0.01% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000003s : 1: remove_cast_before_assign_add 0.01% : 0.000019s : 1: remove_dup_value 0.74% : 0.001246s : 1: renormalize.infer 0.78% : 0.001315s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000019s : 1: rewriter_after_opt_a 0.27% : 0.000452s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000157s : 1: symbol_engine_optimizer 4.12% : 0.006966s : 1: task_emit 0.05% : 0.000078s : 1: tuple_transform 77.65% : 0.131198s : 1: type_inference 0.04% : 0.000068s : 1: validate TotalTime = 0.0439823, [24] [bootstrap]: 0.00058738 [type_inference]: 0.0265476 [event_method]: 0.00011526 [auto_monad]: 0.00010609 [graph_reusing]: 6.21998e-06 [inline]: 1.81998e-06 [add_attr]: 0.00369543, [1] [add_attr_with_inline]: 0.00368526, [1] [Cycle 1]: 6.035e-05, [2] [tag_attr]: 2.554e-05 [meta_addattr_fg_expand]: 7.15e-06 [parallel-infer-symbol]: 3.16999e-06 [pre_auto_parallel]: 3.787e-05 [insert-virtual-dataset]: 2.68e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.69001e-06 [pipeline_split]: 1.82001e-06 [optimize]: 0.00489671, [53] [py_interpret_to_execute]: 4.63001e-06 [rewriter_before_opt_a]: 0.00026592 [opt_a]: 0.00281204, [2] [Cycle 1]: 0.00222376, [45] [expand_dump_flag]: 3.6e-06 [switch_simplify]: 5.18e-05 [loop_unroll]: 3.839e-05 [a_1]: 0.00064383 [with_stream_mark]: 1.397e-05 [recompute_prepare]: 7.46999e-06 [updatestate_depend_eliminate]: 4.07e-06 [updatestate_assign_eliminate]: 3.64002e-06 [updatestate_loads_eliminate]: 3.3e-06 [parameter_eliminate]: 2.14e-06 [a_2]: 7.541e-05 [accelerated_algorithm]: 6.33e-06 [shard]: 1.62001e-06 [meta_shard_fg_expand]: 2.25002e-06 [shard_inline]: 6.14999e-06 [merge_send_recv]: 7.81001e-06 [auto_parallel]: 5.67999e-06 [parallel]: 2.551e-05 [flash_sp]: 7.16001e-06 [merge_comm]: 3.68e-06 [allreduce_fusion]: 3.65e-06 [matmul_add_comm_reduction]: 8.70001e-06 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 7.44002e-06 [virtual_dataset]: 6.23e-06 [get_grad_eliminate_]: 5.61e-06 [virtual_output]: 5.99e-06 [merge_forward]: 3.9e-06 [cell_reuse_recompute_pass]: 1.12999e-06 [offload_activation]: 9.79e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.184e-05 [merge_recompute_call_nodes]: 1.77001e-06 [before_grad]: 9.36e-06 [set_forward_comm_id_for_comm_node_pass]: 3.48e-06 [meta_fg_expand]: 2.99001e-06 [flash_sp_send_recv_attached]: 2.62001e-06 [receive_attached]: 2.23002e-06 [after_resolve]: 9.08002e-06 [a_after_grad]: 8.39998e-06 [renormalize]: 0.00086494 [add_forward_monad_depend]: 5.64e-06 [auto_monad_grad]: 1.74e-06 [auto_monad_eliminator]: 1.467e-05 [cse]: 3.451e-05 [a_3]: 4.297e-05 [Cycle 2]: 0.00057877, [45] [expand_dump_flag]: 1.04998e-06 [switch_simplify]: 7.16001e-06 [loop_unroll]: 5.54998e-06 [a_1]: 0.00010784 [with_stream_mark]: 1.064e-05 [recompute_prepare]: 6.07999e-06 [updatestate_depend_eliminate]: 3.11001e-06 [updatestate_assign_eliminate]: 2.52001e-06 [updatestate_loads_eliminate]: 2.74999e-06 [parameter_eliminate]: 9.00007e-07 [a_2]: 6.696e-05 [accelerated_algorithm]: 5.69e-06 [shard]: 1.12e-06 [meta_shard_fg_expand]: 1.38002e-06 [shard_inline]: 5.52001e-06 [merge_send_recv]: 4.57e-06 [auto_parallel]: 5.25999e-06 [parallel]: 4.37e-06 [flash_sp]: 2.98e-06 [merge_comm]: 3.18998e-06 [allreduce_fusion]: 2.91e-06 [matmul_add_comm_reduction]: 5.14e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 6.18002e-06 [virtual_dataset]: 5.66e-06 [get_grad_eliminate_]: 5.14998e-06 [virtual_output]: 5.39998e-06 [merge_forward]: 2.78e-06 [cell_reuse_recompute_pass]: 1.27999e-06 [offload_activation]: 6.29001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.176e-05 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 8.29998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.2e-06 [meta_fg_expand]: 1.92999e-06 [flash_sp_send_recv_attached]: 7.80012e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 8.05e-06 [a_after_grad]: 7.58001e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 9.80013e-07 [auto_monad_grad]: 8.80013e-07 [auto_monad_eliminator]: 6.19999e-06 [cse]: 1.624e-05 [a_3]: 3.368e-05 [py_interpret_to_execute_after_opt_a]: 3.78001e-06 [slice_cell_reuse_recomputed_activation]: 2.01e-06 [rewriter_after_opt_a]: 1.761e-05 [convert_after_rewriter]: 1.24e-06 [order_py_execute_after_rewriter]: 1.826e-05 [mutable_eliminate]: 0.00044957 [opt_b]: 0.00018918, [1] [Cycle 1]: 0.00018335, [7] [b_1]: 0.00011302 [b_2]: 7.28e-06 [updatestate_depend_eliminate]: 5.24e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 2.37001e-06 [renormalize]: 4.39992e-07 [cse]: 1.96e-05 [optimize_parallel_all_gather_comm]: 1.604e-05 [overlap_param_gather]: 2.26998e-06 [cconv]: 2.222e-05 [loop_unroll]: 0.00041135 [opt_after_cconv]: 9.729e-05, [1] [Cycle 1]: 9.142e-05, [7] [c_1]: 2.504e-05 [parameter_eliminate]: 2.31998e-06 [updatestate_depend_eliminate]: 5.35999e-06 [updatestate_assign_eliminate]: 2.73e-06 [updatestate_loads_eliminate]: 2.54001e-06 [cse]: 2.049e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.672e-05 [tuple_transform]: 6.494e-05, [1] [Cycle 1]: 6.069e-05, [4] [d_1]: 3.528e-05 [none_parameter_eliminate]: 1.70001e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 6.25002e-06 [partial_unused_args_eliminate]: 1.64998e-06 [add_recomputation]: 5.059e-05 [cse_after_recomputation]: 2.245e-05, [1] [Cycle 1]: 1.842e-05, [1] [cse]: 1.312e-05 [environ_conv]: 5.47001e-06 [swap_dp_allreduce_reducescatter]: 5.31998e-06 [bias_add_comm_swap]: 2.46998e-06 [label_micro_interleaved_index]: 4.30999e-06 [label_fine_grained_interleaved_index]: 2.98998e-06 [merge_cast_opt]: 1.67999e-06 [slice_recompute_activation]: 1.99e-06 [micro_interleaved_order_control]: 2.16e-06 [assign_add_opt]: 1.15001e-06 [ForceFp32Comm]: 7.2e-07 [remove_cast_before_assign_add]: 1.12e-06 [full_micro_interleaved_order_control]: 2.27001e-06 [reorder_send_recv_between_fp_bp]: 2.88998e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.35999e-06 [interleave_parallel_branches]: 1.15999e-06 [overlap_opt_shard_in_pipeline]: 2.09e-06 [overlap_opt_shard_grad_in_pipeline]: 2.32001e-06 [control_data_broadcast_order]: 1.237e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 3.4e-06 [overlap_recompute_and_grad_model_parallel]: 4.57998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.53002e-06 [overlap_recompute_comm]: 2.02999e-06 [overlap_grad_ring_attention]: 4.25999e-06 [overlap_grad_flash_sp]: 1.777e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.39001e-06 [split_layernorm_comm]: 1.77999e-06 [handle_group_info]: 1.40999e-06 [symbol_engine_optimizer]: 7.479e-05, [1] [Cycle 1]: 7.101e-05, [6] [build]: 3.03998e-06 [elim_shapecalc]: 8.45001e-06 [elim_not_effective]: 1.176e-05 [opt_reshape]: 6.46999e-06 [fold_const_symbol]: 9.07999e-06 [renormalize]: 3.00002e-07 [detach_backward]: 2.04e-06 [pipeline_parallel_scheduler]: 1.51002e-06 [auto_monad_reorder]: 1.878e-05 [get_jit_bprop_graph]: 1.15999e-06 [rewriter_after_jit_bprop_graph]: 3.34001e-06 [opt_after_jit_grad]: 0.00044489 [validate]: 3.918e-05 [backend_pass]: 1.29e-06 [task_emit]: 0.00722411 [execute]: 7.79002e-06 Sums bootstrap : 0.000587s : 1.50% type_inference : 0.026548s : 67.63% event_method : 0.000115s : 0.29% auto_monad : 0.000106s : 0.27% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000038s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000003s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000266s : 0.68% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000059s : 0.15% optimize.opt_a.loop_unroll : 0.000044s : 0.11% optimize.opt_a.a_1 : 0.000752s : 1.91% optimize.opt_a.with_stream_mark : 0.000025s : 0.06% optimize.opt_a.recompute_prepare : 0.000014s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000142s : 0.36% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000012s : 0.03% optimize.opt_a.merge_send_recv : 0.000012s : 0.03% optimize.opt_a.auto_parallel : 0.000011s : 0.03% optimize.opt_a.parallel : 0.000030s : 0.08% optimize.opt_a.flash_sp : 0.000010s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000007s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.03% optimize.opt_a.virtual_dataset : 0.000012s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.03% optimize.opt_a.virtual_output : 0.000011s : 0.03% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000018s : 0.04% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000017s : 0.04% optimize.opt_a.a_after_grad : 0.000016s : 0.04% optimize.opt_a.renormalize : 0.000865s : 2.20% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.05% optimize.opt_a.cse : 0.000051s : 0.13% optimize.opt_a.a_3 : 0.000077s : 0.20% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000018s : 0.04% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000018s : 0.05% optimize.mutable_eliminate : 0.000450s : 1.15% optimize.opt_b.b_1 : 0.000113s : 0.29% optimize.opt_b.b_2 : 0.000007s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000022s : 0.06% optimize.loop_unroll : 0.000411s : 1.05% optimize.opt_after_cconv.c_1 : 0.000025s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.04% optimize.tuple_transform.d_1 : 0.000035s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000051s : 0.13% optimize.cse_after_recomputation.cse : 0.000013s : 0.03% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000012s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000018s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000008s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000445s : 1.13% validate : 0.000039s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.007224s : 18.40% execute : 0.000008s : 0.02% Time group info: ------[substitution.] 0.000174 26 1.03% : 0.000002s : 2: substitution.elim_not_effective 0.75% : 0.000001s : 2: substitution.fold_const_symbol 3.12% : 0.000005s : 3: substitution.graph_param_transform 82.17% : 0.000143s : 6: substitution.inline 1.68% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.56% : 0.000004s : 4: substitution.remove_not_recompute_node 1.51% : 0.000003s : 2: substitution.replace_old_param 7.19% : 0.000013s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.026484 2 93.76% : 0.024831s : 1: type_inference.infer 6.24% : 0.001653s : 1: type_inference.specialize ------[replace.] 0.000076 9 70.53% : 0.000054s : 6: replace.inline 29.47% : 0.000023s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000150 9 92.72% : 0.000139s : 6: match.inline 7.28% : 0.000011s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000182 1162 0.98% : 0.000002s : 13: predicate.accumulaten_eliminater 0.82% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 6: predicate.addn_check_dump 1.01% : 0.000002s : 13: predicate.addn_zero_filter 0.88% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.03% : 0.000004s : 19: predicate.arithmetic_simplify 1.02% : 0.000002s : 13: predicate.cast_eliminate 0.57% : 0.000001s : 6: predicate.check_bprop_eliminate 0.48% : 0.000001s : 6: predicate.compare_switch_simplify 0.17% : 0.000000s : 3: predicate.const_output_eliminate 0.47% : 0.000001s : 6: predicate.depend_value_elim 1.05% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.18% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.86% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.18% : 0.000000s : 3: predicate.elim_not_effective 0.29% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000002s : 16: predicate.environ_add_const_eliminate 1.17% : 0.000002s : 16: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 16: predicate.environ_get_depend_swap 1.67% : 0.000003s : 22: predicate.environ_get_eliminate 1.18% : 0.000002s : 16: predicate.environ_get_set_eliminate 1.63% : 0.000003s : 22: predicate.exchange_switch_depend_value 2.58% : 0.000005s : 22: predicate.float_depend_g_call 0.44% : 0.000001s : 6: predicate.float_environ_get_switch 0.66% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 3: predicate.fold_const_symbol 0.57% : 0.000001s : 6: predicate.get_grad_eliminate 0.17% : 0.000000s : 3: predicate.graph_param_transform 0.54% : 0.000001s : 6: predicate.incorporate_call 0.47% : 0.000001s : 6: predicate.incorporate_call_switch 6.17% : 0.000011s : 53: predicate.inline 0.62% : 0.000001s : 6: predicate.inline_without_move 0.26% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.71% : 0.000001s : 6: predicate.less_batch_normalization 1.81% : 0.000003s : 22: predicate.list_to_tuple_eliminator_ 2.48% : 0.000004s : 35: predicate.load_eliminater 0.95% : 0.000002s : 3: predicate.loop_unroll_after_grad 3.06% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.60% : 0.000003s : 19: predicate.make_slice_get_slice_eliminator 0.48% : 0.000001s : 6: predicate.merge_addn 0.48% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.52% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.91% : 0.000002s : 13: predicate.minmaximum_grad 1.11% : 0.000002s : 3: predicate.mutable_eliminate 0.29% : 0.000001s : 3: predicate.opt_reshape 0.34% : 0.000001s : 3: predicate.parallel_virtual_node 2.18% : 0.000004s : 22: predicate.partial_defer_inline 1.55% : 0.000003s : 19: predicate.partial_eliminate 1.02% : 0.000002s : 13: predicate.print_const_string_wrapper 0.45% : 0.000001s : 6: predicate.reduce_all_const_elim 1.34% : 0.000002s : 13: predicate.reduce_eliminate 2.58% : 0.000005s : 35: predicate.redundant_stop_gradient_eliminater 0.34% : 0.000001s : 6: predicate.remove_not_recompute_node 1.31% : 0.000002s : 22: predicate.replace_applicator 0.50% : 0.000001s : 6: predicate.replace_old_param 0.23% : 0.000000s : 3: predicate.reset_defer_inline 1.15% : 0.000002s : 13: predicate.reshape_eliminate 0.54% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 3: predicate.row_tensor_eliminate 0.67% : 0.000001s : 6: predicate.same_eliminate 0.35% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.66% : 0.000001s : 6: predicate.shard_identity_eliminate 0.79% : 0.000001s : 6: predicate.special_op_eliminate 0.58% : 0.000001s : 6: predicate.specialize_transform 0.72% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.58% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.82% : 0.000003s : 22: predicate.switch_defer_inline 2.26% : 0.000004s : 28: predicate.switch_layer_defer_inline 6.23% : 0.000011s : 73: predicate.switch_simplify 0.98% : 0.000002s : 13: predicate.tile_eliminate 0.98% : 0.000002s : 13: predicate.transpose_eliminate 1.63% : 0.000003s : 19: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000003s : 19: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 19: predicate.tuple_list_get_item_depend_reorder 3.04% : 0.000006s : 28: predicate.tuple_list_get_item_eliminator 1.55% : 0.000003s : 19: predicate.tuple_list_get_set_item_eliminator 2.06% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.81% : 0.000003s : 22: predicate.tuple_to_list_eliminator_ 2.57% : 0.000005s : 35: predicate.updatestate_pure_node_eliminater 3.12% : 0.000006s : 41: predicate.updatestate_useless_node_eliminater 0.28% : 0.000000s : 3: predicate.value_based_eliminate 0.65% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.55% : 0.000001s : 6: predicate.virtual_output_eliminate 0.21% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001063 16 55.86% : 0.000594s : 8: func_graph_cloner_run.FuncGraphClonerGraph 44.14% : 0.000469s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.054745 196 0.01% : 0.000003s : 1: ForceFp32Comm 6.76% : 0.003700s : 1: add_attr 6.74% : 0.003689s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000055s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.21% : 0.000114s : 1: auto_monad 0.06% : 0.000032s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.14% : 0.000621s : 1: bootstrap 0.05% : 0.000026s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000015s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.05% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000006s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000008s : 1: environ_conv 0.23% : 0.000125s : 1: event_method 0.02% : 0.000013s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.04% : 0.000020s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.77% : 0.000419s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.84% : 0.000458s : 1: mutable_eliminate 0.01% : 0.000006s : 1: offloading_packed_experts 0.02% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000015s : 1: opt.transform.mutable_eliminate 2.11% : 0.001155s : 78: opt.transform.opt_a 0.04% : 0.000024s : 1: opt.transform.opt_after_cconv 0.04% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.17% : 0.000094s : 28: opt.transform.opt_b 0.07% : 0.000039s : 2: opt.transform.opt_trans_graph 0.06% : 0.000032s : 4: opt.transform.symbol_engine_opt 5.14% : 0.002815s : 1: opt_a 0.18% : 0.000101s : 1: opt_after_cconv 0.83% : 0.000453s : 1: opt_after_jit_grad 0.35% : 0.000192s : 1: opt_b 8.95% : 0.004901s : 1: optimize 0.04% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.04% : 0.000021s : 1: order_py_execute_after_rewriter 0.04% : 0.000021s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.04% : 0.000020s : 1: overlap_grad_ring_attention 0.04% : 0.000021s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000042s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.03% : 0.000018s : 1: remove_cast_before_assign_add 0.04% : 0.000020s : 1: remove_dup_value 0.76% : 0.000417s : 1: renormalize.infer 0.81% : 0.000441s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000021s : 1: rewriter_after_opt_a 0.50% : 0.000271s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.14% : 0.000077s : 1: symbol_engine_optimizer 13.22% : 0.007236s : 1: task_emit 0.12% : 0.000068s : 1: tuple_transform 48.52% : 0.026563s : 1: type_inference 0.13% : 0.000068s : 1: validate TotalTime = 0.0408404, [24] [bootstrap]: 0.00049399 [type_inference]: 0.0246045 [event_method]: 0.00010453 [auto_monad]: 9.138e-05 [graph_reusing]: 6.89999e-06 [inline]: 1.94e-06 [add_attr]: 0.00308775, [1] [add_attr_with_inline]: 0.00308041, [1] [Cycle 1]: 5.73e-05, [2] [tag_attr]: 2.519e-05 [meta_addattr_fg_expand]: 7.1e-06 [parallel-infer-symbol]: 2.86999e-06 [pre_auto_parallel]: 3.59e-05 [insert-virtual-dataset]: 2.44001e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.21e-06 [pipeline_split]: 1.99999e-06 [optimize]: 0.00480848, [53] [py_interpret_to_execute]: 4.57998e-06 [rewriter_before_opt_a]: 0.00024165 [opt_a]: 0.00275842, [2] [Cycle 1]: 0.0021886, [45] [expand_dump_flag]: 3.86999e-06 [switch_simplify]: 5.179e-05 [loop_unroll]: 3.801e-05 [a_1]: 0.00063877 [with_stream_mark]: 1.384e-05 [recompute_prepare]: 7.85e-06 [updatestate_depend_eliminate]: 4.53001e-06 [updatestate_assign_eliminate]: 3.77002e-06 [updatestate_loads_eliminate]: 3.13e-06 [parameter_eliminate]: 1.94999e-06 [a_2]: 7.61e-05 [accelerated_algorithm]: 6.58e-06 [shard]: 1.69e-06 [meta_shard_fg_expand]: 1.97001e-06 [shard_inline]: 5.96e-06 [merge_send_recv]: 8.48001e-06 [auto_parallel]: 5.76e-06 [parallel]: 1.792e-05 [flash_sp]: 7.78999e-06 [merge_comm]: 3.56001e-06 [allreduce_fusion]: 3.51999e-06 [matmul_add_comm_reduction]: 8.48001e-06 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 7.14001e-06 [virtual_dataset]: 6.05002e-06 [get_grad_eliminate_]: 6.09999e-06 [virtual_output]: 5.97999e-06 [merge_forward]: 3.88001e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 9.44998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.181e-05 [merge_recompute_call_nodes]: 1.74998e-06 [before_grad]: 9.51e-06 [set_forward_comm_id_for_comm_node_pass]: 3.91999e-06 [meta_fg_expand]: 3.19001e-06 [flash_sp_send_recv_attached]: 3.11001e-06 [receive_attached]: 2.64001e-06 [after_resolve]: 9.31e-06 [a_after_grad]: 8.37e-06 [renormalize]: 0.00084887 [add_forward_monad_depend]: 5.28002e-06 [auto_monad_grad]: 1.50999e-06 [auto_monad_eliminator]: 1.527e-05 [cse]: 3.341e-05 [a_3]: 4.432e-05 [Cycle 2]: 0.00056079, [45] [expand_dump_flag]: 1.17e-06 [switch_simplify]: 6.79001e-06 [loop_unroll]: 5.66e-06 [a_1]: 9.669e-05 [with_stream_mark]: 1.076e-05 [recompute_prepare]: 5.89e-06 [updatestate_depend_eliminate]: 3.02002e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.75997e-06 [parameter_eliminate]: 9.60019e-07 [a_2]: 6.625e-05 [accelerated_algorithm]: 5.48002e-06 [shard]: 1.05001e-06 [meta_shard_fg_expand]: 1.20001e-06 [shard_inline]: 5.27999e-06 [merge_send_recv]: 4.33999e-06 [auto_parallel]: 5.27999e-06 [parallel]: 4.07e-06 [flash_sp]: 3.3e-06 [merge_comm]: 3.23e-06 [allreduce_fusion]: 2.86e-06 [matmul_add_comm_reduction]: 5.04e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 6.27001e-06 [virtual_dataset]: 5.59e-06 [get_grad_eliminate_]: 5.33002e-06 [virtual_output]: 5.22e-06 [merge_forward]: 2.76e-06 [cell_reuse_recompute_pass]: 1.32999e-06 [offload_activation]: 6.01998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.21e-05 [merge_recompute_call_nodes]: 6.90023e-07 [before_grad]: 8.10999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.21999e-06 [meta_fg_expand]: 1.81998e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 1.12e-06 [after_resolve]: 8.94998e-06 [a_after_grad]: 7.45e-06 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.07e-06 [auto_monad_grad]: 7.49977e-07 [auto_monad_eliminator]: 5.99999e-06 [cse]: 1.532e-05 [a_3]: 3.257e-05 [py_interpret_to_execute_after_opt_a]: 4.28999e-06 [slice_cell_reuse_recomputed_activation]: 2.29001e-06 [rewriter_after_opt_a]: 1.784e-05 [convert_after_rewriter]: 1.13001e-06 [order_py_execute_after_rewriter]: 1.55999e-06 [mutable_eliminate]: 0.00044981 [opt_b]: 0.00023209, [1] [Cycle 1]: 0.0002264, [7] [b_1]: 0.00013739 [b_2]: 7.78999e-06 [updatestate_depend_eliminate]: 5.52001e-06 [updatestate_assign_eliminate]: 2.73e-06 [updatestate_loads_eliminate]: 2.56e-06 [renormalize]: 4.19997e-07 [cse]: 2.014e-05 [optimize_parallel_all_gather_comm]: 1.616e-05 [overlap_param_gather]: 2.02999e-06 [cconv]: 2.242e-05 [loop_unroll]: 0.00042279 [opt_after_cconv]: 9.576e-05, [1] [Cycle 1]: 9.016e-05, [7] [c_1]: 2.476e-05 [parameter_eliminate]: 2.40002e-06 [updatestate_depend_eliminate]: 5.32999e-06 [updatestate_assign_eliminate]: 2.72001e-06 [updatestate_loads_eliminate]: 2.36998e-06 [cse]: 2.008e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.692e-05 [tuple_transform]: 6.424e-05, [1] [Cycle 1]: 6.013e-05, [4] [d_1]: 3.46e-05 [none_parameter_eliminate]: 1.52999e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.36e-06 [partial_unused_args_eliminate]: 2.29999e-06 [add_recomputation]: 4.502e-05 [cse_after_recomputation]: 2.26e-05, [1] [Cycle 1]: 1.833e-05, [1] [cse]: 1.327e-05 [environ_conv]: 4.94e-06 [swap_dp_allreduce_reducescatter]: 4.84e-06 [bias_add_comm_swap]: 2.75002e-06 [label_micro_interleaved_index]: 4.12003e-06 [label_fine_grained_interleaved_index]: 2.75002e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.09e-06 [micro_interleaved_order_control]: 2.24001e-06 [assign_add_opt]: 1.38002e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.07998e-06 [full_micro_interleaved_order_control]: 2.39999e-06 [reorder_send_recv_between_fp_bp]: 2.71999e-06 [comm_op_add_attrs]: 1.02998e-06 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.06002e-06 [overlap_opt_shard_in_pipeline]: 1.20999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.84e-06 [control_data_broadcast_order]: 1.261e-05 [grouped_pairwise_exchange_alltoall]: 1.71e-06 [offloading_packed_experts]: 3.51999e-06 [overlap_recompute_and_grad_model_parallel]: 4.63001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.23002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.03997e-06 [overlap_grad_ring_attention]: 4.50001e-06 [overlap_grad_flash_sp]: 1.8e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.12001e-06 [split_layernorm_comm]: 1.67001e-06 [handle_group_info]: 9.80013e-07 [symbol_engine_optimizer]: 7.276e-05, [1] [Cycle 1]: 6.887e-05, [6] [build]: 2.26e-06 [elim_shapecalc]: 9.39998e-06 [elim_not_effective]: 1.204e-05 [opt_reshape]: 6.63e-06 [fold_const_symbol]: 9.22999e-06 [renormalize]: 1.59984e-07 [detach_backward]: 1.66e-06 [pipeline_parallel_scheduler]: 1.69998e-06 [auto_monad_reorder]: 1.781e-05 [get_jit_bprop_graph]: 1.07e-06 [rewriter_after_jit_bprop_graph]: 3.30003e-06 [opt_after_jit_grad]: 0.00045092 [validate]: 3.54e-05 [backend_pass]: 9.50007e-07 [task_emit]: 0.0068779 [execute]: 6.86999e-06 Sums bootstrap : 0.000494s : 1.34% type_inference : 0.024605s : 66.88% event_method : 0.000105s : 0.28% auto_monad : 0.000091s : 0.25% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000036s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000242s : 0.66% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000059s : 0.16% optimize.opt_a.loop_unroll : 0.000044s : 0.12% optimize.opt_a.a_1 : 0.000735s : 2.00% optimize.opt_a.with_stream_mark : 0.000025s : 0.07% optimize.opt_a.recompute_prepare : 0.000014s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000142s : 0.39% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.03% optimize.opt_a.merge_send_recv : 0.000013s : 0.03% optimize.opt_a.auto_parallel : 0.000011s : 0.03% optimize.opt_a.parallel : 0.000022s : 0.06% optimize.opt_a.flash_sp : 0.000011s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.04% optimize.opt_a.virtual_dataset : 0.000012s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.03% optimize.opt_a.virtual_output : 0.000011s : 0.03% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000015s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000018s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000004s : 0.01% optimize.opt_a.after_resolve : 0.000018s : 0.05% optimize.opt_a.a_after_grad : 0.000016s : 0.04% optimize.opt_a.renormalize : 0.000849s : 2.31% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.02% optimize.opt_a.auto_monad_grad : 0.000002s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.06% optimize.opt_a.cse : 0.000049s : 0.13% optimize.opt_a.a_3 : 0.000077s : 0.21% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000018s : 0.05% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000450s : 1.22% optimize.opt_b.b_1 : 0.000137s : 0.37% optimize.opt_b.b_2 : 0.000008s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000022s : 0.06% optimize.loop_unroll : 0.000423s : 1.15% optimize.opt_after_cconv.c_1 : 0.000025s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.05% optimize.tuple_transform.d_1 : 0.000035s : 0.09% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000045s : 0.12% optimize.cse_after_recomputation.cse : 0.000013s : 0.04% optimize.environ_conv : 0.000005s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000018s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000451s : 1.23% validate : 0.000035s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.006878s : 18.69% execute : 0.000007s : 0.02% Time group info: ------[substitution.] 0.000171 26 1.18% : 0.000002s : 2: substitution.elim_not_effective 0.75% : 0.000001s : 2: substitution.fold_const_symbol 2.78% : 0.000005s : 3: substitution.graph_param_transform 81.31% : 0.000139s : 6: substitution.inline 1.75% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.65% : 0.000005s : 4: substitution.remove_not_recompute_node 1.94% : 0.000003s : 2: substitution.replace_old_param 7.64% : 0.000013s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.024544 2 93.50% : 0.022950s : 1: type_inference.infer 6.50% : 0.001594s : 1: type_inference.specialize ------[replace.] 0.000074 9 70.13% : 0.000052s : 6: replace.inline 29.87% : 0.000022s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000147 9 92.18% : 0.000135s : 6: match.inline 7.82% : 0.000012s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000184 1162 0.99% : 0.000002s : 13: predicate.accumulaten_eliminater 0.90% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 6: predicate.addn_check_dump 0.97% : 0.000002s : 13: predicate.addn_zero_filter 0.91% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.20% : 0.000004s : 19: predicate.arithmetic_simplify 1.02% : 0.000002s : 13: predicate.cast_eliminate 0.52% : 0.000001s : 6: predicate.check_bprop_eliminate 0.48% : 0.000001s : 6: predicate.compare_switch_simplify 0.15% : 0.000000s : 3: predicate.const_output_eliminate 0.55% : 0.000001s : 6: predicate.depend_value_elim 1.00% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.09% : 0.000002s : 13: predicate.dict_get_item_eliminator 1.00% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.81% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 3: predicate.elim_not_effective 0.37% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.37% : 0.000003s : 16: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 16: predicate.environ_get_add_eliminate 1.16% : 0.000002s : 16: predicate.environ_get_depend_swap 1.70% : 0.000003s : 22: predicate.environ_get_eliminate 1.16% : 0.000002s : 16: predicate.environ_get_set_eliminate 1.67% : 0.000003s : 22: predicate.exchange_switch_depend_value 2.51% : 0.000005s : 22: predicate.float_depend_g_call 0.47% : 0.000001s : 6: predicate.float_environ_get_switch 0.65% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.15% : 0.000000s : 3: predicate.fold_const_symbol 0.66% : 0.000001s : 6: predicate.get_grad_eliminate 0.17% : 0.000000s : 3: predicate.graph_param_transform 0.50% : 0.000001s : 6: predicate.incorporate_call 0.42% : 0.000001s : 6: predicate.incorporate_call_switch 6.00% : 0.000011s : 53: predicate.inline 0.63% : 0.000001s : 6: predicate.inline_without_move 0.27% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.67% : 0.000001s : 6: predicate.less_batch_normalization 1.72% : 0.000003s : 22: predicate.list_to_tuple_eliminator_ 2.53% : 0.000005s : 35: predicate.load_eliminater 0.91% : 0.000002s : 3: predicate.loop_unroll_after_grad 3.04% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.63% : 0.000003s : 19: predicate.make_slice_get_slice_eliminator 0.49% : 0.000001s : 6: predicate.merge_addn 0.49% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.49% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.96% : 0.000002s : 13: predicate.minmaximum_grad 0.83% : 0.000002s : 3: predicate.mutable_eliminate 0.35% : 0.000001s : 3: predicate.opt_reshape 0.30% : 0.000001s : 3: predicate.parallel_virtual_node 2.23% : 0.000004s : 22: predicate.partial_defer_inline 1.53% : 0.000003s : 19: predicate.partial_eliminate 1.01% : 0.000002s : 13: predicate.print_const_string_wrapper 0.45% : 0.000001s : 6: predicate.reduce_all_const_elim 1.37% : 0.000003s : 13: predicate.reduce_eliminate 2.49% : 0.000005s : 35: predicate.redundant_stop_gradient_eliminater 0.30% : 0.000001s : 6: predicate.remove_not_recompute_node 1.35% : 0.000002s : 22: predicate.replace_applicator 0.52% : 0.000001s : 6: predicate.replace_old_param 0.21% : 0.000000s : 3: predicate.reset_defer_inline 1.08% : 0.000002s : 13: predicate.reshape_eliminate 0.53% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 3: predicate.row_tensor_eliminate 0.62% : 0.000001s : 6: predicate.same_eliminate 0.35% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.67% : 0.000001s : 6: predicate.shard_identity_eliminate 0.79% : 0.000001s : 6: predicate.special_op_eliminate 0.56% : 0.000001s : 6: predicate.specialize_transform 0.75% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.63% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.83% : 0.000003s : 22: predicate.switch_defer_inline 2.30% : 0.000004s : 28: predicate.switch_layer_defer_inline 6.01% : 0.000011s : 73: predicate.switch_simplify 1.05% : 0.000002s : 13: predicate.tile_eliminate 1.01% : 0.000002s : 13: predicate.transpose_eliminate 1.65% : 0.000003s : 19: predicate.tuple_list_convert_item_index_to_positive 1.64% : 0.000003s : 19: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000003s : 19: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000006s : 28: predicate.tuple_list_get_item_eliminator 1.48% : 0.000003s : 19: predicate.tuple_list_get_set_item_eliminator 2.28% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.67% : 0.000003s : 22: predicate.tuple_to_list_eliminator_ 2.55% : 0.000005s : 35: predicate.updatestate_pure_node_eliminater 3.05% : 0.000006s : 41: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 3: predicate.value_based_eliminate 0.55% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 6: predicate.virtual_output_eliminate 0.22% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000982 16 54.11% : 0.000532s : 8: func_graph_cloner_run.FuncGraphClonerGraph 45.89% : 0.000451s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.050880 196 0.01% : 0.000004s : 1: ForceFp32Comm 6.08% : 0.003092s : 1: add_attr 6.06% : 0.003084s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000049s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.20% : 0.000099s : 1: auto_monad 0.04% : 0.000021s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 1.03% : 0.000523s : 1: bootstrap 0.05% : 0.000026s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.05% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000008s : 1: environ_conv 0.22% : 0.000113s : 1: event_method 0.02% : 0.000012s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.85% : 0.000431s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.90% : 0.000458s : 1: mutable_eliminate 0.01% : 0.000006s : 1: offloading_packed_experts 0.03% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000013s : 1: opt.transform.mutable_eliminate 2.24% : 0.001138s : 78: opt.transform.opt_a 0.05% : 0.000023s : 1: opt.transform.opt_after_cconv 0.04% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000096s : 28: opt.transform.opt_b 0.08% : 0.000039s : 2: opt.transform.opt_trans_graph 0.07% : 0.000034s : 4: opt.transform.symbol_engine_opt 5.43% : 0.002762s : 1: opt_a 0.19% : 0.000099s : 1: opt_after_cconv 0.90% : 0.000460s : 1: opt_after_jit_grad 0.46% : 0.000235s : 1: opt_b 9.46% : 0.004813s : 1: optimize 0.04% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000021s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000009s : 1: overlap_recompute_comm 0.01% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000006s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000040s : 1: pre_auto_parallel 0.02% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000020s : 1: remove_dup_value 0.79% : 0.000402s : 1: renormalize.infer 0.86% : 0.000439s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000021s : 1: rewriter_after_opt_a 0.49% : 0.000247s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000075s : 1: symbol_engine_optimizer 13.54% : 0.006888s : 1: task_emit 0.13% : 0.000067s : 1: tuple_transform 48.39% : 0.024619s : 1: type_inference 0.12% : 0.000062s : 1: validate TotalTime = 0.310337, [24] [bootstrap]: 0.00047027 [type_inference]: 0.267847 [event_method]: 0.0001374 [auto_monad]: 0.00026128 [graph_reusing]: 1.885e-05 [inline]: 2.72001e-06 [add_attr]: 0.00349468, [1] [add_attr_with_inline]: 0.00348543, [1] [Cycle 1]: 0.00012712, [2] [tag_attr]: 7.034e-05 [meta_addattr_fg_expand]: 2.231e-05 [parallel-infer-symbol]: 3.45003e-06 [pre_auto_parallel]: 9.265e-05 [insert-virtual-dataset]: 3.27002e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.82001e-06 [pipeline_split]: 2.10002e-06 [optimize]: 0.0288382, [53] [py_interpret_to_execute]: 4.89e-06 [rewriter_before_opt_a]: 0.0006082 [opt_a]: 0.0261915, [3] [Cycle 1]: 0.0216782, [45] [expand_dump_flag]: 8.23999e-06 [switch_simplify]: 0.00026338 [loop_unroll]: 0.00011689 [a_1]: 0.00254417 [with_stream_mark]: 2.355e-05 [recompute_prepare]: 2.236e-05 [updatestate_depend_eliminate]: 8.28999e-06 [updatestate_assign_eliminate]: 7.2e-06 [updatestate_loads_eliminate]: 7.48e-06 [parameter_eliminate]: 3.01999e-06 [a_2]: 0.00022309 [accelerated_algorithm]: 1.535e-05 [shard]: 1.55999e-06 [meta_shard_fg_expand]: 5.97999e-06 [shard_inline]: 1.518e-05 [merge_send_recv]: 1.599e-05 [auto_parallel]: 1.037e-05 [parallel]: 1.696e-05 [flash_sp]: 9.14e-06 [merge_comm]: 9.08002e-06 [allreduce_fusion]: 8.13001e-06 [matmul_add_comm_reduction]: 2.554e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 1.697e-05 [virtual_dataset]: 1.47e-05 [get_grad_eliminate_]: 1.444e-05 [virtual_output]: 1.466e-05 [merge_forward]: 8.97e-06 [cell_reuse_recompute_pass]: 1.24998e-06 [offload_activation]: 1.684e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.636e-05 [merge_recompute_call_nodes]: 1.34e-06 [before_grad]: 2.508e-05 [set_forward_comm_id_for_comm_node_pass]: 8.76002e-06 [meta_fg_expand]: 0.00175985 [flash_sp_send_recv_attached]: 4.1e-06 [receive_attached]: 2.89999e-06 [after_resolve]: 6.717e-05 [a_after_grad]: 8.895e-05 [renormalize]: 0.0150616 [add_forward_monad_depend]: 1.102e-05 [auto_monad_grad]: 6.44999e-06 [auto_monad_eliminator]: 5.537e-05 [cse]: 0.00045987 [a_3]: 0.00034535 [Cycle 2]: 0.00374041, [45] [expand_dump_flag]: 2.02001e-06 [switch_simplify]: 4.604e-05 [loop_unroll]: 4.285e-05 [a_1]: 0.00126786 [with_stream_mark]: 1.477e-05 [recompute_prepare]: 8.74e-06 [updatestate_depend_eliminate]: 3.80998e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 2.81999e-06 [parameter_eliminate]: 1.14e-06 [a_2]: 8.345e-05 [accelerated_algorithm]: 7.45e-06 [shard]: 1.09998e-06 [meta_shard_fg_expand]: 1.97999e-06 [shard_inline]: 9.89999e-06 [merge_send_recv]: 5.64998e-06 [auto_parallel]: 6.58e-06 [parallel]: 5.03002e-06 [flash_sp]: 3.53e-06 [merge_comm]: 3.71001e-06 [allreduce_fusion]: 3.51999e-06 [matmul_add_comm_reduction]: 5.89e-06 [allreduce_slice_to_reducescatter]: 5.29981e-07 [virtual_shard_identity]: 8.09997e-06 [virtual_dataset]: 7.01999e-06 [get_grad_eliminate_]: 7.41999e-06 [virtual_output]: 7.21001e-06 [merge_forward]: 3.21999e-06 [cell_reuse_recompute_pass]: 1.00999e-06 [offload_activation]: 7.38999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.271e-05 [merge_recompute_call_nodes]: 7.29982e-07 [before_grad]: 9.81e-06 [set_forward_comm_id_for_comm_node_pass]: 3.41001e-06 [meta_fg_expand]: 0.00066461 [flash_sp_send_recv_attached]: 1.89999e-06 [receive_attached]: 1.62999e-06 [after_resolve]: 1.584e-05 [a_after_grad]: 1.128e-05 [renormalize]: 0.00110584 [add_forward_monad_depend]: 3.98001e-06 [auto_monad_grad]: 1.37999e-06 [auto_monad_eliminator]: 1.077e-05 [cse]: 2.3e-05 [a_3]: 4.902e-05 [Cycle 3]: 0.00075785, [45] [expand_dump_flag]: 1.29e-06 [switch_simplify]: 8.07e-06 [loop_unroll]: 6.53e-06 [a_1]: 0.00013004 [with_stream_mark]: 7.87e-06 [recompute_prepare]: 6.89999e-06 [updatestate_depend_eliminate]: 3.25e-06 [updatestate_assign_eliminate]: 2.66e-06 [updatestate_loads_eliminate]: 2.53998e-06 [parameter_eliminate]: 9.39996e-07 [a_2]: 8.073e-05 [accelerated_algorithm]: 6.73e-06 [shard]: 1.12e-06 [meta_shard_fg_expand]: 1.39998e-06 [shard_inline]: 6.64001e-06 [merge_send_recv]: 4.4e-06 [auto_parallel]: 7.9e-06 [parallel]: 4.17e-06 [flash_sp]: 9.79984e-07 [merge_comm]: 3.58999e-06 [allreduce_fusion]: 2.81e-06 [matmul_add_comm_reduction]: 5.75001e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 9.94001e-06 [virtual_dataset]: 6.98e-06 [get_grad_eliminate_]: 7.01001e-06 [virtual_output]: 6.23e-06 [merge_forward]: 3.04001e-06 [cell_reuse_recompute_pass]: 1.40999e-06 [offload_activation]: 6.07999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.372e-05 [merge_recompute_call_nodes]: 6.59988e-07 [before_grad]: 9.52001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.43e-06 [meta_fg_expand]: 2.37001e-06 [flash_sp_send_recv_attached]: 8.60018e-07 [receive_attached]: 1.04e-06 [after_resolve]: 6.38998e-06 [a_after_grad]: 9.57999e-06 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.72001e-06 [auto_monad_grad]: 1.03001e-06 [auto_monad_eliminator]: 7.01999e-06 [cse]: 2.006e-05 [a_3]: 4.049e-05 [py_interpret_to_execute_after_opt_a]: 3.86001e-06 [slice_cell_reuse_recomputed_activation]: 2.01e-06 [rewriter_after_opt_a]: 1.905e-05 [convert_after_rewriter]: 1.22999e-06 [order_py_execute_after_rewriter]: 1.09e-06 [mutable_eliminate]: 0.00051238 [opt_b]: 0.00022203, [1] [Cycle 1]: 0.00021549, [7] [b_1]: 0.00013922 [b_2]: 8.43001e-06 [updatestate_depend_eliminate]: 5.34e-06 [updatestate_assign_eliminate]: 2.65002e-06 [updatestate_loads_eliminate]: 2.48002e-06 [renormalize]: 4.19997e-07 [cse]: 2.32e-05 [optimize_parallel_all_gather_comm]: 1.491e-05 [overlap_param_gather]: 2.36998e-06 [cconv]: 2.074e-05 [loop_unroll]: 0.00044548 [opt_after_cconv]: 0.00010831, [1] [Cycle 1]: 0.00010264, [7] [c_1]: 3.27e-05 [parameter_eliminate]: 2.25002e-06 [updatestate_depend_eliminate]: 5.72001e-06 [updatestate_assign_eliminate]: 2.61e-06 [updatestate_loads_eliminate]: 2.37001e-06 [cse]: 2.267e-05 [renormalize]: 4.90021e-07 [remove_dup_value]: 1.901e-05 [tuple_transform]: 0.00017493, [1] [Cycle 1]: 7.341e-05, [4] [d_1]: 4.659e-05 [none_parameter_eliminate]: 2.21998e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.35e-06 [partial_unused_args_eliminate]: 2.49999e-06 [add_recomputation]: 4.277e-05 [cse_after_recomputation]: 3.04e-05, [1] [Cycle 1]: 2.54e-05, [1] [cse]: 1.926e-05 [environ_conv]: 8.74e-06 [swap_dp_allreduce_reducescatter]: 5.27999e-06 [bias_add_comm_swap]: 2.24001e-06 [label_micro_interleaved_index]: 4.00998e-06 [label_fine_grained_interleaved_index]: 2.34001e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 1.87999e-06 [micro_interleaved_order_control]: 2.38998e-06 [assign_add_opt]: 1.26002e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 9.5999e-07 [full_micro_interleaved_order_control]: 2.13998e-06 [reorder_send_recv_between_fp_bp]: 2.51e-06 [comm_op_add_attrs]: 1.22e-06 [add_comm_op_reuse_tag]: 1.24e-06 [interleave_split_concat_branches]: 1.09998e-06 [interleave_parallel_branches]: 1.05001e-06 [overlap_opt_shard_in_pipeline]: 1.37e-06 [overlap_opt_shard_grad_in_pipeline]: 1.60999e-06 [control_data_broadcast_order]: 1.271e-05 [grouped_pairwise_exchange_alltoall]: 1.44e-06 [offloading_packed_experts]: 3.38e-06 [overlap_recompute_and_grad_model_parallel]: 4.4e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.33998e-06 [overlap_grad_ring_attention]: 4.00998e-06 [overlap_grad_flash_sp]: 1.769e-05 [begin_end_overlap_inline]: 4.7998e-07 [split_matmul_comm_elemetwise]: 1.90001e-06 [split_layernorm_comm]: 1.74998e-06 [handle_group_info]: 1.07e-06 [symbol_engine_optimizer]: 8.028e-05, [1] [Cycle 1]: 7.533e-05, [6] [build]: 2.41e-06 [elim_shapecalc]: 1.17e-05 [elim_not_effective]: 1.433e-05 [opt_reshape]: 8.43001e-06 [fold_const_symbol]: 1.109e-05 [renormalize]: 2.10013e-07 [detach_backward]: 1.71998e-06 [pipeline_parallel_scheduler]: 1.59e-06 [auto_monad_reorder]: 1.833e-05 [get_jit_bprop_graph]: 1.32e-06 [rewriter_after_jit_bprop_graph]: 3.63e-06 [opt_after_jit_grad]: 0.00048625 [validate]: 4.185e-05 [backend_pass]: 9.50007e-07 [task_emit]: 0.00837929 [execute]: 6.96001e-06 Sums bootstrap : 0.000470s : 0.15% type_inference : 0.267847s : 87.71% event_method : 0.000137s : 0.04% auto_monad : 0.000261s : 0.09% graph_reusing : 0.000019s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000070s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000022s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000093s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000608s : 0.20% optimize.opt_a.expand_dump_flag : 0.000012s : 0.00% optimize.opt_a.switch_simplify : 0.000317s : 0.10% optimize.opt_a.loop_unroll : 0.000166s : 0.05% optimize.opt_a.a_1 : 0.003942s : 1.29% optimize.opt_a.with_stream_mark : 0.000046s : 0.02% optimize.opt_a.recompute_prepare : 0.000038s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000015s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000013s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000013s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000387s : 0.13% optimize.opt_a.accelerated_algorithm : 0.000030s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000009s : 0.00% optimize.opt_a.shard_inline : 0.000032s : 0.01% optimize.opt_a.merge_send_recv : 0.000026s : 0.01% optimize.opt_a.auto_parallel : 0.000025s : 0.01% optimize.opt_a.parallel : 0.000026s : 0.01% optimize.opt_a.flash_sp : 0.000014s : 0.00% optimize.opt_a.merge_comm : 0.000016s : 0.01% optimize.opt_a.allreduce_fusion : 0.000014s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000037s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000035s : 0.01% optimize.opt_a.virtual_dataset : 0.000029s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000029s : 0.01% optimize.opt_a.virtual_output : 0.000028s : 0.01% optimize.opt_a.merge_forward : 0.000015s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000030s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000053s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000044s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.01% optimize.opt_a.meta_fg_expand : 0.002427s : 0.79% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000089s : 0.03% optimize.opt_a.a_after_grad : 0.000110s : 0.04% optimize.opt_a.renormalize : 0.016167s : 5.29% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.01% optimize.opt_a.auto_monad_grad : 0.000009s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000073s : 0.02% optimize.opt_a.cse : 0.000503s : 0.16% optimize.opt_a.a_3 : 0.000435s : 0.14% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000019s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000512s : 0.17% optimize.opt_b.b_1 : 0.000139s : 0.05% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000021s : 0.01% optimize.loop_unroll : 0.000445s : 0.15% optimize.opt_after_cconv.c_1 : 0.000033s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000023s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.01% optimize.tuple_transform.d_1 : 0.000047s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000043s : 0.01% optimize.cse_after_recomputation.cse : 0.000019s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000018s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000486s : 0.16% validate : 0.000042s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 0.008379s : 2.74% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.001011 183 0.22% : 0.000002s : 2: substitution.elim_not_effective 1.00% : 0.000010s : 14: substitution.float_depend_g_call 0.29% : 0.000003s : 2: substitution.float_tuple_getitem_switch 0.19% : 0.000002s : 2: substitution.fold_const_symbol 0.56% : 0.000006s : 4: substitution.graph_param_transform 0.30% : 0.000003s : 2: substitution.incorporate_call 0.18% : 0.000002s : 2: substitution.incorporate_call_switch 76.15% : 0.000770s : 36: substitution.inline 1.74% : 0.000018s : 2: substitution.inline_without_move 0.78% : 0.000008s : 12: substitution.j_node_and_user_rematch 0.97% : 0.000010s : 7: substitution.minmaximum_grad 1.09% : 0.000011s : 14: substitution.partial_eliminate 1.02% : 0.000010s : 12: substitution.remove_not_recompute_node 2.30% : 0.000023s : 9: substitution.replace_applicator 0.60% : 0.000006s : 9: substitution.replace_old_param 0.26% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.89% : 0.000019s : 8: substitution.switch_simplify 2.08% : 0.000021s : 7: substitution.tuple_list_convert_item_index_to_positive 0.96% : 0.000010s : 7: substitution.tuple_list_get_item_const_eliminator 1.30% : 0.000013s : 7: substitution.tuple_list_get_item_depend_reorder 4.85% : 0.000049s : 17: substitution.tuple_list_get_item_eliminator 1.28% : 0.000013s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.266377 2 96.82% : 0.257905s : 1: type_inference.infer 3.18% : 0.008472s : 1: type_inference.specialize ------[replace.] 0.000432 52 58.70% : 0.000254s : 36: replace.inline 17.22% : 0.000074s : 8: replace.switch_simplify 24.08% : 0.000104s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.000788 52 95.21% : 0.000750s : 36: match.inline 1.82% : 0.000014s : 8: match.switch_simplify 2.97% : 0.000023s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.000760 5362 1.24% : 0.000009s : 71: predicate.accumulaten_eliminater 0.26% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.35% : 0.000003s : 21: predicate.addn_check_dump 1.16% : 0.000009s : 71: predicate.addn_zero_filter 1.17% : 0.000009s : 71: predicate.adjust_all_reduce_mul_add 2.09% : 0.000016s : 92: predicate.arithmetic_simplify 1.19% : 0.000009s : 71: predicate.cast_eliminate 0.96% : 0.000007s : 52: predicate.check_bprop_eliminate 0.34% : 0.000003s : 21: predicate.compare_switch_simplify 0.06% : 0.000000s : 4: predicate.const_output_eliminate 0.37% : 0.000003s : 21: predicate.depend_value_elim 1.23% : 0.000009s : 71: predicate.dict_get_item_const_eliminator 1.44% : 0.000011s : 71: predicate.dict_get_item_eliminator 1.16% : 0.000009s : 71: predicate.dict_set_item_eliminator 0.26% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.06% : 0.000000s : 4: predicate.elim_not_effective 0.10% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000010s : 75: predicate.environ_add_const_eliminate 1.21% : 0.000009s : 75: predicate.environ_get_add_eliminate 1.20% : 0.000009s : 75: predicate.environ_get_depend_swap 1.56% : 0.000012s : 96: predicate.environ_get_eliminate 1.23% : 0.000009s : 75: predicate.environ_get_set_eliminate 2.01% : 0.000015s : 115: predicate.exchange_switch_depend_value 2.83% : 0.000021s : 115: predicate.float_depend_g_call 0.35% : 0.000003s : 21: predicate.float_environ_get_switch 0.43% : 0.000003s : 25: predicate.float_tuple_getitem_switch 0.05% : 0.000000s : 4: predicate.fold_const_symbol 0.45% : 0.000003s : 21: predicate.get_grad_eliminate 0.06% : 0.000000s : 4: predicate.graph_param_transform 0.36% : 0.000003s : 21: predicate.incorporate_call 0.34% : 0.000003s : 21: predicate.incorporate_call_switch 5.65% : 0.000043s : 236: predicate.inline 1.16% : 0.000009s : 48: predicate.inline_without_move 0.21% : 0.000002s : 21: predicate.j_node_and_user_rematch 0.50% : 0.000004s : 21: predicate.less_batch_normalization 1.46% : 0.000011s : 87: predicate.list_to_tuple_eliminator_ 2.57% : 0.000020s : 158: predicate.load_eliminater 0.26% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.44% : 0.000026s : 185: predicate.loop_unroll_before_grad 1.42% : 0.000011s : 79: predicate.make_slice_get_slice_eliminator 0.40% : 0.000003s : 21: predicate.merge_addn 0.88% : 0.000007s : 52: predicate.micro_step_allgather_replace 0.90% : 0.000007s : 52: predicate.mini_step_allgather_replace 1.11% : 0.000008s : 71: predicate.minmaximum_grad 0.28% : 0.000002s : 4: predicate.mutable_eliminate 0.11% : 0.000001s : 4: predicate.opt_reshape 0.09% : 0.000001s : 4: predicate.parallel_virtual_node 2.86% : 0.000022s : 115: predicate.partial_defer_inline 1.58% : 0.000012s : 83: predicate.partial_eliminate 1.15% : 0.000009s : 71: predicate.print_const_string_wrapper 0.45% : 0.000003s : 21: predicate.reduce_all_const_elim 1.58% : 0.000012s : 71: predicate.reduce_eliminate 2.54% : 0.000019s : 158: predicate.redundant_stop_gradient_eliminater 0.25% : 0.000002s : 21: predicate.remove_not_recompute_node 1.79% : 0.000014s : 131: predicate.replace_applicator 0.56% : 0.000004s : 48: predicate.replace_old_param 0.08% : 0.000001s : 4: predicate.reset_defer_inline 1.23% : 0.000009s : 71: predicate.reshape_eliminate 0.93% : 0.000007s : 52: predicate.row_tensor_add_zeros_like 0.11% : 0.000001s : 4: predicate.row_tensor_eliminate 1.16% : 0.000009s : 52: predicate.same_eliminate 0.28% : 0.000002s : 21: predicate.set_cell_output_no_recompute 0.47% : 0.000004s : 21: predicate.shard_identity_eliminate 0.19% : 0.000001s : 8: predicate.special_op_eliminate 0.42% : 0.000003s : 21: predicate.specialize_transform 1.01% : 0.000008s : 52: predicate.split_environ_get_set_with_tuple_value 1.09% : 0.000008s : 48: predicate.stack_unstack_eliminate 0.08% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.25% : 0.000017s : 115: predicate.switch_defer_inline 3.13% : 0.000024s : 167: predicate.switch_layer_defer_inline 6.94% : 0.000053s : 341: predicate.switch_simplify 1.21% : 0.000009s : 71: predicate.tile_eliminate 1.11% : 0.000008s : 71: predicate.transpose_eliminate 1.42% : 0.000011s : 79: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000012s : 79: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000011s : 79: predicate.tuple_list_get_item_depend_reorder 2.45% : 0.000019s : 108: predicate.tuple_list_get_item_eliminator 1.48% : 0.000011s : 79: predicate.tuple_list_get_set_item_eliminator 1.97% : 0.000015s : 100: predicate.tuple_list_set_item_eliminator 1.46% : 0.000011s : 87: predicate.tuple_to_list_eliminator_ 2.54% : 0.000019s : 158: predicate.updatestate_pure_node_eliminater 2.87% : 0.000022s : 179: predicate.updatestate_useless_node_eliminater 0.14% : 0.000001s : 4: predicate.value_based_eliminate 0.41% : 0.000003s : 21: predicate.virtual_dataset_eliminate 0.41% : 0.000003s : 21: predicate.virtual_output_eliminate 0.07% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.10% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.007195 80 72.17% : 0.005193s : 40: func_graph_cloner_run.FuncGraphClonerGraph 27.83% : 0.002003s : 40: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.364655 237 0.00% : 0.000003s : 1: ForceFp32Comm 0.96% : 0.003500s : 1: add_attr 0.96% : 0.003490s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000048s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.07% : 0.000272s : 1: auto_monad 0.01% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.14% : 0.000499s : 1: bootstrap 0.01% : 0.000024s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.04% : 0.000148s : 1: event_method 0.00% : 0.000012s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000023s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.12% : 0.000454s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.14% : 0.000521s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000016s : 1: opt.transform.mutable_eliminate 1.54% : 0.005626s : 117: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000117s : 28: opt.transform.opt_b 0.01% : 0.000052s : 2: opt.transform.opt_trans_graph 0.01% : 0.000042s : 4: opt.transform.symbol_engine_opt 7.18% : 0.026195s : 1: opt_a 0.03% : 0.000112s : 1: opt_after_cconv 0.14% : 0.000496s : 1: opt_after_jit_grad 0.06% : 0.000226s : 1: opt_b 7.91% : 0.028843s : 1: optimize 0.01% : 0.000018s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000021s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000098s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000023s : 1: remove_dup_value 3.73% : 0.013607s : 2: renormalize.infer 0.70% : 0.002543s : 2: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000022s : 1: rewriter_after_opt_a 0.17% : 0.000617s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000004s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000083s : 1: symbol_engine_optimizer 2.30% : 0.008389s : 1: task_emit 0.05% : 0.000179s : 1: tuple_transform 73.46% : 0.267872s : 1: type_inference 0.02% : 0.000069s : 1: validate TotalTime = 0.0403752, [24] [bootstrap]: 0.00045928 [type_inference]: 0.0243756 [event_method]: 0.00011625 [auto_monad]: 7.76e-05 [graph_reusing]: 7.26001e-06 [inline]: 1.77999e-06 [add_attr]: 0.00302694, [1] [add_attr_with_inline]: 0.00301895, [1] [Cycle 1]: 5.796e-05, [2] [tag_attr]: 2.445e-05 [meta_addattr_fg_expand]: 7.11001e-06 [parallel-infer-symbol]: 3.10002e-06 [pre_auto_parallel]: 3.589e-05 [insert-virtual-dataset]: 2.41e-06 [parallel-infer-symbol-second]: 8.00006e-07 [dataset_repeat_opt]: 1.83002e-06 [pipeline_split]: 1.97001e-06 [optimize]: 0.00482911, [53] [py_interpret_to_execute]: 4.33001e-06 [rewriter_before_opt_a]: 0.0002405 [opt_a]: 0.00281664, [2] [Cycle 1]: 0.00223906, [45] [expand_dump_flag]: 3.33e-06 [switch_simplify]: 5.19e-05 [loop_unroll]: 3.942e-05 [a_1]: 0.00064543 [with_stream_mark]: 1.387e-05 [recompute_prepare]: 7.31999e-06 [updatestate_depend_eliminate]: 3.71999e-06 [updatestate_assign_eliminate]: 3.93999e-06 [updatestate_loads_eliminate]: 3.08998e-06 [parameter_eliminate]: 2.12999e-06 [a_2]: 7.612e-05 [accelerated_algorithm]: 6.54001e-06 [shard]: 1.69998e-06 [meta_shard_fg_expand]: 1.96e-06 [shard_inline]: 5.76998e-06 [merge_send_recv]: 8.22e-06 [auto_parallel]: 5.77001e-06 [parallel]: 1.696e-05 [flash_sp]: 7.00002e-06 [merge_comm]: 3.71001e-06 [allreduce_fusion]: 3.31999e-06 [matmul_add_comm_reduction]: 9.24998e-06 [allreduce_slice_to_reducescatter]: 6.19999e-07 [virtual_shard_identity]: 7.14001e-06 [virtual_dataset]: 6.11e-06 [get_grad_eliminate_]: 6.20997e-06 [virtual_output]: 6.17001e-06 [merge_forward]: 4.58999e-06 [cell_reuse_recompute_pass]: 1.12e-06 [offload_activation]: 9.59999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.139e-05 [merge_recompute_call_nodes]: 1.39e-06 [before_grad]: 9.32999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.53e-06 [meta_fg_expand]: 2.84001e-06 [flash_sp_send_recv_attached]: 2.76e-06 [receive_attached]: 2.29999e-06 [after_resolve]: 9.29e-06 [a_after_grad]: 8.52e-06 [renormalize]: 0.00089471 [add_forward_monad_depend]: 5.44e-06 [auto_monad_grad]: 1.60999e-06 [auto_monad_eliminator]: 1.569e-05 [cse]: 3.595e-05 [a_3]: 4.494e-05 [Cycle 2]: 0.00056812, [45] [expand_dump_flag]: 1.07e-06 [switch_simplify]: 7.26001e-06 [loop_unroll]: 6.09999e-06 [a_1]: 9.625e-05 [with_stream_mark]: 1.049e-05 [recompute_prepare]: 5.76003e-06 [updatestate_depend_eliminate]: 3.16001e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 2.81e-06 [parameter_eliminate]: 1.01002e-06 [a_2]: 6.773e-05 [accelerated_algorithm]: 5.76e-06 [shard]: 1.05001e-06 [meta_shard_fg_expand]: 1.42999e-06 [shard_inline]: 5.49998e-06 [merge_send_recv]: 4.44002e-06 [auto_parallel]: 5.39998e-06 [parallel]: 4.38999e-06 [flash_sp]: 3.29001e-06 [merge_comm]: 3.16001e-06 [allreduce_fusion]: 2.89999e-06 [matmul_add_comm_reduction]: 4.97999e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 6.01e-06 [virtual_dataset]: 5.60001e-06 [get_grad_eliminate_]: 5.39e-06 [virtual_output]: 5.47001e-06 [merge_forward]: 2.77002e-06 [cell_reuse_recompute_pass]: 1.20001e-06 [offload_activation]: 6.16998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.143e-05 [merge_recompute_call_nodes]: 6.30011e-07 [before_grad]: 8.70001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.08e-06 [meta_fg_expand]: 1.96e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 9.20001e-07 [after_resolve]: 8.42e-06 [a_after_grad]: 8.03999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.15999e-06 [auto_monad_grad]: 7.80012e-07 [auto_monad_eliminator]: 6.26e-06 [cse]: 1.517e-05 [a_3]: 3.382e-05 [py_interpret_to_execute_after_opt_a]: 4.00998e-06 [slice_cell_reuse_recomputed_activation]: 2.66999e-06 [rewriter_after_opt_a]: 1.797e-05 [convert_after_rewriter]: 1.44e-06 [order_py_execute_after_rewriter]: 1.42999e-06 [mutable_eliminate]: 0.00044563 [opt_b]: 0.00019143, [1] [Cycle 1]: 0.0001859, [7] [b_1]: 0.0001139 [b_2]: 7.11999e-06 [updatestate_depend_eliminate]: 5.32001e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.34999e-06 [renormalize]: 4.00003e-07 [cse]: 2.021e-05 [optimize_parallel_all_gather_comm]: 1.565e-05 [overlap_param_gather]: 2.07001e-06 [cconv]: 4.307e-05 [loop_unroll]: 0.00041434 [opt_after_cconv]: 9.723e-05, [1] [Cycle 1]: 9.179e-05, [7] [c_1]: 2.582e-05 [parameter_eliminate]: 2.29001e-06 [updatestate_depend_eliminate]: 5.41002e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.38998e-06 [cse]: 1.994e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 1.726e-05 [tuple_transform]: 6.469e-05, [1] [Cycle 1]: 6.032e-05, [4] [d_1]: 3.466e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.47001e-06 [partial_unused_args_eliminate]: 2.02001e-06 [add_recomputation]: 4.567e-05 [cse_after_recomputation]: 2.286e-05, [1] [Cycle 1]: 1.863e-05, [1] [cse]: 1.328e-05 [environ_conv]: 5.52999e-06 [swap_dp_allreduce_reducescatter]: 6.09001e-06 [bias_add_comm_swap]: 2.62001e-06 [label_micro_interleaved_index]: 4.07e-06 [label_fine_grained_interleaved_index]: 2.64001e-06 [merge_cast_opt]: 1.42999e-06 [slice_recompute_activation]: 1.95001e-06 [micro_interleaved_order_control]: 2.37001e-06 [assign_add_opt]: 1.20001e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.08001e-06 [full_micro_interleaved_order_control]: 2.11998e-06 [reorder_send_recv_between_fp_bp]: 2.77002e-06 [comm_op_add_attrs]: 1.01002e-06 [add_comm_op_reuse_tag]: 1.06002e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.09e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 1.79e-06 [control_data_broadcast_order]: 1.229e-05 [grouped_pairwise_exchange_alltoall]: 1.50999e-06 [offloading_packed_experts]: 3.53e-06 [overlap_recompute_and_grad_model_parallel]: 4.34002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.26e-06 [overlap_grad_ring_attention]: 3.95998e-06 [overlap_grad_flash_sp]: 1.684e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.61e-06 [split_layernorm_comm]: 1.79998e-06 [handle_group_info]: 9.80013e-07 [symbol_engine_optimizer]: 7.08e-05, [1] [Cycle 1]: 6.684e-05, [6] [build]: 2.54001e-06 [elim_shapecalc]: 9.17999e-06 [elim_not_effective]: 1.22e-05 [opt_reshape]: 6.53e-06 [fold_const_symbol]: 9.10999e-06 [renormalize]: 2.00002e-07 [detach_backward]: 1.54998e-06 [pipeline_parallel_scheduler]: 1.41998e-06 [auto_monad_reorder]: 1.927e-05 [get_jit_bprop_graph]: 1.25001e-06 [rewriter_after_jit_bprop_graph]: 3.36001e-06 [opt_after_jit_grad]: 0.00044327 [validate]: 3.579e-05 [backend_pass]: 8.89995e-07 [task_emit]: 0.0067206 [execute]: 7.19001e-06 Sums bootstrap : 0.000459s : 1.26% type_inference : 0.024376s : 66.96% event_method : 0.000116s : 0.32% auto_monad : 0.000078s : 0.21% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000024s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000036s : 0.10% insert-virtual-dataset : 0.000002s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000240s : 0.66% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000059s : 0.16% optimize.opt_a.loop_unroll : 0.000046s : 0.13% optimize.opt_a.a_1 : 0.000742s : 2.04% optimize.opt_a.with_stream_mark : 0.000024s : 0.07% optimize.opt_a.recompute_prepare : 0.000013s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000144s : 0.40% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.03% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000011s : 0.03% optimize.opt_a.merge_send_recv : 0.000013s : 0.03% optimize.opt_a.auto_parallel : 0.000011s : 0.03% optimize.opt_a.parallel : 0.000021s : 0.06% optimize.opt_a.flash_sp : 0.000010s : 0.03% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000013s : 0.04% optimize.opt_a.virtual_dataset : 0.000012s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.03% optimize.opt_a.virtual_output : 0.000012s : 0.03% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000018s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000018s : 0.05% optimize.opt_a.a_after_grad : 0.000017s : 0.05% optimize.opt_a.renormalize : 0.000895s : 2.46% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.02% optimize.opt_a.auto_monad_grad : 0.000002s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.06% optimize.opt_a.cse : 0.000051s : 0.14% optimize.opt_a.a_3 : 0.000079s : 0.22% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000018s : 0.05% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000446s : 1.22% optimize.opt_b.b_1 : 0.000114s : 0.31% optimize.opt_b.b_2 : 0.000007s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.06% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.04% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000043s : 0.12% optimize.loop_unroll : 0.000414s : 1.14% optimize.opt_after_cconv.c_1 : 0.000026s : 0.07% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.05% optimize.tuple_transform.d_1 : 0.000035s : 0.10% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000046s : 0.13% optimize.cse_after_recomputation.cse : 0.000013s : 0.04% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.03% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000017s : 0.05% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000019s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000443s : 1.22% validate : 0.000036s : 0.10% backend_pass : 0.000001s : 0.00% task_emit : 0.006721s : 18.46% execute : 0.000007s : 0.02% Time group info: ------[substitution.] 0.000173 26 1.05% : 0.000002s : 2: substitution.elim_not_effective 0.80% : 0.000001s : 2: substitution.fold_const_symbol 2.96% : 0.000005s : 3: substitution.graph_param_transform 82.24% : 0.000142s : 6: substitution.inline 1.67% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.50% : 0.000004s : 4: substitution.remove_not_recompute_node 1.52% : 0.000003s : 2: substitution.replace_old_param 7.26% : 0.000013s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.024315 2 93.49% : 0.022732s : 1: type_inference.infer 6.51% : 0.001583s : 1: type_inference.specialize ------[replace.] 0.000077 9 70.02% : 0.000054s : 6: replace.inline 29.98% : 0.000023s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000149 9 92.64% : 0.000138s : 6: match.inline 7.36% : 0.000011s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000187 1162 0.95% : 0.000002s : 13: predicate.accumulaten_eliminater 1.02% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 0.44% : 0.000001s : 6: predicate.addn_check_dump 1.00% : 0.000002s : 13: predicate.addn_zero_filter 0.89% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.22% : 0.000004s : 19: predicate.arithmetic_simplify 0.93% : 0.000002s : 13: predicate.cast_eliminate 0.51% : 0.000001s : 6: predicate.check_bprop_eliminate 0.48% : 0.000001s : 6: predicate.compare_switch_simplify 0.14% : 0.000000s : 3: predicate.const_output_eliminate 0.53% : 0.000001s : 6: predicate.depend_value_elim 0.97% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.16% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.98% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.90% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.19% : 0.000000s : 3: predicate.elim_not_effective 0.31% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000002s : 16: predicate.environ_add_const_eliminate 1.09% : 0.000002s : 16: predicate.environ_get_add_eliminate 1.13% : 0.000002s : 16: predicate.environ_get_depend_swap 1.74% : 0.000003s : 22: predicate.environ_get_eliminate 1.08% : 0.000002s : 16: predicate.environ_get_set_eliminate 1.68% : 0.000003s : 22: predicate.exchange_switch_depend_value 2.60% : 0.000005s : 22: predicate.float_depend_g_call 0.45% : 0.000001s : 6: predicate.float_environ_get_switch 0.69% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 3: predicate.fold_const_symbol 0.56% : 0.000001s : 6: predicate.get_grad_eliminate 0.17% : 0.000000s : 3: predicate.graph_param_transform 0.47% : 0.000001s : 6: predicate.incorporate_call 0.42% : 0.000001s : 6: predicate.incorporate_call_switch 5.85% : 0.000011s : 53: predicate.inline 0.61% : 0.000001s : 6: predicate.inline_without_move 0.28% : 0.000001s : 6: predicate.j_node_and_user_rematch 0.79% : 0.000001s : 6: predicate.less_batch_normalization 1.94% : 0.000004s : 22: predicate.list_to_tuple_eliminator_ 2.60% : 0.000005s : 35: predicate.load_eliminater 0.93% : 0.000002s : 3: predicate.loop_unroll_after_grad 3.19% : 0.000006s : 42: predicate.loop_unroll_before_grad 1.79% : 0.000003s : 19: predicate.make_slice_get_slice_eliminator 0.50% : 0.000001s : 6: predicate.merge_addn 0.47% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.66% : 0.000001s : 6: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 13: predicate.minmaximum_grad 0.96% : 0.000002s : 3: predicate.mutable_eliminate 0.31% : 0.000001s : 3: predicate.opt_reshape 0.33% : 0.000001s : 3: predicate.parallel_virtual_node 2.23% : 0.000004s : 22: predicate.partial_defer_inline 1.48% : 0.000003s : 19: predicate.partial_eliminate 1.05% : 0.000002s : 13: predicate.print_const_string_wrapper 0.51% : 0.000001s : 6: predicate.reduce_all_const_elim 1.17% : 0.000002s : 13: predicate.reduce_eliminate 2.55% : 0.000005s : 35: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 6: predicate.remove_not_recompute_node 1.24% : 0.000002s : 22: predicate.replace_applicator 0.49% : 0.000001s : 6: predicate.replace_old_param 0.24% : 0.000000s : 3: predicate.reset_defer_inline 1.07% : 0.000002s : 13: predicate.reshape_eliminate 0.57% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 3: predicate.row_tensor_eliminate 0.66% : 0.000001s : 6: predicate.same_eliminate 0.36% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.63% : 0.000001s : 6: predicate.shard_identity_eliminate 0.66% : 0.000001s : 6: predicate.special_op_eliminate 0.61% : 0.000001s : 6: predicate.specialize_transform 0.74% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.59% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.76% : 0.000003s : 22: predicate.switch_defer_inline 2.21% : 0.000004s : 28: predicate.switch_layer_defer_inline 6.04% : 0.000011s : 73: predicate.switch_simplify 1.02% : 0.000002s : 13: predicate.tile_eliminate 0.97% : 0.000002s : 13: predicate.transpose_eliminate 1.52% : 0.000003s : 19: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000003s : 19: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000003s : 19: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000006s : 28: predicate.tuple_list_get_item_eliminator 1.44% : 0.000003s : 19: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000004s : 25: predicate.tuple_list_set_item_eliminator 1.88% : 0.000004s : 22: predicate.tuple_to_list_eliminator_ 2.57% : 0.000005s : 35: predicate.updatestate_pure_node_eliminater 3.03% : 0.000006s : 41: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 3: predicate.value_based_eliminate 0.57% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 6: predicate.virtual_output_eliminate 0.23% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001005 16 54.18% : 0.000545s : 8: func_graph_cloner_run.FuncGraphClonerGraph 45.82% : 0.000461s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.050434 196 0.01% : 0.000005s : 1: ForceFp32Comm 6.01% : 0.003031s : 1: add_attr 5.99% : 0.003022s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.17% : 0.000085s : 1: auto_monad 0.05% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000005s : 1: backend_pass 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.97% : 0.000491s : 1: bootstrap 0.09% : 0.000047s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000015s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.05% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.25% : 0.000125s : 1: event_method 0.02% : 0.000012s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000005s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.84% : 0.000422s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.90% : 0.000454s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000014s : 1: opt.transform.mutable_eliminate 2.28% : 0.001149s : 78: opt.transform.opt_a 0.05% : 0.000024s : 1: opt.transform.opt_after_cconv 0.04% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000094s : 28: opt.transform.opt_b 0.08% : 0.000039s : 2: opt.transform.opt_trans_graph 0.07% : 0.000034s : 4: opt.transform.symbol_engine_opt 5.59% : 0.002820s : 1: opt_a 0.20% : 0.000101s : 1: opt_after_cconv 0.90% : 0.000452s : 1: opt_after_jit_grad 0.39% : 0.000195s : 1: opt_b 9.58% : 0.004833s : 1: optimize 0.04% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000020s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000006s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000040s : 1: pre_auto_parallel 0.02% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000021s : 1: remove_dup_value 0.82% : 0.000413s : 1: renormalize.infer 0.94% : 0.000474s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000021s : 1: rewriter_after_opt_a 0.49% : 0.000246s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000073s : 1: symbol_engine_optimizer 13.35% : 0.006731s : 1: task_emit 0.13% : 0.000067s : 1: tuple_transform 48.36% : 0.024391s : 1: type_inference 0.13% : 0.000063s : 1: validate TotalTime = 0.196875, [24] [bootstrap]: 0.00076956 [type_inference]: 0.165397 [event_method]: 0.00114414 [auto_monad]: 0.00019307 [graph_reusing]: 1.216e-05 [inline]: 2.26998e-06 [add_attr]: 0.00347268, [1] [add_attr_with_inline]: 0.00346214, [1] [Cycle 1]: 0.00010321, [2] [tag_attr]: 5.749e-05 [meta_addattr_fg_expand]: 1.688e-05 [parallel-infer-symbol]: 2.86e-06 [pre_auto_parallel]: 7.137e-05 [insert-virtual-dataset]: 2.26998e-06 [parallel-infer-symbol-second]: 6.30011e-07 [dataset_repeat_opt]: 1.65001e-06 [pipeline_split]: 1.50999e-06 [optimize]: 0.00949732, [53] [py_interpret_to_execute]: 4.80999e-06 [rewriter_before_opt_a]: 0.00048071 [opt_a]: 0.00711028, [2] [Cycle 1]: 0.00647135, [45] [expand_dump_flag]: 6.61e-06 [switch_simplify]: 0.00020121 [loop_unroll]: 8.678e-05 [a_1]: 0.00173724 [with_stream_mark]: 3.488e-05 [recompute_prepare]: 1.059e-05 [updatestate_depend_eliminate]: 4.52e-06 [updatestate_assign_eliminate]: 3.46999e-06 [updatestate_loads_eliminate]: 2.98e-06 [parameter_eliminate]: 1.71e-06 [a_2]: 8.474e-05 [accelerated_algorithm]: 6.90998e-06 [shard]: 1.70001e-06 [meta_shard_fg_expand]: 3.39001e-06 [shard_inline]: 6.57002e-06 [merge_send_recv]: 8.25e-06 [auto_parallel]: 6.16e-06 [parallel]: 1.739e-05 [flash_sp]: 7.50998e-06 [merge_comm]: 3.2e-06 [allreduce_fusion]: 3.21001e-06 [matmul_add_comm_reduction]: 8.09002e-06 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 7.63999e-06 [virtual_dataset]: 7.01001e-06 [get_grad_eliminate_]: 6.63e-06 [virtual_output]: 6.92002e-06 [merge_forward]: 4.28999e-06 [cell_reuse_recompute_pass]: 1.04e-06 [offload_activation]: 8.99e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.292e-05 [merge_recompute_call_nodes]: 1.34e-06 [before_grad]: 1.055e-05 [set_forward_comm_id_for_comm_node_pass]: 3.81999e-06 [meta_fg_expand]: 4.90001e-06 [flash_sp_send_recv_attached]: 2.56e-06 [receive_attached]: 2.39001e-06 [after_resolve]: 1.029e-05 [a_after_grad]: 1.033e-05 [renormalize]: 0.00376328 [add_forward_monad_depend]: 5.54e-06 [auto_monad_grad]: 2.34001e-06 [auto_monad_eliminator]: 1.532e-05 [cse]: 3.319e-05 [a_3]: 4.909e-05 [Cycle 2]: 0.0006291, [45] [expand_dump_flag]: 1.23002e-06 [switch_simplify]: 7.75998e-06 [loop_unroll]: 6.86001e-06 [a_1]: 0.00013146 [with_stream_mark]: 1.163e-05 [recompute_prepare]: 6.26998e-06 [updatestate_depend_eliminate]: 3.00002e-06 [updatestate_assign_eliminate]: 2.44001e-06 [updatestate_loads_eliminate]: 2.22999e-06 [parameter_eliminate]: 9.39996e-07 [a_2]: 7.442e-05 [accelerated_algorithm]: 6.34999e-06 [shard]: 1.24e-06 [meta_shard_fg_expand]: 1.37e-06 [shard_inline]: 5.89e-06 [merge_send_recv]: 4.38999e-06 [auto_parallel]: 5.44e-06 [parallel]: 4.31002e-06 [flash_sp]: 3.31001e-06 [merge_comm]: 3.04999e-06 [allreduce_fusion]: 3.01001e-06 [matmul_add_comm_reduction]: 4.85001e-06 [allreduce_slice_to_reducescatter]: 3.00002e-07 [virtual_shard_identity]: 7.33e-06 [virtual_dataset]: 6.36e-06 [get_grad_eliminate_]: 5.77999e-06 [virtual_output]: 6.04001e-06 [merge_forward]: 2.75002e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 6.21e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.367e-05 [merge_recompute_call_nodes]: 7.10017e-07 [before_grad]: 9.02999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.09001e-06 [meta_fg_expand]: 2.00002e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 9.29984e-07 [after_resolve]: 8.69998e-06 [a_after_grad]: 9.64e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.09998e-06 [auto_monad_grad]: 9.30013e-07 [auto_monad_eliminator]: 6.06e-06 [cse]: 1.524e-05 [a_3]: 3.743e-05 [py_interpret_to_execute_after_opt_a]: 4.48999e-06 [slice_cell_reuse_recomputed_activation]: 2.06003e-06 [rewriter_after_opt_a]: 1.582e-05 [convert_after_rewriter]: 1.71998e-06 [order_py_execute_after_rewriter]: 1.36998e-06 [mutable_eliminate]: 0.00047597 [opt_b]: 0.0002077, [1] [Cycle 1]: 0.00020124, [7] [b_1]: 0.00012967 [b_2]: 8.04997e-06 [updatestate_depend_eliminate]: 5.25001e-06 [updatestate_assign_eliminate]: 2.58e-06 [updatestate_loads_eliminate]: 2.37001e-06 [renormalize]: 4.10015e-07 [cse]: 2.014e-05 [optimize_parallel_all_gather_comm]: 3.632e-05 [overlap_param_gather]: 2.14e-06 [cconv]: 2.341e-05 [loop_unroll]: 0.00045587 [opt_after_cconv]: 0.00010446, [1] [Cycle 1]: 9.901e-05, [7] [c_1]: 3.345e-05 [parameter_eliminate]: 2.41e-06 [updatestate_depend_eliminate]: 5.14e-06 [updatestate_assign_eliminate]: 2.49999e-06 [updatestate_loads_eliminate]: 2.28998e-06 [cse]: 2.009e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.573e-05 [tuple_transform]: 7.43e-05, [1] [Cycle 1]: 6.984e-05, [4] [d_1]: 4.363e-05 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 1.30007e-07 [switch_simplify]: 6.94999e-06 [partial_unused_args_eliminate]: 1.81e-06 [add_recomputation]: 6.18e-05 [cse_after_recomputation]: 2.381e-05, [1] [Cycle 1]: 1.947e-05, [1] [cse]: 1.391e-05 [environ_conv]: 8.55001e-06 [swap_dp_allreduce_reducescatter]: 4.79e-06 [bias_add_comm_swap]: 2.68e-06 [label_micro_interleaved_index]: 3.83001e-06 [label_fine_grained_interleaved_index]: 2.56998e-06 [merge_cast_opt]: 1.30001e-06 [slice_recompute_activation]: 1.84998e-06 [micro_interleaved_order_control]: 2.68e-06 [assign_add_opt]: 1.15001e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.13001e-06 [full_micro_interleaved_order_control]: 1.97999e-06 [reorder_send_recv_between_fp_bp]: 2.33998e-06 [comm_op_add_attrs]: 9.69972e-07 [add_comm_op_reuse_tag]: 8.80013e-07 [interleave_split_concat_branches]: 1.08001e-06 [interleave_parallel_branches]: 1.38002e-06 [overlap_opt_shard_in_pipeline]: 1.20001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.82999e-06 [control_data_broadcast_order]: 1.263e-05 [grouped_pairwise_exchange_alltoall]: 1.65001e-06 [offloading_packed_experts]: 3.95e-06 [overlap_recompute_and_grad_model_parallel]: 4.32e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.25999e-06 [overlap_recompute_comm]: 1.82999e-06 [overlap_grad_ring_attention]: 4.4e-06 [overlap_grad_flash_sp]: 1.636e-05 [begin_end_overlap_inline]: 4.59986e-07 [split_matmul_comm_elemetwise]: 2.22999e-06 [split_layernorm_comm]: 1.64e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 7.359e-05, [1] [Cycle 1]: 6.989e-05, [6] [build]: 2.76e-06 [elim_shapecalc]: 9.50001e-06 [elim_not_effective]: 1.216e-05 [opt_reshape]: 7.82e-06 [fold_const_symbol]: 1.017e-05 [renormalize]: 2.40019e-07 [detach_backward]: 1.71998e-06 [pipeline_parallel_scheduler]: 1.37e-06 [auto_monad_reorder]: 1.73e-05 [get_jit_bprop_graph]: 1.60999e-06 [rewriter_after_jit_bprop_graph]: 3.59002e-06 [opt_after_jit_grad]: 0.00047247 [validate]: 3.936e-05 [backend_pass]: 9.89996e-07 [task_emit]: 0.0155053 [execute]: 7.61001e-06 Sums bootstrap : 0.000770s : 0.40% type_inference : 0.165397s : 85.96% event_method : 0.001144s : 0.59% auto_monad : 0.000193s : 0.10% graph_reusing : 0.000012s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000057s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000017s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000071s : 0.04% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000481s : 0.25% optimize.opt_a.expand_dump_flag : 0.000008s : 0.00% optimize.opt_a.switch_simplify : 0.000209s : 0.11% optimize.opt_a.loop_unroll : 0.000094s : 0.05% optimize.opt_a.a_1 : 0.001869s : 0.97% optimize.opt_a.with_stream_mark : 0.000047s : 0.02% optimize.opt_a.recompute_prepare : 0.000017s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000159s : 0.08% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000012s : 0.01% optimize.opt_a.merge_send_recv : 0.000013s : 0.01% optimize.opt_a.auto_parallel : 0.000012s : 0.01% optimize.opt_a.parallel : 0.000022s : 0.01% optimize.opt_a.flash_sp : 0.000011s : 0.01% optimize.opt_a.merge_comm : 0.000006s : 0.00% optimize.opt_a.allreduce_fusion : 0.000006s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000013s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000015s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.01% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.003763s : 1.96% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.01% optimize.opt_a.cse : 0.000048s : 0.03% optimize.opt_a.a_3 : 0.000087s : 0.04% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000016s : 0.01% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000476s : 0.25% optimize.opt_b.b_1 : 0.000130s : 0.07% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000036s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.01% optimize.loop_unroll : 0.000456s : 0.24% optimize.opt_after_cconv.c_1 : 0.000033s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.01% optimize.tuple_transform.d_1 : 0.000044s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000062s : 0.03% optimize.cse_after_recomputation.cse : 0.000014s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000016s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000017s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000472s : 0.25% validate : 0.000039s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.015505s : 8.06% execute : 0.000008s : 0.00% Time group info: ------[substitution.] 0.000471 58 0.38% : 0.000002s : 2: substitution.elim_not_effective 0.67% : 0.000003s : 4: substitution.float_depend_g_call 0.42% : 0.000002s : 2: substitution.fold_const_symbol 1.24% : 0.000006s : 4: substitution.graph_param_transform 87.62% : 0.000413s : 23: substitution.inline 0.71% : 0.000003s : 4: substitution.j_node_and_user_rematch 0.94% : 0.000004s : 4: substitution.partial_eliminate 1.13% : 0.000005s : 4: substitution.remove_not_recompute_node 0.60% : 0.000003s : 2: substitution.replace_old_param 3.18% : 0.000015s : 6: substitution.switch_simplify 3.11% : 0.000015s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.165291 2 96.32% : 0.159204s : 1: type_inference.infer 3.68% : 0.006087s : 1: type_inference.specialize ------[replace.] 0.000230 32 61.72% : 0.000142s : 23: replace.inline 24.92% : 0.000057s : 6: replace.switch_simplify 13.36% : 0.000031s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000426 32 94.29% : 0.000402s : 23: match.inline 2.68% : 0.000011s : 6: match.switch_simplify 3.03% : 0.000013s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000363 2421 1.20% : 0.000004s : 32: predicate.accumulaten_eliminater 0.47% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.29% : 0.000001s : 8: predicate.addn_check_dump 1.14% : 0.000004s : 32: predicate.addn_zero_filter 1.14% : 0.000004s : 32: predicate.adjust_all_reduce_mul_add 2.15% : 0.000008s : 40: predicate.arithmetic_simplify 1.20% : 0.000004s : 32: predicate.cast_eliminate 0.31% : 0.000001s : 8: predicate.check_bprop_eliminate 0.29% : 0.000001s : 8: predicate.compare_switch_simplify 0.11% : 0.000000s : 4: predicate.const_output_eliminate 0.29% : 0.000001s : 8: predicate.depend_value_elim 1.22% : 0.000004s : 32: predicate.dict_get_item_const_eliminator 1.30% : 0.000005s : 32: predicate.dict_get_item_eliminator 1.11% : 0.000004s : 32: predicate.dict_set_item_eliminator 0.50% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.12% : 0.000000s : 4: predicate.elim_not_effective 0.18% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.34% : 0.000005s : 36: predicate.environ_add_const_eliminate 1.23% : 0.000004s : 36: predicate.environ_get_add_eliminate 1.29% : 0.000005s : 36: predicate.environ_get_depend_swap 1.70% : 0.000006s : 44: predicate.environ_get_eliminate 1.28% : 0.000005s : 36: predicate.environ_get_set_eliminate 2.22% : 0.000008s : 58: predicate.exchange_switch_depend_value 2.96% : 0.000011s : 58: predicate.float_depend_g_call 0.30% : 0.000001s : 8: predicate.float_environ_get_switch 0.44% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.11% : 0.000000s : 4: predicate.fold_const_symbol 0.34% : 0.000001s : 8: predicate.get_grad_eliminate 0.12% : 0.000000s : 4: predicate.graph_param_transform 0.31% : 0.000001s : 8: predicate.incorporate_call 0.26% : 0.000001s : 8: predicate.incorporate_call_switch 6.12% : 0.000022s : 114: predicate.inline 0.39% : 0.000001s : 8: predicate.inline_without_move 0.17% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.44% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000006s : 43: predicate.list_to_tuple_eliminator_ 2.69% : 0.000010s : 75: predicate.load_eliminater 0.52% : 0.000002s : 4: predicate.loop_unroll_after_grad 4.17% : 0.000015s : 104: predicate.loop_unroll_before_grad 1.46% : 0.000005s : 40: predicate.make_slice_get_slice_eliminator 0.32% : 0.000001s : 8: predicate.merge_addn 0.28% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.29% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.12% : 0.000004s : 32: predicate.minmaximum_grad 0.63% : 0.000002s : 4: predicate.mutable_eliminate 0.21% : 0.000001s : 4: predicate.opt_reshape 0.20% : 0.000001s : 4: predicate.parallel_virtual_node 3.13% : 0.000011s : 58: predicate.partial_defer_inline 1.57% : 0.000006s : 39: predicate.partial_eliminate 1.13% : 0.000004s : 32: predicate.print_const_string_wrapper 0.35% : 0.000001s : 8: predicate.reduce_all_const_elim 1.62% : 0.000006s : 32: predicate.reduce_eliminate 2.61% : 0.000009s : 75: predicate.redundant_stop_gradient_eliminater 0.24% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000005s : 43: predicate.replace_applicator 0.24% : 0.000001s : 8: predicate.replace_old_param 0.15% : 0.000001s : 4: predicate.reset_defer_inline 1.22% : 0.000004s : 32: predicate.reshape_eliminate 0.35% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.20% : 0.000001s : 4: predicate.row_tensor_eliminate 0.36% : 0.000001s : 8: predicate.same_eliminate 0.25% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.36% : 0.000001s : 8: predicate.shard_identity_eliminate 0.44% : 0.000002s : 8: predicate.special_op_eliminate 0.36% : 0.000001s : 8: predicate.specialize_transform 0.53% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.38% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.15% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.52% : 0.000009s : 58: predicate.switch_defer_inline 2.87% : 0.000010s : 66: predicate.switch_layer_defer_inline 8.20% : 0.000030s : 186: predicate.switch_simplify 1.27% : 0.000005s : 32: predicate.tile_eliminate 1.16% : 0.000004s : 32: predicate.transpose_eliminate 1.49% : 0.000005s : 40: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000006s : 40: predicate.tuple_list_get_item_const_eliminator 1.54% : 0.000006s : 40: predicate.tuple_list_get_item_depend_reorder 2.56% : 0.000009s : 51: predicate.tuple_list_get_item_eliminator 1.58% : 0.000006s : 40: predicate.tuple_list_get_set_item_eliminator 2.10% : 0.000008s : 48: predicate.tuple_list_set_item_eliminator 1.66% : 0.000006s : 43: predicate.tuple_to_list_eliminator_ 2.48% : 0.000009s : 75: predicate.updatestate_pure_node_eliminater 3.04% : 0.000011s : 83: predicate.updatestate_useless_node_eliminater 0.20% : 0.000001s : 4: predicate.value_based_eliminate 0.41% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.35% : 0.000001s : 8: predicate.virtual_output_eliminate 0.14% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.22% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005572 57 71.48% : 0.003983s : 31: func_graph_cloner_run.FuncGraphClonerGraph 28.52% : 0.001589s : 26: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.216302 196 0.00% : 0.000003s : 1: ForceFp32Comm 1.61% : 0.003477s : 1: add_attr 1.60% : 0.003466s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.03% : 0.000066s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.09% : 0.000203s : 1: auto_monad 0.01% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.37% : 0.000800s : 1: bootstrap 0.01% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.54% : 0.001157s : 1: event_method 0.01% : 0.000031s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000016s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.21% : 0.000465s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.22% : 0.000485s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 1.16% : 0.002515s : 78: opt.transform.opt_a 0.01% : 0.000032s : 1: opt.transform.opt_after_cconv 0.01% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000110s : 28: opt.transform.opt_b 0.02% : 0.000049s : 2: opt.transform.opt_trans_graph 0.02% : 0.000036s : 4: opt.transform.symbol_engine_opt 3.29% : 0.007114s : 1: opt_a 0.05% : 0.000108s : 1: opt_after_cconv 0.22% : 0.000482s : 1: opt_after_jit_grad 0.10% : 0.000211s : 1: opt_b 4.39% : 0.009502s : 1: optimize 0.02% : 0.000041s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000019s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000004s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000003s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.04% : 0.000076s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 1.06% : 0.002298s : 1: renormalize.infer 0.67% : 0.001456s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000019s : 1: rewriter_after_opt_a 0.23% : 0.000489s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000004s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000076s : 1: symbol_engine_optimizer 7.18% : 0.015529s : 1: task_emit 0.04% : 0.000077s : 1: tuple_transform 76.47% : 0.165415s : 1: type_inference 0.03% : 0.000067s : 1: validate TotalTime = 0.166763, [24] [bootstrap]: 0.00046163 [type_inference]: 0.148183 [event_method]: 0.00048017 [auto_monad]: 0.00010447 [graph_reusing]: 7.13e-06 [inline]: 2.24999e-06 [add_attr]: 0.00315868, [1] [add_attr_with_inline]: 0.00315072, [1] [Cycle 1]: 6.084e-05, [2] [tag_attr]: 2.612e-05 [meta_addattr_fg_expand]: 7.68999e-06 [parallel-infer-symbol]: 3.08e-06 [pre_auto_parallel]: 3.691e-05 [insert-virtual-dataset]: 2.34999e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 1.91e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.00660006, [53] [py_interpret_to_execute]: 4.82e-06 [rewriter_before_opt_a]: 0.00024071 [opt_a]: 0.00446754, [2] [Cycle 1]: 0.00382982, [45] [expand_dump_flag]: 3.70998e-06 [switch_simplify]: 5.048e-05 [loop_unroll]: 3.896e-05 [a_1]: 0.00070166 [with_stream_mark]: 1.485e-05 [recompute_prepare]: 8.74e-06 [updatestate_depend_eliminate]: 4.11001e-06 [updatestate_assign_eliminate]: 4.05998e-06 [updatestate_loads_eliminate]: 3.29001e-06 [parameter_eliminate]: 1.91e-06 [a_2]: 8.736e-05 [accelerated_algorithm]: 7.4e-06 [shard]: 1.72999e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 6.93e-06 [merge_send_recv]: 8.18001e-06 [auto_parallel]: 5.83002e-06 [parallel]: 1.83e-05 [flash_sp]: 7.12002e-06 [merge_comm]: 3.58e-06 [allreduce_fusion]: 3.73001e-06 [matmul_add_comm_reduction]: 9.29e-06 [allreduce_slice_to_reducescatter]: 6.29982e-07 [virtual_shard_identity]: 8.22e-06 [virtual_dataset]: 7.26001e-06 [get_grad_eliminate_]: 6.75998e-06 [virtual_output]: 6.70002e-06 [merge_forward]: 4.07e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 1.05e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.268e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.043e-05 [set_forward_comm_id_for_comm_node_pass]: 3.65e-06 [meta_fg_expand]: 3.21001e-06 [flash_sp_send_recv_attached]: 3.10998e-06 [receive_attached]: 2.59999e-06 [after_resolve]: 1.049e-05 [a_after_grad]: 1.037e-05 [renormalize]: 0.00238778 [add_forward_monad_depend]: 5.44998e-06 [auto_monad_grad]: 1.80001e-06 [auto_monad_eliminator]: 1.575e-05 [cse]: 3.5e-05 [a_3]: 5.052e-05 [Cycle 2]: 0.00062783, [45] [expand_dump_flag]: 1.02998e-06 [switch_simplify]: 8.18001e-06 [loop_unroll]: 6.58e-06 [a_1]: 0.00012617 [with_stream_mark]: 1.068e-05 [recompute_prepare]: 6.83e-06 [updatestate_depend_eliminate]: 3.33e-06 [updatestate_assign_eliminate]: 2.69999e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 9.10019e-07 [a_2]: 7.799e-05 [accelerated_algorithm]: 6.44999e-06 [shard]: 1.04e-06 [meta_shard_fg_expand]: 1.34e-06 [shard_inline]: 6.34001e-06 [merge_send_recv]: 4.44002e-06 [auto_parallel]: 5.30999e-06 [parallel]: 4.12e-06 [flash_sp]: 3.01001e-06 [merge_comm]: 3.24001e-06 [allreduce_fusion]: 3.10998e-06 [matmul_add_comm_reduction]: 5.49e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 7.11001e-06 [virtual_dataset]: 6.34001e-06 [get_grad_eliminate_]: 6.05002e-06 [virtual_output]: 5.87001e-06 [merge_forward]: 2.87002e-06 [cell_reuse_recompute_pass]: 1.27e-06 [offload_activation]: 6.28e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.235e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 9.07001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.41999e-06 [meta_fg_expand]: 2.16998e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 1.03001e-06 [after_resolve]: 9.05001e-06 [a_after_grad]: 9.98002e-06 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.07998e-06 [auto_monad_grad]: 7.89994e-07 [auto_monad_eliminator]: 6.28e-06 [cse]: 1.637e-05 [a_3]: 3.833e-05 [py_interpret_to_execute_after_opt_a]: 4.22998e-06 [slice_cell_reuse_recomputed_activation]: 1.92001e-06 [rewriter_after_opt_a]: 1.76e-05 [convert_after_rewriter]: 1.28002e-06 [order_py_execute_after_rewriter]: 1.41998e-06 [mutable_eliminate]: 0.00046841 [opt_b]: 0.00021197, [1] [Cycle 1]: 0.00020597, [7] [b_1]: 0.00013296 [b_2]: 8.17e-06 [updatestate_depend_eliminate]: 5.46002e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.49001e-06 [renormalize]: 3.69997e-07 [cse]: 2.121e-05 [optimize_parallel_all_gather_comm]: 1.584e-05 [overlap_param_gather]: 2.16e-06 [cconv]: 2.253e-05 [loop_unroll]: 0.0004321 [opt_after_cconv]: 0.00010173, [1] [Cycle 1]: 9.636e-05, [7] [c_1]: 3.006e-05 [parameter_eliminate]: 2.23002e-06 [updatestate_depend_eliminate]: 5.30999e-06 [updatestate_assign_eliminate]: 2.56e-06 [updatestate_loads_eliminate]: 2.46e-06 [cse]: 2.089e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.695e-05 [tuple_transform]: 7.258e-05, [1] [Cycle 1]: 6.848e-05, [4] [d_1]: 4.28e-05 [none_parameter_eliminate]: 1.59e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 7.18e-06 [partial_unused_args_eliminate]: 1.99e-06 [add_recomputation]: 4.436e-05 [cse_after_recomputation]: 5.185e-05, [1] [Cycle 1]: 1.876e-05, [1] [cse]: 1.363e-05 [environ_conv]: 7.63001e-06 [swap_dp_allreduce_reducescatter]: 5.68002e-06 [bias_add_comm_swap]: 2.41e-06 [label_micro_interleaved_index]: 3.97e-06 [label_fine_grained_interleaved_index]: 2.69999e-06 [merge_cast_opt]: 1.37e-06 [slice_recompute_activation]: 2.07999e-06 [micro_interleaved_order_control]: 2.58e-06 [assign_add_opt]: 1.50999e-06 [ForceFp32Comm]: 7.59988e-07 [remove_cast_before_assign_add]: 1.17999e-06 [full_micro_interleaved_order_control]: 2.11e-06 [reorder_send_recv_between_fp_bp]: 2.68998e-06 [comm_op_add_attrs]: 1.16997e-06 [add_comm_op_reuse_tag]: 1.17e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.42e-06 [overlap_opt_shard_in_pipeline]: 1.42e-06 [overlap_opt_shard_grad_in_pipeline]: 1.81998e-06 [control_data_broadcast_order]: 1.344e-05 [grouped_pairwise_exchange_alltoall]: 1.69e-06 [offloading_packed_experts]: 3.59002e-06 [overlap_recompute_and_grad_model_parallel]: 4.48001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.46002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.324e-05 [overlap_grad_ring_attention]: 4.45e-06 [overlap_grad_flash_sp]: 1.814e-05 [begin_end_overlap_inline]: 5.60016e-07 [split_matmul_comm_elemetwise]: 1.94e-06 [split_layernorm_comm]: 1.69e-06 [handle_group_info]: 1.00999e-06 [symbol_engine_optimizer]: 7.743e-05, [1] [Cycle 1]: 7.234e-05, [6] [build]: 2.50002e-06 [elim_shapecalc]: 1.138e-05 [elim_not_effective]: 1.323e-05 [opt_reshape]: 7.16999e-06 [fold_const_symbol]: 1.048e-05 [renormalize]: 2.79979e-07 [detach_backward]: 1.71e-06 [pipeline_parallel_scheduler]: 1.69e-06 [auto_monad_reorder]: 1.87e-05 [get_jit_bprop_graph]: 1.14003e-06 [rewriter_after_jit_bprop_graph]: 3.67002e-06 [opt_after_jit_grad]: 0.00046895 [validate]: 4.043e-05 [backend_pass]: 9.70002e-07 [task_emit]: 0.00696825 [execute]: 7.08e-06 Sums bootstrap : 0.000462s : 0.28% type_inference : 0.148183s : 91.13% event_method : 0.000480s : 0.30% auto_monad : 0.000104s : 0.06% graph_reusing : 0.000007s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000008s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000037s : 0.02% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000241s : 0.15% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000059s : 0.04% optimize.opt_a.loop_unroll : 0.000046s : 0.03% optimize.opt_a.a_1 : 0.000828s : 0.51% optimize.opt_a.with_stream_mark : 0.000026s : 0.02% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000165s : 0.10% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000013s : 0.01% optimize.opt_a.auto_parallel : 0.000011s : 0.01% optimize.opt_a.parallel : 0.000022s : 0.01% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.01% optimize.opt_a.virtual_dataset : 0.000014s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000017s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000020s : 0.01% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.002388s : 1.47% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.01% optimize.opt_a.cse : 0.000051s : 0.03% optimize.opt_a.a_3 : 0.000089s : 0.05% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000018s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000468s : 0.29% optimize.opt_b.b_1 : 0.000133s : 0.08% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.01% optimize.loop_unroll : 0.000432s : 0.27% optimize.opt_after_cconv.c_1 : 0.000030s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000043s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000044s : 0.03% optimize.cse_after_recomputation.cse : 0.000014s : 0.01% optimize.environ_conv : 0.000008s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000002s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000023s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000018s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000469s : 0.29% validate : 0.000040s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.006968s : 4.29% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.000187 27 0.98% : 0.000002s : 2: substitution.elim_not_effective 0.85% : 0.000002s : 2: substitution.fold_const_symbol 2.82% : 0.000005s : 4: substitution.graph_param_transform 83.01% : 0.000155s : 6: substitution.inline 1.63% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.45% : 0.000005s : 4: substitution.remove_not_recompute_node 1.49% : 0.000003s : 2: substitution.replace_old_param 6.76% : 0.000013s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.148117 2 98.12% : 0.145331s : 1: type_inference.infer 1.88% : 0.002787s : 1: type_inference.specialize ------[replace.] 0.000081 9 70.52% : 0.000057s : 6: replace.inline 29.48% : 0.000024s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000163 9 93.23% : 0.000152s : 6: match.inline 6.77% : 0.000011s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000207 1388 1.02% : 0.000002s : 15: predicate.accumulaten_eliminater 0.79% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 15: predicate.addn_zero_filter 0.88% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.44% : 0.000005s : 23: predicate.arithmetic_simplify 1.01% : 0.000002s : 15: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.59% : 0.000001s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.10% : 0.000002s : 15: predicate.dict_get_item_eliminator 1.05% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.88% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000002s : 19: predicate.environ_add_const_eliminate 1.21% : 0.000003s : 19: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 19: predicate.environ_get_depend_swap 1.77% : 0.000004s : 27: predicate.environ_get_eliminate 1.13% : 0.000002s : 19: predicate.environ_get_set_eliminate 1.48% : 0.000003s : 24: predicate.exchange_switch_depend_value 2.22% : 0.000005s : 24: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.22% : 0.000000s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.47% : 0.000001s : 8: predicate.incorporate_call_switch 5.81% : 0.000012s : 63: predicate.inline 0.67% : 0.000001s : 8: predicate.inline_without_move 0.30% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 8: predicate.less_batch_normalization 1.71% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.50% : 0.000005s : 41: predicate.load_eliminater 0.80% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.80% : 0.000006s : 44: predicate.loop_unroll_before_grad 1.62% : 0.000003s : 23: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.51% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 15: predicate.minmaximum_grad 0.96% : 0.000002s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.32% : 0.000001s : 4: predicate.parallel_virtual_node 2.06% : 0.000004s : 24: predicate.partial_defer_inline 1.49% : 0.000003s : 22: predicate.partial_eliminate 0.98% : 0.000002s : 15: predicate.print_const_string_wrapper 0.54% : 0.000001s : 8: predicate.reduce_all_const_elim 1.33% : 0.000003s : 15: predicate.reduce_eliminate 2.52% : 0.000005s : 41: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 8: predicate.remove_not_recompute_node 1.43% : 0.000003s : 26: predicate.replace_applicator 0.40% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 1.06% : 0.000002s : 15: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.66% : 0.000001s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.69% : 0.000001s : 8: predicate.shard_identity_eliminate 0.65% : 0.000001s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 0.87% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.70% : 0.000004s : 24: predicate.switch_defer_inline 2.22% : 0.000005s : 32: predicate.switch_layer_defer_inline 5.75% : 0.000012s : 80: predicate.switch_simplify 1.15% : 0.000002s : 15: predicate.tile_eliminate 0.92% : 0.000002s : 15: predicate.transpose_eliminate 1.63% : 0.000003s : 23: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000003s : 23: predicate.tuple_list_get_item_const_eliminator 1.39% : 0.000003s : 23: predicate.tuple_list_get_item_depend_reorder 3.13% : 0.000006s : 34: predicate.tuple_list_get_item_eliminator 1.58% : 0.000003s : 23: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000005s : 31: predicate.tuple_list_set_item_eliminator 1.75% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.43% : 0.000005s : 41: predicate.updatestate_pure_node_eliminater 3.10% : 0.000006s : 49: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.68% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.43% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002370 25 62.68% : 0.001485s : 17: func_graph_cloner_run.FuncGraphClonerGraph 37.32% : 0.000884s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.180389 196 0.00% : 0.000004s : 1: ForceFp32Comm 1.75% : 0.003163s : 1: add_attr 1.75% : 0.003154s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000049s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.06% : 0.000113s : 1: auto_monad 0.01% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.27% : 0.000492s : 1: bootstrap 0.01% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000056s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.27% : 0.000493s : 1: event_method 0.01% : 0.000012s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.24% : 0.000440s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.26% : 0.000477s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000015s : 1: opt.transform.mutable_eliminate 0.71% : 0.001286s : 78: opt.transform.opt_a 0.02% : 0.000029s : 1: opt.transform.opt_after_cconv 0.01% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000113s : 28: opt.transform.opt_b 0.03% : 0.000048s : 2: opt.transform.opt_trans_graph 0.02% : 0.000039s : 4: opt.transform.symbol_engine_opt 2.48% : 0.004471s : 1: opt_a 0.06% : 0.000105s : 1: opt_after_cconv 0.27% : 0.000478s : 1: opt_after_jit_grad 0.12% : 0.000216s : 1: opt_b 3.66% : 0.006604s : 1: optimize 0.01% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000021s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000027s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.02% : 0.000041s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.83% : 0.001490s : 1: renormalize.infer 0.49% : 0.000889s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000021s : 1: rewriter_after_opt_a 0.14% : 0.000247s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000080s : 1: symbol_engine_optimizer 3.87% : 0.006978s : 1: task_emit 0.04% : 0.000076s : 1: tuple_transform 82.15% : 0.148199s : 1: type_inference 0.04% : 0.000067s : 1: validate TotalTime = 0.0958668, [24] [bootstrap]: 0.00047721 [type_inference]: 0.0779801 [event_method]: 0.00029778 [auto_monad]: 7.959e-05 [graph_reusing]: 7.06999e-06 [inline]: 2.63e-06 [add_attr]: 0.00307786, [1] [add_attr_with_inline]: 0.00306958, [1] [Cycle 1]: 5.922e-05, [2] [tag_attr]: 2.629e-05 [meta_addattr_fg_expand]: 7.08e-06 [parallel-infer-symbol]: 3.19001e-06 [pre_auto_parallel]: 3.761e-05 [insert-virtual-dataset]: 2.50002e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.00624512, [53] [py_interpret_to_execute]: 4.50001e-06 [rewriter_before_opt_a]: 0.00028836 [opt_a]: 0.00410985, [2] [Cycle 1]: 0.00346529, [45] [expand_dump_flag]: 3.45e-06 [switch_simplify]: 5.573e-05 [loop_unroll]: 4.204e-05 [a_1]: 0.00069727 [with_stream_mark]: 1.478e-05 [recompute_prepare]: 8.63001e-06 [updatestate_depend_eliminate]: 3.81001e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 3.2e-06 [parameter_eliminate]: 1.82001e-06 [a_2]: 8.636e-05 [accelerated_algorithm]: 7.03e-06 [shard]: 1.76998e-06 [meta_shard_fg_expand]: 1.86e-06 [shard_inline]: 6.81999e-06 [merge_send_recv]: 8.43001e-06 [auto_parallel]: 5.66998e-06 [parallel]: 1.726e-05 [flash_sp]: 7.35e-06 [merge_comm]: 3.58e-06 [allreduce_fusion]: 3.50998e-06 [matmul_add_comm_reduction]: 9.05001e-06 [allreduce_slice_to_reducescatter]: 1.01002e-06 [virtual_shard_identity]: 8.16002e-06 [virtual_dataset]: 6.98e-06 [get_grad_eliminate_]: 6.76999e-06 [virtual_output]: 6.68998e-06 [merge_forward]: 4.18999e-06 [cell_reuse_recompute_pass]: 1.15999e-06 [offload_activation]: 9.56e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.297e-05 [merge_recompute_call_nodes]: 1.47999e-06 [before_grad]: 1.016e-05 [set_forward_comm_id_for_comm_node_pass]: 3.31999e-06 [meta_fg_expand]: 3.09999e-06 [flash_sp_send_recv_attached]: 2.44999e-06 [receive_attached]: 2.03997e-06 [after_resolve]: 1.065e-05 [a_after_grad]: 9.92001e-06 [renormalize]: 0.00201006 [add_forward_monad_depend]: 5.73002e-06 [auto_monad_grad]: 1.89999e-06 [auto_monad_eliminator]: 1.591e-05 [cse]: 3.542e-05 [a_3]: 6.931e-05 [Cycle 2]: 0.00063506, [45] [expand_dump_flag]: 1.08001e-06 [switch_simplify]: 7.98999e-06 [loop_unroll]: 6.52001e-06 [a_1]: 0.00012793 [with_stream_mark]: 1.154e-05 [recompute_prepare]: 6.78e-06 [updatestate_depend_eliminate]: 3.06001e-06 [updatestate_assign_eliminate]: 2.69999e-06 [updatestate_loads_eliminate]: 2.91e-06 [parameter_eliminate]: 9.70002e-07 [a_2]: 7.836e-05 [accelerated_algorithm]: 6.58e-06 [shard]: 1.05999e-06 [meta_shard_fg_expand]: 1.42e-06 [shard_inline]: 6.59999e-06 [merge_send_recv]: 4.53999e-06 [auto_parallel]: 5.30999e-06 [parallel]: 4.21001e-06 [flash_sp]: 2.86999e-06 [merge_comm]: 3.18998e-06 [allreduce_fusion]: 2.88e-06 [matmul_add_comm_reduction]: 5.11002e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 7.51999e-06 [virtual_dataset]: 6.83998e-06 [get_grad_eliminate_]: 6.41998e-06 [virtual_output]: 6.33002e-06 [merge_forward]: 2.88003e-06 [cell_reuse_recompute_pass]: 1.52001e-06 [offload_activation]: 6.43998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.296e-05 [merge_recompute_call_nodes]: 6.80011e-07 [before_grad]: 9.39998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.26999e-06 [meta_fg_expand]: 2.11003e-06 [flash_sp_send_recv_attached]: 7.59988e-07 [receive_attached]: 9.50007e-07 [after_resolve]: 9.32001e-06 [a_after_grad]: 9.45001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.20001e-06 [auto_monad_grad]: 8.79983e-07 [auto_monad_eliminator]: 6.65002e-06 [cse]: 1.758e-05 [a_3]: 3.956e-05 [py_interpret_to_execute_after_opt_a]: 4.03999e-06 [slice_cell_reuse_recomputed_activation]: 2.38002e-06 [rewriter_after_opt_a]: 1.8e-05 [convert_after_rewriter]: 1.54e-06 [order_py_execute_after_rewriter]: 1.14e-06 [mutable_eliminate]: 0.00045844 [opt_b]: 0.00021145, [1] [Cycle 1]: 0.00020585, [7] [b_1]: 0.00013298 [b_2]: 8.55001e-06 [updatestate_depend_eliminate]: 5.41998e-06 [updatestate_assign_eliminate]: 2.59999e-06 [updatestate_loads_eliminate]: 2.52001e-06 [renormalize]: 3.69997e-07 [cse]: 2.037e-05 [optimize_parallel_all_gather_comm]: 1.585e-05 [overlap_param_gather]: 2.20002e-06 [cconv]: 2.283e-05 [loop_unroll]: 0.0004216 [opt_after_cconv]: 0.00010317, [1] [Cycle 1]: 9.772e-05, [7] [c_1]: 3.168e-05 [parameter_eliminate]: 2.30002e-06 [updatestate_depend_eliminate]: 5.30999e-06 [updatestate_assign_eliminate]: 2.64001e-06 [updatestate_loads_eliminate]: 2.42001e-06 [cse]: 2.096e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 1.752e-05 [tuple_transform]: 7.314e-05, [1] [Cycle 1]: 6.901e-05, [4] [d_1]: 4.255e-05 [none_parameter_eliminate]: 1.86e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 7.21999e-06 [partial_unused_args_eliminate]: 1.69e-06 [add_recomputation]: 4.445e-05 [cse_after_recomputation]: 2.302e-05, [1] [Cycle 1]: 1.89e-05, [1] [cse]: 1.379e-05 [environ_conv]: 7.7e-06 [swap_dp_allreduce_reducescatter]: 5.49e-06 [bias_add_comm_swap]: 2.57001e-06 [label_micro_interleaved_index]: 4.12e-06 [label_fine_grained_interleaved_index]: 2.94999e-06 [merge_cast_opt]: 1.49e-06 [slice_recompute_activation]: 2.39001e-06 [micro_interleaved_order_control]: 2.43e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.09998e-06 [full_micro_interleaved_order_control]: 2.24001e-06 [reorder_send_recv_between_fp_bp]: 2.64001e-06 [comm_op_add_attrs]: 1.30999e-06 [add_comm_op_reuse_tag]: 1.23002e-06 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.25001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.71e-06 [control_data_broadcast_order]: 1.33e-05 [grouped_pairwise_exchange_alltoall]: 1.58002e-06 [offloading_packed_experts]: 3.89002e-06 [overlap_recompute_and_grad_model_parallel]: 4.89998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.18001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.38002e-06 [overlap_recompute_comm]: 2.36e-06 [overlap_grad_ring_attention]: 4.27e-06 [overlap_grad_flash_sp]: 1.762e-05 [begin_end_overlap_inline]: 5.3001e-07 [split_matmul_comm_elemetwise]: 2.40002e-06 [split_layernorm_comm]: 1.64e-06 [handle_group_info]: 1.11997e-06 [symbol_engine_optimizer]: 7.58e-05, [1] [Cycle 1]: 7.065e-05, [6] [build]: 2.69001e-06 [elim_shapecalc]: 1.017e-05 [elim_not_effective]: 1.334e-05 [opt_reshape]: 7.45e-06 [fold_const_symbol]: 1.027e-05 [renormalize]: 1.90019e-07 [detach_backward]: 1.64998e-06 [pipeline_parallel_scheduler]: 1.47001e-06 [auto_monad_reorder]: 1.888e-05 [get_jit_bprop_graph]: 1.10001e-06 [rewriter_after_jit_bprop_graph]: 3.33e-06 [opt_after_jit_grad]: 0.00045871 [validate]: 4.182e-05 [backend_pass]: 9.70002e-07 [task_emit]: 0.00689452 [execute]: 6.16e-06 Sums bootstrap : 0.000477s : 0.52% type_inference : 0.077980s : 84.96% event_method : 0.000298s : 0.32% auto_monad : 0.000080s : 0.09% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000026s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000038s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000288s : 0.31% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000064s : 0.07% optimize.opt_a.loop_unroll : 0.000049s : 0.05% optimize.opt_a.a_1 : 0.000825s : 0.90% optimize.opt_a.with_stream_mark : 0.000026s : 0.03% optimize.opt_a.recompute_prepare : 0.000015s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000165s : 0.18% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000013s : 0.01% optimize.opt_a.auto_parallel : 0.000011s : 0.01% optimize.opt_a.parallel : 0.000021s : 0.02% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000006s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000016s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000020s : 0.02% optimize.opt_a.a_after_grad : 0.000019s : 0.02% optimize.opt_a.renormalize : 0.002010s : 2.19% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000023s : 0.02% optimize.opt_a.cse : 0.000053s : 0.06% optimize.opt_a.a_3 : 0.000109s : 0.12% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000018s : 0.02% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000458s : 0.50% optimize.opt_b.b_1 : 0.000133s : 0.14% optimize.opt_b.b_2 : 0.000009s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.02% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.02% optimize.loop_unroll : 0.000422s : 0.46% optimize.opt_after_cconv.c_1 : 0.000032s : 0.03% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.02% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.02% optimize.tuple_transform.d_1 : 0.000043s : 0.05% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000044s : 0.05% optimize.cse_after_recomputation.cse : 0.000014s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000018s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000019s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000459s : 0.50% validate : 0.000042s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.006895s : 7.51% execute : 0.000006s : 0.01% Time group info: ------[substitution.] 0.000182 27 1.05% : 0.000002s : 2: substitution.elim_not_effective 0.72% : 0.000001s : 2: substitution.fold_const_symbol 3.06% : 0.000006s : 4: substitution.graph_param_transform 82.05% : 0.000149s : 6: substitution.inline 1.72% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.64% : 0.000005s : 4: substitution.remove_not_recompute_node 1.87% : 0.000003s : 2: substitution.replace_old_param 6.89% : 0.000013s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.077905 2 97.26% : 0.075770s : 1: type_inference.infer 2.74% : 0.002135s : 1: type_inference.specialize ------[replace.] 0.000080 9 69.74% : 0.000056s : 6: replace.inline 30.26% : 0.000024s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000156 9 93.10% : 0.000146s : 6: match.inline 6.90% : 0.000011s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000209 1396 0.93% : 0.000002s : 15: predicate.accumulaten_eliminater 0.82% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.52% : 0.000001s : 8: predicate.addn_check_dump 1.00% : 0.000002s : 15: predicate.addn_zero_filter 0.84% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.24% : 0.000005s : 23: predicate.arithmetic_simplify 0.95% : 0.000002s : 15: predicate.cast_eliminate 0.61% : 0.000001s : 8: predicate.check_bprop_eliminate 0.50% : 0.000001s : 8: predicate.compare_switch_simplify 0.18% : 0.000000s : 4: predicate.const_output_eliminate 0.49% : 0.000001s : 8: predicate.depend_value_elim 1.00% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.07% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.92% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.23% : 0.000000s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.10% : 0.000002s : 19: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 19: predicate.environ_get_add_eliminate 1.17% : 0.000002s : 19: predicate.environ_get_depend_swap 1.71% : 0.000004s : 27: predicate.environ_get_eliminate 1.09% : 0.000002s : 19: predicate.environ_get_set_eliminate 1.56% : 0.000003s : 24: predicate.exchange_switch_depend_value 2.51% : 0.000005s : 24: predicate.float_depend_g_call 0.51% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.60% : 0.000001s : 8: predicate.get_grad_eliminate 0.21% : 0.000000s : 4: predicate.graph_param_transform 0.53% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 5.83% : 0.000012s : 63: predicate.inline 0.60% : 0.000001s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.74% : 0.000002s : 8: predicate.less_batch_normalization 1.74% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.49% : 0.000005s : 41: predicate.load_eliminater 0.82% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.98% : 0.000006s : 48: predicate.loop_unroll_before_grad 1.60% : 0.000003s : 23: predicate.make_slice_get_slice_eliminator 0.56% : 0.000001s : 8: predicate.merge_addn 0.50% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.54% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.85% : 0.000002s : 15: predicate.minmaximum_grad 0.96% : 0.000002s : 4: predicate.mutable_eliminate 0.34% : 0.000001s : 4: predicate.opt_reshape 0.41% : 0.000001s : 4: predicate.parallel_virtual_node 2.02% : 0.000004s : 24: predicate.partial_defer_inline 1.43% : 0.000003s : 22: predicate.partial_eliminate 0.96% : 0.000002s : 15: predicate.print_const_string_wrapper 0.55% : 0.000001s : 8: predicate.reduce_all_const_elim 1.23% : 0.000003s : 15: predicate.reduce_eliminate 2.63% : 0.000005s : 41: predicate.redundant_stop_gradient_eliminater 0.41% : 0.000001s : 8: predicate.remove_not_recompute_node 1.40% : 0.000003s : 26: predicate.replace_applicator 0.50% : 0.000001s : 8: predicate.replace_old_param 0.28% : 0.000001s : 4: predicate.reset_defer_inline 1.03% : 0.000002s : 15: predicate.reshape_eliminate 0.60% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.38% : 0.000001s : 4: predicate.row_tensor_eliminate 0.70% : 0.000001s : 8: predicate.same_eliminate 0.42% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.80% : 0.000002s : 8: predicate.shard_identity_eliminate 0.62% : 0.000001s : 8: predicate.special_op_eliminate 0.61% : 0.000001s : 8: predicate.specialize_transform 0.76% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.67% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.29% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.72% : 0.000004s : 24: predicate.switch_defer_inline 2.28% : 0.000005s : 32: predicate.switch_layer_defer_inline 6.04% : 0.000013s : 84: predicate.switch_simplify 1.10% : 0.000002s : 15: predicate.tile_eliminate 0.91% : 0.000002s : 15: predicate.transpose_eliminate 1.57% : 0.000003s : 23: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000004s : 23: predicate.tuple_list_get_item_const_eliminator 1.56% : 0.000003s : 23: predicate.tuple_list_get_item_depend_reorder 3.10% : 0.000006s : 34: predicate.tuple_list_get_item_eliminator 1.49% : 0.000003s : 23: predicate.tuple_list_get_set_item_eliminator 2.30% : 0.000005s : 31: predicate.tuple_list_set_item_eliminator 1.90% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.38% : 0.000005s : 41: predicate.updatestate_pure_node_eliminater 3.15% : 0.000007s : 49: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.59% : 0.000001s : 8: predicate.virtual_output_eliminate 0.23% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.37% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002711 33 70.87% : 0.001921s : 25: func_graph_cloner_run.FuncGraphClonerGraph 29.13% : 0.000790s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.108689 196 0.00% : 0.000003s : 1: ForceFp32Comm 2.84% : 0.003082s : 1: add_attr 2.83% : 0.003073s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.04% : 0.000048s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.08% : 0.000087s : 1: auto_monad 0.02% : 0.000023s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.03% : 0.000033s : 1: bias_add_comm_swap 0.47% : 0.000507s : 1: bootstrap 0.02% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.29% : 0.000310s : 1: event_method 0.01% : 0.000011s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000005s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.40% : 0.000429s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.43% : 0.000466s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000015s : 1: opt.transform.mutable_eliminate 1.21% : 0.001310s : 78: opt.transform.opt_a 0.03% : 0.000030s : 1: opt.transform.opt_after_cconv 0.02% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000114s : 28: opt.transform.opt_b 0.04% : 0.000048s : 2: opt.transform.opt_trans_graph 0.03% : 0.000038s : 4: opt.transform.symbol_engine_opt 3.78% : 0.004113s : 1: opt_a 0.10% : 0.000107s : 1: opt_after_cconv 0.43% : 0.000468s : 1: opt_after_jit_grad 0.20% : 0.000215s : 1: opt_b 5.75% : 0.006249s : 1: optimize 0.02% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000021s : 1: overlap_grad_flash_sp 0.00% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000042s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000021s : 1: remove_dup_value 1.08% : 0.001170s : 1: renormalize.infer 0.77% : 0.000832s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000021s : 1: rewriter_after_opt_a 0.27% : 0.000294s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000078s : 1: symbol_engine_optimizer 6.35% : 0.006905s : 1: task_emit 0.07% : 0.000076s : 1: tuple_transform 71.76% : 0.077996s : 1: type_inference 0.06% : 0.000069s : 1: validate TotalTime = 0.157024, [24] [bootstrap]: 0.00056241 [type_inference]: 0.135419 [event_method]: 0.00094871 [auto_monad]: 0.00020643 [graph_reusing]: 1.272e-05 [inline]: 2.66999e-06 [add_attr]: 0.00345145, [1] [add_attr_with_inline]: 0.00344301, [1] [Cycle 1]: 0.00010469, [2] [tag_attr]: 5.639e-05 [meta_addattr_fg_expand]: 1.653e-05 [parallel-infer-symbol]: 3.03e-06 [pre_auto_parallel]: 7.23e-05 [insert-virtual-dataset]: 2.81e-06 [parallel-infer-symbol-second]: 7.90023e-07 [dataset_repeat_opt]: 1.63002e-06 [pipeline_split]: 1.54998e-06 [optimize]: 0.00846441, [53] [py_interpret_to_execute]: 4.4e-06 [rewriter_before_opt_a]: 0.00051007 [opt_a]: 0.0060921, [2] [Cycle 1]: 0.00545315, [45] [expand_dump_flag]: 6.78e-06 [switch_simplify]: 0.00020377 [loop_unroll]: 8.673e-05 [a_1]: 0.00169459 [with_stream_mark]: 1.443e-05 [recompute_prepare]: 9.17999e-06 [updatestate_depend_eliminate]: 4.24997e-06 [updatestate_assign_eliminate]: 3.56001e-06 [updatestate_loads_eliminate]: 3.26001e-06 [parameter_eliminate]: 1.78002e-06 [a_2]: 8.649e-05 [accelerated_algorithm]: 7.35e-06 [shard]: 1.79e-06 [meta_shard_fg_expand]: 3.68e-06 [shard_inline]: 7.70998e-06 [merge_send_recv]: 8.05e-06 [auto_parallel]: 6.46e-06 [parallel]: 1.744e-05 [flash_sp]: 7.51999e-06 [merge_comm]: 3.81999e-06 [allreduce_fusion]: 3.86999e-06 [matmul_add_comm_reduction]: 8.91002e-06 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 8.23999e-06 [virtual_dataset]: 7.13e-06 [get_grad_eliminate_]: 6.83e-06 [virtual_output]: 6.54999e-06 [merge_forward]: 3.71999e-06 [cell_reuse_recompute_pass]: 1.09e-06 [offload_activation]: 9.53997e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.282e-05 [merge_recompute_call_nodes]: 1.49e-06 [before_grad]: 9.65002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.41999e-06 [meta_fg_expand]: 5.27999e-06 [flash_sp_send_recv_attached]: 2.43e-06 [receive_attached]: 2.49001e-06 [after_resolve]: 1.021e-05 [a_after_grad]: 1.039e-05 [renormalize]: 0.00279808 [add_forward_monad_depend]: 4.89e-06 [auto_monad_grad]: 2.43e-06 [auto_monad_eliminator]: 1.579e-05 [cse]: 3.363e-05 [a_3]: 4.906e-05 [Cycle 2]: 0.00062876, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 8.3e-06 [loop_unroll]: 7.01001e-06 [a_1]: 0.00013189 [with_stream_mark]: 1.122e-05 [recompute_prepare]: 6.52001e-06 [updatestate_depend_eliminate]: 2.88e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.19001e-06 [parameter_eliminate]: 1.03001e-06 [a_2]: 7.372e-05 [accelerated_algorithm]: 6.40002e-06 [shard]: 1.09e-06 [meta_shard_fg_expand]: 1.45999e-06 [shard_inline]: 5.77999e-06 [merge_send_recv]: 4.34002e-06 [auto_parallel]: 5.40001e-06 [parallel]: 4.25999e-06 [flash_sp]: 3.12002e-06 [merge_comm]: 2.91e-06 [allreduce_fusion]: 2.79001e-06 [matmul_add_comm_reduction]: 5.01002e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 7.40998e-06 [virtual_dataset]: 6.23e-06 [get_grad_eliminate_]: 5.82999e-06 [virtual_output]: 5.84999e-06 [merge_forward]: 2.71e-06 [cell_reuse_recompute_pass]: 1.29998e-06 [offload_activation]: 5.92999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.359e-05 [merge_recompute_call_nodes]: 7.50006e-07 [before_grad]: 8.94e-06 [set_forward_comm_id_for_comm_node_pass]: 3.13998e-06 [meta_fg_expand]: 2.04e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 1.03001e-06 [after_resolve]: 8.81002e-06 [a_after_grad]: 9.25999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.05999e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 6.37001e-06 [cse]: 1.554e-05 [a_3]: 3.709e-05 [py_interpret_to_execute_after_opt_a]: 4.30999e-06 [slice_cell_reuse_recomputed_activation]: 2.14999e-06 [rewriter_after_opt_a]: 1.599e-05 [convert_after_rewriter]: 1.71e-06 [order_py_execute_after_rewriter]: 1.10001e-06 [mutable_eliminate]: 0.00047892 [opt_b]: 0.00020797, [1] [Cycle 1]: 0.00020172, [7] [b_1]: 0.00012931 [b_2]: 8.25999e-06 [updatestate_depend_eliminate]: 5.44998e-06 [updatestate_assign_eliminate]: 2.64001e-06 [updatestate_loads_eliminate]: 2.24001e-06 [renormalize]: 4.09986e-07 [cse]: 2.028e-05 [optimize_parallel_all_gather_comm]: 1.535e-05 [overlap_param_gather]: 2.16e-06 [cconv]: 2.228e-05 [loop_unroll]: 0.00043961 [opt_after_cconv]: 0.00010286, [1] [Cycle 1]: 9.768e-05, [7] [c_1]: 3.228e-05 [parameter_eliminate]: 2.51e-06 [updatestate_depend_eliminate]: 5.13002e-06 [updatestate_assign_eliminate]: 2.31e-06 [updatestate_loads_eliminate]: 2.31998e-06 [cse]: 1.991e-05 [renormalize]: 3.7998e-07 [remove_dup_value]: 1.711e-05 [tuple_transform]: 7.428e-05, [1] [Cycle 1]: 6.981e-05, [4] [d_1]: 4.315e-05 [none_parameter_eliminate]: 1.50001e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 7.05998e-06 [partial_unused_args_eliminate]: 2.04999e-06 [add_recomputation]: 4.268e-05 [cse_after_recomputation]: 2.302e-05, [1] [Cycle 1]: 1.893e-05, [1] [cse]: 1.358e-05 [environ_conv]: 7.47002e-06 [swap_dp_allreduce_reducescatter]: 5.00999e-06 [bias_add_comm_swap]: 2.39999e-06 [label_micro_interleaved_index]: 4.2e-06 [label_fine_grained_interleaved_index]: 2.73e-06 [merge_cast_opt]: 1.27e-06 [slice_recompute_activation]: 1.94e-06 [micro_interleaved_order_control]: 2.27999e-06 [assign_add_opt]: 1.19003e-06 [ForceFp32Comm]: 9.09989e-07 [remove_cast_before_assign_add]: 9.70002e-07 [full_micro_interleaved_order_control]: 2.43e-06 [reorder_send_recv_between_fp_bp]: 2.49001e-06 [comm_op_add_attrs]: 9.99979e-07 [add_comm_op_reuse_tag]: 1.14e-06 [interleave_split_concat_branches]: 1.32e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.11997e-06 [overlap_opt_shard_grad_in_pipeline]: 1.87999e-06 [control_data_broadcast_order]: 1.121e-05 [grouped_pairwise_exchange_alltoall]: 1.69e-06 [offloading_packed_experts]: 4.22998e-06 [overlap_recompute_and_grad_model_parallel]: 4.40999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.14e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.27999e-06 [overlap_grad_ring_attention]: 3.75998e-06 [overlap_grad_flash_sp]: 1.706e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.29001e-06 [split_layernorm_comm]: 1.81998e-06 [handle_group_info]: 9.70002e-07 [symbol_engine_optimizer]: 8.741e-05, [1] [Cycle 1]: 7.255e-05, [6] [build]: 3.53e-06 [elim_shapecalc]: 9.99001e-06 [elim_not_effective]: 1.297e-05 [opt_reshape]: 7.56999e-06 [fold_const_symbol]: 9.34e-06 [renormalize]: 1.80007e-07 [detach_backward]: 1.86e-06 [pipeline_parallel_scheduler]: 1.97999e-06 [auto_monad_reorder]: 1.793e-05 [get_jit_bprop_graph]: 9.89996e-07 [rewriter_after_jit_bprop_graph]: 3.51999e-06 [opt_after_jit_grad]: 0.00047518 [validate]: 6.523e-05 [backend_pass]: 8.50006e-07 [task_emit]: 0.00708105 [execute]: 6.69999e-06 Sums bootstrap : 0.000562s : 0.37% type_inference : 0.135419s : 88.75% event_method : 0.000949s : 0.62% auto_monad : 0.000206s : 0.14% graph_reusing : 0.000013s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000056s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000017s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000072s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000510s : 0.33% optimize.opt_a.expand_dump_flag : 0.000008s : 0.01% optimize.opt_a.switch_simplify : 0.000212s : 0.14% optimize.opt_a.loop_unroll : 0.000094s : 0.06% optimize.opt_a.a_1 : 0.001826s : 1.20% optimize.opt_a.with_stream_mark : 0.000026s : 0.02% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000160s : 0.10% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000012s : 0.01% optimize.opt_a.auto_parallel : 0.000012s : 0.01% optimize.opt_a.parallel : 0.000022s : 0.01% optimize.opt_a.flash_sp : 0.000011s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000007s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000012s : 0.01% optimize.opt_a.merge_forward : 0.000006s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000015s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.01% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.002798s : 1.83% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.01% optimize.opt_a.cse : 0.000049s : 0.03% optimize.opt_a.a_3 : 0.000086s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000016s : 0.01% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000479s : 0.31% optimize.opt_b.b_1 : 0.000129s : 0.08% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000022s : 0.01% optimize.loop_unroll : 0.000440s : 0.29% optimize.opt_after_cconv.c_1 : 0.000032s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.01% optimize.tuple_transform.d_1 : 0.000043s : 0.03% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000043s : 0.03% optimize.cse_after_recomputation.cse : 0.000014s : 0.01% optimize.environ_conv : 0.000007s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000011s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000017s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000475s : 0.31% validate : 0.000065s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.007081s : 4.64% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.000450 58 0.38% : 0.000002s : 2: substitution.elim_not_effective 0.76% : 0.000003s : 4: substitution.float_depend_g_call 0.28% : 0.000001s : 2: substitution.fold_const_symbol 1.29% : 0.000006s : 4: substitution.graph_param_transform 87.47% : 0.000393s : 23: substitution.inline 0.66% : 0.000003s : 4: substitution.j_node_and_user_rematch 0.87% : 0.000004s : 4: substitution.partial_eliminate 1.06% : 0.000005s : 4: substitution.remove_not_recompute_node 0.61% : 0.000003s : 2: substitution.replace_old_param 3.46% : 0.000016s : 6: substitution.switch_simplify 3.16% : 0.000014s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.135320 2 95.98% : 0.129874s : 1: type_inference.infer 4.02% : 0.005445s : 1: type_inference.specialize ------[replace.] 0.000231 32 61.04% : 0.000141s : 23: replace.inline 25.76% : 0.000060s : 6: replace.switch_simplify 13.20% : 0.000030s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000406 32 94.01% : 0.000382s : 23: match.inline 2.95% : 0.000012s : 6: match.switch_simplify 3.04% : 0.000012s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000360 2421 1.20% : 0.000004s : 32: predicate.accumulaten_eliminater 0.48% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.27% : 0.000001s : 8: predicate.addn_check_dump 1.13% : 0.000004s : 32: predicate.addn_zero_filter 1.05% : 0.000004s : 32: predicate.adjust_all_reduce_mul_add 2.17% : 0.000008s : 40: predicate.arithmetic_simplify 1.24% : 0.000004s : 32: predicate.cast_eliminate 0.33% : 0.000001s : 8: predicate.check_bprop_eliminate 0.29% : 0.000001s : 8: predicate.compare_switch_simplify 0.11% : 0.000000s : 4: predicate.const_output_eliminate 0.30% : 0.000001s : 8: predicate.depend_value_elim 1.26% : 0.000005s : 32: predicate.dict_get_item_const_eliminator 1.33% : 0.000005s : 32: predicate.dict_get_item_eliminator 1.13% : 0.000004s : 32: predicate.dict_set_item_eliminator 0.56% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.14% : 0.000000s : 4: predicate.elim_not_effective 0.21% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000004s : 36: predicate.environ_add_const_eliminate 1.31% : 0.000005s : 36: predicate.environ_get_add_eliminate 1.22% : 0.000004s : 36: predicate.environ_get_depend_swap 1.61% : 0.000006s : 44: predicate.environ_get_eliminate 1.23% : 0.000004s : 36: predicate.environ_get_set_eliminate 2.28% : 0.000008s : 58: predicate.exchange_switch_depend_value 3.01% : 0.000011s : 58: predicate.float_depend_g_call 0.30% : 0.000001s : 8: predicate.float_environ_get_switch 0.45% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.11% : 0.000000s : 4: predicate.fold_const_symbol 0.38% : 0.000001s : 8: predicate.get_grad_eliminate 0.11% : 0.000000s : 4: predicate.graph_param_transform 0.31% : 0.000001s : 8: predicate.incorporate_call 0.26% : 0.000001s : 8: predicate.incorporate_call_switch 6.21% : 0.000022s : 114: predicate.inline 0.41% : 0.000001s : 8: predicate.inline_without_move 0.17% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.45% : 0.000002s : 8: predicate.less_batch_normalization 1.64% : 0.000006s : 43: predicate.list_to_tuple_eliminator_ 2.59% : 0.000009s : 75: predicate.load_eliminater 0.55% : 0.000002s : 4: predicate.loop_unroll_after_grad 4.24% : 0.000015s : 104: predicate.loop_unroll_before_grad 1.57% : 0.000006s : 40: predicate.make_slice_get_slice_eliminator 0.32% : 0.000001s : 8: predicate.merge_addn 0.30% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.36% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.09% : 0.000004s : 32: predicate.minmaximum_grad 0.56% : 0.000002s : 4: predicate.mutable_eliminate 0.21% : 0.000001s : 4: predicate.opt_reshape 0.29% : 0.000001s : 4: predicate.parallel_virtual_node 3.08% : 0.000011s : 58: predicate.partial_defer_inline 1.57% : 0.000006s : 39: predicate.partial_eliminate 1.13% : 0.000004s : 32: predicate.print_const_string_wrapper 0.31% : 0.000001s : 8: predicate.reduce_all_const_elim 1.55% : 0.000006s : 32: predicate.reduce_eliminate 2.59% : 0.000009s : 75: predicate.redundant_stop_gradient_eliminater 0.24% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000005s : 43: predicate.replace_applicator 0.29% : 0.000001s : 8: predicate.replace_old_param 0.16% : 0.000001s : 4: predicate.reset_defer_inline 1.17% : 0.000004s : 32: predicate.reshape_eliminate 0.31% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.20% : 0.000001s : 4: predicate.row_tensor_eliminate 0.36% : 0.000001s : 8: predicate.same_eliminate 0.25% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.39% : 0.000001s : 8: predicate.shard_identity_eliminate 0.44% : 0.000002s : 8: predicate.special_op_eliminate 0.36% : 0.000001s : 8: predicate.specialize_transform 0.42% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.42% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.16% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.47% : 0.000009s : 58: predicate.switch_defer_inline 2.88% : 0.000010s : 66: predicate.switch_layer_defer_inline 8.45% : 0.000030s : 186: predicate.switch_simplify 1.14% : 0.000004s : 32: predicate.tile_eliminate 1.13% : 0.000004s : 32: predicate.transpose_eliminate 1.59% : 0.000006s : 40: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000006s : 40: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000005s : 40: predicate.tuple_list_get_item_depend_reorder 2.56% : 0.000009s : 51: predicate.tuple_list_get_item_eliminator 1.61% : 0.000006s : 40: predicate.tuple_list_get_set_item_eliminator 2.00% : 0.000007s : 48: predicate.tuple_list_set_item_eliminator 1.68% : 0.000006s : 43: predicate.tuple_to_list_eliminator_ 2.57% : 0.000009s : 75: predicate.updatestate_pure_node_eliminater 2.87% : 0.000010s : 83: predicate.updatestate_useless_node_eliminater 0.19% : 0.000001s : 4: predicate.value_based_eliminate 0.39% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.38% : 0.000001s : 8: predicate.virtual_output_eliminate 0.13% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.23% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004472 44 64.94% : 0.002904s : 18: func_graph_cloner_run.FuncGraphClonerGraph 35.06% : 0.001568s : 26: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.174403 196 0.00% : 0.000004s : 1: ForceFp32Comm 1.98% : 0.003456s : 1: add_attr 1.98% : 0.003447s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.03% : 0.000046s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.12% : 0.000217s : 1: auto_monad 0.01% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.34% : 0.000592s : 1: bootstrap 0.01% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000014s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.55% : 0.000961s : 1: event_method 0.01% : 0.000011s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000017s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.26% : 0.000448s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.28% : 0.000487s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 1.42% : 0.002477s : 78: opt.transform.opt_a 0.02% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000110s : 28: opt.transform.opt_b 0.03% : 0.000048s : 2: opt.transform.opt_trans_graph 0.02% : 0.000036s : 4: opt.transform.symbol_engine_opt 3.50% : 0.006095s : 1: opt_a 0.06% : 0.000106s : 1: opt_after_cconv 0.28% : 0.000484s : 1: opt_after_jit_grad 0.12% : 0.000211s : 1: opt_b 4.86% : 0.008469s : 1: optimize 0.01% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000020s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.04% : 0.000077s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000021s : 1: remove_dup_value 0.76% : 0.001323s : 1: renormalize.infer 0.84% : 0.001467s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000019s : 1: rewriter_after_opt_a 0.30% : 0.000518s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.05% : 0.000090s : 1: symbol_engine_optimizer 4.07% : 0.007091s : 1: task_emit 0.04% : 0.000077s : 1: tuple_transform 77.66% : 0.135437s : 1: type_inference 0.05% : 0.000095s : 1: validate TotalTime = 0.0801138, [24] [bootstrap]: 0.00048268 [type_inference]: 0.0618958 [event_method]: 0.00030885 [auto_monad]: 8.024e-05 [graph_reusing]: 6.61999e-06 [inline]: 2.39999e-06 [add_attr]: 0.00315147, [1] [add_attr_with_inline]: 0.00314349, [1] [Cycle 1]: 6.257e-05, [2] [tag_attr]: 2.735e-05 [meta_addattr_fg_expand]: 7.17002e-06 [parallel-infer-symbol]: 3.75998e-06 [pre_auto_parallel]: 3.704e-05 [insert-virtual-dataset]: 2.93e-06 [parallel-infer-symbol-second]: 8.89995e-07 [dataset_repeat_opt]: 2.09e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00639295, [53] [py_interpret_to_execute]: 4.38999e-06 [rewriter_before_opt_a]: 0.00029836 [opt_a]: 0.00424405, [2] [Cycle 1]: 0.00361184, [45] [expand_dump_flag]: 3.98999e-06 [switch_simplify]: 5.484e-05 [loop_unroll]: 4.115e-05 [a_1]: 0.00070162 [with_stream_mark]: 1.509e-05 [recompute_prepare]: 8.55999e-06 [updatestate_depend_eliminate]: 3.78999e-06 [updatestate_assign_eliminate]: 3.70998e-06 [updatestate_loads_eliminate]: 3.14001e-06 [parameter_eliminate]: 1.94e-06 [a_2]: 8.771e-05 [accelerated_algorithm]: 7.16001e-06 [shard]: 1.87999e-06 [meta_shard_fg_expand]: 1.85001e-06 [shard_inline]: 6.74001e-06 [merge_send_recv]: 8.48999e-06 [auto_parallel]: 5.91003e-06 [parallel]: 1.791e-05 [flash_sp]: 7.36999e-06 [merge_comm]: 3.81001e-06 [allreduce_fusion]: 3.46001e-06 [matmul_add_comm_reduction]: 8.45999e-06 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 7.7e-06 [virtual_dataset]: 6.99001e-06 [get_grad_eliminate_]: 6.73998e-06 [virtual_output]: 6.54001e-06 [merge_forward]: 4.02998e-06 [cell_reuse_recompute_pass]: 1.19e-06 [offload_activation]: 9.46e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.286e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 1.047e-05 [set_forward_comm_id_for_comm_node_pass]: 3.65998e-06 [meta_fg_expand]: 2.88e-06 [flash_sp_send_recv_attached]: 2.78998e-06 [receive_attached]: 2.37999e-06 [after_resolve]: 1.013e-05 [a_after_grad]: 1.012e-05 [renormalize]: 0.00217089 [add_forward_monad_depend]: 5.35001e-06 [auto_monad_grad]: 1.98002e-06 [auto_monad_eliminator]: 1.558e-05 [cse]: 3.607e-05 [a_3]: 4.974e-05 [Cycle 2]: 0.00062287, [45] [expand_dump_flag]: 1.09e-06 [switch_simplify]: 7.65998e-06 [loop_unroll]: 6.48e-06 [a_1]: 0.00012307 [with_stream_mark]: 1.069e-05 [recompute_prepare]: 6.79999e-06 [updatestate_depend_eliminate]: 3.08e-06 [updatestate_assign_eliminate]: 2.56998e-06 [updatestate_loads_eliminate]: 2.72001e-06 [parameter_eliminate]: 1.07e-06 [a_2]: 7.726e-05 [accelerated_algorithm]: 6.62002e-06 [shard]: 1.05999e-06 [meta_shard_fg_expand]: 1.45999e-06 [shard_inline]: 6.14001e-06 [merge_send_recv]: 4.43999e-06 [auto_parallel]: 5.35001e-06 [parallel]: 4.1e-06 [flash_sp]: 3.03e-06 [merge_comm]: 3.26001e-06 [allreduce_fusion]: 3.53e-06 [matmul_add_comm_reduction]: 5.37001e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 7.17002e-06 [virtual_dataset]: 6.78e-06 [get_grad_eliminate_]: 6.48e-06 [virtual_output]: 6.00002e-06 [merge_forward]: 2.74999e-06 [cell_reuse_recompute_pass]: 1.28002e-06 [offload_activation]: 6.48998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.287e-05 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 9.02999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.19001e-06 [meta_fg_expand]: 1.94999e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 1.12999e-06 [after_resolve]: 8.90001e-06 [a_after_grad]: 9.47001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.15999e-06 [auto_monad_grad]: 9.50007e-07 [auto_monad_eliminator]: 6.64001e-06 [cse]: 1.603e-05 [a_3]: 3.843e-05 [py_interpret_to_execute_after_opt_a]: 4.30999e-06 [slice_cell_reuse_recomputed_activation]: 2.27001e-06 [rewriter_after_opt_a]: 1.778e-05 [convert_after_rewriter]: 1.64998e-06 [order_py_execute_after_rewriter]: 1.12999e-06 [mutable_eliminate]: 0.00046659 [opt_b]: 0.00021214, [1] [Cycle 1]: 0.00020672, [7] [b_1]: 0.00013358 [b_2]: 7.81001e-06 [updatestate_depend_eliminate]: 5.52999e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.44001e-06 [renormalize]: 4.60015e-07 [cse]: 2.05e-05 [optimize_parallel_all_gather_comm]: 1.622e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 2.362e-05 [loop_unroll]: 0.00042123 [opt_after_cconv]: 0.00010182, [1] [Cycle 1]: 9.677e-05, [7] [c_1]: 3.032e-05 [parameter_eliminate]: 2.29001e-06 [updatestate_depend_eliminate]: 5.39e-06 [updatestate_assign_eliminate]: 2.69001e-06 [updatestate_loads_eliminate]: 2.37999e-06 [cse]: 2.074e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.788e-05 [tuple_transform]: 7.285e-05, [1] [Cycle 1]: 6.864e-05, [4] [d_1]: 4.233e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7.53e-06 [partial_unused_args_eliminate]: 1.90001e-06 [add_recomputation]: 4.287e-05 [cse_after_recomputation]: 2.302e-05, [1] [Cycle 1]: 1.848e-05, [1] [cse]: 1.33e-05 [environ_conv]: 7.85e-06 [swap_dp_allreduce_reducescatter]: 5.07e-06 [bias_add_comm_swap]: 2.18002e-06 [label_micro_interleaved_index]: 4.27e-06 [label_fine_grained_interleaved_index]: 2.64999e-06 [merge_cast_opt]: 1.76e-06 [slice_recompute_activation]: 2.12001e-06 [micro_interleaved_order_control]: 2.80997e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 7.29982e-07 [remove_cast_before_assign_add]: 1.13001e-06 [full_micro_interleaved_order_control]: 2.30002e-06 [reorder_send_recv_between_fp_bp]: 3.01001e-06 [comm_op_add_attrs]: 1.27e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.19998e-06 [interleave_parallel_branches]: 1.17e-06 [overlap_opt_shard_in_pipeline]: 1.31002e-06 [overlap_opt_shard_grad_in_pipeline]: 1.87001e-06 [control_data_broadcast_order]: 1.309e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 3.48999e-06 [overlap_recompute_and_grad_model_parallel]: 4.86002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.49e-06 [overlap_recompute_allgather_and_fa_grad]: 1.44e-06 [overlap_recompute_comm]: 2.43998e-06 [overlap_grad_ring_attention]: 4e-06 [overlap_grad_flash_sp]: 1.804e-05 [begin_end_overlap_inline]: 5.29981e-07 [split_matmul_comm_elemetwise]: 2.22001e-06 [split_layernorm_comm]: 1.64998e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 7.552e-05, [1] [Cycle 1]: 7.151e-05, [6] [build]: 2.76e-06 [elim_shapecalc]: 1.011e-05 [elim_not_effective]: 1.33e-05 [opt_reshape]: 7.16001e-06 [fold_const_symbol]: 1.024e-05 [renormalize]: 2.10013e-07 [detach_backward]: 1.69e-06 [pipeline_parallel_scheduler]: 1.64998e-06 [auto_monad_reorder]: 1.812e-05 [get_jit_bprop_graph]: 1.03001e-06 [rewriter_after_jit_bprop_graph]: 3.60998e-06 [opt_after_jit_grad]: 0.00048406 [validate]: 4.265e-05 [backend_pass]: 9.00007e-07 [task_emit]: 0.00696842 [execute]: 6.74999e-06 Sums bootstrap : 0.000483s : 0.64% type_inference : 0.061896s : 81.47% event_method : 0.000309s : 0.41% auto_monad : 0.000080s : 0.11% graph_reusing : 0.000007s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000037s : 0.05% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000298s : 0.39% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000062s : 0.08% optimize.opt_a.loop_unroll : 0.000048s : 0.06% optimize.opt_a.a_1 : 0.000825s : 1.09% optimize.opt_a.with_stream_mark : 0.000026s : 0.03% optimize.opt_a.recompute_prepare : 0.000015s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000165s : 0.22% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000013s : 0.02% optimize.opt_a.auto_parallel : 0.000011s : 0.01% optimize.opt_a.parallel : 0.000022s : 0.03% optimize.opt_a.flash_sp : 0.000010s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.02% optimize.opt_a.virtual_dataset : 0.000014s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000013s : 0.02% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000016s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000026s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.03% optimize.opt_a.a_after_grad : 0.000020s : 0.03% optimize.opt_a.renormalize : 0.002171s : 2.86% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.03% optimize.opt_a.cse : 0.000052s : 0.07% optimize.opt_a.a_3 : 0.000088s : 0.12% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000018s : 0.02% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000467s : 0.61% optimize.opt_b.b_1 : 0.000134s : 0.18% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000021s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000024s : 0.03% optimize.loop_unroll : 0.000421s : 0.55% optimize.opt_after_cconv.c_1 : 0.000030s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000018s : 0.02% optimize.tuple_transform.d_1 : 0.000042s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000043s : 0.06% optimize.cse_after_recomputation.cse : 0.000013s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000018s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000484s : 0.64% validate : 0.000043s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.006968s : 9.17% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.000187 27 1.08% : 0.000002s : 2: substitution.elim_not_effective 0.87% : 0.000002s : 2: substitution.fold_const_symbol 2.81% : 0.000005s : 4: substitution.graph_param_transform 81.99% : 0.000153s : 6: substitution.inline 1.70% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.53% : 0.000005s : 4: substitution.remove_not_recompute_node 1.60% : 0.000003s : 2: substitution.replace_old_param 7.42% : 0.000014s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.061815 2 96.44% : 0.059612s : 1: type_inference.infer 3.56% : 0.002204s : 1: type_inference.specialize ------[replace.] 0.000081 9 71.21% : 0.000058s : 6: replace.inline 28.79% : 0.000023s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000162 9 92.46% : 0.000150s : 6: match.inline 7.54% : 0.000012s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000207 1396 1.01% : 0.000002s : 15: predicate.accumulaten_eliminater 0.74% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.49% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 15: predicate.addn_zero_filter 0.89% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.20% : 0.000005s : 23: predicate.arithmetic_simplify 0.96% : 0.000002s : 15: predicate.cast_eliminate 0.59% : 0.000001s : 8: predicate.check_bprop_eliminate 0.48% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.54% : 0.000001s : 8: predicate.depend_value_elim 1.06% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.18% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.94% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.93% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.29% : 0.000001s : 4: predicate.elim_not_effective 0.35% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.28% : 0.000003s : 19: predicate.environ_add_const_eliminate 1.20% : 0.000002s : 19: predicate.environ_get_add_eliminate 1.09% : 0.000002s : 19: predicate.environ_get_depend_swap 1.81% : 0.000004s : 27: predicate.environ_get_eliminate 1.12% : 0.000002s : 19: predicate.environ_get_set_eliminate 1.55% : 0.000003s : 24: predicate.exchange_switch_depend_value 2.46% : 0.000005s : 24: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.76% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.19% : 0.000000s : 4: predicate.fold_const_symbol 0.67% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.51% : 0.000001s : 8: predicate.incorporate_call 0.43% : 0.000001s : 8: predicate.incorporate_call_switch 5.87% : 0.000012s : 63: predicate.inline 0.62% : 0.000001s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.76% : 0.000002s : 8: predicate.less_batch_normalization 1.70% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.54% : 0.000005s : 41: predicate.load_eliminater 0.82% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.00% : 0.000006s : 48: predicate.loop_unroll_before_grad 1.76% : 0.000004s : 23: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 15: predicate.minmaximum_grad 0.96% : 0.000002s : 4: predicate.mutable_eliminate 0.30% : 0.000001s : 4: predicate.opt_reshape 0.34% : 0.000001s : 4: predicate.parallel_virtual_node 2.03% : 0.000004s : 24: predicate.partial_defer_inline 1.45% : 0.000003s : 22: predicate.partial_eliminate 0.91% : 0.000002s : 15: predicate.print_const_string_wrapper 0.57% : 0.000001s : 8: predicate.reduce_all_const_elim 1.33% : 0.000003s : 15: predicate.reduce_eliminate 2.45% : 0.000005s : 41: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 26: predicate.replace_applicator 0.40% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.99% : 0.000002s : 15: predicate.reshape_eliminate 0.55% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.43% : 0.000001s : 4: predicate.row_tensor_eliminate 0.70% : 0.000001s : 8: predicate.same_eliminate 0.43% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.78% : 0.000002s : 8: predicate.shard_identity_eliminate 0.58% : 0.000001s : 8: predicate.special_op_eliminate 0.67% : 0.000001s : 8: predicate.specialize_transform 0.75% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.30% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.65% : 0.000003s : 24: predicate.switch_defer_inline 2.16% : 0.000004s : 32: predicate.switch_layer_defer_inline 5.95% : 0.000012s : 84: predicate.switch_simplify 0.97% : 0.000002s : 15: predicate.tile_eliminate 0.93% : 0.000002s : 15: predicate.transpose_eliminate 1.62% : 0.000003s : 23: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000003s : 23: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000003s : 23: predicate.tuple_list_get_item_depend_reorder 3.00% : 0.000006s : 34: predicate.tuple_list_get_item_eliminator 1.66% : 0.000003s : 23: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000005s : 31: predicate.tuple_list_set_item_eliminator 1.67% : 0.000003s : 26: predicate.tuple_to_list_eliminator_ 2.38% : 0.000005s : 41: predicate.updatestate_pure_node_eliminater 3.13% : 0.000006s : 49: predicate.updatestate_useless_node_eliminater 0.31% : 0.000001s : 4: predicate.value_based_eliminate 0.62% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.29% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002785 33 70.53% : 0.001964s : 25: func_graph_cloner_run.FuncGraphClonerGraph 29.47% : 0.000821s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.093311 196 0.00% : 0.000003s : 1: ForceFp32Comm 3.38% : 0.003156s : 1: add_attr 3.37% : 0.003147s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000047s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.10% : 0.000089s : 1: auto_monad 0.02% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.55% : 0.000514s : 1: bootstrap 0.03% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.35% : 0.000322s : 1: event_method 0.01% : 0.000011s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.03% : 0.000032s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.46% : 0.000429s : 1: loop_unroll 0.01% : 0.000005s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.51% : 0.000475s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000015s : 1: opt.transform.mutable_eliminate 1.38% : 0.001287s : 78: opt.transform.opt_a 0.03% : 0.000029s : 1: opt.transform.opt_after_cconv 0.03% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.12% : 0.000114s : 28: opt.transform.opt_b 0.05% : 0.000048s : 2: opt.transform.opt_trans_graph 0.04% : 0.000037s : 4: opt.transform.symbol_engine_opt 4.55% : 0.004247s : 1: opt_a 0.11% : 0.000105s : 1: opt_after_cconv 0.53% : 0.000493s : 1: opt_after_jit_grad 0.23% : 0.000216s : 1: opt_b 6.86% : 0.006397s : 1: optimize 0.02% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000021s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000041s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000021s : 1: remove_dup_value 1.41% : 0.001312s : 1: renormalize.infer 0.91% : 0.000849s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000021s : 1: rewriter_after_opt_a 0.33% : 0.000304s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.08% : 0.000078s : 1: symbol_engine_optimizer 7.48% : 0.006979s : 1: task_emit 0.08% : 0.000076s : 1: tuple_transform 66.35% : 0.061916s : 1: type_inference 0.08% : 0.000072s : 1: validate TotalTime = 0.272104, [24] [bootstrap]: 0.00050749 [type_inference]: 0.231069 [event_method]: 7.577e-05 [auto_monad]: 0.00025646 [graph_reusing]: 1.922e-05 [inline]: 2.12999e-06 [add_attr]: 0.00343897, [1] [add_attr_with_inline]: 0.00343055, [1] [Cycle 1]: 0.00012928, [2] [tag_attr]: 7.307e-05 [meta_addattr_fg_expand]: 2.366e-05 [parallel-infer-symbol]: 3.69002e-06 [pre_auto_parallel]: 9.542e-05 [insert-virtual-dataset]: 2.84999e-06 [parallel-infer-symbol-second]: 8.10018e-07 [dataset_repeat_opt]: 1.95001e-06 [pipeline_split]: 2.06e-06 [optimize]: 0.0267161, [53] [py_interpret_to_execute]: 4.89998e-06 [rewriter_before_opt_a]: 0.0006707 [opt_a]: 0.0240083, [3] [Cycle 1]: 0.0193897, [45] [expand_dump_flag]: 8.40001e-06 [switch_simplify]: 0.00028262 [loop_unroll]: 0.0001242 [a_1]: 0.00258048 [with_stream_mark]: 2.365e-05 [recompute_prepare]: 2.203e-05 [updatestate_depend_eliminate]: 8.83001e-06 [updatestate_assign_eliminate]: 7.69002e-06 [updatestate_loads_eliminate]: 7.35e-06 [parameter_eliminate]: 2.61e-06 [a_2]: 0.00022393 [accelerated_algorithm]: 1.496e-05 [shard]: 1.87001e-06 [meta_shard_fg_expand]: 6.28998e-06 [shard_inline]: 1.48e-05 [merge_send_recv]: 1.508e-05 [auto_parallel]: 1.055e-05 [parallel]: 1.777e-05 [flash_sp]: 9.15999e-06 [merge_comm]: 8.99e-06 [allreduce_fusion]: 8.02998e-06 [matmul_add_comm_reduction]: 2.486e-05 [allreduce_slice_to_reducescatter]: 6.00005e-07 [virtual_shard_identity]: 1.704e-05 [virtual_dataset]: 1.519e-05 [get_grad_eliminate_]: 1.453e-05 [virtual_output]: 1.553e-05 [merge_forward]: 8.62e-06 [cell_reuse_recompute_pass]: 1.14998e-06 [offload_activation]: 1.674e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.521e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 2.401e-05 [set_forward_comm_id_for_comm_node_pass]: 8.37998e-06 [meta_fg_expand]: 0.00169472 [flash_sp_send_recv_attached]: 3.94997e-06 [receive_attached]: 2.26e-06 [after_resolve]: 6.8e-05 [a_after_grad]: 9.047e-05 [renormalize]: 0.0128008 [add_forward_monad_depend]: 1.014e-05 [auto_monad_grad]: 5.89999e-06 [auto_monad_eliminator]: 5.429e-05 [cse]: 0.00044301 [a_3]: 0.00034063 [Cycle 2]: 0.00394311, [45] [expand_dump_flag]: 1.72999e-06 [switch_simplify]: 4.551e-05 [loop_unroll]: 4.276e-05 [a_1]: 0.00128219 [with_stream_mark]: 1.455e-05 [recompute_prepare]: 8.94e-06 [updatestate_depend_eliminate]: 3.45998e-06 [updatestate_assign_eliminate]: 2.84001e-06 [updatestate_loads_eliminate]: 2.58003e-06 [parameter_eliminate]: 1.05999e-06 [a_2]: 8.383e-05 [accelerated_algorithm]: 7.33e-06 [shard]: 1.40001e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 6.72002e-06 [merge_send_recv]: 4.80001e-06 [auto_parallel]: 5.49e-06 [parallel]: 4.28999e-06 [flash_sp]: 3.25998e-06 [merge_comm]: 3.26001e-06 [allreduce_fusion]: 3.06001e-06 [matmul_add_comm_reduction]: 5.52001e-06 [allreduce_slice_to_reducescatter]: 5.69999e-07 [virtual_shard_identity]: 7.69002e-06 [virtual_dataset]: 6.66e-06 [get_grad_eliminate_]: 7.15e-06 [virtual_output]: 6.93e-06 [merge_forward]: 2.96999e-06 [cell_reuse_recompute_pass]: 1.04003e-06 [offload_activation]: 6.93e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.253e-05 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 9.47001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.33998e-06 [meta_fg_expand]: 0.00070922 [flash_sp_send_recv_attached]: 1.55999e-06 [receive_attached]: 1.40001e-06 [after_resolve]: 1.592e-05 [a_after_grad]: 1.14e-05 [renormalize]: 0.00126064 [add_forward_monad_depend]: 4.42e-06 [auto_monad_grad]: 1.30001e-06 [auto_monad_eliminator]: 1.122e-05 [cse]: 2.342e-05 [a_3]: 5.003e-05 [Cycle 3]: 0.00066121, [45] [expand_dump_flag]: 1.18001e-06 [switch_simplify]: 7.97e-06 [loop_unroll]: 6.84001e-06 [a_1]: 0.00012987 [with_stream_mark]: 8.80999e-06 [recompute_prepare]: 6.88998e-06 [updatestate_depend_eliminate]: 3.16001e-06 [updatestate_assign_eliminate]: 2.77002e-06 [updatestate_loads_eliminate]: 2.48e-06 [parameter_eliminate]: 9.79984e-07 [a_2]: 8.125e-05 [accelerated_algorithm]: 6.64999e-06 [shard]: 1.14e-06 [meta_shard_fg_expand]: 1.45999e-06 [shard_inline]: 6.79001e-06 [merge_send_recv]: 4.52e-06 [auto_parallel]: 5.57001e-06 [parallel]: 4.12e-06 [flash_sp]: 8.50006e-07 [merge_comm]: 3.08e-06 [allreduce_fusion]: 2.83e-06 [matmul_add_comm_reduction]: 4.99e-06 [allreduce_slice_to_reducescatter]: 3.7998e-07 [virtual_shard_identity]: 1.094e-05 [virtual_dataset]: 6.46999e-06 [get_grad_eliminate_]: 6.19999e-06 [virtual_output]: 6.16e-06 [merge_forward]: 3.03e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 6.49001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.342e-05 [merge_recompute_call_nodes]: 6.69999e-07 [before_grad]: 9.77001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.34001e-06 [meta_fg_expand]: 2.09999e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 1.04e-06 [after_resolve]: 6.73e-06 [a_after_grad]: 9.42001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 2.022e-05 [cse]: 2.023e-05 [a_3]: 4.007e-05 [py_interpret_to_execute_after_opt_a]: 5.15001e-06 [slice_cell_reuse_recomputed_activation]: 2.30002e-06 [rewriter_after_opt_a]: 1.869e-05 [convert_after_rewriter]: 1.32999e-06 [order_py_execute_after_rewriter]: 1.09e-06 [mutable_eliminate]: 0.00049583 [opt_b]: 0.00022229, [1] [Cycle 1]: 0.0002166, [7] [b_1]: 0.00014058 [b_2]: 8e-06 [updatestate_depend_eliminate]: 5.53002e-06 [updatestate_assign_eliminate]: 2.68998e-06 [updatestate_loads_eliminate]: 2.42001e-06 [renormalize]: 4.19997e-07 [cse]: 2.295e-05 [optimize_parallel_all_gather_comm]: 1.673e-05 [overlap_param_gather]: 2.32001e-06 [cconv]: 1.953e-05 [loop_unroll]: 0.00044363 [opt_after_cconv]: 0.0001093, [1] [Cycle 1]: 0.00010359, [7] [c_1]: 3.314e-05 [parameter_eliminate]: 2.64999e-06 [updatestate_depend_eliminate]: 5.42999e-06 [updatestate_assign_eliminate]: 2.66e-06 [updatestate_loads_eliminate]: 2.43002e-06 [cse]: 2.337e-05 [renormalize]: 4.89992e-07 [remove_dup_value]: 1.851e-05 [tuple_transform]: 7.538e-05, [1] [Cycle 1]: 7.114e-05, [4] [d_1]: 4.417e-05 [none_parameter_eliminate]: 1.89e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 7.9e-06 [partial_unused_args_eliminate]: 1.89e-06 [add_recomputation]: 3.986e-05 [cse_after_recomputation]: 2.582e-05, [1] [Cycle 1]: 2.089e-05, [1] [cse]: 1.568e-05 [environ_conv]: 9.20999e-06 [swap_dp_allreduce_reducescatter]: 5.32001e-06 [bias_add_comm_swap]: 2.16e-06 [label_micro_interleaved_index]: 4.3e-06 [label_fine_grained_interleaved_index]: 2.31998e-06 [merge_cast_opt]: 1.20999e-06 [slice_recompute_activation]: 1.97001e-06 [micro_interleaved_order_control]: 2.42001e-06 [assign_add_opt]: 1.24003e-06 [ForceFp32Comm]: 1.36998e-06 [remove_cast_before_assign_add]: 1.05999e-06 [full_micro_interleaved_order_control]: 2.59001e-06 [reorder_send_recv_between_fp_bp]: 2.66e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 9.39996e-07 [interleave_split_concat_branches]: 1.22e-06 [interleave_parallel_branches]: 1.04998e-06 [overlap_opt_shard_in_pipeline]: 1.09003e-06 [overlap_opt_shard_grad_in_pipeline]: 2.12001e-06 [control_data_broadcast_order]: 1.413e-05 [grouped_pairwise_exchange_alltoall]: 1.47001e-06 [offloading_packed_experts]: 4.26001e-06 [overlap_recompute_and_grad_model_parallel]: 5.62999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30001e-06 [overlap_recompute_comm]: 2.03002e-06 [overlap_grad_ring_attention]: 4.00998e-06 [overlap_grad_flash_sp]: 1.899e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.11003e-06 [split_layernorm_comm]: 2.04e-06 [handle_group_info]: 1.25999e-06 [symbol_engine_optimizer]: 8.428e-05, [1] [Cycle 1]: 7.948e-05, [6] [build]: 2.94001e-06 [elim_shapecalc]: 1.325e-05 [elim_not_effective]: 1.436e-05 [opt_reshape]: 7.79002e-06 [fold_const_symbol]: 1.109e-05 [renormalize]: 2.40019e-07 [detach_backward]: 1.67999e-06 [pipeline_parallel_scheduler]: 1.40999e-06 [auto_monad_reorder]: 1.886e-05 [get_jit_bprop_graph]: 1.15001e-06 [rewriter_after_jit_bprop_graph]: 4.14002e-06 [opt_after_jit_grad]: 0.00049051 [validate]: 4.468e-05 [backend_pass]: 1.00999e-06 [task_emit]: 0.00912777 [execute]: 6.74999e-06 Sums bootstrap : 0.000507s : 0.19% type_inference : 0.231069s : 86.45% event_method : 0.000076s : 0.03% auto_monad : 0.000256s : 0.10% graph_reusing : 0.000019s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000073s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000024s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000095s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000671s : 0.25% optimize.opt_a.expand_dump_flag : 0.000011s : 0.00% optimize.opt_a.switch_simplify : 0.000336s : 0.13% optimize.opt_a.loop_unroll : 0.000174s : 0.07% optimize.opt_a.a_1 : 0.003993s : 1.49% optimize.opt_a.with_stream_mark : 0.000047s : 0.02% optimize.opt_a.recompute_prepare : 0.000038s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000015s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000013s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000012s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000389s : 0.15% optimize.opt_a.accelerated_algorithm : 0.000029s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000010s : 0.00% optimize.opt_a.shard_inline : 0.000028s : 0.01% optimize.opt_a.merge_send_recv : 0.000024s : 0.01% optimize.opt_a.auto_parallel : 0.000022s : 0.01% optimize.opt_a.parallel : 0.000026s : 0.01% optimize.opt_a.flash_sp : 0.000013s : 0.00% optimize.opt_a.merge_comm : 0.000015s : 0.01% optimize.opt_a.allreduce_fusion : 0.000014s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000035s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000036s : 0.01% optimize.opt_a.virtual_dataset : 0.000028s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000028s : 0.01% optimize.opt_a.virtual_output : 0.000029s : 0.01% optimize.opt_a.merge_forward : 0.000015s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000030s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000051s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000043s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000015s : 0.01% optimize.opt_a.meta_fg_expand : 0.002406s : 0.90% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000091s : 0.03% optimize.opt_a.a_after_grad : 0.000111s : 0.04% optimize.opt_a.renormalize : 0.014061s : 5.26% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.01% optimize.opt_a.auto_monad_grad : 0.000008s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000086s : 0.03% optimize.opt_a.cse : 0.000487s : 0.18% optimize.opt_a.a_3 : 0.000431s : 0.16% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000019s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000496s : 0.19% optimize.opt_b.b_1 : 0.000141s : 0.05% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000020s : 0.01% optimize.loop_unroll : 0.000444s : 0.17% optimize.opt_after_cconv.c_1 : 0.000033s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000023s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.01% optimize.tuple_transform.d_1 : 0.000044s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000040s : 0.01% optimize.cse_after_recomputation.cse : 0.000016s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000019s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000019s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000491s : 0.18% validate : 0.000045s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.009128s : 3.41% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.001029 186 0.19% : 0.000002s : 2: substitution.elim_not_effective 0.93% : 0.000010s : 14: substitution.float_depend_g_call 0.33% : 0.000003s : 2: substitution.float_tuple_getitem_switch 0.13% : 0.000001s : 2: substitution.fold_const_symbol 0.52% : 0.000005s : 4: substitution.graph_param_transform 0.23% : 0.000002s : 2: substitution.incorporate_call 0.20% : 0.000002s : 2: substitution.incorporate_call_switch 76.65% : 0.000788s : 38: substitution.inline 1.71% : 0.000018s : 2: substitution.inline_without_move 0.71% : 0.000007s : 12: substitution.j_node_and_user_rematch 0.97% : 0.000010s : 7: substitution.minmaximum_grad 1.06% : 0.000011s : 14: substitution.partial_eliminate 0.90% : 0.000009s : 12: substitution.remove_not_recompute_node 2.29% : 0.000024s : 9: substitution.replace_applicator 0.58% : 0.000006s : 9: substitution.replace_old_param 0.25% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.99% : 0.000020s : 9: substitution.switch_simplify 1.95% : 0.000020s : 7: substitution.tuple_list_convert_item_index_to_positive 0.95% : 0.000010s : 7: substitution.tuple_list_get_item_const_eliminator 1.32% : 0.000014s : 7: substitution.tuple_list_get_item_depend_reorder 4.89% : 0.000050s : 17: substitution.tuple_list_get_item_eliminator 1.26% : 0.000013s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.229454 2 96.08% : 0.220450s : 1: type_inference.infer 3.92% : 0.009003s : 1: type_inference.specialize ------[replace.] 0.000422 55 61.38% : 0.000259s : 38: replace.inline 19.69% : 0.000083s : 9: replace.switch_simplify 18.93% : 0.000080s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.000811 55 94.93% : 0.000769s : 38: match.inline 1.91% : 0.000016s : 9: match.switch_simplify 3.16% : 0.000026s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.000779 5484 1.20% : 0.000009s : 73: predicate.accumulaten_eliminater 0.21% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.36% : 0.000003s : 21: predicate.addn_check_dump 1.18% : 0.000009s : 73: predicate.addn_zero_filter 1.18% : 0.000009s : 73: predicate.adjust_all_reduce_mul_add 2.12% : 0.000017s : 94: predicate.arithmetic_simplify 1.25% : 0.000010s : 73: predicate.cast_eliminate 0.91% : 0.000007s : 52: predicate.check_bprop_eliminate 0.35% : 0.000003s : 21: predicate.compare_switch_simplify 0.06% : 0.000000s : 4: predicate.const_output_eliminate 0.36% : 0.000003s : 21: predicate.depend_value_elim 1.32% : 0.000010s : 73: predicate.dict_get_item_const_eliminator 1.32% : 0.000010s : 73: predicate.dict_get_item_eliminator 1.16% : 0.000009s : 73: predicate.dict_set_item_eliminator 0.25% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.06% : 0.000001s : 4: predicate.elim_not_effective 0.11% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000010s : 77: predicate.environ_add_const_eliminate 1.16% : 0.000009s : 77: predicate.environ_get_add_eliminate 1.21% : 0.000009s : 77: predicate.environ_get_depend_swap 1.68% : 0.000013s : 98: predicate.environ_get_eliminate 1.19% : 0.000009s : 77: predicate.environ_get_set_eliminate 2.04% : 0.000016s : 119: predicate.exchange_switch_depend_value 2.78% : 0.000022s : 119: predicate.float_depend_g_call 0.37% : 0.000003s : 21: predicate.float_environ_get_switch 0.45% : 0.000003s : 25: predicate.float_tuple_getitem_switch 0.05% : 0.000000s : 4: predicate.fold_const_symbol 0.41% : 0.000003s : 21: predicate.get_grad_eliminate 0.06% : 0.000000s : 4: predicate.graph_param_transform 0.36% : 0.000003s : 21: predicate.incorporate_call 0.33% : 0.000003s : 21: predicate.incorporate_call_switch 5.70% : 0.000044s : 242: predicate.inline 1.17% : 0.000009s : 48: predicate.inline_without_move 0.21% : 0.000002s : 21: predicate.j_node_and_user_rematch 0.45% : 0.000004s : 21: predicate.less_batch_normalization 1.52% : 0.000012s : 89: predicate.list_to_tuple_eliminator_ 2.57% : 0.000020s : 162: predicate.load_eliminater 0.26% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.59% : 0.000028s : 192: predicate.loop_unroll_before_grad 1.36% : 0.000011s : 81: predicate.make_slice_get_slice_eliminator 0.38% : 0.000003s : 21: predicate.merge_addn 0.88% : 0.000007s : 52: predicate.micro_step_allgather_replace 0.87% : 0.000007s : 52: predicate.mini_step_allgather_replace 1.15% : 0.000009s : 73: predicate.minmaximum_grad 0.27% : 0.000002s : 4: predicate.mutable_eliminate 0.09% : 0.000001s : 4: predicate.opt_reshape 0.16% : 0.000001s : 4: predicate.parallel_virtual_node 2.75% : 0.000021s : 119: predicate.partial_defer_inline 1.57% : 0.000012s : 85: predicate.partial_eliminate 1.22% : 0.000010s : 73: predicate.print_const_string_wrapper 0.38% : 0.000003s : 21: predicate.reduce_all_const_elim 1.56% : 0.000012s : 73: predicate.reduce_eliminate 2.61% : 0.000020s : 162: predicate.redundant_stop_gradient_eliminater 0.23% : 0.000002s : 21: predicate.remove_not_recompute_node 1.71% : 0.000013s : 133: predicate.replace_applicator 0.56% : 0.000004s : 48: predicate.replace_old_param 0.08% : 0.000001s : 4: predicate.reset_defer_inline 1.21% : 0.000009s : 73: predicate.reshape_eliminate 0.88% : 0.000007s : 52: predicate.row_tensor_add_zeros_like 0.09% : 0.000001s : 4: predicate.row_tensor_eliminate 1.10% : 0.000009s : 52: predicate.same_eliminate 0.28% : 0.000002s : 21: predicate.set_cell_output_no_recompute 0.42% : 0.000003s : 21: predicate.shard_identity_eliminate 0.19% : 0.000001s : 8: predicate.special_op_eliminate 0.42% : 0.000003s : 21: predicate.specialize_transform 1.01% : 0.000008s : 52: predicate.split_environ_get_set_with_tuple_value 1.04% : 0.000008s : 48: predicate.stack_unstack_eliminate 0.08% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.32% : 0.000018s : 119: predicate.switch_defer_inline 3.10% : 0.000024s : 171: predicate.switch_layer_defer_inline 7.06% : 0.000055s : 354: predicate.switch_simplify 1.14% : 0.000009s : 73: predicate.tile_eliminate 1.15% : 0.000009s : 73: predicate.transpose_eliminate 1.49% : 0.000012s : 81: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000012s : 81: predicate.tuple_list_get_item_const_eliminator 1.43% : 0.000011s : 81: predicate.tuple_list_get_item_depend_reorder 2.46% : 0.000019s : 110: predicate.tuple_list_get_item_eliminator 1.52% : 0.000012s : 81: predicate.tuple_list_get_set_item_eliminator 2.02% : 0.000016s : 102: predicate.tuple_list_set_item_eliminator 1.50% : 0.000012s : 89: predicate.tuple_to_list_eliminator_ 2.53% : 0.000020s : 162: predicate.updatestate_pure_node_eliminater 2.90% : 0.000023s : 183: predicate.updatestate_useless_node_eliminater 0.09% : 0.000001s : 4: predicate.value_based_eliminate 0.41% : 0.000003s : 21: predicate.virtual_dataset_eliminate 0.39% : 0.000003s : 21: predicate.virtual_output_eliminate 0.06% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.13% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.007680 86 72.23% : 0.005547s : 43: func_graph_cloner_run.FuncGraphClonerGraph 27.77% : 0.002133s : 43: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.322219 237 0.00% : 0.000005s : 1: ForceFp32Comm 1.07% : 0.003443s : 1: add_attr 1.07% : 0.003434s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000044s : 1: add_recomputation 0.03% : 0.000112s : 1: assign_add_opt 0.08% : 0.000266s : 1: auto_monad 0.01% : 0.000023s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000007s : 1: bias_add_comm_swap 0.17% : 0.000537s : 1: bootstrap 0.01% : 0.000023s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.01% : 0.000018s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.01% : 0.000029s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.03% : 0.000082s : 1: event_method 0.00% : 0.000012s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000023s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.14% : 0.000452s : 1: loop_unroll 0.00% : 0.000009s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.16% : 0.000504s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 1.77% : 0.005698s : 117: opt.transform.opt_a 0.01% : 0.000032s : 1: opt.transform.opt_after_cconv 0.01% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000120s : 28: opt.transform.opt_b 0.02% : 0.000050s : 2: opt.transform.opt_trans_graph 0.01% : 0.000042s : 4: opt.transform.symbol_engine_opt 7.45% : 0.024012s : 1: opt_a 0.04% : 0.000114s : 1: opt_after_cconv 0.15% : 0.000499s : 1: opt_after_jit_grad 0.07% : 0.000226s : 1: opt_b 8.29% : 0.026721s : 1: optimize 0.01% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000023s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000005s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000100s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 3.53% : 0.011385s : 2: renormalize.infer 0.83% : 0.002659s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000022s : 1: rewriter_after_opt_a 0.21% : 0.000681s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000087s : 1: symbol_engine_optimizer 2.84% : 0.009138s : 1: task_emit 0.02% : 0.000078s : 1: tuple_transform 71.72% : 0.231093s : 1: type_inference 0.02% : 0.000073s : 1: validate TotalTime = 0.0769883, [24] [bootstrap]: 0.00048322 [type_inference]: 0.0589887 [event_method]: 0.0003153 [auto_monad]: 8.168e-05 [graph_reusing]: 6.59001e-06 [inline]: 2.14e-06 [add_attr]: 0.00310942, [1] [add_attr_with_inline]: 0.0031015, [1] [Cycle 1]: 6.185e-05, [2] [tag_attr]: 2.738e-05 [meta_addattr_fg_expand]: 7.28e-06 [parallel-infer-symbol]: 3.08e-06 [pre_auto_parallel]: 3.768e-05 [insert-virtual-dataset]: 2.44999e-06 [parallel-infer-symbol-second]: 7.29982e-07 [dataset_repeat_opt]: 2.18002e-06 [pipeline_split]: 1.67001e-06 [optimize]: 0.00620296, [53] [py_interpret_to_execute]: 4.53001e-06 [rewriter_before_opt_a]: 0.00027582 [opt_a]: 0.0040784, [2] [Cycle 1]: 0.0034124, [45] [expand_dump_flag]: 3.33e-06 [switch_simplify]: 5.884e-05 [loop_unroll]: 4.143e-05 [a_1]: 0.00070604 [with_stream_mark]: 1.398e-05 [recompute_prepare]: 8.79e-06 [updatestate_depend_eliminate]: 4.07e-06 [updatestate_assign_eliminate]: 3.75e-06 [updatestate_loads_eliminate]: 3.06999e-06 [parameter_eliminate]: 1.97999e-06 [a_2]: 8.692e-05 [accelerated_algorithm]: 7.18e-06 [shard]: 1.74e-06 [meta_shard_fg_expand]: 2.18002e-06 [shard_inline]: 6.99001e-06 [merge_send_recv]: 8.05999e-06 [auto_parallel]: 5.87999e-06 [parallel]: 1.794e-05 [flash_sp]: 7.78001e-06 [merge_comm]: 3.87998e-06 [allreduce_fusion]: 3.50998e-06 [matmul_add_comm_reduction]: 9.02999e-06 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 8.09002e-06 [virtual_dataset]: 6.79999e-06 [get_grad_eliminate_]: 6.71e-06 [virtual_output]: 6.54999e-06 [merge_forward]: 4.22e-06 [cell_reuse_recompute_pass]: 1.04998e-06 [offload_activation]: 9.54e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.266e-05 [merge_recompute_call_nodes]: 1.55999e-06 [before_grad]: 1.014e-05 [set_forward_comm_id_for_comm_node_pass]: 3.68e-06 [meta_fg_expand]: 3.16001e-06 [flash_sp_send_recv_attached]: 2.49001e-06 [receive_attached]: 2.48e-06 [after_resolve]: 9.94001e-06 [a_after_grad]: 1.032e-05 [renormalize]: 0.00195913 [add_forward_monad_depend]: 5.95002e-06 [auto_monad_grad]: 1.93002e-06 [auto_monad_eliminator]: 1.517e-05 [cse]: 3.712e-05 [a_3]: 5.03e-05 [Cycle 2]: 0.0006566, [45] [expand_dump_flag]: 1.04e-06 [switch_simplify]: 7.5e-06 [loop_unroll]: 7.33e-06 [a_1]: 0.00012793 [with_stream_mark]: 1.098e-05 [recompute_prepare]: 6.78998e-06 [updatestate_depend_eliminate]: 3.16001e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 3.06999e-06 [parameter_eliminate]: 9.39996e-07 [a_2]: 7.794e-05 [accelerated_algorithm]: 6.61e-06 [shard]: 1.13001e-06 [meta_shard_fg_expand]: 1.41002e-06 [shard_inline]: 6.25002e-06 [merge_send_recv]: 4.95001e-06 [auto_parallel]: 5.05999e-06 [parallel]: 5.04e-06 [flash_sp]: 2.84999e-06 [merge_comm]: 2.98998e-06 [allreduce_fusion]: 3.03998e-06 [matmul_add_comm_reduction]: 5.25999e-06 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 7.03e-06 [virtual_dataset]: 6.31998e-06 [get_grad_eliminate_]: 5.96e-06 [virtual_output]: 5.91e-06 [merge_forward]: 2.69001e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 6.18998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.228e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 9.13002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.28e-06 [meta_fg_expand]: 2.21e-06 [flash_sp_send_recv_attached]: 8.80013e-07 [receive_attached]: 1.04e-06 [after_resolve]: 9.29998e-06 [a_after_grad]: 9.87999e-06 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.10999e-06 [auto_monad_grad]: 8.49977e-07 [auto_monad_eliminator]: 6.39001e-06 [cse]: 1.674e-05 [a_3]: 3.895e-05 [py_interpret_to_execute_after_opt_a]: 4.70999e-06 [slice_cell_reuse_recomputed_activation]: 1.92999e-06 [rewriter_after_opt_a]: 1.809e-05 [convert_after_rewriter]: 1.29e-06 [order_py_execute_after_rewriter]: 1.14e-06 [mutable_eliminate]: 0.00046197 [opt_b]: 0.00020901, [1] [Cycle 1]: 0.00020342, [7] [b_1]: 0.00013114 [b_2]: 8.2e-06 [updatestate_depend_eliminate]: 5.22e-06 [updatestate_assign_eliminate]: 2.75002e-06 [updatestate_loads_eliminate]: 2.44001e-06 [renormalize]: 3.80009e-07 [cse]: 2.027e-05 [optimize_parallel_all_gather_comm]: 1.691e-05 [overlap_param_gather]: 2.04e-06 [cconv]: 2.333e-05 [loop_unroll]: 0.00042252 [opt_after_cconv]: 0.00010171, [1] [Cycle 1]: 9.626e-05, [7] [c_1]: 3.168e-05 [parameter_eliminate]: 2.26e-06 [updatestate_depend_eliminate]: 5.25999e-06 [updatestate_assign_eliminate]: 2.76999e-06 [updatestate_loads_eliminate]: 2.36e-06 [cse]: 2.016e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 1.669e-05 [tuple_transform]: 7.328e-05, [1] [Cycle 1]: 6.913e-05, [4] [d_1]: 4.224e-05 [none_parameter_eliminate]: 1.68002e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 7.1e-06 [partial_unused_args_eliminate]: 1.69e-06 [add_recomputation]: 4.354e-05 [cse_after_recomputation]: 5.186e-05, [1] [Cycle 1]: 4.711e-05, [1] [cse]: 1.368e-05 [environ_conv]: 7.97e-06 [swap_dp_allreduce_reducescatter]: 6.08002e-06 [bias_add_comm_swap]: 2.44999e-06 [label_micro_interleaved_index]: 4.58001e-06 [label_fine_grained_interleaved_index]: 3.19001e-06 [merge_cast_opt]: 1.49e-06 [slice_recompute_activation]: 2.32001e-06 [micro_interleaved_order_control]: 2.60002e-06 [assign_add_opt]: 1.19003e-06 [ForceFp32Comm]: 7.59988e-07 [remove_cast_before_assign_add]: 1.09998e-06 [full_micro_interleaved_order_control]: 2.41e-06 [reorder_send_recv_between_fp_bp]: 2.58998e-06 [comm_op_add_attrs]: 1.12e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 1.12999e-06 [overlap_opt_shard_in_pipeline]: 1.47001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.87999e-06 [control_data_broadcast_order]: 1.296e-05 [grouped_pairwise_exchange_alltoall]: 1.49e-06 [offloading_packed_experts]: 3.45e-06 [overlap_recompute_and_grad_model_parallel]: 5.79999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.72999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.43e-06 [overlap_grad_ring_attention]: 4.27e-06 [overlap_grad_flash_sp]: 1.827e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.02999e-06 [split_layernorm_comm]: 1.80001e-06 [handle_group_info]: 9.5999e-07 [symbol_engine_optimizer]: 7.457e-05, [1] [Cycle 1]: 6.951e-05, [6] [build]: 2.51e-06 [elim_shapecalc]: 1.027e-05 [elim_not_effective]: 1.301e-05 [opt_reshape]: 7.15e-06 [fold_const_symbol]: 1.002e-05 [renormalize]: 1.79978e-07 [detach_backward]: 2.06e-06 [pipeline_parallel_scheduler]: 1.52999e-06 [auto_monad_reorder]: 1.816e-05 [get_jit_bprop_graph]: 1.18001e-06 [rewriter_after_jit_bprop_graph]: 3.48e-06 [opt_after_jit_grad]: 0.00046088 [validate]: 4.167e-05 [backend_pass]: 8.59989e-07 [task_emit]: 0.00700122 [execute]: 6.98998e-06 Sums bootstrap : 0.000483s : 0.66% type_inference : 0.058989s : 80.96% event_method : 0.000315s : 0.43% auto_monad : 0.000082s : 0.11% graph_reusing : 0.000007s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000027s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000038s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000276s : 0.38% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000066s : 0.09% optimize.opt_a.loop_unroll : 0.000049s : 0.07% optimize.opt_a.a_1 : 0.000834s : 1.14% optimize.opt_a.with_stream_mark : 0.000025s : 0.03% optimize.opt_a.recompute_prepare : 0.000016s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000165s : 0.23% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.02% optimize.opt_a.merge_send_recv : 0.000013s : 0.02% optimize.opt_a.auto_parallel : 0.000011s : 0.02% optimize.opt_a.parallel : 0.000023s : 0.03% optimize.opt_a.flash_sp : 0.000011s : 0.01% optimize.opt_a.merge_comm : 0.000007s : 0.01% optimize.opt_a.allreduce_fusion : 0.000007s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.02% optimize.opt_a.virtual_dataset : 0.000013s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.02% optimize.opt_a.virtual_output : 0.000012s : 0.02% optimize.opt_a.merge_forward : 0.000007s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000016s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.01% optimize.opt_a.meta_fg_expand : 0.000005s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.03% optimize.opt_a.a_after_grad : 0.000020s : 0.03% optimize.opt_a.renormalize : 0.001959s : 2.69% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.01% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.03% optimize.opt_a.cse : 0.000054s : 0.07% optimize.opt_a.a_3 : 0.000089s : 0.12% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000018s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000462s : 0.63% optimize.opt_b.b_1 : 0.000131s : 0.18% optimize.opt_b.b_2 : 0.000008s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000023s : 0.03% optimize.loop_unroll : 0.000423s : 0.58% optimize.opt_after_cconv.c_1 : 0.000032s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000017s : 0.02% optimize.tuple_transform.d_1 : 0.000042s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000044s : 0.06% optimize.cse_after_recomputation.cse : 0.000014s : 0.02% optimize.environ_conv : 0.000008s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000002s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000018s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000018s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000461s : 0.63% validate : 0.000042s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.007001s : 9.61% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.000186 27 1.11% : 0.000002s : 2: substitution.elim_not_effective 0.75% : 0.000001s : 2: substitution.fold_const_symbol 2.97% : 0.000006s : 4: substitution.graph_param_transform 82.84% : 0.000154s : 6: substitution.inline 1.57% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.46% : 0.000005s : 4: substitution.remove_not_recompute_node 1.44% : 0.000003s : 2: substitution.replace_old_param 6.87% : 0.000013s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.058915 2 96.31% : 0.056740s : 1: type_inference.infer 3.69% : 0.002176s : 1: type_inference.specialize ------[replace.] 0.000080 9 69.87% : 0.000056s : 6: replace.inline 30.13% : 0.000024s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000162 9 93.16% : 0.000151s : 6: match.inline 6.84% : 0.000011s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000210 1396 1.09% : 0.000002s : 15: predicate.accumulaten_eliminater 0.76% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.51% : 0.000001s : 8: predicate.addn_check_dump 0.97% : 0.000002s : 15: predicate.addn_zero_filter 0.91% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.12% : 0.000004s : 23: predicate.arithmetic_simplify 0.96% : 0.000002s : 15: predicate.cast_eliminate 0.58% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.51% : 0.000001s : 8: predicate.depend_value_elim 1.06% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.14% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.92% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.85% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.22% : 0.000000s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000002s : 19: predicate.environ_add_const_eliminate 1.14% : 0.000002s : 19: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 19: predicate.environ_get_depend_swap 1.69% : 0.000004s : 27: predicate.environ_get_eliminate 1.20% : 0.000003s : 19: predicate.environ_get_set_eliminate 1.55% : 0.000003s : 24: predicate.exchange_switch_depend_value 2.43% : 0.000005s : 24: predicate.float_depend_g_call 0.52% : 0.000001s : 8: predicate.float_environ_get_switch 0.75% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.61% : 0.000001s : 8: predicate.get_grad_eliminate 0.20% : 0.000000s : 4: predicate.graph_param_transform 0.54% : 0.000001s : 8: predicate.incorporate_call 0.45% : 0.000001s : 8: predicate.incorporate_call_switch 5.91% : 0.000012s : 63: predicate.inline 0.60% : 0.000001s : 8: predicate.inline_without_move 0.28% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.75% : 0.000002s : 8: predicate.less_batch_normalization 1.84% : 0.000004s : 26: predicate.list_to_tuple_eliminator_ 2.57% : 0.000005s : 41: predicate.load_eliminater 0.82% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.01% : 0.000006s : 48: predicate.loop_unroll_before_grad 1.65% : 0.000003s : 23: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 8: predicate.merge_addn 0.53% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.55% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 15: predicate.minmaximum_grad 0.90% : 0.000002s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.49% : 0.000001s : 4: predicate.parallel_virtual_node 2.05% : 0.000004s : 24: predicate.partial_defer_inline 1.47% : 0.000003s : 22: predicate.partial_eliminate 0.98% : 0.000002s : 15: predicate.print_const_string_wrapper 0.58% : 0.000001s : 8: predicate.reduce_all_const_elim 1.38% : 0.000003s : 15: predicate.reduce_eliminate 2.46% : 0.000005s : 41: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000001s : 8: predicate.remove_not_recompute_node 1.38% : 0.000003s : 26: predicate.replace_applicator 0.50% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 0.95% : 0.000002s : 15: predicate.reshape_eliminate 0.72% : 0.000002s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.66% : 0.000001s : 8: predicate.same_eliminate 0.45% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.67% : 0.000001s : 8: predicate.shard_identity_eliminate 0.67% : 0.000001s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 0.75% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.70% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.66% : 0.000003s : 24: predicate.switch_defer_inline 2.20% : 0.000005s : 32: predicate.switch_layer_defer_inline 6.02% : 0.000013s : 84: predicate.switch_simplify 0.98% : 0.000002s : 15: predicate.tile_eliminate 0.92% : 0.000002s : 15: predicate.transpose_eliminate 1.54% : 0.000003s : 23: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000003s : 23: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000003s : 23: predicate.tuple_list_get_item_depend_reorder 3.03% : 0.000006s : 34: predicate.tuple_list_get_item_eliminator 1.54% : 0.000003s : 23: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000005s : 31: predicate.tuple_list_set_item_eliminator 1.73% : 0.000004s : 26: predicate.tuple_to_list_eliminator_ 2.57% : 0.000005s : 41: predicate.updatestate_pure_node_eliminater 3.10% : 0.000007s : 49: predicate.updatestate_useless_node_eliminater 0.33% : 0.000001s : 4: predicate.value_based_eliminate 0.59% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.58% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.41% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002505 29 68.76% : 0.001722s : 21: func_graph_cloner_run.FuncGraphClonerGraph 31.24% : 0.000783s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.089753 196 0.00% : 0.000003s : 1: ForceFp32Comm 3.47% : 0.003114s : 1: add_attr 3.46% : 0.003105s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000047s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.10% : 0.000090s : 1: auto_monad 0.02% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000005s : 1: bias_add_comm_swap 0.57% : 0.000513s : 1: bootstrap 0.03% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.06% : 0.000055s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000011s : 1: environ_conv 0.37% : 0.000328s : 1: event_method 0.01% : 0.000012s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000005s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.48% : 0.000431s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.52% : 0.000470s : 1: mutable_eliminate 0.01% : 0.000006s : 1: offloading_packed_experts 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000015s : 1: opt.transform.mutable_eliminate 1.45% : 0.001301s : 78: opt.transform.opt_a 0.03% : 0.000030s : 1: opt.transform.opt_after_cconv 0.03% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.12% : 0.000112s : 28: opt.transform.opt_b 0.05% : 0.000047s : 2: opt.transform.opt_trans_graph 0.04% : 0.000037s : 4: opt.transform.symbol_engine_opt 4.55% : 0.004082s : 1: opt_a 0.12% : 0.000105s : 1: opt_after_cconv 0.52% : 0.000470s : 1: opt_after_jit_grad 0.24% : 0.000212s : 1: opt_b 6.92% : 0.006207s : 1: optimize 0.02% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.05% : 0.000042s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000020s : 1: remove_dup_value 1.25% : 0.001126s : 1: renormalize.infer 0.92% : 0.000825s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000021s : 1: rewriter_after_opt_a 0.31% : 0.000282s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.09% : 0.000077s : 1: symbol_engine_optimizer 7.81% : 0.007012s : 1: task_emit 0.08% : 0.000076s : 1: tuple_transform 65.74% : 0.059005s : 1: type_inference 0.08% : 0.000071s : 1: validate TotalTime = 0.276435, [24] [bootstrap]: 0.00047314 [type_inference]: 0.250136 [event_method]: 0.00164933 [auto_monad]: 0.00020517 [graph_reusing]: 1.296e-05 [inline]: 2.56998e-06 [add_attr]: 0.00337579, [1] [add_attr_with_inline]: 0.0033666, [1] [Cycle 1]: 0.00010577, [2] [tag_attr]: 5.647e-05 [meta_addattr_fg_expand]: 1.665e-05 [parallel-infer-symbol]: 2.85002e-06 [pre_auto_parallel]: 7.499e-05 [insert-virtual-dataset]: 2.17999e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 1.87001e-06 [pipeline_split]: 1.48002e-06 [optimize]: 0.0112101, [53] [py_interpret_to_execute]: 4.63999e-06 [rewriter_before_opt_a]: 0.00051423 [opt_a]: 0.00883032, [2] [Cycle 1]: 0.00818684, [45] [expand_dump_flag]: 6.68e-06 [switch_simplify]: 0.0002064 [loop_unroll]: 8.639e-05 [a_1]: 0.0017208 [with_stream_mark]: 1.462e-05 [recompute_prepare]: 9.07001e-06 [updatestate_depend_eliminate]: 4.13001e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 1.86998e-06 [a_2]: 8.443e-05 [accelerated_algorithm]: 6.83e-06 [shard]: 1.47001e-06 [meta_shard_fg_expand]: 3.70998e-06 [shard_inline]: 6.58e-06 [merge_send_recv]: 7.85e-06 [auto_parallel]: 5.57999e-06 [parallel]: 1.849e-05 [flash_sp]: 7.24001e-06 [merge_comm]: 3.43999e-06 [allreduce_fusion]: 3.36999e-06 [matmul_add_comm_reduction]: 8.17e-06 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 8.32e-06 [virtual_dataset]: 6.38003e-06 [get_grad_eliminate_]: 6.41998e-06 [virtual_output]: 7.11999e-06 [merge_forward]: 3.84002e-06 [cell_reuse_recompute_pass]: 1.04998e-06 [offload_activation]: 8.79998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.313e-05 [merge_recompute_call_nodes]: 1.38002e-06 [before_grad]: 9.34e-06 [set_forward_comm_id_for_comm_node_pass]: 3.25002e-06 [meta_fg_expand]: 4.85999e-06 [flash_sp_send_recv_attached]: 2.40002e-06 [receive_attached]: 2.21e-06 [after_resolve]: 9.88998e-06 [a_after_grad]: 1.006e-05 [renormalize]: 0.00551436 [add_forward_monad_depend]: 5.52001e-06 [auto_monad_grad]: 2.14e-06 [auto_monad_eliminator]: 1.536e-05 [cse]: 3.387e-05 [a_3]: 5.162e-05 [Cycle 2]: 0.00063331, [45] [expand_dump_flag]: 1.18001e-06 [switch_simplify]: 8.39002e-06 [loop_unroll]: 7.19001e-06 [a_1]: 0.00013179 [with_stream_mark]: 1.111e-05 [recompute_prepare]: 6.51e-06 [updatestate_depend_eliminate]: 3.03998e-06 [updatestate_assign_eliminate]: 2.53998e-06 [updatestate_loads_eliminate]: 2.22001e-06 [parameter_eliminate]: 1.02998e-06 [a_2]: 7.579e-05 [accelerated_algorithm]: 6.43003e-06 [shard]: 1.05999e-06 [meta_shard_fg_expand]: 1.49998e-06 [shard_inline]: 6.10002e-06 [merge_send_recv]: 4.46002e-06 [auto_parallel]: 5.39e-06 [parallel]: 4.52e-06 [flash_sp]: 2.98998e-06 [merge_comm]: 3.01999e-06 [allreduce_fusion]: 2.76e-06 [matmul_add_comm_reduction]: 4.93001e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 7.80998e-06 [virtual_dataset]: 6.24001e-06 [get_grad_eliminate_]: 6.19001e-06 [virtual_output]: 5.87001e-06 [merge_forward]: 2.59999e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 6.27001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.344e-05 [merge_recompute_call_nodes]: 6.59988e-07 [before_grad]: 9.25999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.18e-06 [meta_fg_expand]: 2.02001e-06 [flash_sp_send_recv_attached]: 8.49977e-07 [receive_attached]: 9.80013e-07 [after_resolve]: 9.07001e-06 [a_after_grad]: 9.69999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.09998e-06 [auto_monad_grad]: 1.05999e-06 [auto_monad_eliminator]: 6.31e-06 [cse]: 1.545e-05 [a_3]: 3.698e-05 [py_interpret_to_execute_after_opt_a]: 4.82998e-06 [slice_cell_reuse_recomputed_activation]: 1.97001e-06 [rewriter_after_opt_a]: 1.688e-05 [convert_after_rewriter]: 1.61998e-06 [order_py_execute_after_rewriter]: 1.12999e-06 [mutable_eliminate]: 0.00048793 [opt_b]: 0.00021023, [1] [Cycle 1]: 0.00020396, [7] [b_1]: 0.00013182 [b_2]: 7.8e-06 [updatestate_depend_eliminate]: 5.19e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 2.26e-06 [renormalize]: 4.00003e-07 [cse]: 2.044e-05 [optimize_parallel_all_gather_comm]: 1.569e-05 [overlap_param_gather]: 2.16e-06 [cconv]: 2.182e-05 [loop_unroll]: 0.00044287 [opt_after_cconv]: 0.00010492, [1] [Cycle 1]: 9.922e-05, [7] [c_1]: 3.36e-05 [parameter_eliminate]: 2.43e-06 [updatestate_depend_eliminate]: 5.26998e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.23002e-06 [cse]: 2.012e-05 [renormalize]: 3.39991e-07 [remove_dup_value]: 1.564e-05 [tuple_transform]: 7.558e-05, [1] [Cycle 1]: 7.093e-05, [4] [d_1]: 4.38e-05 [none_parameter_eliminate]: 2.07999e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.97997e-06 [partial_unused_args_eliminate]: 1.64e-06 [add_recomputation]: 4.401e-05 [cse_after_recomputation]: 2.381e-05, [1] [Cycle 1]: 1.913e-05, [1] [cse]: 1.386e-05 [environ_conv]: 7.91001e-06 [swap_dp_allreduce_reducescatter]: 5.39e-06 [bias_add_comm_swap]: 2.86e-06 [label_micro_interleaved_index]: 4.05e-06 [label_fine_grained_interleaved_index]: 2.64001e-06 [merge_cast_opt]: 1.17e-06 [slice_recompute_activation]: 2.03002e-06 [micro_interleaved_order_control]: 2.49001e-06 [assign_add_opt]: 1.15999e-06 [ForceFp32Comm]: 6.99976e-07 [remove_cast_before_assign_add]: 9.49978e-07 [full_micro_interleaved_order_control]: 2.08002e-06 [reorder_send_recv_between_fp_bp]: 2.81999e-06 [comm_op_add_attrs]: 1.29e-06 [add_comm_op_reuse_tag]: 8.90024e-07 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.00999e-06 [overlap_opt_shard_in_pipeline]: 1.14003e-06 [overlap_opt_shard_grad_in_pipeline]: 1.63002e-06 [control_data_broadcast_order]: 1.178e-05 [grouped_pairwise_exchange_alltoall]: 2.07999e-06 [offloading_packed_experts]: 3.84002e-06 [overlap_recompute_and_grad_model_parallel]: 4.33999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.29e-06 [overlap_recompute_comm]: 1.99e-06 [overlap_grad_ring_attention]: 3.75e-06 [overlap_grad_flash_sp]: 1.599e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.10002e-06 [split_layernorm_comm]: 1.59998e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 7.324e-05, [1] [Cycle 1]: 6.938e-05, [6] [build]: 2.56e-06 [elim_shapecalc]: 9.94001e-06 [elim_not_effective]: 1.247e-05 [opt_reshape]: 7.03998e-06 [fold_const_symbol]: 1.028e-05 [renormalize]: 1.50001e-07 [detach_backward]: 1.49e-06 [pipeline_parallel_scheduler]: 1.64998e-06 [auto_monad_reorder]: 1.88e-05 [get_jit_bprop_graph]: 1.04e-06 [rewriter_after_jit_bprop_graph]: 3.40998e-06 [opt_after_jit_grad]: 0.00047677 [validate]: 4.2e-05 [backend_pass]: 9.00007e-07 [task_emit]: 0.00849993 [execute]: 7.09001e-06 Sums bootstrap : 0.000473s : 0.17% type_inference : 0.250136s : 91.93% event_method : 0.001649s : 0.61% auto_monad : 0.000205s : 0.08% graph_reusing : 0.000013s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000056s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000017s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000075s : 0.03% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000514s : 0.19% optimize.opt_a.expand_dump_flag : 0.000008s : 0.00% optimize.opt_a.switch_simplify : 0.000215s : 0.08% optimize.opt_a.loop_unroll : 0.000094s : 0.03% optimize.opt_a.a_1 : 0.001853s : 0.68% optimize.opt_a.with_stream_mark : 0.000026s : 0.01% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000160s : 0.06% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000012s : 0.00% optimize.opt_a.auto_parallel : 0.000011s : 0.00% optimize.opt_a.parallel : 0.000023s : 0.01% optimize.opt_a.flash_sp : 0.000010s : 0.00% optimize.opt_a.merge_comm : 0.000006s : 0.00% optimize.opt_a.allreduce_fusion : 0.000006s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000013s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000006s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000015s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000006s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.01% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.005514s : 2.03% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.01% optimize.opt_a.cse : 0.000049s : 0.02% optimize.opt_a.a_3 : 0.000089s : 0.03% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000017s : 0.01% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000488s : 0.18% optimize.opt_b.b_1 : 0.000132s : 0.05% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000022s : 0.01% optimize.loop_unroll : 0.000443s : 0.16% optimize.opt_after_cconv.c_1 : 0.000034s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.01% optimize.tuple_transform.d_1 : 0.000044s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000044s : 0.02% optimize.cse_after_recomputation.cse : 0.000014s : 0.01% optimize.environ_conv : 0.000008s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000016s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000019s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000477s : 0.18% validate : 0.000042s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.008500s : 3.12% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.000471 58 0.50% : 0.000002s : 2: substitution.elim_not_effective 0.67% : 0.000003s : 4: substitution.float_depend_g_call 0.26% : 0.000001s : 2: substitution.fold_const_symbol 1.27% : 0.000006s : 4: substitution.graph_param_transform 87.95% : 0.000415s : 23: substitution.inline 0.60% : 0.000003s : 4: substitution.j_node_and_user_rematch 0.89% : 0.000004s : 4: substitution.partial_eliminate 1.05% : 0.000005s : 4: substitution.remove_not_recompute_node 0.55% : 0.000003s : 2: substitution.replace_old_param 3.29% : 0.000015s : 6: substitution.switch_simplify 3.00% : 0.000014s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.250031 2 96.99% : 0.242514s : 1: type_inference.infer 3.01% : 0.007517s : 1: type_inference.specialize ------[replace.] 0.000235 32 61.36% : 0.000144s : 23: replace.inline 25.06% : 0.000059s : 6: replace.switch_simplify 13.58% : 0.000032s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000428 32 94.29% : 0.000404s : 23: match.inline 2.79% : 0.000012s : 6: match.switch_simplify 2.92% : 0.000012s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000365 2421 1.13% : 0.000004s : 32: predicate.accumulaten_eliminater 0.53% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.28% : 0.000001s : 8: predicate.addn_check_dump 1.14% : 0.000004s : 32: predicate.addn_zero_filter 1.10% : 0.000004s : 32: predicate.adjust_all_reduce_mul_add 2.25% : 0.000008s : 40: predicate.arithmetic_simplify 1.27% : 0.000005s : 32: predicate.cast_eliminate 0.37% : 0.000001s : 8: predicate.check_bprop_eliminate 0.30% : 0.000001s : 8: predicate.compare_switch_simplify 0.12% : 0.000000s : 4: predicate.const_output_eliminate 0.33% : 0.000001s : 8: predicate.depend_value_elim 1.18% : 0.000004s : 32: predicate.dict_get_item_const_eliminator 1.35% : 0.000005s : 32: predicate.dict_get_item_eliminator 1.17% : 0.000004s : 32: predicate.dict_set_item_eliminator 0.48% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.12% : 0.000000s : 4: predicate.elim_not_effective 0.19% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.24% : 0.000005s : 36: predicate.environ_add_const_eliminate 1.23% : 0.000004s : 36: predicate.environ_get_add_eliminate 1.28% : 0.000005s : 36: predicate.environ_get_depend_swap 1.68% : 0.000006s : 44: predicate.environ_get_eliminate 1.27% : 0.000005s : 36: predicate.environ_get_set_eliminate 2.12% : 0.000008s : 58: predicate.exchange_switch_depend_value 2.98% : 0.000011s : 58: predicate.float_depend_g_call 0.29% : 0.000001s : 8: predicate.float_environ_get_switch 0.42% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.11% : 0.000000s : 4: predicate.fold_const_symbol 0.36% : 0.000001s : 8: predicate.get_grad_eliminate 0.12% : 0.000000s : 4: predicate.graph_param_transform 0.30% : 0.000001s : 8: predicate.incorporate_call 0.26% : 0.000001s : 8: predicate.incorporate_call_switch 6.06% : 0.000022s : 114: predicate.inline 0.37% : 0.000001s : 8: predicate.inline_without_move 0.17% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.42% : 0.000002s : 8: predicate.less_batch_normalization 1.66% : 0.000006s : 43: predicate.list_to_tuple_eliminator_ 2.55% : 0.000009s : 75: predicate.load_eliminater 0.54% : 0.000002s : 4: predicate.loop_unroll_after_grad 4.24% : 0.000015s : 104: predicate.loop_unroll_before_grad 1.68% : 0.000006s : 40: predicate.make_slice_get_slice_eliminator 0.32% : 0.000001s : 8: predicate.merge_addn 0.31% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.34% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.07% : 0.000004s : 32: predicate.minmaximum_grad 0.58% : 0.000002s : 4: predicate.mutable_eliminate 0.19% : 0.000001s : 4: predicate.opt_reshape 0.29% : 0.000001s : 4: predicate.parallel_virtual_node 3.33% : 0.000012s : 58: predicate.partial_defer_inline 1.57% : 0.000006s : 39: predicate.partial_eliminate 1.20% : 0.000004s : 32: predicate.print_const_string_wrapper 0.40% : 0.000001s : 8: predicate.reduce_all_const_elim 1.47% : 0.000005s : 32: predicate.reduce_eliminate 2.66% : 0.000010s : 75: predicate.redundant_stop_gradient_eliminater 0.22% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000005s : 43: predicate.replace_applicator 0.25% : 0.000001s : 8: predicate.replace_old_param 0.15% : 0.000001s : 4: predicate.reset_defer_inline 1.19% : 0.000004s : 32: predicate.reshape_eliminate 0.32% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.22% : 0.000001s : 4: predicate.row_tensor_eliminate 0.40% : 0.000001s : 8: predicate.same_eliminate 0.24% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.52% : 0.000002s : 8: predicate.shard_identity_eliminate 0.43% : 0.000002s : 8: predicate.special_op_eliminate 0.35% : 0.000001s : 8: predicate.specialize_transform 0.41% : 0.000001s : 8: predicate.split_environ_get_set_with_tuple_value 0.39% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.18% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.50% : 0.000009s : 58: predicate.switch_defer_inline 2.76% : 0.000010s : 66: predicate.switch_layer_defer_inline 8.27% : 0.000030s : 186: predicate.switch_simplify 1.18% : 0.000004s : 32: predicate.tile_eliminate 1.17% : 0.000004s : 32: predicate.transpose_eliminate 1.63% : 0.000006s : 40: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000006s : 40: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000005s : 40: predicate.tuple_list_get_item_depend_reorder 2.50% : 0.000009s : 51: predicate.tuple_list_get_item_eliminator 1.52% : 0.000006s : 40: predicate.tuple_list_get_set_item_eliminator 1.97% : 0.000007s : 48: predicate.tuple_list_set_item_eliminator 1.65% : 0.000006s : 43: predicate.tuple_to_list_eliminator_ 2.48% : 0.000009s : 75: predicate.updatestate_pure_node_eliminater 2.96% : 0.000011s : 83: predicate.updatestate_useless_node_eliminater 0.20% : 0.000001s : 4: predicate.value_based_eliminate 0.37% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.36% : 0.000001s : 8: predicate.virtual_output_eliminate 0.17% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.24% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.007661 68 67.13% : 0.005143s : 42: func_graph_cloner_run.FuncGraphClonerGraph 32.87% : 0.002518s : 26: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.299217 196 0.00% : 0.000003s : 1: ForceFp32Comm 1.13% : 0.003380s : 1: add_attr 1.13% : 0.003371s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000048s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.07% : 0.000216s : 1: auto_monad 0.01% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.17% : 0.000502s : 1: bootstrap 0.01% : 0.000025s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000011s : 1: environ_conv 0.56% : 0.001662s : 1: event_method 0.00% : 0.000012s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000017s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000003s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.15% : 0.000451s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.17% : 0.000497s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 0.84% : 0.002505s : 78: opt.transform.opt_a 0.01% : 0.000032s : 1: opt.transform.opt_after_cconv 0.01% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000111s : 28: opt.transform.opt_b 0.02% : 0.000049s : 2: opt.transform.opt_trans_graph 0.01% : 0.000037s : 4: opt.transform.symbol_engine_opt 2.95% : 0.008834s : 1: opt_a 0.04% : 0.000108s : 1: opt_after_cconv 0.16% : 0.000486s : 1: opt_after_jit_grad 0.07% : 0.000214s : 1: opt_b 3.75% : 0.011215s : 1: optimize 0.01% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000019s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.03% : 0.000080s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000003s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 1.06% : 0.003172s : 1: renormalize.infer 0.78% : 0.002333s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000020s : 1: rewriter_after_opt_a 0.17% : 0.000522s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000076s : 1: symbol_engine_optimizer 2.84% : 0.008510s : 1: task_emit 0.03% : 0.000078s : 1: tuple_transform 83.60% : 0.250155s : 1: type_inference 0.02% : 0.000071s : 1: validate TotalTime = 2.5921, [24] [bootstrap]: 0.00161886 [type_inference]: 0.138496 [event_method]: 6.599e-05 [auto_monad]: 0.00026265 [graph_reusing]: 1.83e-05 [inline]: 2.61e-06 [add_attr]: 0.00501944, [1] [add_attr_with_inline]: 0.00501014, [1] [Cycle 1]: 0.00014653, [2] [tag_attr]: 6.71e-05 [meta_addattr_fg_expand]: 2.166e-05 [parallel-infer-symbol]: 3.75e-06 [pre_auto_parallel]: 9.206e-05 [insert-virtual-dataset]: 3.07002e-06 [parallel-infer-symbol-second]: 1.19e-06 [dataset_repeat_opt]: 1.79998e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.0487494, [53] [py_interpret_to_execute]: 5.53002e-06 [rewriter_before_opt_a]: 0.00051899 [opt_a]: 0.0461675, [3] [Cycle 1]: 0.0391394, [45] [expand_dump_flag]: 6.28998e-06 [switch_simplify]: 0.0002611 [loop_unroll]: 0.00011027 [a_1]: 0.00234081 [with_stream_mark]: 2.971e-05 [recompute_prepare]: 2.634e-05 [updatestate_depend_eliminate]: 1.134e-05 [updatestate_assign_eliminate]: 1.035e-05 [updatestate_loads_eliminate]: 9.92999e-06 [parameter_eliminate]: 2.83e-06 [a_2]: 0.00053202 [accelerated_algorithm]: 1.771e-05 [shard]: 1.60999e-06 [meta_shard_fg_expand]: 6.54001e-06 [shard_inline]: 1.716e-05 [merge_send_recv]: 1.933e-05 [auto_parallel]: 1.29e-05 [parallel]: 6.313e-05 [flash_sp]: 1.196e-05 [merge_comm]: 1.239e-05 [allreduce_fusion]: 1.036e-05 [matmul_add_comm_reduction]: 3.08e-05 [allreduce_slice_to_reducescatter]: 7.7e-07 [virtual_shard_identity]: 2.065e-05 [virtual_dataset]: 1.651e-05 [get_grad_eliminate_]: 1.72e-05 [virtual_output]: 1.681e-05 [merge_forward]: 1.077e-05 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 1.919e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.164e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 2.924e-05 [set_forward_comm_id_for_comm_node_pass]: 1.118e-05 [meta_fg_expand]: 0.00337282 [flash_sp_send_recv_attached]: 5.35999e-06 [receive_attached]: 2.41e-06 [after_resolve]: 0.00012495 [a_after_grad]: 0.00017351 [renormalize]: 0.0295134 [add_forward_monad_depend]: 1.895e-05 [auto_monad_grad]: 1.268e-05 [auto_monad_eliminator]: 0.00011193 [cse]: 0.00033997 [a_3]: 0.00140525 [Cycle 2]: 0.00641719, [45] [expand_dump_flag]: 3.71001e-06 [switch_simplify]: 9.177e-05 [loop_unroll]: 8.565e-05 [a_1]: 0.00232431 [with_stream_mark]: 1.961e-05 [recompute_prepare]: 1.242e-05 [updatestate_depend_eliminate]: 6.79001e-06 [updatestate_assign_eliminate]: 6.06e-06 [updatestate_loads_eliminate]: 5.24e-06 [parameter_eliminate]: 2.01e-06 [a_2]: 0.00014332 [accelerated_algorithm]: 2.708e-05 [shard]: 1.64998e-06 [meta_shard_fg_expand]: 3.88001e-06 [shard_inline]: 1.012e-05 [merge_send_recv]: 1.078e-05 [auto_parallel]: 1.079e-05 [parallel]: 7.6e-06 [flash_sp]: 3.92998e-06 [merge_comm]: 6.74999e-06 [allreduce_fusion]: 5.96e-06 [matmul_add_comm_reduction]: 1.13e-05 [allreduce_slice_to_reducescatter]: 6.60017e-07 [virtual_shard_identity]: 1.054e-05 [virtual_dataset]: 9.47999e-06 [get_grad_eliminate_]: 1.026e-05 [virtual_output]: 9.52001e-06 [merge_forward]: 6.56e-06 [cell_reuse_recompute_pass]: 1.23002e-06 [offload_activation]: 1.305e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.048e-05 [merge_recompute_call_nodes]: 1.00999e-06 [before_grad]: 1.814e-05 [set_forward_comm_id_for_comm_node_pass]: 7.16999e-06 [meta_fg_expand]: 0.00055535 [flash_sp_send_recv_attached]: 2.62001e-06 [receive_attached]: 2.36e-06 [after_resolve]: 1.826e-05 [a_after_grad]: 1.557e-05 [renormalize]: 0.0025637 [add_forward_monad_depend]: 4.57e-06 [auto_monad_grad]: 2.10002e-06 [auto_monad_eliminator]: 1.221e-05 [cse]: 2.37e-05 [a_3]: 4.127e-05 [Cycle 3]: 0.0005924, [45] [expand_dump_flag]: 1.20001e-06 [switch_simplify]: 7.18e-06 [loop_unroll]: 5.84e-06 [a_1]: 0.00013341 [with_stream_mark]: 9.39e-06 [recompute_prepare]: 5.77001e-06 [updatestate_depend_eliminate]: 3.28e-06 [updatestate_assign_eliminate]: 2.12001e-06 [updatestate_loads_eliminate]: 2.08002e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 6.941e-05 [accelerated_algorithm]: 5.45001e-06 [shard]: 1.22e-06 [meta_shard_fg_expand]: 1.26997e-06 [shard_inline]: 5.27999e-06 [merge_send_recv]: 4.68999e-06 [auto_parallel]: 5.07999e-06 [parallel]: 4.4e-06 [flash_sp]: 9.39996e-07 [merge_comm]: 2.69001e-06 [allreduce_fusion]: 2.58e-06 [matmul_add_comm_reduction]: 5.04e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 7.03e-06 [virtual_dataset]: 5.36998e-06 [get_grad_eliminate_]: 5.09e-06 [virtual_output]: 5.32001e-06 [merge_forward]: 2.56e-06 [cell_reuse_recompute_pass]: 1.49e-06 [offload_activation]: 5.86e-06 [cell_reuse_handle_not_recompute_node_pass]: 9.86e-06 [merge_recompute_call_nodes]: 8.49977e-07 [before_grad]: 8.33999e-06 [set_forward_comm_id_for_comm_node_pass]: 2.96001e-06 [meta_fg_expand]: 1.91e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 1.00001e-06 [after_resolve]: 5.39e-06 [a_after_grad]: 7.83001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.12e-06 [auto_monad_grad]: 9.89996e-07 [auto_monad_eliminator]: 5.35001e-06 [cse]: 1.349e-05 [a_3]: 3.047e-05 [py_interpret_to_execute_after_opt_a]: 5.19e-06 [slice_cell_reuse_recomputed_activation]: 2.02001e-06 [rewriter_after_opt_a]: 1.906e-05 [convert_after_rewriter]: 1.65001e-06 [order_py_execute_after_rewriter]: 1.05001e-06 [mutable_eliminate]: 0.00063003 [opt_b]: 0.00018393, [1] [Cycle 1]: 0.00017641, [7] [b_1]: 0.00010758 [b_2]: 7.3e-06 [updatestate_depend_eliminate]: 5.00999e-06 [updatestate_assign_eliminate]: 2.22999e-06 [updatestate_loads_eliminate]: 2.02999e-06 [renormalize]: 5.49975e-07 [cse]: 1.679e-05 [optimize_parallel_all_gather_comm]: 1.584e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 2.355e-05 [loop_unroll]: 0.00045105 [opt_after_cconv]: 9.472e-05, [1] [Cycle 1]: 8.899e-05, [7] [c_1]: 2.673e-05 [parameter_eliminate]: 2.41e-06 [updatestate_depend_eliminate]: 4.85001e-06 [updatestate_assign_eliminate]: 2.22001e-06 [updatestate_loads_eliminate]: 1.99e-06 [cse]: 1.809e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 1.515e-05 [tuple_transform]: 6.717e-05, [1] [Cycle 1]: 6.308e-05, [4] [d_1]: 3.719e-05 [none_parameter_eliminate]: 1.69e-06 [renormalize]: 1.70025e-07 [switch_simplify]: 5.99999e-06 [partial_unused_args_eliminate]: 1.90001e-06 [add_recomputation]: 5.759e-05 [cse_after_recomputation]: 2.165e-05, [1] [Cycle 1]: 1.709e-05, [1] [cse]: 1.17e-05 [environ_conv]: 7e-06 [swap_dp_allreduce_reducescatter]: 5.42999e-06 [bias_add_comm_swap]: 2.53e-06 [label_micro_interleaved_index]: 4.77e-06 [label_fine_grained_interleaved_index]: 2.58e-06 [merge_cast_opt]: 1.54e-06 [slice_recompute_activation]: 1.94e-06 [micro_interleaved_order_control]: 2.06e-06 [assign_add_opt]: 1.20001e-06 [ForceFp32Comm]: 1.06002e-06 [remove_cast_before_assign_add]: 1.25999e-06 [full_micro_interleaved_order_control]: 2.46998e-06 [reorder_send_recv_between_fp_bp]: 2.57001e-06 [comm_op_add_attrs]: 1.10001e-06 [add_comm_op_reuse_tag]: 1.26002e-06 [interleave_split_concat_branches]: 1.20999e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 1.784e-05 [overlap_opt_shard_grad_in_pipeline]: 1.90001e-06 [control_data_broadcast_order]: 1.14e-05 [grouped_pairwise_exchange_alltoall]: 1.93002e-06 [offloading_packed_experts]: 3.4e-06 [overlap_recompute_and_grad_model_parallel]: 4.85001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.09e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.25002e-06 [overlap_grad_ring_attention]: 4.3e-06 [overlap_grad_flash_sp]: 1.83e-05 [begin_end_overlap_inline]: 5.89993e-07 [split_matmul_comm_elemetwise]: 2.14e-06 [split_layernorm_comm]: 1.66998e-06 [handle_group_info]: 9.39996e-07 [symbol_engine_optimizer]: 0.0001201, [1] [Cycle 1]: 0.0001157, [6] [build]: 2.96999e-06 [elim_shapecalc]: 8.94e-06 [elim_not_effective]: 1.217e-05 [opt_reshape]: 6.74999e-06 [fold_const_symbol]: 5.502e-05 [renormalize]: 1.80007e-07 [detach_backward]: 2.36e-06 [pipeline_parallel_scheduler]: 1.47001e-06 [auto_monad_reorder]: 1.698e-05 [get_jit_bprop_graph]: 1.92001e-06 [rewriter_after_jit_bprop_graph]: 3.97e-06 [opt_after_jit_grad]: 0.00049025 [validate]: 3.707e-05 [backend_pass]: 2.06e-06 [task_emit]: 2.39623 [execute]: 1.216e-05 Sums bootstrap : 0.001619s : 0.06% type_inference : 0.138496s : 5.36% event_method : 0.000066s : 0.00% auto_monad : 0.000263s : 0.01% graph_reusing : 0.000018s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000067s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000022s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000092s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000519s : 0.02% optimize.opt_a.expand_dump_flag : 0.000011s : 0.00% optimize.opt_a.switch_simplify : 0.000360s : 0.01% optimize.opt_a.loop_unroll : 0.000202s : 0.01% optimize.opt_a.a_1 : 0.004799s : 0.19% optimize.opt_a.with_stream_mark : 0.000059s : 0.00% optimize.opt_a.recompute_prepare : 0.000045s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000021s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000017s : 0.00% optimize.opt_a.parameter_eliminate : 0.000006s : 0.00% optimize.opt_a.a_2 : 0.000745s : 0.03% optimize.opt_a.accelerated_algorithm : 0.000050s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000012s : 0.00% optimize.opt_a.shard_inline : 0.000033s : 0.00% optimize.opt_a.merge_send_recv : 0.000035s : 0.00% optimize.opt_a.auto_parallel : 0.000029s : 0.00% optimize.opt_a.parallel : 0.000075s : 0.00% optimize.opt_a.flash_sp : 0.000017s : 0.00% optimize.opt_a.merge_comm : 0.000022s : 0.00% optimize.opt_a.allreduce_fusion : 0.000019s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000047s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000038s : 0.00% optimize.opt_a.virtual_dataset : 0.000031s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000033s : 0.00% optimize.opt_a.virtual_output : 0.000032s : 0.00% optimize.opt_a.merge_forward : 0.000020s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000038s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000062s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000056s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000021s : 0.00% optimize.opt_a.meta_fg_expand : 0.003930s : 0.15% optimize.opt_a.flash_sp_send_recv_attached : 0.000009s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000149s : 0.01% optimize.opt_a.a_after_grad : 0.000197s : 0.01% optimize.opt_a.renormalize : 0.032077s : 1.24% optimize.opt_a.add_forward_monad_depend : 0.000025s : 0.00% optimize.opt_a.auto_monad_grad : 0.000016s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000129s : 0.01% optimize.opt_a.cse : 0.000377s : 0.01% optimize.opt_a.a_3 : 0.001477s : 0.06% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000019s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000630s : 0.02% optimize.opt_b.b_1 : 0.000108s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000024s : 0.00% optimize.loop_unroll : 0.000451s : 0.02% optimize.opt_after_cconv.c_1 : 0.000027s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.00% optimize.tuple_transform.d_1 : 0.000037s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000058s : 0.00% optimize.cse_after_recomputation.cse : 0.000012s : 0.00% optimize.environ_conv : 0.000007s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000018s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000011s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000018s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000055s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000017s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000490s : 0.02% validate : 0.000037s : 0.00% backend_pass : 0.000002s : 0.00% task_emit : 2.396232s : 92.70% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 0.001383 274 0.40% : 0.000005s : 2: substitution.addn_check_dump 2.16% : 0.000030s : 2: substitution.addn_zero_filter 0.44% : 0.000006s : 2: substitution.adjust_all_reduce_mul_add 0.23% : 0.000003s : 1: substitution.depend_value_elim 0.14% : 0.000002s : 2: substitution.elim_not_effective 0.63% : 0.000009s : 12: substitution.float_depend_g_call 0.34% : 0.000005s : 4: substitution.float_tuple_getitem_switch 0.15% : 0.000002s : 2: substitution.fold_const_symbol 0.41% : 0.000006s : 3: substitution.graph_param_transform 0.32% : 0.000004s : 4: substitution.incorporate_call 0.23% : 0.000003s : 4: substitution.incorporate_call_switch 68.07% : 0.000941s : 40: substitution.inline 2.09% : 0.000029s : 4: substitution.inline_without_move 0.86% : 0.000012s : 18: substitution.j_node_and_user_rematch 0.94% : 0.000013s : 2: substitution.less_batch_normalization 0.66% : 0.000009s : 2: substitution.merge_addn 1.05% : 0.000015s : 10: substitution.minmaximum_grad 1.83% : 0.000025s : 12: substitution.partial_eliminate 1.04% : 0.000014s : 18: substitution.remove_not_recompute_node 5.32% : 0.000074s : 34: substitution.replace_applicator 0.77% : 0.000011s : 21: substitution.replace_old_param 0.19% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.38% : 0.000019s : 8: substitution.switch_simplify 1.93% : 0.000027s : 10: substitution.tuple_list_convert_item_index_to_positive 0.93% : 0.000013s : 10: substitution.tuple_list_get_item_const_eliminator 1.36% : 0.000019s : 10: substitution.tuple_list_get_item_depend_reorder 4.83% : 0.000067s : 26: substitution.tuple_list_get_item_eliminator 1.31% : 0.000018s : 10: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.137747 2 95.56% : 0.131628s : 1: type_inference.infer 4.44% : 0.006119s : 1: type_inference.specialize ------[replace.] 0.000545 63 2.12% : 0.000012s : 1: replace.depend_value_elim 53.47% : 0.000291s : 40: replace.inline 7.14% : 0.000039s : 2: replace.replace_applicator 13.96% : 0.000076s : 8: replace.switch_simplify 23.31% : 0.000127s : 12: replace.tuple_list_get_item_eliminator ------[match.] 0.000979 63 0.24% : 0.000002s : 1: match.depend_value_elim 94.08% : 0.000921s : 40: match.inline 1.18% : 0.000012s : 2: match.replace_applicator 1.50% : 0.000015s : 8: match.switch_simplify 3.00% : 0.000029s : 12: match.tuple_list_get_item_eliminator ------[predicate.] 0.001078 7429 1.00% : 0.000011s : 81: predicate.accumulaten_eliminater 0.13% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.48% : 0.000005s : 37: predicate.addn_check_dump 1.09% : 0.000012s : 81: predicate.addn_zero_filter 0.96% : 0.000010s : 81: predicate.adjust_all_reduce_mul_add 2.24% : 0.000024s : 118: predicate.arithmetic_simplify 1.07% : 0.000012s : 81: predicate.cast_eliminate 2.34% : 0.000025s : 193: predicate.check_bprop_eliminate 0.47% : 0.000005s : 37: predicate.compare_switch_simplify 0.03% : 0.000000s : 3: predicate.const_output_eliminate 0.48% : 0.000005s : 37: predicate.depend_value_elim 1.07% : 0.000012s : 81: predicate.dict_get_item_const_eliminator 1.14% : 0.000012s : 81: predicate.dict_get_item_eliminator 1.00% : 0.000011s : 81: predicate.dict_set_item_eliminator 0.13% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.04% : 0.000000s : 3: predicate.elim_not_effective 0.06% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.03% : 0.000011s : 84: predicate.environ_add_const_eliminate 1.04% : 0.000011s : 84: predicate.environ_get_add_eliminate 1.03% : 0.000011s : 84: predicate.environ_get_depend_swap 1.56% : 0.000017s : 121: predicate.environ_get_eliminate 1.06% : 0.000011s : 84: predicate.environ_get_set_eliminate 1.79% : 0.000019s : 133: predicate.exchange_switch_depend_value 2.41% : 0.000026s : 133: predicate.float_depend_g_call 0.48% : 0.000005s : 37: predicate.float_environ_get_switch 0.55% : 0.000006s : 40: predicate.float_tuple_getitem_switch 0.03% : 0.000000s : 3: predicate.fold_const_symbol 0.37% : 0.000004s : 22: predicate.get_grad_eliminate 0.04% : 0.000000s : 3: predicate.graph_param_transform 0.49% : 0.000005s : 37: predicate.incorporate_call 0.45% : 0.000005s : 37: predicate.incorporate_call_switch 4.89% : 0.000053s : 279: predicate.inline 1.68% : 0.000018s : 93: predicate.inline_without_move 0.15% : 0.000002s : 22: predicate.j_node_and_user_rematch 0.42% : 0.000005s : 22: predicate.less_batch_normalization 1.25% : 0.000014s : 99: predicate.list_to_tuple_eliminator_ 2.16% : 0.000023s : 180: predicate.load_eliminater 0.16% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.88% : 0.000031s : 219: predicate.loop_unroll_before_grad 1.13% : 0.000012s : 87: predicate.make_slice_get_slice_eliminator 0.52% : 0.000006s : 37: predicate.merge_addn 2.23% : 0.000024s : 184: predicate.micro_step_allgather_replace 2.25% : 0.000024s : 184: predicate.mini_step_allgather_replace 0.99% : 0.000011s : 81: predicate.minmaximum_grad 0.16% : 0.000002s : 3: predicate.mutable_eliminate 0.06% : 0.000001s : 3: predicate.opt_reshape 0.07% : 0.000001s : 3: predicate.parallel_virtual_node 2.30% : 0.000025s : 133: predicate.partial_defer_inline 1.39% : 0.000015s : 96: predicate.partial_eliminate 1.03% : 0.000011s : 81: predicate.print_const_string_wrapper 0.47% : 0.000005s : 35: predicate.reduce_all_const_elim 1.34% : 0.000014s : 81: predicate.reduce_eliminate 2.21% : 0.000024s : 180: predicate.redundant_stop_gradient_eliminater 0.17% : 0.000002s : 22: predicate.remove_not_recompute_node 2.28% : 0.000025s : 281: predicate.replace_applicator 0.73% : 0.000008s : 93: predicate.replace_old_param 0.04% : 0.000000s : 3: predicate.reset_defer_inline 1.08% : 0.000012s : 81: predicate.reshape_eliminate 2.33% : 0.000025s : 184: predicate.row_tensor_add_zeros_like 0.07% : 0.000001s : 3: predicate.row_tensor_eliminate 2.53% : 0.000027s : 193: predicate.same_eliminate 0.22% : 0.000002s : 24: predicate.set_cell_output_no_recompute 0.41% : 0.000004s : 22: predicate.shard_identity_eliminate 0.11% : 0.000001s : 6: predicate.special_op_eliminate 0.56% : 0.000006s : 37: predicate.specialize_transform 2.30% : 0.000025s : 184: predicate.split_environ_get_set_with_tuple_value 1.38% : 0.000015s : 93: predicate.stack_unstack_eliminate 0.04% : 0.000000s : 3: predicate.switch_call_monad_eliminater 1.89% : 0.000020s : 133: predicate.switch_defer_inline 4.20% : 0.000045s : 326: predicate.switch_layer_defer_inline 5.63% : 0.000061s : 408: predicate.switch_simplify 1.01% : 0.000011s : 81: predicate.tile_eliminate 0.99% : 0.000011s : 81: predicate.transpose_eliminate 1.23% : 0.000013s : 87: predicate.tuple_list_convert_item_index_to_positive 1.32% : 0.000014s : 87: predicate.tuple_list_get_item_const_eliminator 1.24% : 0.000013s : 87: predicate.tuple_list_get_item_depend_reorder 2.31% : 0.000025s : 136: predicate.tuple_list_get_item_eliminator 1.29% : 0.000014s : 87: predicate.tuple_list_get_set_item_eliminator 1.98% : 0.000021s : 124: predicate.tuple_list_set_item_eliminator 1.25% : 0.000013s : 99: predicate.tuple_to_list_eliminator_ 2.11% : 0.000023s : 180: predicate.updatestate_pure_node_eliminater 2.68% : 0.000029s : 217: predicate.updatestate_useless_node_eliminater 0.05% : 0.000001s : 3: predicate.value_based_eliminate 0.36% : 0.000004s : 22: predicate.virtual_dataset_eliminate 0.34% : 0.000004s : 22: predicate.virtual_output_eliminate 0.04% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.06% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.006268 103 67.80% : 0.004250s : 51: func_graph_cloner_run.FuncGraphClonerGraph 32.20% : 0.002018s : 52: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.686200 262 0.00% : 0.000004s : 1: ForceFp32Comm 0.19% : 0.005024s : 1: add_attr 0.19% : 0.005015s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000062s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000272s : 1: auto_monad 0.00% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000012s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.06% : 0.001665s : 1: bootstrap 0.00% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000010s : 1: environ_conv 0.00% : 0.000074s : 1: event_method 0.00% : 0.000033s : 1: execute 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000006s : 1: get_jit_bprop_graph 0.00% : 0.000023s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000009s : 1: label_micro_interleaved_index 0.02% : 0.000459s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000639s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000013s : 1: opt.transform.mutable_eliminate 0.30% : 0.008133s : 142: opt.transform.opt_a 0.00% : 0.000025s : 1: opt.transform.opt_after_cconv 0.00% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000085s : 28: opt.transform.opt_b 0.00% : 0.000041s : 2: opt.transform.opt_trans_graph 0.00% : 0.000079s : 4: opt.transform.symbol_engine_opt 1.72% : 0.046171s : 1: opt_a 0.00% : 0.000098s : 1: opt_after_cconv 0.02% : 0.000500s : 1: opt_after_jit_grad 0.01% : 0.000187s : 1: opt_b 1.82% : 0.048755s : 1: optimize 0.00% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000022s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000021s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000005s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000097s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000019s : 1: remove_dup_value 1.03% : 0.027641s : 2: renormalize.infer 0.16% : 0.004416s : 2: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000022s : 1: rewriter_after_opt_a 0.02% : 0.000526s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000123s : 1: symbol_engine_optimizer 89.22% : 2.396692s : 1: task_emit 0.00% : 0.000070s : 1: tuple_transform 5.16% : 0.138524s : 1: type_inference 0.01% : 0.000267s : 1: validate TotalTime = 0.236106, [24] [bootstrap]: 0.00045857 [type_inference]: 0.21295 [event_method]: 0.00144347 [auto_monad]: 0.0001897 [graph_reusing]: 1.242e-05 [inline]: 2.49001e-06 [add_attr]: 0.00329325, [1] [add_attr_with_inline]: 0.00328451, [1] [Cycle 1]: 9.982e-05, [2] [tag_attr]: 5.436e-05 [meta_addattr_fg_expand]: 1.649e-05 [parallel-infer-symbol]: 3.04999e-06 [pre_auto_parallel]: 7.21e-05 [insert-virtual-dataset]: 2.20002e-06 [parallel-infer-symbol-second]: 6.79982e-07 [dataset_repeat_opt]: 1.70001e-06 [pipeline_split]: 1.53002e-06 [optimize]: 0.00983502, [53] [py_interpret_to_execute]: 4.53001e-06 [rewriter_before_opt_a]: 0.00049941 [opt_a]: 0.00749154, [2] [Cycle 1]: 0.00685434, [45] [expand_dump_flag]: 6.03002e-06 [switch_simplify]: 0.00020033 [loop_unroll]: 8.584e-05 [a_1]: 0.00169841 [with_stream_mark]: 1.413e-05 [recompute_prepare]: 9.63997e-06 [updatestate_depend_eliminate]: 4.1e-06 [updatestate_assign_eliminate]: 3.38e-06 [updatestate_loads_eliminate]: 3.19001e-06 [parameter_eliminate]: 1.87001e-06 [a_2]: 8.378e-05 [accelerated_algorithm]: 7.08998e-06 [shard]: 1.54e-06 [meta_shard_fg_expand]: 3.74002e-06 [shard_inline]: 6.39001e-06 [merge_send_recv]: 8.13999e-06 [auto_parallel]: 5.89e-06 [parallel]: 1.798e-05 [flash_sp]: 7.41999e-06 [merge_comm]: 3.37002e-06 [allreduce_fusion]: 3.53999e-06 [matmul_add_comm_reduction]: 8.22998e-06 [allreduce_slice_to_reducescatter]: 5.90022e-07 [virtual_shard_identity]: 8.72e-06 [virtual_dataset]: 7e-06 [get_grad_eliminate_]: 7.00002e-06 [virtual_output]: 6.64999e-06 [merge_forward]: 4.12998e-06 [cell_reuse_recompute_pass]: 1.29e-06 [offload_activation]: 8.85999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.326e-05 [merge_recompute_call_nodes]: 1.32999e-06 [before_grad]: 1.029e-05 [set_forward_comm_id_for_comm_node_pass]: 3.71001e-06 [meta_fg_expand]: 4.33001e-06 [flash_sp_send_recv_attached]: 2.78e-06 [receive_attached]: 1.92999e-06 [after_resolve]: 1.034e-05 [a_after_grad]: 1.02e-05 [renormalize]: 0.00418944 [add_forward_monad_depend]: 5.62999e-06 [auto_monad_grad]: 1.95001e-06 [auto_monad_eliminator]: 1.733e-05 [cse]: 3.437e-05 [a_3]: 5.159e-05 [Cycle 2]: 0.00062733, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 7.7e-06 [loop_unroll]: 6.98e-06 [a_1]: 0.00013058 [with_stream_mark]: 1.118e-05 [recompute_prepare]: 6.39001e-06 [updatestate_depend_eliminate]: 3.13e-06 [updatestate_assign_eliminate]: 2.40002e-06 [updatestate_loads_eliminate]: 2.20002e-06 [parameter_eliminate]: 9.09989e-07 [a_2]: 7.427e-05 [accelerated_algorithm]: 6.44001e-06 [shard]: 1.08001e-06 [meta_shard_fg_expand]: 1.40999e-06 [shard_inline]: 6.21998e-06 [merge_send_recv]: 4.57998e-06 [auto_parallel]: 5.12e-06 [parallel]: 4.72998e-06 [flash_sp]: 3.26999e-06 [merge_comm]: 2.89999e-06 [allreduce_fusion]: 2.78e-06 [matmul_add_comm_reduction]: 5.10001e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 7.25998e-06 [virtual_dataset]: 5.99e-06 [get_grad_eliminate_]: 5.72001e-06 [virtual_output]: 5.96e-06 [merge_forward]: 2.66e-06 [cell_reuse_recompute_pass]: 1.45001e-06 [offload_activation]: 6.33998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.353e-05 [merge_recompute_call_nodes]: 7.10017e-07 [before_grad]: 9.29998e-06 [set_forward_comm_id_for_comm_node_pass]: 3.06001e-06 [meta_fg_expand]: 1.99e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 9.60019e-07 [after_resolve]: 8.69e-06 [a_after_grad]: 9.52001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.29e-06 [auto_monad_grad]: 8.80013e-07 [auto_monad_eliminator]: 6.34001e-06 [cse]: 1.488e-05 [a_3]: 3.766e-05 [py_interpret_to_execute_after_opt_a]: 4.70999e-06 [slice_cell_reuse_recomputed_activation]: 1.92999e-06 [rewriter_after_opt_a]: 1.664e-05 [convert_after_rewriter]: 1.77001e-06 [order_py_execute_after_rewriter]: 1.14e-06 [mutable_eliminate]: 0.0004835 [opt_b]: 0.0002088, [1] [Cycle 1]: 0.00020261, [7] [b_1]: 0.00012947 [b_2]: 8.32e-06 [updatestate_depend_eliminate]: 5.28002e-06 [updatestate_assign_eliminate]: 2.58003e-06 [updatestate_loads_eliminate]: 2.37999e-06 [renormalize]: 3.9002e-07 [cse]: 2.048e-05 [optimize_parallel_all_gather_comm]: 1.62e-05 [overlap_param_gather]: 2.04999e-06 [cconv]: 2.235e-05 [loop_unroll]: 0.00043141 [opt_after_cconv]: 0.00010232, [1] [Cycle 1]: 9.71e-05, [7] [c_1]: 3.246e-05 [parameter_eliminate]: 2.25002e-06 [updatestate_depend_eliminate]: 5.32999e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 2.22999e-06 [cse]: 1.974e-05 [renormalize]: 4.90021e-07 [remove_dup_value]: 1.621e-05 [tuple_transform]: 7.514e-05, [1] [Cycle 1]: 7.067e-05, [4] [d_1]: 4.416e-05 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 7.03e-06 [partial_unused_args_eliminate]: 1.62001e-06 [add_recomputation]: 4.192e-05 [cse_after_recomputation]: 2.354e-05, [1] [Cycle 1]: 1.898e-05, [1] [cse]: 1.367e-05 [environ_conv]: 8.37998e-06 [swap_dp_allreduce_reducescatter]: 4.78001e-06 [bias_add_comm_swap]: 2.27999e-06 [label_micro_interleaved_index]: 4.3e-06 [label_fine_grained_interleaved_index]: 2.67001e-06 [merge_cast_opt]: 1.47001e-06 [slice_recompute_activation]: 2.19001e-06 [micro_interleaved_order_control]: 2.46e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.52999e-06 [full_micro_interleaved_order_control]: 2.21998e-06 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.20001e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.17e-06 [overlap_opt_shard_grad_in_pipeline]: 1.67999e-06 [control_data_broadcast_order]: 1.175e-05 [grouped_pairwise_exchange_alltoall]: 1.40001e-06 [offloading_packed_experts]: 3.96001e-06 [overlap_recompute_and_grad_model_parallel]: 4.45e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 1.41998e-06 [overlap_recompute_comm]: 2.19001e-06 [overlap_grad_ring_attention]: 4.68999e-06 [overlap_grad_flash_sp]: 1.63e-05 [begin_end_overlap_inline]: 4.59986e-07 [split_matmul_comm_elemetwise]: 2.09999e-06 [split_layernorm_comm]: 2.03997e-06 [handle_group_info]: 1.05001e-06 [symbol_engine_optimizer]: 7.302e-05, [1] [Cycle 1]: 6.911e-05, [6] [build]: 2.74001e-06 [elim_shapecalc]: 9.25001e-06 [elim_not_effective]: 1.2e-05 [opt_reshape]: 7.6e-06 [fold_const_symbol]: 1.014e-05 [renormalize]: 1.39989e-07 [detach_backward]: 1.77999e-06 [pipeline_parallel_scheduler]: 1.27999e-06 [auto_monad_reorder]: 1.712e-05 [get_jit_bprop_graph]: 1.50999e-06 [rewriter_after_jit_bprop_graph]: 3.5e-06 [opt_after_jit_grad]: 0.00047662 [validate]: 4.19e-05 [backend_pass]: 9.5999e-07 [task_emit]: 0.00707237 [execute]: 6.73e-06 Sums bootstrap : 0.000459s : 0.20% type_inference : 0.212950s : 91.85% event_method : 0.001443s : 0.62% auto_monad : 0.000190s : 0.08% graph_reusing : 0.000012s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000054s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000016s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000072s : 0.03% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000499s : 0.22% optimize.opt_a.expand_dump_flag : 0.000007s : 0.00% optimize.opt_a.switch_simplify : 0.000208s : 0.09% optimize.opt_a.loop_unroll : 0.000093s : 0.04% optimize.opt_a.a_1 : 0.001829s : 0.79% optimize.opt_a.with_stream_mark : 0.000025s : 0.01% optimize.opt_a.recompute_prepare : 0.000016s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000158s : 0.07% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.01% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000005s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.01% optimize.opt_a.merge_send_recv : 0.000013s : 0.01% optimize.opt_a.auto_parallel : 0.000011s : 0.00% optimize.opt_a.parallel : 0.000023s : 0.01% optimize.opt_a.flash_sp : 0.000011s : 0.00% optimize.opt_a.merge_comm : 0.000006s : 0.00% optimize.opt_a.allreduce_fusion : 0.000006s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000013s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.01% optimize.opt_a.virtual_dataset : 0.000013s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.01% optimize.opt_a.virtual_output : 0.000013s : 0.01% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000015s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.01% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.01% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000019s : 0.01% optimize.opt_a.a_after_grad : 0.000020s : 0.01% optimize.opt_a.renormalize : 0.004190s : 1.81% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.01% optimize.opt_a.cse : 0.000049s : 0.02% optimize.opt_a.a_3 : 0.000089s : 0.04% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000017s : 0.01% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000484s : 0.21% optimize.opt_b.b_1 : 0.000129s : 0.06% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000022s : 0.01% optimize.loop_unroll : 0.000431s : 0.19% optimize.opt_after_cconv.c_1 : 0.000032s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.01% optimize.tuple_transform.d_1 : 0.000044s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000042s : 0.02% optimize.cse_after_recomputation.cse : 0.000014s : 0.01% optimize.environ_conv : 0.000008s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000016s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000017s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000477s : 0.21% validate : 0.000042s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.007072s : 3.05% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.000452 58 0.37% : 0.000002s : 2: substitution.elim_not_effective 0.67% : 0.000003s : 4: substitution.float_depend_g_call 0.47% : 0.000002s : 2: substitution.fold_const_symbol 1.45% : 0.000007s : 4: substitution.graph_param_transform 87.46% : 0.000395s : 23: substitution.inline 0.66% : 0.000003s : 4: substitution.j_node_and_user_rematch 0.91% : 0.000004s : 4: substitution.partial_eliminate 0.98% : 0.000004s : 4: substitution.remove_not_recompute_node 0.62% : 0.000003s : 2: substitution.replace_old_param 3.39% : 0.000015s : 6: substitution.switch_simplify 3.02% : 0.000014s : 3: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.212848 2 96.87% : 0.206182s : 1: type_inference.infer 3.13% : 0.006665s : 1: type_inference.specialize ------[replace.] 0.000227 32 62.25% : 0.000141s : 23: replace.inline 24.58% : 0.000056s : 6: replace.switch_simplify 13.16% : 0.000030s : 3: replace.tuple_list_get_item_eliminator ------[match.] 0.000408 32 94.19% : 0.000384s : 23: match.inline 2.85% : 0.000012s : 6: match.switch_simplify 2.96% : 0.000012s : 3: match.tuple_list_get_item_eliminator ------[predicate.] 0.000364 2421 1.19% : 0.000004s : 32: predicate.accumulaten_eliminater 0.47% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.29% : 0.000001s : 8: predicate.addn_check_dump 1.11% : 0.000004s : 32: predicate.addn_zero_filter 1.07% : 0.000004s : 32: predicate.adjust_all_reduce_mul_add 2.23% : 0.000008s : 40: predicate.arithmetic_simplify 1.16% : 0.000004s : 32: predicate.cast_eliminate 0.33% : 0.000001s : 8: predicate.check_bprop_eliminate 0.28% : 0.000001s : 8: predicate.compare_switch_simplify 0.12% : 0.000000s : 4: predicate.const_output_eliminate 0.28% : 0.000001s : 8: predicate.depend_value_elim 1.21% : 0.000004s : 32: predicate.dict_get_item_const_eliminator 1.36% : 0.000005s : 32: predicate.dict_get_item_eliminator 1.11% : 0.000004s : 32: predicate.dict_set_item_eliminator 0.49% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.14% : 0.000001s : 4: predicate.elim_not_effective 0.20% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.30% : 0.000005s : 36: predicate.environ_add_const_eliminate 1.22% : 0.000004s : 36: predicate.environ_get_add_eliminate 1.21% : 0.000004s : 36: predicate.environ_get_depend_swap 1.59% : 0.000006s : 44: predicate.environ_get_eliminate 1.32% : 0.000005s : 36: predicate.environ_get_set_eliminate 2.14% : 0.000008s : 58: predicate.exchange_switch_depend_value 3.06% : 0.000011s : 58: predicate.float_depend_g_call 0.30% : 0.000001s : 8: predicate.float_environ_get_switch 0.47% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.11% : 0.000000s : 4: predicate.fold_const_symbol 0.37% : 0.000001s : 8: predicate.get_grad_eliminate 0.11% : 0.000000s : 4: predicate.graph_param_transform 0.31% : 0.000001s : 8: predicate.incorporate_call 0.27% : 0.000001s : 8: predicate.incorporate_call_switch 6.00% : 0.000022s : 114: predicate.inline 0.34% : 0.000001s : 8: predicate.inline_without_move 0.17% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.44% : 0.000002s : 8: predicate.less_batch_normalization 1.54% : 0.000006s : 43: predicate.list_to_tuple_eliminator_ 2.67% : 0.000010s : 75: predicate.load_eliminater 0.52% : 0.000002s : 4: predicate.loop_unroll_after_grad 4.10% : 0.000015s : 104: predicate.loop_unroll_before_grad 1.46% : 0.000005s : 40: predicate.make_slice_get_slice_eliminator 0.36% : 0.000001s : 8: predicate.merge_addn 0.31% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.31% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.12% : 0.000004s : 32: predicate.minmaximum_grad 0.53% : 0.000002s : 4: predicate.mutable_eliminate 0.18% : 0.000001s : 4: predicate.opt_reshape 0.23% : 0.000001s : 4: predicate.parallel_virtual_node 3.30% : 0.000012s : 58: predicate.partial_defer_inline 1.62% : 0.000006s : 39: predicate.partial_eliminate 1.24% : 0.000004s : 32: predicate.print_const_string_wrapper 0.34% : 0.000001s : 8: predicate.reduce_all_const_elim 1.60% : 0.000006s : 32: predicate.reduce_eliminate 2.71% : 0.000010s : 75: predicate.redundant_stop_gradient_eliminater 0.24% : 0.000001s : 8: predicate.remove_not_recompute_node 1.30% : 0.000005s : 43: predicate.replace_applicator 0.23% : 0.000001s : 8: predicate.replace_old_param 0.16% : 0.000001s : 4: predicate.reset_defer_inline 1.31% : 0.000005s : 32: predicate.reshape_eliminate 0.36% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.21% : 0.000001s : 4: predicate.row_tensor_eliminate 0.40% : 0.000001s : 8: predicate.same_eliminate 0.25% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.56% : 0.000002s : 8: predicate.shard_identity_eliminate 0.39% : 0.000001s : 8: predicate.special_op_eliminate 0.37% : 0.000001s : 8: predicate.specialize_transform 0.42% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.39% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.17% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.47% : 0.000009s : 58: predicate.switch_defer_inline 2.70% : 0.000010s : 66: predicate.switch_layer_defer_inline 8.05% : 0.000029s : 186: predicate.switch_simplify 1.20% : 0.000004s : 32: predicate.tile_eliminate 1.10% : 0.000004s : 32: predicate.transpose_eliminate 1.65% : 0.000006s : 40: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000006s : 40: predicate.tuple_list_get_item_const_eliminator 1.55% : 0.000006s : 40: predicate.tuple_list_get_item_depend_reorder 2.68% : 0.000010s : 51: predicate.tuple_list_get_item_eliminator 1.55% : 0.000006s : 40: predicate.tuple_list_get_set_item_eliminator 2.15% : 0.000008s : 48: predicate.tuple_list_set_item_eliminator 1.57% : 0.000006s : 43: predicate.tuple_to_list_eliminator_ 2.59% : 0.000009s : 75: predicate.updatestate_pure_node_eliminater 3.01% : 0.000011s : 83: predicate.updatestate_useless_node_eliminater 0.20% : 0.000001s : 4: predicate.value_based_eliminate 0.38% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.39% : 0.000001s : 8: predicate.virtual_output_eliminate 0.13% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.23% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.005767 52 64.93% : 0.003745s : 26: func_graph_cloner_run.FuncGraphClonerGraph 35.07% : 0.002022s : 26: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.256088 196 0.00% : 0.000003s : 1: ForceFp32Comm 1.29% : 0.003298s : 1: add_attr 1.28% : 0.003289s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000046s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.08% : 0.000198s : 1: auto_monad 0.01% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.19% : 0.000489s : 1: bootstrap 0.01% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.01% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.57% : 0.001458s : 1: event_method 0.00% : 0.000011s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000017s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000005s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.17% : 0.000439s : 1: loop_unroll 0.00% : 0.000005s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.19% : 0.000492s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 0.97% : 0.002473s : 78: opt.transform.opt_a 0.01% : 0.000031s : 1: opt.transform.opt_after_cconv 0.01% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000110s : 28: opt.transform.opt_b 0.02% : 0.000049s : 2: opt.transform.opt_trans_graph 0.01% : 0.000036s : 4: opt.transform.symbol_engine_opt 2.93% : 0.007495s : 1: opt_a 0.04% : 0.000106s : 1: opt_after_cconv 0.19% : 0.000486s : 1: opt_after_jit_grad 0.08% : 0.000212s : 1: opt_b 3.84% : 0.009839s : 1: optimize 0.01% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000019s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.03% : 0.000077s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.01% : 0.000020s : 1: remove_dup_value 0.89% : 0.002268s : 1: renormalize.infer 0.75% : 0.001913s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000020s : 1: rewriter_after_opt_a 0.20% : 0.000508s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000076s : 1: symbol_engine_optimizer 2.77% : 0.007082s : 1: task_emit 0.03% : 0.000078s : 1: tuple_transform 83.16% : 0.212968s : 1: type_inference 0.03% : 0.000071s : 1: validate TotalTime = 0.279012, [24] [bootstrap]: 0.00048627 [type_inference]: 0.23157 [event_method]: 7.774e-05 [auto_monad]: 0.00025934 [graph_reusing]: 2.089e-05 [inline]: 2.63e-06 [add_attr]: 0.00345349, [1] [add_attr_with_inline]: 0.00344493, [1] [Cycle 1]: 0.00012821, [2] [tag_attr]: 7.415e-05 [meta_addattr_fg_expand]: 2.389e-05 [parallel-infer-symbol]: 3.58e-06 [pre_auto_parallel]: 9.725e-05 [insert-virtual-dataset]: 2.92002e-06 [parallel-infer-symbol-second]: 7.7e-07 [dataset_repeat_opt]: 1.89e-06 [pipeline_split]: 2.27999e-06 [optimize]: 0.0280357, [53] [py_interpret_to_execute]: 4.82e-06 [rewriter_before_opt_a]: 0.00066779 [opt_a]: 0.0254053, [3] [Cycle 1]: 0.0207174, [45] [expand_dump_flag]: 8.58001e-06 [switch_simplify]: 0.00028653 [loop_unroll]: 0.00012427 [a_1]: 0.00260982 [with_stream_mark]: 2.324e-05 [recompute_prepare]: 2.29e-05 [updatestate_depend_eliminate]: 8.48999e-06 [updatestate_assign_eliminate]: 7.01001e-06 [updatestate_loads_eliminate]: 7.56999e-06 [parameter_eliminate]: 2.54001e-06 [a_2]: 0.00022553 [accelerated_algorithm]: 1.536e-05 [shard]: 1.69e-06 [meta_shard_fg_expand]: 6.54999e-06 [shard_inline]: 1.493e-05 [merge_send_recv]: 1.467e-05 [auto_parallel]: 1.02e-05 [parallel]: 1.805e-05 [flash_sp]: 9.28002e-06 [merge_comm]: 9.11002e-06 [allreduce_fusion]: 7.75e-06 [matmul_add_comm_reduction]: 2.515e-05 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 1.715e-05 [virtual_dataset]: 1.507e-05 [get_grad_eliminate_]: 1.422e-05 [virtual_output]: 1.551e-05 [merge_forward]: 8.69e-06 [cell_reuse_recompute_pass]: 1.16002e-06 [offload_activation]: 1.675e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.661e-05 [merge_recompute_call_nodes]: 1.31998e-06 [before_grad]: 2.491e-05 [set_forward_comm_id_for_comm_node_pass]: 9.37999e-06 [meta_fg_expand]: 0.00169819 [flash_sp_send_recv_attached]: 4.08001e-06 [receive_attached]: 2.47001e-06 [after_resolve]: 6.94e-05 [a_after_grad]: 8.873e-05 [renormalize]: 0.0140576 [add_forward_monad_depend]: 1.016e-05 [auto_monad_grad]: 6.73998e-06 [auto_monad_eliminator]: 5.383e-05 [cse]: 0.00046815 [a_3]: 0.00034188 [Cycle 2]: 0.0040328, [45] [expand_dump_flag]: 1.87999e-06 [switch_simplify]: 4.652e-05 [loop_unroll]: 4.3e-05 [a_1]: 0.00127297 [with_stream_mark]: 1.423e-05 [recompute_prepare]: 9.27999e-06 [updatestate_depend_eliminate]: 3.61001e-06 [updatestate_assign_eliminate]: 2.74999e-06 [updatestate_loads_eliminate]: 2.54001e-06 [parameter_eliminate]: 1.09998e-06 [a_2]: 8.494e-05 [accelerated_algorithm]: 7.33e-06 [shard]: 1.17e-06 [meta_shard_fg_expand]: 2.16e-06 [shard_inline]: 6.51e-06 [merge_send_recv]: 4.78001e-06 [auto_parallel]: 1.919e-05 [parallel]: 5.27999e-06 [flash_sp]: 3.75998e-06 [merge_comm]: 3.58e-06 [allreduce_fusion]: 3.18e-06 [matmul_add_comm_reduction]: 6.41998e-06 [allreduce_slice_to_reducescatter]: 5.10016e-07 [virtual_shard_identity]: 7.78001e-06 [virtual_dataset]: 6.61e-06 [get_grad_eliminate_]: 7.30998e-06 [virtual_output]: 7.44002e-06 [merge_forward]: 3.21001e-06 [cell_reuse_recompute_pass]: 1.00001e-06 [offload_activation]: 7.16999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.265e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 9.77999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.33e-06 [meta_fg_expand]: 0.00072365 [flash_sp_send_recv_attached]: 1.91e-06 [receive_attached]: 1.37999e-06 [after_resolve]: 1.548e-05 [a_after_grad]: 1.161e-05 [renormalize]: 0.00132253 [add_forward_monad_depend]: 4.42e-06 [auto_monad_grad]: 1.29e-06 [auto_monad_eliminator]: 1.108e-05 [cse]: 2.448e-05 [a_3]: 4.978e-05 [Cycle 3]: 0.00064055, [45] [expand_dump_flag]: 1.17e-06 [switch_simplify]: 8.40999e-06 [loop_unroll]: 6.94999e-06 [a_1]: 0.00013103 [with_stream_mark]: 8.70999e-06 [recompute_prepare]: 6.86999e-06 [updatestate_depend_eliminate]: 3.01001e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.34001e-06 [parameter_eliminate]: 8.89995e-07 [a_2]: 8.033e-05 [accelerated_algorithm]: 6.84999e-06 [shard]: 1.22999e-06 [meta_shard_fg_expand]: 1.43002e-06 [shard_inline]: 6.64999e-06 [merge_send_recv]: 4.58999e-06 [auto_parallel]: 5.77999e-06 [parallel]: 4.65001e-06 [flash_sp]: 9.39996e-07 [merge_comm]: 3.14999e-06 [allreduce_fusion]: 2.88e-06 [matmul_add_comm_reduction]: 5.01002e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 8.08999e-06 [virtual_dataset]: 6.54001e-06 [get_grad_eliminate_]: 6.37001e-06 [virtual_output]: 6.80002e-06 [merge_forward]: 2.93e-06 [cell_reuse_recompute_pass]: 1.76998e-06 [offload_activation]: 6.36e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.359e-05 [merge_recompute_call_nodes]: 6.80011e-07 [before_grad]: 9.41e-06 [set_forward_comm_id_for_comm_node_pass]: 3.25e-06 [meta_fg_expand]: 2.12999e-06 [flash_sp_send_recv_attached]: 8.80013e-07 [receive_attached]: 1.02e-06 [after_resolve]: 6.76e-06 [a_after_grad]: 9.79e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.27e-06 [auto_monad_grad]: 9.80013e-07 [auto_monad_eliminator]: 6.31e-06 [cse]: 1.752e-05 [a_3]: 4.029e-05 [py_interpret_to_execute_after_opt_a]: 4.62998e-06 [slice_cell_reuse_recomputed_activation]: 2.27001e-06 [rewriter_after_opt_a]: 1.862e-05 [convert_after_rewriter]: 1.52001e-06 [order_py_execute_after_rewriter]: 1.12e-06 [mutable_eliminate]: 0.00051595 [opt_b]: 0.00022354, [1] [Cycle 1]: 0.00021752, [7] [b_1]: 0.00013937 [b_2]: 7.73001e-06 [updatestate_depend_eliminate]: 5.64e-06 [updatestate_assign_eliminate]: 2.59001e-06 [updatestate_loads_eliminate]: 2.55997e-06 [renormalize]: 4.59986e-07 [cse]: 2.382e-05 [optimize_parallel_all_gather_comm]: 1.558e-05 [overlap_param_gather]: 2.22999e-06 [cconv]: 1.987e-05 [loop_unroll]: 0.00047126 [opt_after_cconv]: 0.00011129, [1] [Cycle 1]: 0.00010571, [7] [c_1]: 3.376e-05 [parameter_eliminate]: 2.37999e-06 [updatestate_depend_eliminate]: 5.49998e-06 [updatestate_assign_eliminate]: 2.57001e-06 [updatestate_loads_eliminate]: 2.49001e-06 [cse]: 2.41e-05 [renormalize]: 4.20026e-07 [remove_dup_value]: 1.858e-05 [tuple_transform]: 7.804e-05, [1] [Cycle 1]: 7.377e-05, [4] [d_1]: 4.58e-05 [none_parameter_eliminate]: 1.63002e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 7.67998e-06 [partial_unused_args_eliminate]: 1.71e-06 [add_recomputation]: 4.117e-05 [cse_after_recomputation]: 2.583e-05, [1] [Cycle 1]: 2.109e-05, [1] [cse]: 1.594e-05 [environ_conv]: 9.01998e-06 [swap_dp_allreduce_reducescatter]: 5.40001e-06 [bias_add_comm_swap]: 2.24999e-06 [label_micro_interleaved_index]: 3.9e-06 [label_fine_grained_interleaved_index]: 2.64001e-06 [merge_cast_opt]: 1.15999e-06 [slice_recompute_activation]: 1.89999e-06 [micro_interleaved_order_control]: 2.41998e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 9.39996e-07 [remove_cast_before_assign_add]: 7.89994e-07 [full_micro_interleaved_order_control]: 2.50997e-06 [reorder_send_recv_between_fp_bp]: 3.09999e-06 [comm_op_add_attrs]: 9.50007e-07 [add_comm_op_reuse_tag]: 9.00007e-07 [interleave_split_concat_branches]: 1.15001e-06 [interleave_parallel_branches]: 1.04e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.54998e-06 [control_data_broadcast_order]: 1.173e-05 [grouped_pairwise_exchange_alltoall]: 1.54e-06 [offloading_packed_experts]: 3.55998e-06 [overlap_recompute_and_grad_model_parallel]: 4.50999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.07e-06 [overlap_recompute_allgather_and_fa_grad]: 1.26002e-06 [overlap_recompute_comm]: 1.99999e-06 [overlap_grad_ring_attention]: 3.85e-06 [overlap_grad_flash_sp]: 1.71e-05 [begin_end_overlap_inline]: 4.89992e-07 [split_matmul_comm_elemetwise]: 2.04e-06 [split_layernorm_comm]: 1.59e-06 [handle_group_info]: 9.5999e-07 [symbol_engine_optimizer]: 7.923e-05, [1] [Cycle 1]: 7.522e-05, [6] [build]: 3.04999e-06 [elim_shapecalc]: 1.145e-05 [elim_not_effective]: 1.337e-05 [opt_reshape]: 7.49002e-06 [fold_const_symbol]: 1.192e-05 [renormalize]: 1.8999e-07 [detach_backward]: 1.95001e-06 [pipeline_parallel_scheduler]: 1.32999e-06 [auto_monad_reorder]: 1.81e-05 [get_jit_bprop_graph]: 1.19998e-06 [rewriter_after_jit_bprop_graph]: 3.41001e-06 [opt_after_jit_grad]: 0.00060141 [validate]: 4.595e-05 [backend_pass]: 1.17e-06 [task_emit]: 0.0140647 [execute]: 7.22002e-06 Sums bootstrap : 0.000486s : 0.18% type_inference : 0.231570s : 84.43% event_method : 0.000078s : 0.03% auto_monad : 0.000259s : 0.09% graph_reusing : 0.000021s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000074s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000024s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000097s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000668s : 0.24% optimize.opt_a.expand_dump_flag : 0.000012s : 0.00% optimize.opt_a.switch_simplify : 0.000341s : 0.12% optimize.opt_a.loop_unroll : 0.000174s : 0.06% optimize.opt_a.a_1 : 0.004014s : 1.46% optimize.opt_a.with_stream_mark : 0.000046s : 0.02% optimize.opt_a.recompute_prepare : 0.000039s : 0.01% optimize.opt_a.updatestate_depend_eliminate : 0.000015s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000012s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000012s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000391s : 0.14% optimize.opt_a.accelerated_algorithm : 0.000030s : 0.01% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000010s : 0.00% optimize.opt_a.shard_inline : 0.000028s : 0.01% optimize.opt_a.merge_send_recv : 0.000024s : 0.01% optimize.opt_a.auto_parallel : 0.000035s : 0.01% optimize.opt_a.parallel : 0.000028s : 0.01% optimize.opt_a.flash_sp : 0.000014s : 0.01% optimize.opt_a.merge_comm : 0.000016s : 0.01% optimize.opt_a.allreduce_fusion : 0.000014s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000037s : 0.01% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000033s : 0.01% optimize.opt_a.virtual_dataset : 0.000028s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000028s : 0.01% optimize.opt_a.virtual_output : 0.000030s : 0.01% optimize.opt_a.merge_forward : 0.000015s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000030s : 0.01% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000053s : 0.02% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000044s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.01% optimize.opt_a.meta_fg_expand : 0.002424s : 0.88% optimize.opt_a.flash_sp_send_recv_attached : 0.000007s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000092s : 0.03% optimize.opt_a.a_after_grad : 0.000110s : 0.04% optimize.opt_a.renormalize : 0.015380s : 5.61% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.01% optimize.opt_a.auto_monad_grad : 0.000009s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000071s : 0.03% optimize.opt_a.cse : 0.000510s : 0.19% optimize.opt_a.a_3 : 0.000432s : 0.16% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000019s : 0.01% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000516s : 0.19% optimize.opt_b.b_1 : 0.000139s : 0.05% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000020s : 0.01% optimize.loop_unroll : 0.000471s : 0.17% optimize.opt_after_cconv.c_1 : 0.000034s : 0.01% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.01% optimize.tuple_transform.d_1 : 0.000046s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000041s : 0.02% optimize.cse_after_recomputation.cse : 0.000016s : 0.01% optimize.environ_conv : 0.000009s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.00% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000004s : 0.00% optimize.overlap_grad_flash_sp : 0.000017s : 0.01% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000018s : 0.01% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000601s : 0.22% validate : 0.000046s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.014065s : 5.13% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.001019 186 0.20% : 0.000002s : 2: substitution.elim_not_effective 0.93% : 0.000009s : 14: substitution.float_depend_g_call 0.29% : 0.000003s : 2: substitution.float_tuple_getitem_switch 0.32% : 0.000003s : 2: substitution.fold_const_symbol 0.52% : 0.000005s : 4: substitution.graph_param_transform 0.26% : 0.000003s : 2: substitution.incorporate_call 0.20% : 0.000002s : 2: substitution.incorporate_call_switch 76.31% : 0.000777s : 38: substitution.inline 1.67% : 0.000017s : 2: substitution.inline_without_move 0.80% : 0.000008s : 12: substitution.j_node_and_user_rematch 1.01% : 0.000010s : 7: substitution.minmaximum_grad 1.06% : 0.000011s : 14: substitution.partial_eliminate 0.94% : 0.000010s : 12: substitution.remove_not_recompute_node 2.20% : 0.000022s : 9: substitution.replace_applicator 0.59% : 0.000006s : 9: substitution.replace_old_param 0.25% : 0.000003s : 1: substitution.set_cell_output_no_recompute 2.08% : 0.000021s : 9: substitution.switch_simplify 1.92% : 0.000020s : 7: substitution.tuple_list_convert_item_index_to_positive 0.91% : 0.000009s : 7: substitution.tuple_list_get_item_const_eliminator 1.33% : 0.000014s : 7: substitution.tuple_list_get_item_depend_reorder 4.97% : 0.000051s : 17: substitution.tuple_list_get_item_eliminator 1.21% : 0.000012s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.231150 2 95.63% : 0.221044s : 1: type_inference.infer 4.37% : 0.010106s : 1: type_inference.specialize ------[replace.] 0.000425 55 60.93% : 0.000259s : 38: replace.inline 19.76% : 0.000084s : 9: replace.switch_simplify 19.30% : 0.000082s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.000799 55 94.95% : 0.000758s : 38: match.inline 1.91% : 0.000015s : 9: match.switch_simplify 3.14% : 0.000025s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.000780 5484 1.18% : 0.000009s : 73: predicate.accumulaten_eliminater 0.22% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.37% : 0.000003s : 21: predicate.addn_check_dump 1.22% : 0.000010s : 73: predicate.addn_zero_filter 1.13% : 0.000009s : 73: predicate.adjust_all_reduce_mul_add 2.17% : 0.000017s : 94: predicate.arithmetic_simplify 1.18% : 0.000009s : 73: predicate.cast_eliminate 0.93% : 0.000007s : 52: predicate.check_bprop_eliminate 0.36% : 0.000003s : 21: predicate.compare_switch_simplify 0.06% : 0.000000s : 4: predicate.const_output_eliminate 0.35% : 0.000003s : 21: predicate.depend_value_elim 1.25% : 0.000010s : 73: predicate.dict_get_item_const_eliminator 1.34% : 0.000010s : 73: predicate.dict_get_item_eliminator 1.12% : 0.000009s : 73: predicate.dict_set_item_eliminator 0.25% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.06% : 0.000000s : 4: predicate.elim_not_effective 0.10% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.33% : 0.000010s : 77: predicate.environ_add_const_eliminate 1.22% : 0.000010s : 77: predicate.environ_get_add_eliminate 1.22% : 0.000010s : 77: predicate.environ_get_depend_swap 1.65% : 0.000013s : 98: predicate.environ_get_eliminate 1.23% : 0.000010s : 77: predicate.environ_get_set_eliminate 2.00% : 0.000016s : 119: predicate.exchange_switch_depend_value 2.82% : 0.000022s : 119: predicate.float_depend_g_call 0.37% : 0.000003s : 21: predicate.float_environ_get_switch 0.46% : 0.000004s : 25: predicate.float_tuple_getitem_switch 0.05% : 0.000000s : 4: predicate.fold_const_symbol 0.38% : 0.000003s : 21: predicate.get_grad_eliminate 0.06% : 0.000000s : 4: predicate.graph_param_transform 0.36% : 0.000003s : 21: predicate.incorporate_call 0.32% : 0.000003s : 21: predicate.incorporate_call_switch 5.59% : 0.000044s : 242: predicate.inline 1.15% : 0.000009s : 48: predicate.inline_without_move 0.19% : 0.000001s : 21: predicate.j_node_and_user_rematch 0.46% : 0.000004s : 21: predicate.less_batch_normalization 1.53% : 0.000012s : 89: predicate.list_to_tuple_eliminator_ 2.50% : 0.000020s : 162: predicate.load_eliminater 0.27% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.52% : 0.000027s : 192: predicate.loop_unroll_before_grad 1.35% : 0.000011s : 81: predicate.make_slice_get_slice_eliminator 0.39% : 0.000003s : 21: predicate.merge_addn 0.85% : 0.000007s : 52: predicate.micro_step_allgather_replace 0.88% : 0.000007s : 52: predicate.mini_step_allgather_replace 1.11% : 0.000009s : 73: predicate.minmaximum_grad 0.26% : 0.000002s : 4: predicate.mutable_eliminate 0.09% : 0.000001s : 4: predicate.opt_reshape 0.11% : 0.000001s : 4: predicate.parallel_virtual_node 2.87% : 0.000022s : 119: predicate.partial_defer_inline 1.58% : 0.000012s : 85: predicate.partial_eliminate 1.20% : 0.000009s : 73: predicate.print_const_string_wrapper 0.36% : 0.000003s : 21: predicate.reduce_all_const_elim 1.60% : 0.000013s : 73: predicate.reduce_eliminate 2.55% : 0.000020s : 162: predicate.redundant_stop_gradient_eliminater 0.23% : 0.000002s : 21: predicate.remove_not_recompute_node 1.72% : 0.000013s : 133: predicate.replace_applicator 0.55% : 0.000004s : 48: predicate.replace_old_param 0.08% : 0.000001s : 4: predicate.reset_defer_inline 1.24% : 0.000010s : 73: predicate.reshape_eliminate 0.91% : 0.000007s : 52: predicate.row_tensor_add_zeros_like 0.11% : 0.000001s : 4: predicate.row_tensor_eliminate 1.16% : 0.000009s : 52: predicate.same_eliminate 0.29% : 0.000002s : 21: predicate.set_cell_output_no_recompute 0.42% : 0.000003s : 21: predicate.shard_identity_eliminate 0.25% : 0.000002s : 8: predicate.special_op_eliminate 0.42% : 0.000003s : 21: predicate.specialize_transform 0.97% : 0.000008s : 52: predicate.split_environ_get_set_with_tuple_value 1.04% : 0.000008s : 48: predicate.stack_unstack_eliminate 0.08% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.25% : 0.000018s : 119: predicate.switch_defer_inline 3.15% : 0.000025s : 171: predicate.switch_layer_defer_inline 7.16% : 0.000056s : 354: predicate.switch_simplify 1.24% : 0.000010s : 73: predicate.tile_eliminate 1.15% : 0.000009s : 73: predicate.transpose_eliminate 1.48% : 0.000012s : 81: predicate.tuple_list_convert_item_index_to_positive 1.51% : 0.000012s : 81: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000011s : 81: predicate.tuple_list_get_item_depend_reorder 2.56% : 0.000020s : 110: predicate.tuple_list_get_item_eliminator 1.51% : 0.000012s : 81: predicate.tuple_list_get_set_item_eliminator 1.98% : 0.000015s : 102: predicate.tuple_list_set_item_eliminator 1.52% : 0.000012s : 89: predicate.tuple_to_list_eliminator_ 2.49% : 0.000019s : 162: predicate.updatestate_pure_node_eliminater 2.86% : 0.000022s : 183: predicate.updatestate_useless_node_eliminater 0.09% : 0.000001s : 4: predicate.value_based_eliminate 0.41% : 0.000003s : 21: predicate.virtual_dataset_eliminate 0.42% : 0.000003s : 21: predicate.virtual_output_eliminate 0.07% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.11% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.008041 88 71.58% : 0.005755s : 45: func_graph_cloner_run.FuncGraphClonerGraph 28.42% : 0.002285s : 43: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.331792 237 0.00% : 0.000004s : 1: ForceFp32Comm 1.04% : 0.003458s : 1: add_attr 1.04% : 0.003448s : 1: add_attr_with_inline 0.00% : 0.000005s : 1: add_comm_op_reuse_tag 0.01% : 0.000045s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.08% : 0.000268s : 1: auto_monad 0.01% : 0.000024s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.16% : 0.000516s : 1: bootstrap 0.01% : 0.000023s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.00% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000029s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000012s : 1: environ_conv 0.03% : 0.000085s : 1: event_method 0.00% : 0.000014s : 1: execute 0.00% : 0.000007s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000025s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.14% : 0.000479s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.16% : 0.000524s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000017s : 1: opt.transform.mutable_eliminate 1.73% : 0.005728s : 117: opt.transform.opt_a 0.01% : 0.000032s : 1: opt.transform.opt_after_cconv 0.01% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000119s : 28: opt.transform.opt_b 0.02% : 0.000051s : 2: opt.transform.opt_trans_graph 0.01% : 0.000041s : 4: opt.transform.symbol_engine_opt 7.66% : 0.025409s : 1: opt_a 0.03% : 0.000115s : 1: opt_after_cconv 0.18% : 0.000611s : 1: opt_after_jit_grad 0.07% : 0.000227s : 1: opt_b 8.45% : 0.028041s : 1: optimize 0.01% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000020s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000007s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.03% : 0.000103s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000003s : 1: remove_cast_before_assign_add 0.01% : 0.000022s : 1: remove_dup_value 3.80% : 0.012618s : 2: renormalize.infer 0.83% : 0.002745s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000022s : 1: rewriter_after_opt_a 0.20% : 0.000677s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000006s : 1: split_matmul_comm_elemetwise 0.00% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.02% : 0.000082s : 1: symbol_engine_optimizer 4.25% : 0.014085s : 1: task_emit 0.02% : 0.000081s : 1: tuple_transform 69.80% : 0.231596s : 1: type_inference 0.02% : 0.000074s : 1: validate TotalTime = 0.218927, [24] [bootstrap]: 0.00132958 [type_inference]: 0.145422 [event_method]: 0.00012003 [auto_monad]: 0.00034061 [graph_reusing]: 2.681e-05 [inline]: 3.01999e-06 [add_attr]: 0.00463333, [1] [add_attr_with_inline]: 0.00462486, [1] [Cycle 1]: 0.00019974, [2] [tag_attr]: 0.00012841 [meta_addattr_fg_expand]: 3.822e-05 [parallel-infer-symbol]: 3.86999e-06 [pre_auto_parallel]: 0.00013342 [insert-virtual-dataset]: 3.16001e-06 [parallel-infer-symbol-second]: 9.00007e-07 [dataset_repeat_opt]: 2.17001e-06 [pipeline_split]: 2.23002e-06 [optimize]: 0.0162967, [53] [py_interpret_to_execute]: 4.83001e-06 [rewriter_before_opt_a]: 0.00081043 [opt_a]: 0.0125933, [2] [Cycle 1]: 0.0109839, [45] [expand_dump_flag]: 8.39002e-06 [switch_simplify]: 0.00035561 [loop_unroll]: 0.0001663 [a_1]: 0.00357072 [with_stream_mark]: 2.724e-05 [recompute_prepare]: 2.831e-05 [updatestate_depend_eliminate]: 1.507e-05 [updatestate_assign_eliminate]: 1.338e-05 [updatestate_loads_eliminate]: 1.299e-05 [parameter_eliminate]: 2.44999e-06 [a_2]: 0.00063721 [accelerated_algorithm]: 5.121e-05 [shard]: 2.06e-06 [meta_shard_fg_expand]: 8.54e-06 [shard_inline]: 1.96e-05 [merge_send_recv]: 1.742e-05 [auto_parallel]: 1.448e-05 [parallel]: 3.222e-05 [flash_sp]: 1.209e-05 [merge_comm]: 1.26e-05 [allreduce_fusion]: 1.204e-05 [matmul_add_comm_reduction]: 1.796e-05 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 2.028e-05 [virtual_dataset]: 1.879e-05 [get_grad_eliminate_]: 1.793e-05 [virtual_output]: 1.783e-05 [merge_forward]: 1.132e-05 [cell_reuse_recompute_pass]: 1.47001e-06 [offload_activation]: 1.902e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.46e-05 [merge_recompute_call_nodes]: 1.38002e-06 [before_grad]: 3.119e-05 [set_forward_comm_id_for_comm_node_pass]: 1.208e-05 [meta_fg_expand]: 1.256e-05 [flash_sp_send_recv_attached]: 4.77e-06 [receive_attached]: 2.53003e-06 [after_resolve]: 2.333e-05 [a_after_grad]: 2.826e-05 [renormalize]: 0.00493629 [add_forward_monad_depend]: 6.43998e-06 [auto_monad_grad]: 1.97999e-06 [auto_monad_eliminator]: 3.496e-05 [cse]: 0.00028629 [a_3]: 0.00014182 [Cycle 2]: 0.00159875, [45] [expand_dump_flag]: 1.55999e-06 [switch_simplify]: 2.04e-05 [loop_unroll]: 1.849e-05 [a_1]: 0.00043839 [with_stream_mark]: 1.772e-05 [recompute_prepare]: 1.777e-05 [updatestate_depend_eliminate]: 1.008e-05 [updatestate_assign_eliminate]: 9.17999e-06 [updatestate_loads_eliminate]: 8.94e-06 [parameter_eliminate]: 9.09989e-07 [a_2]: 0.00025938 [accelerated_algorithm]: 2.41e-05 [shard]: 1.03001e-06 [meta_shard_fg_expand]: 4.17998e-06 [shard_inline]: 1.815e-05 [merge_send_recv]: 1.306e-05 [auto_parallel]: 1.264e-05 [parallel]: 4.47e-06 [flash_sp]: 2.99001e-06 [merge_comm]: 1.15e-05 [allreduce_fusion]: 1.018e-05 [matmul_add_comm_reduction]: 1.461e-05 [allreduce_slice_to_reducescatter]: 4.69998e-07 [virtual_shard_identity]: 1.848e-05 [virtual_dataset]: 1.865e-05 [get_grad_eliminate_]: 1.748e-05 [virtual_output]: 1.735e-05 [merge_forward]: 9.34998e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.564e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.331e-05 [merge_recompute_call_nodes]: 6.69999e-07 [before_grad]: 3.132e-05 [set_forward_comm_id_for_comm_node_pass]: 1.154e-05 [meta_fg_expand]: 7.25998e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 1.00999e-06 [after_resolve]: 2.196e-05 [a_after_grad]: 2.764e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.57001e-06 [auto_monad_grad]: 1.10999e-06 [auto_monad_eliminator]: 2.125e-05 [cse]: 9.002e-05 [a_3]: 0.00012712 [py_interpret_to_execute_after_opt_a]: 4.55001e-06 [slice_cell_reuse_recomputed_activation]: 2.04999e-06 [rewriter_after_opt_a]: 4.925e-05 [convert_after_rewriter]: 1.27e-06 [order_py_execute_after_rewriter]: 1.19e-06 [mutable_eliminate]: 0.00051598 [opt_b]: 0.00056759, [1] [Cycle 1]: 0.00056119, [7] [b_1]: 0.0004184 [b_2]: 1.966e-05 [updatestate_depend_eliminate]: 1.276e-05 [updatestate_assign_eliminate]: 9.14e-06 [updatestate_loads_eliminate]: 9.14e-06 [renormalize]: 3.50003e-07 [cse]: 5.592e-05 [optimize_parallel_all_gather_comm]: 3.118e-05 [overlap_param_gather]: 2.31e-06 [cconv]: 2.534e-05 [loop_unroll]: 0.00047635 [opt_after_cconv]: 0.00021767, [1] [Cycle 1]: 0.00021149, [7] [c_1]: 8.828e-05 [parameter_eliminate]: 2.53e-06 [updatestate_depend_eliminate]: 1.364e-05 [updatestate_assign_eliminate]: 9.12001e-06 [updatestate_loads_eliminate]: 8.83001e-06 [cse]: 5.396e-05 [renormalize]: 8.60018e-07 [remove_dup_value]: 7.933e-05 [tuple_transform]: 0.00019441, [1] [Cycle 1]: 0.00018938, [4] [d_1]: 0.00012283 [none_parameter_eliminate]: 1.91998e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 4.42e-05 [partial_unused_args_eliminate]: 1.86e-06 [add_recomputation]: 0.00011691 [cse_after_recomputation]: 5.044e-05, [1] [Cycle 1]: 4.574e-05, [1] [cse]: 3.996e-05 [environ_conv]: 1.171e-05 [swap_dp_allreduce_reducescatter]: 1.397e-05 [bias_add_comm_swap]: 2.62001e-06 [label_micro_interleaved_index]: 5.14e-06 [label_fine_grained_interleaved_index]: 2.61e-06 [merge_cast_opt]: 1.49e-06 [slice_recompute_activation]: 1.97001e-06 [micro_interleaved_order_control]: 2.56998e-06 [assign_add_opt]: 1.22999e-06 [ForceFp32Comm]: 7.39994e-07 [remove_cast_before_assign_add]: 1.25001e-06 [full_micro_interleaved_order_control]: 2.39001e-06 [reorder_send_recv_between_fp_bp]: 2.89999e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 1.04e-06 [interleave_split_concat_branches]: 1.37e-06 [interleave_parallel_branches]: 1.03001e-06 [overlap_opt_shard_in_pipeline]: 1.17999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.04e-06 [control_data_broadcast_order]: 3.086e-05 [grouped_pairwise_exchange_alltoall]: 1.60999e-06 [offloading_packed_experts]: 8.59e-06 [overlap_recompute_and_grad_model_parallel]: 9.88002e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.46002e-06 [overlap_recompute_comm]: 2.37001e-06 [overlap_grad_ring_attention]: 8.73001e-06 [overlap_grad_flash_sp]: 4.074e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.63e-06 [split_layernorm_comm]: 2.37999e-06 [handle_group_info]: 1.17e-06 [symbol_engine_optimizer]: 0.00014819, [1] [Cycle 1]: 0.00014408, [6] [build]: 8.62998e-06 [elim_shapecalc]: 2.312e-05 [elim_not_effective]: 3.357e-05 [opt_reshape]: 1.918e-05 [fold_const_symbol]: 3e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.85001e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 3.942e-05 [get_jit_bprop_graph]: 1.07e-06 [rewriter_after_jit_bprop_graph]: 3.70003e-06 [opt_after_jit_grad]: 0.00055334 [validate]: 0.00014107 [backend_pass]: 1.26002e-06 [task_emit]: 0.0494795 [execute]: 7.81001e-06 Sums bootstrap : 0.001330s : 0.62% type_inference : 0.145422s : 68.21% event_method : 0.000120s : 0.06% auto_monad : 0.000341s : 0.16% graph_reusing : 0.000027s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000128s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000038s : 0.02% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000133s : 0.06% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000810s : 0.38% optimize.opt_a.expand_dump_flag : 0.000010s : 0.00% optimize.opt_a.switch_simplify : 0.000376s : 0.18% optimize.opt_a.loop_unroll : 0.000185s : 0.09% optimize.opt_a.a_1 : 0.004009s : 1.88% optimize.opt_a.with_stream_mark : 0.000045s : 0.02% optimize.opt_a.recompute_prepare : 0.000046s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000025s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000023s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000022s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000897s : 0.42% optimize.opt_a.accelerated_algorithm : 0.000075s : 0.04% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000013s : 0.01% optimize.opt_a.shard_inline : 0.000038s : 0.02% optimize.opt_a.merge_send_recv : 0.000030s : 0.01% optimize.opt_a.auto_parallel : 0.000027s : 0.01% optimize.opt_a.parallel : 0.000037s : 0.02% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000024s : 0.01% optimize.opt_a.allreduce_fusion : 0.000022s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000033s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000039s : 0.02% optimize.opt_a.virtual_dataset : 0.000037s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000035s : 0.02% optimize.opt_a.virtual_output : 0.000035s : 0.02% optimize.opt_a.merge_forward : 0.000021s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000035s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000068s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000063s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000024s : 0.01% optimize.opt_a.meta_fg_expand : 0.000020s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000045s : 0.02% optimize.opt_a.a_after_grad : 0.000056s : 0.03% optimize.opt_a.renormalize : 0.004936s : 2.32% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000056s : 0.03% optimize.opt_a.cse : 0.000376s : 0.18% optimize.opt_a.a_3 : 0.000269s : 0.13% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000049s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000516s : 0.24% optimize.opt_b.b_1 : 0.000418s : 0.20% optimize.opt_b.b_2 : 0.000020s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000013s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000056s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000031s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000025s : 0.01% optimize.loop_unroll : 0.000476s : 0.22% optimize.opt_after_cconv.c_1 : 0.000088s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.cse : 0.000054s : 0.03% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000079s : 0.04% optimize.tuple_transform.d_1 : 0.000123s : 0.06% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000044s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000117s : 0.05% optimize.cse_after_recomputation.cse : 0.000040s : 0.02% optimize.environ_conv : 0.000012s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000014s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000031s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000009s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000010s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.00% optimize.overlap_grad_flash_sp : 0.000041s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000023s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000034s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000019s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000030s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000039s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000553s : 0.26% validate : 0.000141s : 0.07% backend_pass : 0.000001s : 0.00% task_emit : 0.049480s : 23.21% execute : 0.000008s : 0.00% Time group info: ------[substitution.] 0.000962 152 0.52% : 0.000005s : 3: substitution.depend_value_elim 0.48% : 0.000005s : 12: substitution.elim_not_effective 0.43% : 0.000004s : 12: substitution.fold_const_symbol 1.24% : 0.000012s : 14: substitution.graph_param_transform 86.63% : 0.000834s : 41: substitution.inline 1.11% : 0.000011s : 24: substitution.j_node_and_user_rematch 3.17% : 0.000030s : 2: substitution.less_batch_normalization 1.61% : 0.000015s : 24: substitution.remove_not_recompute_node 0.49% : 0.000005s : 4: substitution.replace_old_param 2.56% : 0.000025s : 12: substitution.switch_simplify 1.76% : 0.000017s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.145244 2 94.15% : 0.136747s : 1: type_inference.infer 5.85% : 0.008497s : 1: type_inference.specialize ------[replace.] 0.000417 60 4.54% : 0.000019s : 3: replace.depend_value_elim 63.55% : 0.000265s : 41: replace.inline 24.50% : 0.000102s : 12: replace.switch_simplify 7.41% : 0.000031s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000850 60 0.37% : 0.000003s : 3: match.depend_value_elim 95.88% : 0.000815s : 41: match.inline 2.09% : 0.000018s : 12: match.switch_simplify 1.66% : 0.000014s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000962 6478 1.10% : 0.000011s : 77: predicate.accumulaten_eliminater 0.44% : 0.000004s : 14: predicate.ad_related_special_op_eliminate 0.67% : 0.000006s : 48: predicate.addn_check_dump 1.09% : 0.000011s : 77: predicate.addn_zero_filter 1.15% : 0.000011s : 77: predicate.adjust_all_reduce_mul_add 2.46% : 0.000024s : 125: predicate.arithmetic_simplify 1.13% : 0.000011s : 77: predicate.cast_eliminate 0.42% : 0.000004s : 28: predicate.check_bprop_eliminate 0.67% : 0.000006s : 48: predicate.compare_switch_simplify 0.11% : 0.000001s : 14: predicate.const_output_eliminate 0.72% : 0.000007s : 47: predicate.depend_value_elim 1.18% : 0.000011s : 77: predicate.dict_get_item_const_eliminator 1.36% : 0.000013s : 77: predicate.dict_get_item_eliminator 1.20% : 0.000012s : 77: predicate.dict_set_item_eliminator 0.49% : 0.000005s : 28: predicate.dumpgradient_eliminate 0.11% : 0.000001s : 14: predicate.elim_not_effective 0.24% : 0.000002s : 14: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000012s : 91: predicate.environ_add_const_eliminate 1.24% : 0.000012s : 91: predicate.environ_get_add_eliminate 1.26% : 0.000012s : 91: predicate.environ_get_depend_swap 1.99% : 0.000019s : 139: predicate.environ_get_eliminate 1.30% : 0.000012s : 91: predicate.environ_get_set_eliminate 1.75% : 0.000017s : 122: predicate.exchange_switch_depend_value 2.28% : 0.000022s : 122: predicate.float_depend_g_call 0.66% : 0.000006s : 48: predicate.float_environ_get_switch 0.87% : 0.000008s : 62: predicate.float_tuple_getitem_switch 0.11% : 0.000001s : 14: predicate.fold_const_symbol 0.44% : 0.000004s : 28: predicate.get_grad_eliminate 0.13% : 0.000001s : 14: predicate.graph_param_transform 0.68% : 0.000007s : 48: predicate.incorporate_call 0.64% : 0.000006s : 48: predicate.incorporate_call_switch 5.82% : 0.000056s : 303: predicate.inline 0.54% : 0.000005s : 28: predicate.inline_without_move 0.20% : 0.000002s : 28: predicate.j_node_and_user_rematch 0.55% : 0.000005s : 28: predicate.less_batch_normalization 1.61% : 0.000015s : 109: predicate.list_to_tuple_eliminator_ 2.61% : 0.000025s : 186: predicate.load_eliminater 0.51% : 0.000005s : 14: predicate.loop_unroll_after_grad 3.18% : 0.000031s : 201: predicate.loop_unroll_before_grad 1.58% : 0.000015s : 105: predicate.make_slice_get_slice_eliminator 0.69% : 0.000007s : 48: predicate.merge_addn 0.41% : 0.000004s : 28: predicate.micro_step_allgather_replace 0.43% : 0.000004s : 28: predicate.mini_step_allgather_replace 1.08% : 0.000010s : 77: predicate.minmaximum_grad 0.50% : 0.000005s : 14: predicate.mutable_eliminate 0.23% : 0.000002s : 14: predicate.opt_reshape 0.23% : 0.000002s : 14: predicate.parallel_virtual_node 2.34% : 0.000022s : 122: predicate.partial_defer_inline 1.48% : 0.000014s : 95: predicate.partial_eliminate 1.11% : 0.000011s : 77: predicate.print_const_string_wrapper 0.59% : 0.000006s : 42: predicate.reduce_all_const_elim 1.47% : 0.000014s : 77: predicate.reduce_eliminate 2.56% : 0.000025s : 186: predicate.redundant_stop_gradient_eliminater 0.22% : 0.000002s : 28: predicate.remove_not_recompute_node 1.09% : 0.000010s : 109: predicate.replace_applicator 0.24% : 0.000002s : 28: predicate.replace_old_param 0.12% : 0.000001s : 14: predicate.reset_defer_inline 1.09% : 0.000010s : 77: predicate.reshape_eliminate 0.43% : 0.000004s : 28: predicate.row_tensor_add_zeros_like 0.29% : 0.000003s : 14: predicate.row_tensor_eliminate 0.54% : 0.000005s : 28: predicate.same_eliminate 0.29% : 0.000003s : 34: predicate.set_cell_output_no_recompute 0.46% : 0.000004s : 28: predicate.shard_identity_eliminate 0.48% : 0.000005s : 28: predicate.special_op_eliminate 0.78% : 0.000008s : 48: predicate.specialize_transform 0.46% : 0.000004s : 28: predicate.split_environ_get_set_with_tuple_value 0.49% : 0.000005s : 28: predicate.stack_unstack_eliminate 0.22% : 0.000002s : 14: predicate.switch_call_monad_eliminater 1.91% : 0.000018s : 122: predicate.switch_defer_inline 2.46% : 0.000024s : 150: predicate.switch_layer_defer_inline 6.74% : 0.000065s : 409: predicate.switch_simplify 1.08% : 0.000010s : 77: predicate.tile_eliminate 1.16% : 0.000011s : 77: predicate.transpose_eliminate 1.66% : 0.000016s : 105: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000015s : 105: predicate.tuple_list_get_item_const_eliminator 1.52% : 0.000015s : 105: predicate.tuple_list_get_item_depend_reorder 2.70% : 0.000026s : 157: predicate.tuple_list_get_item_eliminator 1.80% : 0.000017s : 105: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000023s : 153: predicate.tuple_list_set_item_eliminator 1.56% : 0.000015s : 109: predicate.tuple_to_list_eliminator_ 2.52% : 0.000024s : 186: predicate.updatestate_pure_node_eliminater 3.22% : 0.000031s : 234: predicate.updatestate_useless_node_eliminater 0.22% : 0.000002s : 14: predicate.value_based_eliminate 0.46% : 0.000004s : 28: predicate.virtual_dataset_eliminate 0.43% : 0.000004s : 28: predicate.virtual_output_eliminate 0.20% : 0.000002s : 14: predicate.virtual_view_grad_eliminate 0.30% : 0.000003s : 14: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.006673 85 60.95% : 0.004067s : 36: func_graph_cloner_run.FuncGraphClonerGraph 39.05% : 0.002606s : 49: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.251741 213 0.00% : 0.000004s : 1: ForceFp32Comm 1.84% : 0.004638s : 1: add_attr 1.84% : 0.004628s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000121s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.14% : 0.000353s : 1: auto_monad 0.02% : 0.000044s : 1: auto_monad_reorder 0.00% : 0.000008s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.58% : 0.001451s : 1: bootstrap 0.01% : 0.000029s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000034s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000054s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000015s : 1: environ_conv 0.05% : 0.000129s : 1: event_method 0.01% : 0.000013s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000032s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.19% : 0.000485s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.21% : 0.000525s : 1: mutable_eliminate 0.00% : 0.000012s : 1: offloading_packed_experts 0.01% : 0.000029s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000030s : 1: opt.transform.mutable_eliminate 2.45% : 0.006168s : 95: opt.transform.opt_a 0.03% : 0.000087s : 1: opt.transform.opt_after_cconv 0.02% : 0.000062s : 1: opt.transform.opt_after_jit_grad 0.16% : 0.000410s : 28: opt.transform.opt_b 0.07% : 0.000164s : 2: opt.transform.opt_trans_graph 0.04% : 0.000102s : 4: opt.transform.symbol_engine_opt 5.00% : 0.012597s : 1: opt_a 0.09% : 0.000221s : 1: opt_after_cconv 0.22% : 0.000563s : 1: opt_after_jit_grad 0.23% : 0.000571s : 1: opt_b 6.48% : 0.016301s : 1: optimize 0.01% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000044s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000013s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000008s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.06% : 0.000140s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000084s : 1: remove_dup_value 0.93% : 0.002346s : 1: renormalize.infer 1.03% : 0.002581s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000053s : 1: rewriter_after_opt_a 0.33% : 0.000820s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000017s : 1: swap_dp_allreduce_reducescatter 0.06% : 0.000151s : 1: symbol_engine_optimizer 19.66% : 0.049493s : 1: task_emit 0.08% : 0.000197s : 1: tuple_transform 57.77% : 0.145441s : 1: type_inference 0.09% : 0.000225s : 1: validate TotalTime = 0.310182, [24] [bootstrap]: 0.00062237 [type_inference]: 0.138018 [event_method]: 0.00012241 [auto_monad]: 0.00040363 [graph_reusing]: 3.287e-05 [inline]: 2.11998e-06 [add_attr]: 0.00380239, [1] [add_attr_with_inline]: 0.00379287, [1] [Cycle 1]: 0.0002014, [2] [tag_attr]: 0.00012166 [meta_addattr_fg_expand]: 4.552e-05 [parallel-infer-symbol]: 3.18e-06 [pre_auto_parallel]: 0.00016969 [insert-virtual-dataset]: 2.52001e-06 [parallel-infer-symbol-second]: 1.12999e-06 [dataset_repeat_opt]: 2.05002e-06 [pipeline_split]: 1.85001e-06 [optimize]: 0.0814482, [53] [py_interpret_to_execute]: 4.74e-06 [rewriter_before_opt_a]: 0.00099103 [opt_a]: 0.0769578, [3] [Cycle 1]: 0.0588843, [45] [expand_dump_flag]: 1.115e-05 [switch_simplify]: 0.0004388 [loop_unroll]: 0.00020193 [a_1]: 0.00443433 [with_stream_mark]: 3.634e-05 [recompute_prepare]: 4.011e-05 [updatestate_depend_eliminate]: 3.628e-05 [updatestate_assign_eliminate]: 1.861e-05 [updatestate_loads_eliminate]: 1.765e-05 [parameter_eliminate]: 3.23e-06 [a_2]: 0.00095412 [accelerated_algorithm]: 4.982e-05 [shard]: 1.89999e-06 [meta_shard_fg_expand]: 1.156e-05 [shard_inline]: 2.752e-05 [merge_send_recv]: 4.263e-05 [auto_parallel]: 2.158e-05 [parallel]: 1.92e-05 [flash_sp]: 1.567e-05 [merge_comm]: 1.892e-05 [allreduce_fusion]: 1.785e-05 [matmul_add_comm_reduction]: 3.718e-05 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 3.166e-05 [virtual_dataset]: 2.84e-05 [get_grad_eliminate_]: 2.887e-05 [virtual_output]: 2.74e-05 [merge_forward]: 2.163e-05 [cell_reuse_recompute_pass]: 1.53002e-06 [offload_activation]: 2.75e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.284e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 4.959e-05 [set_forward_comm_id_for_comm_node_pass]: 2.406e-05 [meta_fg_expand]: 0.00770982 [flash_sp_send_recv_attached]: 4.66002e-06 [receive_attached]: 2.14e-06 [after_resolve]: 0.00030459 [a_after_grad]: 0.00044218 [renormalize]: 0.0370371 [add_forward_monad_depend]: 6.167e-05 [auto_monad_grad]: 3.096e-05 [auto_monad_eliminator]: 0.00032981 [cse]: 0.00099101 [a_3]: 0.00486868 [Cycle 2]: 0.0158186, [45] [expand_dump_flag]: 7.6e-06 [switch_simplify]: 0.00030917 [loop_unroll]: 0.00035832 [a_1]: 0.0071829 [with_stream_mark]: 2.801e-05 [recompute_prepare]: 3.033e-05 [updatestate_depend_eliminate]: 1.664e-05 [updatestate_assign_eliminate]: 1.478e-05 [updatestate_loads_eliminate]: 1.449e-05 [parameter_eliminate]: 1.54998e-06 [a_2]: 0.00042628 [accelerated_algorithm]: 5.871e-05 [shard]: 1.07998e-06 [meta_shard_fg_expand]: 1.046e-05 [shard_inline]: 2.801e-05 [merge_send_recv]: 4.072e-05 [auto_parallel]: 1.93e-05 [parallel]: 5.74999e-06 [flash_sp]: 3.33998e-06 [merge_comm]: 1.723e-05 [allreduce_fusion]: 1.664e-05 [matmul_add_comm_reduction]: 2.245e-05 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 2.845e-05 [virtual_dataset]: 2.696e-05 [get_grad_eliminate_]: 2.634e-05 [virtual_output]: 2.679e-05 [merge_forward]: 1.441e-05 [cell_reuse_recompute_pass]: 1.57999e-06 [offload_activation]: 2.255e-05 [cell_reuse_handle_not_recompute_node_pass]: 5.316e-05 [merge_recompute_call_nodes]: 1.05999e-06 [before_grad]: 4.842e-05 [set_forward_comm_id_for_comm_node_pass]: 1.776e-05 [meta_fg_expand]: 0.00085476 [flash_sp_send_recv_attached]: 1.87999e-06 [receive_attached]: 1.45001e-06 [after_resolve]: 3.686e-05 [a_after_grad]: 4.459e-05 [renormalize]: 0.00529295 [add_forward_monad_depend]: 5.37001e-06 [auto_monad_grad]: 1.33002e-06 [auto_monad_eliminator]: 3.835e-05 [cse]: 0.00019366 [a_3]: 0.00017756 [Cycle 3]: 0.00223786, [45] [expand_dump_flag]: 1.55001e-06 [switch_simplify]: 2.605e-05 [loop_unroll]: 2.329e-05 [a_1]: 0.00077918 [with_stream_mark]: 2.183e-05 [recompute_prepare]: 2.474e-05 [updatestate_depend_eliminate]: 1.343e-05 [updatestate_assign_eliminate]: 1.198e-05 [updatestate_loads_eliminate]: 1.243e-05 [parameter_eliminate]: 1.00001e-06 [a_2]: 0.00036362 [accelerated_algorithm]: 3.289e-05 [shard]: 1.22e-06 [meta_shard_fg_expand]: 6.38e-06 [shard_inline]: 2.376e-05 [merge_send_recv]: 1.671e-05 [auto_parallel]: 1.637e-05 [parallel]: 4.34997e-06 [flash_sp]: 1.03001e-06 [merge_comm]: 1.431e-05 [allreduce_fusion]: 1.368e-05 [matmul_add_comm_reduction]: 1.79e-05 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 2.501e-05 [virtual_dataset]: 2.303e-05 [get_grad_eliminate_]: 2.292e-05 [virtual_output]: 2.315e-05 [merge_forward]: 1.289e-05 [cell_reuse_recompute_pass]: 1.52001e-06 [offload_activation]: 1.876e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.519e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 4.065e-05 [set_forward_comm_id_for_comm_node_pass]: 1.467e-05 [meta_fg_expand]: 1.103e-05 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 1.05999e-06 [after_resolve]: 2.263e-05 [a_after_grad]: 3.714e-05 [renormalize]: 7.99773e-08 [add_forward_monad_depend]: 1.58002e-06 [auto_monad_grad]: 1.12999e-06 [auto_monad_eliminator]: 2.658e-05 [cse]: 0.00010361 [a_3]: 0.00016436 [py_interpret_to_execute_after_opt_a]: 4.62998e-06 [slice_cell_reuse_recomputed_activation]: 2.05002e-06 [rewriter_after_opt_a]: 6.922e-05 [convert_after_rewriter]: 1.19e-06 [order_py_execute_after_rewriter]: 1.38002e-06 [mutable_eliminate]: 0.00056648 [opt_b]: 0.00076414, [1] [Cycle 1]: 0.00075752, [7] [b_1]: 0.00057326 [b_2]: 2.543e-05 [updatestate_depend_eliminate]: 1.655e-05 [updatestate_assign_eliminate]: 1.199e-05 [updatestate_loads_eliminate]: 1.184e-05 [renormalize]: 4.59986e-07 [cse]: 8.098e-05 [optimize_parallel_all_gather_comm]: 3.686e-05 [overlap_param_gather]: 2.14e-06 [cconv]: 2.515e-05 [loop_unroll]: 0.00050003 [opt_after_cconv]: 0.00028633, [1] [Cycle 1]: 0.00027999, [7] [c_1]: 0.00012324 [parameter_eliminate]: 2.63e-06 [updatestate_depend_eliminate]: 1.735e-05 [updatestate_assign_eliminate]: 1.23e-05 [updatestate_loads_eliminate]: 1.175e-05 [cse]: 7.778e-05 [renormalize]: 2.69996e-07 [remove_dup_value]: 0.00013746 [tuple_transform]: 0.00028902, [1] [Cycle 1]: 0.00028365, [4] [d_1]: 0.00023575 [none_parameter_eliminate]: 2.22999e-06 [renormalize]: 2.30008e-07 [switch_simplify]: 2.483e-05 [partial_unused_args_eliminate]: 1.74e-06 [add_recomputation]: 0.00011156 [cse_after_recomputation]: 6.653e-05, [1] [Cycle 1]: 6.073e-05, [1] [cse]: 5.443e-05 [environ_conv]: 1.734e-05 [swap_dp_allreduce_reducescatter]: 1.874e-05 [bias_add_comm_swap]: 2.59999e-06 [label_micro_interleaved_index]: 4.70999e-06 [label_fine_grained_interleaved_index]: 2.75997e-06 [merge_cast_opt]: 1.42e-06 [slice_recompute_activation]: 2.37001e-06 [micro_interleaved_order_control]: 2.34999e-06 [assign_add_opt]: 1.47999e-06 [ForceFp32Comm]: 7.40023e-07 [remove_cast_before_assign_add]: 1.22e-06 [full_micro_interleaved_order_control]: 2.09e-06 [reorder_send_recv_between_fp_bp]: 2.61e-06 [comm_op_add_attrs]: 1.02e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.09998e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.45001e-06 [overlap_opt_shard_grad_in_pipeline]: 2.11e-06 [control_data_broadcast_order]: 3.812e-05 [grouped_pairwise_exchange_alltoall]: 1.91e-06 [offloading_packed_experts]: 9.69e-06 [overlap_recompute_and_grad_model_parallel]: 1.157e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.19e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37e-06 [overlap_recompute_comm]: 2.18002e-06 [overlap_grad_ring_attention]: 1.012e-05 [overlap_grad_flash_sp]: 5.406e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.29001e-06 [split_layernorm_comm]: 1.70001e-06 [handle_group_info]: 9.79984e-07 [symbol_engine_optimizer]: 0.00018407, [1] [Cycle 1]: 0.0001795, [6] [build]: 1.306e-05 [elim_shapecalc]: 2.927e-05 [elim_not_effective]: 4.367e-05 [opt_reshape]: 2.382e-05 [fold_const_symbol]: 4.045e-05 [renormalize]: 2.29978e-07 [detach_backward]: 2.49999e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 4.654e-05 [get_jit_bprop_graph]: 1.39e-06 [rewriter_after_jit_bprop_graph]: 3.49001e-06 [opt_after_jit_grad]: 0.00057749 [validate]: 7.536e-05 [backend_pass]: 1.22e-06 [task_emit]: 0.084547 [execute]: 7.41999e-06 Sums bootstrap : 0.000622s : 0.20% type_inference : 0.138018s : 45.25% event_method : 0.000122s : 0.04% auto_monad : 0.000404s : 0.13% graph_reusing : 0.000033s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000122s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000046s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000170s : 0.06% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000991s : 0.32% optimize.opt_a.expand_dump_flag : 0.000020s : 0.01% optimize.opt_a.switch_simplify : 0.000774s : 0.25% optimize.opt_a.loop_unroll : 0.000584s : 0.19% optimize.opt_a.a_1 : 0.012396s : 4.06% optimize.opt_a.with_stream_mark : 0.000086s : 0.03% optimize.opt_a.recompute_prepare : 0.000095s : 0.03% optimize.opt_a.updatestate_depend_eliminate : 0.000066s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000045s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000045s : 0.01% optimize.opt_a.parameter_eliminate : 0.000006s : 0.00% optimize.opt_a.a_2 : 0.001744s : 0.57% optimize.opt_a.accelerated_algorithm : 0.000141s : 0.05% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000028s : 0.01% optimize.opt_a.shard_inline : 0.000079s : 0.03% optimize.opt_a.merge_send_recv : 0.000100s : 0.03% optimize.opt_a.auto_parallel : 0.000057s : 0.02% optimize.opt_a.parallel : 0.000029s : 0.01% optimize.opt_a.flash_sp : 0.000020s : 0.01% optimize.opt_a.merge_comm : 0.000050s : 0.02% optimize.opt_a.allreduce_fusion : 0.000048s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000078s : 0.03% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000085s : 0.03% optimize.opt_a.virtual_dataset : 0.000078s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000078s : 0.03% optimize.opt_a.virtual_output : 0.000077s : 0.03% optimize.opt_a.merge_forward : 0.000049s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000005s : 0.00% optimize.opt_a.offload_activation : 0.000069s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000151s : 0.05% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000139s : 0.05% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000056s : 0.02% optimize.opt_a.meta_fg_expand : 0.008576s : 2.81% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.00% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000364s : 0.12% optimize.opt_a.a_after_grad : 0.000524s : 0.17% optimize.opt_a.renormalize : 0.042330s : 13.88% optimize.opt_a.add_forward_monad_depend : 0.000069s : 0.02% optimize.opt_a.auto_monad_grad : 0.000033s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000395s : 0.13% optimize.opt_a.cse : 0.001288s : 0.42% optimize.opt_a.a_3 : 0.005211s : 1.71% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000069s : 0.02% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000566s : 0.19% optimize.opt_b.b_1 : 0.000573s : 0.19% optimize.opt_b.b_2 : 0.000025s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000017s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000012s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000012s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000081s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000037s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000025s : 0.01% optimize.loop_unroll : 0.000500s : 0.16% optimize.opt_after_cconv.c_1 : 0.000123s : 0.04% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000017s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000012s : 0.00% optimize.opt_after_cconv.cse : 0.000078s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000137s : 0.05% optimize.tuple_transform.d_1 : 0.000236s : 0.08% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000025s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000112s : 0.04% optimize.cse_after_recomputation.cse : 0.000054s : 0.02% optimize.environ_conv : 0.000017s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000019s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000038s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000010s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000010s : 0.00% optimize.overlap_grad_flash_sp : 0.000054s : 0.02% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000013s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000029s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000044s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000024s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000040s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000047s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000577s : 0.19% validate : 0.000075s : 0.02% backend_pass : 0.000001s : 0.00% task_emit : 0.084547s : 27.72% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.003328 689 0.43% : 0.000014s : 6: substitution.addn_check_dump 1.59% : 0.000053s : 7: substitution.addn_zero_filter 0.46% : 0.000015s : 7: substitution.adjust_all_reduce_mul_add 0.14% : 0.000005s : 3: substitution.depend_value_elim 0.21% : 0.000007s : 18: substitution.elim_not_effective 0.26% : 0.000009s : 13: substitution.float_depend_g_call 0.31% : 0.000010s : 7: substitution.float_tuple_getitem_switch 0.18% : 0.000006s : 18: substitution.fold_const_symbol 0.48% : 0.000016s : 20: substitution.graph_param_transform 0.12% : 0.000004s : 4: substitution.incorporate_call 0.10% : 0.000003s : 4: substitution.incorporate_call_switch 71.27% : 0.002372s : 94: substitution.inline 1.83% : 0.000061s : 11: substitution.inline_without_move 0.73% : 0.000024s : 58: substitution.j_node_and_user_rematch 1.38% : 0.000046s : 15: substitution.less_batch_normalization 0.46% : 0.000015s : 6: substitution.merge_addn 0.62% : 0.000021s : 14: substitution.minmaximum_grad 0.31% : 0.000010s : 13: substitution.partial_eliminate 1.03% : 0.000034s : 58: substitution.remove_not_recompute_node 5.93% : 0.000197s : 109: substitution.replace_applicator 0.75% : 0.000025s : 53: substitution.replace_old_param 0.08% : 0.000003s : 1: substitution.set_cell_output_no_recompute 0.88% : 0.000029s : 15: substitution.switch_simplify 2.89% : 0.000096s : 16: substitution.tuple_list_convert_item_index_to_positive 0.65% : 0.000022s : 16: substitution.tuple_list_get_item_const_eliminator 1.00% : 0.000033s : 16: substitution.tuple_list_get_item_depend_reorder 5.04% : 0.000168s : 71: substitution.tuple_list_get_item_eliminator 0.90% : 0.000030s : 16: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.137709 2 92.88% : 0.127902s : 1: type_inference.infer 7.12% : 0.009807s : 1: type_inference.specialize ------[replace.] 0.001453 167 1.55% : 0.000023s : 3: replace.depend_value_elim 50.05% : 0.000727s : 94: replace.inline 6.49% : 0.000094s : 6: replace.replace_applicator 8.77% : 0.000127s : 15: replace.switch_simplify 33.14% : 0.000481s : 49: replace.tuple_list_get_item_eliminator ------[match.] 0.002457 167 0.12% : 0.000003s : 3: match.depend_value_elim 94.41% : 0.002320s : 94: match.inline 0.93% : 0.000023s : 6: match.replace_applicator 0.86% : 0.000021s : 15: match.switch_simplify 3.67% : 0.000090s : 49: match.tuple_list_get_item_eliminator ------[predicate.] 0.003191 22180 0.92% : 0.000029s : 213: predicate.accumulaten_eliminater 0.19% : 0.000006s : 20: predicate.ad_related_special_op_eliminate 0.41% : 0.000013s : 95: predicate.addn_check_dump 0.92% : 0.000029s : 213: predicate.addn_zero_filter 0.94% : 0.000030s : 213: predicate.adjust_all_reduce_mul_add 1.80% : 0.000058s : 308: predicate.arithmetic_simplify 0.95% : 0.000030s : 213: predicate.cast_eliminate 3.02% : 0.000096s : 717: predicate.check_bprop_eliminate 0.41% : 0.000013s : 95: predicate.compare_switch_simplify 0.05% : 0.000002s : 20: predicate.const_output_eliminate 0.42% : 0.000013s : 94: predicate.depend_value_elim 1.02% : 0.000032s : 213: predicate.dict_get_item_const_eliminator 1.17% : 0.000037s : 213: predicate.dict_get_item_eliminator 0.95% : 0.000030s : 213: predicate.dict_set_item_eliminator 0.23% : 0.000007s : 40: predicate.dumpgradient_eliminate 0.05% : 0.000002s : 20: predicate.elim_not_effective 0.10% : 0.000003s : 20: predicate.elim_shapecalc_of_broadcastargs 0.96% : 0.000031s : 233: predicate.environ_add_const_eliminate 0.97% : 0.000031s : 233: predicate.environ_get_add_eliminate 0.94% : 0.000030s : 233: predicate.environ_get_depend_swap 1.42% : 0.000045s : 328: predicate.environ_get_eliminate 0.95% : 0.000030s : 233: predicate.environ_get_set_eliminate 1.53% : 0.000049s : 356: predicate.exchange_switch_depend_value 2.08% : 0.000066s : 356: predicate.float_depend_g_call 0.41% : 0.000013s : 95: predicate.float_environ_get_switch 0.51% : 0.000016s : 115: predicate.float_tuple_getitem_switch 0.04% : 0.000001s : 20: predicate.fold_const_symbol 0.31% : 0.000010s : 66: predicate.get_grad_eliminate 0.06% : 0.000002s : 20: predicate.graph_param_transform 0.40% : 0.000013s : 95: predicate.incorporate_call 0.39% : 0.000012s : 95: predicate.incorporate_call_switch 5.20% : 0.000166s : 770: predicate.inline 1.60% : 0.000051s : 273: predicate.inline_without_move 0.14% : 0.000004s : 66: predicate.j_node_and_user_rematch 0.38% : 0.000012s : 66: predicate.less_batch_normalization 1.27% : 0.000040s : 302: predicate.list_to_tuple_eliminator_ 2.19% : 0.000070s : 515: predicate.load_eliminater 0.21% : 0.000007s : 20: predicate.loop_unroll_after_grad 2.63% : 0.000084s : 575: predicate.loop_unroll_before_grad 1.08% : 0.000035s : 253: predicate.make_slice_get_slice_eliminator 0.43% : 0.000014s : 95: predicate.merge_addn 2.86% : 0.000091s : 690: predicate.micro_step_allgather_replace 2.89% : 0.000092s : 690: predicate.mini_step_allgather_replace 0.89% : 0.000028s : 213: predicate.minmaximum_grad 0.20% : 0.000006s : 20: predicate.mutable_eliminate 0.10% : 0.000003s : 20: predicate.opt_reshape 0.10% : 0.000003s : 20: predicate.parallel_virtual_node 2.27% : 0.000072s : 356: predicate.partial_defer_inline 1.34% : 0.000043s : 282: predicate.partial_eliminate 0.92% : 0.000029s : 213: predicate.print_const_string_wrapper 0.39% : 0.000013s : 89: predicate.reduce_all_const_elim 1.15% : 0.000037s : 213: predicate.reduce_eliminate 2.11% : 0.000067s : 515: predicate.redundant_stop_gradient_eliminater 0.15% : 0.000005s : 66: predicate.remove_not_recompute_node 2.51% : 0.000080s : 964: predicate.replace_applicator 0.64% : 0.000020s : 273: predicate.replace_old_param 0.05% : 0.000002s : 20: predicate.reset_defer_inline 0.92% : 0.000029s : 213: predicate.reshape_eliminate 3.01% : 0.000096s : 690: predicate.row_tensor_add_zeros_like 0.10% : 0.000003s : 20: predicate.row_tensor_eliminate 3.23% : 0.000103s : 717: predicate.same_eliminate 0.18% : 0.000006s : 72: predicate.set_cell_output_no_recompute 0.32% : 0.000010s : 66: predicate.shard_identity_eliminate 0.20% : 0.000006s : 40: predicate.special_op_eliminate 0.47% : 0.000015s : 95: predicate.specialize_transform 2.92% : 0.000093s : 690: predicate.split_environ_get_set_with_tuple_value 1.31% : 0.000042s : 273: predicate.stack_unstack_eliminate 0.09% : 0.000003s : 20: predicate.switch_call_monad_eliminater 1.70% : 0.000054s : 356: predicate.switch_defer_inline 4.66% : 0.000149s : 1073: predicate.switch_layer_defer_inline 5.12% : 0.000163s : 1076: predicate.switch_simplify 0.97% : 0.000031s : 213: predicate.tile_eliminate 0.91% : 0.000029s : 213: predicate.transpose_eliminate 1.20% : 0.000038s : 253: predicate.tuple_list_convert_item_index_to_positive 1.26% : 0.000040s : 253: predicate.tuple_list_get_item_const_eliminator 1.18% : 0.000038s : 253: predicate.tuple_list_get_item_depend_reorder 2.15% : 0.000069s : 397: predicate.tuple_list_get_item_eliminator 1.26% : 0.000040s : 253: predicate.tuple_list_get_set_item_eliminator 1.82% : 0.000058s : 348: predicate.tuple_list_set_item_eliminator 1.28% : 0.000041s : 302: predicate.tuple_to_list_eliminator_ 2.08% : 0.000066s : 515: predicate.updatestate_pure_node_eliminater 2.48% : 0.000079s : 610: predicate.updatestate_useless_node_eliminater 0.09% : 0.000003s : 20: predicate.value_based_eliminate 0.32% : 0.000010s : 66: predicate.virtual_dataset_eliminate 0.33% : 0.000011s : 66: predicate.virtual_output_eliminate 0.10% : 0.000003s : 20: predicate.virtual_view_grad_eliminate 0.11% : 0.000004s : 20: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.011475 208 63.44% : 0.007280s : 95: func_graph_cloner_run.FuncGraphClonerGraph 0.37% : 0.000042s : 1: func_graph_cloner_run.FuncGraphClonerNode 36.19% : 0.004153s : 112: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.461212 262 0.00% : 0.000003s : 1: ForceFp32Comm 0.83% : 0.003807s : 1: add_attr 0.82% : 0.003797s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.03% : 0.000116s : 1: add_recomputation 0.00% : 0.000005s : 1: assign_add_opt 0.09% : 0.000416s : 1: auto_monad 0.01% : 0.000051s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.14% : 0.000657s : 1: bootstrap 0.01% : 0.000029s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000041s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.02% : 0.000070s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000020s : 1: environ_conv 0.03% : 0.000131s : 1: event_method 0.00% : 0.000013s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000038s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.11% : 0.000509s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.12% : 0.000575s : 1: mutable_eliminate 0.00% : 0.000013s : 1: offloading_packed_experts 0.01% : 0.000037s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000038s : 1: opt.transform.mutable_eliminate 4.84% : 0.022331s : 142: opt.transform.opt_a 0.03% : 0.000122s : 1: opt.transform.opt_after_cconv 0.02% : 0.000080s : 1: opt.transform.opt_after_jit_grad 0.12% : 0.000566s : 28: opt.transform.opt_b 0.06% : 0.000258s : 2: opt.transform.opt_trans_graph 0.03% : 0.000134s : 4: opt.transform.symbol_engine_opt 16.69% : 0.076962s : 1: opt_a 0.06% : 0.000290s : 1: opt_after_cconv 0.13% : 0.000588s : 1: opt_after_jit_grad 0.17% : 0.000768s : 1: opt_b 17.66% : 0.081453s : 1: optimize 0.01% : 0.000041s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000057s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000013s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000014s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000178s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000144s : 1: remove_dup_value 6.62% : 0.030511s : 2: renormalize.infer 2.56% : 0.011800s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000073s : 1: rewriter_after_opt_a 0.22% : 0.001003s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000022s : 1: swap_dp_allreduce_reducescatter 0.04% : 0.000187s : 1: symbol_engine_optimizer 18.33% : 0.084561s : 1: task_emit 0.06% : 0.000293s : 1: tuple_transform 29.93% : 0.138037s : 1: type_inference 0.03% : 0.000153s : 1: validate group_cases_19 have all been run, results of sub cases are below: case: (1,) {} pass. case: ('pynative',) {} pass. case: (1,) {} pass. case: ('PYNATIVE_MODE',) {} pass. case: (0, ) {} pass. case: ('KBK',) {} pass. case: ('GRAPH_MODE_O0',) {} pass. case: (0,) {} pass. ops group_cases_20 with 8 cases start to running, all cases are below: case: (, 0, ) case: (, 1, ) case: (, 1, ) case: (, 0, ) case: (, 1, ) case: (, 0, ) case: (, 0, ) case: (, 1, ) ops group_cases_20 total running memory: 32M, memory threshold: 51200M TotalTime = 2.39535, [24] [bootstrap]: 0.00091249 [type_inference]: 0.1391 [event_method]: 0.00030838 [auto_monad]: 0.00022438 [graph_reusing]: 1.003e-05 [inline]: 2.16998e-06 [add_attr]: 0.00724842, [1] [add_attr_with_inline]: 0.00723613, [1] [Cycle 1]: 0.00015021, [2] [tag_attr]: 4.756e-05 [meta_addattr_fg_expand]: 2.076e-05 [parallel-infer-symbol]: 3.61001e-06 [pre_auto_parallel]: 7.11e-05 [insert-virtual-dataset]: 2.64001e-06 [parallel-infer-symbol-second]: 7.60017e-07 [dataset_repeat_opt]: 2.14e-06 [pipeline_split]: 1.68002e-06 [optimize]: 0.00738535, [53] [py_interpret_to_execute]: 3.28998e-06 [rewriter_before_opt_a]: 0.00030459 [opt_a]: 0.00495312, [2] [Cycle 1]: 0.00439685, [45] [expand_dump_flag]: 5.00999e-06 [switch_simplify]: 0.00019742 [loop_unroll]: 5.71e-05 [a_1]: 0.00114348 [with_stream_mark]: 1.521e-05 [recompute_prepare]: 7.40998e-06 [updatestate_depend_eliminate]: 1.359e-05 [updatestate_assign_eliminate]: 1.2e-05 [updatestate_loads_eliminate]: 3.01001e-06 [parameter_eliminate]: 1.96998e-06 [a_2]: 7.34e-05 [accelerated_algorithm]: 6.29001e-06 [shard]: 2.26998e-06 [meta_shard_fg_expand]: 2.68e-06 [shard_inline]: 5.56998e-06 [merge_send_recv]: 4.754e-05 [auto_parallel]: 5.99999e-06 [parallel]: 0.00010544 [flash_sp]: 3.497e-05 [merge_comm]: 3.83999e-06 [allreduce_fusion]: 1.229e-05 [matmul_add_comm_reduction]: 1.941e-05 [allreduce_slice_to_reducescatter]: 9.85002e-06 [virtual_shard_identity]: 8.38999e-06 [virtual_dataset]: 6.07001e-06 [get_grad_eliminate_]: 5.76e-06 [virtual_output]: 5.60001e-06 [merge_forward]: 3.97e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 2.862e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.287e-05 [merge_recompute_call_nodes]: 1.44e-06 [before_grad]: 1.005e-05 [set_forward_comm_id_for_comm_node_pass]: 1.329e-05 [meta_fg_expand]: 4.03999e-06 [flash_sp_send_recv_attached]: 2.43998e-06 [receive_attached]: 2.867e-05 [after_resolve]: 9.92001e-06 [a_after_grad]: 8.46002e-06 [renormalize]: 0.00206929 [add_forward_monad_depend]: 4.97999e-06 [auto_monad_grad]: 2.04999e-06 [auto_monad_eliminator]: 2.781e-05 [cse]: 5.957e-05 [a_3]: 4.089e-05 [Cycle 2]: 0.00054665, [45] [expand_dump_flag]: 1.00001e-06 [switch_simplify]: 6.67002e-06 [loop_unroll]: 5.40999e-06 [a_1]: 9.611e-05 [with_stream_mark]: 1.076e-05 [recompute_prepare]: 5.44e-06 [updatestate_depend_eliminate]: 3.21001e-06 [updatestate_assign_eliminate]: 2.35002e-06 [updatestate_loads_eliminate]: 2.96001e-06 [parameter_eliminate]: 1.05001e-06 [a_2]: 6.013e-05 [accelerated_algorithm]: 5.67001e-06 [shard]: 1.25001e-06 [meta_shard_fg_expand]: 1.42999e-06 [shard_inline]: 5.48002e-06 [merge_send_recv]: 4.66002e-06 [auto_parallel]: 5.16002e-06 [parallel]: 4.12e-06 [flash_sp]: 3.34001e-06 [merge_comm]: 2.76e-06 [allreduce_fusion]: 2.59999e-06 [matmul_add_comm_reduction]: 5.54998e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 6.11e-06 [virtual_dataset]: 5.03002e-06 [get_grad_eliminate_]: 4.93001e-06 [virtual_output]: 4.82e-06 [merge_forward]: 2.46e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 5.80002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.101e-05 [merge_recompute_call_nodes]: 7.2e-07 [before_grad]: 7.82e-06 [set_forward_comm_id_for_comm_node_pass]: 3.3e-06 [meta_fg_expand]: 1.81e-06 [flash_sp_send_recv_attached]: 8.29983e-07 [receive_attached]: 1.09e-06 [after_resolve]: 7.82e-06 [a_after_grad]: 7.4e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.22999e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 6.02999e-06 [cse]: 1.276e-05 [a_3]: 3.006e-05 [py_interpret_to_execute_after_opt_a]: 3.45998e-06 [slice_cell_reuse_recomputed_activation]: 1.88002e-06 [rewriter_after_opt_a]: 2.749e-05 [convert_after_rewriter]: 1.29998e-06 [order_py_execute_after_rewriter]: 1.14e-06 [mutable_eliminate]: 0.00051808 [opt_b]: 0.00017686, [1] [Cycle 1]: 0.00017107, [7] [b_1]: 0.00010649 [b_2]: 6.88998e-06 [updatestate_depend_eliminate]: 4.55001e-06 [updatestate_assign_eliminate]: 2.30002e-06 [updatestate_loads_eliminate]: 2.08998e-06 [renormalize]: 4.00003e-07 [cse]: 1.611e-05 [optimize_parallel_all_gather_comm]: 2.599e-05 [overlap_param_gather]: 3.157e-05 [cconv]: 2.29e-05 [loop_unroll]: 0.00043159 [opt_after_cconv]: 9.071e-05, [1] [Cycle 1]: 8.556e-05, [7] [c_1]: 2.493e-05 [parameter_eliminate]: 2.02001e-06 [updatestate_depend_eliminate]: 5.05001e-06 [updatestate_assign_eliminate]: 2.51998e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 1.72e-05 [renormalize]: 2.50002e-07 [remove_dup_value]: 1.391e-05 [tuple_transform]: 6.326e-05, [1] [Cycle 1]: 5.923e-05, [4] [d_1]: 3.378e-05 [none_parameter_eliminate]: 1.47999e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 5.64998e-06 [partial_unused_args_eliminate]: 2.46e-06 [add_recomputation]: 6.119e-05 [cse_after_recomputation]: 2.015e-05, [1] [Cycle 1]: 1.628e-05, [1] [cse]: 1.121e-05 [environ_conv]: 1.457e-05 [swap_dp_allreduce_reducescatter]: 2.676e-05 [bias_add_comm_swap]: 1.202e-05 [label_micro_interleaved_index]: 1.311e-05 [label_fine_grained_interleaved_index]: 2.75997e-06 [merge_cast_opt]: 1.40001e-06 [slice_recompute_activation]: 2.20002e-06 [micro_interleaved_order_control]: 2.83e-06 [assign_add_opt]: 1.14e-06 [ForceFp32Comm]: 7.39994e-07 [remove_cast_before_assign_add]: 1.071e-05 [full_micro_interleaved_order_control]: 1.107e-05 [reorder_send_recv_between_fp_bp]: 2.85002e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.11002e-06 [interleave_parallel_branches]: 1.063e-05 [overlap_opt_shard_in_pipeline]: 1.371e-05 [overlap_opt_shard_grad_in_pipeline]: 1.79e-06 [control_data_broadcast_order]: 1.334e-05 [grouped_pairwise_exchange_alltoall]: 1.54e-06 [offloading_packed_experts]: 3.45e-06 [overlap_recompute_and_grad_model_parallel]: 1.41e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.29e-06 [overlap_recompute_allgather_and_fa_grad]: 1.44e-06 [overlap_recompute_comm]: 2.29999e-06 [overlap_grad_ring_attention]: 2.141e-05 [overlap_grad_flash_sp]: 4.586e-05 [begin_end_overlap_inline]: 8.09989e-07 [split_matmul_comm_elemetwise]: 1.095e-05 [split_layernorm_comm]: 2.08998e-06 [handle_group_info]: 9.69972e-07 [symbol_engine_optimizer]: 7.595e-05, [1] [Cycle 1]: 7.111e-05, [6] [build]: 2.87002e-06 [elim_shapecalc]: 1.16e-05 [elim_not_effective]: 1.206e-05 [opt_reshape]: 6.30002e-06 [fold_const_symbol]: 9.43002e-06 [renormalize]: 3.00002e-07 [detach_backward]: 1.80001e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 2.109e-05 [get_jit_bprop_graph]: 1.13001e-06 [rewriter_after_jit_bprop_graph]: 2.79999e-06 [opt_after_jit_grad]: 0.00046981 [validate]: 5.493e-05 [backend_pass]: 1.35001e-06 [task_emit]: 2.23898 [execute]: 9.76e-06 Sums bootstrap : 0.000912s : 0.04% type_inference : 0.139100s : 5.83% event_method : 0.000308s : 0.01% auto_monad : 0.000224s : 0.01% graph_reusing : 0.000010s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000048s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000021s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000071s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000003s : 0.00% optimize.rewriter_before_opt_a : 0.000305s : 0.01% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000204s : 0.01% optimize.opt_a.loop_unroll : 0.000063s : 0.00% optimize.opt_a.a_1 : 0.001240s : 0.05% optimize.opt_a.with_stream_mark : 0.000026s : 0.00% optimize.opt_a.recompute_prepare : 0.000013s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000017s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000134s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.00% optimize.opt_a.merge_send_recv : 0.000052s : 0.00% optimize.opt_a.auto_parallel : 0.000011s : 0.00% optimize.opt_a.parallel : 0.000110s : 0.00% optimize.opt_a.flash_sp : 0.000038s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000015s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000010s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000014s : 0.00% optimize.opt_a.virtual_dataset : 0.000011s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000011s : 0.00% optimize.opt_a.virtual_output : 0.000010s : 0.00% optimize.opt_a.merge_forward : 0.000006s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000034s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000018s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000017s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000030s : 0.00% optimize.opt_a.after_resolve : 0.000018s : 0.00% optimize.opt_a.a_after_grad : 0.000016s : 0.00% optimize.opt_a.renormalize : 0.002069s : 0.09% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.00% optimize.opt_a.cse : 0.000072s : 0.00% optimize.opt_a.a_3 : 0.000071s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000003s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000027s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000518s : 0.02% optimize.opt_b.b_1 : 0.000106s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000016s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.00% optimize.overlap_param_gather : 0.000032s : 0.00% optimize.cconv : 0.000023s : 0.00% optimize.loop_unroll : 0.000432s : 0.02% optimize.opt_after_cconv.c_1 : 0.000025s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000017s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.00% optimize.tuple_transform.d_1 : 0.000034s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000061s : 0.00% optimize.cse_after_recomputation.cse : 0.000011s : 0.00% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000027s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000011s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000011s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000014s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000014s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000021s : 0.00% optimize.overlap_grad_flash_sp : 0.000046s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000021s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000470s : 0.02% validate : 0.000055s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.238982s : 93.81% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000338 39 0.55% : 0.000002s : 2: substitution.elim_not_effective 0.40% : 0.000001s : 2: substitution.fold_const_symbol 1.52% : 0.000005s : 3: substitution.graph_param_transform 77.94% : 0.000264s : 16: substitution.inline 0.97% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.21% : 0.000014s : 4: substitution.remove_not_recompute_node 0.96% : 0.000003s : 2: substitution.replace_old_param 6.49% : 0.000022s : 4: substitution.switch_simplify 6.96% : 0.000024s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.138987 2 97.26% : 0.135173s : 1: type_inference.infer 2.74% : 0.003814s : 1: type_inference.specialize ------[replace.] 0.000148 22 61.52% : 0.000091s : 16: replace.inline 28.50% : 0.000042s : 4: replace.switch_simplify 9.98% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000298 22 86.03% : 0.000256s : 16: match.inline 6.57% : 0.000020s : 4: match.switch_simplify 7.40% : 0.000022s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000263 1694 1.19% : 0.000003s : 22: predicate.accumulaten_eliminater 0.53% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.35% : 0.000001s : 6: predicate.addn_check_dump 1.14% : 0.000003s : 22: predicate.addn_zero_filter 1.05% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 2.28% : 0.000006s : 28: predicate.arithmetic_simplify 1.18% : 0.000003s : 22: predicate.cast_eliminate 0.37% : 0.000001s : 6: predicate.check_bprop_eliminate 0.34% : 0.000001s : 6: predicate.compare_switch_simplify 0.10% : 0.000000s : 3: predicate.const_output_eliminate 0.31% : 0.000001s : 6: predicate.depend_value_elim 1.18% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.30% : 0.000003s : 22: predicate.dict_get_item_eliminator 1.14% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.50% : 0.000001s : 6: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 3: predicate.elim_not_effective 0.29% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.36% : 0.000004s : 25: predicate.environ_add_const_eliminate 1.21% : 0.000003s : 25: predicate.environ_get_add_eliminate 1.30% : 0.000003s : 25: predicate.environ_get_depend_swap 1.59% : 0.000004s : 31: predicate.environ_get_eliminate 1.26% : 0.000003s : 25: predicate.environ_get_set_eliminate 2.10% : 0.000006s : 40: predicate.exchange_switch_depend_value 3.02% : 0.000008s : 40: predicate.float_depend_g_call 0.30% : 0.000001s : 6: predicate.float_environ_get_switch 0.42% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.10% : 0.000000s : 3: predicate.fold_const_symbol 0.41% : 0.000001s : 6: predicate.get_grad_eliminate 0.11% : 0.000000s : 3: predicate.graph_param_transform 0.34% : 0.000001s : 6: predicate.incorporate_call 0.30% : 0.000001s : 6: predicate.incorporate_call_switch 6.13% : 0.000016s : 80: predicate.inline 0.45% : 0.000001s : 6: predicate.inline_without_move 0.18% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.55% : 0.000001s : 6: predicate.less_batch_normalization 1.73% : 0.000005s : 30: predicate.list_to_tuple_eliminator_ 2.65% : 0.000007s : 52: predicate.load_eliminater 0.58% : 0.000002s : 3: predicate.loop_unroll_after_grad 3.71% : 0.000010s : 70: predicate.loop_unroll_before_grad 1.53% : 0.000004s : 28: predicate.make_slice_get_slice_eliminator 0.36% : 0.000001s : 6: predicate.merge_addn 0.33% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.33% : 0.000001s : 6: predicate.mini_step_allgather_replace 1.06% : 0.000003s : 22: predicate.minmaximum_grad 0.64% : 0.000002s : 3: predicate.mutable_eliminate 0.27% : 0.000001s : 3: predicate.opt_reshape 0.32% : 0.000001s : 3: predicate.parallel_virtual_node 2.98% : 0.000008s : 40: predicate.partial_defer_inline 1.52% : 0.000004s : 27: predicate.partial_eliminate 1.27% : 0.000003s : 22: predicate.print_const_string_wrapper 0.32% : 0.000001s : 6: predicate.reduce_all_const_elim 1.62% : 0.000004s : 22: predicate.reduce_eliminate 2.71% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.26% : 0.000001s : 6: predicate.remove_not_recompute_node 1.19% : 0.000003s : 30: predicate.replace_applicator 0.32% : 0.000001s : 6: predicate.replace_old_param 0.16% : 0.000000s : 3: predicate.reset_defer_inline 1.17% : 0.000003s : 22: predicate.reshape_eliminate 0.40% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.23% : 0.000001s : 3: predicate.row_tensor_eliminate 0.48% : 0.000001s : 6: predicate.same_eliminate 0.23% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.52% : 0.000001s : 6: predicate.shard_identity_eliminate 0.40% : 0.000001s : 6: predicate.special_op_eliminate 0.45% : 0.000001s : 6: predicate.specialize_transform 0.54% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.45% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.20% : 0.000001s : 3: predicate.switch_call_monad_eliminater 2.36% : 0.000006s : 40: predicate.switch_defer_inline 2.62% : 0.000007s : 46: predicate.switch_layer_defer_inline 7.56% : 0.000020s : 127: predicate.switch_simplify 1.25% : 0.000003s : 22: predicate.tile_eliminate 1.13% : 0.000003s : 22: predicate.transpose_eliminate 1.72% : 0.000005s : 28: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000004s : 28: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000004s : 28: predicate.tuple_list_get_item_depend_reorder 2.75% : 0.000007s : 36: predicate.tuple_list_get_item_eliminator 1.54% : 0.000004s : 28: predicate.tuple_list_get_set_item_eliminator 2.09% : 0.000005s : 34: predicate.tuple_list_set_item_eliminator 1.61% : 0.000004s : 30: predicate.tuple_to_list_eliminator_ 2.52% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 2.93% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.20% : 0.000001s : 3: predicate.value_based_eliminate 0.40% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.38% : 0.000001s : 6: predicate.virtual_output_eliminate 0.16% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.25% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001956 30 44.78% : 0.000876s : 12: func_graph_cloner_run.FuncGraphClonerGraph 55.22% : 0.001080s : 18: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.413946 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.30% : 0.007252s : 1: add_attr 0.30% : 0.007240s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000065s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000235s : 1: auto_monad 0.00% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000010s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.04% : 0.000969s : 1: bootstrap 0.00% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000023s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000098s : 1: environ_conv 0.01% : 0.000321s : 1: event_method 0.00% : 0.000022s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.02% : 0.000440s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.02% : 0.000526s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000013s : 1: opt.transform.mutable_eliminate 0.07% : 0.001791s : 78: opt.transform.opt_a 0.00% : 0.000024s : 1: opt.transform.opt_after_cconv 0.00% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000087s : 28: opt.transform.opt_b 0.00% : 0.000038s : 2: opt.transform.opt_trans_graph 0.00% : 0.000035s : 4: opt.transform.symbol_engine_opt 0.21% : 0.004956s : 1: opt_a 0.00% : 0.000094s : 1: opt_after_cconv 0.02% : 0.000478s : 1: opt_after_jit_grad 0.01% : 0.000180s : 1: opt_b 0.31% : 0.007389s : 1: optimize 0.00% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000049s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000025s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000017s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000035s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000017s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000075s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000006s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.00% : 0.000017s : 1: remove_dup_value 0.05% : 0.001150s : 1: renormalize.infer 0.04% : 0.000910s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000031s : 1: rewriter_after_opt_a 0.01% : 0.000310s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000014s : 1: split_matmul_comm_elemetwise 0.00% : 0.000031s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000079s : 1: symbol_engine_optimizer 92.75% : 2.239021s : 1: task_emit 0.00% : 0.000066s : 1: tuple_transform 5.76% : 0.139123s : 1: type_inference 0.01% : 0.000300s : 1: validate TotalTime = 2.41765, [24] [bootstrap]: 0.00091589 [type_inference]: 0.140351 [event_method]: 0.00031883 [auto_monad]: 0.0001573 [graph_reusing]: 8.01001e-06 [inline]: 1.67999e-06 [add_attr]: 0.00707773, [1] [add_attr_with_inline]: 0.00706633, [1] [Cycle 1]: 0.00012135, [2] [tag_attr]: 4.034e-05 [meta_addattr_fg_expand]: 1.469e-05 [parallel-infer-symbol]: 1.82001e-06 [pre_auto_parallel]: 5.727e-05 [insert-virtual-dataset]: 1.25001e-06 [parallel-infer-symbol-second]: 8.80013e-07 [dataset_repeat_opt]: 1.16002e-06 [pipeline_split]: 9.20001e-07 [optimize]: 0.0074245, [53] [py_interpret_to_execute]: 3.8e-06 [rewriter_before_opt_a]: 0.00027879 [opt_a]: 0.00511428, [2] [Cycle 1]: 0.00443752, [45] [expand_dump_flag]: 4.1e-06 [switch_simplify]: 0.00016257 [loop_unroll]: 5.822e-05 [a_1]: 0.00124152 [with_stream_mark]: 1.229e-05 [recompute_prepare]: 9.15001e-06 [updatestate_depend_eliminate]: 8.61002e-06 [updatestate_assign_eliminate]: 6.66e-06 [updatestate_loads_eliminate]: 2.79001e-06 [parameter_eliminate]: 1.10999e-06 [a_2]: 8.663e-05 [accelerated_algorithm]: 7.21999e-06 [shard]: 1.01002e-06 [meta_shard_fg_expand]: 2.56998e-06 [shard_inline]: 6.58e-06 [merge_send_recv]: 2.18e-05 [auto_parallel]: 6.48e-06 [parallel]: 4.067e-05 [flash_sp]: 1.511e-05 [merge_comm]: 4.35e-06 [allreduce_fusion]: 6.93e-06 [matmul_add_comm_reduction]: 9.77999e-06 [allreduce_slice_to_reducescatter]: 3.65e-06 [virtual_shard_identity]: 9.12001e-06 [virtual_dataset]: 6.91001e-06 [get_grad_eliminate_]: 6.90002e-06 [virtual_output]: 6.98998e-06 [merge_forward]: 3.17002e-06 [cell_reuse_recompute_pass]: 1.09998e-06 [offload_activation]: 1.084e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.82e-05 [merge_recompute_call_nodes]: 7.99977e-07 [before_grad]: 1.092e-05 [set_forward_comm_id_for_comm_node_pass]: 7.49002e-06 [meta_fg_expand]: 4e-06 [flash_sp_send_recv_attached]: 1.62001e-06 [receive_attached]: 7.83001e-06 [after_resolve]: 9.81e-06 [a_after_grad]: 1.026e-05 [renormalize]: 0.00224556 [add_forward_monad_depend]: 4.84003e-06 [auto_monad_grad]: 1.40001e-06 [auto_monad_eliminator]: 2.01e-05 [cse]: 3.293e-05 [a_3]: 5.011e-05 [Cycle 2]: 0.00066698, [45] [expand_dump_flag]: 9.70002e-07 [switch_simplify]: 7.93001e-06 [loop_unroll]: 6.58e-06 [a_1]: 0.00013883 [with_stream_mark]: 1.021e-05 [recompute_prepare]: 6.81001e-06 [updatestate_depend_eliminate]: 3.95998e-06 [updatestate_assign_eliminate]: 2.91999e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 1.04e-06 [a_2]: 7.602e-05 [accelerated_algorithm]: 6.43e-06 [shard]: 1.23002e-06 [meta_shard_fg_expand]: 1.60999e-06 [shard_inline]: 2.199e-05 [merge_send_recv]: 5.89999e-06 [auto_parallel]: 5.91e-06 [parallel]: 4.25999e-06 [flash_sp]: 2.14e-06 [merge_comm]: 3.55e-06 [allreduce_fusion]: 3.43e-06 [matmul_add_comm_reduction]: 6.43998e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 7.46999e-06 [virtual_dataset]: 6.46e-06 [get_grad_eliminate_]: 6.21e-06 [virtual_output]: 6.17999e-06 [merge_forward]: 3.25e-06 [cell_reuse_recompute_pass]: 1.49998e-06 [offload_activation]: 7.11001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.26e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 1.007e-05 [set_forward_comm_id_for_comm_node_pass]: 4.12e-06 [meta_fg_expand]: 2.36e-06 [flash_sp_send_recv_attached]: 8.30012e-07 [receive_attached]: 9.70002e-07 [after_resolve]: 8.69e-06 [a_after_grad]: 9.08002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.17e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 7.87998e-06 [cse]: 1.809e-05 [a_3]: 3.853e-05 [py_interpret_to_execute_after_opt_a]: 4.92e-06 [slice_cell_reuse_recomputed_activation]: 9.70002e-07 [rewriter_after_opt_a]: 2.284e-05 [convert_after_rewriter]: 1.00001e-06 [order_py_execute_after_rewriter]: 9.50007e-07 [mutable_eliminate]: 0.00050643 [opt_b]: 0.00020819, [1] [Cycle 1]: 0.00020275, [7] [b_1]: 0.0001281 [b_2]: 8.13001e-06 [updatestate_depend_eliminate]: 5.77999e-06 [updatestate_assign_eliminate]: 2.84999e-06 [updatestate_loads_eliminate]: 2.78e-06 [renormalize]: 4.19997e-07 [cse]: 2.264e-05 [optimize_parallel_all_gather_comm]: 1.957e-05 [overlap_param_gather]: 4.97999e-06 [cconv]: 1.508e-05 [loop_unroll]: 0.00044825 [opt_after_cconv]: 0.00010437, [1] [Cycle 1]: 9.915e-05, [7] [c_1]: 3.03e-05 [parameter_eliminate]: 2.47001e-06 [updatestate_depend_eliminate]: 5.32001e-06 [updatestate_assign_eliminate]: 3.01999e-06 [updatestate_loads_eliminate]: 3.25998e-06 [cse]: 2.242e-05 [renormalize]: 3.69997e-07 [remove_dup_value]: 9.84001e-06 [tuple_transform]: 7.056e-05, [1] [Cycle 1]: 6.623e-05, [4] [d_1]: 4.159e-05 [none_parameter_eliminate]: 9.30013e-07 [renormalize]: 1.30007e-07 [switch_simplify]: 6.98998e-06 [partial_unused_args_eliminate]: 1.29e-06 [add_recomputation]: 5.715e-05 [cse_after_recomputation]: 2.471e-05, [1] [Cycle 1]: 2.026e-05, [1] [cse]: 1.51e-05 [environ_conv]: 1.364e-05 [swap_dp_allreduce_reducescatter]: 9.101e-05 [bias_add_comm_swap]: 4.95999e-06 [label_micro_interleaved_index]: 6.59001e-06 [label_fine_grained_interleaved_index]: 1.42e-06 [merge_cast_opt]: 6.89994e-07 [slice_recompute_activation]: 8.50006e-07 [micro_interleaved_order_control]: 1.29e-06 [assign_add_opt]: 6.19999e-07 [ForceFp32Comm]: 4.19997e-07 [remove_cast_before_assign_add]: 4.3e-06 [full_micro_interleaved_order_control]: 4.75001e-06 [reorder_send_recv_between_fp_bp]: 1.25001e-06 [comm_op_add_attrs]: 6.19999e-07 [add_comm_op_reuse_tag]: 4.19997e-07 [interleave_split_concat_branches]: 7.50006e-07 [interleave_parallel_branches]: 4.07e-06 [overlap_opt_shard_in_pipeline]: 1.431e-05 [overlap_opt_shard_grad_in_pipeline]: 8.49977e-07 [control_data_broadcast_order]: 1.265e-05 [grouped_pairwise_exchange_alltoall]: 9.89996e-07 [offloading_packed_experts]: 3.18e-06 [overlap_recompute_and_grad_model_parallel]: 7.26001e-06 [overlap_grad_matmul_and_grad_allreduce]: 7.89994e-07 [overlap_recompute_allgather_and_fa_grad]: 7.80012e-07 [overlap_recompute_comm]: 1.40001e-06 [overlap_grad_ring_attention]: 9.97001e-06 [overlap_grad_flash_sp]: 2.525e-05 [begin_end_overlap_inline]: 3.50003e-07 [split_matmul_comm_elemetwise]: 4.45e-06 [split_layernorm_comm]: 7.40023e-07 [handle_group_info]: 5.10016e-07 [symbol_engine_optimizer]: 8.319e-05, [1] [Cycle 1]: 7.83e-05, [6] [build]: 2.69001e-06 [elim_shapecalc]: 1.296e-05 [elim_not_effective]: 1.474e-05 [opt_reshape]: 7.56001e-06 [fold_const_symbol]: 1.098e-05 [renormalize]: 1.79978e-07 [detach_backward]: 1.06002e-06 [pipeline_parallel_scheduler]: 9.60019e-07 [auto_monad_reorder]: 2.672e-05 [get_jit_bprop_graph]: 9.5999e-07 [rewriter_after_jit_bprop_graph]: 3.18998e-06 [opt_after_jit_grad]: 0.00049272 [validate]: 4.722e-05 [backend_pass]: 1.24998e-06 [task_emit]: 2.25998 [execute]: 1.044e-05 Sums bootstrap : 0.000916s : 0.04% type_inference : 0.140351s : 5.83% event_method : 0.000319s : 0.01% auto_monad : 0.000157s : 0.01% graph_reusing : 0.000008s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000040s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000015s : 0.00% parallel-infer-symbol : 0.000002s : 0.00% pre_auto_parallel : 0.000057s : 0.00% insert-virtual-dataset : 0.000001s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000001s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000279s : 0.01% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000171s : 0.01% optimize.opt_a.loop_unroll : 0.000065s : 0.00% optimize.opt_a.a_1 : 0.001380s : 0.06% optimize.opt_a.with_stream_mark : 0.000022s : 0.00% optimize.opt_a.recompute_prepare : 0.000016s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000013s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000163s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.00% optimize.opt_a.shard : 0.000002s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000029s : 0.00% optimize.opt_a.merge_send_recv : 0.000028s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000045s : 0.00% optimize.opt_a.flash_sp : 0.000017s : 0.00% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000010s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.00% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000006s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000031s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000002s : 0.00% optimize.opt_a.receive_attached : 0.000009s : 0.00% optimize.opt_a.after_resolve : 0.000018s : 0.00% optimize.opt_a.a_after_grad : 0.000019s : 0.00% optimize.opt_a.renormalize : 0.002246s : 0.09% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000028s : 0.00% optimize.opt_a.cse : 0.000051s : 0.00% optimize.opt_a.a_3 : 0.000089s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000001s : 0.00% optimize.rewriter_after_opt_a : 0.000023s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000506s : 0.02% optimize.opt_b.b_1 : 0.000128s : 0.01% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000023s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000020s : 0.00% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000015s : 0.00% optimize.loop_unroll : 0.000448s : 0.02% optimize.opt_after_cconv.c_1 : 0.000030s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000010s : 0.00% optimize.tuple_transform.d_1 : 0.000042s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000057s : 0.00% optimize.cse_after_recomputation.cse : 0.000015s : 0.00% optimize.environ_conv : 0.000014s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000091s : 0.00% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000001s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000001s : 0.00% optimize.micro_interleaved_order_control : 0.000001s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000000s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000001s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000000s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000014s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000001s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000001s : 0.00% optimize.overlap_grad_ring_attention : 0.000010s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000027s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000493s : 0.02% validate : 0.000047s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.259975s : 93.81% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000357 50 9.72% : 0.000035s : 4: substitution.cast_eliminate 0.48% : 0.000002s : 3: substitution.elim_not_effective 0.38% : 0.000001s : 3: substitution.fold_const_symbol 1.28% : 0.000005s : 4: substitution.graph_param_transform 76.65% : 0.000274s : 16: substitution.inline 0.98% : 0.000003s : 6: substitution.j_node_and_user_rematch 2.54% : 0.000009s : 6: substitution.remove_not_recompute_node 0.79% : 0.000003s : 2: substitution.replace_old_param 4.04% : 0.000014s : 4: substitution.switch_simplify 3.14% : 0.000011s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.140261 2 97.25% : 0.136405s : 1: type_inference.infer 2.75% : 0.003856s : 1: type_inference.specialize ------[replace.] 0.000153 22 59.88% : 0.000091s : 16: replace.inline 29.14% : 0.000044s : 4: replace.switch_simplify 10.98% : 0.000017s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000289 22 92.43% : 0.000267s : 16: match.inline 4.06% : 0.000012s : 4: match.switch_simplify 3.51% : 0.000010s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000303 1966 1.10% : 0.000003s : 25: predicate.accumulaten_eliminater 0.63% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.37% : 0.000001s : 8: predicate.addn_check_dump 1.15% : 0.000003s : 25: predicate.addn_zero_filter 1.08% : 0.000003s : 25: predicate.adjust_all_reduce_mul_add 2.21% : 0.000007s : 33: predicate.arithmetic_simplify 1.44% : 0.000004s : 25: predicate.cast_eliminate 0.41% : 0.000001s : 8: predicate.check_bprop_eliminate 0.36% : 0.000001s : 8: predicate.compare_switch_simplify 0.12% : 0.000000s : 4: predicate.const_output_eliminate 0.40% : 0.000001s : 8: predicate.depend_value_elim 1.19% : 0.000004s : 25: predicate.dict_get_item_const_eliminator 1.32% : 0.000004s : 25: predicate.dict_get_item_eliminator 1.24% : 0.000004s : 25: predicate.dict_set_item_eliminator 0.59% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.17% : 0.000001s : 4: predicate.elim_not_effective 0.23% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.40% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.26% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.24% : 0.000004s : 29: predicate.environ_get_depend_swap 1.62% : 0.000005s : 37: predicate.environ_get_eliminate 1.25% : 0.000004s : 29: predicate.environ_get_set_eliminate 2.01% : 0.000006s : 43: predicate.exchange_switch_depend_value 2.62% : 0.000008s : 43: predicate.float_depend_g_call 0.36% : 0.000001s : 8: predicate.float_environ_get_switch 0.51% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 4: predicate.fold_const_symbol 0.45% : 0.000001s : 8: predicate.get_grad_eliminate 0.12% : 0.000000s : 4: predicate.graph_param_transform 0.40% : 0.000001s : 8: predicate.incorporate_call 0.32% : 0.000001s : 8: predicate.incorporate_call_switch 5.98% : 0.000018s : 92: predicate.inline 0.48% : 0.000001s : 8: predicate.inline_without_move 0.18% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.54% : 0.000002s : 8: predicate.less_batch_normalization 1.73% : 0.000005s : 35: predicate.list_to_tuple_eliminator_ 2.70% : 0.000008s : 60: predicate.load_eliminater 0.67% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.32% : 0.000010s : 72: predicate.loop_unroll_before_grad 1.66% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.43% : 0.000001s : 8: predicate.merge_addn 0.38% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.36% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.08% : 0.000003s : 25: predicate.minmaximum_grad 0.67% : 0.000002s : 4: predicate.mutable_eliminate 0.23% : 0.000001s : 4: predicate.opt_reshape 0.23% : 0.000001s : 4: predicate.parallel_virtual_node 2.59% : 0.000008s : 43: predicate.partial_defer_inline 1.51% : 0.000005s : 31: predicate.partial_eliminate 1.19% : 0.000004s : 25: predicate.print_const_string_wrapper 0.47% : 0.000001s : 8: predicate.reduce_all_const_elim 1.51% : 0.000005s : 25: predicate.reduce_eliminate 2.67% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.26% : 0.000001s : 8: predicate.remove_not_recompute_node 1.20% : 0.000004s : 35: predicate.replace_applicator 0.28% : 0.000001s : 8: predicate.replace_old_param 0.15% : 0.000000s : 4: predicate.reset_defer_inline 1.21% : 0.000004s : 25: predicate.reshape_eliminate 0.45% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.27% : 0.000001s : 4: predicate.row_tensor_eliminate 0.47% : 0.000001s : 8: predicate.same_eliminate 0.27% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.53% : 0.000002s : 8: predicate.shard_identity_eliminate 0.47% : 0.000001s : 8: predicate.special_op_eliminate 0.50% : 0.000002s : 8: predicate.specialize_transform 0.55% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.51% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.20% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.19% : 0.000007s : 43: predicate.switch_defer_inline 2.54% : 0.000008s : 51: predicate.switch_layer_defer_inline 7.03% : 0.000021s : 135: predicate.switch_simplify 1.23% : 0.000004s : 25: predicate.tile_eliminate 1.14% : 0.000003s : 25: predicate.transpose_eliminate 1.75% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.65% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.51% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 2.73% : 0.000008s : 43: predicate.tuple_list_get_item_eliminator 1.59% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.12% : 0.000006s : 41: predicate.tuple_list_set_item_eliminator 1.67% : 0.000005s : 35: predicate.tuple_to_list_eliminator_ 2.57% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.04% : 0.000009s : 68: predicate.updatestate_useless_node_eliminater 0.21% : 0.000001s : 4: predicate.value_based_eliminate 0.48% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.50% : 0.000002s : 8: predicate.virtual_output_eliminate 0.18% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.26% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002016 30 43.21% : 0.000871s : 12: func_graph_cloner_run.FuncGraphClonerGraph 56.79% : 0.001145s : 18: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.436441 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.29% : 0.007082s : 1: add_attr 0.29% : 0.007070s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000061s : 1: add_recomputation 0.00% : 0.000003s : 1: assign_add_opt 0.01% : 0.000170s : 1: auto_monad 0.00% : 0.000031s : 1: auto_monad_reorder 0.00% : 0.000009s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.04% : 0.000973s : 1: bootstrap 0.00% : 0.000018s : 1: cconv 0.00% : 0.000003s : 1: comm_op_add_attrs 0.00% : 0.000016s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000004s : 1: dataset_repeat_opt 0.00% : 0.000004s : 1: detach_backward 0.00% : 0.000017s : 1: environ_conv 0.01% : 0.000331s : 1: event_method 0.00% : 0.000039s : 1: execute 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000004s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000004s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.02% : 0.000457s : 1: loop_unroll 0.00% : 0.000003s : 1: merge_cast_opt 0.00% : 0.000004s : 1: micro_interleaved_order_control 0.02% : 0.000515s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000016s : 1: opt.transform.mutable_eliminate 0.08% : 0.001988s : 78: opt.transform.opt_a 0.00% : 0.000029s : 1: opt.transform.opt_after_cconv 0.00% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000109s : 28: opt.transform.opt_b 0.00% : 0.000047s : 2: opt.transform.opt_trans_graph 0.00% : 0.000042s : 4: opt.transform.symbol_engine_opt 0.21% : 0.005117s : 1: opt_a 0.00% : 0.000108s : 1: opt_after_cconv 0.02% : 0.000502s : 1: opt_after_jit_grad 0.01% : 0.000211s : 1: opt_b 0.30% : 0.007429s : 1: optimize 0.00% : 0.000023s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000013s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000018s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000003s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000004s : 1: overlap_recompute_comm 0.00% : 0.000005s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000062s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.00% : 0.000014s : 1: remove_dup_value 0.05% : 0.001244s : 1: renormalize.infer 0.04% : 0.000994s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000026s : 1: rewriter_after_opt_a 0.01% : 0.000285s : 1: rewriter_before_opt_a 0.00% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000003s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.00% : 0.000095s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000086s : 1: symbol_engine_optimizer 92.76% : 2.260125s : 1: task_emit 0.00% : 0.000073s : 1: tuple_transform 5.76% : 0.140370s : 1: type_inference 0.01% : 0.000304s : 1: validate TotalTime = 2.40255, [24] [bootstrap]: 0.00105109 [type_inference]: 0.156495 [event_method]: 0.00035965 [auto_monad]: 0.00030129 [graph_reusing]: 1.049e-05 [inline]: 2.46998e-06 [add_attr]: 0.00750998, [1] [add_attr_with_inline]: 0.00749562, [1] [Cycle 1]: 0.00017268, [2] [tag_attr]: 5.837e-05 [meta_addattr_fg_expand]: 2.6e-05 [parallel-infer-symbol]: 2.76e-06 [pre_auto_parallel]: 8.61e-05 [insert-virtual-dataset]: 2.69999e-06 [parallel-infer-symbol-second]: 7.80012e-07 [dataset_repeat_opt]: 1.71e-06 [pipeline_split]: 1.54e-06 [optimize]: 0.00793508, [53] [py_interpret_to_execute]: 4.37e-06 [rewriter_before_opt_a]: 0.00037079 [opt_a]: 0.00549612, [2] [Cycle 1]: 0.00488544, [45] [expand_dump_flag]: 5.34e-06 [switch_simplify]: 0.00022383 [loop_unroll]: 6.727e-05 [a_1]: 0.00136267 [with_stream_mark]: 8.34002e-06 [recompute_prepare]: 7.69002e-06 [updatestate_depend_eliminate]: 1.761e-05 [updatestate_assign_eliminate]: 1.469e-05 [updatestate_loads_eliminate]: 2.12999e-06 [parameter_eliminate]: 8.29983e-07 [a_2]: 7.993e-05 [accelerated_algorithm]: 6.80998e-06 [shard]: 9.89996e-07 [meta_shard_fg_expand]: 2.31e-06 [shard_inline]: 6.30002e-06 [merge_send_recv]: 5.866e-05 [auto_parallel]: 5.54e-06 [parallel]: 0.00010523 [flash_sp]: 4.362e-05 [merge_comm]: 3.38e-06 [allreduce_fusion]: 1.556e-05 [matmul_add_comm_reduction]: 1.712e-05 [allreduce_slice_to_reducescatter]: 1.33e-05 [virtual_shard_identity]: 8.98002e-06 [virtual_dataset]: 6.71e-06 [get_grad_eliminate_]: 6.02001e-06 [virtual_output]: 6.19001e-06 [merge_forward]: 2.86e-06 [cell_reuse_recompute_pass]: 9.80013e-07 [offload_activation]: 1.897e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.422e-05 [merge_recompute_call_nodes]: 6.69999e-07 [before_grad]: 9.32001e-06 [set_forward_comm_id_for_comm_node_pass]: 1.693e-05 [meta_fg_expand]: 3.71001e-06 [flash_sp_send_recv_attached]: 1.57001e-06 [receive_attached]: 2.627e-05 [after_resolve]: 1.153e-05 [a_after_grad]: 9.36e-06 [renormalize]: 0.00230966 [add_forward_monad_depend]: 4.36002e-06 [auto_monad_grad]: 1.22e-06 [auto_monad_eliminator]: 2.688e-05 [cse]: 3.76e-05 [a_3]: 4.345e-05 [Cycle 2]: 0.00060049, [45] [expand_dump_flag]: 1.14e-06 [switch_simplify]: 7.57998e-06 [loop_unroll]: 6.01e-06 [a_1]: 0.00013144 [with_stream_mark]: 9.74e-06 [recompute_prepare]: 6.16e-06 [updatestate_depend_eliminate]: 2.66e-06 [updatestate_assign_eliminate]: 2.24999e-06 [updatestate_loads_eliminate]: 2.83998e-06 [parameter_eliminate]: 9.29984e-07 [a_2]: 7.179e-05 [accelerated_algorithm]: 5.99e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 1.16002e-06 [shard_inline]: 6.08002e-06 [merge_send_recv]: 4.59002e-06 [auto_parallel]: 5.06002e-06 [parallel]: 4.15e-06 [flash_sp]: 2.12001e-06 [merge_comm]: 2.79999e-06 [allreduce_fusion]: 2.56998e-06 [matmul_add_comm_reduction]: 5.49e-06 [allreduce_slice_to_reducescatter]: 4.19997e-07 [virtual_shard_identity]: 6.78e-06 [virtual_dataset]: 5.54e-06 [get_grad_eliminate_]: 5.57001e-06 [virtual_output]: 5.29e-06 [merge_forward]: 2.52001e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 5.87999e-06 [cell_reuse_handle_not_recompute_node_pass]: 9.47001e-06 [merge_recompute_call_nodes]: 6.80011e-07 [before_grad]: 7.91001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.41999e-06 [meta_fg_expand]: 1.81e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 1.07e-06 [after_resolve]: 9.57001e-06 [a_after_grad]: 8.52e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.19e-06 [auto_monad_grad]: 8.70001e-07 [auto_monad_eliminator]: 5.79e-06 [cse]: 1.068e-05 [a_3]: 3.363e-05 [py_interpret_to_execute_after_opt_a]: 3.85998e-06 [slice_cell_reuse_recomputed_activation]: 9.89996e-07 [rewriter_after_opt_a]: 2.616e-05 [convert_after_rewriter]: 1.05001e-06 [order_py_execute_after_rewriter]: 9.70002e-07 [mutable_eliminate]: 0.00050997 [opt_b]: 0.00018326, [1] [Cycle 1]: 0.00017763, [7] [b_1]: 0.00011398 [b_2]: 7.18e-06 [updatestate_depend_eliminate]: 4.32e-06 [updatestate_assign_eliminate]: 2.34001e-06 [updatestate_loads_eliminate]: 2.31e-06 [renormalize]: 4.19997e-07 [cse]: 1.432e-05 [optimize_parallel_all_gather_comm]: 2.204e-05 [overlap_param_gather]: 1.175e-05 [cconv]: 1.361e-05 [loop_unroll]: 0.0004461 [opt_after_cconv]: 9.162e-05, [1] [Cycle 1]: 8.635e-05, [7] [c_1]: 2.83e-05 [parameter_eliminate]: 2.43002e-06 [updatestate_depend_eliminate]: 4.50001e-06 [updatestate_assign_eliminate]: 2.29001e-06 [updatestate_loads_eliminate]: 2.13002e-06 [cse]: 1.451e-05 [renormalize]: 4.2998e-07 [remove_dup_value]: 6.50002e-06 [tuple_transform]: 6.805e-05, [1] [Cycle 1]: 6.433e-05, [4] [d_1]: 3.933e-05 [none_parameter_eliminate]: 1.00001e-06 [renormalize]: 1.50001e-07 [switch_simplify]: 6.46999e-06 [partial_unused_args_eliminate]: 1.02e-06 [add_recomputation]: 4.352e-05 [cse_after_recomputation]: 1.985e-05, [1] [Cycle 1]: 1.595e-05, [1] [cse]: 1.081e-05 [environ_conv]: 1.466e-05 [swap_dp_allreduce_reducescatter]: 2.626e-05 [bias_add_comm_swap]: 1.031e-05 [label_micro_interleaved_index]: 1.307e-05 [label_fine_grained_interleaved_index]: 1.47999e-06 [merge_cast_opt]: 5.89993e-07 [slice_recompute_activation]: 8.30012e-07 [micro_interleaved_order_control]: 1.29e-06 [assign_add_opt]: 6.80011e-07 [ForceFp32Comm]: 3.9002e-07 [remove_cast_before_assign_add]: 1.027e-05 [full_micro_interleaved_order_control]: 1.06e-05 [reorder_send_recv_between_fp_bp]: 1.20999e-06 [comm_op_add_attrs]: 7.2e-07 [add_comm_op_reuse_tag]: 5.19998e-07 [interleave_split_concat_branches]: 1.07998e-06 [interleave_parallel_branches]: 1.037e-05 [overlap_opt_shard_in_pipeline]: 1.576e-05 [overlap_opt_shard_grad_in_pipeline]: 1.07e-06 [control_data_broadcast_order]: 1.121e-05 [grouped_pairwise_exchange_alltoall]: 6.59988e-07 [offloading_packed_experts]: 2.91999e-06 [overlap_recompute_and_grad_model_parallel]: 1.262e-05 [overlap_grad_matmul_and_grad_allreduce]: 8.39995e-07 [overlap_recompute_allgather_and_fa_grad]: 9.30013e-07 [overlap_recompute_comm]: 1.14003e-06 [overlap_grad_ring_attention]: 2.078e-05 [overlap_grad_flash_sp]: 4.28e-05 [begin_end_overlap_inline]: 7.09988e-07 [split_matmul_comm_elemetwise]: 1.082e-05 [split_layernorm_comm]: 7.60017e-07 [handle_group_info]: 4.20026e-07 [symbol_engine_optimizer]: 7.574e-05, [1] [Cycle 1]: 7.079e-05, [6] [build]: 2.25002e-06 [elim_shapecalc]: 1.12e-05 [elim_not_effective]: 1.216e-05 [opt_reshape]: 6.79999e-06 [fold_const_symbol]: 9.24e-06 [renormalize]: 2.00002e-07 [detach_backward]: 1.10999e-06 [pipeline_parallel_scheduler]: 1.00001e-06 [auto_monad_reorder]: 1.552e-05 [get_jit_bprop_graph]: 1.07e-06 [rewriter_after_jit_bprop_graph]: 2.86e-06 [opt_after_jit_grad]: 0.00053725 [validate]: 4.549e-05 [backend_pass]: 1.09e-06 [task_emit]: 2.22761 [execute]: 1.021e-05 Sums bootstrap : 0.001051s : 0.04% type_inference : 0.156495s : 6.54% event_method : 0.000360s : 0.02% auto_monad : 0.000301s : 0.01% graph_reusing : 0.000010s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000058s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000026s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000086s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000371s : 0.02% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000231s : 0.01% optimize.opt_a.loop_unroll : 0.000073s : 0.00% optimize.opt_a.a_1 : 0.001494s : 0.06% optimize.opt_a.with_stream_mark : 0.000018s : 0.00% optimize.opt_a.recompute_prepare : 0.000014s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000020s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000017s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000152s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.00% optimize.opt_a.shard : 0.000002s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000012s : 0.00% optimize.opt_a.merge_send_recv : 0.000063s : 0.00% optimize.opt_a.auto_parallel : 0.000011s : 0.00% optimize.opt_a.parallel : 0.000109s : 0.00% optimize.opt_a.flash_sp : 0.000046s : 0.00% optimize.opt_a.merge_comm : 0.000006s : 0.00% optimize.opt_a.allreduce_fusion : 0.000018s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000023s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000014s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.00% optimize.opt_a.virtual_dataset : 0.000012s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.00% optimize.opt_a.virtual_output : 0.000011s : 0.00% optimize.opt_a.merge_forward : 0.000005s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000025s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000034s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000001s : 0.00% optimize.opt_a.before_grad : 0.000017s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000020s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000002s : 0.00% optimize.opt_a.receive_attached : 0.000027s : 0.00% optimize.opt_a.after_resolve : 0.000021s : 0.00% optimize.opt_a.a_after_grad : 0.000018s : 0.00% optimize.opt_a.renormalize : 0.002310s : 0.10% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000033s : 0.00% optimize.opt_a.cse : 0.000048s : 0.00% optimize.opt_a.a_3 : 0.000077s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000001s : 0.00% optimize.rewriter_after_opt_a : 0.000026s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000510s : 0.02% optimize.opt_b.b_1 : 0.000114s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000014s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000014s : 0.00% optimize.loop_unroll : 0.000446s : 0.02% optimize.opt_after_cconv.c_1 : 0.000028s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000015s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000007s : 0.00% optimize.tuple_transform.d_1 : 0.000039s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000044s : 0.00% optimize.cse_after_recomputation.cse : 0.000011s : 0.00% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000026s : 0.00% optimize.bias_add_comm_swap : 0.000010s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000001s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000001s : 0.00% optimize.micro_interleaved_order_control : 0.000001s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000000s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000001s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000010s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000016s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000001s : 0.00% optimize.control_data_broadcast_order : 0.000011s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000013s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000001s : 0.00% optimize.overlap_grad_ring_attention : 0.000021s : 0.00% optimize.overlap_grad_flash_sp : 0.000043s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000000s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000016s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000537s : 0.02% validate : 0.000045s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.227613s : 93.06% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000396 55 6.65% : 0.000026s : 8: substitution.arithmetic_simplify 0.35% : 0.000001s : 2: substitution.elim_not_effective 0.26% : 0.000001s : 2: substitution.fold_const_symbol 0.91% : 0.000004s : 4: substitution.graph_param_transform 76.14% : 0.000301s : 18: substitution.inline 0.69% : 0.000003s : 4: substitution.j_node_and_user_rematch 4.30% : 0.000017s : 4: substitution.remove_not_recompute_node 1.05% : 0.000004s : 4: substitution.replace_old_param 6.71% : 0.000027s : 5: substitution.switch_simplify 2.93% : 0.000012s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.156373 2 97.53% : 0.152504s : 1: type_inference.infer 2.47% : 0.003870s : 1: type_inference.specialize ------[replace.] 0.000176 27 57.24% : 0.000101s : 18: replace.inline 29.60% : 0.000052s : 5: replace.switch_simplify 13.15% : 0.000023s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000326 27 89.82% : 0.000293s : 18: match.inline 7.20% : 0.000023s : 5: match.switch_simplify 2.98% : 0.000010s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000318 2134 1.13% : 0.000004s : 27: predicate.accumulaten_eliminater 0.49% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.32% : 0.000001s : 8: predicate.addn_check_dump 1.23% : 0.000004s : 27: predicate.addn_zero_filter 1.07% : 0.000003s : 27: predicate.adjust_all_reduce_mul_add 3.79% : 0.000012s : 35: predicate.arithmetic_simplify 1.12% : 0.000004s : 27: predicate.cast_eliminate 0.37% : 0.000001s : 8: predicate.check_bprop_eliminate 0.30% : 0.000001s : 8: predicate.compare_switch_simplify 0.11% : 0.000000s : 4: predicate.const_output_eliminate 0.34% : 0.000001s : 8: predicate.depend_value_elim 1.19% : 0.000004s : 27: predicate.dict_get_item_const_eliminator 1.26% : 0.000004s : 27: predicate.dict_get_item_eliminator 1.12% : 0.000004s : 27: predicate.dict_set_item_eliminator 0.54% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.15% : 0.000000s : 4: predicate.elim_not_effective 0.24% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.29% : 0.000004s : 31: predicate.environ_add_const_eliminate 1.25% : 0.000004s : 31: predicate.environ_get_add_eliminate 1.23% : 0.000004s : 31: predicate.environ_get_depend_swap 1.60% : 0.000005s : 39: predicate.environ_get_eliminate 1.29% : 0.000004s : 31: predicate.environ_get_set_eliminate 2.11% : 0.000007s : 49: predicate.exchange_switch_depend_value 2.70% : 0.000009s : 49: predicate.float_depend_g_call 0.33% : 0.000001s : 8: predicate.float_environ_get_switch 0.48% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 4: predicate.fold_const_symbol 0.42% : 0.000001s : 8: predicate.get_grad_eliminate 0.12% : 0.000000s : 4: predicate.graph_param_transform 0.34% : 0.000001s : 8: predicate.incorporate_call 0.29% : 0.000001s : 8: predicate.incorporate_call_switch 5.76% : 0.000018s : 100: predicate.inline 0.44% : 0.000001s : 8: predicate.inline_without_move 0.19% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.45% : 0.000001s : 8: predicate.less_batch_normalization 1.74% : 0.000006s : 39: predicate.list_to_tuple_eliminator_ 2.70% : 0.000009s : 66: predicate.load_eliminater 0.52% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.69% : 0.000012s : 86: predicate.loop_unroll_before_grad 1.67% : 0.000005s : 35: predicate.make_slice_get_slice_eliminator 0.36% : 0.000001s : 8: predicate.merge_addn 0.32% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.36% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.12% : 0.000004s : 27: predicate.minmaximum_grad 0.55% : 0.000002s : 4: predicate.mutable_eliminate 0.20% : 0.000001s : 4: predicate.opt_reshape 0.22% : 0.000001s : 4: predicate.parallel_virtual_node 2.81% : 0.000009s : 49: predicate.partial_defer_inline 1.56% : 0.000005s : 35: predicate.partial_eliminate 1.22% : 0.000004s : 27: predicate.print_const_string_wrapper 0.35% : 0.000001s : 8: predicate.reduce_all_const_elim 1.45% : 0.000005s : 27: predicate.reduce_eliminate 2.70% : 0.000009s : 66: predicate.redundant_stop_gradient_eliminater 0.23% : 0.000001s : 8: predicate.remove_not_recompute_node 1.24% : 0.000004s : 39: predicate.replace_applicator 0.26% : 0.000001s : 8: predicate.replace_old_param 0.15% : 0.000000s : 4: predicate.reset_defer_inline 1.16% : 0.000004s : 27: predicate.reshape_eliminate 0.39% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.23% : 0.000001s : 4: predicate.row_tensor_eliminate 0.41% : 0.000001s : 8: predicate.same_eliminate 0.26% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.46% : 0.000001s : 8: predicate.shard_identity_eliminate 0.43% : 0.000001s : 8: predicate.special_op_eliminate 0.43% : 0.000001s : 8: predicate.specialize_transform 0.47% : 0.000001s : 8: predicate.split_environ_get_set_with_tuple_value 0.44% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.19% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.33% : 0.000007s : 49: predicate.switch_defer_inline 2.58% : 0.000008s : 57: predicate.switch_layer_defer_inline 7.59% : 0.000024s : 157: predicate.switch_simplify 1.12% : 0.000004s : 27: predicate.tile_eliminate 1.15% : 0.000004s : 27: predicate.transpose_eliminate 1.64% : 0.000005s : 35: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000005s : 35: predicate.tuple_list_get_item_const_eliminator 1.63% : 0.000005s : 35: predicate.tuple_list_get_item_depend_reorder 2.45% : 0.000008s : 47: predicate.tuple_list_get_item_eliminator 1.45% : 0.000005s : 35: predicate.tuple_list_get_set_item_eliminator 2.10% : 0.000007s : 43: predicate.tuple_list_set_item_eliminator 1.63% : 0.000005s : 39: predicate.tuple_to_list_eliminator_ 2.70% : 0.000009s : 66: predicate.updatestate_pure_node_eliminater 2.97% : 0.000009s : 74: predicate.updatestate_useless_node_eliminater 0.21% : 0.000001s : 4: predicate.value_based_eliminate 0.51% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.40% : 0.000001s : 8: predicate.virtual_output_eliminate 0.16% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.22% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002270 34 50.20% : 0.001139s : 14: func_graph_cloner_run.FuncGraphClonerGraph 49.80% : 0.001130s : 20: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.422539 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.31% : 0.007514s : 1: add_attr 0.31% : 0.007499s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000048s : 1: add_recomputation 0.00% : 0.000003s : 1: assign_add_opt 0.01% : 0.000311s : 1: auto_monad 0.00% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000009s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000013s : 1: bias_add_comm_swap 0.05% : 0.001111s : 1: bootstrap 0.00% : 0.000017s : 1: cconv 0.00% : 0.000005s : 1: comm_op_add_attrs 0.00% : 0.000014s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000023s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000004s : 1: detach_backward 0.00% : 0.000019s : 1: environ_conv 0.02% : 0.000372s : 1: event_method 0.00% : 0.000023s : 1: execute 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.02% : 0.000454s : 1: loop_unroll 0.00% : 0.000003s : 1: merge_cast_opt 0.00% : 0.000004s : 1: micro_interleaved_order_control 0.02% : 0.000519s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.09% : 0.002126s : 78: opt.transform.opt_a 0.00% : 0.000027s : 1: opt.transform.opt_after_cconv 0.00% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000095s : 28: opt.transform.opt_b 0.00% : 0.000044s : 2: opt.transform.opt_trans_graph 0.00% : 0.000036s : 4: opt.transform.symbol_engine_opt 0.23% : 0.005499s : 1: opt_a 0.00% : 0.000095s : 1: opt_after_cconv 0.02% : 0.000546s : 1: opt_after_jit_grad 0.01% : 0.000187s : 1: opt_b 0.33% : 0.007940s : 1: optimize 0.00% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000046s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000024s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000020s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000004s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000091s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.00% : 0.000010s : 1: remove_dup_value 0.05% : 0.001274s : 1: renormalize.infer 0.04% : 0.001029s : 1: renormalize.specialize 0.00% : 0.000086s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000030s : 1: rewriter_after_opt_a 0.02% : 0.000377s : 1: rewriter_before_opt_a 0.00% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000003s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000014s : 1: split_matmul_comm_elemetwise 0.00% : 0.000029s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000079s : 1: symbol_engine_optimizer 91.96% : 2.227658s : 1: task_emit 0.00% : 0.000071s : 1: tuple_transform 6.46% : 0.156517s : 1: type_inference 0.01% : 0.000301s : 1: validate TotalTime = 2.47295, [24] [bootstrap]: 0.00075532 [type_inference]: 0.178021 [event_method]: 6.257e-05 [auto_monad]: 0.00029558 [graph_reusing]: 1.689e-05 [inline]: 2.47001e-06 [add_attr]: 0.00729472, [1] [add_attr_with_inline]: 0.00728329, [1] [Cycle 1]: 0.00017736, [2] [tag_attr]: 7.258e-05 [meta_addattr_fg_expand]: 2.898e-05 [parallel-infer-symbol]: 3.11999e-06 [pre_auto_parallel]: 0.00010133 [insert-virtual-dataset]: 2.56998e-06 [parallel-infer-symbol-second]: 9.09989e-07 [dataset_repeat_opt]: 2.21998e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.0296569, [53] [py_interpret_to_execute]: 4.33999e-06 [rewriter_before_opt_a]: 0.00051081 [opt_a]: 0.0258106, [3] [Cycle 1]: 0.0166893, [45] [expand_dump_flag]: 6.80002e-06 [switch_simplify]: 0.00028849 [loop_unroll]: 0.00010829 [a_1]: 0.00242184 [with_stream_mark]: 2.576e-05 [recompute_prepare]: 2.417e-05 [updatestate_depend_eliminate]: 2.012e-05 [updatestate_assign_eliminate]: 1.812e-05 [updatestate_loads_eliminate]: 8.79e-06 [parameter_eliminate]: 2.74999e-06 [a_2]: 0.00028293 [accelerated_algorithm]: 5.065e-05 [shard]: 1.86e-06 [meta_shard_fg_expand]: 5.91998e-06 [shard_inline]: 1.858e-05 [merge_send_recv]: 5.505e-05 [auto_parallel]: 1.278e-05 [parallel]: 8.653e-05 [flash_sp]: 3.589e-05 [merge_comm]: 1.223e-05 [allreduce_fusion]: 1.955e-05 [matmul_add_comm_reduction]: 3.589e-05 [allreduce_slice_to_reducescatter]: 9.66e-06 [virtual_shard_identity]: 2.158e-05 [virtual_dataset]: 1.814e-05 [get_grad_eliminate_]: 1.803e-05 [virtual_output]: 1.739e-05 [merge_forward]: 1.058e-05 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 3.028e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.397e-05 [merge_recompute_call_nodes]: 1.93997e-06 [before_grad]: 4.164e-05 [set_forward_comm_id_for_comm_node_pass]: 1.925e-05 [meta_fg_expand]: 0.0028219 [flash_sp_send_recv_attached]: 6.16998e-06 [receive_attached]: 2.218e-05 [after_resolve]: 0.00010383 [a_after_grad]: 0.00014968 [renormalize]: 0.00823468 [add_forward_monad_depend]: 1.466e-05 [auto_monad_grad]: 9.40001e-06 [auto_monad_eliminator]: 0.00012209 [cse]: 0.00036996 [a_3]: 0.00076941 [Cycle 2]: 0.00728468, [45] [expand_dump_flag]: 2.51e-06 [switch_simplify]: 0.00010387 [loop_unroll]: 0.00010071 [a_1]: 0.00337338 [with_stream_mark]: 2.151e-05 [recompute_prepare]: 2.432e-05 [updatestate_depend_eliminate]: 1.352e-05 [updatestate_assign_eliminate]: 1.125e-05 [updatestate_loads_eliminate]: 1.143e-05 [parameter_eliminate]: 1.22e-06 [a_2]: 0.00032327 [accelerated_algorithm]: 2.497e-05 [shard]: 1.02998e-06 [meta_shard_fg_expand]: 5.82999e-06 [shard_inline]: 2.164e-05 [merge_send_recv]: 1.576e-05 [auto_parallel]: 1.588e-05 [parallel]: 3.93001e-06 [flash_sp]: 3.23e-06 [merge_comm]: 1.608e-05 [allreduce_fusion]: 1.527e-05 [matmul_add_comm_reduction]: 1.904e-05 [allreduce_slice_to_reducescatter]: 4.09986e-07 [virtual_shard_identity]: 2.36e-05 [virtual_dataset]: 2.147e-05 [get_grad_eliminate_]: 2.108e-05 [virtual_output]: 2.092e-05 [merge_forward]: 1.165e-05 [cell_reuse_recompute_pass]: 1.34003e-06 [offload_activation]: 2.151e-05 [cell_reuse_handle_not_recompute_node_pass]: 4.152e-05 [merge_recompute_call_nodes]: 7.50006e-07 [before_grad]: 3.862e-05 [set_forward_comm_id_for_comm_node_pass]: 1.372e-05 [meta_fg_expand]: 0.00044426 [flash_sp_send_recv_attached]: 1.74e-06 [receive_attached]: 1.53002e-06 [after_resolve]: 3.744e-05 [a_after_grad]: 3.709e-05 [renormalize]: 0.00180315 [add_forward_monad_depend]: 4.58999e-06 [auto_monad_grad]: 1.19e-06 [auto_monad_eliminator]: 3.275e-05 [cse]: 0.0001623 [a_3]: 0.00014712 [Cycle 3]: 0.00182263, [45] [expand_dump_flag]: 1.30001e-06 [switch_simplify]: 2.179e-05 [loop_unroll]: 2.005e-05 [a_1]: 0.00058538 [with_stream_mark]: 1.877e-05 [recompute_prepare]: 1.972e-05 [updatestate_depend_eliminate]: 1.183e-05 [updatestate_assign_eliminate]: 1.157e-05 [updatestate_loads_eliminate]: 1.096e-05 [parameter_eliminate]: 1.04e-06 [a_2]: 0.00029282 [accelerated_algorithm]: 2.283e-05 [shard]: 9.89996e-07 [meta_shard_fg_expand]: 4.27998e-06 [shard_inline]: 1.991e-05 [merge_send_recv]: 1.458e-05 [auto_parallel]: 1.44e-05 [parallel]: 4.20999e-06 [flash_sp]: 9.70002e-07 [merge_comm]: 1.248e-05 [allreduce_fusion]: 1.229e-05 [matmul_add_comm_reduction]: 1.625e-05 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 2.073e-05 [virtual_dataset]: 1.937e-05 [get_grad_eliminate_]: 1.928e-05 [virtual_output]: 1.895e-05 [merge_forward]: 1.05e-05 [cell_reuse_recompute_pass]: 1.55999e-06 [offload_activation]: 2.026e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.929e-05 [merge_recompute_call_nodes]: 7.60017e-07 [before_grad]: 3.521e-05 [set_forward_comm_id_for_comm_node_pass]: 1.273e-05 [meta_fg_expand]: 8.95001e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 1.07e-06 [after_resolve]: 2.278e-05 [a_after_grad]: 3.193e-05 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.47001e-06 [auto_monad_grad]: 1.26002e-06 [auto_monad_eliminator]: 2.444e-05 [cse]: 7.715e-05 [a_3]: 0.00013589 [py_interpret_to_execute_after_opt_a]: 4.63999e-06 [slice_cell_reuse_recomputed_activation]: 2.09e-06 [rewriter_after_opt_a]: 7.098e-05 [convert_after_rewriter]: 1.32999e-06 [order_py_execute_after_rewriter]: 1.36998e-06 [mutable_eliminate]: 0.00050752 [opt_b]: 0.00064218, [1] [Cycle 1]: 0.00063592, [7] [b_1]: 0.0004707 [b_2]: 2.207e-05 [updatestate_depend_eliminate]: 1.469e-05 [updatestate_assign_eliminate]: 1.079e-05 [updatestate_loads_eliminate]: 1.07e-05 [renormalize]: 4.69998e-07 [cse]: 7.016e-05 [optimize_parallel_all_gather_comm]: 4.962e-05 [overlap_param_gather]: 1.47e-05 [cconv]: 2.352e-05 [loop_unroll]: 0.00047001 [opt_after_cconv]: 0.00025499, [1] [Cycle 1]: 0.00024868, [7] [c_1]: 0.00010655 [parameter_eliminate]: 2.46e-06 [updatestate_depend_eliminate]: 1.451e-05 [updatestate_assign_eliminate]: 1.1e-05 [updatestate_loads_eliminate]: 1.075e-05 [cse]: 6.827e-05 [renormalize]: 4.39992e-07 [remove_dup_value]: 7.872e-05 [tuple_transform]: 0.00019754, [1] [Cycle 1]: 0.00019242, [4] [d_1]: 0.00015091 [none_parameter_eliminate]: 1.89999e-06 [renormalize]: 3.29979e-07 [switch_simplify]: 2.007e-05 [partial_unused_args_eliminate]: 2.21e-06 [add_recomputation]: 0.0001319 [cse_after_recomputation]: 6.578e-05, [1] [Cycle 1]: 6.024e-05, [1] [cse]: 5.382e-05 [environ_conv]: 1.857e-05 [swap_dp_allreduce_reducescatter]: 4.445e-05 [bias_add_comm_swap]: 1.457e-05 [label_micro_interleaved_index]: 1.645e-05 [label_fine_grained_interleaved_index]: 2.76e-06 [merge_cast_opt]: 1.19998e-06 [slice_recompute_activation]: 1.92999e-06 [micro_interleaved_order_control]: 2.37999e-06 [assign_add_opt]: 1.01002e-06 [ForceFp32Comm]: 7.59988e-07 [remove_cast_before_assign_add]: 1.381e-05 [full_micro_interleaved_order_control]: 1.418e-05 [reorder_send_recv_between_fp_bp]: 2.76e-06 [comm_op_add_attrs]: 1.07998e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.15999e-06 [interleave_parallel_branches]: 1.291e-05 [overlap_opt_shard_in_pipeline]: 1.718e-05 [overlap_opt_shard_grad_in_pipeline]: 1.86e-06 [control_data_broadcast_order]: 3.732e-05 [grouped_pairwise_exchange_alltoall]: 1.31002e-06 [offloading_packed_experts]: 9.84999e-06 [overlap_recompute_and_grad_model_parallel]: 2.273e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.34e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.35002e-06 [overlap_grad_ring_attention]: 3.323e-05 [overlap_grad_flash_sp]: 8.651e-05 [begin_end_overlap_inline]: 3.80009e-07 [split_matmul_comm_elemetwise]: 1.467e-05 [split_layernorm_comm]: 1.72001e-06 [handle_group_info]: 1.27e-06 [symbol_engine_optimizer]: 0.00019034, [1] [Cycle 1]: 0.00018526, [6] [build]: 3.276e-05 [elim_shapecalc]: 2.651e-05 [elim_not_effective]: 3.898e-05 [opt_reshape]: 2.078e-05 [fold_const_symbol]: 3.645e-05 [renormalize]: 2.89991e-07 [detach_backward]: 2.06998e-06 [pipeline_parallel_scheduler]: 1.44998e-06 [auto_monad_reorder]: 4.798e-05 [get_jit_bprop_graph]: 1.05001e-06 [rewriter_after_jit_bprop_graph]: 3.56001e-06 [opt_after_jit_grad]: 0.00055227 [validate]: 9.504e-05 [backend_pass]: 1.10001e-06 [task_emit]: 2.25559 [execute]: 1.157e-05 Sums bootstrap : 0.000755s : 0.03% type_inference : 0.178021s : 7.22% event_method : 0.000063s : 0.00% auto_monad : 0.000296s : 0.01% graph_reusing : 0.000017s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000073s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000029s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000101s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000511s : 0.02% optimize.opt_a.expand_dump_flag : 0.000011s : 0.00% optimize.opt_a.switch_simplify : 0.000414s : 0.02% optimize.opt_a.loop_unroll : 0.000229s : 0.01% optimize.opt_a.a_1 : 0.006381s : 0.26% optimize.opt_a.with_stream_mark : 0.000066s : 0.00% optimize.opt_a.recompute_prepare : 0.000068s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000045s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000041s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000031s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000899s : 0.04% optimize.opt_a.accelerated_algorithm : 0.000098s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000016s : 0.00% optimize.opt_a.shard_inline : 0.000060s : 0.00% optimize.opt_a.merge_send_recv : 0.000085s : 0.00% optimize.opt_a.auto_parallel : 0.000043s : 0.00% optimize.opt_a.parallel : 0.000095s : 0.00% optimize.opt_a.flash_sp : 0.000040s : 0.00% optimize.opt_a.merge_comm : 0.000041s : 0.00% optimize.opt_a.allreduce_fusion : 0.000047s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000071s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000010s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000066s : 0.00% optimize.opt_a.virtual_dataset : 0.000059s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000058s : 0.00% optimize.opt_a.virtual_output : 0.000057s : 0.00% optimize.opt_a.merge_forward : 0.000033s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000072s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000115s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000115s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000046s : 0.00% optimize.opt_a.meta_fg_expand : 0.003275s : 0.13% optimize.opt_a.flash_sp_send_recv_attached : 0.000009s : 0.00% optimize.opt_a.receive_attached : 0.000025s : 0.00% optimize.opt_a.after_resolve : 0.000164s : 0.01% optimize.opt_a.a_after_grad : 0.000219s : 0.01% optimize.opt_a.renormalize : 0.010038s : 0.41% optimize.opt_a.add_forward_monad_depend : 0.000021s : 0.00% optimize.opt_a.auto_monad_grad : 0.000012s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000179s : 0.01% optimize.opt_a.cse : 0.000609s : 0.02% optimize.opt_a.a_3 : 0.001052s : 0.04% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000071s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000508s : 0.02% optimize.opt_b.b_1 : 0.000471s : 0.02% optimize.opt_b.b_2 : 0.000022s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000011s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000070s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000050s : 0.00% optimize.overlap_param_gather : 0.000015s : 0.00% optimize.cconv : 0.000024s : 0.00% optimize.loop_unroll : 0.000470s : 0.02% optimize.opt_after_cconv.c_1 : 0.000107s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000011s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000011s : 0.00% optimize.opt_after_cconv.cse : 0.000068s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000079s : 0.00% optimize.tuple_transform.d_1 : 0.000151s : 0.01% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000020s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000132s : 0.01% optimize.cse_after_recomputation.cse : 0.000054s : 0.00% optimize.environ_conv : 0.000019s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000044s : 0.00% optimize.bias_add_comm_swap : 0.000015s : 0.00% optimize.label_micro_interleaved_index : 0.000016s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000014s : 0.00% optimize.full_micro_interleaved_order_control : 0.000014s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000013s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000017s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000037s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000010s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000023s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000033s : 0.00% optimize.overlap_grad_flash_sp : 0.000087s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000015s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000033s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000027s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000039s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000021s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000036s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000048s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000552s : 0.02% validate : 0.000095s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.255588s : 91.54% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 0.001577 428 5.61% : 0.000088s : 22: substitution.arithmetic_simplify 4.26% : 0.000067s : 17: substitution.cast_eliminate 0.37% : 0.000006s : 16: substitution.elim_not_effective 0.59% : 0.000009s : 14: substitution.float_depend_g_call 0.23% : 0.000004s : 3: substitution.float_tuple_getitem_switch 0.37% : 0.000006s : 16: substitution.fold_const_symbol 0.98% : 0.000015s : 18: substitution.graph_param_transform 0.18% : 0.000003s : 2: substitution.incorporate_call 0.15% : 0.000002s : 2: substitution.incorporate_call_switch 58.97% : 0.000930s : 41: substitution.inline 1.62% : 0.000026s : 4: substitution.inline_without_move 1.80% : 0.000028s : 45: substitution.j_node_and_user_rematch 2.02% : 0.000032s : 3: substitution.less_batch_normalization 1.24% : 0.000020s : 16: substitution.minmaximum_grad 1.81% : 0.000029s : 14: substitution.partial_eliminate 1.75% : 0.000028s : 45: substitution.remove_not_recompute_node 2.40% : 0.000038s : 19: substitution.replace_applicator 0.77% : 0.000012s : 18: substitution.replace_old_param 0.16% : 0.000002s : 1: substitution.set_cell_output_no_recompute 1.74% : 0.000027s : 8: substitution.switch_simplify 2.37% : 0.000037s : 16: substitution.tuple_list_convert_item_index_to_positive 1.21% : 0.000019s : 16: substitution.tuple_list_get_item_const_eliminator 1.57% : 0.000025s : 16: substitution.tuple_list_get_item_depend_reorder 5.41% : 0.000085s : 39: substitution.tuple_list_get_item_eliminator 1.59% : 0.000025s : 16: substitution.tuple_list_get_set_item_eliminator 0.81% : 0.000013s : 1: substitution.value_based_eliminate ------[type_inference.] 0.177507 2 97.01% : 0.172209s : 1: type_inference.infer 2.99% : 0.005299s : 1: type_inference.specialize ------[replace.] 0.000493 71 2.23% : 0.000011s : 2: replace.arithmetic_simplify 55.41% : 0.000273s : 41: replace.inline 14.84% : 0.000073s : 8: replace.switch_simplify 27.53% : 0.000136s : 20: replace.tuple_list_get_item_eliminator ------[match.] 0.000999 71 2.49% : 0.000025s : 2: match.arithmetic_simplify 90.98% : 0.000909s : 41: match.inline 2.30% : 0.000023s : 8: match.switch_simplify 4.23% : 0.000042s : 20: match.tuple_list_get_item_eliminator ------[predicate.] 0.001554 10935 1.03% : 0.000016s : 125: predicate.accumulaten_eliminater 0.38% : 0.000006s : 18: predicate.ad_related_special_op_eliminate 0.46% : 0.000007s : 55: predicate.addn_check_dump 1.06% : 0.000016s : 125: predicate.addn_zero_filter 1.00% : 0.000016s : 125: predicate.adjust_all_reduce_mul_add 2.53% : 0.000039s : 182: predicate.arithmetic_simplify 1.20% : 0.000019s : 127: predicate.cast_eliminate 1.29% : 0.000020s : 149: predicate.check_bprop_eliminate 0.47% : 0.000007s : 55: predicate.compare_switch_simplify 0.08% : 0.000001s : 18: predicate.const_output_eliminate 0.47% : 0.000007s : 55: predicate.depend_value_elim 1.14% : 0.000018s : 127: predicate.dict_get_item_const_eliminator 1.30% : 0.000020s : 127: predicate.dict_get_item_eliminator 1.05% : 0.000016s : 127: predicate.dict_set_item_eliminator 0.38% : 0.000006s : 36: predicate.dumpgradient_eliminate 0.10% : 0.000002s : 18: predicate.elim_not_effective 0.18% : 0.000003s : 18: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000018s : 145: predicate.environ_add_const_eliminate 1.23% : 0.000019s : 145: predicate.environ_get_add_eliminate 1.20% : 0.000019s : 145: predicate.environ_get_depend_swap 1.69% : 0.000026s : 200: predicate.environ_get_eliminate 1.22% : 0.000019s : 145: predicate.environ_get_set_eliminate 1.65% : 0.000026s : 188: predicate.exchange_switch_depend_value 2.20% : 0.000034s : 188: predicate.float_depend_g_call 0.48% : 0.000007s : 55: predicate.float_environ_get_switch 0.62% : 0.000010s : 73: predicate.float_tuple_getitem_switch 0.09% : 0.000001s : 18: predicate.fold_const_symbol 0.51% : 0.000008s : 55: predicate.get_grad_eliminate 0.10% : 0.000002s : 18: predicate.graph_param_transform 0.48% : 0.000007s : 55: predicate.incorporate_call 0.45% : 0.000007s : 55: predicate.incorporate_call_switch 5.31% : 0.000083s : 459: predicate.inline 1.29% : 0.000020s : 112: predicate.inline_without_move 0.25% : 0.000004s : 55: predicate.j_node_and_user_rematch 0.58% : 0.000009s : 55: predicate.less_batch_normalization 1.61% : 0.000025s : 183: predicate.list_to_tuple_eliminator_ 2.66% : 0.000041s : 308: predicate.load_eliminater 0.36% : 0.000006s : 18: predicate.loop_unroll_after_grad 2.34% : 0.000036s : 261: predicate.loop_unroll_before_grad 1.45% : 0.000023s : 163: predicate.make_slice_get_slice_eliminator 0.48% : 0.000007s : 55: predicate.merge_addn 1.27% : 0.000020s : 149: predicate.micro_step_allgather_replace 1.27% : 0.000020s : 149: predicate.mini_step_allgather_replace 1.10% : 0.000017s : 127: predicate.minmaximum_grad 0.37% : 0.000006s : 18: predicate.mutable_eliminate 0.18% : 0.000003s : 18: predicate.opt_reshape 0.18% : 0.000003s : 18: predicate.parallel_virtual_node 2.21% : 0.000034s : 188: predicate.partial_defer_inline 1.59% : 0.000025s : 165: predicate.partial_eliminate 1.05% : 0.000016s : 125: predicate.print_const_string_wrapper 0.48% : 0.000007s : 55: predicate.reduce_all_const_elim 1.39% : 0.000022s : 127: predicate.reduce_eliminate 2.51% : 0.000039s : 308: predicate.redundant_stop_gradient_eliminater 0.27% : 0.000004s : 55: predicate.remove_not_recompute_node 1.67% : 0.000026s : 296: predicate.replace_applicator 0.57% : 0.000009s : 112: predicate.replace_old_param 0.10% : 0.000001s : 18: predicate.reset_defer_inline 1.09% : 0.000017s : 127: predicate.reshape_eliminate 1.29% : 0.000020s : 149: predicate.row_tensor_add_zeros_like 0.18% : 0.000003s : 18: predicate.row_tensor_eliminate 1.46% : 0.000023s : 149: predicate.same_eliminate 0.28% : 0.000004s : 55: predicate.set_cell_output_no_recompute 0.53% : 0.000008s : 55: predicate.shard_identity_eliminate 0.35% : 0.000005s : 36: predicate.special_op_eliminate 0.55% : 0.000009s : 55: predicate.specialize_transform 1.34% : 0.000021s : 149: predicate.split_environ_get_set_with_tuple_value 1.14% : 0.000018s : 112: predicate.stack_unstack_eliminate 0.18% : 0.000003s : 18: predicate.switch_call_monad_eliminater 1.81% : 0.000028s : 188: predicate.switch_defer_inline 3.07% : 0.000048s : 337: predicate.switch_layer_defer_inline 5.15% : 0.000080s : 538: predicate.switch_simplify 1.10% : 0.000017s : 127: predicate.tile_eliminate 1.14% : 0.000018s : 127: predicate.transpose_eliminate 1.54% : 0.000024s : 163: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000024s : 163: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000023s : 163: predicate.tuple_list_get_item_depend_reorder 2.56% : 0.000040s : 238: predicate.tuple_list_get_item_eliminator 1.52% : 0.000024s : 163: predicate.tuple_list_get_set_item_eliminator 2.11% : 0.000033s : 218: predicate.tuple_list_set_item_eliminator 1.61% : 0.000025s : 183: predicate.tuple_to_list_eliminator_ 2.54% : 0.000040s : 308: predicate.updatestate_pure_node_eliminater 3.07% : 0.000048s : 363: predicate.updatestate_useless_node_eliminater 0.21% : 0.000003s : 18: predicate.value_based_eliminate 0.52% : 0.000008s : 55: predicate.virtual_dataset_eliminate 0.51% : 0.000008s : 55: predicate.virtual_output_eliminate 0.18% : 0.000003s : 18: predicate.virtual_view_grad_eliminate 0.19% : 0.000003s : 18: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004594 76 57.05% : 0.002621s : 31: func_graph_cloner_run.FuncGraphClonerGraph 42.95% : 0.001973s : 45: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.530709 237 0.00% : 0.000004s : 1: ForceFp32Comm 0.29% : 0.007299s : 1: add_attr 0.29% : 0.007287s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.01% : 0.000137s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000304s : 1: auto_monad 0.00% : 0.000052s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000018s : 1: bias_add_comm_swap 0.03% : 0.000800s : 1: bootstrap 0.00% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000041s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000069s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000022s : 1: environ_conv 0.00% : 0.000070s : 1: event_method 0.00% : 0.000025s : 1: execute 0.00% : 0.000017s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000021s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000016s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000020s : 1: label_micro_interleaved_index 0.02% : 0.000479s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000516s : 1: mutable_eliminate 0.00% : 0.000013s : 1: offloading_packed_experts 0.00% : 0.000032s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000034s : 1: opt.transform.mutable_eliminate 0.39% : 0.009913s : 117: opt.transform.opt_a 0.00% : 0.000105s : 1: opt.transform.opt_after_cconv 0.00% : 0.000071s : 1: opt.transform.opt_after_jit_grad 0.02% : 0.000463s : 28: opt.transform.opt_b 0.01% : 0.000169s : 2: opt.transform.opt_trans_graph 0.00% : 0.000119s : 4: opt.transform.symbol_engine_opt 1.02% : 0.025814s : 1: opt_a 0.01% : 0.000259s : 1: opt_after_cconv 0.02% : 0.000562s : 1: opt_after_jit_grad 0.03% : 0.000646s : 1: opt_b 1.17% : 0.029661s : 1: optimize 0.00% : 0.000054s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000090s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000036s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000021s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000018s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000026s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000106s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000017s : 1: remove_cast_before_assign_add 0.00% : 0.000084s : 1: remove_dup_value 0.25% : 0.006386s : 2: renormalize.infer 0.14% : 0.003637s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000075s : 1: rewriter_after_opt_a 0.02% : 0.000519s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000018s : 1: split_matmul_comm_elemetwise 0.00% : 0.000048s : 1: swap_dp_allreduce_reducescatter 0.01% : 0.000193s : 1: symbol_engine_optimizer 89.13% : 2.255656s : 1: task_emit 0.01% : 0.000201s : 1: tuple_transform 7.04% : 0.178046s : 1: type_inference 0.01% : 0.000226s : 1: validate group_cases_20 have all been run, results of sub cases are below: case: (1, ) {} pass. case: (1, ) {} pass. case: (1, ) {} pass. case: (1, ) {} pass. case: (0, ) {} pass. case: (0, ) {} pass. case: (0, ) {} pass. case: (0, ) {} pass. ops group_cases_21 with 8 cases start to running, all cases are below: case: (, 1, ) case: (, 0, ) case: (, 1, ) case: (, 0, ) case: (, 0, ) case: (, 1, ) case: (, 1, ) case: (, 0, ) ops group_cases_21 total running memory: 32M, memory threshold: 51200M TotalTime = 2.14817, [24] [bootstrap]: 0.00086548 [type_inference]: 0.175191 [event_method]: 5.389e-05 [auto_monad]: 0.00028997 [graph_reusing]: 1.61e-05 [inline]: 2.51998e-06 [add_attr]: 0.00753252, [1] [add_attr_with_inline]: 0.00751942, [1] [Cycle 1]: 0.00018277, [2] [tag_attr]: 7.222e-05 [meta_addattr_fg_expand]: 2.924e-05 [parallel-infer-symbol]: 3.21999e-06 [pre_auto_parallel]: 9.99e-05 [insert-virtual-dataset]: 2.58e-06 [parallel-infer-symbol-second]: 8.49977e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.85001e-06 [optimize]: 0.0267392, [53] [py_interpret_to_execute]: 5.14998e-06 [rewriter_before_opt_a]: 0.00044186 [opt_a]: 0.0239384, [3] [Cycle 1]: 0.0186398, [45] [expand_dump_flag]: 6.94001e-06 [switch_simplify]: 0.00027302 [loop_unroll]: 9.274e-05 [a_1]: 0.00201183 [with_stream_mark]: 2.402e-05 [recompute_prepare]: 2.121e-05 [updatestate_depend_eliminate]: 1.962e-05 [updatestate_assign_eliminate]: 1.71e-05 [updatestate_loads_eliminate]: 7.41001e-06 [parameter_eliminate]: 3.21999e-06 [a_2]: 0.00021851 [accelerated_algorithm]: 1.469e-05 [shard]: 1.92001e-06 [meta_shard_fg_expand]: 5.30001e-06 [shard_inline]: 1.387e-05 [merge_send_recv]: 5.918e-05 [auto_parallel]: 1.106e-05 [parallel]: 8.486e-05 [flash_sp]: 3.839e-05 [merge_comm]: 9.62001e-06 [allreduce_fusion]: 1.881e-05 [matmul_add_comm_reduction]: 3.518e-05 [allreduce_slice_to_reducescatter]: 1.071e-05 [virtual_shard_identity]: 1.721e-05 [virtual_dataset]: 1.392e-05 [get_grad_eliminate_]: 1.38e-05 [virtual_output]: 1.383e-05 [merge_forward]: 9.20999e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 2.755e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.651e-05 [merge_recompute_call_nodes]: 1.46998e-06 [before_grad]: 3.409e-05 [set_forward_comm_id_for_comm_node_pass]: 1.953e-05 [meta_fg_expand]: 0.002368 [flash_sp_send_recv_attached]: 4.08001e-06 [receive_attached]: 2.418e-05 [after_resolve]: 8.001e-05 [a_after_grad]: 0.0001129 [renormalize]: 0.0116026 [add_forward_monad_depend]: 1.196e-05 [auto_monad_grad]: 7.13e-06 [auto_monad_eliminator]: 7.845e-05 [cse]: 0.00038811 [a_3]: 0.00048647 [Cycle 2]: 0.00440569, [45] [expand_dump_flag]: 2.26998e-06 [switch_simplify]: 6.427e-05 [loop_unroll]: 6.214e-05 [a_1]: 0.00199048 [with_stream_mark]: 1.582e-05 [recompute_prepare]: 1.102e-05 [updatestate_depend_eliminate]: 5.39e-06 [updatestate_assign_eliminate]: 4.11001e-06 [updatestate_loads_eliminate]: 3.95e-06 [parameter_eliminate]: 1.14998e-06 [a_2]: 0.00011989 [accelerated_algorithm]: 9.10999e-06 [shard]: 1.02998e-06 [meta_shard_fg_expand]: 2.61999e-06 [shard_inline]: 8.45999e-06 [merge_send_recv]: 7.11999e-06 [auto_parallel]: 7.71001e-06 [parallel]: 4.32e-06 [flash_sp]: 3.20002e-06 [merge_comm]: 5.31002e-06 [allreduce_fusion]: 4.85001e-06 [matmul_add_comm_reduction]: 7.46999e-06 [allreduce_slice_to_reducescatter]: 3.39991e-07 [virtual_shard_identity]: 9.91e-06 [virtual_dataset]: 8.69e-06 [get_grad_eliminate_]: 9.10999e-06 [virtual_output]: 8.28999e-06 [merge_forward]: 5.82999e-06 [cell_reuse_recompute_pass]: 9.60019e-07 [offload_activation]: 1.009e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.737e-05 [merge_recompute_call_nodes]: 7.79983e-07 [before_grad]: 1.447e-05 [set_forward_comm_id_for_comm_node_pass]: 5.36998e-06 [meta_fg_expand]: 0.00043323 [flash_sp_send_recv_attached]: 1.54e-06 [receive_attached]: 1.39e-06 [after_resolve]: 2.254e-05 [a_after_grad]: 1.457e-05 [renormalize]: 0.00107837 [add_forward_monad_depend]: 4.58001e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 1.583e-05 [cse]: 8.164e-05 [a_3]: 6.448e-05 [Cycle 3]: 0.00087885, [45] [expand_dump_flag]: 1.18001e-06 [switch_simplify]: 1.038e-05 [loop_unroll]: 8.62e-06 [a_1]: 0.00021884 [with_stream_mark]: 1.081e-05 [recompute_prepare]: 8.89e-06 [updatestate_depend_eliminate]: 4.75001e-06 [updatestate_assign_eliminate]: 4.03999e-06 [updatestate_loads_eliminate]: 4.13001e-06 [parameter_eliminate]: 1.00999e-06 [a_2]: 0.00011633 [accelerated_algorithm]: 8.82e-06 [shard]: 1.09998e-06 [meta_shard_fg_expand]: 1.90001e-06 [shard_inline]: 8.47998e-06 [merge_send_recv]: 6.64999e-06 [auto_parallel]: 7.13998e-06 [parallel]: 3.98001e-06 [flash_sp]: 8.79983e-07 [merge_comm]: 5.05999e-06 [allreduce_fusion]: 4.73001e-06 [matmul_add_comm_reduction]: 8.28999e-06 [allreduce_slice_to_reducescatter]: 3.20026e-07 [virtual_shard_identity]: 9.27999e-06 [virtual_dataset]: 8.08999e-06 [get_grad_eliminate_]: 8.08001e-06 [virtual_output]: 7.97998e-06 [merge_forward]: 4.37998e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 9.05001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.625e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 1.365e-05 [set_forward_comm_id_for_comm_node_pass]: 5.09998e-06 [meta_fg_expand]: 3.35e-06 [flash_sp_send_recv_attached]: 8.49977e-07 [receive_attached]: 9.30013e-07 [after_resolve]: 1.13e-05 [a_after_grad]: 1.291e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.12e-06 [auto_monad_grad]: 9.70002e-07 [auto_monad_eliminator]: 1.066e-05 [cse]: 5.378e-05 [a_3]: 5.651e-05 [py_interpret_to_execute_after_opt_a]: 4.85001e-06 [slice_cell_reuse_recomputed_activation]: 1.96e-06 [rewriter_after_opt_a]: 4.047e-05 [convert_after_rewriter]: 1.29e-06 [order_py_execute_after_rewriter]: 1.15999e-06 [mutable_eliminate]: 0.00050854 [opt_b]: 0.00029525, [1] [Cycle 1]: 0.00028883, [7] [b_1]: 0.00019068 [b_2]: 1.006e-05 [updatestate_depend_eliminate]: 7.55998e-06 [updatestate_assign_eliminate]: 4.13001e-06 [updatestate_loads_eliminate]: 4.25e-06 [renormalize]: 4.30009e-07 [cse]: 3.768e-05 [optimize_parallel_all_gather_comm]: 3.33e-05 [overlap_param_gather]: 1.234e-05 [cconv]: 2.278e-05 [loop_unroll]: 0.00047285 [opt_after_cconv]: 0.00013658, [1] [Cycle 1]: 0.00013113, [7] [c_1]: 4.502e-05 [parameter_eliminate]: 2.45002e-06 [updatestate_depend_eliminate]: 7.69002e-06 [updatestate_assign_eliminate]: 4.42e-06 [updatestate_loads_eliminate]: 4.12003e-06 [cse]: 3.372e-05 [renormalize]: 4.50003e-07 [remove_dup_value]: 2.606e-05 [tuple_transform]: 9.244e-05, [1] [Cycle 1]: 8.786e-05, [4] [d_1]: 5.87e-05 [none_parameter_eliminate]: 1.77999e-06 [renormalize]: 1.30007e-07 [switch_simplify]: 9.52001e-06 [partial_unused_args_eliminate]: 1.94999e-06 [add_recomputation]: 7.09e-05 [cse_after_recomputation]: 3.574e-05, [1] [Cycle 1]: 3.121e-05, [1] [cse]: 2.555e-05 [environ_conv]: 1.114e-05 [swap_dp_allreduce_reducescatter]: 2.932e-05 [bias_add_comm_swap]: 1.232e-05 [label_micro_interleaved_index]: 1.406e-05 [label_fine_grained_interleaved_index]: 2.62001e-06 [merge_cast_opt]: 1.49998e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.90002e-06 [assign_add_opt]: 1.29e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.078e-05 [full_micro_interleaved_order_control]: 1.148e-05 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 1.01997e-06 [add_comm_op_reuse_tag]: 9.70002e-07 [interleave_split_concat_branches]: 1.11002e-06 [interleave_parallel_branches]: 1.024e-05 [overlap_opt_shard_in_pipeline]: 1.668e-05 [overlap_opt_shard_grad_in_pipeline]: 1.72999e-06 [control_data_broadcast_order]: 1.781e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 5.07e-06 [overlap_recompute_and_grad_model_parallel]: 1.518e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.30002e-06 [overlap_grad_ring_attention]: 2.395e-05 [overlap_grad_flash_sp]: 5.462e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 1.175e-05 [split_layernorm_comm]: 1.70001e-06 [handle_group_info]: 1.02e-06 [symbol_engine_optimizer]: 9.377e-05, [1] [Cycle 1]: 8.931e-05, [6] [build]: 3.78001e-06 [elim_shapecalc]: 1.4e-05 [elim_not_effective]: 1.827e-05 [opt_reshape]: 9.54e-06 [fold_const_symbol]: 1.538e-05 [renormalize]: 2.50002e-07 [detach_backward]: 2.01e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 2.865e-05 [get_jit_bprop_graph]: 1.00001e-06 [rewriter_after_jit_bprop_graph]: 3.09999e-06 [opt_after_jit_grad]: 0.00049247 [validate]: 6.239e-05 [backend_pass]: 9.5999e-07 [task_emit]: 1.93614 [execute]: 1.046e-05 Sums bootstrap : 0.000865s : 0.04% type_inference : 0.175191s : 8.19% event_method : 0.000054s : 0.00% auto_monad : 0.000290s : 0.01% graph_reusing : 0.000016s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000072s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000029s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000100s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000442s : 0.02% optimize.opt_a.expand_dump_flag : 0.000010s : 0.00% optimize.opt_a.switch_simplify : 0.000348s : 0.02% optimize.opt_a.loop_unroll : 0.000163s : 0.01% optimize.opt_a.a_1 : 0.004221s : 0.20% optimize.opt_a.with_stream_mark : 0.000051s : 0.00% optimize.opt_a.recompute_prepare : 0.000041s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000030s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000025s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000015s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000455s : 0.02% optimize.opt_a.accelerated_algorithm : 0.000033s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000010s : 0.00% optimize.opt_a.shard_inline : 0.000031s : 0.00% optimize.opt_a.merge_send_recv : 0.000073s : 0.00% optimize.opt_a.auto_parallel : 0.000026s : 0.00% optimize.opt_a.parallel : 0.000093s : 0.00% optimize.opt_a.flash_sp : 0.000042s : 0.00% optimize.opt_a.merge_comm : 0.000020s : 0.00% optimize.opt_a.allreduce_fusion : 0.000028s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000051s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000011s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000036s : 0.00% optimize.opt_a.virtual_dataset : 0.000031s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000031s : 0.00% optimize.opt_a.virtual_output : 0.000030s : 0.00% optimize.opt_a.merge_forward : 0.000019s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000047s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000060s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000062s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000030s : 0.00% optimize.opt_a.meta_fg_expand : 0.002805s : 0.13% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000026s : 0.00% optimize.opt_a.after_resolve : 0.000114s : 0.01% optimize.opt_a.a_after_grad : 0.000140s : 0.01% optimize.opt_a.renormalize : 0.012681s : 0.59% optimize.opt_a.add_forward_monad_depend : 0.000018s : 0.00% optimize.opt_a.auto_monad_grad : 0.000009s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000105s : 0.00% optimize.opt_a.cse : 0.000524s : 0.02% optimize.opt_a.a_3 : 0.000607s : 0.03% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000040s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000509s : 0.02% optimize.opt_b.b_1 : 0.000191s : 0.01% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000038s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000033s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000023s : 0.00% optimize.loop_unroll : 0.000473s : 0.02% optimize.opt_after_cconv.c_1 : 0.000045s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000034s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000026s : 0.00% optimize.tuple_transform.d_1 : 0.000059s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000071s : 0.00% optimize.cse_after_recomputation.cse : 0.000026s : 0.00% optimize.environ_conv : 0.000011s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000029s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000014s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000011s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000010s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000017s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000015s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000024s : 0.00% optimize.overlap_grad_flash_sp : 0.000055s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000012s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000018s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000029s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000492s : 0.02% validate : 0.000062s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 1.936136s : 90.52% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.001133 243 4.01% : 0.000045s : 5: substitution.arithmetic_simplify 4.60% : 0.000052s : 9: substitution.cast_eliminate 0.24% : 0.000003s : 5: substitution.elim_not_effective 0.74% : 0.000008s : 12: substitution.float_depend_g_call 0.32% : 0.000004s : 2: substitution.float_tuple_getitem_switch 0.22% : 0.000002s : 5: substitution.fold_const_symbol 0.61% : 0.000007s : 6: substitution.graph_param_transform 0.29% : 0.000003s : 2: substitution.incorporate_call 0.17% : 0.000002s : 2: substitution.incorporate_call_switch 62.51% : 0.000708s : 35: substitution.inline 1.88% : 0.000021s : 3: substitution.inline_without_move 1.65% : 0.000019s : 19: substitution.j_node_and_user_rematch 1.13% : 0.000013s : 10: substitution.minmaximum_grad 2.56% : 0.000029s : 12: substitution.partial_eliminate 1.25% : 0.000014s : 19: substitution.remove_not_recompute_node 2.70% : 0.000031s : 14: substitution.replace_applicator 0.83% : 0.000009s : 12: substitution.replace_old_param 0.25% : 0.000003s : 1: substitution.set_cell_output_no_recompute 2.41% : 0.000027s : 7: substitution.switch_simplify 2.29% : 0.000026s : 10: substitution.tuple_list_convert_item_index_to_positive 1.08% : 0.000012s : 10: substitution.tuple_list_get_item_const_eliminator 1.53% : 0.000017s : 10: substitution.tuple_list_get_item_depend_reorder 5.16% : 0.000058s : 23: substitution.tuple_list_get_item_eliminator 1.58% : 0.000018s : 10: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.175009 2 96.85% : 0.169501s : 1: type_inference.infer 3.15% : 0.005508s : 1: type_inference.specialize ------[replace.] 0.000392 54 1.37% : 0.000005s : 1: replace.arithmetic_simplify 58.58% : 0.000230s : 35: replace.inline 17.80% : 0.000070s : 7: replace.switch_simplify 22.25% : 0.000087s : 11: replace.tuple_list_get_item_eliminator ------[match.] 0.000765 54 2.96% : 0.000023s : 1: match.arithmetic_simplify 90.34% : 0.000691s : 35: match.inline 3.07% : 0.000023s : 7: match.switch_simplify 3.63% : 0.000028s : 11: match.tuple_list_get_item_eliminator ------[predicate.] 0.000901 6167 1.09% : 0.000010s : 77: predicate.accumulaten_eliminater 0.27% : 0.000002s : 6: predicate.ad_related_special_op_eliminate 0.36% : 0.000003s : 24: predicate.addn_check_dump 1.13% : 0.000010s : 77: predicate.addn_zero_filter 1.11% : 0.000010s : 77: predicate.adjust_all_reduce_mul_add 2.23% : 0.000020s : 102: predicate.arithmetic_simplify 1.28% : 0.000012s : 78: predicate.cast_eliminate 1.21% : 0.000011s : 79: predicate.check_bprop_eliminate 0.37% : 0.000003s : 24: predicate.compare_switch_simplify 0.06% : 0.000001s : 6: predicate.const_output_eliminate 0.38% : 0.000003s : 24: predicate.depend_value_elim 1.20% : 0.000011s : 78: predicate.dict_get_item_const_eliminator 1.34% : 0.000012s : 78: predicate.dict_get_item_eliminator 1.17% : 0.000011s : 78: predicate.dict_set_item_eliminator 0.27% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.07% : 0.000001s : 6: predicate.elim_not_effective 0.12% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.23% : 0.000011s : 84: predicate.environ_add_const_eliminate 1.17% : 0.000011s : 84: predicate.environ_get_add_eliminate 1.18% : 0.000011s : 84: predicate.environ_get_depend_swap 1.62% : 0.000015s : 108: predicate.environ_get_eliminate 1.18% : 0.000011s : 84: predicate.environ_get_set_eliminate 1.85% : 0.000017s : 124: predicate.exchange_switch_depend_value 2.48% : 0.000022s : 124: predicate.float_depend_g_call 0.37% : 0.000003s : 24: predicate.float_environ_get_switch 0.46% : 0.000004s : 30: predicate.float_tuple_getitem_switch 0.05% : 0.000000s : 6: predicate.fold_const_symbol 0.41% : 0.000004s : 24: predicate.get_grad_eliminate 0.06% : 0.000001s : 6: predicate.graph_param_transform 0.38% : 0.000003s : 24: predicate.incorporate_call 0.34% : 0.000003s : 24: predicate.incorporate_call_switch 5.40% : 0.000049s : 261: predicate.inline 1.36% : 0.000012s : 65: predicate.inline_without_move 0.19% : 0.000002s : 24: predicate.j_node_and_user_rematch 0.46% : 0.000004s : 24: predicate.less_batch_normalization 1.63% : 0.000015s : 101: predicate.list_to_tuple_eliminator_ 2.64% : 0.000024s : 178: predicate.load_eliminater 0.33% : 0.000003s : 6: predicate.loop_unroll_after_grad 2.85% : 0.000026s : 183: predicate.loop_unroll_before_grad 1.33% : 0.000012s : 90: predicate.make_slice_get_slice_eliminator 0.37% : 0.000003s : 24: predicate.merge_addn 1.16% : 0.000010s : 79: predicate.micro_step_allgather_replace 1.17% : 0.000011s : 79: predicate.mini_step_allgather_replace 1.11% : 0.000010s : 78: predicate.minmaximum_grad 0.31% : 0.000003s : 6: predicate.mutable_eliminate 0.11% : 0.000001s : 6: predicate.opt_reshape 0.12% : 0.000001s : 6: predicate.parallel_virtual_node 2.44% : 0.000022s : 124: predicate.partial_defer_inline 1.59% : 0.000014s : 95: predicate.partial_eliminate 1.16% : 0.000010s : 77: predicate.print_const_string_wrapper 0.39% : 0.000003s : 24: predicate.reduce_all_const_elim 1.48% : 0.000013s : 78: predicate.reduce_eliminate 2.57% : 0.000023s : 178: predicate.redundant_stop_gradient_eliminater 0.22% : 0.000002s : 24: predicate.remove_not_recompute_node 1.73% : 0.000016s : 168: predicate.replace_applicator 0.60% : 0.000005s : 65: predicate.replace_old_param 0.07% : 0.000001s : 6: predicate.reset_defer_inline 1.15% : 0.000010s : 78: predicate.reshape_eliminate 1.19% : 0.000011s : 79: predicate.row_tensor_add_zeros_like 0.12% : 0.000001s : 6: predicate.row_tensor_eliminate 1.46% : 0.000013s : 79: predicate.same_eliminate 0.24% : 0.000002s : 24: predicate.set_cell_output_no_recompute 0.45% : 0.000004s : 24: predicate.shard_identity_eliminate 0.25% : 0.000002s : 12: predicate.special_op_eliminate 0.44% : 0.000004s : 24: predicate.specialize_transform 1.33% : 0.000012s : 79: predicate.split_environ_get_set_with_tuple_value 1.22% : 0.000011s : 65: predicate.stack_unstack_eliminate 0.10% : 0.000001s : 6: predicate.switch_call_monad_eliminater 2.10% : 0.000019s : 124: predicate.switch_defer_inline 4.24% : 0.000038s : 203: predicate.switch_layer_defer_inline 5.92% : 0.000053s : 351: predicate.switch_simplify 1.16% : 0.000010s : 78: predicate.tile_eliminate 1.13% : 0.000010s : 78: predicate.transpose_eliminate 1.46% : 0.000013s : 90: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000013s : 90: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000013s : 90: predicate.tuple_list_get_item_depend_reorder 2.41% : 0.000022s : 125: predicate.tuple_list_get_item_eliminator 1.45% : 0.000013s : 90: predicate.tuple_list_get_set_item_eliminator 1.95% : 0.000018s : 114: predicate.tuple_list_set_item_eliminator 1.56% : 0.000014s : 101: predicate.tuple_to_list_eliminator_ 2.48% : 0.000022s : 178: predicate.updatestate_pure_node_eliminater 2.90% : 0.000026s : 202: predicate.updatestate_useless_node_eliminater 0.12% : 0.000001s : 6: predicate.value_based_eliminate 0.42% : 0.000004s : 24: predicate.virtual_dataset_eliminate 0.44% : 0.000004s : 24: predicate.virtual_output_eliminate 0.11% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.13% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004107 66 57.97% : 0.002381s : 27: func_graph_cloner_run.FuncGraphClonerGraph 42.03% : 0.001726s : 39: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.201485 237 0.00% : 0.000004s : 1: ForceFp32Comm 0.34% : 0.007537s : 1: add_attr 0.34% : 0.007524s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000075s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000299s : 1: auto_monad 0.00% : 0.000033s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.04% : 0.000911s : 1: bootstrap 0.00% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000039s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000014s : 1: environ_conv 0.00% : 0.000061s : 1: event_method 0.00% : 0.000029s : 1: execute 0.00% : 0.000015s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000021s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000017s : 1: label_micro_interleaved_index 0.02% : 0.000481s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.02% : 0.000517s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000020s : 1: opt.transform.mutable_eliminate 0.28% : 0.006266s : 117: opt.transform.opt_a 0.00% : 0.000044s : 1: opt.transform.opt_after_cconv 0.00% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000169s : 28: opt.transform.opt_b 0.00% : 0.000066s : 2: opt.transform.opt_trans_graph 0.00% : 0.000054s : 4: opt.transform.symbol_engine_opt 1.09% : 0.023942s : 1: opt_a 0.01% : 0.000140s : 1: opt_after_cconv 0.02% : 0.000501s : 1: opt_after_jit_grad 0.01% : 0.000299s : 1: opt_b 1.21% : 0.026744s : 1: optimize 0.00% : 0.000037s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000058s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000027s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000020s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000016s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000018s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000105s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000014s : 1: remove_cast_before_assign_add 0.00% : 0.000030s : 1: remove_dup_value 0.46% : 0.010047s : 2: renormalize.infer 0.12% : 0.002619s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000044s : 1: rewriter_after_opt_a 0.02% : 0.000449s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000015s : 1: split_matmul_comm_elemetwise 0.00% : 0.000033s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000097s : 1: symbol_engine_optimizer 87.95% : 1.936244s : 1: task_emit 0.00% : 0.000095s : 1: tuple_transform 7.96% : 0.175216s : 1: type_inference 0.01% : 0.000176s : 1: validate TotalTime = 2.52292, [24] [bootstrap]: 0.00091675 [type_inference]: 0.180843 [event_method]: 5.357e-05 [auto_monad]: 0.00023426 [graph_reusing]: 1.491e-05 [inline]: 2.00002e-06 [add_attr]: 0.00722648, [1] [add_attr_with_inline]: 0.00721491, [1] [Cycle 1]: 0.00015621, [2] [tag_attr]: 6.34e-05 [meta_addattr_fg_expand]: 2.471e-05 [parallel-infer-symbol]: 2.73e-06 [pre_auto_parallel]: 9.014e-05 [insert-virtual-dataset]: 1.81003e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 1.84e-06 [pipeline_split]: 1.72001e-06 [optimize]: 0.0261891, [53] [py_interpret_to_execute]: 4.60001e-06 [rewriter_before_opt_a]: 0.00043264 [opt_a]: 0.0235253, [3] [Cycle 1]: 0.0182108, [45] [expand_dump_flag]: 5.87001e-06 [switch_simplify]: 0.00025169 [loop_unroll]: 9.389e-05 [a_1]: 0.0019732 [with_stream_mark]: 2.21e-05 [recompute_prepare]: 2.042e-05 [updatestate_depend_eliminate]: 1.535e-05 [updatestate_assign_eliminate]: 1.179e-05 [updatestate_loads_eliminate]: 6.87002e-06 [parameter_eliminate]: 2.12999e-06 [a_2]: 0.0002172 [accelerated_algorithm]: 1.521e-05 [shard]: 1.13001e-06 [meta_shard_fg_expand]: 5.10999e-06 [shard_inline]: 1.431e-05 [merge_send_recv]: 5.976e-05 [auto_parallel]: 1.187e-05 [parallel]: 6.827e-05 [flash_sp]: 2.431e-05 [merge_comm]: 1.015e-05 [allreduce_fusion]: 1.56e-05 [matmul_add_comm_reduction]: 2.704e-05 [allreduce_slice_to_reducescatter]: 4.98001e-06 [virtual_shard_identity]: 1.817e-05 [virtual_dataset]: 1.435e-05 [get_grad_eliminate_]: 1.412e-05 [virtual_output]: 1.388e-05 [merge_forward]: 8.33999e-06 [cell_reuse_recompute_pass]: 1.09e-06 [offload_activation]: 2.283e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.858e-05 [merge_recompute_call_nodes]: 8.29983e-07 [before_grad]: 3.036e-05 [set_forward_comm_id_for_comm_node_pass]: 1.505e-05 [meta_fg_expand]: 0.00216721 [flash_sp_send_recv_attached]: 3.61001e-06 [receive_attached]: 1.43e-05 [after_resolve]: 8.178e-05 [a_after_grad]: 0.00011508 [renormalize]: 0.0115338 [add_forward_monad_depend]: 1.218e-05 [auto_monad_grad]: 7.45e-06 [auto_monad_eliminator]: 7.686e-05 [cse]: 0.00036572 [a_3]: 0.00048309 [Cycle 2]: 0.00437812, [45] [expand_dump_flag]: 2.37999e-06 [switch_simplify]: 6.592e-05 [loop_unroll]: 6.174e-05 [a_1]: 0.00198679 [with_stream_mark]: 1.575e-05 [recompute_prepare]: 1.12e-05 [updatestate_depend_eliminate]: 5.59998e-06 [updatestate_assign_eliminate]: 4.62e-06 [updatestate_loads_eliminate]: 4.25999e-06 [parameter_eliminate]: 1.17e-06 [a_2]: 0.00012362 [accelerated_algorithm]: 9.42001e-06 [shard]: 1.00999e-06 [meta_shard_fg_expand]: 2.62001e-06 [shard_inline]: 9.40001e-06 [merge_send_recv]: 6.86001e-06 [auto_parallel]: 7.66001e-06 [parallel]: 3.97002e-06 [flash_sp]: 2.83998e-06 [merge_comm]: 5.28002e-06 [allreduce_fusion]: 4.82998e-06 [matmul_add_comm_reduction]: 7.48e-06 [allreduce_slice_to_reducescatter]: 4.09986e-07 [virtual_shard_identity]: 9.64e-06 [virtual_dataset]: 8.97e-06 [get_grad_eliminate_]: 9.59e-06 [virtual_output]: 8.64003e-06 [merge_forward]: 5.42999e-06 [cell_reuse_recompute_pass]: 1.09998e-06 [offload_activation]: 1.028e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.683e-05 [merge_recompute_call_nodes]: 6.69999e-07 [before_grad]: 1.444e-05 [set_forward_comm_id_for_comm_node_pass]: 5.52001e-06 [meta_fg_expand]: 0.00043204 [flash_sp_send_recv_attached]: 1.49e-06 [receive_attached]: 1.40999e-06 [after_resolve]: 2.197e-05 [a_after_grad]: 1.422e-05 [renormalize]: 0.00102689 [add_forward_monad_depend]: 4.50001e-06 [auto_monad_grad]: 1.17999e-06 [auto_monad_eliminator]: 1.658e-05 [cse]: 0.00010045 [a_3]: 6.633e-05 [Cycle 3]: 0.00092213, [45] [expand_dump_flag]: 1.39e-06 [switch_simplify]: 1.055e-05 [loop_unroll]: 8.65999e-06 [a_1]: 0.00023993 [with_stream_mark]: 1.172e-05 [recompute_prepare]: 9.23002e-06 [updatestate_depend_eliminate]: 5.27999e-06 [updatestate_assign_eliminate]: 4.36002e-06 [updatestate_loads_eliminate]: 4.31002e-06 [parameter_eliminate]: 1.00999e-06 [a_2]: 0.00012184 [accelerated_algorithm]: 8.88002e-06 [shard]: 1.25999e-06 [meta_shard_fg_expand]: 2.02999e-06 [shard_inline]: 8.89e-06 [merge_send_recv]: 6.95998e-06 [auto_parallel]: 7.58001e-06 [parallel]: 3.78001e-06 [flash_sp]: 9.79984e-07 [merge_comm]: 5.02999e-06 [allreduce_fusion]: 4.81997e-06 [matmul_add_comm_reduction]: 8.15e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 9.64e-06 [virtual_dataset]: 8.71997e-06 [get_grad_eliminate_]: 8.54002e-06 [virtual_output]: 8.33001e-06 [merge_forward]: 4.50999e-06 [cell_reuse_recompute_pass]: 1.37999e-06 [offload_activation]: 9.05001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.676e-05 [merge_recompute_call_nodes]: 6.69999e-07 [before_grad]: 1.419e-05 [set_forward_comm_id_for_comm_node_pass]: 5.07e-06 [meta_fg_expand]: 3.61999e-06 [flash_sp_send_recv_attached]: 9.00007e-07 [receive_attached]: 9.20001e-07 [after_resolve]: 1.093e-05 [a_after_grad]: 1.271e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.45999e-06 [auto_monad_grad]: 1.00001e-06 [auto_monad_eliminator]: 4.245e-05 [cse]: 2.814e-05 [a_3]: 5.598e-05 [py_interpret_to_execute_after_opt_a]: 4.60001e-06 [slice_cell_reuse_recomputed_activation]: 2.04e-06 [rewriter_after_opt_a]: 3.617e-05 [convert_after_rewriter]: 1.40999e-06 [order_py_execute_after_rewriter]: 1.30001e-06 [mutable_eliminate]: 0.00049985 [opt_b]: 0.00029376, [1] [Cycle 1]: 0.00028738, [7] [b_1]: 0.00018894 [b_2]: 1.078e-05 [updatestate_depend_eliminate]: 7.29001e-06 [updatestate_assign_eliminate]: 4.34002e-06 [updatestate_loads_eliminate]: 4.22e-06 [renormalize]: 4.39992e-07 [cse]: 3.672e-05 [optimize_parallel_all_gather_comm]: 2.708e-05 [overlap_param_gather]: 7.25998e-06 [cconv]: 1.795e-05 [loop_unroll]: 0.00045044 [opt_after_cconv]: 0.0001343, [1] [Cycle 1]: 0.00012859, [7] [c_1]: 4.267e-05 [parameter_eliminate]: 2.32001e-06 [updatestate_depend_eliminate]: 7.38e-06 [updatestate_assign_eliminate]: 4.39002e-06 [updatestate_loads_eliminate]: 4.35e-06 [cse]: 3.428e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 2.179e-05 [tuple_transform]: 9.281e-05, [1] [Cycle 1]: 8.858e-05, [4] [d_1]: 5.905e-05 [none_parameter_eliminate]: 1.33002e-06 [renormalize]: 1.40019e-07 [switch_simplify]: 9.79999e-06 [partial_unused_args_eliminate]: 1.42999e-06 [add_recomputation]: 6.757e-05 [cse_after_recomputation]: 3.646e-05, [1] [Cycle 1]: 3.182e-05, [1] [cse]: 2.594e-05 [environ_conv]: 9.55001e-06 [swap_dp_allreduce_reducescatter]: 2.087e-05 [bias_add_comm_swap]: 7.7e-06 [label_micro_interleaved_index]: 1.055e-05 [label_fine_grained_interleaved_index]: 2.14999e-06 [merge_cast_opt]: 8.70001e-07 [slice_recompute_activation]: 2.02001e-06 [micro_interleaved_order_control]: 2.56998e-06 [assign_add_opt]: 8.89995e-07 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 5.77001e-06 [full_micro_interleaved_order_control]: 8.54998e-06 [reorder_send_recv_between_fp_bp]: 1.92001e-06 [comm_op_add_attrs]: 8.99978e-07 [add_comm_op_reuse_tag]: 7.80012e-07 [interleave_split_concat_branches]: 1.20999e-06 [interleave_parallel_branches]: 6.33e-06 [overlap_opt_shard_in_pipeline]: 1.637e-05 [overlap_opt_shard_grad_in_pipeline]: 1.37999e-06 [control_data_broadcast_order]: 1.661e-05 [grouped_pairwise_exchange_alltoall]: 1.39998e-06 [offloading_packed_experts]: 4.96002e-06 [overlap_recompute_and_grad_model_parallel]: 1.213e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.17e-06 [overlap_recompute_allgather_and_fa_grad]: 9.39996e-07 [overlap_recompute_comm]: 1.77999e-06 [overlap_grad_ring_attention]: 1.595e-05 [overlap_grad_flash_sp]: 3.843e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 6.12001e-06 [split_layernorm_comm]: 1.28002e-06 [handle_group_info]: 9.5999e-07 [symbol_engine_optimizer]: 9.502e-05, [1] [Cycle 1]: 9.03e-05, [6] [build]: 2.97002e-06 [elim_shapecalc]: 1.389e-05 [elim_not_effective]: 1.874e-05 [opt_reshape]: 1.039e-05 [fold_const_symbol]: 1.546e-05 [renormalize]: 1.59984e-07 [detach_backward]: 1.42999e-06 [pipeline_parallel_scheduler]: 1.35999e-06 [auto_monad_reorder]: 2.66e-05 [get_jit_bprop_graph]: 1.04e-06 [rewriter_after_jit_bprop_graph]: 3.29001e-06 [opt_after_jit_grad]: 0.00049291 [validate]: 5.616e-05 [backend_pass]: 8.29983e-07 [task_emit]: 2.30638 [execute]: 9.82999e-06 Sums bootstrap : 0.000917s : 0.04% type_inference : 0.180843s : 7.19% event_method : 0.000054s : 0.00% auto_monad : 0.000234s : 0.01% graph_reusing : 0.000015s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000063s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000025s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000090s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000433s : 0.02% optimize.opt_a.expand_dump_flag : 0.000010s : 0.00% optimize.opt_a.switch_simplify : 0.000328s : 0.01% optimize.opt_a.loop_unroll : 0.000164s : 0.01% optimize.opt_a.a_1 : 0.004200s : 0.17% optimize.opt_a.with_stream_mark : 0.000050s : 0.00% optimize.opt_a.recompute_prepare : 0.000041s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000026s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000021s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000015s : 0.00% optimize.opt_a.parameter_eliminate : 0.000004s : 0.00% optimize.opt_a.a_2 : 0.000463s : 0.02% optimize.opt_a.accelerated_algorithm : 0.000034s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000010s : 0.00% optimize.opt_a.shard_inline : 0.000033s : 0.00% optimize.opt_a.merge_send_recv : 0.000074s : 0.00% optimize.opt_a.auto_parallel : 0.000027s : 0.00% optimize.opt_a.parallel : 0.000076s : 0.00% optimize.opt_a.flash_sp : 0.000028s : 0.00% optimize.opt_a.merge_comm : 0.000020s : 0.00% optimize.opt_a.allreduce_fusion : 0.000025s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000043s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000006s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000037s : 0.00% optimize.opt_a.virtual_dataset : 0.000032s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000032s : 0.00% optimize.opt_a.virtual_output : 0.000031s : 0.00% optimize.opt_a.merge_forward : 0.000018s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000042s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000062s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000059s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000026s : 0.00% optimize.opt_a.meta_fg_expand : 0.002603s : 0.10% optimize.opt_a.flash_sp_send_recv_attached : 0.000006s : 0.00% optimize.opt_a.receive_attached : 0.000017s : 0.00% optimize.opt_a.after_resolve : 0.000115s : 0.00% optimize.opt_a.a_after_grad : 0.000142s : 0.01% optimize.opt_a.renormalize : 0.012561s : 0.50% optimize.opt_a.add_forward_monad_depend : 0.000018s : 0.00% optimize.opt_a.auto_monad_grad : 0.000010s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000136s : 0.01% optimize.opt_a.cse : 0.000494s : 0.02% optimize.opt_a.a_3 : 0.000605s : 0.02% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000036s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000500s : 0.02% optimize.opt_b.b_1 : 0.000189s : 0.01% optimize.opt_b.b_2 : 0.000011s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000037s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000027s : 0.00% optimize.overlap_param_gather : 0.000007s : 0.00% optimize.cconv : 0.000018s : 0.00% optimize.loop_unroll : 0.000450s : 0.02% optimize.opt_after_cconv.c_1 : 0.000043s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000004s : 0.00% optimize.opt_after_cconv.cse : 0.000034s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000022s : 0.00% optimize.tuple_transform.d_1 : 0.000059s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000010s : 0.00% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000068s : 0.00% optimize.cse_after_recomputation.cse : 0.000026s : 0.00% optimize.environ_conv : 0.000010s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000021s : 0.00% optimize.bias_add_comm_swap : 0.000008s : 0.00% optimize.label_micro_interleaved_index : 0.000011s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000006s : 0.00% optimize.full_micro_interleaved_order_control : 0.000009s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000006s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000016s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000001s : 0.00% optimize.control_data_broadcast_order : 0.000017s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000016s : 0.00% optimize.overlap_grad_flash_sp : 0.000038s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000006s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000019s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000010s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000015s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000027s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000493s : 0.02% validate : 0.000056s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.306377s : 91.73% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.001088 242 3.88% : 0.000042s : 5: substitution.arithmetic_simplify 4.06% : 0.000044s : 8: substitution.cast_eliminate 0.26% : 0.000003s : 5: substitution.elim_not_effective 0.70% : 0.000008s : 12: substitution.float_depend_g_call 0.24% : 0.000003s : 2: substitution.float_tuple_getitem_switch 0.20% : 0.000002s : 5: substitution.fold_const_symbol 0.56% : 0.000006s : 6: substitution.graph_param_transform 0.21% : 0.000002s : 2: substitution.incorporate_call 0.16% : 0.000002s : 2: substitution.incorporate_call_switch 64.56% : 0.000702s : 35: substitution.inline 2.09% : 0.000023s : 3: substitution.inline_without_move 1.37% : 0.000015s : 19: substitution.j_node_and_user_rematch 1.14% : 0.000012s : 10: substitution.minmaximum_grad 2.06% : 0.000022s : 12: substitution.partial_eliminate 1.25% : 0.000014s : 19: substitution.remove_not_recompute_node 2.80% : 0.000030s : 14: substitution.replace_applicator 0.84% : 0.000009s : 12: substitution.replace_old_param 0.23% : 0.000002s : 1: substitution.set_cell_output_no_recompute 2.08% : 0.000023s : 7: substitution.switch_simplify 2.22% : 0.000024s : 10: substitution.tuple_list_convert_item_index_to_positive 1.13% : 0.000012s : 10: substitution.tuple_list_get_item_const_eliminator 1.53% : 0.000017s : 10: substitution.tuple_list_get_item_depend_reorder 4.88% : 0.000053s : 23: substitution.tuple_list_get_item_eliminator 1.53% : 0.000017s : 10: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.180686 2 97.01% : 0.175277s : 1: type_inference.infer 2.99% : 0.005409s : 1: type_inference.specialize ------[replace.] 0.000417 54 1.35% : 0.000006s : 1: replace.arithmetic_simplify 60.64% : 0.000253s : 35: replace.inline 16.84% : 0.000070s : 7: replace.switch_simplify 21.17% : 0.000088s : 11: replace.tuple_list_get_item_eliminator ------[match.] 0.000747 54 2.61% : 0.000019s : 1: match.arithmetic_simplify 91.65% : 0.000685s : 35: match.inline 2.51% : 0.000019s : 7: match.switch_simplify 3.24% : 0.000024s : 11: match.tuple_list_get_item_eliminator ------[predicate.] 0.000885 6109 1.12% : 0.000010s : 76: predicate.accumulaten_eliminater 0.28% : 0.000003s : 6: predicate.ad_related_special_op_eliminate 0.36% : 0.000003s : 24: predicate.addn_check_dump 1.14% : 0.000010s : 76: predicate.addn_zero_filter 1.09% : 0.000010s : 76: predicate.adjust_all_reduce_mul_add 2.23% : 0.000020s : 101: predicate.arithmetic_simplify 1.26% : 0.000011s : 77: predicate.cast_eliminate 1.20% : 0.000011s : 78: predicate.check_bprop_eliminate 0.38% : 0.000003s : 24: predicate.compare_switch_simplify 0.05% : 0.000000s : 6: predicate.const_output_eliminate 0.37% : 0.000003s : 24: predicate.depend_value_elim 1.21% : 0.000011s : 77: predicate.dict_get_item_const_eliminator 1.39% : 0.000012s : 77: predicate.dict_get_item_eliminator 1.14% : 0.000010s : 77: predicate.dict_set_item_eliminator 0.28% : 0.000002s : 12: predicate.dumpgradient_eliminate 0.06% : 0.000001s : 6: predicate.elim_not_effective 0.14% : 0.000001s : 6: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000011s : 83: predicate.environ_add_const_eliminate 1.18% : 0.000010s : 83: predicate.environ_get_add_eliminate 1.22% : 0.000011s : 83: predicate.environ_get_depend_swap 1.58% : 0.000014s : 107: predicate.environ_get_eliminate 1.19% : 0.000011s : 83: predicate.environ_get_set_eliminate 1.88% : 0.000017s : 123: predicate.exchange_switch_depend_value 2.57% : 0.000023s : 123: predicate.float_depend_g_call 0.38% : 0.000003s : 24: predicate.float_environ_get_switch 0.48% : 0.000004s : 30: predicate.float_tuple_getitem_switch 0.06% : 0.000000s : 6: predicate.fold_const_symbol 0.44% : 0.000004s : 24: predicate.get_grad_eliminate 0.06% : 0.000001s : 6: predicate.graph_param_transform 0.39% : 0.000003s : 24: predicate.incorporate_call 0.35% : 0.000003s : 24: predicate.incorporate_call_switch 5.42% : 0.000048s : 259: predicate.inline 1.38% : 0.000012s : 65: predicate.inline_without_move 0.20% : 0.000002s : 24: predicate.j_node_and_user_rematch 0.45% : 0.000004s : 24: predicate.less_batch_normalization 1.60% : 0.000014s : 100: predicate.list_to_tuple_eliminator_ 2.63% : 0.000023s : 176: predicate.load_eliminater 0.28% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.91% : 0.000026s : 181: predicate.loop_unroll_before_grad 1.36% : 0.000012s : 89: predicate.make_slice_get_slice_eliminator 0.40% : 0.000004s : 24: predicate.merge_addn 1.16% : 0.000010s : 78: predicate.micro_step_allgather_replace 1.18% : 0.000010s : 78: predicate.mini_step_allgather_replace 1.11% : 0.000010s : 77: predicate.minmaximum_grad 0.32% : 0.000003s : 6: predicate.mutable_eliminate 0.13% : 0.000001s : 6: predicate.opt_reshape 0.12% : 0.000001s : 6: predicate.parallel_virtual_node 2.59% : 0.000023s : 123: predicate.partial_defer_inline 1.58% : 0.000014s : 94: predicate.partial_eliminate 1.11% : 0.000010s : 76: predicate.print_const_string_wrapper 0.39% : 0.000003s : 24: predicate.reduce_all_const_elim 1.47% : 0.000013s : 77: predicate.reduce_eliminate 2.54% : 0.000022s : 176: predicate.redundant_stop_gradient_eliminater 0.22% : 0.000002s : 24: predicate.remove_not_recompute_node 1.73% : 0.000015s : 166: predicate.replace_applicator 0.61% : 0.000005s : 65: predicate.replace_old_param 0.07% : 0.000001s : 6: predicate.reset_defer_inline 1.17% : 0.000010s : 77: predicate.reshape_eliminate 1.23% : 0.000011s : 78: predicate.row_tensor_add_zeros_like 0.16% : 0.000001s : 6: predicate.row_tensor_eliminate 1.41% : 0.000012s : 78: predicate.same_eliminate 0.24% : 0.000002s : 24: predicate.set_cell_output_no_recompute 0.48% : 0.000004s : 24: predicate.shard_identity_eliminate 0.22% : 0.000002s : 12: predicate.special_op_eliminate 0.45% : 0.000004s : 24: predicate.specialize_transform 1.29% : 0.000011s : 78: predicate.split_environ_get_set_with_tuple_value 1.17% : 0.000010s : 65: predicate.stack_unstack_eliminate 0.11% : 0.000001s : 6: predicate.switch_call_monad_eliminater 2.10% : 0.000019s : 123: predicate.switch_defer_inline 3.29% : 0.000029s : 201: predicate.switch_layer_defer_inline 5.99% : 0.000053s : 348: predicate.switch_simplify 1.12% : 0.000010s : 77: predicate.tile_eliminate 1.14% : 0.000010s : 77: predicate.transpose_eliminate 1.50% : 0.000013s : 89: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000014s : 89: predicate.tuple_list_get_item_const_eliminator 1.41% : 0.000012s : 89: predicate.tuple_list_get_item_depend_reorder 2.50% : 0.000022s : 124: predicate.tuple_list_get_item_eliminator 1.52% : 0.000013s : 89: predicate.tuple_list_get_set_item_eliminator 1.99% : 0.000018s : 113: predicate.tuple_list_set_item_eliminator 1.62% : 0.000014s : 100: predicate.tuple_to_list_eliminator_ 2.49% : 0.000022s : 176: predicate.updatestate_pure_node_eliminater 2.97% : 0.000026s : 200: predicate.updatestate_useless_node_eliminater 0.11% : 0.000001s : 6: predicate.value_based_eliminate 0.45% : 0.000004s : 24: predicate.virtual_dataset_eliminate 0.41% : 0.000004s : 24: predicate.virtual_output_eliminate 0.09% : 0.000001s : 6: predicate.virtual_view_grad_eliminate 0.14% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003945 66 56.66% : 0.002235s : 27: func_graph_cloner_run.FuncGraphClonerGraph 43.34% : 0.001710s : 39: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.575410 237 0.00% : 0.000004s : 1: ForceFp32Comm 0.28% : 0.007231s : 1: add_attr 0.28% : 0.007219s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000072s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000243s : 1: auto_monad 0.00% : 0.000031s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000011s : 1: bias_add_comm_swap 0.04% : 0.000971s : 1: bootstrap 0.00% : 0.000022s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000020s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000039s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000013s : 1: environ_conv 0.00% : 0.000059s : 1: event_method 0.00% : 0.000023s : 1: execute 0.00% : 0.000012s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000019s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000009s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000013s : 1: label_micro_interleaved_index 0.02% : 0.000459s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000509s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.00% : 0.000018s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000021s : 1: opt.transform.mutable_eliminate 0.24% : 0.006241s : 117: opt.transform.opt_a 0.00% : 0.000041s : 1: opt.transform.opt_after_cconv 0.00% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000171s : 28: opt.transform.opt_b 0.00% : 0.000067s : 2: opt.transform.opt_trans_graph 0.00% : 0.000054s : 4: opt.transform.symbol_engine_opt 0.91% : 0.023529s : 1: opt_a 0.01% : 0.000138s : 1: opt_after_cconv 0.02% : 0.000502s : 1: opt_after_jit_grad 0.01% : 0.000297s : 1: opt_b 1.02% : 0.026193s : 1: optimize 0.00% : 0.000031s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000042s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000019s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000020s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000011s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000095s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000009s : 1: remove_cast_before_assign_add 0.00% : 0.000026s : 1: remove_dup_value 0.39% : 0.010016s : 2: renormalize.infer 0.10% : 0.002530s : 2: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000040s : 1: rewriter_after_opt_a 0.02% : 0.000440s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000009s : 1: split_matmul_comm_elemetwise 0.00% : 0.000024s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000098s : 1: symbol_engine_optimizer 89.56% : 2.306416s : 1: task_emit 0.00% : 0.000096s : 1: tuple_transform 7.02% : 0.180860s : 1: type_inference 0.01% : 0.000165s : 1: validate TotalTime = 2.62332, [24] [bootstrap]: 0.00107437 [type_inference]: 0.14687 [event_method]: 0.00029761 [auto_monad]: 0.00023614 [graph_reusing]: 1.091e-05 [inline]: 3.56001e-06 [add_attr]: 0.00790205, [1] [add_attr_with_inline]: 0.00788662, [1] [Cycle 1]: 0.00015629, [2] [tag_attr]: 4.896e-05 [meta_addattr_fg_expand]: 2.016e-05 [parallel-infer-symbol]: 3.33998e-06 [pre_auto_parallel]: 7.074e-05 [insert-virtual-dataset]: 2.42001e-06 [parallel-infer-symbol-second]: 8.90024e-07 [dataset_repeat_opt]: 1.94999e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00766267, [53] [py_interpret_to_execute]: 5.17e-06 [rewriter_before_opt_a]: 0.00035012 [opt_a]: 0.00515103, [2] [Cycle 1]: 0.00458754, [45] [expand_dump_flag]: 4.87998e-06 [switch_simplify]: 0.00018644 [loop_unroll]: 5.676e-05 [a_1]: 0.00114486 [with_stream_mark]: 1.619e-05 [recompute_prepare]: 8.30999e-06 [updatestate_depend_eliminate]: 1.213e-05 [updatestate_assign_eliminate]: 1.102e-05 [updatestate_loads_eliminate]: 3.34001e-06 [parameter_eliminate]: 2.02999e-06 [a_2]: 7.503e-05 [accelerated_algorithm]: 6.39999e-06 [shard]: 1.74e-06 [meta_shard_fg_expand]: 2.51998e-06 [shard_inline]: 5.74e-06 [merge_send_recv]: 4.331e-05 [auto_parallel]: 6.24001e-06 [parallel]: 8.13e-05 [flash_sp]: 3.054e-05 [merge_comm]: 4.37998e-06 [allreduce_fusion]: 1.068e-05 [matmul_add_comm_reduction]: 1.624e-05 [allreduce_slice_to_reducescatter]: 7.87e-06 [virtual_shard_identity]: 8.74e-06 [virtual_dataset]: 5.95002e-06 [get_grad_eliminate_]: 5.48002e-06 [virtual_output]: 5.89e-06 [merge_forward]: 3.76999e-06 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 1.77e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.054e-05 [merge_recompute_call_nodes]: 1.94999e-06 [before_grad]: 9.86e-06 [set_forward_comm_id_for_comm_node_pass]: 1.075e-05 [meta_fg_expand]: 3.66999e-06 [flash_sp_send_recv_attached]: 2.43e-06 [receive_attached]: 1.668e-05 [after_resolve]: 9.52999e-06 [a_after_grad]: 8.47e-06 [renormalize]: 0.00232553 [add_forward_monad_depend]: 5.91e-06 [auto_monad_grad]: 2.68998e-06 [auto_monad_eliminator]: 2.794e-05 [cse]: 5.587e-05 [a_3]: 4.299e-05 [Cycle 2]: 0.00055248, [45] [expand_dump_flag]: 1.20999e-06 [switch_simplify]: 6.42001e-06 [loop_unroll]: 5.24e-06 [a_1]: 9.415e-05 [with_stream_mark]: 1.176e-05 [recompute_prepare]: 5.52999e-06 [updatestate_depend_eliminate]: 2.91999e-06 [updatestate_assign_eliminate]: 2.48e-06 [updatestate_loads_eliminate]: 3.01001e-06 [parameter_eliminate]: 1.25001e-06 [a_2]: 5.854e-05 [accelerated_algorithm]: 5.00999e-06 [shard]: 1.27e-06 [meta_shard_fg_expand]: 1.44e-06 [shard_inline]: 5.23002e-06 [merge_send_recv]: 5.84e-06 [auto_parallel]: 5.26998e-06 [parallel]: 5.10001e-06 [flash_sp]: 3.43e-06 [merge_comm]: 2.84001e-06 [allreduce_fusion]: 2.79001e-06 [matmul_add_comm_reduction]: 6.19001e-06 [allreduce_slice_to_reducescatter]: 3.69997e-07 [virtual_shard_identity]: 6.06e-06 [virtual_dataset]: 5.02e-06 [get_grad_eliminate_]: 4.97999e-06 [virtual_output]: 4.76002e-06 [merge_forward]: 2.62001e-06 [cell_reuse_recompute_pass]: 2.16e-06 [offload_activation]: 7.2e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.186e-05 [merge_recompute_call_nodes]: 1.15999e-06 [before_grad]: 7.98001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.22002e-06 [meta_fg_expand]: 2.11e-06 [flash_sp_send_recv_attached]: 9.10019e-07 [receive_attached]: 9.80013e-07 [after_resolve]: 7.97e-06 [a_after_grad]: 7.48999e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.30001e-06 [auto_monad_grad]: 1.01002e-06 [auto_monad_eliminator]: 7.01001e-06 [cse]: 1.279e-05 [a_3]: 3.004e-05 [py_interpret_to_execute_after_opt_a]: 5.51e-06 [slice_cell_reuse_recomputed_activation]: 2.07999e-06 [rewriter_after_opt_a]: 2.623e-05 [convert_after_rewriter]: 1.19e-06 [order_py_execute_after_rewriter]: 1.25999e-06 [mutable_eliminate]: 0.00057377 [opt_b]: 0.00017251, [1] [Cycle 1]: 0.00016638, [7] [b_1]: 9.973e-05 [b_2]: 6.36998e-06 [updatestate_depend_eliminate]: 5.05999e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.64001e-06 [renormalize]: 5.69999e-07 [cse]: 1.69e-05 [optimize_parallel_all_gather_comm]: 2.619e-05 [overlap_param_gather]: 1.024e-05 [cconv]: 3.463e-05 [loop_unroll]: 0.00044187 [opt_after_cconv]: 9.147e-05, [1] [Cycle 1]: 8.556e-05, [7] [c_1]: 2.348e-05 [parameter_eliminate]: 3.03e-06 [updatestate_depend_eliminate]: 5.25999e-06 [updatestate_assign_eliminate]: 2.39999e-06 [updatestate_loads_eliminate]: 2.31e-06 [cse]: 1.714e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.36e-05 [tuple_transform]: 6.536e-05, [1] [Cycle 1]: 6.108e-05, [4] [d_1]: 3.563e-05 [none_parameter_eliminate]: 1.81e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 5.94e-06 [partial_unused_args_eliminate]: 1.67999e-06 [add_recomputation]: 5.761e-05 [cse_after_recomputation]: 2.088e-05, [1] [Cycle 1]: 1.651e-05, [1] [cse]: 1.135e-05 [environ_conv]: 1.963e-05 [swap_dp_allreduce_reducescatter]: 2.394e-05 [bias_add_comm_swap]: 1.44e-05 [label_micro_interleaved_index]: 1.304e-05 [label_fine_grained_interleaved_index]: 3.36999e-06 [merge_cast_opt]: 1.24e-06 [slice_recompute_activation]: 2.27001e-06 [micro_interleaved_order_control]: 2.34999e-06 [assign_add_opt]: 1.39998e-06 [ForceFp32Comm]: 1.04e-06 [remove_cast_before_assign_add]: 8.40999e-06 [full_micro_interleaved_order_control]: 1.034e-05 [reorder_send_recv_between_fp_bp]: 2.64001e-06 [comm_op_add_attrs]: 1.00001e-06 [add_comm_op_reuse_tag]: 8.79983e-07 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 8.64998e-06 [overlap_opt_shard_in_pipeline]: 3.122e-05 [overlap_opt_shard_grad_in_pipeline]: 1.72999e-06 [control_data_broadcast_order]: 1.375e-05 [grouped_pairwise_exchange_alltoall]: 1.44998e-06 [offloading_packed_experts]: 3.51001e-06 [overlap_recompute_and_grad_model_parallel]: 1.232e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.37999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35001e-06 [overlap_recompute_comm]: 2.44001e-06 [overlap_grad_ring_attention]: 1.861e-05 [overlap_grad_flash_sp]: 4.245e-05 [begin_end_overlap_inline]: 5.50004e-07 [split_matmul_comm_elemetwise]: 1.021e-05 [split_layernorm_comm]: 1.69998e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 7.459e-05, [1] [Cycle 1]: 7e-05, [6] [build]: 3.38999e-06 [elim_shapecalc]: 1.049e-05 [elim_not_effective]: 1.265e-05 [opt_reshape]: 5.90002e-06 [fold_const_symbol]: 9.22001e-06 [renormalize]: 2.80008e-07 [detach_backward]: 1.79998e-06 [pipeline_parallel_scheduler]: 1.40999e-06 [auto_monad_reorder]: 3.035e-05 [get_jit_bprop_graph]: 1.68002e-06 [rewriter_after_jit_bprop_graph]: 3.46999e-06 [opt_after_jit_grad]: 0.00052236 [validate]: 7.006e-05 [backend_pass]: 1.26002e-06 [task_emit]: 2.458 [execute]: 9.77999e-06 Sums bootstrap : 0.001074s : 0.04% type_inference : 0.146870s : 5.62% event_method : 0.000298s : 0.01% auto_monad : 0.000236s : 0.01% graph_reusing : 0.000011s : 0.00% inline : 0.000004s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000049s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000020s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000071s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000350s : 0.01% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000193s : 0.01% optimize.opt_a.loop_unroll : 0.000062s : 0.00% optimize.opt_a.a_1 : 0.001239s : 0.05% optimize.opt_a.with_stream_mark : 0.000028s : 0.00% optimize.opt_a.recompute_prepare : 0.000014s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000015s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000134s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000011s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000011s : 0.00% optimize.opt_a.merge_send_recv : 0.000049s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000086s : 0.00% optimize.opt_a.flash_sp : 0.000034s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000013s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000008s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.00% optimize.opt_a.virtual_dataset : 0.000011s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000010s : 0.00% optimize.opt_a.virtual_output : 0.000011s : 0.00% optimize.opt_a.merge_forward : 0.000006s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000025s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000018s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000014s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000018s : 0.00% optimize.opt_a.after_resolve : 0.000017s : 0.00% optimize.opt_a.a_after_grad : 0.000016s : 0.00% optimize.opt_a.renormalize : 0.002326s : 0.09% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.00% optimize.opt_a.cse : 0.000069s : 0.00% optimize.opt_a.a_3 : 0.000073s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000026s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000574s : 0.02% optimize.opt_b.b_1 : 0.000100s : 0.00% optimize.opt_b.b_2 : 0.000006s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.00% optimize.overlap_param_gather : 0.000010s : 0.00% optimize.cconv : 0.000035s : 0.00% optimize.loop_unroll : 0.000442s : 0.02% optimize.opt_after_cconv.c_1 : 0.000023s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000017s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000014s : 0.00% optimize.tuple_transform.d_1 : 0.000036s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000058s : 0.00% optimize.cse_after_recomputation.cse : 0.000011s : 0.00% optimize.environ_conv : 0.000020s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000024s : 0.00% optimize.bias_add_comm_swap : 0.000014s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000008s : 0.00% optimize.full_micro_interleaved_order_control : 0.000010s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000009s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000031s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000019s : 0.00% optimize.overlap_grad_flash_sp : 0.000042s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000010s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000030s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000522s : 0.02% validate : 0.000070s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.458002s : 94.03% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000344 39 0.59% : 0.000002s : 2: substitution.elim_not_effective 0.50% : 0.000002s : 2: substitution.fold_const_symbol 1.57% : 0.000005s : 3: substitution.graph_param_transform 78.69% : 0.000271s : 16: substitution.inline 1.01% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.66% : 0.000013s : 4: substitution.remove_not_recompute_node 0.85% : 0.000003s : 2: substitution.replace_old_param 6.24% : 0.000021s : 4: substitution.switch_simplify 6.89% : 0.000024s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.146749 2 97.49% : 0.143072s : 1: type_inference.infer 2.51% : 0.003677s : 1: type_inference.specialize ------[replace.] 0.000156 22 60.57% : 0.000095s : 16: replace.inline 29.84% : 0.000047s : 4: replace.switch_simplify 9.59% : 0.000015s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000305 22 86.40% : 0.000263s : 16: match.inline 6.27% : 0.000019s : 4: match.switch_simplify 7.33% : 0.000022s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000266 1692 1.16% : 0.000003s : 22: predicate.accumulaten_eliminater 0.55% : 0.000001s : 3: predicate.ad_related_special_op_eliminate 0.30% : 0.000001s : 6: predicate.addn_check_dump 1.14% : 0.000003s : 22: predicate.addn_zero_filter 1.08% : 0.000003s : 22: predicate.adjust_all_reduce_mul_add 2.50% : 0.000007s : 28: predicate.arithmetic_simplify 1.24% : 0.000003s : 22: predicate.cast_eliminate 0.36% : 0.000001s : 6: predicate.check_bprop_eliminate 0.33% : 0.000001s : 6: predicate.compare_switch_simplify 0.10% : 0.000000s : 3: predicate.const_output_eliminate 0.30% : 0.000001s : 6: predicate.depend_value_elim 1.18% : 0.000003s : 22: predicate.dict_get_item_const_eliminator 1.32% : 0.000004s : 22: predicate.dict_get_item_eliminator 1.24% : 0.000003s : 22: predicate.dict_set_item_eliminator 0.60% : 0.000002s : 6: predicate.dumpgradient_eliminate 0.14% : 0.000000s : 3: predicate.elim_not_effective 0.23% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 25: predicate.environ_add_const_eliminate 1.24% : 0.000003s : 25: predicate.environ_get_add_eliminate 1.22% : 0.000003s : 25: predicate.environ_get_depend_swap 1.55% : 0.000004s : 31: predicate.environ_get_eliminate 1.21% : 0.000003s : 25: predicate.environ_get_set_eliminate 2.09% : 0.000006s : 40: predicate.exchange_switch_depend_value 2.92% : 0.000008s : 40: predicate.float_depend_g_call 0.32% : 0.000001s : 6: predicate.float_environ_get_switch 0.44% : 0.000001s : 9: predicate.float_tuple_getitem_switch 0.11% : 0.000000s : 3: predicate.fold_const_symbol 0.45% : 0.000001s : 6: predicate.get_grad_eliminate 0.13% : 0.000000s : 3: predicate.graph_param_transform 0.33% : 0.000001s : 6: predicate.incorporate_call 0.29% : 0.000001s : 6: predicate.incorporate_call_switch 5.97% : 0.000016s : 80: predicate.inline 0.52% : 0.000001s : 6: predicate.inline_without_move 0.17% : 0.000000s : 6: predicate.j_node_and_user_rematch 0.57% : 0.000002s : 6: predicate.less_batch_normalization 1.65% : 0.000004s : 30: predicate.list_to_tuple_eliminator_ 2.65% : 0.000007s : 52: predicate.load_eliminater 0.67% : 0.000002s : 3: predicate.loop_unroll_after_grad 3.58% : 0.000010s : 69: predicate.loop_unroll_before_grad 1.44% : 0.000004s : 28: predicate.make_slice_get_slice_eliminator 0.37% : 0.000001s : 6: predicate.merge_addn 0.30% : 0.000001s : 6: predicate.micro_step_allgather_replace 0.33% : 0.000001s : 6: predicate.mini_step_allgather_replace 1.08% : 0.000003s : 22: predicate.minmaximum_grad 0.74% : 0.000002s : 3: predicate.mutable_eliminate 0.21% : 0.000001s : 3: predicate.opt_reshape 0.20% : 0.000001s : 3: predicate.parallel_virtual_node 2.77% : 0.000007s : 40: predicate.partial_defer_inline 1.52% : 0.000004s : 27: predicate.partial_eliminate 1.18% : 0.000003s : 22: predicate.print_const_string_wrapper 0.34% : 0.000001s : 6: predicate.reduce_all_const_elim 1.58% : 0.000004s : 22: predicate.reduce_eliminate 2.63% : 0.000007s : 52: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000001s : 6: predicate.remove_not_recompute_node 1.25% : 0.000003s : 30: predicate.replace_applicator 0.33% : 0.000001s : 6: predicate.replace_old_param 0.15% : 0.000000s : 3: predicate.reset_defer_inline 1.21% : 0.000003s : 22: predicate.reshape_eliminate 0.36% : 0.000001s : 6: predicate.row_tensor_add_zeros_like 0.24% : 0.000001s : 3: predicate.row_tensor_eliminate 0.49% : 0.000001s : 6: predicate.same_eliminate 0.24% : 0.000001s : 6: predicate.set_cell_output_no_recompute 0.56% : 0.000001s : 6: predicate.shard_identity_eliminate 0.41% : 0.000001s : 6: predicate.special_op_eliminate 0.45% : 0.000001s : 6: predicate.specialize_transform 0.50% : 0.000001s : 6: predicate.split_environ_get_set_with_tuple_value 0.54% : 0.000001s : 6: predicate.stack_unstack_eliminate 0.18% : 0.000000s : 3: predicate.switch_call_monad_eliminater 2.37% : 0.000006s : 40: predicate.switch_defer_inline 2.85% : 0.000008s : 46: predicate.switch_layer_defer_inline 7.47% : 0.000020s : 126: predicate.switch_simplify 1.21% : 0.000003s : 22: predicate.tile_eliminate 1.11% : 0.000003s : 22: predicate.transpose_eliminate 1.62% : 0.000004s : 28: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 28: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 28: predicate.tuple_list_get_item_depend_reorder 2.87% : 0.000008s : 36: predicate.tuple_list_get_item_eliminator 1.65% : 0.000004s : 28: predicate.tuple_list_get_set_item_eliminator 2.06% : 0.000005s : 34: predicate.tuple_list_set_item_eliminator 1.69% : 0.000005s : 30: predicate.tuple_to_list_eliminator_ 2.52% : 0.000007s : 52: predicate.updatestate_pure_node_eliminater 2.91% : 0.000008s : 58: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 3: predicate.value_based_eliminate 0.40% : 0.000001s : 6: predicate.virtual_dataset_eliminate 0.47% : 0.000001s : 6: predicate.virtual_output_eliminate 0.17% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.26% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002028 30 44.72% : 0.000907s : 12: func_graph_cloner_run.FuncGraphClonerGraph 55.28% : 0.001121s : 18: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.643080 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.30% : 0.007906s : 1: add_attr 0.30% : 0.007891s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000062s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000248s : 1: auto_monad 0.00% : 0.000035s : 1: auto_monad_reorder 0.00% : 0.000011s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000017s : 1: bias_add_comm_swap 0.04% : 0.001134s : 1: bootstrap 0.00% : 0.000039s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000024s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000024s : 1: environ_conv 0.01% : 0.000311s : 1: event_method 0.00% : 0.000022s : 1: execute 0.00% : 0.000013s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000015s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000007s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000063s : 1: label_micro_interleaved_index 0.02% : 0.000452s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000583s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.07% : 0.001779s : 78: opt.transform.opt_a 0.00% : 0.000022s : 1: opt.transform.opt_after_cconv 0.00% : 0.000022s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000080s : 28: opt.transform.opt_b 0.00% : 0.000039s : 2: opt.transform.opt_trans_graph 0.00% : 0.000035s : 4: opt.transform.symbol_engine_opt 0.20% : 0.005155s : 1: opt_a 0.00% : 0.000095s : 1: opt_after_cconv 0.02% : 0.000532s : 1: opt_after_jit_grad 0.01% : 0.000176s : 1: opt_b 0.29% : 0.007667s : 1: optimize 0.00% : 0.000030s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000046s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000022s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000035s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000013s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000015s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000075s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000011s : 1: remove_cast_before_assign_add 0.00% : 0.000017s : 1: remove_dup_value 0.05% : 0.001279s : 1: renormalize.infer 0.04% : 0.001038s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000030s : 1: rewriter_after_opt_a 0.01% : 0.000358s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000013s : 1: split_matmul_comm_elemetwise 0.00% : 0.000028s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000077s : 1: symbol_engine_optimizer 93.00% : 2.458042s : 1: task_emit 0.00% : 0.000068s : 1: tuple_transform 5.56% : 0.146894s : 1: type_inference 0.01% : 0.000309s : 1: validate TotalTime = 2.68346, [24] [bootstrap]: 0.00107563 [type_inference]: 0.148705 [event_method]: 5.245e-05 [auto_monad]: 0.00015873 [graph_reusing]: 8.30999e-06 [inline]: 2.16e-06 [add_attr]: 0.00724731, [1] [add_attr_with_inline]: 0.0072349, [1] [Cycle 1]: 0.00011011, [2] [tag_attr]: 3.896e-05 [meta_addattr_fg_expand]: 1.526e-05 [parallel-infer-symbol]: 1.90001e-06 [pre_auto_parallel]: 5.713e-05 [insert-virtual-dataset]: 1.22999e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 1.14e-06 [pipeline_split]: 1.037e-05 [optimize]: 0.00746308, [53] [py_interpret_to_execute]: 4.3e-06 [rewriter_before_opt_a]: 0.00027229 [opt_a]: 0.00511924, [2] [Cycle 1]: 0.00444915, [45] [expand_dump_flag]: 3.59002e-06 [switch_simplify]: 0.00015694 [loop_unroll]: 5.746e-05 [a_1]: 0.00121268 [with_stream_mark]: 1.226e-05 [recompute_prepare]: 9.05999e-06 [updatestate_depend_eliminate]: 9.69e-06 [updatestate_assign_eliminate]: 6.76e-06 [updatestate_loads_eliminate]: 2.78998e-06 [parameter_eliminate]: 1.27e-06 [a_2]: 8.828e-05 [accelerated_algorithm]: 7.25e-06 [shard]: 1.04e-06 [meta_shard_fg_expand]: 2.54001e-06 [shard_inline]: 6.86001e-06 [merge_send_recv]: 3.261e-05 [auto_parallel]: 6.59001e-06 [parallel]: 5.543e-05 [flash_sp]: 1.582e-05 [merge_comm]: 4.99e-06 [allreduce_fusion]: 7.41999e-06 [matmul_add_comm_reduction]: 1.001e-05 [allreduce_slice_to_reducescatter]: 3.99002e-06 [virtual_shard_identity]: 9.46998e-06 [virtual_dataset]: 6.89999e-06 [get_grad_eliminate_]: 6.71999e-06 [virtual_output]: 6.91001e-06 [merge_forward]: 3.45e-06 [cell_reuse_recompute_pass]: 1.39998e-06 [offload_activation]: 1.127e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.748e-05 [merge_recompute_call_nodes]: 8.79983e-07 [before_grad]: 1.134e-05 [set_forward_comm_id_for_comm_node_pass]: 7.2e-06 [meta_fg_expand]: 3.85e-06 [flash_sp_send_recv_attached]: 1.52001e-06 [receive_attached]: 8.38999e-06 [after_resolve]: 9.81998e-06 [a_after_grad]: 1.013e-05 [renormalize]: 0.00223406 [add_forward_monad_depend]: 4.81002e-06 [auto_monad_grad]: 1.51002e-06 [auto_monad_eliminator]: 2.335e-05 [cse]: 4.758e-05 [a_3]: 5.055e-05 [Cycle 2]: 0.00066049, [45] [expand_dump_flag]: 1.11002e-06 [switch_simplify]: 7.61001e-06 [loop_unroll]: 6.43e-06 [a_1]: 0.00013987 [with_stream_mark]: 1.031e-05 [recompute_prepare]: 7.21001e-06 [updatestate_depend_eliminate]: 3.75e-06 [updatestate_assign_eliminate]: 2.88e-06 [updatestate_loads_eliminate]: 2.77002e-06 [parameter_eliminate]: 1.00999e-06 [a_2]: 7.654e-05 [accelerated_algorithm]: 6.44001e-06 [shard]: 1.19e-06 [meta_shard_fg_expand]: 1.37e-06 [shard_inline]: 6.68998e-06 [merge_send_recv]: 5.29998e-06 [auto_parallel]: 5.92999e-06 [parallel]: 4.31002e-06 [flash_sp]: 1.202e-05 [merge_comm]: 3.8e-06 [allreduce_fusion]: 3.31001e-06 [matmul_add_comm_reduction]: 6.21e-06 [allreduce_slice_to_reducescatter]: 3.9002e-07 [virtual_shard_identity]: 8.35001e-06 [virtual_dataset]: 6.39999e-06 [get_grad_eliminate_]: 6.14001e-06 [virtual_output]: 5.93002e-06 [merge_forward]: 3.43e-06 [cell_reuse_recompute_pass]: 1.55999e-06 [offload_activation]: 7.05e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.218e-05 [merge_recompute_call_nodes]: 9.79984e-07 [before_grad]: 1.006e-05 [set_forward_comm_id_for_comm_node_pass]: 4.38001e-06 [meta_fg_expand]: 2.41e-06 [flash_sp_send_recv_attached]: 8.39995e-07 [receive_attached]: 1.05999e-06 [after_resolve]: 8.47e-06 [a_after_grad]: 9.22001e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.17e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 8.27e-06 [cse]: 1.714e-05 [a_3]: 3.773e-05 [py_interpret_to_execute_after_opt_a]: 5.54e-06 [slice_cell_reuse_recomputed_activation]: 9.39996e-07 [rewriter_after_opt_a]: 2.245e-05 [convert_after_rewriter]: 1.05999e-06 [order_py_execute_after_rewriter]: 9.80013e-07 [mutable_eliminate]: 0.0005024 [opt_b]: 0.00020852, [1] [Cycle 1]: 0.0002025, [7] [b_1]: 0.0001283 [b_2]: 7.21001e-06 [updatestate_depend_eliminate]: 5.82001e-06 [updatestate_assign_eliminate]: 2.86e-06 [updatestate_loads_eliminate]: 3.25e-06 [renormalize]: 4.40021e-07 [cse]: 2.186e-05 [optimize_parallel_all_gather_comm]: 2.143e-05 [overlap_param_gather]: 5.49e-06 [cconv]: 1.563e-05 [loop_unroll]: 0.00044624 [opt_after_cconv]: 0.00010569, [1] [Cycle 1]: 9.955e-05, [7] [c_1]: 3.016e-05 [parameter_eliminate]: 2.56e-06 [updatestate_depend_eliminate]: 6.06e-06 [updatestate_assign_eliminate]: 3.09999e-06 [updatestate_loads_eliminate]: 2.84999e-06 [cse]: 2.189e-05 [renormalize]: 4.00003e-07 [remove_dup_value]: 2.236e-05 [tuple_transform]: 7.058e-05, [1] [Cycle 1]: 6.664e-05, [4] [d_1]: 4.05e-05 [none_parameter_eliminate]: 1.19e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.83998e-06 [partial_unused_args_eliminate]: 9.79984e-07 [add_recomputation]: 4.699e-05 [cse_after_recomputation]: 2.491e-05, [1] [Cycle 1]: 2.095e-05, [1] [cse]: 1.598e-05 [environ_conv]: 1.483e-05 [swap_dp_allreduce_reducescatter]: 1.506e-05 [bias_add_comm_swap]: 4.90999e-06 [label_micro_interleaved_index]: 7.83001e-06 [label_fine_grained_interleaved_index]: 1.81003e-06 [merge_cast_opt]: 6.89994e-07 [slice_recompute_activation]: 8.59989e-07 [micro_interleaved_order_control]: 1.19e-06 [assign_add_opt]: 6.19999e-07 [ForceFp32Comm]: 4.39992e-07 [remove_cast_before_assign_add]: 3.97e-06 [full_micro_interleaved_order_control]: 5.39e-06 [reorder_send_recv_between_fp_bp]: 1.37e-06 [comm_op_add_attrs]: 5.19998e-07 [add_comm_op_reuse_tag]: 4.2998e-07 [interleave_split_concat_branches]: 7.60017e-07 [interleave_parallel_branches]: 4.31002e-06 [overlap_opt_shard_in_pipeline]: 1.686e-05 [overlap_opt_shard_grad_in_pipeline]: 9.99979e-07 [control_data_broadcast_order]: 4.107e-05 [grouped_pairwise_exchange_alltoall]: 5.59987e-07 [offloading_packed_experts]: 3.55998e-06 [overlap_recompute_and_grad_model_parallel]: 8.22003e-06 [overlap_grad_matmul_and_grad_allreduce]: 8.00006e-07 [overlap_recompute_allgather_and_fa_grad]: 8.59989e-07 [overlap_recompute_comm]: 1.59e-06 [overlap_grad_ring_attention]: 1.082e-05 [overlap_grad_flash_sp]: 2.614e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 4.39002e-06 [split_layernorm_comm]: 1.22999e-06 [handle_group_info]: 6.19999e-07 [symbol_engine_optimizer]: 8.243e-05, [1] [Cycle 1]: 7.745e-05, [6] [build]: 2.77002e-06 [elim_shapecalc]: 1.355e-05 [elim_not_effective]: 1.366e-05 [opt_reshape]: 7.33999e-06 [fold_const_symbol]: 1.129e-05 [renormalize]: 2.60014e-07 [detach_backward]: 1.15999e-06 [pipeline_parallel_scheduler]: 1.08001e-06 [auto_monad_reorder]: 1.804e-05 [get_jit_bprop_graph]: 1.23002e-06 [rewriter_after_jit_bprop_graph]: 4.07e-06 [opt_after_jit_grad]: 0.00048854 [validate]: 5.866e-05 [backend_pass]: 1.22999e-06 [task_emit]: 2.51734 [execute]: 1.087e-05 Sums bootstrap : 0.001076s : 0.04% type_inference : 0.148705s : 5.56% event_method : 0.000052s : 0.00% auto_monad : 0.000159s : 0.01% graph_reusing : 0.000008s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000039s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000015s : 0.00% parallel-infer-symbol : 0.000002s : 0.00% pre_auto_parallel : 0.000057s : 0.00% insert-virtual-dataset : 0.000001s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000001s : 0.00% pipeline_split : 0.000010s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000272s : 0.01% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000165s : 0.01% optimize.opt_a.loop_unroll : 0.000064s : 0.00% optimize.opt_a.a_1 : 0.001353s : 0.05% optimize.opt_a.with_stream_mark : 0.000023s : 0.00% optimize.opt_a.recompute_prepare : 0.000016s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000013s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000010s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000165s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.00% optimize.opt_a.shard : 0.000002s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000014s : 0.00% optimize.opt_a.merge_send_recv : 0.000038s : 0.00% optimize.opt_a.auto_parallel : 0.000013s : 0.00% optimize.opt_a.parallel : 0.000060s : 0.00% optimize.opt_a.flash_sp : 0.000028s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000011s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.00% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000021s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000012s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000002s : 0.00% optimize.opt_a.receive_attached : 0.000009s : 0.00% optimize.opt_a.after_resolve : 0.000018s : 0.00% optimize.opt_a.a_after_grad : 0.000019s : 0.00% optimize.opt_a.renormalize : 0.002234s : 0.08% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000032s : 0.00% optimize.opt_a.cse : 0.000065s : 0.00% optimize.opt_a.a_3 : 0.000088s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000001s : 0.00% optimize.rewriter_after_opt_a : 0.000022s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000502s : 0.02% optimize.opt_b.b_1 : 0.000128s : 0.00% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000021s : 0.00% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000016s : 0.00% optimize.loop_unroll : 0.000446s : 0.02% optimize.opt_after_cconv.c_1 : 0.000030s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000022s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000022s : 0.00% optimize.tuple_transform.d_1 : 0.000040s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000047s : 0.00% optimize.cse_after_recomputation.cse : 0.000016s : 0.00% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000015s : 0.00% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000008s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000001s : 0.00% optimize.micro_interleaved_order_control : 0.000001s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000000s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000001s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000000s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000017s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000001s : 0.00% optimize.control_data_broadcast_order : 0.000041s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000011s : 0.00% optimize.overlap_grad_flash_sp : 0.000026s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000014s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000018s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000489s : 0.02% validate : 0.000059s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.517335s : 94.12% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000344 50 10.66% : 0.000037s : 4: substitution.cast_eliminate 0.49% : 0.000002s : 3: substitution.elim_not_effective 0.57% : 0.000002s : 3: substitution.fold_const_symbol 1.09% : 0.000004s : 4: substitution.graph_param_transform 75.09% : 0.000259s : 16: substitution.inline 1.00% : 0.000003s : 6: substitution.j_node_and_user_rematch 2.52% : 0.000009s : 6: substitution.remove_not_recompute_node 0.81% : 0.000003s : 2: substitution.replace_old_param 4.33% : 0.000015s : 4: substitution.switch_simplify 3.43% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.148593 2 97.43% : 0.144771s : 1: type_inference.infer 2.57% : 0.003823s : 1: type_inference.specialize ------[replace.] 0.000148 22 61.11% : 0.000090s : 16: replace.inline 28.12% : 0.000042s : 4: replace.switch_simplify 10.76% : 0.000016s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000274 22 91.65% : 0.000251s : 16: match.inline 4.48% : 0.000012s : 4: match.switch_simplify 3.88% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000304 1964 1.10% : 0.000003s : 25: predicate.accumulaten_eliminater 0.68% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.35% : 0.000001s : 8: predicate.addn_check_dump 1.10% : 0.000003s : 25: predicate.addn_zero_filter 1.04% : 0.000003s : 25: predicate.adjust_all_reduce_mul_add 2.56% : 0.000008s : 33: predicate.arithmetic_simplify 1.43% : 0.000004s : 25: predicate.cast_eliminate 0.38% : 0.000001s : 8: predicate.check_bprop_eliminate 0.36% : 0.000001s : 8: predicate.compare_switch_simplify 0.12% : 0.000000s : 4: predicate.const_output_eliminate 0.37% : 0.000001s : 8: predicate.depend_value_elim 1.19% : 0.000004s : 25: predicate.dict_get_item_const_eliminator 1.42% : 0.000004s : 25: predicate.dict_get_item_eliminator 1.17% : 0.000004s : 25: predicate.dict_set_item_eliminator 0.58% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.16% : 0.000000s : 4: predicate.elim_not_effective 0.25% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.33% : 0.000004s : 29: predicate.environ_add_const_eliminate 1.24% : 0.000004s : 29: predicate.environ_get_add_eliminate 1.23% : 0.000004s : 29: predicate.environ_get_depend_swap 1.77% : 0.000005s : 37: predicate.environ_get_eliminate 1.27% : 0.000004s : 29: predicate.environ_get_set_eliminate 1.97% : 0.000006s : 43: predicate.exchange_switch_depend_value 2.72% : 0.000008s : 43: predicate.float_depend_g_call 0.35% : 0.000001s : 8: predicate.float_environ_get_switch 0.53% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.13% : 0.000000s : 4: predicate.fold_const_symbol 0.47% : 0.000001s : 8: predicate.get_grad_eliminate 0.13% : 0.000000s : 4: predicate.graph_param_transform 0.41% : 0.000001s : 8: predicate.incorporate_call 0.34% : 0.000001s : 8: predicate.incorporate_call_switch 5.91% : 0.000018s : 92: predicate.inline 0.51% : 0.000002s : 8: predicate.inline_without_move 0.19% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.53% : 0.000002s : 8: predicate.less_batch_normalization 1.75% : 0.000005s : 35: predicate.list_to_tuple_eliminator_ 2.64% : 0.000008s : 60: predicate.load_eliminater 0.59% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.29% : 0.000010s : 71: predicate.loop_unroll_before_grad 1.59% : 0.000005s : 33: predicate.make_slice_get_slice_eliminator 0.43% : 0.000001s : 8: predicate.merge_addn 0.37% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.38% : 0.000001s : 8: predicate.mini_step_allgather_replace 1.06% : 0.000003s : 25: predicate.minmaximum_grad 0.82% : 0.000003s : 4: predicate.mutable_eliminate 0.27% : 0.000001s : 4: predicate.opt_reshape 0.25% : 0.000001s : 4: predicate.parallel_virtual_node 2.55% : 0.000008s : 43: predicate.partial_defer_inline 1.53% : 0.000005s : 31: predicate.partial_eliminate 1.23% : 0.000004s : 25: predicate.print_const_string_wrapper 0.39% : 0.000001s : 8: predicate.reduce_all_const_elim 1.63% : 0.000005s : 25: predicate.reduce_eliminate 2.61% : 0.000008s : 60: predicate.redundant_stop_gradient_eliminater 0.28% : 0.000001s : 8: predicate.remove_not_recompute_node 1.18% : 0.000004s : 35: predicate.replace_applicator 0.26% : 0.000001s : 8: predicate.replace_old_param 0.18% : 0.000001s : 4: predicate.reset_defer_inline 1.20% : 0.000004s : 25: predicate.reshape_eliminate 0.41% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.22% : 0.000001s : 4: predicate.row_tensor_eliminate 0.51% : 0.000002s : 8: predicate.same_eliminate 0.28% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.55% : 0.000002s : 8: predicate.shard_identity_eliminate 0.45% : 0.000001s : 8: predicate.special_op_eliminate 0.51% : 0.000002s : 8: predicate.specialize_transform 0.52% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.56% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.21% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.19% : 0.000007s : 43: predicate.switch_defer_inline 2.46% : 0.000007s : 51: predicate.switch_layer_defer_inline 6.78% : 0.000021s : 134: predicate.switch_simplify 1.15% : 0.000003s : 25: predicate.tile_eliminate 1.19% : 0.000004s : 25: predicate.transpose_eliminate 1.65% : 0.000005s : 33: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000005s : 33: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000005s : 33: predicate.tuple_list_get_item_depend_reorder 2.66% : 0.000008s : 43: predicate.tuple_list_get_item_eliminator 1.56% : 0.000005s : 33: predicate.tuple_list_get_set_item_eliminator 2.35% : 0.000007s : 41: predicate.tuple_list_set_item_eliminator 1.65% : 0.000005s : 35: predicate.tuple_to_list_eliminator_ 2.54% : 0.000008s : 60: predicate.updatestate_pure_node_eliminater 3.03% : 0.000009s : 68: predicate.updatestate_useless_node_eliminater 0.20% : 0.000001s : 4: predicate.value_based_eliminate 0.47% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.55% : 0.000002s : 8: predicate.virtual_output_eliminate 0.18% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.28% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001889 30 47.66% : 0.000900s : 12: func_graph_cloner_run.FuncGraphClonerGraph 52.34% : 0.000988s : 18: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.702374 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.27% : 0.007252s : 1: add_attr 0.27% : 0.007238s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000051s : 1: add_recomputation 0.00% : 0.000003s : 1: assign_add_opt 0.01% : 0.000167s : 1: auto_monad 0.00% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000011s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.04% : 0.001134s : 1: bootstrap 0.00% : 0.000019s : 1: cconv 0.00% : 0.000003s : 1: comm_op_add_attrs 0.00% : 0.000045s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000004s : 1: dataset_repeat_opt 0.00% : 0.000004s : 1: detach_backward 0.00% : 0.000090s : 1: environ_conv 0.00% : 0.000060s : 1: event_method 0.00% : 0.000029s : 1: execute 0.00% : 0.000009s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000004s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000011s : 1: label_micro_interleaved_index 0.02% : 0.000455s : 1: loop_unroll 0.00% : 0.000003s : 1: merge_cast_opt 0.00% : 0.000004s : 1: micro_interleaved_order_control 0.02% : 0.000512s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000017s : 1: opt.transform.mutable_eliminate 0.07% : 0.001939s : 78: opt.transform.opt_a 0.00% : 0.000029s : 1: opt.transform.opt_after_cconv 0.00% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000108s : 28: opt.transform.opt_b 0.00% : 0.000045s : 2: opt.transform.opt_trans_graph 0.00% : 0.000042s : 4: opt.transform.symbol_engine_opt 0.19% : 0.005123s : 1: opt_a 0.00% : 0.000109s : 1: opt_after_cconv 0.02% : 0.000498s : 1: opt_after_jit_grad 0.01% : 0.000212s : 1: opt_b 0.28% : 0.007467s : 1: optimize 0.00% : 0.000025s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000030s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000014s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000021s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000009s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000004s : 1: overlap_recompute_comm 0.00% : 0.000005s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000013s : 1: pipeline_split 0.00% : 0.000061s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000007s : 1: remove_cast_before_assign_add 0.00% : 0.000026s : 1: remove_dup_value 0.05% : 0.001315s : 1: renormalize.infer 0.03% : 0.000911s : 1: renormalize.specialize 0.00% : 0.000004s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000026s : 1: rewriter_after_opt_a 0.01% : 0.000280s : 1: rewriter_before_opt_a 0.00% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000003s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.00% : 0.000019s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000085s : 1: symbol_engine_optimizer 93.16% : 2.517483s : 1: task_emit 0.00% : 0.000073s : 1: tuple_transform 5.50% : 0.148723s : 1: type_inference 0.01% : 0.000296s : 1: validate group_cases_21 have all been run, results of sub cases are below: case: (1, ) {} pass. case: (1, ) {} pass. case: (1, ) {} pass. case: (1, ) {} pass. case: (0, ) {} pass. case: (0, ) {} pass. case: (0, ) {} pass. case: (0, ) {} pass. ops group_cases_22 with 8 cases start to running, all cases are below: case: (, 1, ) case: (, 'pynative') case: (, 'KBK') case: (, 'pynative') case: (, 'KBK') case: (, 'pynative') case: (, 'KBK') case: (, 'pynative') ops group_cases_22 total running memory: 138M, memory threshold: 51200M TotalTime = 2.90063, [33] [bootstrap]: 0.00079682 [type_inference]: 0.105905 [event_method]: 0.00019052 [auto_monad]: 0.00032427 [graph_reusing]: 9.22999e-06 [pre_auto_parallel]: 1.415e-05 [py_interpret_to_execute]: 5.057e-05 [rewriter_before_opt_a]: 0.00017202 [expand_dump_flag]: 3.53e-06 [jit_opt_a]: 0.0131559, [2] [Cycle 1]: 0.00448879, [27] [switch_simplify]: 0.00018396 [loop_unroll]: 5.443e-05 [a_1]: 0.00160282 [with_stream_mark]: 2.527e-05 [recompute_prepare]: 1.924e-05 [updatestate_depend_eliminate]: 4.317e-05 [updatestate_assign_eliminate]: 8.04997e-06 [updatestate_loads_eliminate]: 7.98001e-06 [parameter_eliminate]: 2.02001e-06 [specialize_transform]: 1.533e-05 [updatestate_useless_node_eliminater]: 1.699e-05 [accelerated_algorithm]: 4.799e-05 [meta_shard_fg_expand]: 4.45e-06 [get_grad_eliminate_]: 1.392e-05 [merge_forward]: 8.1e-06 [cell_reuse_recompute_pass]: 1.24e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.215e-05 [j_node_and_user_rematch]: 2.447e-05 [meta_fg_expand]: 6.02001e-06 [replace_old_param]: 3.73e-05 [inline_without_move]: 1.484e-05 [renormalize]: 0.00180315 [add_forward_monad_depend]: 1.367e-05 [auto_monad_grad]: 2.61e-06 [auto_monad_eliminator]: 3.97e-05 [cse]: 0.00019138 [replace_applicator]: 2.288e-05 [Cycle 2]: 0.00078665, [27] [switch_simplify]: 1.499e-05 [loop_unroll]: 1.31e-05 [a_1]: 0.00035463 [with_stream_mark]: 1.425e-05 [recompute_prepare]: 1.342e-05 [updatestate_depend_eliminate]: 8.88002e-06 [updatestate_assign_eliminate]: 6.96001e-06 [updatestate_loads_eliminate]: 6.49999e-06 [parameter_eliminate]: 1.12e-06 [specialize_transform]: 1.361e-05 [updatestate_useless_node_eliminater]: 1.625e-05 [accelerated_algorithm]: 1.607e-05 [meta_shard_fg_expand]: 2.79001e-06 [get_grad_eliminate_]: 1.253e-05 [merge_forward]: 6.85998e-06 [cell_reuse_recompute_pass]: 1.39e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.65e-05 [j_node_and_user_rematch]: 2.227e-05 [meta_fg_expand]: 5.16998e-06 [replace_old_param]: 1.848e-05 [inline_without_move]: 1.336e-05 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.27e-06 [auto_monad_grad]: 9.39996e-07 [auto_monad_eliminator]: 1.657e-05 [cse]: 4.06e-05 [replace_applicator]: 1.424e-05 [py_interpret_to_execute_after_opt_a]: 1.845e-05 [rewriter_after_opt_a]: 0.00062546 [convert_after_rewriter]: 5.127e-05 [order_py_execute_after_rewriter]: 1.128e-05 [mutable_eliminate]: 0.00064148 [jit_opt_b]: 0.00010966, [1] [Cycle 1]: 0.00010258, [2] [frontend_op_eliminate]: 4.007e-05 [inline_after_opt_a]: 4.979e-05 [cconv]: 2.999e-05 [loop_unroll]: 0.00045853 [jit_opt_after_cconv]: 0.00028663, [1] [Cycle 1]: 0.00028014, [11] [c_1]: 7.185e-05 [parameter_eliminate]: 2.48002e-06 [updatestate_depend_eliminate]: 1.11e-05 [updatestate_assign_eliminate]: 7.45e-06 [updatestate_loads_eliminate]: 6.51e-06 [cse]: 5.43e-05 [call_graph_tuple_transform]: 3.636e-05 [tuple_list_get_item_eliminator]: 2.128e-05 [none_parameter_eliminate]: 1.52001e-06 [renormalize]: 5.89993e-07 [switch_simplify]: 1.373e-05 [remove_dup_value]: 5.751e-05 [partial_unused_args_eliminate]: 2.41998e-06 [environ_conv]: 3.221e-05 [add_recomputation]: 0.00011803 [cse_after_recomputation]: 4.728e-05, [1] [Cycle 1]: 4.118e-05, [1] [cse]: 3.421e-05 [auto_monad_reorder]: 4.003e-05 [get_jit_bprop_graph]: 2.16e-06 [rewriter_after_jit_bprop_graph]: 4.08001e-06 [opt_after_jit_grad]: 0.00053255 [symbol_engine_optimizer]: 0.00014229, [1] [Cycle 1]: 0.00013637, [6] [build]: 2.633e-05 [elim_shapecalc]: 1.795e-05 [elim_not_effective]: 2.682e-05 [opt_reshape]: 1.362e-05 [fold_const_symbol]: 2.209e-05 [renormalize]: 4.99975e-07 [validate]: 7.765e-05 [backend_pass]: 1.05001e-06 [task_emit]: 2.77526 [execute]: 1.025e-05 Sums bootstrap : 0.000797s : 0.03% type_inference : 0.105905s : 3.66% event_method : 0.000191s : 0.01% auto_monad : 0.000324s : 0.01% graph_reusing : 0.000009s : 0.00% pre_auto_parallel : 0.000014s : 0.00% py_interpret_to_execute : 0.000051s : 0.00% rewriter_before_opt_a : 0.000172s : 0.01% expand_dump_flag : 0.000004s : 0.00% jit_opt_a.switch_simplify : 0.000199s : 0.01% jit_opt_a.loop_unroll : 0.000068s : 0.00% jit_opt_a.a_1 : 0.001957s : 0.07% jit_opt_a.with_stream_mark : 0.000040s : 0.00% jit_opt_a.recompute_prepare : 0.000033s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000052s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000015s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000014s : 0.00% jit_opt_a.parameter_eliminate : 0.000003s : 0.00% jit_opt_a.specialize_transform : 0.000029s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000033s : 0.00% jit_opt_a.accelerated_algorithm : 0.000064s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000007s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000026s : 0.00% jit_opt_a.merge_forward : 0.000015s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000059s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000047s : 0.00% jit_opt_a.meta_fg_expand : 0.000011s : 0.00% jit_opt_a.replace_old_param : 0.000056s : 0.00% jit_opt_a.inline_without_move : 0.000028s : 0.00% jit_opt_a.renormalize : 0.001803s : 0.06% jit_opt_a.add_forward_monad_depend : 0.000015s : 0.00% jit_opt_a.auto_monad_grad : 0.000004s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000056s : 0.00% jit_opt_a.cse : 0.000232s : 0.01% jit_opt_a.replace_applicator : 0.000037s : 0.00% py_interpret_to_execute_after_opt_a : 0.000018s : 0.00% rewriter_after_opt_a : 0.000625s : 0.02% convert_after_rewriter : 0.000051s : 0.00% order_py_execute_after_rewriter : 0.000011s : 0.00% mutable_eliminate : 0.000641s : 0.02% jit_opt_b.frontend_op_eliminate : 0.000040s : 0.00% jit_opt_b.inline_after_opt_a : 0.000050s : 0.00% cconv : 0.000030s : 0.00% loop_unroll : 0.000459s : 0.02% jit_opt_after_cconv.c_1 : 0.000072s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.00% jit_opt_after_cconv.cse : 0.000054s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000036s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000021s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000014s : 0.00% remove_dup_value : 0.000058s : 0.00% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000032s : 0.00% add_recomputation : 0.000118s : 0.00% cse_after_recomputation.cse : 0.000034s : 0.00% auto_monad_reorder : 0.000040s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000533s : 0.02% symbol_engine_optimizer.build : 0.000026s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000027s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000014s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000022s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000078s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.775257s : 96.00% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000584 147 2.37% : 0.000014s : 7: substitution.depend_value_elim 0.64% : 0.000004s : 8: substitution.elim_not_effective 0.56% : 0.000003s : 8: substitution.fold_const_symbol 1.79% : 0.000010s : 11: substitution.graph_param_transform 49.25% : 0.000288s : 10: substitution.inline 1.38% : 0.000008s : 16: substitution.j_node_and_user_rematch 5.49% : 0.000032s : 2: substitution.less_batch_normalization 4.06% : 0.000024s : 10: substitution.minmaximum_grad 2.14% : 0.000012s : 16: substitution.remove_not_recompute_node 4.02% : 0.000023s : 6: substitution.replace_old_param 3.22% : 0.000019s : 1: substitution.switch_simplify 4.56% : 0.000027s : 10: substitution.tuple_list_convert_item_index_to_positive 7.09% : 0.000041s : 12: substitution.tuple_list_get_item_depend_reorder 8.00% : 0.000047s : 18: substitution.tuple_list_get_item_eliminator 1.92% : 0.000011s : 5: substitution.updatestate_pure_node_eliminater 3.51% : 0.000020s : 7: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.105765 2 97.91% : 0.103556s : 1: type_inference.infer 2.09% : 0.002210s : 1: type_inference.specialize ------[replace.] 0.000147 15 60.03% : 0.000088s : 10: replace.inline 22.23% : 0.000033s : 1: replace.switch_simplify 10.94% : 0.000016s : 2: replace.tuple_list_get_item_depend_reorder 6.79% : 0.000010s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000329 15 85.60% : 0.000281s : 10: match.inline 5.45% : 0.000018s : 1: match.switch_simplify 7.71% : 0.000025s : 2: match.tuple_list_get_item_depend_reorder 1.23% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000386 2602 1.41% : 0.000005s : 42: predicate.accumulaten_eliminater 0.91% : 0.000004s : 11: predicate.ad_related_special_op_eliminate 1.38% : 0.000005s : 42: predicate.addn_check_dump 1.42% : 0.000005s : 42: predicate.addn_zero_filter 2.13% : 0.000008s : 42: predicate.arithmetic_simplify 1.48% : 0.000006s : 42: predicate.cast_eliminate 0.40% : 0.000002s : 11: predicate.check_bprop_eliminate 1.37% : 0.000005s : 42: predicate.compare_switch_simplify 1.56% : 0.000006s : 42: predicate.depend_value_elim 1.41% : 0.000005s : 42: predicate.dict_get_item_const_eliminator 1.48% : 0.000006s : 42: predicate.dict_get_item_eliminator 1.41% : 0.000005s : 42: predicate.dict_set_item_eliminator 0.53% : 0.000002s : 11: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 11: predicate.elim_not_effective 0.49% : 0.000002s : 11: predicate.elim_shapecalc_of_broadcastargs 1.55% : 0.000006s : 42: predicate.environ_add_const_eliminate 1.37% : 0.000005s : 42: predicate.environ_get_add_eliminate 1.45% : 0.000006s : 42: predicate.environ_get_depend_swap 1.43% : 0.000006s : 42: predicate.environ_get_eliminate 1.40% : 0.000005s : 42: predicate.environ_get_set_eliminate 0.24% : 0.000001s : 11: predicate.fold_const_symbol 0.88% : 0.000003s : 22: predicate.get_grad_eliminate 0.27% : 0.000001s : 11: predicate.graph_param_transform 4.56% : 0.000018s : 78: predicate.inline 0.93% : 0.000004s : 22: predicate.inline_without_move 0.41% : 0.000002s : 22: predicate.j_node_and_user_rematch 1.09% : 0.000004s : 22: predicate.less_batch_normalization 1.70% : 0.000007s : 46: predicate.list_to_tuple_eliminator_ 2.00% : 0.000008s : 57: predicate.load_eliminater 0.88% : 0.000003s : 11: predicate.loop_unroll_after_grad 2.55% : 0.000010s : 67: predicate.loop_unroll_before_grad 2.05% : 0.000008s : 55: predicate.make_slice_get_slice_eliminator 1.34% : 0.000005s : 42: predicate.merge_addn 1.48% : 0.000006s : 42: predicate.minmaximum_grad 1.03% : 0.000004s : 11: predicate.mutable_eliminate 0.47% : 0.000002s : 11: predicate.opt_reshape 2.39% : 0.000009s : 57: predicate.partial_eliminate 1.40% : 0.000005s : 42: predicate.print_const_string_wrapper 1.92% : 0.000007s : 42: predicate.reduce_eliminate 1.58% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.45% : 0.000002s : 22: predicate.remove_not_recompute_node 1.77% : 0.000007s : 68: predicate.replace_applicator 0.46% : 0.000002s : 22: predicate.replace_old_param 0.25% : 0.000001s : 11: predicate.reset_defer_inline 1.48% : 0.000006s : 42: predicate.reshape_eliminate 1.46% : 0.000006s : 42: predicate.row_tensor_add_zeros_like 0.53% : 0.000002s : 11: predicate.row_tensor_eliminate 1.52% : 0.000006s : 42: predicate.same_eliminate 0.54% : 0.000002s : 25: predicate.set_cell_output_no_recompute 0.91% : 0.000004s : 22: predicate.special_op_eliminate 0.96% : 0.000004s : 22: predicate.specialize_transform 1.74% : 0.000007s : 42: predicate.split_environ_get_set_with_tuple_value 1.45% : 0.000006s : 42: predicate.stack_unstack_eliminate 0.47% : 0.000002s : 11: predicate.switch_call_monad_eliminater 2.36% : 0.000009s : 56: predicate.switch_defer_inline 2.19% : 0.000008s : 56: predicate.switch_layer_defer_inline 5.76% : 0.000022s : 136: predicate.switch_simplify 1.42% : 0.000005s : 42: predicate.tile_eliminate 1.42% : 0.000005s : 42: predicate.transpose_eliminate 1.97% : 0.000008s : 42: predicate.tuple_list_convert_item_index_to_positive 1.81% : 0.000007s : 44: predicate.tuple_list_get_item_depend_reorder 3.65% : 0.000014s : 68: predicate.tuple_list_get_item_eliminator 2.01% : 0.000008s : 44: predicate.tuple_list_set_item_eliminator 1.57% : 0.000006s : 46: predicate.tuple_to_list_eliminator_ 1.97% : 0.000008s : 57: predicate.updatestate_pure_node_eliminater 3.13% : 0.000012s : 79: predicate.updatestate_useless_node_eliminater 1.86% : 0.000007s : 42: predicate.value_based_eliminate 0.37% : 0.000001s : 11: predicate.virtual_view_grad_eliminate 0.50% : 0.000002s : 11: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001915 19 66.06% : 0.001265s : 7: func_graph_cloner_run.FuncGraphClonerGraph 33.94% : 0.000650s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.904288 76 0.00% : 0.000121s : 1: add_recomputation 0.01% : 0.000333s : 1: auto_monad 0.00% : 0.000043s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.03% : 0.000828s : 1: bootstrap 0.00% : 0.000033s : 1: cconv 0.00% : 0.000056s : 1: convert_after_rewriter 0.00% : 0.000050s : 1: cse_after_recomputation 0.00% : 0.000035s : 1: environ_conv 0.01% : 0.000199s : 1: event_method 0.00% : 0.000017s : 1: execute 0.00% : 0.000006s : 1: expand_dump_flag 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000012s : 1: graph_reusing 0.45% : 0.013159s : 1: jit_opt_a 0.01% : 0.000290s : 1: jit_opt_after_cconv 0.00% : 0.000113s : 1: jit_opt_b 0.02% : 0.000466s : 1: loop_unroll 0.02% : 0.000649s : 1: mutable_eliminate 0.09% : 0.002594s : 26: opt.transform.jit_opt_a 0.00% : 0.000140s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000083s : 4: opt.transform.jit_opt_b 0.00% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000026s : 1: opt.transform.mutable_eliminate 0.00% : 0.000050s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000077s : 4: opt.transform.symbol_engine_opt 0.02% : 0.000540s : 1: opt_after_jit_grad 0.00% : 0.000014s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000017s : 1: pre_auto_parallel 0.00% : 0.000054s : 1: py_interpret_to_execute 0.00% : 0.000021s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000061s : 1: remove_dup_value 0.04% : 0.001020s : 1: renormalize.infer 0.03% : 0.000774s : 1: renormalize.specialize 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000630s : 1: rewriter_after_opt_a 0.01% : 0.000175s : 1: rewriter_before_opt_a 0.00% : 0.000145s : 1: symbol_engine_optimizer 95.56% : 2.775374s : 1: task_emit 3.65% : 0.105921s : 1: type_inference 0.00% : 0.000123s : 1: validate TotalTime = 2.96187, [33] [bootstrap]: 0.00098891 [type_inference]: 0.0615409 [event_method]: 1.258e-05 [auto_monad]: 0.00011686 [graph_reusing]: 5.39e-06 [pre_auto_parallel]: 1.248e-05 [py_interpret_to_execute]: 8.612e-05 [rewriter_before_opt_a]: 8.592e-05 [expand_dump_flag]: 2.99001e-06 [jit_opt_a]: 0.00909415, [2] [Cycle 1]: 0.00145163, [27] [switch_simplify]: 7.488e-05 [loop_unroll]: 1.533e-05 [a_1]: 0.00026681 [with_stream_mark]: 1.819e-05 [recompute_prepare]: 7.2e-06 [updatestate_depend_eliminate]: 1.446e-05 [updatestate_assign_eliminate]: 1.299e-05 [updatestate_loads_eliminate]: 2.94999e-06 [parameter_eliminate]: 1.94e-06 [specialize_transform]: 6.54999e-06 [updatestate_useless_node_eliminater]: 5.69999e-06 [accelerated_algorithm]: 5.67999e-06 [meta_shard_fg_expand]: 1.089e-05 [get_grad_eliminate_]: 5.69e-06 [merge_forward]: 3.92002e-06 [cell_reuse_recompute_pass]: 1.10999e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.639e-05 [j_node_and_user_rematch]: 9.44e-06 [meta_fg_expand]: 2.89001e-06 [replace_old_param]: 9.37001e-06 [inline_without_move]: 5.52001e-06 [renormalize]: 0.00062753 [add_forward_monad_depend]: 1.35e-05 [auto_monad_grad]: 2.29999e-06 [auto_monad_eliminator]: 2.395e-05 [cse]: 5.565e-05 [replace_applicator]: 1.241e-05 [Cycle 2]: 0.00033069, [27] [switch_simplify]: 6.03002e-06 [loop_unroll]: 5.35999e-06 [a_1]: 9.094e-05 [with_stream_mark]: 9.62001e-06 [recompute_prepare]: 5.56e-06 [updatestate_depend_eliminate]: 3.29001e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.32999e-06 [parameter_eliminate]: 9.09989e-07 [specialize_transform]: 5.44e-06 [updatestate_useless_node_eliminater]: 5.34e-06 [accelerated_algorithm]: 5.82999e-06 [meta_shard_fg_expand]: 1.35001e-06 [get_grad_eliminate_]: 4.94998e-06 [merge_forward]: 3.10998e-06 [cell_reuse_recompute_pass]: 1.34003e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.248e-05 [j_node_and_user_rematch]: 8.12998e-06 [meta_fg_expand]: 1.84e-06 [replace_old_param]: 7.34002e-06 [inline_without_move]: 4.87998e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.00999e-06 [auto_monad_grad]: 7.09988e-07 [auto_monad_eliminator]: 6.08998e-06 [cse]: 1.387e-05 [replace_applicator]: 6.09001e-06 [py_interpret_to_execute_after_opt_a]: 1.156e-05 [rewriter_after_opt_a]: 8.674e-05 [convert_after_rewriter]: 1.168e-05 [order_py_execute_after_rewriter]: 5.85002e-06 [mutable_eliminate]: 0.00054182 [jit_opt_b]: 5.059e-05, [1] [Cycle 1]: 4.411e-05, [2] [frontend_op_eliminate]: 1.643e-05 [inline_after_opt_a]: 1.614e-05 [cconv]: 2.617e-05 [loop_unroll]: 0.00039912 [jit_opt_after_cconv]: 0.00014252, [1] [Cycle 1]: 0.00013673, [11] [c_1]: 2.144e-05 [parameter_eliminate]: 2.43e-06 [updatestate_depend_eliminate]: 5.77001e-06 [updatestate_assign_eliminate]: 2.84999e-06 [updatestate_loads_eliminate]: 2.51e-06 [cse]: 2.08e-05 [call_graph_tuple_transform]: 1.827e-05 [tuple_list_get_item_eliminator]: 5.84e-06 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 3.89991e-07 [switch_simplify]: 5.69999e-06 [remove_dup_value]: 1.68e-05 [partial_unused_args_eliminate]: 2.17999e-06 [environ_conv]: 1.729e-05 [add_recomputation]: 6.592e-05 [cse_after_recomputation]: 2.531e-05, [1] [Cycle 1]: 1.969e-05, [1] [cse]: 1.387e-05 [auto_monad_reorder]: 2.551e-05 [get_jit_bprop_graph]: 1.98002e-06 [rewriter_after_jit_bprop_graph]: 3.43e-06 [opt_after_jit_grad]: 0.00043642 [symbol_engine_optimizer]: 7.355e-05, [1] [Cycle 1]: 6.749e-05, [6] [build]: 3.61999e-06 [elim_shapecalc]: 8.18001e-06 [elim_not_effective]: 1.333e-05 [opt_reshape]: 5.76e-06 [fold_const_symbol]: 8.62998e-06 [renormalize]: 7.89994e-07 [validate]: 5.245e-05 [backend_pass]: 9.80013e-07 [task_emit]: 2.88641 [execute]: 1.033e-05 Sums bootstrap : 0.000989s : 0.03% type_inference : 0.061541s : 2.08% event_method : 0.000013s : 0.00% auto_monad : 0.000117s : 0.00% graph_reusing : 0.000005s : 0.00% pre_auto_parallel : 0.000012s : 0.00% py_interpret_to_execute : 0.000086s : 0.00% rewriter_before_opt_a : 0.000086s : 0.00% expand_dump_flag : 0.000003s : 0.00% jit_opt_a.switch_simplify : 0.000081s : 0.00% jit_opt_a.loop_unroll : 0.000021s : 0.00% jit_opt_a.a_1 : 0.000358s : 0.01% jit_opt_a.with_stream_mark : 0.000028s : 0.00% jit_opt_a.recompute_prepare : 0.000013s : 0.00% jit_opt_a.updatestate_depend_eliminate : 0.000018s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000005s : 0.00% jit_opt_a.parameter_eliminate : 0.000003s : 0.00% jit_opt_a.specialize_transform : 0.000012s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000011s : 0.00% jit_opt_a.accelerated_algorithm : 0.000012s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000012s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000011s : 0.00% jit_opt_a.merge_forward : 0.000007s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000039s : 0.00% jit_opt_a.j_node_and_user_rematch : 0.000018s : 0.00% jit_opt_a.meta_fg_expand : 0.000005s : 0.00% jit_opt_a.replace_old_param : 0.000017s : 0.00% jit_opt_a.inline_without_move : 0.000010s : 0.00% jit_opt_a.renormalize : 0.000628s : 0.02% jit_opt_a.add_forward_monad_depend : 0.000015s : 0.00% jit_opt_a.auto_monad_grad : 0.000003s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000030s : 0.00% jit_opt_a.cse : 0.000070s : 0.00% jit_opt_a.replace_applicator : 0.000019s : 0.00% py_interpret_to_execute_after_opt_a : 0.000012s : 0.00% rewriter_after_opt_a : 0.000087s : 0.00% convert_after_rewriter : 0.000012s : 0.00% order_py_execute_after_rewriter : 0.000006s : 0.00% mutable_eliminate : 0.000542s : 0.02% jit_opt_b.frontend_op_eliminate : 0.000016s : 0.00% jit_opt_b.inline_after_opt_a : 0.000016s : 0.00% cconv : 0.000026s : 0.00% loop_unroll : 0.000399s : 0.01% jit_opt_after_cconv.c_1 : 0.000021s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000021s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000018s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000006s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000006s : 0.00% remove_dup_value : 0.000017s : 0.00% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000017s : 0.00% add_recomputation : 0.000066s : 0.00% cse_after_recomputation.cse : 0.000014s : 0.00% auto_monad_reorder : 0.000026s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000436s : 0.01% symbol_engine_optimizer.build : 0.000004s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000008s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000052s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.886410s : 97.76% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000118 19 1.74% : 0.000002s : 2: substitution.elim_not_effective 1.07% : 0.000001s : 2: substitution.fold_const_symbol 4.60% : 0.000005s : 3: substitution.graph_param_transform 74.97% : 0.000088s : 2: substitution.inline 2.82% : 0.000003s : 4: substitution.j_node_and_user_rematch 11.45% : 0.000013s : 4: substitution.remove_not_recompute_node 3.35% : 0.000004s : 2: substitution.replace_old_param ------[type_inference.] 0.061454 2 98.73% : 0.060676s : 1: type_inference.infer 1.27% : 0.000777s : 1: type_inference.specialize ------[replace.] 0.000023 2 100.00% : 0.000023s : 2: replace.inline ------[match.] 0.000087 2 100.00% : 0.000087s : 2: match.inline ------[predicate.] 0.000085 485 1.20% : 0.000001s : 7: predicate.accumulaten_eliminater 1.80% : 0.000002s : 3: predicate.ad_related_special_op_eliminate 1.02% : 0.000001s : 7: predicate.addn_check_dump 1.25% : 0.000001s : 7: predicate.addn_zero_filter 1.91% : 0.000002s : 7: predicate.arithmetic_simplify 1.12% : 0.000001s : 7: predicate.cast_eliminate 0.56% : 0.000000s : 3: predicate.check_bprop_eliminate 1.08% : 0.000001s : 7: predicate.compare_switch_simplify 1.11% : 0.000001s : 7: predicate.depend_value_elim 1.07% : 0.000001s : 7: predicate.dict_get_item_const_eliminator 1.22% : 0.000001s : 7: predicate.dict_get_item_eliminator 1.19% : 0.000001s : 7: predicate.dict_set_item_eliminator 1.00% : 0.000001s : 3: predicate.dumpgradient_eliminate 0.48% : 0.000000s : 3: predicate.elim_not_effective 0.78% : 0.000001s : 3: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000001s : 7: predicate.environ_add_const_eliminate 1.06% : 0.000001s : 7: predicate.environ_get_add_eliminate 0.99% : 0.000001s : 7: predicate.environ_get_depend_swap 1.20% : 0.000001s : 7: predicate.environ_get_eliminate 0.98% : 0.000001s : 7: predicate.environ_get_set_eliminate 0.38% : 0.000000s : 3: predicate.fold_const_symbol 1.24% : 0.000001s : 6: predicate.get_grad_eliminate 0.34% : 0.000000s : 3: predicate.graph_param_transform 6.01% : 0.000005s : 15: predicate.inline 1.35% : 0.000001s : 6: predicate.inline_without_move 0.59% : 0.000000s : 6: predicate.j_node_and_user_rematch 1.53% : 0.000001s : 6: predicate.less_batch_normalization 1.22% : 0.000001s : 7: predicate.list_to_tuple_eliminator_ 1.69% : 0.000001s : 10: predicate.load_eliminater 2.04% : 0.000002s : 3: predicate.loop_unroll_after_grad 2.97% : 0.000003s : 14: predicate.loop_unroll_before_grad 2.22% : 0.000002s : 10: predicate.make_slice_get_slice_eliminator 1.05% : 0.000001s : 7: predicate.merge_addn 1.06% : 0.000001s : 7: predicate.minmaximum_grad 2.07% : 0.000002s : 3: predicate.mutable_eliminate 0.66% : 0.000001s : 3: predicate.opt_reshape 2.47% : 0.000002s : 10: predicate.partial_eliminate 1.16% : 0.000001s : 7: predicate.print_const_string_wrapper 1.81% : 0.000002s : 7: predicate.reduce_eliminate 1.24% : 0.000001s : 7: predicate.redundant_stop_gradient_eliminater 1.08% : 0.000001s : 6: predicate.remove_not_recompute_node 1.84% : 0.000002s : 13: predicate.replace_applicator 0.73% : 0.000001s : 6: predicate.replace_old_param 0.35% : 0.000000s : 3: predicate.reset_defer_inline 1.19% : 0.000001s : 7: predicate.reshape_eliminate 1.12% : 0.000001s : 7: predicate.row_tensor_add_zeros_like 0.91% : 0.000001s : 3: predicate.row_tensor_eliminate 1.24% : 0.000001s : 7: predicate.same_eliminate 0.80% : 0.000001s : 6: predicate.set_cell_output_no_recompute 1.51% : 0.000001s : 6: predicate.special_op_eliminate 1.52% : 0.000001s : 6: predicate.specialize_transform 1.37% : 0.000001s : 7: predicate.split_environ_get_set_with_tuple_value 1.12% : 0.000001s : 7: predicate.stack_unstack_eliminate 0.61% : 0.000001s : 3: predicate.switch_call_monad_eliminater 1.88% : 0.000002s : 9: predicate.switch_defer_inline 1.66% : 0.000001s : 9: predicate.switch_layer_defer_inline 5.97% : 0.000005s : 26: predicate.switch_simplify 1.27% : 0.000001s : 7: predicate.tile_eliminate 1.25% : 0.000001s : 7: predicate.transpose_eliminate 1.52% : 0.000001s : 7: predicate.tuple_list_convert_item_index_to_positive 1.28% : 0.000001s : 7: predicate.tuple_list_get_item_depend_reorder 4.34% : 0.000004s : 13: predicate.tuple_list_get_item_eliminator 1.53% : 0.000001s : 7: predicate.tuple_list_set_item_eliminator 1.08% : 0.000001s : 7: predicate.tuple_to_list_eliminator_ 1.52% : 0.000001s : 10: predicate.updatestate_pure_node_eliminater 3.17% : 0.000003s : 16: predicate.updatestate_useless_node_eliminater 1.48% : 0.000001s : 7: predicate.value_based_eliminate 0.52% : 0.000000s : 3: predicate.virtual_view_grad_eliminate 0.87% : 0.000001s : 3: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000368 6 36.16% : 0.000133s : 2: func_graph_cloner_run.FuncGraphClonerGraph 63.84% : 0.000235s : 4: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.961936 76 0.00% : 0.000069s : 1: add_recomputation 0.00% : 0.000121s : 1: auto_monad 0.00% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.03% : 0.001024s : 1: bootstrap 0.00% : 0.000029s : 1: cconv 0.00% : 0.000014s : 1: convert_after_rewriter 0.00% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000020s : 1: environ_conv 0.00% : 0.000017s : 1: event_method 0.00% : 0.000017s : 1: execute 0.00% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000008s : 1: graph_reusing 0.31% : 0.009097s : 1: jit_opt_a 0.00% : 0.000145s : 1: jit_opt_after_cconv 0.00% : 0.000053s : 1: jit_opt_b 0.01% : 0.000406s : 1: loop_unroll 0.02% : 0.000550s : 1: mutable_eliminate 0.02% : 0.000583s : 26: opt.transform.jit_opt_a 0.00% : 0.000048s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000026s : 4: opt.transform.jit_opt_b 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.00% : 0.000021s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000033s : 4: opt.transform.symbol_engine_opt 0.01% : 0.000443s : 1: opt_after_jit_grad 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000015s : 1: pre_auto_parallel 0.00% : 0.000089s : 1: py_interpret_to_execute 0.00% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000019s : 1: remove_dup_value 0.01% : 0.000360s : 1: renormalize.infer 0.01% : 0.000261s : 1: renormalize.specialize 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000091s : 1: rewriter_after_opt_a 0.00% : 0.000089s : 1: rewriter_before_opt_a 0.00% : 0.000076s : 1: symbol_engine_optimizer 97.45% : 2.886449s : 1: task_emit 2.08% : 0.061557s : 1: type_inference 0.00% : 0.000079s : 1: validate [WARNING] ME(103441:281473890602800,ForkProcess-179):2026-01-29-17:52:40.668.652 [mindspore/graph/api.py:128] The function "multinomial_forward_func" at the file "/home/jenkins/mindspore/testcases/testcases/tests/st/mint/test_multinomial.py", line 28 has been compiled again. Try to reuse the function object decorated by @jit to reduce the compile time. For more details, get instructions about `jit` at https://www.mindspore.cn/search?inputValue=jit. TotalTime = 0.0704328, [33] [bootstrap]: 0.00043914 [type_inference]: 0.048325 [event_method]: 0.00015548 [auto_monad]: 0.00018922 [graph_reusing]: 8.1e-06 [pre_auto_parallel]: 2.96001e-06 [py_interpret_to_execute]: 3.514e-05 [rewriter_before_opt_a]: 0.00012362 [expand_dump_flag]: 3.56999e-06 [jit_opt_a]: 0.00810784, [2] [Cycle 1]: 0.00346527, [27] [switch_simplify]: 0.00011386 [loop_unroll]: 5.157e-05 [a_1]: 0.00145693 [with_stream_mark]: 2.036e-05 [recompute_prepare]: 1.78e-05 [updatestate_depend_eliminate]: 2.922e-05 [updatestate_assign_eliminate]: 8.31002e-06 [updatestate_loads_eliminate]: 7.88001e-06 [parameter_eliminate]: 2.16e-06 [specialize_transform]: 1.503e-05 [updatestate_useless_node_eliminater]: 1.64e-05 [accelerated_algorithm]: 2.843e-05 [meta_shard_fg_expand]: 3.41999e-06 [get_grad_eliminate_]: 1.327e-05 [merge_forward]: 7.75e-06 [cell_reuse_recompute_pass]: 9.60019e-07 [cell_reuse_handle_not_recompute_node_pass]: 2.778e-05 [j_node_and_user_rematch]: 2.211e-05 [meta_fg_expand]: 5.69e-06 [replace_old_param]: 1.881e-05 [inline_without_move]: 1.373e-05 [renormalize]: 0.00119927 [add_forward_monad_depend]: 2.973e-05 [auto_monad_grad]: 1.74998e-06 [auto_monad_eliminator]: 2.7e-05 [cse]: 0.00012877 [replace_applicator]: 2.126e-05 [Cycle 2]: 0.00075851, [27] [switch_simplify]: 1.418e-05 [loop_unroll]: 1.272e-05 [a_1]: 0.00034972 [with_stream_mark]: 1.347e-05 [recompute_prepare]: 1.37e-05 [updatestate_depend_eliminate]: 8.03001e-06 [updatestate_assign_eliminate]: 6.76999e-06 [updatestate_loads_eliminate]: 6.41998e-06 [parameter_eliminate]: 1.07998e-06 [specialize_transform]: 1.324e-05 [updatestate_useless_node_eliminater]: 1.646e-05 [accelerated_algorithm]: 1.562e-05 [meta_shard_fg_expand]: 2.37999e-06 [get_grad_eliminate_]: 1.268e-05 [merge_forward]: 6.60002e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.391e-05 [j_node_and_user_rematch]: 2.056e-05 [meta_fg_expand]: 4.88001e-06 [replace_old_param]: 1.701e-05 [inline_without_move]: 1.295e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.37e-06 [auto_monad_grad]: 9.50007e-07 [auto_monad_eliminator]: 1.611e-05 [cse]: 3.69e-05 [replace_applicator]: 1.339e-05 [py_interpret_to_execute_after_opt_a]: 1.328e-05 [rewriter_after_opt_a]: 0.00051758 [convert_after_rewriter]: 1.587e-05 [order_py_execute_after_rewriter]: 9.64e-06 [mutable_eliminate]: 0.00048528 [jit_opt_b]: 0.00010952, [1] [Cycle 1]: 0.00010318, [2] [frontend_op_eliminate]: 4.193e-05 [inline_after_opt_a]: 4.97e-05 [cconv]: 2.475e-05 [loop_unroll]: 0.00043932 [jit_opt_after_cconv]: 0.00027522, [1] [Cycle 1]: 0.00026871, [11] [c_1]: 6.9e-05 [parameter_eliminate]: 2.26e-06 [updatestate_depend_eliminate]: 1.029e-05 [updatestate_assign_eliminate]: 6.86001e-06 [updatestate_loads_eliminate]: 6.74001e-06 [cse]: 4.79e-05 [call_graph_tuple_transform]: 3.504e-05 [tuple_list_get_item_eliminator]: 2.182e-05 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 2.69996e-07 [switch_simplify]: 1.425e-05 [remove_dup_value]: 4.616e-05 [partial_unused_args_eliminate]: 2.34001e-06 [environ_conv]: 1.113e-05 [add_recomputation]: 8.953e-05 [cse_after_recomputation]: 4.378e-05, [1] [Cycle 1]: 3.786e-05, [1] [cse]: 3.181e-05 [auto_monad_reorder]: 3.269e-05 [get_jit_bprop_graph]: 1.28002e-06 [rewriter_after_jit_bprop_graph]: 4.08001e-06 [opt_after_jit_grad]: 0.00051773 [symbol_engine_optimizer]: 0.00012129, [1] [Cycle 1]: 0.00011536, [6] [build]: 9.37999e-06 [elim_shapecalc]: 1.673e-05 [elim_not_effective]: 2.547e-05 [opt_reshape]: 1.352e-05 [fold_const_symbol]: 2.153e-05 [renormalize]: 3.59985e-07 [validate]: 0.00014314 [backend_pass]: 1.18001e-06 [task_emit]: 0.0098999 [execute]: 5.22e-06 Sums bootstrap : 0.000439s : 0.67% type_inference : 0.048325s : 73.33% event_method : 0.000155s : 0.24% auto_monad : 0.000189s : 0.29% graph_reusing : 0.000008s : 0.01% pre_auto_parallel : 0.000003s : 0.00% py_interpret_to_execute : 0.000035s : 0.05% rewriter_before_opt_a : 0.000124s : 0.19% expand_dump_flag : 0.000004s : 0.01% jit_opt_a.switch_simplify : 0.000128s : 0.19% jit_opt_a.loop_unroll : 0.000064s : 0.10% jit_opt_a.a_1 : 0.001807s : 2.74% jit_opt_a.with_stream_mark : 0.000034s : 0.05% jit_opt_a.recompute_prepare : 0.000031s : 0.05% jit_opt_a.updatestate_depend_eliminate : 0.000037s : 0.06% jit_opt_a.updatestate_assign_eliminate : 0.000015s : 0.02% jit_opt_a.updatestate_loads_eliminate : 0.000014s : 0.02% jit_opt_a.parameter_eliminate : 0.000003s : 0.00% jit_opt_a.specialize_transform : 0.000028s : 0.04% jit_opt_a.updatestate_useless_node_eliminater : 0.000033s : 0.05% jit_opt_a.accelerated_algorithm : 0.000044s : 0.07% jit_opt_a.meta_shard_fg_expand : 0.000006s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000026s : 0.04% jit_opt_a.merge_forward : 0.000014s : 0.02% jit_opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000052s : 0.08% jit_opt_a.j_node_and_user_rematch : 0.000043s : 0.06% jit_opt_a.meta_fg_expand : 0.000011s : 0.02% jit_opt_a.replace_old_param : 0.000036s : 0.05% jit_opt_a.inline_without_move : 0.000027s : 0.04% jit_opt_a.renormalize : 0.001199s : 1.82% jit_opt_a.add_forward_monad_depend : 0.000031s : 0.05% jit_opt_a.auto_monad_grad : 0.000003s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000043s : 0.07% jit_opt_a.cse : 0.000166s : 0.25% jit_opt_a.replace_applicator : 0.000035s : 0.05% py_interpret_to_execute_after_opt_a : 0.000013s : 0.02% rewriter_after_opt_a : 0.000518s : 0.79% convert_after_rewriter : 0.000016s : 0.02% order_py_execute_after_rewriter : 0.000010s : 0.01% mutable_eliminate : 0.000485s : 0.74% jit_opt_b.frontend_op_eliminate : 0.000042s : 0.06% jit_opt_b.inline_after_opt_a : 0.000050s : 0.08% cconv : 0.000025s : 0.04% loop_unroll : 0.000439s : 0.67% jit_opt_after_cconv.c_1 : 0.000069s : 0.10% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.02% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.01% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.01% jit_opt_after_cconv.cse : 0.000048s : 0.07% jit_opt_after_cconv.call_graph_tuple_transform : 0.000035s : 0.05% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000022s : 0.03% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000014s : 0.02% remove_dup_value : 0.000046s : 0.07% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000011s : 0.02% add_recomputation : 0.000090s : 0.14% cse_after_recomputation.cse : 0.000032s : 0.05% auto_monad_reorder : 0.000033s : 0.05% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000518s : 0.79% symbol_engine_optimizer.build : 0.000009s : 0.01% symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.03% symbol_engine_optimizer.elim_not_effective : 0.000025s : 0.04% symbol_engine_optimizer.opt_reshape : 0.000014s : 0.02% symbol_engine_optimizer.fold_const_symbol : 0.000022s : 0.03% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000143s : 0.22% backend_pass : 0.000001s : 0.00% task_emit : 0.009900s : 15.02% execute : 0.000005s : 0.01% Time group info: ------[substitution.] 0.000424 147 3.06% : 0.000013s : 7: substitution.depend_value_elim 0.84% : 0.000004s : 8: substitution.elim_not_effective 0.72% : 0.000003s : 8: substitution.fold_const_symbol 2.25% : 0.000010s : 11: substitution.graph_param_transform 52.10% : 0.000221s : 10: substitution.inline 1.59% : 0.000007s : 16: substitution.j_node_and_user_rematch 3.38% : 0.000014s : 2: substitution.less_batch_normalization 2.87% : 0.000012s : 10: substitution.minmaximum_grad 2.47% : 0.000010s : 16: substitution.remove_not_recompute_node 1.43% : 0.000006s : 6: substitution.replace_old_param 1.79% : 0.000008s : 1: substitution.switch_simplify 5.93% : 0.000025s : 10: substitution.tuple_list_convert_item_index_to_positive 6.92% : 0.000029s : 12: substitution.tuple_list_get_item_depend_reorder 10.27% : 0.000044s : 18: substitution.tuple_list_get_item_eliminator 2.03% : 0.000009s : 5: substitution.updatestate_pure_node_eliminater 2.34% : 0.000010s : 7: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.048260 2 96.44% : 0.046544s : 1: type_inference.infer 3.56% : 0.001716s : 1: type_inference.specialize ------[replace.] 0.000116 15 63.93% : 0.000074s : 10: replace.inline 16.65% : 0.000019s : 1: replace.switch_simplify 11.52% : 0.000013s : 2: replace.tuple_list_get_item_depend_reorder 7.90% : 0.000009s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000239 15 89.98% : 0.000215s : 10: match.inline 2.82% : 0.000007s : 1: match.switch_simplify 5.76% : 0.000014s : 2: match.tuple_list_get_item_depend_reorder 1.44% : 0.000003s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000374 2602 1.44% : 0.000005s : 42: predicate.accumulaten_eliminater 0.85% : 0.000003s : 11: predicate.ad_related_special_op_eliminate 1.40% : 0.000005s : 42: predicate.addn_check_dump 1.47% : 0.000006s : 42: predicate.addn_zero_filter 2.01% : 0.000008s : 42: predicate.arithmetic_simplify 1.50% : 0.000006s : 42: predicate.cast_eliminate 0.45% : 0.000002s : 11: predicate.check_bprop_eliminate 1.42% : 0.000005s : 42: predicate.compare_switch_simplify 1.54% : 0.000006s : 42: predicate.depend_value_elim 1.43% : 0.000005s : 42: predicate.dict_get_item_const_eliminator 1.46% : 0.000005s : 42: predicate.dict_get_item_eliminator 1.48% : 0.000006s : 42: predicate.dict_set_item_eliminator 0.59% : 0.000002s : 11: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 11: predicate.elim_not_effective 0.47% : 0.000002s : 11: predicate.elim_shapecalc_of_broadcastargs 1.42% : 0.000005s : 42: predicate.environ_add_const_eliminate 1.41% : 0.000005s : 42: predicate.environ_get_add_eliminate 1.42% : 0.000005s : 42: predicate.environ_get_depend_swap 1.44% : 0.000005s : 42: predicate.environ_get_eliminate 1.42% : 0.000005s : 42: predicate.environ_get_set_eliminate 0.25% : 0.000001s : 11: predicate.fold_const_symbol 0.86% : 0.000003s : 22: predicate.get_grad_eliminate 0.25% : 0.000001s : 11: predicate.graph_param_transform 4.45% : 0.000017s : 78: predicate.inline 0.93% : 0.000003s : 22: predicate.inline_without_move 0.42% : 0.000002s : 22: predicate.j_node_and_user_rematch 1.06% : 0.000004s : 22: predicate.less_batch_normalization 1.63% : 0.000006s : 46: predicate.list_to_tuple_eliminator_ 2.04% : 0.000008s : 57: predicate.load_eliminater 0.87% : 0.000003s : 11: predicate.loop_unroll_after_grad 2.54% : 0.000010s : 67: predicate.loop_unroll_before_grad 2.15% : 0.000008s : 55: predicate.make_slice_get_slice_eliminator 1.42% : 0.000005s : 42: predicate.merge_addn 1.53% : 0.000006s : 42: predicate.minmaximum_grad 0.89% : 0.000003s : 11: predicate.mutable_eliminate 0.45% : 0.000002s : 11: predicate.opt_reshape 2.44% : 0.000009s : 57: predicate.partial_eliminate 1.46% : 0.000005s : 42: predicate.print_const_string_wrapper 1.90% : 0.000007s : 42: predicate.reduce_eliminate 1.60% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.46% : 0.000002s : 22: predicate.remove_not_recompute_node 1.77% : 0.000007s : 68: predicate.replace_applicator 0.46% : 0.000002s : 22: predicate.replace_old_param 0.26% : 0.000001s : 11: predicate.reset_defer_inline 1.52% : 0.000006s : 42: predicate.reshape_eliminate 1.44% : 0.000005s : 42: predicate.row_tensor_add_zeros_like 0.59% : 0.000002s : 11: predicate.row_tensor_eliminate 1.51% : 0.000006s : 42: predicate.same_eliminate 0.55% : 0.000002s : 25: predicate.set_cell_output_no_recompute 0.86% : 0.000003s : 22: predicate.special_op_eliminate 0.95% : 0.000004s : 22: predicate.specialize_transform 1.61% : 0.000006s : 42: predicate.split_environ_get_set_with_tuple_value 1.60% : 0.000006s : 42: predicate.stack_unstack_eliminate 0.45% : 0.000002s : 11: predicate.switch_call_monad_eliminater 2.38% : 0.000009s : 56: predicate.switch_defer_inline 2.18% : 0.000008s : 56: predicate.switch_layer_defer_inline 5.66% : 0.000021s : 136: predicate.switch_simplify 1.46% : 0.000005s : 42: predicate.tile_eliminate 1.46% : 0.000005s : 42: predicate.transpose_eliminate 1.85% : 0.000007s : 42: predicate.tuple_list_convert_item_index_to_positive 1.94% : 0.000007s : 44: predicate.tuple_list_get_item_depend_reorder 3.52% : 0.000013s : 68: predicate.tuple_list_get_item_eliminator 1.99% : 0.000007s : 44: predicate.tuple_list_set_item_eliminator 1.58% : 0.000006s : 46: predicate.tuple_to_list_eliminator_ 2.00% : 0.000008s : 57: predicate.updatestate_pure_node_eliminater 3.18% : 0.000012s : 79: predicate.updatestate_useless_node_eliminater 1.80% : 0.000007s : 42: predicate.value_based_eliminate 0.40% : 0.000001s : 11: predicate.virtual_view_grad_eliminate 0.50% : 0.000002s : 11: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001653 19 64.40% : 0.001065s : 7: func_graph_cloner_run.FuncGraphClonerGraph 35.60% : 0.000589s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.074299 76 0.12% : 0.000092s : 1: add_recomputation 0.27% : 0.000197s : 1: auto_monad 0.05% : 0.000036s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: backend_pass 0.65% : 0.000480s : 1: bootstrap 0.04% : 0.000027s : 1: cconv 0.03% : 0.000019s : 1: convert_after_rewriter 0.06% : 0.000046s : 1: cse_after_recomputation 0.02% : 0.000013s : 1: environ_conv 0.22% : 0.000162s : 1: event_method 0.01% : 0.000009s : 1: execute 0.01% : 0.000005s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 10.92% : 0.008110s : 1: jit_opt_a 0.37% : 0.000278s : 1: jit_opt_after_cconv 0.15% : 0.000112s : 1: jit_opt_b 0.60% : 0.000446s : 1: loop_unroll 0.66% : 0.000493s : 1: mutable_eliminate 3.13% : 0.002323s : 26: opt.transform.jit_opt_a 0.18% : 0.000137s : 4: opt.transform.jit_opt_after_cconv 0.11% : 0.000085s : 4: opt.transform.jit_opt_b 0.03% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000023s : 1: opt.transform.mutable_eliminate 0.06% : 0.000046s : 1: opt.transform.opt_after_jit_grad 0.10% : 0.000074s : 4: opt.transform.symbol_engine_opt 0.71% : 0.000526s : 1: opt_after_jit_grad 0.02% : 0.000012s : 1: order_py_execute_after_rewriter 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pre_auto_parallel 0.05% : 0.000038s : 1: py_interpret_to_execute 0.02% : 0.000016s : 1: py_interpret_to_execute_after_opt_a 0.07% : 0.000049s : 1: remove_dup_value 0.81% : 0.000599s : 1: renormalize.infer 0.80% : 0.000593s : 1: renormalize.specialize 0.01% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.70% : 0.000521s : 1: rewriter_after_opt_a 0.17% : 0.000127s : 1: rewriter_before_opt_a 0.17% : 0.000124s : 1: symbol_engine_optimizer 13.34% : 0.009908s : 1: task_emit 65.05% : 0.048335s : 1: type_inference 0.24% : 0.000181s : 1: validate [WARNING] ME(103441:281473890602800,ForkProcess-179):2026-01-29-17:52:40.745.440 [mindspore/graph/api.py:128] The function "multinomial_forward_func" at the file "/home/jenkins/mindspore/testcases/testcases/tests/st/mint/test_multinomial.py", line 28 has been compiled again. Try to reuse the function object decorated by @jit to reduce the compile time. For more details, get instructions about `jit` at https://www.mindspore.cn/search?inputValue=jit. TotalTime = 0.104202, [33] [bootstrap]: 0.00044202 [type_inference]: 0.0624274 [event_method]: 0.00030092 [auto_monad]: 0.00015269 [graph_reusing]: 8.59002e-06 [pre_auto_parallel]: 3.47002e-06 [py_interpret_to_execute]: 4.399e-05 [rewriter_before_opt_a]: 0.00014016 [expand_dump_flag]: 4.52e-06 [jit_opt_a]: 0.0224679, [3] [Cycle 1]: 0.0162393, [27] [switch_simplify]: 0.00016245 [loop_unroll]: 5.355e-05 [a_1]: 0.00126161 [with_stream_mark]: 2.535e-05 [recompute_prepare]: 2.191e-05 [updatestate_depend_eliminate]: 9.42999e-06 [updatestate_assign_eliminate]: 7.74002e-06 [updatestate_loads_eliminate]: 7.06001e-06 [parameter_eliminate]: 2.59999e-06 [specialize_transform]: 1.468e-05 [updatestate_useless_node_eliminater]: 1.385e-05 [accelerated_algorithm]: 1.411e-05 [meta_shard_fg_expand]: 4.08001e-06 [get_grad_eliminate_]: 1.338e-05 [merge_forward]: 8.73001e-06 [cell_reuse_recompute_pass]: 1.04e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.026e-05 [j_node_and_user_rematch]: 2.493e-05 [meta_fg_expand]: 0.00190126 [replace_old_param]: 6.802e-05 [inline_without_move]: 6.074e-05 [renormalize]: 0.0119199 [add_forward_monad_depend]: 1.54e-05 [auto_monad_grad]: 6.72002e-06 [auto_monad_eliminator]: 6.367e-05 [cse]: 0.00024823 [replace_applicator]: 7.214e-05 [Cycle 2]: 0.00250883, [27] [switch_simplify]: 4.126e-05 [loop_unroll]: 3.905e-05 [a_1]: 0.00117577 [with_stream_mark]: 1.194e-05 [recompute_prepare]: 8.54998e-06 [updatestate_depend_eliminate]: 3.57002e-06 [updatestate_assign_eliminate]: 2.98e-06 [updatestate_loads_eliminate]: 2.64001e-06 [parameter_eliminate]: 1.14e-06 [specialize_transform]: 6.85998e-06 [updatestate_useless_node_eliminater]: 6.31e-06 [accelerated_algorithm]: 6.96999e-06 [meta_shard_fg_expand]: 1.59998e-06 [get_grad_eliminate_]: 5.76998e-06 [merge_forward]: 3.2e-06 [cell_reuse_recompute_pass]: 9.20001e-07 [cell_reuse_handle_not_recompute_node_pass]: 1.258e-05 [j_node_and_user_rematch]: 9.85002e-06 [meta_fg_expand]: 7.032e-05 [replace_old_param]: 1.018e-05 [inline_without_move]: 6.31e-06 [renormalize]: 0.00088127 [add_forward_monad_depend]: 4.10998e-06 [auto_monad_grad]: 1.35001e-06 [auto_monad_eliminator]: 1.162e-05 [cse]: 2.08e-05 [replace_applicator]: 1.334e-05 [Cycle 3]: 0.00037938, [27] [switch_simplify]: 7.3e-06 [loop_unroll]: 6.36e-06 [a_1]: 0.00013268 [with_stream_mark]: 8.45999e-06 [recompute_prepare]: 6.06e-06 [updatestate_depend_eliminate]: 3.73001e-06 [updatestate_assign_eliminate]: 2.68e-06 [updatestate_loads_eliminate]: 2.48e-06 [parameter_eliminate]: 9.50007e-07 [specialize_transform]: 6.41998e-06 [updatestate_useless_node_eliminater]: 5.92001e-06 [accelerated_algorithm]: 6.21e-06 [meta_shard_fg_expand]: 1.39e-06 [get_grad_eliminate_]: 5.74e-06 [merge_forward]: 2.99999e-06 [cell_reuse_recompute_pass]: 1.10001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.282e-05 [j_node_and_user_rematch]: 9.69e-06 [meta_fg_expand]: 2.20002e-06 [replace_old_param]: 8.59e-06 [inline_without_move]: 5.87001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.00001e-06 [auto_monad_grad]: 8.00006e-07 [auto_monad_eliminator]: 6.54001e-06 [cse]: 1.392e-05 [replace_applicator]: 6.36e-06 [py_interpret_to_execute_after_opt_a]: 9.09e-06 [rewriter_after_opt_a]: 3.493e-05 [convert_after_rewriter]: 8.37998e-06 [order_py_execute_after_rewriter]: 5.54e-06 [mutable_eliminate]: 0.00046711 [jit_opt_b]: 5.444e-05, [1] [Cycle 1]: 4.841e-05, [2] [frontend_op_eliminate]: 1.905e-05 [inline_after_opt_a]: 1.779e-05 [cconv]: 2.201e-05 [loop_unroll]: 0.00041354 [jit_opt_after_cconv]: 0.00015114, [1] [Cycle 1]: 0.0001453, [11] [c_1]: 2.624e-05 [parameter_eliminate]: 2.47001e-06 [updatestate_depend_eliminate]: 5.97999e-06 [updatestate_assign_eliminate]: 2.89999e-06 [updatestate_loads_eliminate]: 2.56e-06 [cse]: 2.083e-05 [call_graph_tuple_transform]: 2.091e-05 [tuple_list_get_item_eliminator]: 6.56e-06 [none_parameter_eliminate]: 1.50001e-06 [renormalize]: 4.50003e-07 [switch_simplify]: 6.44999e-06 [remove_dup_value]: 1.509e-05 [partial_unused_args_eliminate]: 2.11e-06 [environ_conv]: 5.91e-06 [add_recomputation]: 4.485e-05 [cse_after_recomputation]: 2.436e-05, [1] [Cycle 1]: 1.887e-05, [1] [cse]: 1.326e-05 [auto_monad_reorder]: 1.928e-05 [get_jit_bprop_graph]: 1.25001e-06 [rewriter_after_jit_bprop_graph]: 4.28001e-06 [opt_after_jit_grad]: 0.00045778 [symbol_engine_optimizer]: 7.656e-05, [1] [Cycle 1]: 7.105e-05, [6] [build]: 3.34001e-06 [elim_shapecalc]: 8.71002e-06 [elim_not_effective]: 1.526e-05 [opt_reshape]: 6.29001e-06 [fold_const_symbol]: 1.061e-05 [renormalize]: 4.50003e-07 [validate]: 3.429e-05 [backend_pass]: 1.07998e-06 [task_emit]: 0.016152 [execute]: 6.59001e-06 Sums bootstrap : 0.000442s : 0.44% type_inference : 0.062427s : 62.38% event_method : 0.000301s : 0.30% auto_monad : 0.000153s : 0.15% graph_reusing : 0.000009s : 0.01% pre_auto_parallel : 0.000003s : 0.00% py_interpret_to_execute : 0.000044s : 0.04% rewriter_before_opt_a : 0.000140s : 0.14% expand_dump_flag : 0.000005s : 0.00% jit_opt_a.switch_simplify : 0.000211s : 0.21% jit_opt_a.loop_unroll : 0.000099s : 0.10% jit_opt_a.a_1 : 0.002570s : 2.57% jit_opt_a.with_stream_mark : 0.000046s : 0.05% jit_opt_a.recompute_prepare : 0.000037s : 0.04% jit_opt_a.updatestate_depend_eliminate : 0.000017s : 0.02% jit_opt_a.updatestate_assign_eliminate : 0.000013s : 0.01% jit_opt_a.updatestate_loads_eliminate : 0.000012s : 0.01% jit_opt_a.parameter_eliminate : 0.000005s : 0.00% jit_opt_a.specialize_transform : 0.000028s : 0.03% jit_opt_a.updatestate_useless_node_eliminater : 0.000026s : 0.03% jit_opt_a.accelerated_algorithm : 0.000027s : 0.03% jit_opt_a.meta_shard_fg_expand : 0.000007s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000025s : 0.02% jit_opt_a.merge_forward : 0.000015s : 0.01% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000056s : 0.06% jit_opt_a.j_node_and_user_rematch : 0.000044s : 0.04% jit_opt_a.meta_fg_expand : 0.001974s : 1.97% jit_opt_a.replace_old_param : 0.000087s : 0.09% jit_opt_a.inline_without_move : 0.000073s : 0.07% jit_opt_a.renormalize : 0.012801s : 12.79% jit_opt_a.add_forward_monad_depend : 0.000021s : 0.02% jit_opt_a.auto_monad_grad : 0.000009s : 0.01% jit_opt_a.auto_monad_eliminator : 0.000082s : 0.08% jit_opt_a.cse : 0.000283s : 0.28% jit_opt_a.replace_applicator : 0.000092s : 0.09% py_interpret_to_execute_after_opt_a : 0.000009s : 0.01% rewriter_after_opt_a : 0.000035s : 0.03% convert_after_rewriter : 0.000008s : 0.01% order_py_execute_after_rewriter : 0.000006s : 0.01% mutable_eliminate : 0.000467s : 0.47% jit_opt_b.frontend_op_eliminate : 0.000019s : 0.02% jit_opt_b.inline_after_opt_a : 0.000018s : 0.02% cconv : 0.000022s : 0.02% loop_unroll : 0.000414s : 0.41% jit_opt_after_cconv.c_1 : 0.000026s : 0.03% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000021s : 0.02% jit_opt_after_cconv.call_graph_tuple_transform : 0.000021s : 0.02% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000007s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000006s : 0.01% remove_dup_value : 0.000015s : 0.02% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000006s : 0.01% add_recomputation : 0.000045s : 0.04% cse_after_recomputation.cse : 0.000013s : 0.01% auto_monad_reorder : 0.000019s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000458s : 0.46% symbol_engine_optimizer.build : 0.000003s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.01% symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% symbol_engine_optimizer.opt_reshape : 0.000006s : 0.01% symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000034s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.016152s : 16.14% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.000663 130 3.46% : 0.000023s : 1: substitution.arithmetic_simplify 2.12% : 0.000014s : 2: substitution.cast_eliminate 0.37% : 0.000002s : 3: substitution.elim_not_effective 0.26% : 0.000002s : 3: substitution.fold_const_symbol 0.83% : 0.000005s : 4: substitution.graph_param_transform 64.00% : 0.000425s : 17: substitution.inline 2.52% : 0.000017s : 2: substitution.inline_without_move 1.21% : 0.000008s : 14: substitution.j_node_and_user_rematch 1.45% : 0.000010s : 7: substitution.minmaximum_grad 2.91% : 0.000019s : 11: substitution.partial_eliminate 1.61% : 0.000011s : 14: substitution.remove_not_recompute_node 3.48% : 0.000023s : 9: substitution.replace_applicator 1.49% : 0.000010s : 13: substitution.replace_old_param 0.41% : 0.000003s : 1: substitution.set_cell_output_no_recompute 3.34% : 0.000022s : 3: substitution.switch_simplify 3.14% : 0.000021s : 7: substitution.tuple_list_convert_item_index_to_positive 2.26% : 0.000015s : 7: substitution.tuple_list_get_item_depend_reorder 5.14% : 0.000034s : 12: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.062345 2 95.98% : 0.059836s : 1: type_inference.infer 4.02% : 0.002509s : 1: type_inference.specialize ------[replace.] 0.000226 26 2.42% : 0.000005s : 1: replace.arithmetic_simplify 59.55% : 0.000135s : 17: replace.inline 17.76% : 0.000040s : 3: replace.switch_simplify 20.26% : 0.000046s : 5: replace.tuple_list_get_item_eliminator ------[match.] 0.000468 26 4.73% : 0.000022s : 1: match.arithmetic_simplify 88.62% : 0.000414s : 17: match.inline 4.43% : 0.000021s : 3: match.switch_simplify 2.21% : 0.000010s : 5: match.tuple_list_get_item_eliminator ------[predicate.] 0.000418 2906 1.55% : 0.000006s : 49: predicate.accumulaten_eliminater 0.45% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 1.45% : 0.000006s : 49: predicate.addn_check_dump 1.51% : 0.000006s : 49: predicate.addn_zero_filter 2.19% : 0.000009s : 50: predicate.arithmetic_simplify 1.71% : 0.000007s : 50: predicate.cast_eliminate 0.17% : 0.000001s : 4: predicate.check_bprop_eliminate 1.42% : 0.000006s : 49: predicate.compare_switch_simplify 1.52% : 0.000006s : 49: predicate.depend_value_elim 1.48% : 0.000006s : 50: predicate.dict_get_item_const_eliminator 1.61% : 0.000007s : 50: predicate.dict_get_item_eliminator 1.51% : 0.000006s : 50: predicate.dict_set_item_eliminator 0.26% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.13% : 0.000001s : 4: predicate.elim_not_effective 0.21% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.55% : 0.000006s : 50: predicate.environ_add_const_eliminate 1.49% : 0.000006s : 50: predicate.environ_get_add_eliminate 1.50% : 0.000006s : 50: predicate.environ_get_depend_swap 1.49% : 0.000006s : 50: predicate.environ_get_eliminate 1.49% : 0.000006s : 50: predicate.environ_get_set_eliminate 0.10% : 0.000000s : 4: predicate.fold_const_symbol 0.75% : 0.000003s : 19: predicate.get_grad_eliminate 0.08% : 0.000000s : 4: predicate.graph_param_transform 4.39% : 0.000018s : 80: predicate.inline 1.94% : 0.000008s : 49: predicate.inline_without_move 0.34% : 0.000001s : 19: predicate.j_node_and_user_rematch 0.83% : 0.000003s : 19: predicate.less_batch_normalization 1.75% : 0.000007s : 55: predicate.list_to_tuple_eliminator_ 1.89% : 0.000008s : 59: predicate.load_eliminater 0.42% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.30% : 0.000014s : 96: predicate.loop_unroll_before_grad 1.82% : 0.000008s : 54: predicate.make_slice_get_slice_eliminator 1.42% : 0.000006s : 49: predicate.merge_addn 1.55% : 0.000006s : 50: predicate.minmaximum_grad 0.50% : 0.000002s : 4: predicate.mutable_eliminate 0.17% : 0.000001s : 4: predicate.opt_reshape 2.38% : 0.000010s : 59: predicate.partial_eliminate 1.54% : 0.000006s : 49: predicate.print_const_string_wrapper 1.94% : 0.000008s : 50: predicate.reduce_eliminate 1.72% : 0.000007s : 55: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000002s : 19: predicate.remove_not_recompute_node 2.45% : 0.000010s : 103: predicate.replace_applicator 1.05% : 0.000004s : 49: predicate.replace_old_param 0.11% : 0.000000s : 4: predicate.reset_defer_inline 1.58% : 0.000007s : 50: predicate.reshape_eliminate 1.48% : 0.000006s : 49: predicate.row_tensor_add_zeros_like 0.23% : 0.000001s : 4: predicate.row_tensor_eliminate 1.57% : 0.000007s : 49: predicate.same_eliminate 0.45% : 0.000002s : 19: predicate.set_cell_output_no_recompute 0.36% : 0.000002s : 8: predicate.special_op_eliminate 0.78% : 0.000003s : 19: predicate.specialize_transform 1.73% : 0.000007s : 49: predicate.split_environ_get_set_with_tuple_value 1.53% : 0.000006s : 49: predicate.stack_unstack_eliminate 0.15% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.88% : 0.000012s : 72: predicate.switch_defer_inline 2.62% : 0.000011s : 72: predicate.switch_layer_defer_inline 6.75% : 0.000028s : 178: predicate.switch_simplify 1.52% : 0.000006s : 50: predicate.tile_eliminate 1.49% : 0.000006s : 50: predicate.transpose_eliminate 1.89% : 0.000008s : 50: predicate.tuple_list_convert_item_index_to_positive 1.78% : 0.000007s : 50: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000013s : 63: predicate.tuple_list_get_item_eliminator 1.95% : 0.000008s : 50: predicate.tuple_list_set_item_eliminator 1.76% : 0.000007s : 55: predicate.tuple_to_list_eliminator_ 1.80% : 0.000008s : 59: predicate.updatestate_pure_node_eliminater 2.77% : 0.000012s : 78: predicate.updatestate_useless_node_eliminater 1.95% : 0.000008s : 49: predicate.value_based_eliminate 0.13% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.19% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002802 38 60.84% : 0.001705s : 17: func_graph_cloner_run.FuncGraphClonerGraph 39.16% : 0.001097s : 21: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.120454 91 0.04% : 0.000047s : 1: add_recomputation 0.13% : 0.000159s : 1: auto_monad 0.02% : 0.000022s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.38% : 0.000457s : 1: bootstrap 0.02% : 0.000024s : 1: cconv 0.01% : 0.000011s : 1: convert_after_rewriter 0.02% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: environ_conv 0.26% : 0.000307s : 1: event_method 0.01% : 0.000010s : 1: execute 0.01% : 0.000007s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 18.66% : 0.022471s : 1: jit_opt_a 0.13% : 0.000154s : 1: jit_opt_after_cconv 0.05% : 0.000057s : 1: jit_opt_b 0.35% : 0.000421s : 1: loop_unroll 0.39% : 0.000475s : 1: mutable_eliminate 2.76% : 0.003326s : 39: opt.transform.jit_opt_a 0.05% : 0.000057s : 4: opt.transform.jit_opt_after_cconv 0.03% : 0.000031s : 4: opt.transform.jit_opt_b 0.01% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000015s : 1: opt.transform.mutable_eliminate 0.02% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000038s : 4: opt.transform.symbol_engine_opt 0.39% : 0.000465s : 1: opt_after_jit_grad 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pre_auto_parallel 0.04% : 0.000047s : 1: py_interpret_to_execute 0.01% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000017s : 1: remove_dup_value 8.80% : 0.010603s : 2: renormalize.infer 1.81% : 0.002183s : 2: renormalize.specialize 0.01% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000038s : 1: rewriter_after_opt_a 0.12% : 0.000144s : 1: rewriter_before_opt_a 0.07% : 0.000079s : 1: symbol_engine_optimizer 13.42% : 0.016162s : 1: task_emit 51.84% : 0.062439s : 1: type_inference 0.05% : 0.000061s : 1: validate TotalTime = 2.80343, [24] [bootstrap]: 0.00077721 [type_inference]: 0.0107473 [event_method]: 8.85999e-06 [auto_monad]: 0.00023111 [graph_reusing]: 5.30999e-06 [inline]: 1.96e-06 [add_attr]: 0.00788477, [1] [add_attr_with_inline]: 0.00787253, [1] [Cycle 1]: 0.00010695, [2] [tag_attr]: 2.52e-05 [meta_addattr_fg_expand]: 1.31e-05 [parallel-infer-symbol]: 2.65002e-06 [pre_auto_parallel]: 4.215e-05 [insert-virtual-dataset]: 2.37999e-06 [parallel-infer-symbol-second]: 7.39994e-07 [dataset_repeat_opt]: 1.92001e-06 [pipeline_split]: 1.60999e-06 [optimize]: 0.00605684, [53] [py_interpret_to_execute]: 3.88999e-06 [rewriter_before_opt_a]: 6.357e-05 [opt_a]: 0.00350669, [2] [Cycle 1]: 0.0024211, [45] [expand_dump_flag]: 2.98e-06 [switch_simplify]: 5.737e-05 [loop_unroll]: 1.566e-05 [a_1]: 0.00047972 [with_stream_mark]: 1.551e-05 [recompute_prepare]: 1.232e-05 [updatestate_depend_eliminate]: 1.638e-05 [updatestate_assign_eliminate]: 8.22e-06 [updatestate_loads_eliminate]: 3.038e-05 [parameter_eliminate]: 1.83002e-06 [a_2]: 0.00017509 [accelerated_algorithm]: 3.762e-05 [shard]: 2.24999e-06 [meta_shard_fg_expand]: 2.38002e-06 [shard_inline]: 1.172e-05 [merge_send_recv]: 4.93e-05 [auto_parallel]: 9.46e-06 [parallel]: 9.355e-05 [flash_sp]: 3.489e-05 [merge_comm]: 7.3e-06 [allreduce_fusion]: 1.458e-05 [matmul_add_comm_reduction]: 2.115e-05 [allreduce_slice_to_reducescatter]: 9.28002e-06 [virtual_shard_identity]: 1.46e-05 [virtual_dataset]: 1.375e-05 [get_grad_eliminate_]: 1.144e-05 [virtual_output]: 1.139e-05 [merge_forward]: 6.46e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [offload_activation]: 2.229e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.228e-05 [merge_recompute_call_nodes]: 1.42e-06 [before_grad]: 2.023e-05 [set_forward_comm_id_for_comm_node_pass]: 1.66e-05 [meta_fg_expand]: 4.75999e-06 [flash_sp_send_recv_attached]: 4.68001e-06 [receive_attached]: 1.93e-05 [after_resolve]: 1.76e-05 [a_after_grad]: 1.853e-05 [renormalize]: 0.00062632 [add_forward_monad_depend]: 4.67998e-06 [auto_monad_grad]: 1.89e-06 [auto_monad_eliminator]: 4.851e-05 [cse]: 9.166e-05 [a_3]: 8.336e-05 [Cycle 2]: 0.00107698, [45] [expand_dump_flag]: 8.30012e-07 [switch_simplify]: 1.251e-05 [loop_unroll]: 1.13e-05 [a_1]: 0.00031485 [with_stream_mark]: 1.001e-05 [recompute_prepare]: 1.117e-05 [updatestate_depend_eliminate]: 6.12999e-06 [updatestate_assign_eliminate]: 6.19001e-06 [updatestate_loads_eliminate]: 7.53999e-06 [parameter_eliminate]: 9.50007e-07 [a_2]: 0.00016112 [accelerated_algorithm]: 1.391e-05 [shard]: 1.00001e-06 [meta_shard_fg_expand]: 2.06e-06 [shard_inline]: 1.144e-05 [merge_send_recv]: 7.91001e-06 [auto_parallel]: 8.64998e-06 [parallel]: 3.88999e-06 [flash_sp]: 3.43e-06 [merge_comm]: 6.01e-06 [allreduce_fusion]: 5.95002e-06 [matmul_add_comm_reduction]: 8.99998e-06 [allreduce_slice_to_reducescatter]: 3.59985e-07 [virtual_shard_identity]: 1.209e-05 [virtual_dataset]: 1.122e-05 [get_grad_eliminate_]: 1.125e-05 [virtual_output]: 1.085e-05 [merge_forward]: 5.32001e-06 [cell_reuse_recompute_pass]: 1.34998e-06 [offload_activation]: 9.99001e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.013e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 1.828e-05 [set_forward_comm_id_for_comm_node_pass]: 6.44001e-06 [meta_fg_expand]: 3.71999e-06 [flash_sp_send_recv_attached]: 7.99977e-07 [receive_attached]: 9.80013e-07 [after_resolve]: 1.567e-05 [a_after_grad]: 1.779e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.15999e-06 [auto_monad_grad]: 8.39995e-07 [auto_monad_eliminator]: 2.11e-05 [cse]: 2.666e-05 [a_3]: 7.375e-05 [py_interpret_to_execute_after_opt_a]: 4.32e-06 [slice_cell_reuse_recomputed_activation]: 1.87001e-06 [rewriter_after_opt_a]: 3.798e-05 [convert_after_rewriter]: 1.35999e-06 [order_py_execute_after_rewriter]: 1.16002e-06 [mutable_eliminate]: 0.00049581 [opt_b]: 0.00037038, [1] [Cycle 1]: 0.00036489, [7] [b_1]: 0.00026335 [b_2]: 1.35e-05 [updatestate_depend_eliminate]: 8.28001e-06 [updatestate_assign_eliminate]: 6.13002e-06 [updatestate_loads_eliminate]: 8.05e-06 [renormalize]: 4.10015e-07 [cse]: 3.185e-05 [optimize_parallel_all_gather_comm]: 3.221e-05 [overlap_param_gather]: 1.214e-05 [cconv]: 2.451e-05 [loop_unroll]: 0.00043317 [opt_after_cconv]: 0.000172, [1] [Cycle 1]: 0.00016651, [7] [c_1]: 7.83e-05 [parameter_eliminate]: 2.11998e-06 [updatestate_depend_eliminate]: 8.47e-06 [updatestate_assign_eliminate]: 6.06e-06 [updatestate_loads_eliminate]: 8.04002e-06 [cse]: 3.092e-05 [renormalize]: 4.80009e-07 [remove_dup_value]: 3.021e-05 [tuple_transform]: 0.00011594, [1] [Cycle 1]: 0.00011158, [4] [d_1]: 8.069e-05 [none_parameter_eliminate]: 2.05002e-06 [renormalize]: 1.80007e-07 [switch_simplify]: 1.185e-05 [partial_unused_args_eliminate]: 1.79e-06 [add_recomputation]: 8.951e-05 [cse_after_recomputation]: 3.468e-05, [1] [Cycle 1]: 2.999e-05, [1] [cse]: 2.46e-05 [environ_conv]: 1.639e-05 [swap_dp_allreduce_reducescatter]: 2.775e-05 [bias_add_comm_swap]: 1.13e-05 [label_micro_interleaved_index]: 1.347e-05 [label_fine_grained_interleaved_index]: 2.67001e-06 [merge_cast_opt]: 1.45999e-06 [slice_recompute_activation]: 1.96003e-06 [micro_interleaved_order_control]: 2.39999e-06 [assign_add_opt]: 1.16997e-06 [ForceFp32Comm]: 1.06002e-06 [remove_cast_before_assign_add]: 1.019e-05 [full_micro_interleaved_order_control]: 1.072e-05 [reorder_send_recv_between_fp_bp]: 2.86e-06 [comm_op_add_attrs]: 1.08001e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.43002e-06 [interleave_parallel_branches]: 1.009e-05 [overlap_opt_shard_in_pipeline]: 1.438e-05 [overlap_opt_shard_grad_in_pipeline]: 1.90001e-06 [control_data_broadcast_order]: 2.096e-05 [grouped_pairwise_exchange_alltoall]: 1.50001e-06 [offloading_packed_experts]: 6.23e-06 [overlap_recompute_and_grad_model_parallel]: 1.506e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.16997e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.58998e-06 [overlap_grad_ring_attention]: 2.319e-05 [overlap_grad_flash_sp]: 5.462e-05 [begin_end_overlap_inline]: 5.39992e-07 [split_matmul_comm_elemetwise]: 1.079e-05 [split_layernorm_comm]: 1.71998e-06 [handle_group_info]: 9.60019e-07 [symbol_engine_optimizer]: 0.00013625, [1] [Cycle 1]: 0.0001321, [6] [build]: 3.317e-05 [elim_shapecalc]: 1.614e-05 [elim_not_effective]: 2.377e-05 [opt_reshape]: 1.196e-05 [fold_const_symbol]: 1.871e-05 [renormalize]: 1.90019e-07 [detach_backward]: 1.57999e-06 [pipeline_parallel_scheduler]: 1.44e-06 [auto_monad_reorder]: 5.075e-05 [get_jit_bprop_graph]: 8.80013e-07 [rewriter_after_jit_bprop_graph]: 3.06001e-06 [opt_after_jit_grad]: 0.00047298 [validate]: 5.975e-05 [backend_pass]: 9.09989e-07 [task_emit]: 2.77676 [execute]: 8.84e-06 Sums bootstrap : 0.000777s : 0.03% type_inference : 0.010747s : 0.38% event_method : 0.000009s : 0.00% auto_monad : 0.000231s : 0.01% graph_reusing : 0.000005s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000042s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000064s : 0.00% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000070s : 0.00% optimize.opt_a.loop_unroll : 0.000027s : 0.00% optimize.opt_a.a_1 : 0.000795s : 0.03% optimize.opt_a.with_stream_mark : 0.000026s : 0.00% optimize.opt_a.recompute_prepare : 0.000023s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000023s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000038s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000336s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000052s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000023s : 0.00% optimize.opt_a.merge_send_recv : 0.000057s : 0.00% optimize.opt_a.auto_parallel : 0.000018s : 0.00% optimize.opt_a.parallel : 0.000097s : 0.00% optimize.opt_a.flash_sp : 0.000038s : 0.00% optimize.opt_a.merge_comm : 0.000013s : 0.00% optimize.opt_a.allreduce_fusion : 0.000021s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000030s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000010s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000027s : 0.00% optimize.opt_a.virtual_dataset : 0.000025s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000023s : 0.00% optimize.opt_a.virtual_output : 0.000022s : 0.00% optimize.opt_a.merge_forward : 0.000012s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000032s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000052s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000039s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000023s : 0.00% optimize.opt_a.meta_fg_expand : 0.000008s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000020s : 0.00% optimize.opt_a.after_resolve : 0.000033s : 0.00% optimize.opt_a.a_after_grad : 0.000036s : 0.00% optimize.opt_a.renormalize : 0.000626s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000070s : 0.00% optimize.opt_a.cse : 0.000118s : 0.00% optimize.opt_a.a_3 : 0.000157s : 0.01% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000038s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000496s : 0.02% optimize.opt_b.b_1 : 0.000263s : 0.01% optimize.opt_b.b_2 : 0.000014s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000032s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000032s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000025s : 0.00% optimize.loop_unroll : 0.000433s : 0.02% optimize.opt_after_cconv.c_1 : 0.000078s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000008s : 0.00% optimize.opt_after_cconv.cse : 0.000031s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000030s : 0.00% optimize.tuple_transform.d_1 : 0.000081s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000012s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000090s : 0.00% optimize.cse_after_recomputation.cse : 0.000025s : 0.00% optimize.environ_conv : 0.000016s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000028s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000010s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000014s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000021s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000015s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000023s : 0.00% optimize.overlap_grad_flash_sp : 0.000055s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000033s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000024s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000012s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000019s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000051s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000473s : 0.02% validate : 0.000060s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.776762s : 99.37% execute : 0.000009s : 0.00% Time group info: ------[substitution.] 0.000230 89 2.91% : 0.000007s : 2: substitution.depend_value_elim 1.39% : 0.000003s : 7: substitution.elim_not_effective 1.18% : 0.000003s : 7: substitution.fold_const_symbol 3.68% : 0.000008s : 10: substitution.graph_param_transform 33.08% : 0.000076s : 1: substitution.inline 2.85% : 0.000007s : 14: substitution.j_node_and_user_rematch 9.83% : 0.000023s : 2: substitution.less_batch_normalization 2.79% : 0.000006s : 12: substitution.load_eliminater 8.39% : 0.000019s : 14: substitution.remove_not_recompute_node 2.31% : 0.000005s : 6: substitution.replace_old_param 3.02% : 0.000007s : 6: substitution.updatestate_pure_node_eliminater 28.59% : 0.000066s : 8: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.010689 2 96.50% : 0.010315s : 1: type_inference.infer 3.50% : 0.000374s : 1: type_inference.specialize ------[replace.] 0.000013 1 100.00% : 0.000013s : 1: replace.inline ------[match.] 0.000075 1 100.00% : 0.000075s : 1: match.inline ------[predicate.] 0.000317 2325 0.82% : 0.000003s : 21: predicate.accumulaten_eliminater 0.84% : 0.000003s : 10: predicate.ad_related_special_op_eliminate 0.79% : 0.000003s : 20: predicate.addn_check_dump 0.88% : 0.000003s : 21: predicate.addn_zero_filter 0.80% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.08% : 0.000007s : 41: predicate.arithmetic_simplify 0.85% : 0.000003s : 21: predicate.cast_eliminate 0.83% : 0.000003s : 20: predicate.check_bprop_eliminate 0.79% : 0.000002s : 20: predicate.compare_switch_simplify 0.25% : 0.000001s : 10: predicate.const_output_eliminate 0.83% : 0.000003s : 20: predicate.depend_value_elim 0.92% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 0.95% : 0.000003s : 21: predicate.dict_get_item_eliminator 0.83% : 0.000003s : 21: predicate.dict_set_item_eliminator 1.09% : 0.000003s : 20: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 10: predicate.elim_not_effective 0.51% : 0.000002s : 10: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000004s : 31: predicate.environ_add_const_eliminate 1.18% : 0.000004s : 31: predicate.environ_get_add_eliminate 1.19% : 0.000004s : 31: predicate.environ_get_depend_swap 2.01% : 0.000006s : 51: predicate.environ_get_eliminate 1.21% : 0.000004s : 31: predicate.environ_get_set_eliminate 0.85% : 0.000003s : 22: predicate.exchange_switch_depend_value 1.36% : 0.000004s : 22: predicate.float_depend_g_call 0.80% : 0.000003s : 20: predicate.float_environ_get_switch 1.16% : 0.000004s : 30: predicate.float_tuple_getitem_switch 0.26% : 0.000001s : 10: predicate.fold_const_symbol 0.90% : 0.000003s : 20: predicate.get_grad_eliminate 0.34% : 0.000001s : 10: predicate.graph_param_transform 0.84% : 0.000003s : 20: predicate.incorporate_call 0.75% : 0.000002s : 20: predicate.incorporate_call_switch 5.61% : 0.000018s : 103: predicate.inline 1.00% : 0.000003s : 20: predicate.inline_without_move 0.45% : 0.000001s : 20: predicate.j_node_and_user_rematch 1.17% : 0.000004s : 22: predicate.less_batch_normalization 1.76% : 0.000006s : 41: predicate.list_to_tuple_eliminator_ 2.53% : 0.000008s : 62: predicate.load_eliminater 0.85% : 0.000003s : 10: predicate.loop_unroll_after_grad 1.10% : 0.000003s : 25: predicate.loop_unroll_before_grad 1.73% : 0.000005s : 41: predicate.make_slice_get_slice_eliminator 0.83% : 0.000003s : 20: predicate.merge_addn 0.78% : 0.000002s : 20: predicate.micro_step_allgather_replace 0.81% : 0.000003s : 20: predicate.mini_step_allgather_replace 0.76% : 0.000002s : 21: predicate.minmaximum_grad 0.96% : 0.000003s : 10: predicate.mutable_eliminate 0.49% : 0.000002s : 10: predicate.opt_reshape 0.46% : 0.000001s : 10: predicate.parallel_virtual_node 1.08% : 0.000003s : 22: predicate.partial_defer_inline 1.41% : 0.000004s : 31: predicate.partial_eliminate 0.82% : 0.000003s : 21: predicate.print_const_string_wrapper 0.80% : 0.000003s : 20: predicate.reduce_all_const_elim 1.02% : 0.000003s : 21: predicate.reduce_eliminate 2.41% : 0.000008s : 62: predicate.redundant_stop_gradient_eliminater 0.57% : 0.000002s : 20: predicate.remove_not_recompute_node 1.20% : 0.000004s : 41: predicate.replace_applicator 0.63% : 0.000002s : 20: predicate.replace_old_param 0.29% : 0.000001s : 10: predicate.reset_defer_inline 0.86% : 0.000003s : 21: predicate.reshape_eliminate 0.81% : 0.000003s : 20: predicate.row_tensor_add_zeros_like 0.47% : 0.000001s : 10: predicate.row_tensor_eliminate 0.96% : 0.000003s : 20: predicate.same_eliminate 0.56% : 0.000002s : 20: predicate.set_cell_output_no_recompute 0.96% : 0.000003s : 20: predicate.shard_identity_eliminate 0.95% : 0.000003s : 20: predicate.special_op_eliminate 0.93% : 0.000003s : 20: predicate.specialize_transform 0.92% : 0.000003s : 20: predicate.split_environ_get_set_with_tuple_value 1.04% : 0.000003s : 20: predicate.stack_unstack_eliminate 0.47% : 0.000002s : 10: predicate.switch_call_monad_eliminater 0.93% : 0.000003s : 22: predicate.switch_defer_inline 1.72% : 0.000005s : 42: predicate.switch_layer_defer_inline 3.55% : 0.000011s : 77: predicate.switch_simplify 0.83% : 0.000003s : 21: predicate.tile_eliminate 0.84% : 0.000003s : 21: predicate.transpose_eliminate 1.72% : 0.000005s : 41: predicate.tuple_list_convert_item_index_to_positive 1.80% : 0.000006s : 41: predicate.tuple_list_get_item_const_eliminator 1.62% : 0.000005s : 41: predicate.tuple_list_get_item_depend_reorder 2.97% : 0.000009s : 61: predicate.tuple_list_get_item_eliminator 1.68% : 0.000005s : 41: predicate.tuple_list_get_set_item_eliminator 2.69% : 0.000009s : 61: predicate.tuple_list_set_item_eliminator 1.73% : 0.000005s : 41: predicate.tuple_to_list_eliminator_ 2.48% : 0.000008s : 62: predicate.updatestate_pure_node_eliminater 3.43% : 0.000011s : 82: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 10: predicate.value_based_eliminate 0.91% : 0.000003s : 20: predicate.virtual_dataset_eliminate 0.89% : 0.000003s : 20: predicate.virtual_output_eliminate 0.41% : 0.000001s : 10: predicate.virtual_view_grad_eliminate 0.46% : 0.000001s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000188 4 9.23% : 0.000017s : 1: func_graph_cloner_run.FuncGraphClonerGraph 90.77% : 0.000171s : 3: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.820104 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.28% : 0.007889s : 1: add_attr 0.28% : 0.007876s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000094s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000238s : 1: auto_monad 0.00% : 0.000055s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.03% : 0.000820s : 1: bootstrap 0.00% : 0.000028s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000024s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000038s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000020s : 1: environ_conv 0.00% : 0.000014s : 1: event_method 0.00% : 0.000015s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000017s : 1: label_micro_interleaved_index 0.02% : 0.000441s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000503s : 1: mutable_eliminate 0.00% : 0.000009s : 1: offloading_packed_experts 0.00% : 0.000019s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000020s : 1: opt.transform.mutable_eliminate 0.06% : 0.001657s : 78: opt.transform.opt_a 0.00% : 0.000077s : 1: opt.transform.opt_after_cconv 0.00% : 0.000042s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000251s : 28: opt.transform.opt_b 0.00% : 0.000090s : 2: opt.transform.opt_trans_graph 0.00% : 0.000067s : 4: opt.transform.symbol_engine_opt 0.12% : 0.003510s : 1: opt_a 0.01% : 0.000176s : 1: opt_after_cconv 0.02% : 0.000482s : 1: opt_after_jit_grad 0.01% : 0.000374s : 1: opt_b 0.21% : 0.006061s : 1: optimize 0.00% : 0.000036s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000058s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000026s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000018s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000018s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000046s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.00% : 0.000034s : 1: remove_dup_value 0.01% : 0.000356s : 1: renormalize.infer 0.01% : 0.000263s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000042s : 1: rewriter_after_opt_a 0.00% : 0.000068s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000014s : 1: split_matmul_comm_elemetwise 0.00% : 0.000031s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000139s : 1: symbol_engine_optimizer 98.47% : 2.776837s : 1: task_emit 0.00% : 0.000119s : 1: tuple_transform 0.38% : 0.010760s : 1: type_inference 0.00% : 0.000094s : 1: validate TotalTime = 0.0798141, [33] [bootstrap]: 0.00021617 [type_inference]: 0.0563981 [event_method]: 0.00022826 [auto_monad]: 0.00020449 [graph_reusing]: 9.04998e-06 [pre_auto_parallel]: 3.51001e-06 [py_interpret_to_execute]: 4.116e-05 [rewriter_before_opt_a]: 0.00013116 [expand_dump_flag]: 3.88001e-06 [jit_opt_a]: 0.00877727, [2] [Cycle 1]: 0.00415996, [27] [switch_simplify]: 0.00011805 [loop_unroll]: 5.133e-05 [a_1]: 0.00148362 [with_stream_mark]: 2.036e-05 [recompute_prepare]: 1.858e-05 [updatestate_depend_eliminate]: 2.943e-05 [updatestate_assign_eliminate]: 8.43001e-06 [updatestate_loads_eliminate]: 7.87e-06 [parameter_eliminate]: 2.09e-06 [specialize_transform]: 1.503e-05 [updatestate_useless_node_eliminater]: 1.546e-05 [accelerated_algorithm]: 2.994e-05 [meta_shard_fg_expand]: 3.73001e-06 [get_grad_eliminate_]: 1.348e-05 [merge_forward]: 7.82e-06 [cell_reuse_recompute_pass]: 1.08001e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.804e-05 [j_node_and_user_rematch]: 2.172e-05 [meta_fg_expand]: 5.59998e-06 [replace_old_param]: 1.892e-05 [inline_without_move]: 1.354e-05 [renormalize]: 0.0018562 [add_forward_monad_depend]: 6.33002e-06 [auto_monad_grad]: 1.84e-06 [auto_monad_eliminator]: 2.679e-05 [cse]: 0.00015248 [replace_applicator]: 2.234e-05 [Cycle 2]: 0.00075598, [27] [switch_simplify]: 1.51e-05 [loop_unroll]: 1.297e-05 [a_1]: 0.00034522 [with_stream_mark]: 1.377e-05 [recompute_prepare]: 1.304e-05 [updatestate_depend_eliminate]: 8.11002e-06 [updatestate_assign_eliminate]: 6.59999e-06 [updatestate_loads_eliminate]: 6.54999e-06 [parameter_eliminate]: 1.00999e-06 [specialize_transform]: 1.347e-05 [updatestate_useless_node_eliminater]: 1.642e-05 [accelerated_algorithm]: 1.577e-05 [meta_shard_fg_expand]: 2.68e-06 [get_grad_eliminate_]: 1.252e-05 [merge_forward]: 6.53e-06 [cell_reuse_recompute_pass]: 1.47999e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.364e-05 [j_node_and_user_rematch]: 2.073e-05 [meta_fg_expand]: 5.04e-06 [replace_old_param]: 1.72e-05 [inline_without_move]: 1.299e-05 [renormalize]: 6.00121e-08 [add_forward_monad_depend]: 1.39e-06 [auto_monad_grad]: 1.10001e-06 [auto_monad_eliminator]: 1.565e-05 [cse]: 3.778e-05 [replace_applicator]: 1.358e-05 [py_interpret_to_execute_after_opt_a]: 1.44e-05 [rewriter_after_opt_a]: 0.00054145 [convert_after_rewriter]: 1.612e-05 [order_py_execute_after_rewriter]: 9.27001e-06 [mutable_eliminate]: 0.00049678 [jit_opt_b]: 0.00010895, [1] [Cycle 1]: 0.00010272, [2] [frontend_op_eliminate]: 4.017e-05 [inline_after_opt_a]: 5.009e-05 [cconv]: 2.557e-05 [loop_unroll]: 0.00048548 [jit_opt_after_cconv]: 0.00027446, [1] [Cycle 1]: 0.00026797, [11] [c_1]: 6.903e-05 [parameter_eliminate]: 2.39001e-06 [updatestate_depend_eliminate]: 1.069e-05 [updatestate_assign_eliminate]: 7.05e-06 [updatestate_loads_eliminate]: 6.74001e-06 [cse]: 4.835e-05 [call_graph_tuple_transform]: 3.479e-05 [tuple_list_get_item_eliminator]: 2.103e-05 [none_parameter_eliminate]: 1.71998e-06 [renormalize]: 4.59986e-07 [switch_simplify]: 1.363e-05 [remove_dup_value]: 4.527e-05 [partial_unused_args_eliminate]: 2.33002e-06 [environ_conv]: 1.241e-05 [add_recomputation]: 9.254e-05 [cse_after_recomputation]: 4.408e-05, [1] [Cycle 1]: 3.823e-05, [1] [cse]: 3.182e-05 [auto_monad_reorder]: 3.252e-05 [get_jit_bprop_graph]: 1.60001e-06 [rewriter_after_jit_bprop_graph]: 4.68999e-06 [opt_after_jit_grad]: 0.00051172 [symbol_engine_optimizer]: 0.00012481, [1] [Cycle 1]: 0.00011881, [6] [build]: 1.217e-05 [elim_shapecalc]: 1.73e-05 [elim_not_effective]: 2.608e-05 [opt_reshape]: 1.345e-05 [fold_const_symbol]: 2.14e-05 [renormalize]: 4.39992e-07 [validate]: 5.946e-05 [backend_pass]: 1.07998e-06 [task_emit]: 0.0106634 [execute]: 7.68999e-06 Sums bootstrap : 0.000216s : 0.29% type_inference : 0.056398s : 74.89% event_method : 0.000228s : 0.30% auto_monad : 0.000204s : 0.27% graph_reusing : 0.000009s : 0.01% pre_auto_parallel : 0.000004s : 0.00% py_interpret_to_execute : 0.000041s : 0.05% rewriter_before_opt_a : 0.000131s : 0.17% expand_dump_flag : 0.000004s : 0.01% jit_opt_a.switch_simplify : 0.000133s : 0.18% jit_opt_a.loop_unroll : 0.000064s : 0.09% jit_opt_a.a_1 : 0.001829s : 2.43% jit_opt_a.with_stream_mark : 0.000034s : 0.05% jit_opt_a.recompute_prepare : 0.000032s : 0.04% jit_opt_a.updatestate_depend_eliminate : 0.000038s : 0.05% jit_opt_a.updatestate_assign_eliminate : 0.000015s : 0.02% jit_opt_a.updatestate_loads_eliminate : 0.000014s : 0.02% jit_opt_a.parameter_eliminate : 0.000003s : 0.00% jit_opt_a.specialize_transform : 0.000029s : 0.04% jit_opt_a.updatestate_useless_node_eliminater : 0.000032s : 0.04% jit_opt_a.accelerated_algorithm : 0.000046s : 0.06% jit_opt_a.meta_shard_fg_expand : 0.000006s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000026s : 0.03% jit_opt_a.merge_forward : 0.000014s : 0.02% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000052s : 0.07% jit_opt_a.j_node_and_user_rematch : 0.000042s : 0.06% jit_opt_a.meta_fg_expand : 0.000011s : 0.01% jit_opt_a.replace_old_param : 0.000036s : 0.05% jit_opt_a.inline_without_move : 0.000027s : 0.04% jit_opt_a.renormalize : 0.001856s : 2.46% jit_opt_a.add_forward_monad_depend : 0.000008s : 0.01% jit_opt_a.auto_monad_grad : 0.000003s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000042s : 0.06% jit_opt_a.cse : 0.000190s : 0.25% jit_opt_a.replace_applicator : 0.000036s : 0.05% py_interpret_to_execute_after_opt_a : 0.000014s : 0.02% rewriter_after_opt_a : 0.000541s : 0.72% convert_after_rewriter : 0.000016s : 0.02% order_py_execute_after_rewriter : 0.000009s : 0.01% mutable_eliminate : 0.000497s : 0.66% jit_opt_b.frontend_op_eliminate : 0.000040s : 0.05% jit_opt_b.inline_after_opt_a : 0.000050s : 0.07% cconv : 0.000026s : 0.03% loop_unroll : 0.000485s : 0.64% jit_opt_after_cconv.c_1 : 0.000069s : 0.09% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000011s : 0.01% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.01% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000007s : 0.01% jit_opt_after_cconv.cse : 0.000048s : 0.06% jit_opt_after_cconv.call_graph_tuple_transform : 0.000035s : 0.05% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000021s : 0.03% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000014s : 0.02% remove_dup_value : 0.000045s : 0.06% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000012s : 0.02% add_recomputation : 0.000093s : 0.12% cse_after_recomputation.cse : 0.000032s : 0.04% auto_monad_reorder : 0.000033s : 0.04% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000512s : 0.68% symbol_engine_optimizer.build : 0.000012s : 0.02% symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.02% symbol_engine_optimizer.elim_not_effective : 0.000026s : 0.03% symbol_engine_optimizer.opt_reshape : 0.000013s : 0.02% symbol_engine_optimizer.fold_const_symbol : 0.000021s : 0.03% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000059s : 0.08% backend_pass : 0.000001s : 0.00% task_emit : 0.010663s : 14.16% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.000452 147 2.89% : 0.000013s : 7: substitution.depend_value_elim 0.81% : 0.000004s : 8: substitution.elim_not_effective 0.70% : 0.000003s : 8: substitution.fold_const_symbol 2.14% : 0.000010s : 11: substitution.graph_param_transform 53.91% : 0.000244s : 10: substitution.inline 1.55% : 0.000007s : 16: substitution.j_node_and_user_rematch 3.49% : 0.000016s : 2: substitution.less_batch_normalization 2.77% : 0.000013s : 10: substitution.minmaximum_grad 2.32% : 0.000010s : 16: substitution.remove_not_recompute_node 1.25% : 0.000006s : 6: substitution.replace_old_param 1.74% : 0.000008s : 1: substitution.switch_simplify 5.46% : 0.000025s : 10: substitution.tuple_list_convert_item_index_to_positive 6.65% : 0.000030s : 12: substitution.tuple_list_get_item_depend_reorder 10.19% : 0.000046s : 18: substitution.tuple_list_get_item_eliminator 2.12% : 0.000010s : 5: substitution.updatestate_pure_node_eliminater 2.00% : 0.000009s : 7: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.056305 2 94.96% : 0.053469s : 1: type_inference.infer 5.04% : 0.002836s : 1: type_inference.specialize ------[replace.] 0.000126 15 64.46% : 0.000081s : 10: replace.inline 17.04% : 0.000021s : 1: replace.switch_simplify 11.53% : 0.000014s : 2: replace.tuple_list_get_item_depend_reorder 6.97% : 0.000009s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000263 15 90.30% : 0.000238s : 10: match.inline 2.67% : 0.000007s : 1: match.switch_simplify 5.51% : 0.000015s : 2: match.tuple_list_get_item_depend_reorder 1.52% : 0.000004s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000375 2602 1.50% : 0.000006s : 42: predicate.accumulaten_eliminater 0.91% : 0.000003s : 11: predicate.ad_related_special_op_eliminate 1.40% : 0.000005s : 42: predicate.addn_check_dump 1.54% : 0.000006s : 42: predicate.addn_zero_filter 2.01% : 0.000008s : 42: predicate.arithmetic_simplify 1.59% : 0.000006s : 42: predicate.cast_eliminate 0.45% : 0.000002s : 11: predicate.check_bprop_eliminate 1.39% : 0.000005s : 42: predicate.compare_switch_simplify 1.54% : 0.000006s : 42: predicate.depend_value_elim 1.46% : 0.000005s : 42: predicate.dict_get_item_const_eliminator 1.49% : 0.000006s : 42: predicate.dict_get_item_eliminator 1.46% : 0.000005s : 42: predicate.dict_set_item_eliminator 0.52% : 0.000002s : 11: predicate.dumpgradient_eliminate 0.27% : 0.000001s : 11: predicate.elim_not_effective 0.49% : 0.000002s : 11: predicate.elim_shapecalc_of_broadcastargs 1.46% : 0.000005s : 42: predicate.environ_add_const_eliminate 1.42% : 0.000005s : 42: predicate.environ_get_add_eliminate 1.38% : 0.000005s : 42: predicate.environ_get_depend_swap 1.52% : 0.000006s : 42: predicate.environ_get_eliminate 1.39% : 0.000005s : 42: predicate.environ_get_set_eliminate 0.24% : 0.000001s : 11: predicate.fold_const_symbol 0.98% : 0.000004s : 22: predicate.get_grad_eliminate 0.27% : 0.000001s : 11: predicate.graph_param_transform 4.37% : 0.000016s : 78: predicate.inline 0.92% : 0.000003s : 22: predicate.inline_without_move 0.42% : 0.000002s : 22: predicate.j_node_and_user_rematch 1.10% : 0.000004s : 22: predicate.less_batch_normalization 1.65% : 0.000006s : 46: predicate.list_to_tuple_eliminator_ 2.08% : 0.000008s : 57: predicate.load_eliminater 0.92% : 0.000003s : 11: predicate.loop_unroll_after_grad 2.58% : 0.000010s : 67: predicate.loop_unroll_before_grad 1.97% : 0.000007s : 55: predicate.make_slice_get_slice_eliminator 1.39% : 0.000005s : 42: predicate.merge_addn 1.50% : 0.000006s : 42: predicate.minmaximum_grad 1.05% : 0.000004s : 11: predicate.mutable_eliminate 0.45% : 0.000002s : 11: predicate.opt_reshape 2.41% : 0.000009s : 57: predicate.partial_eliminate 1.42% : 0.000005s : 42: predicate.print_const_string_wrapper 1.88% : 0.000007s : 42: predicate.reduce_eliminate 1.64% : 0.000006s : 46: predicate.redundant_stop_gradient_eliminater 0.44% : 0.000002s : 22: predicate.remove_not_recompute_node 1.74% : 0.000007s : 68: predicate.replace_applicator 0.48% : 0.000002s : 22: predicate.replace_old_param 0.25% : 0.000001s : 11: predicate.reset_defer_inline 1.46% : 0.000005s : 42: predicate.reshape_eliminate 1.50% : 0.000006s : 42: predicate.row_tensor_add_zeros_like 0.59% : 0.000002s : 11: predicate.row_tensor_eliminate 1.53% : 0.000006s : 42: predicate.same_eliminate 0.57% : 0.000002s : 25: predicate.set_cell_output_no_recompute 0.94% : 0.000004s : 22: predicate.special_op_eliminate 0.94% : 0.000004s : 22: predicate.specialize_transform 1.73% : 0.000006s : 42: predicate.split_environ_get_set_with_tuple_value 1.56% : 0.000006s : 42: predicate.stack_unstack_eliminate 0.44% : 0.000002s : 11: predicate.switch_call_monad_eliminater 2.37% : 0.000009s : 56: predicate.switch_defer_inline 2.19% : 0.000008s : 56: predicate.switch_layer_defer_inline 5.58% : 0.000021s : 136: predicate.switch_simplify 1.44% : 0.000005s : 42: predicate.tile_eliminate 1.47% : 0.000006s : 42: predicate.transpose_eliminate 1.86% : 0.000007s : 42: predicate.tuple_list_convert_item_index_to_positive 1.71% : 0.000006s : 44: predicate.tuple_list_get_item_depend_reorder 3.39% : 0.000013s : 68: predicate.tuple_list_get_item_eliminator 1.89% : 0.000007s : 44: predicate.tuple_list_set_item_eliminator 1.61% : 0.000006s : 46: predicate.tuple_to_list_eliminator_ 2.00% : 0.000008s : 57: predicate.updatestate_pure_node_eliminater 3.10% : 0.000012s : 79: predicate.updatestate_useless_node_eliminater 1.79% : 0.000007s : 42: predicate.value_based_eliminate 0.38% : 0.000001s : 11: predicate.virtual_view_grad_eliminate 0.62% : 0.000002s : 11: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002043 19 62.12% : 0.001269s : 7: func_graph_cloner_run.FuncGraphClonerGraph 37.88% : 0.000774s : 12: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.084359 76 0.11% : 0.000095s : 1: add_recomputation 0.25% : 0.000213s : 1: auto_monad 0.04% : 0.000035s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.27% : 0.000227s : 1: bootstrap 0.03% : 0.000028s : 1: cconv 0.02% : 0.000019s : 1: convert_after_rewriter 0.05% : 0.000046s : 1: cse_after_recomputation 0.02% : 0.000015s : 1: environ_conv 0.28% : 0.000236s : 1: event_method 0.01% : 0.000011s : 1: execute 0.01% : 0.000006s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000012s : 1: graph_reusing 10.41% : 0.008780s : 1: jit_opt_a 0.33% : 0.000277s : 1: jit_opt_after_cconv 0.13% : 0.000112s : 1: jit_opt_b 0.59% : 0.000494s : 1: loop_unroll 0.60% : 0.000505s : 1: mutable_eliminate 2.79% : 0.002353s : 26: opt.transform.jit_opt_a 0.16% : 0.000135s : 4: opt.transform.jit_opt_after_cconv 0.10% : 0.000084s : 4: opt.transform.jit_opt_b 0.03% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000025s : 1: opt.transform.mutable_eliminate 0.06% : 0.000047s : 1: opt.transform.opt_after_jit_grad 0.09% : 0.000075s : 4: opt.transform.symbol_engine_opt 0.62% : 0.000520s : 1: opt_after_jit_grad 0.01% : 0.000011s : 1: order_py_execute_after_rewriter 0.01% : 0.000004s : 1: partial_unused_args_eliminate 0.01% : 0.000006s : 1: pre_auto_parallel 0.05% : 0.000044s : 1: py_interpret_to_execute 0.02% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.06% : 0.000048s : 1: remove_dup_value 1.16% : 0.000978s : 1: renormalize.infer 1.03% : 0.000871s : 1: renormalize.specialize 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.65% : 0.000545s : 1: rewriter_after_opt_a 0.16% : 0.000134s : 1: rewriter_before_opt_a 0.15% : 0.000127s : 1: symbol_engine_optimizer 12.65% : 0.010672s : 1: task_emit 66.88% : 0.056415s : 1: type_inference 0.12% : 0.000098s : 1: validate TotalTime = 0.125771, [33] [bootstrap]: 0.00091976 [type_inference]: 0.0314462 [event_method]: 6.629e-05 [auto_monad]: 0.00038201 [graph_reusing]: 9.64e-06 [pre_auto_parallel]: 3.18e-06 [py_interpret_to_execute]: 6.892e-05 [rewriter_before_opt_a]: 0.0002249 [expand_dump_flag]: 5.643e-05 [jit_opt_a]: 0.0768619, [4] [Cycle 1]: 0.0573572, [27] [switch_simplify]: 0.00014809 [loop_unroll]: 9.591e-05 [a_1]: 0.00274747 [with_stream_mark]: 3.792e-05 [recompute_prepare]: 5.754e-05 [updatestate_depend_eliminate]: 8.591e-05 [updatestate_assign_eliminate]: 2.445e-05 [updatestate_loads_eliminate]: 5.838e-05 [parameter_eliminate]: 2.80002e-06 [specialize_transform]: 3.85e-05 [updatestate_useless_node_eliminater]: 4.717e-05 [accelerated_algorithm]: 8.547e-05 [meta_shard_fg_expand]: 7.59002e-06 [get_grad_eliminate_]: 3.372e-05 [merge_forward]: 2.261e-05 [cell_reuse_recompute_pass]: 1.19e-06 [cell_reuse_handle_not_recompute_node_pass]: 6.731e-05 [j_node_and_user_rematch]: 6.111e-05 [meta_fg_expand]: 0.0362723 [replace_old_param]: 0.00018306 [inline_without_move]: 0.00018131 [renormalize]: 0.0158594 [add_forward_monad_depend]: 2.589e-05 [auto_monad_grad]: 1.554e-05 [auto_monad_eliminator]: 0.00020029 [cse]: 0.0004435 [replace_applicator]: 0.000308 [Cycle 2]: 0.0113895, [27] [switch_simplify]: 0.00014965 [loop_unroll]: 0.0001449 [a_1]: 0.00565776 [with_stream_mark]: 3.807e-05 [recompute_prepare]: 4.69e-05 [updatestate_depend_eliminate]: 4.583e-05 [updatestate_assign_eliminate]: 2.57e-05 [updatestate_loads_eliminate]: 2.768e-05 [parameter_eliminate]: 4.47e-06 [specialize_transform]: 4.327e-05 [updatestate_useless_node_eliminater]: 0.00011526 [accelerated_algorithm]: 3.363e-05 [meta_shard_fg_expand]: 5.69e-06 [get_grad_eliminate_]: 2.461e-05 [merge_forward]: 1.324e-05 [cell_reuse_recompute_pass]: 1.32e-06 [cell_reuse_handle_not_recompute_node_pass]: 4.723e-05 [j_node_and_user_rematch]: 4.261e-05 [meta_fg_expand]: 5.849e-05 [replace_old_param]: 3.069e-05 [inline_without_move]: 2.449e-05 [renormalize]: 0.00432968 [add_forward_monad_depend]: 4.45e-06 [auto_monad_grad]: 1.63002e-06 [auto_monad_eliminator]: 4.354e-05 [cse]: 0.00020161 [replace_applicator]: 3.17e-05 [Cycle 3]: 0.00213523, [27] [switch_simplify]: 2.233e-05 [loop_unroll]: 2.12e-05 [a_1]: 0.00059962 [with_stream_mark]: 1.918e-05 [recompute_prepare]: 2.095e-05 [updatestate_depend_eliminate]: 4.191e-05 [updatestate_assign_eliminate]: 1.116e-05 [updatestate_loads_eliminate]: 1.205e-05 [parameter_eliminate]: 1.59998e-06 [specialize_transform]: 1.989e-05 [updatestate_useless_node_eliminater]: 5.639e-05 [accelerated_algorithm]: 2.511e-05 [meta_shard_fg_expand]: 3.45e-06 [get_grad_eliminate_]: 1.698e-05 [merge_forward]: 9.05001e-06 [cell_reuse_recompute_pass]: 1.45999e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.041e-05 [j_node_and_user_rematch]: 2.712e-05 [meta_fg_expand]: 6.64001e-06 [replace_old_param]: 1.937e-05 [inline_without_move]: 1.644e-05 [renormalize]: 0.000848 [add_forward_monad_depend]: 4.48001e-06 [auto_monad_grad]: 1.04e-06 [auto_monad_eliminator]: 3.182e-05 [cse]: 8.998e-05 [replace_applicator]: 2.518e-05 [Cycle 4]: 0.00098692, [27] [switch_simplify]: 1.734e-05 [loop_unroll]: 1.666e-05 [a_1]: 0.00048263 [with_stream_mark]: 1.444e-05 [recompute_prepare]: 1.675e-05 [updatestate_depend_eliminate]: 9.31e-06 [updatestate_assign_eliminate]: 9.04e-06 [updatestate_loads_eliminate]: 1.048e-05 [parameter_eliminate]: 1.02e-06 [specialize_transform]: 1.667e-05 [updatestate_useless_node_eliminater]: 2.282e-05 [accelerated_algorithm]: 2.489e-05 [meta_shard_fg_expand]: 3.43e-06 [get_grad_eliminate_]: 1.608e-05 [merge_forward]: 8.13001e-06 [cell_reuse_recompute_pass]: 1.60999e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.026e-05 [j_node_and_user_rematch]: 2.73e-05 [meta_fg_expand]: 6.65002e-06 [replace_old_param]: 1.935e-05 [inline_without_move]: 1.63e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.43002e-06 [auto_monad_grad]: 9.50007e-07 [auto_monad_eliminator]: 2.631e-05 [cse]: 5.147e-05 [replace_applicator]: 1.732e-05 [py_interpret_to_execute_after_opt_a]: 1.705e-05 [rewriter_after_opt_a]: 0.00028222 [convert_after_rewriter]: 1.598e-05 [order_py_execute_after_rewriter]: 1.054e-05 [mutable_eliminate]: 0.00057994 [jit_opt_b]: 0.00012698, [1] [Cycle 1]: 0.00011992, [2] [frontend_op_eliminate]: 5.719e-05 [inline_after_opt_a]: 4.953e-05 [cconv]: 2.508e-05 [loop_unroll]: 0.00044761 [jit_opt_after_cconv]: 0.00036341, [1] [Cycle 1]: 0.0003564, [11] [c_1]: 0.00010233 [parameter_eliminate]: 2.60997e-06 [updatestate_depend_eliminate]: 1.428e-05 [updatestate_assign_eliminate]: 1.054e-05 [updatestate_loads_eliminate]: 1.205e-05 [cse]: 7.506e-05 [call_graph_tuple_transform]: 4.53e-05 [tuple_list_get_item_eliminator]: 1.756e-05 [none_parameter_eliminate]: 1.91998e-06 [renormalize]: 4.60015e-07 [switch_simplify]: 1.808e-05 [remove_dup_value]: 6.86e-05 [partial_unused_args_eliminate]: 2.46e-06 [environ_conv]: 1.535e-05 [add_recomputation]: 0.000104 [cse_after_recomputation]: 6.121e-05, [1] [Cycle 1]: 5.477e-05, [1] [cse]: 4.641e-05 [auto_monad_reorder]: 5.615e-05 [get_jit_bprop_graph]: 1.67999e-06 [rewriter_after_jit_bprop_graph]: 4.52998e-06 [opt_after_jit_grad]: 0.00053123 [symbol_engine_optimizer]: 0.00016243, [1] [Cycle 1]: 0.00015583, [6] [build]: 1.43e-05 [elim_shapecalc]: 2.233e-05 [elim_not_effective]: 3.498e-05 [opt_reshape]: 2.357e-05 [fold_const_symbol]: 3.037e-05 [renormalize]: 4.19997e-07 [validate]: 0.00014951 [backend_pass]: 1.32999e-06 [task_emit]: 0.0124581 [execute]: 7.20998e-06 Sums bootstrap : 0.000920s : 0.77% type_inference : 0.031446s : 26.27% event_method : 0.000066s : 0.06% auto_monad : 0.000382s : 0.32% graph_reusing : 0.000010s : 0.01% pre_auto_parallel : 0.000003s : 0.00% py_interpret_to_execute : 0.000069s : 0.06% rewriter_before_opt_a : 0.000225s : 0.19% expand_dump_flag : 0.000056s : 0.05% jit_opt_a.switch_simplify : 0.000337s : 0.28% jit_opt_a.loop_unroll : 0.000279s : 0.23% jit_opt_a.a_1 : 0.009487s : 7.93% jit_opt_a.with_stream_mark : 0.000110s : 0.09% jit_opt_a.recompute_prepare : 0.000142s : 0.12% jit_opt_a.updatestate_depend_eliminate : 0.000183s : 0.15% jit_opt_a.updatestate_assign_eliminate : 0.000070s : 0.06% jit_opt_a.updatestate_loads_eliminate : 0.000109s : 0.09% jit_opt_a.parameter_eliminate : 0.000010s : 0.01% jit_opt_a.specialize_transform : 0.000118s : 0.10% jit_opt_a.updatestate_useless_node_eliminater : 0.000242s : 0.20% jit_opt_a.accelerated_algorithm : 0.000169s : 0.14% jit_opt_a.meta_shard_fg_expand : 0.000020s : 0.02% jit_opt_a.get_grad_eliminate_ : 0.000091s : 0.08% jit_opt_a.merge_forward : 0.000053s : 0.04% jit_opt_a.cell_reuse_recompute_pass : 0.000006s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000175s : 0.15% jit_opt_a.j_node_and_user_rematch : 0.000158s : 0.13% jit_opt_a.meta_fg_expand : 0.036344s : 30.36% jit_opt_a.replace_old_param : 0.000252s : 0.21% jit_opt_a.inline_without_move : 0.000239s : 0.20% jit_opt_a.renormalize : 0.021037s : 17.58% jit_opt_a.add_forward_monad_depend : 0.000036s : 0.03% jit_opt_a.auto_monad_grad : 0.000019s : 0.02% jit_opt_a.auto_monad_eliminator : 0.000302s : 0.25% jit_opt_a.cse : 0.000787s : 0.66% jit_opt_a.replace_applicator : 0.000382s : 0.32% py_interpret_to_execute_after_opt_a : 0.000017s : 0.01% rewriter_after_opt_a : 0.000282s : 0.24% convert_after_rewriter : 0.000016s : 0.01% order_py_execute_after_rewriter : 0.000011s : 0.01% mutable_eliminate : 0.000580s : 0.48% jit_opt_b.frontend_op_eliminate : 0.000057s : 0.05% jit_opt_b.inline_after_opt_a : 0.000050s : 0.04% cconv : 0.000025s : 0.02% loop_unroll : 0.000448s : 0.37% jit_opt_after_cconv.c_1 : 0.000102s : 0.09% jit_opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000014s : 0.01% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000011s : 0.01% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000012s : 0.01% jit_opt_after_cconv.cse : 0.000075s : 0.06% jit_opt_after_cconv.call_graph_tuple_transform : 0.000045s : 0.04% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000018s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000018s : 0.02% remove_dup_value : 0.000069s : 0.06% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000015s : 0.01% add_recomputation : 0.000104s : 0.09% cse_after_recomputation.cse : 0.000046s : 0.04% auto_monad_reorder : 0.000056s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000531s : 0.44% symbol_engine_optimizer.build : 0.000014s : 0.01% symbol_engine_optimizer.elim_shapecalc : 0.000022s : 0.02% symbol_engine_optimizer.elim_not_effective : 0.000035s : 0.03% symbol_engine_optimizer.opt_reshape : 0.000024s : 0.02% symbol_engine_optimizer.fold_const_symbol : 0.000030s : 0.03% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000150s : 0.12% backend_pass : 0.000001s : 0.00% task_emit : 0.012458s : 10.41% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.003533 557 0.67% : 0.000024s : 11: substitution.depend_value_elim 0.14% : 0.000005s : 11: substitution.elim_not_effective 0.31% : 0.000011s : 6: substitution.environ_get_add_eliminate 0.17% : 0.000006s : 4: substitution.environ_get_depend_swap 0.23% : 0.000008s : 6: substitution.environ_get_eliminate 0.50% : 0.000018s : 6: substitution.environ_get_set_eliminate 0.13% : 0.000005s : 11: substitution.fold_const_symbol 39.15% : 0.001383s : 8: substitution.getattr_setattr_resolve 0.35% : 0.000012s : 15: substitution.graph_param_transform 34.89% : 0.001233s : 33: substitution.inline 1.26% : 0.000045s : 6: substitution.inline_without_move 0.79% : 0.000028s : 67: substitution.j_node_and_user_rematch 1.49% : 0.000053s : 19: substitution.less_batch_normalization 0.33% : 0.000012s : 17: substitution.load_eliminater 0.67% : 0.000024s : 21: substitution.minmaximum_grad 0.04% : 0.000001s : 2: substitution.opt_reshape 2.20% : 0.000078s : 12: substitution.partial_eliminate 1.22% : 0.000043s : 67: substitution.remove_not_recompute_node 2.03% : 0.000072s : 28: substitution.replace_applicator 0.57% : 0.000020s : 28: substitution.replace_old_param 0.93% : 0.000033s : 10: substitution.reshape_eliminate 0.16% : 0.000006s : 2: substitution.set_cell_output_no_recompute 1.59% : 0.000056s : 10: substitution.split_environ_get_set_with_tuple_value 1.43% : 0.000050s : 21: substitution.tuple_list_convert_item_index_to_positive 0.95% : 0.000034s : 21: substitution.tuple_list_get_item_depend_reorder 3.54% : 0.000125s : 57: substitution.tuple_list_get_item_eliminator 0.64% : 0.000023s : 24: substitution.updatestate_pure_node_eliminater 3.64% : 0.000129s : 34: substitution.updatestate_useless_node_eliminater ------[type_inference.] 0.031357 2 93.97% : 0.029466s : 1: type_inference.infer 6.03% : 0.001890s : 1: type_inference.specialize ------[replace.] 0.000987 85 2.29% : 0.000023s : 2: replace.environ_get_set_eliminate 9.51% : 0.000094s : 6: replace.getattr_setattr_resolve 41.05% : 0.000405s : 33: replace.inline 2.38% : 0.000023s : 2: replace.partial_eliminate 6.08% : 0.000060s : 3: replace.replace_applicator 31.17% : 0.000308s : 36: replace.tuple_list_get_item_eliminator 7.52% : 0.000074s : 3: replace.updatestate_useless_node_eliminater ------[match.] 0.002636 85 0.39% : 0.000010s : 2: match.environ_get_set_eliminate 49.18% : 0.001296s : 6: match.getattr_setattr_resolve 46.05% : 0.001214s : 33: match.inline 0.34% : 0.000009s : 2: match.partial_eliminate 0.65% : 0.000017s : 3: match.replace_applicator 2.60% : 0.000068s : 36: match.tuple_list_get_item_eliminator 0.79% : 0.000021s : 3: match.updatestate_useless_node_eliminater ------[predicate.] 0.001756 12211 1.54% : 0.000027s : 209: predicate.accumulaten_eliminater 0.24% : 0.000004s : 15: predicate.ad_related_special_op_eliminate 1.49% : 0.000026s : 209: predicate.addn_check_dump 1.54% : 0.000027s : 209: predicate.addn_zero_filter 2.07% : 0.000036s : 209: predicate.arithmetic_simplify 1.56% : 0.000027s : 209: predicate.cast_eliminate 0.13% : 0.000002s : 15: predicate.check_bprop_eliminate 1.53% : 0.000027s : 209: predicate.compare_switch_simplify 1.57% : 0.000028s : 209: predicate.depend_value_elim 1.50% : 0.000026s : 211: predicate.dict_get_item_const_eliminator 1.59% : 0.000028s : 211: predicate.dict_get_item_eliminator 1.51% : 0.000027s : 211: predicate.dict_set_item_eliminator 0.14% : 0.000002s : 15: predicate.dumpgradient_eliminate 0.07% : 0.000001s : 15: predicate.elim_not_effective 0.15% : 0.000003s : 15: predicate.elim_shapecalc_of_broadcastargs 1.52% : 0.000027s : 209: predicate.environ_add_const_eliminate 1.51% : 0.000026s : 211: predicate.environ_get_add_eliminate 1.49% : 0.000026s : 209: predicate.environ_get_depend_swap 1.56% : 0.000027s : 211: predicate.environ_get_eliminate 1.52% : 0.000027s : 211: predicate.environ_get_set_eliminate 0.07% : 0.000001s : 15: predicate.fold_const_symbol 0.73% : 0.000013s : 88: predicate.get_grad_eliminate 0.59% : 0.000010s : 40: predicate.getattr_setattr_resolve 0.07% : 0.000001s : 15: predicate.graph_param_transform 3.62% : 0.000064s : 312: predicate.inline 1.76% : 0.000031s : 192: predicate.inline_without_move 0.35% : 0.000006s : 88: predicate.j_node_and_user_rematch 0.89% : 0.000016s : 93: predicate.less_batch_normalization 1.86% : 0.000033s : 247: predicate.list_to_tuple_eliminator_ 2.05% : 0.000036s : 264: predicate.load_eliminater 0.26% : 0.000005s : 15: predicate.loop_unroll_after_grad 2.50% : 0.000044s : 313: predicate.loop_unroll_before_grad 1.71% : 0.000030s : 226: predicate.make_slice_get_slice_eliminator 1.48% : 0.000026s : 209: predicate.merge_addn 1.53% : 0.000027s : 209: predicate.minmaximum_grad 0.27% : 0.000005s : 15: predicate.mutable_eliminate 0.14% : 0.000002s : 15: predicate.opt_reshape 3.83% : 0.000067s : 264: predicate.partial_eliminate 1.52% : 0.000027s : 209: predicate.print_const_string_wrapper 1.92% : 0.000034s : 209: predicate.reduce_eliminate 1.87% : 0.000033s : 249: predicate.redundant_stop_gradient_eliminater 0.37% : 0.000007s : 88: predicate.remove_not_recompute_node 2.51% : 0.000044s : 470: predicate.replace_applicator 0.85% : 0.000015s : 192: predicate.replace_old_param 0.08% : 0.000001s : 15: predicate.reset_defer_inline 1.60% : 0.000028s : 209: predicate.reshape_eliminate 1.54% : 0.000027s : 209: predicate.row_tensor_add_zeros_like 0.16% : 0.000003s : 15: predicate.row_tensor_eliminate 1.53% : 0.000027s : 209: predicate.same_eliminate 0.55% : 0.000010s : 116: predicate.set_cell_output_no_recompute 0.25% : 0.000004s : 30: predicate.special_op_eliminate 0.94% : 0.000017s : 107: predicate.specialize_transform 1.84% : 0.000032s : 209: predicate.split_environ_get_set_with_tuple_value 1.56% : 0.000027s : 209: predicate.stack_unstack_eliminate 0.13% : 0.000002s : 15: predicate.switch_call_monad_eliminater 2.88% : 0.000051s : 282: predicate.switch_defer_inline 2.35% : 0.000041s : 282: predicate.switch_layer_defer_inline 5.15% : 0.000090s : 610: predicate.switch_simplify 1.57% : 0.000028s : 209: predicate.tile_eliminate 3.47% : 0.000061s : 209: predicate.transpose_eliminate 1.96% : 0.000034s : 211: predicate.tuple_list_convert_item_index_to_positive 1.78% : 0.000031s : 211: predicate.tuple_list_get_item_depend_reorder 2.77% : 0.000049s : 277: predicate.tuple_list_get_item_eliminator 1.84% : 0.000032s : 211: predicate.tuple_list_set_item_eliminator 1.86% : 0.000033s : 247: predicate.tuple_to_list_eliminator_ 2.02% : 0.000035s : 264: predicate.updatestate_pure_node_eliminater 3.00% : 0.000053s : 357: predicate.updatestate_useless_node_eliminater 1.92% : 0.000034s : 209: predicate.value_based_eliminate 0.10% : 0.000002s : 15: predicate.virtual_view_grad_eliminate 0.17% : 0.000003s : 15: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.004987 87 57.99% : 0.002892s : 38: func_graph_cloner_run.FuncGraphClonerGraph 6.78% : 0.000338s : 8: func_graph_cloner_run.FuncGraphClonerNode 35.23% : 0.001757s : 41: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.160804 110 0.07% : 0.000107s : 1: add_recomputation 0.24% : 0.000389s : 1: auto_monad 0.04% : 0.000059s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.59% : 0.000941s : 1: bootstrap 0.02% : 0.000028s : 1: cconv 0.01% : 0.000018s : 1: convert_after_rewriter 0.04% : 0.000064s : 1: cse_after_recomputation 0.01% : 0.000018s : 1: environ_conv 0.04% : 0.000071s : 1: event_method 0.01% : 0.000011s : 1: execute 0.04% : 0.000060s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 47.80% : 0.076865s : 1: jit_opt_a 0.23% : 0.000366s : 1: jit_opt_after_cconv 0.08% : 0.000130s : 1: jit_opt_b 0.28% : 0.000455s : 1: loop_unroll 0.37% : 0.000588s : 1: mutable_eliminate 7.45% : 0.011984s : 52: opt.transform.jit_opt_a 0.11% : 0.000179s : 4: opt.transform.jit_opt_after_cconv 0.06% : 0.000100s : 4: opt.transform.jit_opt_b 0.02% : 0.000028s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000030s : 1: opt.transform.mutable_eliminate 0.04% : 0.000062s : 1: opt.transform.opt_after_jit_grad 0.98% : 0.001573s : 4: opt.transform.opt_resolve 0.07% : 0.000108s : 4: opt.transform.symbol_engine_opt 0.34% : 0.000540s : 1: opt_after_jit_grad 0.01% : 0.000013s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pre_auto_parallel 0.05% : 0.000073s : 1: py_interpret_to_execute 0.01% : 0.000020s : 1: py_interpret_to_execute_after_opt_a 0.04% : 0.000072s : 1: remove_dup_value 9.58% : 0.015412s : 3: renormalize.infer 3.48% : 0.005603s : 3: renormalize.specialize 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.18% : 0.000286s : 1: rewriter_after_opt_a 0.14% : 0.000229s : 1: rewriter_before_opt_a 0.10% : 0.000165s : 1: symbol_engine_optimizer 7.75% : 0.012467s : 1: task_emit 19.56% : 0.031458s : 1: type_inference 0.12% : 0.000197s : 1: validate group_cases_22 have all been run, results of sub cases are below: case: (1, ) {} pass. case: ('pynative',) {} pass. case: ('pynative',) {} pass. case: ('pynative',) {} pass. case: ('pynative',) {} pass. case: ('KBK',) {} pass. case: ('KBK',) {} pass. case: ('KBK',) {} pass. ops group_cases_23 with 8 cases start to running, all cases are below: case: (, 'KBK') case: (, 0) case: (, 1) case: (, 'KBK') case: (, 'GRAPH') case: (, 'pynative') case: (, 'KBK') case: (, 'pynative') ops group_cases_23 total running memory: 604M, memory threshold: 51200M TotalTime = 2.57933, [24] [bootstrap]: 0.00090502 [type_inference]: 0.210817 [event_method]: 0.00044904 [auto_monad]: 0.00015519 [graph_reusing]: 6.61999e-06 [inline]: 2.70997e-06 [add_attr]: 0.00784537, [1] [add_attr_with_inline]: 0.00783231, [1] [Cycle 1]: 0.0001428, [2] [tag_attr]: 4.242e-05 [meta_addattr_fg_expand]: 1.694e-05 [parallel-infer-symbol]: 3.47002e-06 [pre_auto_parallel]: 5.853e-05 [insert-virtual-dataset]: 2.56998e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 2.01998e-06 [pipeline_split]: 1.65001e-06 [optimize]: 0.00848438, [53] [py_interpret_to_execute]: 4.50001e-06 [rewriter_before_opt_a]: 0.00027046 [opt_a]: 0.00577401, [2] [Cycle 1]: 0.00497189, [45] [expand_dump_flag]: 4.15e-06 [switch_simplify]: 8.224e-05 [loop_unroll]: 3.428e-05 [a_1]: 0.00068177 [with_stream_mark]: 1.611e-05 [recompute_prepare]: 1.046e-05 [updatestate_depend_eliminate]: 1.459e-05 [updatestate_assign_eliminate]: 1.33e-05 [updatestate_loads_eliminate]: 3.43e-06 [parameter_eliminate]: 2.07001e-06 [a_2]: 0.00010997 [accelerated_algorithm]: 8.75001e-06 [shard]: 1.92001e-06 [meta_shard_fg_expand]: 2.49999e-06 [shard_inline]: 8.2e-06 [merge_send_recv]: 4.673e-05 [auto_parallel]: 6.46e-06 [parallel]: 0.00010129 [flash_sp]: 3.627e-05 [merge_comm]: 5.09e-06 [allreduce_fusion]: 1.245e-05 [matmul_add_comm_reduction]: 1.894e-05 [allreduce_slice_to_reducescatter]: 9.62999e-06 [virtual_shard_identity]: 1.124e-05 [virtual_dataset]: 8.92e-06 [get_grad_eliminate_]: 8.22e-06 [virtual_output]: 8.40001e-06 [merge_forward]: 4.42e-06 [cell_reuse_recompute_pass]: 1.63002e-06 [offload_activation]: 1.873e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.316e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.241e-05 [set_forward_comm_id_for_comm_node_pass]: 1.418e-05 [meta_fg_expand]: 4.54998e-06 [flash_sp_send_recv_attached]: 2.59001e-06 [receive_attached]: 1.963e-05 [after_resolve]: 1.199e-05 [a_after_grad]: 1.254e-05 [renormalize]: 0.0031442 [add_forward_monad_depend]: 6.07999e-06 [auto_monad_grad]: 2.22001e-06 [auto_monad_eliminator]: 2.816e-05 [cse]: 8.67e-05 [a_3]: 6.361e-05 [Cycle 2]: 0.00079144, [45] [expand_dump_flag]: 2.09e-06 [switch_simplify]: 9.11002e-06 [loop_unroll]: 8.02e-06 [a_1]: 0.00020203 [with_stream_mark]: 1.372e-05 [recompute_prepare]: 8.3e-06 [updatestate_depend_eliminate]: 3.76001e-06 [updatestate_assign_eliminate]: 2.99999e-06 [updatestate_loads_eliminate]: 3.08998e-06 [parameter_eliminate]: 9.60019e-07 [a_2]: 9.928e-05 [accelerated_algorithm]: 7.93999e-06 [shard]: 1.27e-06 [meta_shard_fg_expand]: 1.79998e-06 [shard_inline]: 7.64002e-06 [merge_send_recv]: 5.49998e-06 [auto_parallel]: 6.78e-06 [parallel]: 5.35999e-06 [flash_sp]: 1.355e-05 [merge_comm]: 3.58e-06 [allreduce_fusion]: 3.35998e-06 [matmul_add_comm_reduction]: 6.19001e-06 [allreduce_slice_to_reducescatter]: 4.40021e-07 [virtual_shard_identity]: 8.85999e-06 [virtual_dataset]: 8e-06 [get_grad_eliminate_]: 7.45e-06 [virtual_output]: 8.05999e-06 [merge_forward]: 3.38e-06 [cell_reuse_recompute_pass]: 1.69998e-06 [offload_activation]: 7.00002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.519e-05 [merge_recompute_call_nodes]: 1.04e-06 [before_grad]: 1.059e-05 [set_forward_comm_id_for_comm_node_pass]: 3.71001e-06 [meta_fg_expand]: 2.48e-06 [flash_sp_send_recv_attached]: 1.01997e-06 [receive_attached]: 1.18001e-06 [after_resolve]: 1.095e-05 [a_after_grad]: 1.167e-05 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.47999e-06 [auto_monad_grad]: 1.17e-06 [auto_monad_eliminator]: 7.20003e-06 [cse]: 2.164e-05 [a_3]: 4.837e-05 [py_interpret_to_execute_after_opt_a]: 4.57e-06 [slice_cell_reuse_recomputed_activation]: 1.92999e-06 [rewriter_after_opt_a]: 3.253e-05 [convert_after_rewriter]: 1.22e-06 [order_py_execute_after_rewriter]: 1.13001e-06 [mutable_eliminate]: 0.00063982 [opt_b]: 0.00029899, [1] [Cycle 1]: 0.00029231, [7] [b_1]: 0.00016363 [b_2]: 9.19e-06 [updatestate_depend_eliminate]: 5.92999e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 2.74001e-06 [renormalize]: 4.10015e-07 [cse]: 2.787e-05 [optimize_parallel_all_gather_comm]: 2.853e-05 [overlap_param_gather]: 1.225e-05 [cconv]: 2.546e-05 [loop_unroll]: 0.00047004 [opt_after_cconv]: 0.00012151, [1] [Cycle 1]: 0.0001159, [7] [c_1]: 4.227e-05 [parameter_eliminate]: 2.49001e-06 [updatestate_depend_eliminate]: 5.97999e-06 [updatestate_assign_eliminate]: 3.18e-06 [updatestate_loads_eliminate]: 2.81999e-06 [cse]: 2.52e-05 [renormalize]: 4.09986e-07 [remove_dup_value]: 4.042e-05 [tuple_transform]: 8.299e-05, [1] [Cycle 1]: 7.831e-05, [4] [d_1]: 5.014e-05 [none_parameter_eliminate]: 1.95001e-06 [renormalize]: 1.79978e-07 [switch_simplify]: 7.87e-06 [partial_unused_args_eliminate]: 1.79e-06 [add_recomputation]: 6.656e-05 [cse_after_recomputation]: 2.681e-05, [1] [Cycle 1]: 2.226e-05, [1] [cse]: 1.665e-05 [environ_conv]: 2.193e-05 [swap_dp_allreduce_reducescatter]: 2.739e-05 [bias_add_comm_swap]: 1.188e-05 [label_micro_interleaved_index]: 1.323e-05 [label_fine_grained_interleaved_index]: 3.15002e-06 [merge_cast_opt]: 1.35001e-06 [slice_recompute_activation]: 2.16e-06 [micro_interleaved_order_control]: 2.24999e-06 [assign_add_opt]: 1.25001e-06 [ForceFp32Comm]: 8.2e-07 [remove_cast_before_assign_add]: 1.029e-05 [full_micro_interleaved_order_control]: 1.178e-05 [reorder_send_recv_between_fp_bp]: 2.71e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 1.17e-06 [interleave_split_concat_branches]: 1.22e-06 [interleave_parallel_branches]: 1.003e-05 [overlap_opt_shard_in_pipeline]: 2.023e-05 [overlap_opt_shard_grad_in_pipeline]: 1.83002e-06 [control_data_broadcast_order]: 1.421e-05 [grouped_pairwise_exchange_alltoall]: 1.49998e-06 [offloading_packed_experts]: 3.95e-06 [overlap_recompute_and_grad_model_parallel]: 1.358e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.20001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.71e-06 [overlap_grad_ring_attention]: 2.268e-05 [overlap_grad_flash_sp]: 4.619e-05 [begin_end_overlap_inline]: 4.69998e-07 [split_matmul_comm_elemetwise]: 1.168e-05 [split_layernorm_comm]: 1.95001e-06 [handle_group_info]: 9.00007e-07 [symbol_engine_optimizer]: 8.274e-05, [1] [Cycle 1]: 7.798e-05, [6] [build]: 3.38e-06 [elim_shapecalc]: 1.238e-05 [elim_not_effective]: 1.357e-05 [opt_reshape]: 8.27998e-06 [fold_const_symbol]: 1.185e-05 [renormalize]: 1.80007e-07 [detach_backward]: 2.06998e-06 [pipeline_parallel_scheduler]: 1.49998e-06 [auto_monad_reorder]: 2.4e-05 [get_jit_bprop_graph]: 1.34998e-06 [rewriter_after_jit_bprop_graph]: 3.73001e-06 [opt_after_jit_grad]: 0.00050229 [validate]: 6.972e-05 [backend_pass]: 8.30012e-07 [task_emit]: 2.34958 [execute]: 2.856e-05 Sums bootstrap : 0.000905s : 0.04% type_inference : 0.210817s : 8.20% event_method : 0.000449s : 0.02% auto_monad : 0.000155s : 0.01% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000042s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000017s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000059s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000270s : 0.01% optimize.opt_a.expand_dump_flag : 0.000006s : 0.00% optimize.opt_a.switch_simplify : 0.000091s : 0.00% optimize.opt_a.loop_unroll : 0.000042s : 0.00% optimize.opt_a.a_1 : 0.000884s : 0.03% optimize.opt_a.with_stream_mark : 0.000030s : 0.00% optimize.opt_a.recompute_prepare : 0.000019s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000018s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000209s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000017s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.00% optimize.opt_a.shard_inline : 0.000016s : 0.00% optimize.opt_a.merge_send_recv : 0.000052s : 0.00% optimize.opt_a.auto_parallel : 0.000013s : 0.00% optimize.opt_a.parallel : 0.000107s : 0.00% optimize.opt_a.flash_sp : 0.000050s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000016s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000025s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000010s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000020s : 0.00% optimize.opt_a.virtual_dataset : 0.000017s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000016s : 0.00% optimize.opt_a.virtual_output : 0.000016s : 0.00% optimize.opt_a.merge_forward : 0.000008s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000026s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000018s : 0.00% optimize.opt_a.meta_fg_expand : 0.000007s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000021s : 0.00% optimize.opt_a.after_resolve : 0.000023s : 0.00% optimize.opt_a.a_after_grad : 0.000024s : 0.00% optimize.opt_a.renormalize : 0.003144s : 0.12% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000035s : 0.00% optimize.opt_a.cse : 0.000108s : 0.00% optimize.opt_a.a_3 : 0.000112s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000033s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000640s : 0.02% optimize.opt_b.b_1 : 0.000164s : 0.01% optimize.opt_b.b_2 : 0.000009s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000028s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.00% optimize.overlap_param_gather : 0.000012s : 0.00% optimize.cconv : 0.000025s : 0.00% optimize.loop_unroll : 0.000470s : 0.02% optimize.opt_after_cconv.c_1 : 0.000042s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000025s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000040s : 0.00% optimize.tuple_transform.d_1 : 0.000050s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000067s : 0.00% optimize.cse_after_recomputation.cse : 0.000017s : 0.00% optimize.environ_conv : 0.000022s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000027s : 0.00% optimize.bias_add_comm_swap : 0.000012s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000010s : 0.00% optimize.full_micro_interleaved_order_control : 0.000012s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000010s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000020s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000014s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000023s : 0.00% optimize.overlap_grad_flash_sp : 0.000046s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000012s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000024s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000502s : 0.02% validate : 0.000070s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.349577s : 91.41% execute : 0.000029s : 0.00% Time group info: ------[substitution.] 0.000213 26 0.83% : 0.000002s : 2: substitution.elim_not_effective 0.87% : 0.000002s : 2: substitution.fold_const_symbol 2.85% : 0.000006s : 5: substitution.graph_param_transform 79.86% : 0.000170s : 5: substitution.inline 1.73% : 0.000004s : 4: substitution.j_node_and_user_rematch 6.63% : 0.000014s : 4: substitution.remove_not_recompute_node 1.57% : 0.000003s : 2: substitution.replace_old_param 5.66% : 0.000012s : 2: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.210697 2 98.47% : 0.207484s : 1: type_inference.infer 1.53% : 0.003213s : 1: type_inference.specialize ------[replace.] 0.000073 7 74.79% : 0.000054s : 5: replace.inline 25.21% : 0.000018s : 2: replace.tuple_list_get_item_eliminator ------[match.] 0.000178 7 93.95% : 0.000167s : 5: match.inline 6.05% : 0.000011s : 2: match.tuple_list_get_item_eliminator ------[predicate.] 0.000227 1691 1.00% : 0.000002s : 17: predicate.accumulaten_eliminater 0.74% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.56% : 0.000001s : 12: predicate.addn_check_dump 0.93% : 0.000002s : 17: predicate.addn_zero_filter 0.84% : 0.000002s : 17: predicate.adjust_all_reduce_mul_add 2.18% : 0.000005s : 29: predicate.arithmetic_simplify 0.98% : 0.000002s : 17: predicate.cast_eliminate 0.63% : 0.000001s : 12: predicate.check_bprop_eliminate 0.60% : 0.000001s : 12: predicate.compare_switch_simplify 0.26% : 0.000001s : 6: predicate.const_output_eliminate 0.65% : 0.000001s : 12: predicate.depend_value_elim 0.93% : 0.000002s : 17: predicate.dict_get_item_const_eliminator 1.09% : 0.000002s : 17: predicate.dict_get_item_eliminator 0.84% : 0.000002s : 17: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 11: predicate.dumpgradient_eliminate 0.23% : 0.000001s : 5: predicate.elim_not_effective 0.36% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000003s : 23: predicate.environ_add_const_eliminate 1.13% : 0.000003s : 23: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 23: predicate.environ_get_depend_swap 1.85% : 0.000004s : 35: predicate.environ_get_eliminate 1.09% : 0.000002s : 23: predicate.environ_get_set_eliminate 1.32% : 0.000003s : 24: predicate.exchange_switch_depend_value 2.21% : 0.000005s : 24: predicate.float_depend_g_call 0.60% : 0.000001s : 12: predicate.float_environ_get_switch 0.90% : 0.000002s : 18: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 5: predicate.fold_const_symbol 0.70% : 0.000002s : 12: predicate.get_grad_eliminate 0.22% : 0.000000s : 5: predicate.graph_param_transform 0.63% : 0.000001s : 12: predicate.incorporate_call 0.54% : 0.000001s : 12: predicate.incorporate_call_switch 5.68% : 0.000013s : 77: predicate.inline 0.74% : 0.000002s : 12: predicate.inline_without_move 0.39% : 0.000001s : 12: predicate.j_node_and_user_rematch 0.84% : 0.000002s : 12: predicate.less_batch_normalization 1.83% : 0.000004s : 30: predicate.list_to_tuple_eliminator_ 2.55% : 0.000006s : 48: predicate.load_eliminater 1.00% : 0.000002s : 6: predicate.loop_unroll_after_grad 2.35% : 0.000005s : 40: predicate.loop_unroll_before_grad 1.75% : 0.000004s : 29: predicate.make_slice_get_slice_eliminator 0.67% : 0.000002s : 12: predicate.merge_addn 0.66% : 0.000001s : 12: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 12: predicate.mini_step_allgather_replace 0.82% : 0.000002s : 17: predicate.minmaximum_grad 1.03% : 0.000002s : 6: predicate.mutable_eliminate 0.34% : 0.000001s : 5: predicate.opt_reshape 0.34% : 0.000001s : 6: predicate.parallel_virtual_node 1.77% : 0.000004s : 24: predicate.partial_defer_inline 1.45% : 0.000003s : 25: predicate.partial_eliminate 0.94% : 0.000002s : 17: predicate.print_const_string_wrapper 0.66% : 0.000001s : 12: predicate.reduce_all_const_elim 1.18% : 0.000003s : 17: predicate.reduce_eliminate 2.50% : 0.000006s : 48: predicate.redundant_stop_gradient_eliminater 0.59% : 0.000001s : 12: predicate.remove_not_recompute_node 1.70% : 0.000004s : 31: predicate.replace_applicator 0.52% : 0.000001s : 12: predicate.replace_old_param 0.32% : 0.000001s : 6: predicate.reset_defer_inline 0.95% : 0.000002s : 17: predicate.reshape_eliminate 0.75% : 0.000002s : 12: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 6: predicate.row_tensor_eliminate 0.83% : 0.000002s : 12: predicate.same_eliminate 0.56% : 0.000001s : 12: predicate.set_cell_output_no_recompute 0.82% : 0.000002s : 12: predicate.shard_identity_eliminate 0.70% : 0.000002s : 11: predicate.special_op_eliminate 0.69% : 0.000002s : 12: predicate.specialize_transform 0.80% : 0.000002s : 12: predicate.split_environ_get_set_with_tuple_value 0.83% : 0.000002s : 12: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.49% : 0.000003s : 24: predicate.switch_defer_inline 2.05% : 0.000005s : 36: predicate.switch_layer_defer_inline 5.22% : 0.000012s : 81: predicate.switch_simplify 0.91% : 0.000002s : 17: predicate.tile_eliminate 0.92% : 0.000002s : 17: predicate.transpose_eliminate 1.51% : 0.000003s : 28: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000004s : 28: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 28: predicate.tuple_list_get_item_depend_reorder 2.99% : 0.000007s : 42: predicate.tuple_list_get_item_eliminator 1.47% : 0.000003s : 28: predicate.tuple_list_get_set_item_eliminator 2.26% : 0.000005s : 40: predicate.tuple_list_set_item_eliminator 1.79% : 0.000004s : 30: predicate.tuple_to_list_eliminator_ 2.31% : 0.000005s : 48: predicate.updatestate_pure_node_eliminater 3.09% : 0.000007s : 60: predicate.updatestate_useless_node_eliminater 0.34% : 0.000001s : 6: predicate.value_based_eliminate 0.77% : 0.000002s : 12: predicate.virtual_dataset_eliminate 0.71% : 0.000002s : 12: predicate.virtual_output_eliminate 0.26% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003415 29 73.30% : 0.002503s : 22: func_graph_cloner_run.FuncGraphClonerGraph 26.70% : 0.000912s : 7: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.600483 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.30% : 0.007850s : 1: add_attr 0.30% : 0.007837s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000071s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000166s : 1: auto_monad 0.00% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000015s : 1: bias_add_comm_swap 0.04% : 0.000952s : 1: bootstrap 0.00% : 0.000029s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000030s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000026s : 1: environ_conv 0.02% : 0.000463s : 1: event_method 0.00% : 0.000043s : 1: execute 0.00% : 0.000015s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000011s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000016s : 1: label_micro_interleaved_index 0.02% : 0.000478s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.02% : 0.000648s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000017s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000019s : 1: opt.transform.mutable_eliminate 0.06% : 0.001477s : 78: opt.transform.opt_a 0.00% : 0.000041s : 1: opt.transform.opt_after_cconv 0.00% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000145s : 28: opt.transform.opt_b 0.00% : 0.000056s : 2: opt.transform.opt_trans_graph 0.00% : 0.000042s : 4: opt.transform.symbol_engine_opt 0.22% : 0.005777s : 1: opt_a 0.00% : 0.000125s : 1: opt_after_cconv 0.02% : 0.000511s : 1: opt_after_jit_grad 0.01% : 0.000303s : 1: opt_b 0.33% : 0.008489s : 1: optimize 0.00% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000050s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000026s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000024s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000015s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000006s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000062s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000013s : 1: remove_cast_before_assign_add 0.00% : 0.000045s : 1: remove_dup_value 0.08% : 0.002169s : 1: renormalize.infer 0.04% : 0.000965s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000036s : 1: rewriter_after_opt_a 0.01% : 0.000276s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000014s : 1: split_matmul_comm_elemetwise 0.00% : 0.000031s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000086s : 1: symbol_engine_optimizer 90.36% : 2.349676s : 1: task_emit 0.00% : 0.000086s : 1: tuple_transform 8.11% : 0.210844s : 1: type_inference 0.00% : 0.000100s : 1: validate TotalTime = 2.5993, [24] [bootstrap]: 0.00090317 [type_inference]: 0.025751 [event_method]: 1.485e-05 [auto_monad]: 0.00013695 [graph_reusing]: 5.37001e-06 [inline]: 2.20002e-06 [add_attr]: 0.00806523, [1] [add_attr_with_inline]: 0.00804876, [1] [Cycle 1]: 0.00014724, [2] [tag_attr]: 3.077e-05 [meta_addattr_fg_expand]: 1.229e-05 [parallel-infer-symbol]: 3.51001e-06 [pre_auto_parallel]: 5.303e-05 [insert-virtual-dataset]: 3.23e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 1.67001e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00518229, [53] [py_interpret_to_execute]: 5.54e-06 [rewriter_before_opt_a]: 7.926e-05 [opt_a]: 0.0027626, [2] [Cycle 1]: 0.00209841, [45] [expand_dump_flag]: 3.5e-06 [switch_simplify]: 7.606e-05 [loop_unroll]: 1.926e-05 [a_1]: 0.00041463 [with_stream_mark]: 2.366e-05 [recompute_prepare]: 9.40001e-06 [updatestate_depend_eliminate]: 1.489e-05 [updatestate_assign_eliminate]: 1.153e-05 [updatestate_loads_eliminate]: 3.18e-06 [parameter_eliminate]: 1.96998e-06 [a_2]: 8.312e-05 [accelerated_algorithm]: 6.87002e-06 [shard]: 3.2e-06 [meta_shard_fg_expand]: 1.66998e-06 [shard_inline]: 6.36998e-06 [merge_send_recv]: 4.427e-05 [auto_parallel]: 8.48001e-06 [parallel]: 8.039e-05 [flash_sp]: 3.473e-05 [merge_comm]: 4.60001e-06 [allreduce_fusion]: 1.057e-05 [matmul_add_comm_reduction]: 1.597e-05 [allreduce_slice_to_reducescatter]: 8.32e-06 [virtual_shard_identity]: 8.89998e-06 [virtual_dataset]: 6.51e-06 [get_grad_eliminate_]: 6.36998e-06 [virtual_output]: 6.07001e-06 [merge_forward]: 4.03001e-06 [cell_reuse_recompute_pass]: 1.20001e-06 [offload_activation]: 1.756e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.686e-05 [merge_recompute_call_nodes]: 2.07999e-06 [before_grad]: 1.005e-05 [set_forward_comm_id_for_comm_node_pass]: 1.189e-05 [meta_fg_expand]: 2.62001e-06 [flash_sp_send_recv_attached]: 2.94001e-06 [receive_attached]: 1.786e-05 [after_resolve]: 1.234e-05 [a_after_grad]: 9.25999e-06 [renormalize]: 0.0006706 [add_forward_monad_depend]: 5.49998e-06 [auto_monad_grad]: 2.24001e-06 [auto_monad_eliminator]: 2.546e-05 [cse]: 6.048e-05 [a_3]: 4.578e-05 [Cycle 2]: 0.00065042, [45] [expand_dump_flag]: 1.52999e-06 [switch_simplify]: 7.87998e-06 [loop_unroll]: 5.67001e-06 [a_1]: 0.00012439 [with_stream_mark]: 1.535e-05 [recompute_prepare]: 6.56e-06 [updatestate_depend_eliminate]: 3.95998e-06 [updatestate_assign_eliminate]: 2.36e-06 [updatestate_loads_eliminate]: 2.54001e-06 [parameter_eliminate]: 1.14e-06 [a_2]: 6.683e-05 [accelerated_algorithm]: 5.84999e-06 [shard]: 1.60999e-06 [meta_shard_fg_expand]: 1.71998e-06 [shard_inline]: 5.93998e-06 [merge_send_recv]: 6.33998e-06 [auto_parallel]: 6.54001e-06 [parallel]: 5.25001e-06 [flash_sp]: 3.51001e-06 [merge_comm]: 3.35e-06 [allreduce_fusion]: 2.81e-06 [matmul_add_comm_reduction]: 6.03002e-06 [allreduce_slice_to_reducescatter]: 4.50003e-07 [virtual_shard_identity]: 6.11998e-06 [virtual_dataset]: 5.47999e-06 [get_grad_eliminate_]: 5.52001e-06 [virtual_output]: 5.35999e-06 [merge_forward]: 3.03998e-06 [cell_reuse_recompute_pass]: 1.94e-06 [offload_activation]: 6.89999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.363e-05 [merge_recompute_call_nodes]: 1.14e-06 [before_grad]: 8.50001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.09999e-06 [meta_fg_expand]: 2.21e-06 [flash_sp_send_recv_attached]: 8.79983e-07 [receive_attached]: 1.40999e-06 [after_resolve]: 1.11e-05 [a_after_grad]: 8.50001e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.49e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 8.15e-06 [cse]: 2.726e-05 [a_3]: 3.469e-05 [py_interpret_to_execute_after_opt_a]: 6.39999e-06 [slice_cell_reuse_recomputed_activation]: 2.31e-06 [rewriter_after_opt_a]: 2.767e-05 [convert_after_rewriter]: 1.50001e-06 [order_py_execute_after_rewriter]: 1.30999e-06 [mutable_eliminate]: 0.00067665 [opt_b]: 0.00020189, [1] [Cycle 1]: 0.00019377, [7] [b_1]: 0.00011513 [b_2]: 7.53999e-06 [updatestate_depend_eliminate]: 6.76999e-06 [updatestate_assign_eliminate]: 2.43e-06 [updatestate_loads_eliminate]: 2.27001e-06 [renormalize]: 5.3001e-07 [cse]: 2.386e-05 [optimize_parallel_all_gather_comm]: 2.812e-05 [overlap_param_gather]: 1.104e-05 [cconv]: 2.859e-05 [loop_unroll]: 0.00049839 [opt_after_cconv]: 0.00010462, [1] [Cycle 1]: 9.746e-05, [7] [c_1]: 2.916e-05 [parameter_eliminate]: 3.38e-06 [updatestate_depend_eliminate]: 5.74e-06 [updatestate_assign_eliminate]: 2.42001e-06 [updatestate_loads_eliminate]: 2.48e-06 [cse]: 2.096e-05 [renormalize]: 4.40021e-07 [remove_dup_value]: 1.472e-05 [tuple_transform]: 7.376e-05, [1] [Cycle 1]: 6.914e-05, [4] [d_1]: 4.29e-05 [none_parameter_eliminate]: 1.62001e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 6.69999e-06 [partial_unused_args_eliminate]: 2.80997e-06 [add_recomputation]: 6.37e-05 [cse_after_recomputation]: 2.185e-05, [1] [Cycle 1]: 1.711e-05, [1] [cse]: 1.096e-05 [environ_conv]: 2.038e-05 [swap_dp_allreduce_reducescatter]: 2.344e-05 [bias_add_comm_swap]: 1.095e-05 [label_micro_interleaved_index]: 1.341e-05 [label_fine_grained_interleaved_index]: 2.66e-06 [merge_cast_opt]: 1.36998e-06 [slice_recompute_activation]: 2.26998e-06 [micro_interleaved_order_control]: 2.54999e-06 [assign_add_opt]: 1.39998e-06 [ForceFp32Comm]: 8.10018e-07 [remove_cast_before_assign_add]: 9.17001e-06 [full_micro_interleaved_order_control]: 1.063e-05 [reorder_send_recv_between_fp_bp]: 2.88e-06 [comm_op_add_attrs]: 1.35999e-06 [add_comm_op_reuse_tag]: 1.03001e-06 [interleave_split_concat_branches]: 1.10999e-06 [interleave_parallel_branches]: 8.17998e-06 [overlap_opt_shard_in_pipeline]: 2.333e-05 [overlap_opt_shard_grad_in_pipeline]: 1.96e-06 [control_data_broadcast_order]: 1.273e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 3.7e-06 [overlap_recompute_and_grad_model_parallel]: 1.229e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.20999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42e-06 [overlap_recompute_comm]: 2.01998e-06 [overlap_grad_ring_attention]: 1.99e-05 [overlap_grad_flash_sp]: 4.298e-05 [begin_end_overlap_inline]: 7.50006e-07 [split_matmul_comm_elemetwise]: 1.086e-05 [split_layernorm_comm]: 2.14999e-06 [handle_group_info]: 1.15001e-06 [symbol_engine_optimizer]: 8.143e-05, [1] [Cycle 1]: 7.608e-05, [6] [build]: 4.17e-06 [elim_shapecalc]: 1.216e-05 [elim_not_effective]: 1.337e-05 [opt_reshape]: 6.56e-06 [fold_const_symbol]: 8.99e-06 [renormalize]: 1.80007e-07 [detach_backward]: 2.08002e-06 [pipeline_parallel_scheduler]: 1.47001e-06 [auto_monad_reorder]: 2.234e-05 [get_jit_bprop_graph]: 1.84e-06 [rewriter_after_jit_bprop_graph]: 3.38e-06 [opt_after_jit_grad]: 0.00055735 [validate]: 6.087e-05 [backend_pass]: 1.00999e-06 [task_emit]: 2.55822 [execute]: 1.048e-05 Sums bootstrap : 0.000903s : 0.03% type_inference : 0.025751s : 0.99% event_method : 0.000015s : 0.00% auto_monad : 0.000137s : 0.01% graph_reusing : 0.000005s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000031s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000053s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000079s : 0.00% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000084s : 0.00% optimize.opt_a.loop_unroll : 0.000025s : 0.00% optimize.opt_a.a_1 : 0.000539s : 0.02% optimize.opt_a.with_stream_mark : 0.000039s : 0.00% optimize.opt_a.recompute_prepare : 0.000016s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000014s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000150s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.00% optimize.opt_a.shard : 0.000005s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000012s : 0.00% optimize.opt_a.merge_send_recv : 0.000051s : 0.00% optimize.opt_a.auto_parallel : 0.000015s : 0.00% optimize.opt_a.parallel : 0.000086s : 0.00% optimize.opt_a.flash_sp : 0.000038s : 0.00% optimize.opt_a.merge_comm : 0.000008s : 0.00% optimize.opt_a.allreduce_fusion : 0.000013s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000022s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000009s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.00% optimize.opt_a.virtual_dataset : 0.000012s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.00% optimize.opt_a.virtual_output : 0.000011s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000024s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000040s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000015s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000019s : 0.00% optimize.opt_a.after_resolve : 0.000023s : 0.00% optimize.opt_a.a_after_grad : 0.000018s : 0.00% optimize.opt_a.renormalize : 0.000671s : 0.03% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.00% optimize.opt_a.cse : 0.000088s : 0.00% optimize.opt_a.a_3 : 0.000080s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000028s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000677s : 0.03% optimize.opt_b.b_1 : 0.000115s : 0.00% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000024s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.00% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000029s : 0.00% optimize.loop_unroll : 0.000498s : 0.02% optimize.opt_after_cconv.c_1 : 0.000029s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.00% optimize.tuple_transform.d_1 : 0.000043s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000003s : 0.00% optimize.add_recomputation : 0.000064s : 0.00% optimize.cse_after_recomputation.cse : 0.000011s : 0.00% optimize.environ_conv : 0.000020s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000023s : 0.00% optimize.bias_add_comm_swap : 0.000011s : 0.00% optimize.label_micro_interleaved_index : 0.000013s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000009s : 0.00% optimize.full_micro_interleaved_order_control : 0.000011s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000008s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000023s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000013s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000012s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000020s : 0.00% optimize.overlap_grad_flash_sp : 0.000043s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000011s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000004s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000012s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000022s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000557s : 0.02% validate : 0.000061s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.558221s : 98.77% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000175 23 1.17% : 0.000002s : 2: substitution.elim_not_effective 0.75% : 0.000001s : 2: substitution.fold_const_symbol 3.48% : 0.000006s : 4: substitution.graph_param_transform 79.62% : 0.000139s : 3: substitution.inline 2.25% : 0.000004s : 4: substitution.j_node_and_user_rematch 9.65% : 0.000017s : 4: substitution.remove_not_recompute_node 3.09% : 0.000005s : 4: substitution.replace_old_param ------[type_inference.] 0.025666 2 97.72% : 0.025080s : 1: type_inference.infer 2.28% : 0.000586s : 1: type_inference.specialize ------[replace.] 0.000028 3 100.00% : 0.000028s : 3: replace.inline ------[match.] 0.000137 3 100.00% : 0.000137s : 3: match.inline ------[predicate.] 0.000165 1047 0.96% : 0.000002s : 10: predicate.accumulaten_eliminater 1.00% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 8: predicate.addn_check_dump 1.00% : 0.000002s : 10: predicate.addn_zero_filter 0.74% : 0.000001s : 10: predicate.adjust_all_reduce_mul_add 2.19% : 0.000004s : 18: predicate.arithmetic_simplify 1.26% : 0.000002s : 10: predicate.cast_eliminate 0.68% : 0.000001s : 8: predicate.check_bprop_eliminate 0.62% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.75% : 0.000001s : 8: predicate.depend_value_elim 0.82% : 0.000001s : 10: predicate.dict_get_item_const_eliminator 1.02% : 0.000002s : 10: predicate.dict_get_item_eliminator 0.82% : 0.000001s : 10: predicate.dict_set_item_eliminator 1.06% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.41% : 0.000001s : 4: predicate.elim_not_effective 0.50% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.04% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.03% : 0.000002s : 14: predicate.environ_get_depend_swap 1.84% : 0.000003s : 22: predicate.environ_get_eliminate 1.02% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.03% : 0.000002s : 13: predicate.exchange_switch_depend_value 1.89% : 0.000003s : 13: predicate.float_depend_g_call 0.90% : 0.000001s : 8: predicate.float_environ_get_switch 0.97% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 4: predicate.fold_const_symbol 0.83% : 0.000001s : 8: predicate.get_grad_eliminate 0.36% : 0.000001s : 4: predicate.graph_param_transform 0.64% : 0.000001s : 8: predicate.incorporate_call 0.53% : 0.000001s : 8: predicate.incorporate_call_switch 5.61% : 0.000009s : 47: predicate.inline 0.83% : 0.000001s : 8: predicate.inline_without_move 0.38% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.07% : 0.000002s : 8: predicate.less_batch_normalization 1.80% : 0.000003s : 18: predicate.list_to_tuple_eliminator_ 2.16% : 0.000004s : 28: predicate.load_eliminater 1.41% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.09% : 0.000003s : 23: predicate.loop_unroll_before_grad 1.70% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.71% : 0.000001s : 8: predicate.merge_addn 0.68% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.79% : 0.000001s : 10: predicate.minmaximum_grad 1.65% : 0.000003s : 4: predicate.mutable_eliminate 0.44% : 0.000001s : 4: predicate.opt_reshape 0.44% : 0.000001s : 4: predicate.parallel_virtual_node 1.57% : 0.000003s : 13: predicate.partial_defer_inline 1.10% : 0.000002s : 14: predicate.partial_eliminate 0.90% : 0.000001s : 10: predicate.print_const_string_wrapper 0.65% : 0.000001s : 8: predicate.reduce_all_const_elim 1.20% : 0.000002s : 10: predicate.reduce_eliminate 2.44% : 0.000004s : 28: predicate.redundant_stop_gradient_eliminater 0.80% : 0.000001s : 8: predicate.remove_not_recompute_node 1.17% : 0.000002s : 18: predicate.replace_applicator 0.62% : 0.000001s : 8: predicate.replace_old_param 0.34% : 0.000001s : 4: predicate.reset_defer_inline 0.99% : 0.000002s : 10: predicate.reshape_eliminate 0.70% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 0.96% : 0.000002s : 8: predicate.same_eliminate 0.50% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.96% : 0.000002s : 8: predicate.shard_identity_eliminate 0.74% : 0.000001s : 8: predicate.special_op_eliminate 0.76% : 0.000001s : 8: predicate.specialize_transform 0.92% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.44% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.16% : 0.000002s : 13: predicate.switch_defer_inline 1.88% : 0.000003s : 21: predicate.switch_layer_defer_inline 4.66% : 0.000008s : 48: predicate.switch_simplify 0.79% : 0.000001s : 10: predicate.tile_eliminate 0.97% : 0.000002s : 10: predicate.transpose_eliminate 1.58% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000002s : 18: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000002s : 18: predicate.tuple_list_get_item_depend_reorder 3.27% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.42% : 0.000002s : 18: predicate.tuple_list_get_set_item_eliminator 2.42% : 0.000004s : 26: predicate.tuple_list_set_item_eliminator 1.52% : 0.000003s : 18: predicate.tuple_to_list_eliminator_ 2.11% : 0.000003s : 28: predicate.updatestate_pure_node_eliminater 2.92% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.79% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.71% : 0.000001s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.60% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000280 6 6.76% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 93.24% : 0.000261s : 5: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.614332 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.31% : 0.008072s : 1: add_attr 0.31% : 0.008053s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000068s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000143s : 1: auto_monad 0.00% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000007s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000014s : 1: bias_add_comm_swap 0.04% : 0.000955s : 1: bootstrap 0.00% : 0.000032s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000017s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000025s : 1: environ_conv 0.00% : 0.000021s : 1: event_method 0.00% : 0.000016s : 1: execute 0.00% : 0.000014s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000007s : 1: insert-virtual-dataset 0.00% : 0.000011s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000017s : 1: label_micro_interleaved_index 0.02% : 0.000508s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.03% : 0.000689s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000020s : 1: opt.transform.mutable_eliminate 0.04% : 0.000977s : 78: opt.transform.opt_a 0.00% : 0.000028s : 1: opt.transform.opt_after_cconv 0.00% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000095s : 28: opt.transform.opt_b 0.00% : 0.000047s : 2: opt.transform.opt_trans_graph 0.00% : 0.000038s : 4: opt.transform.symbol_engine_opt 0.11% : 0.002766s : 1: opt_a 0.00% : 0.000108s : 1: opt_after_cconv 0.02% : 0.000569s : 1: opt_after_jit_grad 0.01% : 0.000206s : 1: opt_b 0.20% : 0.005187s : 1: optimize 0.00% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000048s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000023s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000028s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000016s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000006s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000059s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000010s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000012s : 1: remove_cast_before_assign_add 0.00% : 0.000018s : 1: remove_dup_value 0.01% : 0.000344s : 1: renormalize.infer 0.01% : 0.000317s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000031s : 1: rewriter_after_opt_a 0.00% : 0.000084s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000014s : 1: split_matmul_comm_elemetwise 0.00% : 0.000027s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000084s : 1: symbol_engine_optimizer 97.86% : 2.558255s : 1: task_emit 0.00% : 0.000077s : 1: tuple_transform 0.99% : 0.025775s : 1: type_inference 0.00% : 0.000095s : 1: validate TotalTime = 2.71594, [24] [bootstrap]: 0.00090321 [type_inference]: 0.0561637 [event_method]: 0.00014438 [auto_monad]: 0.00012235 [graph_reusing]: 5.13002e-06 [inline]: 1.92001e-06 [add_attr]: 0.00795972, [1] [add_attr_with_inline]: 0.00794682, [1] [Cycle 1]: 0.00011366, [2] [tag_attr]: 4.081e-05 [meta_addattr_fg_expand]: 1.196e-05 [parallel-infer-symbol]: 2.24999e-06 [pre_auto_parallel]: 5.196e-05 [insert-virtual-dataset]: 1.44e-06 [parallel-infer-symbol-second]: 9.49978e-07 [dataset_repeat_opt]: 1.37e-06 [pipeline_split]: 8.39995e-07 [optimize]: 0.00628073, [53] [py_interpret_to_execute]: 5.15999e-06 [rewriter_before_opt_a]: 0.00031713 [opt_a]: 0.00392505, [2] [Cycle 1]: 0.00326176, [45] [expand_dump_flag]: 2.79001e-06 [switch_simplify]: 0.00010135 [loop_unroll]: 4.893e-05 [a_1]: 0.00102334 [with_stream_mark]: 1.552e-05 [recompute_prepare]: 1.006e-05 [updatestate_depend_eliminate]: 9.49999e-06 [updatestate_assign_eliminate]: 6.38e-06 [updatestate_loads_eliminate]: 2.63e-06 [parameter_eliminate]: 1.27e-06 [a_2]: 9.234e-05 [accelerated_algorithm]: 7.99997e-06 [shard]: 1.23002e-06 [meta_shard_fg_expand]: 2.07001e-06 [shard_inline]: 6.98e-06 [merge_send_recv]: 2.71e-05 [auto_parallel]: 7.8e-06 [parallel]: 4.848e-05 [flash_sp]: 2.119e-05 [merge_comm]: 5.20999e-06 [allreduce_fusion]: 7.88999e-06 [matmul_add_comm_reduction]: 1.003e-05 [allreduce_slice_to_reducescatter]: 3.60998e-06 [virtual_shard_identity]: 9.91e-06 [virtual_dataset]: 7.61999e-06 [get_grad_eliminate_]: 7.28e-06 [virtual_output]: 7.19001e-06 [merge_forward]: 4.05e-06 [cell_reuse_recompute_pass]: 9.80013e-07 [offload_activation]: 1.082e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.9e-05 [merge_recompute_call_nodes]: 9.39996e-07 [before_grad]: 1.08e-05 [set_forward_comm_id_for_comm_node_pass]: 7.88999e-06 [meta_fg_expand]: 3.25998e-06 [flash_sp_send_recv_attached]: 1.76e-06 [receive_attached]: 1.011e-05 [after_resolve]: 1.261e-05 [a_after_grad]: 1.125e-05 [renormalize]: 0.00129041 [add_forward_monad_depend]: 4.3e-06 [auto_monad_grad]: 1.63002e-06 [auto_monad_eliminator]: 1.761e-05 [cse]: 5.172e-05 [a_3]: 5.137e-05 [Cycle 2]: 0.00065273, [45] [expand_dump_flag]: 1.69e-06 [switch_simplify]: 7.5e-06 [loop_unroll]: 6.75998e-06 [a_1]: 0.00014889 [with_stream_mark]: 1.055e-05 [recompute_prepare]: 6.74999e-06 [updatestate_depend_eliminate]: 3.17002e-06 [updatestate_assign_eliminate]: 2.64999e-06 [updatestate_loads_eliminate]: 3.01999e-06 [parameter_eliminate]: 9.20001e-07 [a_2]: 7.757e-05 [accelerated_algorithm]: 6.90002e-06 [shard]: 1.22e-06 [meta_shard_fg_expand]: 1.35001e-06 [shard_inline]: 6.51e-06 [merge_send_recv]: 5.28002e-06 [auto_parallel]: 6.24001e-06 [parallel]: 4.38001e-06 [flash_sp]: 2.18998e-06 [merge_comm]: 3.41999e-06 [allreduce_fusion]: 3.16001e-06 [matmul_add_comm_reduction]: 5.52001e-06 [allreduce_slice_to_reducescatter]: 3.99974e-07 [virtual_shard_identity]: 7.33e-06 [virtual_dataset]: 6.43998e-06 [get_grad_eliminate_]: 6.38e-06 [virtual_output]: 6.21e-06 [merge_forward]: 2.93e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 6.04001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.265e-05 [merge_recompute_call_nodes]: 6.90023e-07 [before_grad]: 8.95001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.23e-06 [meta_fg_expand]: 2.31998e-06 [flash_sp_send_recv_attached]: 7.7e-07 [receive_attached]: 1.07e-06 [after_resolve]: 1.111e-05 [a_after_grad]: 1.009e-05 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.11997e-06 [auto_monad_grad]: 8.89995e-07 [auto_monad_eliminator]: 6.45002e-06 [cse]: 1.713e-05 [a_3]: 3.852e-05 [py_interpret_to_execute_after_opt_a]: 4.97e-06 [slice_cell_reuse_recomputed_activation]: 1.10999e-06 [rewriter_after_opt_a]: 2.039e-05 [convert_after_rewriter]: 1.07e-06 [order_py_execute_after_rewriter]: 1.12e-06 [mutable_eliminate]: 0.00058209 [opt_b]: 0.00024023, [1] [Cycle 1]: 0.00023305, [7] [b_1]: 0.000155 [b_2]: 8.47e-06 [updatestate_depend_eliminate]: 5.97999e-06 [updatestate_assign_eliminate]: 2.84001e-06 [updatestate_loads_eliminate]: 2.74999e-06 [renormalize]: 4.2998e-07 [cse]: 2.197e-05 [optimize_parallel_all_gather_comm]: 1.743e-05 [overlap_param_gather]: 4.72998e-06 [cconv]: 1.544e-05 [loop_unroll]: 0.00043824 [opt_after_cconv]: 0.0001064, [1] [Cycle 1]: 0.00010009, [7] [c_1]: 3.304e-05 [parameter_eliminate]: 2.64001e-06 [updatestate_depend_eliminate]: 5.51e-06 [updatestate_assign_eliminate]: 2.83998e-06 [updatestate_loads_eliminate]: 2.69001e-06 [cse]: 2.058e-05 [renormalize]: 4.80009e-07 [remove_dup_value]: 2.263e-05 [tuple_transform]: 7.613e-05, [1] [Cycle 1]: 7.154e-05, [4] [d_1]: 4.56e-05 [none_parameter_eliminate]: 1.12e-06 [renormalize]: 1.8999e-07 [switch_simplify]: 7.31001e-06 [partial_unused_args_eliminate]: 1.09e-06 [add_recomputation]: 3.755e-05 [cse_after_recomputation]: 2.353e-05, [1] [Cycle 1]: 1.942e-05, [1] [cse]: 1.401e-05 [environ_conv]: 1.342e-05 [swap_dp_allreduce_reducescatter]: 1.284e-05 [bias_add_comm_swap]: 5.49e-06 [label_micro_interleaved_index]: 6.89001e-06 [label_fine_grained_interleaved_index]: 1.50001e-06 [merge_cast_opt]: 6.39993e-07 [slice_recompute_activation]: 8.39995e-07 [micro_interleaved_order_control]: 1.12e-06 [assign_add_opt]: 7.00005e-07 [ForceFp32Comm]: 6.49976e-07 [remove_cast_before_assign_add]: 3.66001e-06 [full_micro_interleaved_order_control]: 4.55001e-06 [reorder_send_recv_between_fp_bp]: 1.27e-06 [comm_op_add_attrs]: 4.50003e-07 [add_comm_op_reuse_tag]: 6.09987e-07 [interleave_split_concat_branches]: 9.20001e-07 [interleave_parallel_branches]: 3.78001e-06 [overlap_opt_shard_in_pipeline]: 1.363e-05 [overlap_opt_shard_grad_in_pipeline]: 1.01002e-06 [control_data_broadcast_order]: 1.013e-05 [grouped_pairwise_exchange_alltoall]: 5.50004e-07 [offloading_packed_experts]: 3.17002e-06 [overlap_recompute_and_grad_model_parallel]: 7.03e-06 [overlap_grad_matmul_and_grad_allreduce]: 9.5999e-07 [overlap_recompute_allgather_and_fa_grad]: 7.80012e-07 [overlap_recompute_comm]: 1.07e-06 [overlap_grad_ring_attention]: 9.31e-06 [overlap_grad_flash_sp]: 2.256e-05 [begin_end_overlap_inline]: 3.10014e-07 [split_matmul_comm_elemetwise]: 4.22e-06 [split_layernorm_comm]: 8.89995e-07 [handle_group_info]: 5.99975e-07 [symbol_engine_optimizer]: 7.527e-05, [1] [Cycle 1]: 7.065e-05, [6] [build]: 1.79998e-06 [elim_shapecalc]: 1.022e-05 [elim_not_effective]: 1.291e-05 [opt_reshape]: 7.75e-06 [fold_const_symbol]: 1.009e-05 [renormalize]: 2.50002e-07 [detach_backward]: 1.05001e-06 [pipeline_parallel_scheduler]: 9.50007e-07 [auto_monad_reorder]: 2.442e-05 [get_jit_bprop_graph]: 1.06002e-06 [rewriter_after_jit_bprop_graph]: 3.43999e-06 [opt_after_jit_grad]: 0.00047419 [validate]: 4.496e-05 [backend_pass]: 8.09989e-07 [task_emit]: 2.6431 [execute]: 1.039e-05 Sums bootstrap : 0.000903s : 0.03% type_inference : 0.056164s : 2.08% event_method : 0.000144s : 0.01% auto_monad : 0.000122s : 0.00% graph_reusing : 0.000005s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000041s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000012s : 0.00% parallel-infer-symbol : 0.000002s : 0.00% pre_auto_parallel : 0.000052s : 0.00% insert-virtual-dataset : 0.000001s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000001s : 0.00% pipeline_split : 0.000001s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000317s : 0.01% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000109s : 0.00% optimize.opt_a.loop_unroll : 0.000056s : 0.00% optimize.opt_a.a_1 : 0.001172s : 0.04% optimize.opt_a.with_stream_mark : 0.000026s : 0.00% optimize.opt_a.recompute_prepare : 0.000017s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000013s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000009s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000170s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000015s : 0.00% optimize.opt_a.shard : 0.000002s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000013s : 0.00% optimize.opt_a.merge_send_recv : 0.000032s : 0.00% optimize.opt_a.auto_parallel : 0.000014s : 0.00% optimize.opt_a.parallel : 0.000053s : 0.00% optimize.opt_a.flash_sp : 0.000023s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000011s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000017s : 0.00% optimize.opt_a.virtual_dataset : 0.000014s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.00% optimize.opt_a.virtual_output : 0.000013s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000002s : 0.00% optimize.opt_a.offload_activation : 0.000017s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000032s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000020s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000011s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000011s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.00% optimize.opt_a.a_after_grad : 0.000021s : 0.00% optimize.opt_a.renormalize : 0.001290s : 0.05% optimize.opt_a.add_forward_monad_depend : 0.000005s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000024s : 0.00% optimize.opt_a.cse : 0.000069s : 0.00% optimize.opt_a.a_3 : 0.000090s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000001s : 0.00% optimize.rewriter_after_opt_a : 0.000020s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000582s : 0.02% optimize.opt_b.b_1 : 0.000155s : 0.01% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000022s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.00% optimize.overlap_param_gather : 0.000005s : 0.00% optimize.cconv : 0.000015s : 0.00% optimize.loop_unroll : 0.000438s : 0.02% optimize.opt_after_cconv.c_1 : 0.000033s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000021s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000023s : 0.00% optimize.tuple_transform.d_1 : 0.000046s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000001s : 0.00% optimize.add_recomputation : 0.000038s : 0.00% optimize.cse_after_recomputation.cse : 0.000014s : 0.00% optimize.environ_conv : 0.000013s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000013s : 0.00% optimize.bias_add_comm_swap : 0.000005s : 0.00% optimize.label_micro_interleaved_index : 0.000007s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000001s : 0.00% optimize.micro_interleaved_order_control : 0.000001s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000004s : 0.00% optimize.full_micro_interleaved_order_control : 0.000005s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000001s : 0.00% optimize.comm_op_add_attrs : 0.000000s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000014s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000001s : 0.00% optimize.control_data_broadcast_order : 0.000010s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000001s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000001s : 0.00% optimize.overlap_grad_ring_attention : 0.000009s : 0.00% optimize.overlap_grad_flash_sp : 0.000023s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000004s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000001s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000024s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000474s : 0.02% validate : 0.000045s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 2.643104s : 97.65% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000275 38 0.51% : 0.000001s : 2: substitution.elim_not_effective 0.35% : 0.000001s : 2: substitution.fold_const_symbol 1.55% : 0.000004s : 5: substitution.graph_param_transform 82.54% : 0.000227s : 6: substitution.inline 0.97% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.17% : 0.000009s : 4: substitution.remove_not_recompute_node 1.56% : 0.000004s : 6: substitution.replace_old_param 9.36% : 0.000026s : 9: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.056063 2 95.92% : 0.053777s : 1: type_inference.infer 4.08% : 0.002286s : 1: type_inference.specialize ------[replace.] 0.000108 15 52.79% : 0.000057s : 6: replace.inline 47.21% : 0.000051s : 9: replace.tuple_list_get_item_eliminator ------[match.] 0.000246 15 90.90% : 0.000224s : 6: match.inline 9.10% : 0.000022s : 9: match.tuple_list_get_item_eliminator ------[predicate.] 0.000270 1918 1.05% : 0.000003s : 21: predicate.accumulaten_eliminater 0.65% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.46% : 0.000001s : 10: predicate.addn_check_dump 1.15% : 0.000003s : 21: predicate.addn_zero_filter 0.94% : 0.000003s : 21: predicate.adjust_all_reduce_mul_add 2.25% : 0.000006s : 31: predicate.arithmetic_simplify 0.99% : 0.000003s : 21: predicate.cast_eliminate 0.57% : 0.000002s : 10: predicate.check_bprop_eliminate 0.40% : 0.000001s : 10: predicate.compare_switch_simplify 0.16% : 0.000000s : 5: predicate.const_output_eliminate 0.49% : 0.000001s : 10: predicate.depend_value_elim 1.06% : 0.000003s : 21: predicate.dict_get_item_const_eliminator 1.15% : 0.000003s : 21: predicate.dict_get_item_eliminator 1.00% : 0.000003s : 21: predicate.dict_set_item_eliminator 0.74% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.19% : 0.000001s : 5: predicate.elim_not_effective 0.30% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.25% : 0.000003s : 26: predicate.environ_add_const_eliminate 1.16% : 0.000003s : 26: predicate.environ_get_add_eliminate 1.14% : 0.000003s : 26: predicate.environ_get_depend_swap 1.67% : 0.000005s : 36: predicate.environ_get_eliminate 1.15% : 0.000003s : 26: predicate.environ_get_set_eliminate 1.75% : 0.000005s : 36: predicate.exchange_switch_depend_value 2.45% : 0.000007s : 36: predicate.float_depend_g_call 0.43% : 0.000001s : 10: predicate.float_environ_get_switch 0.62% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 5: predicate.fold_const_symbol 0.57% : 0.000002s : 10: predicate.get_grad_eliminate 0.16% : 0.000000s : 5: predicate.graph_param_transform 0.43% : 0.000001s : 10: predicate.incorporate_call 0.38% : 0.000001s : 10: predicate.incorporate_call_switch 5.53% : 0.000015s : 87: predicate.inline 0.54% : 0.000001s : 10: predicate.inline_without_move 0.32% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.71% : 0.000002s : 10: predicate.less_batch_normalization 2.14% : 0.000006s : 40: predicate.list_to_tuple_eliminator_ 2.90% : 0.000008s : 61: predicate.load_eliminater 0.80% : 0.000002s : 5: predicate.loop_unroll_after_grad 3.03% : 0.000008s : 58: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 31: predicate.make_slice_get_slice_eliminator 0.53% : 0.000001s : 10: predicate.merge_addn 0.46% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.44% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.98% : 0.000003s : 21: predicate.minmaximum_grad 0.83% : 0.000002s : 5: predicate.mutable_eliminate 0.29% : 0.000001s : 5: predicate.opt_reshape 0.26% : 0.000001s : 5: predicate.parallel_virtual_node 2.30% : 0.000006s : 36: predicate.partial_defer_inline 1.82% : 0.000005s : 35: predicate.partial_eliminate 1.05% : 0.000003s : 21: predicate.print_const_string_wrapper 0.46% : 0.000001s : 10: predicate.reduce_all_const_elim 1.42% : 0.000004s : 21: predicate.reduce_eliminate 2.82% : 0.000008s : 61: predicate.redundant_stop_gradient_eliminater 0.33% : 0.000001s : 10: predicate.remove_not_recompute_node 1.47% : 0.000004s : 40: predicate.replace_applicator 0.37% : 0.000001s : 10: predicate.replace_old_param 0.21% : 0.000001s : 5: predicate.reset_defer_inline 1.05% : 0.000003s : 21: predicate.reshape_eliminate 0.49% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.25% : 0.000001s : 5: predicate.row_tensor_eliminate 0.56% : 0.000002s : 10: predicate.same_eliminate 0.36% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.66% : 0.000002s : 10: predicate.shard_identity_eliminate 0.52% : 0.000001s : 10: predicate.special_op_eliminate 0.52% : 0.000001s : 10: predicate.specialize_transform 0.68% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.75% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.27% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.95% : 0.000005s : 36: predicate.switch_defer_inline 2.38% : 0.000006s : 46: predicate.switch_layer_defer_inline 5.68% : 0.000015s : 109: predicate.switch_simplify 1.07% : 0.000003s : 21: predicate.tile_eliminate 1.04% : 0.000003s : 21: predicate.transpose_eliminate 1.58% : 0.000004s : 31: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000004s : 31: predicate.tuple_list_get_item_const_eliminator 1.49% : 0.000004s : 31: predicate.tuple_list_get_item_depend_reorder 3.26% : 0.000009s : 50: predicate.tuple_list_get_item_eliminator 1.52% : 0.000004s : 31: predicate.tuple_list_get_set_item_eliminator 2.10% : 0.000006s : 41: predicate.tuple_list_set_item_eliminator 1.95% : 0.000005s : 40: predicate.tuple_to_list_eliminator_ 2.72% : 0.000007s : 61: predicate.updatestate_pure_node_eliminater 3.17% : 0.000009s : 71: predicate.updatestate_useless_node_eliminater 0.28% : 0.000001s : 5: predicate.value_based_eliminate 0.54% : 0.000001s : 10: predicate.virtual_dataset_eliminate 0.54% : 0.000001s : 10: predicate.virtual_output_eliminate 0.22% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.32% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001301 16 52.13% : 0.000678s : 8: func_graph_cloner_run.FuncGraphClonerGraph 47.87% : 0.000623s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 2.733217 196 0.00% : 0.000003s : 1: ForceFp32Comm 0.29% : 0.007964s : 1: add_attr 0.29% : 0.007951s : 1: add_attr_with_inline 0.00% : 0.000003s : 1: add_comm_op_reuse_tag 0.00% : 0.000042s : 1: add_recomputation 0.00% : 0.000003s : 1: assign_add_opt 0.00% : 0.000133s : 1: auto_monad 0.00% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000008s : 1: bias_add_comm_swap 0.03% : 0.000955s : 1: bootstrap 0.00% : 0.000019s : 1: cconv 0.00% : 0.000003s : 1: comm_op_add_attrs 0.00% : 0.000014s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000027s : 1: cse_after_recomputation 0.00% : 0.000004s : 1: dataset_repeat_opt 0.00% : 0.000004s : 1: detach_backward 0.00% : 0.000017s : 1: environ_conv 0.01% : 0.000153s : 1: event_method 0.00% : 0.000022s : 1: execute 0.00% : 0.000008s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000009s : 1: graph_reusing 0.00% : 0.000003s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000005s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000010s : 1: label_micro_interleaved_index 0.02% : 0.000447s : 1: loop_unroll 0.00% : 0.000003s : 1: merge_cast_opt 0.00% : 0.000004s : 1: micro_interleaved_order_control 0.02% : 0.000591s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000015s : 1: opt.transform.mutable_eliminate 0.06% : 0.001709s : 78: opt.transform.opt_a 0.00% : 0.000032s : 1: opt.transform.opt_after_cconv 0.00% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000136s : 28: opt.transform.opt_b 0.00% : 0.000051s : 2: opt.transform.opt_trans_graph 0.00% : 0.000037s : 4: opt.transform.symbol_engine_opt 0.14% : 0.003929s : 1: opt_a 0.00% : 0.000110s : 1: opt_after_cconv 0.02% : 0.000484s : 1: opt_after_jit_grad 0.01% : 0.000244s : 1: opt_b 0.23% : 0.006285s : 1: optimize 0.00% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000026s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000012s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000017s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000008s : 1: overlap_param_gather 0.00% : 0.000003s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000004s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000057s : 1: pre_auto_parallel 0.00% : 0.000009s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000006s : 1: remove_cast_before_assign_add 0.00% : 0.000027s : 1: remove_dup_value 0.02% : 0.000670s : 1: renormalize.infer 0.02% : 0.000613s : 1: renormalize.specialize 0.00% : 0.000004s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000024s : 1: rewriter_after_opt_a 0.01% : 0.000325s : 1: rewriter_before_opt_a 0.00% : 0.000004s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000003s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000007s : 1: split_matmul_comm_elemetwise 0.00% : 0.000016s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000078s : 1: symbol_engine_optimizer 96.71% : 2.643331s : 1: task_emit 0.00% : 0.000079s : 1: tuple_transform 2.06% : 0.056186s : 1: type_inference 0.00% : 0.000071s : 1: validate TotalTime = 0.65794, [33] [bootstrap]: 0.00094585 [type_inference]: 0.0831425 [event_method]: 0.00019477 [auto_monad]: 0.0002469 [graph_reusing]: 1.085e-05 [pre_auto_parallel]: 1.411e-05 [py_interpret_to_execute]: 6.253e-05 [rewriter_before_opt_a]: 0.00019801 [expand_dump_flag]: 4.74e-06 [jit_opt_a]: 0.0278032, [3] [Cycle 1]: 0.0103908, [27] [switch_simplify]: 0.00039138 [loop_unroll]: 6.417e-05 [a_1]: 0.00153143 [with_stream_mark]: 3.811e-05 [recompute_prepare]: 2.05e-05 [updatestate_depend_eliminate]: 1.853e-05 [updatestate_assign_eliminate]: 1.441e-05 [updatestate_loads_eliminate]: 6.66e-06 [parameter_eliminate]: 2.62001e-06 [specialize_transform]: 1.611e-05 [updatestate_useless_node_eliminater]: 1.432e-05 [accelerated_algorithm]: 1.518e-05 [meta_shard_fg_expand]: 4.82e-06 [get_grad_eliminate_]: 1.438e-05 [merge_forward]: 1.028e-05 [cell_reuse_recompute_pass]: 9.79984e-07 [cell_reuse_handle_not_recompute_node_pass]: 3.193e-05 [j_node_and_user_rematch]: 3.411e-05 [meta_fg_expand]: 0.00228778 [replace_old_param]: 6.512e-05 [inline_without_move]: 5.756e-05 [renormalize]: 0.00501818 [add_forward_monad_depend]: 2.009e-05 [auto_monad_grad]: 6.83e-06 [auto_monad_eliminator]: 5.759e-05 [cse]: 0.00030849 [replace_applicator]: 7.376e-05 [Cycle 2]: 0.00245948, [27] [switch_simplify]: 4.479e-05 [loop_unroll]: 4.161e-05 [a_1]: 0.00112733 [with_stream_mark]: 1.13e-05 [recompute_prepare]: 8.38001e-06 [updatestate_depend_eliminate]: 3.70998e-06 [updatestate_assign_eliminate]: 2.91e-06 [updatestate_loads_eliminate]: 2.56e-06 [parameter_eliminate]: 1.10999e-06 [specialize_transform]: 7.03e-06 [updatestate_useless_node_eliminater]: 6.46e-06 [accelerated_algorithm]: 6.84999e-06 [meta_shard_fg_expand]: 1.55999e-06 [get_grad_eliminate_]: 5.87999e-06 [merge_forward]: 3.03e-06 [cell_reuse_recompute_pass]: 8.2e-07 [cell_reuse_handle_not_recompute_node_pass]: 1.292e-05 [j_node_and_user_rematch]: 9.46998e-06 [meta_fg_expand]: 0.00033485 [replace_old_param]: 1.447e-05 [inline_without_move]: 6.74999e-06 [renormalize]: 0.00060538 [add_forward_monad_depend]: 4.22e-06 [auto_monad_grad]: 1.19e-06 [auto_monad_eliminator]: 1.06e-05 [cse]: 2.183e-05 [replace_applicator]: 1.255e-05 [Cycle 3]: 0.00036646, [27] [switch_simplify]: 7.28e-06 [loop_unroll]: 6.54001e-06 [a_1]: 0.0001176 [with_stream_mark]: 8.50001e-06 [recompute_prepare]: 6.51e-06 [updatestate_depend_eliminate]: 3.27997e-06 [updatestate_assign_eliminate]: 2.72001e-06 [updatestate_loads_eliminate]: 2.56e-06 [parameter_eliminate]: 8.59989e-07 [specialize_transform]: 6.16e-06 [updatestate_useless_node_eliminater]: 6.16e-06 [accelerated_algorithm]: 6.49001e-06 [meta_shard_fg_expand]: 1.36998e-06 [get_grad_eliminate_]: 5.81998e-06 [merge_forward]: 2.80997e-06 [cell_reuse_recompute_pass]: 1.36998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.358e-05 [j_node_and_user_rematch]: 9.34e-06 [meta_fg_expand]: 2.21e-06 [replace_old_param]: 9.54999e-06 [inline_without_move]: 5.84e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.00999e-06 [auto_monad_grad]: 7.50006e-07 [auto_monad_eliminator]: 5.76e-06 [cse]: 1.379e-05 [replace_applicator]: 6.21e-06 [py_interpret_to_execute_after_opt_a]: 1.295e-05 [rewriter_after_opt_a]: 0.0001251 [convert_after_rewriter]: 8.32e-06 [order_py_execute_after_rewriter]: 5.46e-06 [mutable_eliminate]: 0.0005091 [jit_opt_b]: 5.576e-05, [1] [Cycle 1]: 4.912e-05, [2] [frontend_op_eliminate]: 1.945e-05 [inline_after_opt_a]: 1.86e-05 [cconv]: 2.058e-05 [loop_unroll]: 0.00044843 [jit_opt_after_cconv]: 0.00015682, [1] [Cycle 1]: 0.00015063, [11] [c_1]: 2.692e-05 [parameter_eliminate]: 2.73e-06 [updatestate_depend_eliminate]: 5.98002e-06 [updatestate_assign_eliminate]: 3.11999e-06 [updatestate_loads_eliminate]: 2.71e-06 [cse]: 2.212e-05 [call_graph_tuple_transform]: 2.079e-05 [tuple_list_get_item_eliminator]: 6.77002e-06 [none_parameter_eliminate]: 1.44e-06 [renormalize]: 2.50002e-07 [switch_simplify]: 7.35e-06 [remove_dup_value]: 1.704e-05 [partial_unused_args_eliminate]: 2.46e-06 [environ_conv]: 1.038e-05 [add_recomputation]: 4.857e-05 [cse_after_recomputation]: 2.539e-05, [1] [Cycle 1]: 2.008e-05, [1] [cse]: 1.425e-05 [auto_monad_reorder]: 2.374e-05 [get_jit_bprop_graph]: 1.86e-06 [rewriter_after_jit_bprop_graph]: 2.99999e-06 [opt_after_jit_grad]: 0.00047401 [symbol_engine_optimizer]: 7.764e-05, [1] [Cycle 1]: 7.164e-05, [6] [build]: 3.51999e-06 [elim_shapecalc]: 9.19e-06 [elim_not_effective]: 1.441e-05 [opt_reshape]: 7.13e-06 [fold_const_symbol]: 9.82999e-06 [renormalize]: 3.69997e-07 [validate]: 6.051e-05 [backend_pass]: 1.03001e-06 [task_emit]: 0.542907 [execute]: 7.15e-06 Sums bootstrap : 0.000946s : 0.15% type_inference : 0.083142s : 12.94% event_method : 0.000195s : 0.03% auto_monad : 0.000247s : 0.04% graph_reusing : 0.000011s : 0.00% pre_auto_parallel : 0.000014s : 0.00% py_interpret_to_execute : 0.000063s : 0.01% rewriter_before_opt_a : 0.000198s : 0.03% expand_dump_flag : 0.000005s : 0.00% jit_opt_a.switch_simplify : 0.000443s : 0.07% jit_opt_a.loop_unroll : 0.000112s : 0.02% jit_opt_a.a_1 : 0.002776s : 0.43% jit_opt_a.with_stream_mark : 0.000058s : 0.01% jit_opt_a.recompute_prepare : 0.000035s : 0.01% jit_opt_a.updatestate_depend_eliminate : 0.000026s : 0.00% jit_opt_a.updatestate_assign_eliminate : 0.000020s : 0.00% jit_opt_a.updatestate_loads_eliminate : 0.000012s : 0.00% jit_opt_a.parameter_eliminate : 0.000005s : 0.00% jit_opt_a.specialize_transform : 0.000029s : 0.00% jit_opt_a.updatestate_useless_node_eliminater : 0.000027s : 0.00% jit_opt_a.accelerated_algorithm : 0.000029s : 0.00% jit_opt_a.meta_shard_fg_expand : 0.000008s : 0.00% jit_opt_a.get_grad_eliminate_ : 0.000026s : 0.00% jit_opt_a.merge_forward : 0.000016s : 0.00% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000058s : 0.01% jit_opt_a.j_node_and_user_rematch : 0.000053s : 0.01% jit_opt_a.meta_fg_expand : 0.002625s : 0.41% jit_opt_a.replace_old_param : 0.000089s : 0.01% jit_opt_a.inline_without_move : 0.000070s : 0.01% jit_opt_a.renormalize : 0.005624s : 0.88% jit_opt_a.add_forward_monad_depend : 0.000025s : 0.00% jit_opt_a.auto_monad_grad : 0.000009s : 0.00% jit_opt_a.auto_monad_eliminator : 0.000074s : 0.01% jit_opt_a.cse : 0.000344s : 0.05% jit_opt_a.replace_applicator : 0.000093s : 0.01% py_interpret_to_execute_after_opt_a : 0.000013s : 0.00% rewriter_after_opt_a : 0.000125s : 0.02% convert_after_rewriter : 0.000008s : 0.00% order_py_execute_after_rewriter : 0.000005s : 0.00% mutable_eliminate : 0.000509s : 0.08% jit_opt_b.frontend_op_eliminate : 0.000019s : 0.00% jit_opt_b.inline_after_opt_a : 0.000019s : 0.00% cconv : 0.000021s : 0.00% loop_unroll : 0.000448s : 0.07% jit_opt_after_cconv.c_1 : 0.000027s : 0.00% jit_opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000022s : 0.00% jit_opt_after_cconv.call_graph_tuple_transform : 0.000021s : 0.00% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000007s : 0.00% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000007s : 0.00% remove_dup_value : 0.000017s : 0.00% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000010s : 0.00% add_recomputation : 0.000049s : 0.01% cse_after_recomputation.cse : 0.000014s : 0.00% auto_monad_reorder : 0.000024s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000474s : 0.07% symbol_engine_optimizer.build : 0.000004s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.00% symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.00% symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000061s : 0.01% backend_pass : 0.000001s : 0.00% task_emit : 0.542907s : 84.51% execute : 0.000007s : 0.00% Time group info: ------[substitution.] 0.000843 128 0.28% : 0.000002s : 2: substitution.elim_not_effective 0.17% : 0.000001s : 2: substitution.fold_const_symbol 0.63% : 0.000005s : 4: substitution.graph_param_transform 72.09% : 0.000607s : 21: substitution.inline 1.89% : 0.000016s : 2: substitution.inline_without_move 1.91% : 0.000016s : 12: substitution.j_node_and_user_rematch 2.10% : 0.000018s : 7: substitution.minmaximum_grad 3.19% : 0.000027s : 11: substitution.partial_eliminate 1.16% : 0.000010s : 12: substitution.remove_not_recompute_node 2.83% : 0.000024s : 9: substitution.replace_applicator 1.22% : 0.000010s : 14: substitution.replace_old_param 0.26% : 0.000002s : 1: substitution.set_cell_output_no_recompute 4.09% : 0.000034s : 5: substitution.switch_simplify 2.42% : 0.000020s : 7: substitution.tuple_list_convert_item_index_to_positive 1.68% : 0.000014s : 7: substitution.tuple_list_get_item_depend_reorder 4.07% : 0.000034s : 12: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.083002 2 95.83% : 0.079538s : 1: type_inference.infer 4.17% : 0.003463s : 1: type_inference.specialize ------[replace.] 0.000346 31 48.75% : 0.000169s : 21: replace.inline 36.91% : 0.000128s : 5: replace.switch_simplify 14.34% : 0.000050s : 5: replace.tuple_list_get_item_eliminator ------[match.] 0.000638 31 93.56% : 0.000597s : 21: match.inline 4.84% : 0.000031s : 5: match.switch_simplify 1.60% : 0.000010s : 5: match.tuple_list_get_item_eliminator ------[predicate.] 0.000446 3262 1.55% : 0.000007s : 56: predicate.accumulaten_eliminater 0.39% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 1.44% : 0.000006s : 56: predicate.addn_check_dump 1.53% : 0.000007s : 56: predicate.addn_zero_filter 2.37% : 0.000011s : 56: predicate.arithmetic_simplify 1.51% : 0.000007s : 56: predicate.cast_eliminate 0.13% : 0.000001s : 4: predicate.check_bprop_eliminate 1.43% : 0.000006s : 56: predicate.compare_switch_simplify 1.52% : 0.000007s : 56: predicate.depend_value_elim 1.49% : 0.000007s : 56: predicate.dict_get_item_const_eliminator 1.58% : 0.000007s : 56: predicate.dict_get_item_eliminator 1.56% : 0.000007s : 56: predicate.dict_set_item_eliminator 0.25% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.10% : 0.000000s : 4: predicate.elim_not_effective 0.18% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.48% : 0.000007s : 56: predicate.environ_add_const_eliminate 1.46% : 0.000006s : 56: predicate.environ_get_add_eliminate 1.45% : 0.000006s : 56: predicate.environ_get_depend_swap 1.52% : 0.000007s : 56: predicate.environ_get_eliminate 1.47% : 0.000007s : 56: predicate.environ_get_set_eliminate 0.09% : 0.000000s : 4: predicate.fold_const_symbol 0.74% : 0.000003s : 21: predicate.get_grad_eliminate 0.08% : 0.000000s : 4: predicate.graph_param_transform 4.39% : 0.000020s : 90: predicate.inline 1.66% : 0.000007s : 46: predicate.inline_without_move 0.36% : 0.000002s : 21: predicate.j_node_and_user_rematch 0.82% : 0.000004s : 21: predicate.less_batch_normalization 1.76% : 0.000008s : 61: predicate.list_to_tuple_eliminator_ 1.85% : 0.000008s : 65: predicate.load_eliminater 0.40% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.83% : 0.000017s : 120: predicate.loop_unroll_before_grad 1.74% : 0.000008s : 60: predicate.make_slice_get_slice_eliminator 1.44% : 0.000006s : 56: predicate.merge_addn 1.49% : 0.000007s : 56: predicate.minmaximum_grad 0.51% : 0.000002s : 4: predicate.mutable_eliminate 0.14% : 0.000001s : 4: predicate.opt_reshape 2.27% : 0.000010s : 65: predicate.partial_eliminate 1.53% : 0.000007s : 56: predicate.print_const_string_wrapper 1.88% : 0.000008s : 56: predicate.reduce_eliminate 1.70% : 0.000008s : 61: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000002s : 21: predicate.remove_not_recompute_node 2.48% : 0.000011s : 113: predicate.replace_applicator 1.05% : 0.000005s : 46: predicate.replace_old_param 0.10% : 0.000000s : 4: predicate.reset_defer_inline 1.62% : 0.000007s : 56: predicate.reshape_eliminate 1.46% : 0.000007s : 56: predicate.row_tensor_add_zeros_like 0.20% : 0.000001s : 4: predicate.row_tensor_eliminate 1.56% : 0.000007s : 56: predicate.same_eliminate 0.44% : 0.000002s : 21: predicate.set_cell_output_no_recompute 0.37% : 0.000002s : 8: predicate.special_op_eliminate 0.76% : 0.000003s : 21: predicate.specialize_transform 1.82% : 0.000008s : 56: predicate.split_environ_get_set_with_tuple_value 1.52% : 0.000007s : 56: predicate.stack_unstack_eliminate 0.13% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.97% : 0.000013s : 82: predicate.switch_defer_inline 2.77% : 0.000012s : 82: predicate.switch_layer_defer_inline 7.60% : 0.000034s : 216: predicate.switch_simplify 1.53% : 0.000007s : 56: predicate.tile_eliminate 1.50% : 0.000007s : 56: predicate.transpose_eliminate 1.94% : 0.000009s : 56: predicate.tuple_list_convert_item_index_to_positive 1.72% : 0.000008s : 56: predicate.tuple_list_get_item_depend_reorder 2.78% : 0.000012s : 69: predicate.tuple_list_get_item_eliminator 1.81% : 0.000008s : 56: predicate.tuple_list_set_item_eliminator 1.76% : 0.000008s : 61: predicate.tuple_to_list_eliminator_ 1.74% : 0.000008s : 65: predicate.updatestate_pure_node_eliminater 2.67% : 0.000012s : 86: predicate.updatestate_useless_node_eliminater 1.91% : 0.000009s : 56: predicate.value_based_eliminate 0.12% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.20% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003285 41 64.01% : 0.002103s : 16: func_graph_cloner_run.FuncGraphClonerGraph 35.99% : 0.001182s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.667373 91 0.01% : 0.000052s : 1: add_recomputation 0.04% : 0.000254s : 1: auto_monad 0.00% : 0.000026s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.14% : 0.000966s : 1: bootstrap 0.00% : 0.000023s : 1: cconv 0.00% : 0.000011s : 1: convert_after_rewriter 0.00% : 0.000028s : 1: cse_after_recomputation 0.00% : 0.000013s : 1: environ_conv 0.03% : 0.000203s : 1: event_method 0.00% : 0.000012s : 1: execute 0.00% : 0.000007s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 4.17% : 0.027808s : 1: jit_opt_a 0.02% : 0.000160s : 1: jit_opt_after_cconv 0.01% : 0.000058s : 1: jit_opt_b 0.07% : 0.000455s : 1: loop_unroll 0.08% : 0.000516s : 1: mutable_eliminate 0.57% : 0.003776s : 39: opt.transform.jit_opt_a 0.01% : 0.000058s : 4: opt.transform.jit_opt_after_cconv 0.00% : 0.000032s : 4: opt.transform.jit_opt_b 0.00% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000015s : 1: opt.transform.mutable_eliminate 0.00% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000037s : 4: opt.transform.symbol_engine_opt 0.07% : 0.000481s : 1: opt_after_jit_grad 0.00% : 0.000008s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000016s : 1: pre_auto_parallel 0.01% : 0.000066s : 1: py_interpret_to_execute 0.00% : 0.000015s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000020s : 1: remove_dup_value 0.50% : 0.003368s : 2: renormalize.infer 0.34% : 0.002239s : 2: renormalize.specialize 0.00% : 0.000005s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000129s : 1: rewriter_after_opt_a 0.03% : 0.000202s : 1: rewriter_before_opt_a 0.01% : 0.000080s : 1: symbol_engine_optimizer 81.35% : 0.542927s : 1: task_emit 12.46% : 0.083160s : 1: type_inference 0.01% : 0.000081s : 1: validate TotalTime = 0.0906639, [33] [bootstrap]: 0.00123908 [type_inference]: 0.0596482 [event_method]: 0.00024172 [auto_monad]: 0.00017175 [graph_reusing]: 6.209e-05 [pre_auto_parallel]: 4.23001e-06 [py_interpret_to_execute]: 5.377e-05 [rewriter_before_opt_a]: 0.00025784 [expand_dump_flag]: 4.61002e-06 [jit_opt_a]: 0.0160765, [3] [Cycle 1]: 0.00858325, [27] [switch_simplify]: 0.00023466 [loop_unroll]: 6.267e-05 [a_1]: 0.00137311 [with_stream_mark]: 3.046e-05 [recompute_prepare]: 2.157e-05 [updatestate_depend_eliminate]: 9.92999e-06 [updatestate_assign_eliminate]: 7.26001e-06 [updatestate_loads_eliminate]: 7.03e-06 [parameter_eliminate]: 2.42001e-06 [specialize_transform]: 1.491e-05 [updatestate_useless_node_eliminater]: 1.447e-05 [accelerated_algorithm]: 1.497e-05 [meta_shard_fg_expand]: 4.42e-06 [get_grad_eliminate_]: 1.468e-05 [merge_forward]: 8.87999e-06 [cell_reuse_recompute_pass]: 9.80013e-07 [cell_reuse_handle_not_recompute_node_pass]: 2.931e-05 [j_node_and_user_rematch]: 2.531e-05 [meta_fg_expand]: 0.00190176 [replace_old_param]: 6.71e-05 [inline_without_move]: 5.929e-05 [renormalize]: 0.00398136 [add_forward_monad_depend]: 1.592e-05 [auto_monad_grad]: 6.63e-06 [auto_monad_eliminator]: 5.88e-05 [cse]: 0.00029192 [replace_applicator]: 7.932e-05 [Cycle 2]: 0.00252403, [27] [switch_simplify]: 4.363e-05 [loop_unroll]: 4.064e-05 [a_1]: 0.00115731 [with_stream_mark]: 1.504e-05 [recompute_prepare]: 8.65999e-06 [updatestate_depend_eliminate]: 3.83001e-06 [updatestate_assign_eliminate]: 3.14001e-06 [updatestate_loads_eliminate]: 2.78e-06 [parameter_eliminate]: 1.80001e-06 [specialize_transform]: 7.11999e-06 [updatestate_useless_node_eliminater]: 7.15998e-06 [accelerated_algorithm]: 7.51001e-06 [meta_shard_fg_expand]: 2.05002e-06 [get_grad_eliminate_]: 6.11998e-06 [merge_forward]: 3.26999e-06 [cell_reuse_recompute_pass]: 1.05999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.374e-05 [j_node_and_user_rematch]: 9.79e-06 [meta_fg_expand]: 0.00019243 [replace_old_param]: 1.413e-05 [inline_without_move]: 6.69999e-06 [renormalize]: 0.0007177 [add_forward_monad_depend]: 5.79e-06 [auto_monad_grad]: 1.96e-06 [auto_monad_eliminator]: 4.799e-05 [cse]: 2.864e-05 [replace_applicator]: 1.311e-05 [Cycle 3]: 0.00037558, [27] [switch_simplify]: 8.05e-06 [loop_unroll]: 6.76999e-06 [a_1]: 0.0001214 [with_stream_mark]: 9.27001e-06 [recompute_prepare]: 6.56999e-06 [updatestate_depend_eliminate]: 3.27002e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.58e-06 [parameter_eliminate]: 1.05001e-06 [specialize_transform]: 6.31e-06 [updatestate_useless_node_eliminater]: 6.23e-06 [accelerated_algorithm]: 6.26e-06 [meta_shard_fg_expand]: 1.60999e-06 [get_grad_eliminate_]: 5.99999e-06 [merge_forward]: 2.94999e-06 [cell_reuse_recompute_pass]: 1.53002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.469e-05 [j_node_and_user_rematch]: 9.52001e-06 [meta_fg_expand]: 1.96e-06 [replace_old_param]: 9.60001e-06 [inline_without_move]: 6.11998e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.43002e-06 [auto_monad_grad]: 7.00005e-07 [auto_monad_eliminator]: 6.72002e-06 [cse]: 1.517e-05 [replace_applicator]: 6.07999e-06 [py_interpret_to_execute_after_opt_a]: 1.45e-05 [rewriter_after_opt_a]: 3.923e-05 [convert_after_rewriter]: 7.1e-06 [order_py_execute_after_rewriter]: 5.25999e-06 [mutable_eliminate]: 0.0006705 [jit_opt_b]: 5.847e-05, [1] [Cycle 1]: 5.053e-05, [2] [frontend_op_eliminate]: 2.036e-05 [inline_after_opt_a]: 1.908e-05 [cconv]: 2.494e-05 [loop_unroll]: 0.00045805 [jit_opt_after_cconv]: 0.00016691, [1] [Cycle 1]: 0.00015993, [11] [c_1]: 3.134e-05 [parameter_eliminate]: 2.56e-06 [updatestate_depend_eliminate]: 5.50001e-06 [updatestate_assign_eliminate]: 2.96001e-06 [updatestate_loads_eliminate]: 2.61e-06 [cse]: 2.426e-05 [call_graph_tuple_transform]: 2.264e-05 [tuple_list_get_item_eliminator]: 6.83e-06 [none_parameter_eliminate]: 1.62999e-06 [renormalize]: 7.2e-07 [switch_simplify]: 6.69001e-06 [remove_dup_value]: 1.771e-05 [partial_unused_args_eliminate]: 1.99999e-06 [environ_conv]: 6.38e-06 [add_recomputation]: 4.772e-05 [cse_after_recomputation]: 2.595e-05, [1] [Cycle 1]: 2.04e-05, [1] [cse]: 1.431e-05 [auto_monad_reorder]: 1.791e-05 [get_jit_bprop_graph]: 1.97999e-06 [rewriter_after_jit_bprop_graph]: 6.12999e-06 [opt_after_jit_grad]: 0.00051834 [symbol_engine_optimizer]: 8.427e-05, [1] [Cycle 1]: 7.739e-05, [6] [build]: 4.07e-06 [elim_shapecalc]: 9.46998e-06 [elim_not_effective]: 1.565e-05 [opt_reshape]: 8.38001e-06 [fold_const_symbol]: 1.061e-05 [renormalize]: 4.20026e-07 [validate]: 4.282e-05 [backend_pass]: 1.29003e-06 [task_emit]: 0.0103822 [execute]: 7.83001e-06 Sums bootstrap : 0.001239s : 1.46% type_inference : 0.059648s : 70.07% event_method : 0.000242s : 0.28% auto_monad : 0.000172s : 0.20% graph_reusing : 0.000062s : 0.07% pre_auto_parallel : 0.000004s : 0.00% py_interpret_to_execute : 0.000054s : 0.06% rewriter_before_opt_a : 0.000258s : 0.30% expand_dump_flag : 0.000005s : 0.01% jit_opt_a.switch_simplify : 0.000286s : 0.34% jit_opt_a.loop_unroll : 0.000110s : 0.13% jit_opt_a.a_1 : 0.002652s : 3.11% jit_opt_a.with_stream_mark : 0.000055s : 0.06% jit_opt_a.recompute_prepare : 0.000037s : 0.04% jit_opt_a.updatestate_depend_eliminate : 0.000017s : 0.02% jit_opt_a.updatestate_assign_eliminate : 0.000013s : 0.02% jit_opt_a.updatestate_loads_eliminate : 0.000012s : 0.01% jit_opt_a.parameter_eliminate : 0.000005s : 0.01% jit_opt_a.specialize_transform : 0.000028s : 0.03% jit_opt_a.updatestate_useless_node_eliminater : 0.000028s : 0.03% jit_opt_a.accelerated_algorithm : 0.000029s : 0.03% jit_opt_a.meta_shard_fg_expand : 0.000008s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000027s : 0.03% jit_opt_a.merge_forward : 0.000015s : 0.02% jit_opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000058s : 0.07% jit_opt_a.j_node_and_user_rematch : 0.000045s : 0.05% jit_opt_a.meta_fg_expand : 0.002096s : 2.46% jit_opt_a.replace_old_param : 0.000091s : 0.11% jit_opt_a.inline_without_move : 0.000072s : 0.08% jit_opt_a.renormalize : 0.004699s : 5.52% jit_opt_a.add_forward_monad_depend : 0.000023s : 0.03% jit_opt_a.auto_monad_grad : 0.000009s : 0.01% jit_opt_a.auto_monad_eliminator : 0.000114s : 0.13% jit_opt_a.cse : 0.000336s : 0.39% jit_opt_a.replace_applicator : 0.000099s : 0.12% py_interpret_to_execute_after_opt_a : 0.000014s : 0.02% rewriter_after_opt_a : 0.000039s : 0.05% convert_after_rewriter : 0.000007s : 0.01% order_py_execute_after_rewriter : 0.000005s : 0.01% mutable_eliminate : 0.000670s : 0.79% jit_opt_b.frontend_op_eliminate : 0.000020s : 0.02% jit_opt_b.inline_after_opt_a : 0.000019s : 0.02% cconv : 0.000025s : 0.03% loop_unroll : 0.000458s : 0.54% jit_opt_after_cconv.c_1 : 0.000031s : 0.04% jit_opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000024s : 0.03% jit_opt_after_cconv.call_graph_tuple_transform : 0.000023s : 0.03% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000007s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000001s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000007s : 0.01% remove_dup_value : 0.000018s : 0.02% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000006s : 0.01% add_recomputation : 0.000048s : 0.06% cse_after_recomputation.cse : 0.000014s : 0.02% auto_monad_reorder : 0.000018s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000518s : 0.61% symbol_engine_optimizer.build : 0.000004s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.01% symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.02% symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000043s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.010382s : 12.20% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.000684 128 0.34% : 0.000002s : 2: substitution.elim_not_effective 0.22% : 0.000002s : 2: substitution.fold_const_symbol 0.79% : 0.000005s : 4: substitution.graph_param_transform 71.25% : 0.000487s : 21: substitution.inline 2.60% : 0.000018s : 2: substitution.inline_without_move 1.34% : 0.000009s : 12: substitution.j_node_and_user_rematch 1.53% : 0.000010s : 7: substitution.minmaximum_grad 1.54% : 0.000011s : 11: substitution.partial_eliminate 1.50% : 0.000010s : 12: substitution.remove_not_recompute_node 3.75% : 0.000026s : 9: substitution.replace_applicator 1.53% : 0.000010s : 14: substitution.replace_old_param 0.38% : 0.000003s : 1: substitution.set_cell_output_no_recompute 2.59% : 0.000018s : 5: substitution.switch_simplify 3.32% : 0.000023s : 7: substitution.tuple_list_convert_item_index_to_positive 2.29% : 0.000016s : 7: substitution.tuple_list_get_item_depend_reorder 5.01% : 0.000034s : 12: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.059503 2 94.42% : 0.056180s : 1: type_inference.infer 5.58% : 0.003323s : 1: type_inference.specialize ------[replace.] 0.000277 31 53.62% : 0.000148s : 21: replace.inline 29.15% : 0.000081s : 5: replace.switch_simplify 17.23% : 0.000048s : 5: replace.tuple_list_get_item_eliminator ------[match.] 0.000500 31 95.26% : 0.000476s : 21: match.inline 2.84% : 0.000014s : 5: match.switch_simplify 1.90% : 0.000009s : 5: match.tuple_list_get_item_eliminator ------[predicate.] 0.000453 3262 1.57% : 0.000007s : 56: predicate.accumulaten_eliminater 0.38% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 1.45% : 0.000007s : 56: predicate.addn_check_dump 1.59% : 0.000007s : 56: predicate.addn_zero_filter 2.10% : 0.000010s : 56: predicate.arithmetic_simplify 1.57% : 0.000007s : 56: predicate.cast_eliminate 0.17% : 0.000001s : 4: predicate.check_bprop_eliminate 1.46% : 0.000007s : 56: predicate.compare_switch_simplify 1.47% : 0.000007s : 56: predicate.depend_value_elim 1.53% : 0.000007s : 56: predicate.dict_get_item_const_eliminator 1.60% : 0.000007s : 56: predicate.dict_get_item_eliminator 1.49% : 0.000007s : 56: predicate.dict_set_item_eliminator 0.24% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.11% : 0.000001s : 4: predicate.elim_not_effective 0.19% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.51% : 0.000007s : 56: predicate.environ_add_const_eliminate 1.45% : 0.000007s : 56: predicate.environ_get_add_eliminate 1.50% : 0.000007s : 56: predicate.environ_get_depend_swap 1.55% : 0.000007s : 56: predicate.environ_get_eliminate 1.43% : 0.000007s : 56: predicate.environ_get_set_eliminate 0.08% : 0.000000s : 4: predicate.fold_const_symbol 0.75% : 0.000003s : 21: predicate.get_grad_eliminate 0.09% : 0.000000s : 4: predicate.graph_param_transform 4.12% : 0.000019s : 90: predicate.inline 1.64% : 0.000007s : 46: predicate.inline_without_move 0.34% : 0.000002s : 21: predicate.j_node_and_user_rematch 0.94% : 0.000004s : 21: predicate.less_batch_normalization 1.76% : 0.000008s : 61: predicate.list_to_tuple_eliminator_ 1.82% : 0.000008s : 65: predicate.load_eliminater 0.49% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.71% : 0.000017s : 120: predicate.loop_unroll_before_grad 1.84% : 0.000008s : 60: predicate.make_slice_get_slice_eliminator 1.44% : 0.000007s : 56: predicate.merge_addn 1.48% : 0.000007s : 56: predicate.minmaximum_grad 0.46% : 0.000002s : 4: predicate.mutable_eliminate 0.16% : 0.000001s : 4: predicate.opt_reshape 2.32% : 0.000011s : 65: predicate.partial_eliminate 1.55% : 0.000007s : 56: predicate.print_const_string_wrapper 2.02% : 0.000009s : 56: predicate.reduce_eliminate 1.69% : 0.000008s : 61: predicate.redundant_stop_gradient_eliminater 0.43% : 0.000002s : 21: predicate.remove_not_recompute_node 2.49% : 0.000011s : 113: predicate.replace_applicator 0.94% : 0.000004s : 46: predicate.replace_old_param 0.11% : 0.000000s : 4: predicate.reset_defer_inline 1.61% : 0.000007s : 56: predicate.reshape_eliminate 1.46% : 0.000007s : 56: predicate.row_tensor_add_zeros_like 0.24% : 0.000001s : 4: predicate.row_tensor_eliminate 1.59% : 0.000007s : 56: predicate.same_eliminate 0.43% : 0.000002s : 21: predicate.set_cell_output_no_recompute 0.31% : 0.000001s : 8: predicate.special_op_eliminate 0.72% : 0.000003s : 21: predicate.specialize_transform 1.81% : 0.000008s : 56: predicate.split_environ_get_set_with_tuple_value 1.58% : 0.000007s : 56: predicate.stack_unstack_eliminate 0.15% : 0.000001s : 4: predicate.switch_call_monad_eliminater 3.00% : 0.000014s : 82: predicate.switch_defer_inline 2.66% : 0.000012s : 82: predicate.switch_layer_defer_inline 7.54% : 0.000034s : 216: predicate.switch_simplify 1.50% : 0.000007s : 56: predicate.tile_eliminate 1.47% : 0.000007s : 56: predicate.transpose_eliminate 1.89% : 0.000009s : 56: predicate.tuple_list_convert_item_index_to_positive 1.77% : 0.000008s : 56: predicate.tuple_list_get_item_depend_reorder 2.82% : 0.000013s : 69: predicate.tuple_list_get_item_eliminator 1.92% : 0.000009s : 56: predicate.tuple_list_set_item_eliminator 1.74% : 0.000008s : 61: predicate.tuple_to_list_eliminator_ 1.78% : 0.000008s : 65: predicate.updatestate_pure_node_eliminater 2.69% : 0.000012s : 86: predicate.updatestate_useless_node_eliminater 2.10% : 0.000010s : 56: predicate.value_based_eliminate 0.10% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.17% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003530 41 67.60% : 0.002386s : 16: func_graph_cloner_run.FuncGraphClonerGraph 32.40% : 0.001144s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.098951 91 0.05% : 0.000051s : 1: add_recomputation 0.18% : 0.000180s : 1: auto_monad 0.02% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 1.28% : 0.001269s : 1: bootstrap 0.03% : 0.000027s : 1: cconv 0.01% : 0.000009s : 1: convert_after_rewriter 0.03% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000009s : 1: environ_conv 0.25% : 0.000249s : 1: event_method 0.01% : 0.000012s : 1: execute 0.01% : 0.000007s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.07% : 0.000066s : 1: graph_reusing 16.25% : 0.016080s : 1: jit_opt_a 0.17% : 0.000170s : 1: jit_opt_after_cconv 0.06% : 0.000061s : 1: jit_opt_b 0.47% : 0.000466s : 1: loop_unroll 0.69% : 0.000679s : 1: mutable_eliminate 3.55% : 0.003508s : 39: opt.transform.jit_opt_a 0.06% : 0.000063s : 4: opt.transform.jit_opt_after_cconv 0.03% : 0.000033s : 4: opt.transform.jit_opt_b 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000016s : 1: opt.transform.mutable_eliminate 0.03% : 0.000028s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000040s : 4: opt.transform.symbol_engine_opt 0.53% : 0.000527s : 1: opt_after_jit_grad 0.01% : 0.000007s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.01% : 0.000006s : 1: pre_auto_parallel 0.06% : 0.000058s : 1: py_interpret_to_execute 0.02% : 0.000017s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000020s : 1: remove_dup_value 2.56% : 0.002529s : 2: renormalize.infer 2.18% : 0.002153s : 2: renormalize.specialize 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000042s : 1: rewriter_after_opt_a 0.26% : 0.000262s : 1: rewriter_before_opt_a 0.09% : 0.000087s : 1: symbol_engine_optimizer 10.51% : 0.010396s : 1: task_emit 60.30% : 0.059669s : 1: type_inference 0.07% : 0.000067s : 1: validate TotalTime = 0.112808, [33] [bootstrap]: 0.00038821 [type_inference]: 0.0789983 [event_method]: 0.00032284 [auto_monad]: 0.00020733 [graph_reusing]: 1.081e-05 [pre_auto_parallel]: 4.37e-06 [py_interpret_to_execute]: 4.949e-05 [rewriter_before_opt_a]: 0.00016078 [expand_dump_flag]: 5.00999e-06 [jit_opt_a]: 0.0232595, [3] [Cycle 1]: 0.0167446, [27] [switch_simplify]: 0.00020994 [loop_unroll]: 0.0001134 [a_1]: 0.00138741 [with_stream_mark]: 2.968e-05 [recompute_prepare]: 2.339e-05 [updatestate_depend_eliminate]: 8.98002e-06 [updatestate_assign_eliminate]: 7.75e-06 [updatestate_loads_eliminate]: 7.27997e-06 [parameter_eliminate]: 2.94999e-06 [specialize_transform]: 1.68e-05 [updatestate_useless_node_eliminater]: 1.596e-05 [accelerated_algorithm]: 1.528e-05 [meta_shard_fg_expand]: 5.35999e-06 [get_grad_eliminate_]: 1.578e-05 [merge_forward]: 1.003e-05 [cell_reuse_recompute_pass]: 1.09e-06 [cell_reuse_handle_not_recompute_node_pass]: 3.132e-05 [j_node_and_user_rematch]: 2.672e-05 [meta_fg_expand]: 0.00424121 [replace_old_param]: 8.916e-05 [inline_without_move]: 8.167e-05 [renormalize]: 0.00758613 [add_forward_monad_depend]: 1.178e-05 [auto_monad_grad]: 7.47002e-06 [auto_monad_eliminator]: 0.00011062 [cse]: 0.00032222 [replace_applicator]: 8.187e-05 [Cycle 2]: 0.00269787, [27] [switch_simplify]: 4.526e-05 [loop_unroll]: 4.186e-05 [a_1]: 0.00117732 [with_stream_mark]: 1.362e-05 [recompute_prepare]: 8.45999e-06 [updatestate_depend_eliminate]: 3.80998e-06 [updatestate_assign_eliminate]: 3.02002e-06 [updatestate_loads_eliminate]: 3.00002e-06 [parameter_eliminate]: 1.55999e-06 [specialize_transform]: 7.71001e-06 [updatestate_useless_node_eliminater]: 6.82002e-06 [accelerated_algorithm]: 6.44999e-06 [meta_shard_fg_expand]: 1.92001e-06 [get_grad_eliminate_]: 6.06e-06 [merge_forward]: 3.36001e-06 [cell_reuse_recompute_pass]: 9.5999e-07 [cell_reuse_handle_not_recompute_node_pass]: 1.359e-05 [j_node_and_user_rematch]: 9.84001e-06 [meta_fg_expand]: 0.00024382 [replace_old_param]: 1.634e-05 [inline_without_move]: 7e-06 [renormalize]: 0.00085791 [add_forward_monad_depend]: 4.94e-06 [auto_monad_grad]: 1.39e-06 [auto_monad_eliminator]: 1.297e-05 [cse]: 2.479e-05 [replace_applicator]: 1.346e-05 [Cycle 3]: 0.00037586, [27] [switch_simplify]: 7.92e-06 [loop_unroll]: 6.73e-06 [a_1]: 0.00012302 [with_stream_mark]: 9.34e-06 [recompute_prepare]: 6.34999e-06 [updatestate_depend_eliminate]: 3.33e-06 [updatestate_assign_eliminate]: 2.71999e-06 [updatestate_loads_eliminate]: 2.58998e-06 [parameter_eliminate]: 1.02998e-06 [specialize_transform]: 6.39999e-06 [updatestate_useless_node_eliminater]: 6.43003e-06 [accelerated_algorithm]: 6.14001e-06 [meta_shard_fg_expand]: 1.27999e-06 [get_grad_eliminate_]: 6.06998e-06 [merge_forward]: 3.22002e-06 [cell_reuse_recompute_pass]: 1.57001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.38e-05 [j_node_and_user_rematch]: 9.65002e-06 [meta_fg_expand]: 1.94e-06 [replace_old_param]: 9.51e-06 [inline_without_move]: 6.14999e-06 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.09e-06 [auto_monad_grad]: 7.2e-07 [auto_monad_eliminator]: 6.17999e-06 [cse]: 1.391e-05 [replace_applicator]: 6.60997e-06 [py_interpret_to_execute_after_opt_a]: 1.183e-05 [rewriter_after_opt_a]: 3.984e-05 [convert_after_rewriter]: 7.97e-06 [order_py_execute_after_rewriter]: 5.20999e-06 [mutable_eliminate]: 0.00071026 [jit_opt_b]: 6.017e-05, [1] [Cycle 1]: 5.294e-05, [2] [frontend_op_eliminate]: 2.072e-05 [inline_after_opt_a]: 2.059e-05 [cconv]: 2.228e-05 [loop_unroll]: 0.00045506 [jit_opt_after_cconv]: 0.00015781, [1] [Cycle 1]: 0.00015118, [11] [c_1]: 2.778e-05 [parameter_eliminate]: 2.68e-06 [updatestate_depend_eliminate]: 5.79e-06 [updatestate_assign_eliminate]: 2.81999e-06 [updatestate_loads_eliminate]: 2.59999e-06 [cse]: 2.231e-05 [call_graph_tuple_transform]: 2.08e-05 [tuple_list_get_item_eliminator]: 6.92002e-06 [none_parameter_eliminate]: 1.64e-06 [renormalize]: 4.89992e-07 [switch_simplify]: 6.96001e-06 [remove_dup_value]: 1.728e-05 [partial_unused_args_eliminate]: 2.29999e-06 [environ_conv]: 6.21e-06 [add_recomputation]: 4.739e-05 [cse_after_recomputation]: 2.5e-05, [1] [Cycle 1]: 1.944e-05, [1] [cse]: 1.373e-05 [auto_monad_reorder]: 1.814e-05 [get_jit_bprop_graph]: 1.76e-06 [rewriter_after_jit_bprop_graph]: 5.53002e-06 [opt_after_jit_grad]: 0.00049195 [symbol_engine_optimizer]: 9.826e-05, [1] [Cycle 1]: 9.203e-05, [6] [build]: 3.67002e-06 [elim_shapecalc]: 9.09e-06 [elim_not_effective]: 1.445e-05 [opt_reshape]: 8.64e-06 [fold_const_symbol]: 1.11e-05 [renormalize]: 4.19997e-07 [validate]: 4.13e-05 [backend_pass]: 9.89996e-07 [task_emit]: 0.00693867 [execute]: 6.83e-06 Sums bootstrap : 0.000388s : 0.36% type_inference : 0.078998s : 74.22% event_method : 0.000323s : 0.30% auto_monad : 0.000207s : 0.19% graph_reusing : 0.000011s : 0.01% pre_auto_parallel : 0.000004s : 0.00% py_interpret_to_execute : 0.000049s : 0.05% rewriter_before_opt_a : 0.000161s : 0.15% expand_dump_flag : 0.000005s : 0.00% jit_opt_a.switch_simplify : 0.000263s : 0.25% jit_opt_a.loop_unroll : 0.000162s : 0.15% jit_opt_a.a_1 : 0.002688s : 2.53% jit_opt_a.with_stream_mark : 0.000053s : 0.05% jit_opt_a.recompute_prepare : 0.000038s : 0.04% jit_opt_a.updatestate_depend_eliminate : 0.000016s : 0.02% jit_opt_a.updatestate_assign_eliminate : 0.000013s : 0.01% jit_opt_a.updatestate_loads_eliminate : 0.000013s : 0.01% jit_opt_a.parameter_eliminate : 0.000006s : 0.01% jit_opt_a.specialize_transform : 0.000031s : 0.03% jit_opt_a.updatestate_useless_node_eliminater : 0.000029s : 0.03% jit_opt_a.accelerated_algorithm : 0.000028s : 0.03% jit_opt_a.meta_shard_fg_expand : 0.000009s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000028s : 0.03% jit_opt_a.merge_forward : 0.000017s : 0.02% jit_opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000059s : 0.06% jit_opt_a.j_node_and_user_rematch : 0.000046s : 0.04% jit_opt_a.meta_fg_expand : 0.004487s : 4.22% jit_opt_a.replace_old_param : 0.000115s : 0.11% jit_opt_a.inline_without_move : 0.000095s : 0.09% jit_opt_a.renormalize : 0.008444s : 7.93% jit_opt_a.add_forward_monad_depend : 0.000018s : 0.02% jit_opt_a.auto_monad_grad : 0.000010s : 0.01% jit_opt_a.auto_monad_eliminator : 0.000130s : 0.12% jit_opt_a.cse : 0.000361s : 0.34% jit_opt_a.replace_applicator : 0.000102s : 0.10% py_interpret_to_execute_after_opt_a : 0.000012s : 0.01% rewriter_after_opt_a : 0.000040s : 0.04% convert_after_rewriter : 0.000008s : 0.01% order_py_execute_after_rewriter : 0.000005s : 0.00% mutable_eliminate : 0.000710s : 0.67% jit_opt_b.frontend_op_eliminate : 0.000021s : 0.02% jit_opt_b.inline_after_opt_a : 0.000021s : 0.02% cconv : 0.000022s : 0.02% loop_unroll : 0.000455s : 0.43% jit_opt_after_cconv.c_1 : 0.000028s : 0.03% jit_opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000022s : 0.02% jit_opt_after_cconv.call_graph_tuple_transform : 0.000021s : 0.02% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000007s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000007s : 0.01% remove_dup_value : 0.000017s : 0.02% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000006s : 0.01% add_recomputation : 0.000047s : 0.04% cse_after_recomputation.cse : 0.000014s : 0.01% auto_monad_reorder : 0.000018s : 0.02% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000006s : 0.01% opt_after_jit_grad : 0.000492s : 0.46% symbol_engine_optimizer.build : 0.000004s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.01% symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.01% symbol_engine_optimizer.opt_reshape : 0.000009s : 0.01% symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000041s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.006939s : 6.52% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.000693 134 0.28% : 0.000002s : 2: substitution.elim_not_effective 0.40% : 0.000003s : 2: substitution.fold_const_symbol 0.85% : 0.000006s : 4: substitution.graph_param_transform 70.66% : 0.000489s : 21: substitution.inline 3.28% : 0.000023s : 3: substitution.inline_without_move 1.35% : 0.000009s : 13: substitution.j_node_and_user_rematch 1.54% : 0.000011s : 7: substitution.minmaximum_grad 1.52% : 0.000011s : 11: substitution.partial_eliminate 1.53% : 0.000011s : 13: substitution.remove_not_recompute_node 3.65% : 0.000025s : 9: substitution.replace_applicator 1.85% : 0.000013s : 17: substitution.replace_old_param 0.35% : 0.000002s : 1: substitution.set_cell_output_no_recompute 2.14% : 0.000015s : 5: substitution.switch_simplify 3.22% : 0.000022s : 7: substitution.tuple_list_convert_item_index_to_positive 2.14% : 0.000015s : 7: substitution.tuple_list_get_item_depend_reorder 5.24% : 0.000036s : 12: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.078862 2 95.49% : 0.075309s : 1: type_inference.infer 4.51% : 0.003554s : 1: type_inference.specialize ------[replace.] 0.000271 31 55.34% : 0.000150s : 21: replace.inline 26.03% : 0.000071s : 5: replace.switch_simplify 18.63% : 0.000051s : 5: replace.tuple_list_get_item_eliminator ------[match.] 0.000501 31 95.56% : 0.000479s : 21: match.inline 2.40% : 0.000012s : 5: match.switch_simplify 2.04% : 0.000010s : 5: match.tuple_list_get_item_eliminator ------[predicate.] 0.000469 3351 1.48% : 0.000007s : 57: predicate.accumulaten_eliminater 0.37% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 1.41% : 0.000007s : 57: predicate.addn_check_dump 1.71% : 0.000008s : 57: predicate.addn_zero_filter 2.27% : 0.000011s : 57: predicate.arithmetic_simplify 1.49% : 0.000007s : 57: predicate.cast_eliminate 0.15% : 0.000001s : 4: predicate.check_bprop_eliminate 1.44% : 0.000007s : 57: predicate.compare_switch_simplify 1.44% : 0.000007s : 57: predicate.depend_value_elim 1.41% : 0.000007s : 57: predicate.dict_get_item_const_eliminator 1.50% : 0.000007s : 57: predicate.dict_get_item_eliminator 1.40% : 0.000007s : 57: predicate.dict_set_item_eliminator 0.30% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.10% : 0.000000s : 4: predicate.elim_not_effective 0.16% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.44% : 0.000007s : 57: predicate.environ_add_const_eliminate 1.40% : 0.000007s : 57: predicate.environ_get_add_eliminate 1.40% : 0.000007s : 57: predicate.environ_get_depend_swap 1.47% : 0.000007s : 57: predicate.environ_get_eliminate 1.42% : 0.000007s : 57: predicate.environ_get_set_eliminate 0.08% : 0.000000s : 4: predicate.fold_const_symbol 0.77% : 0.000004s : 22: predicate.get_grad_eliminate 0.07% : 0.000000s : 4: predicate.graph_param_transform 4.27% : 0.000020s : 91: predicate.inline 2.11% : 0.000010s : 64: predicate.inline_without_move 0.35% : 0.000002s : 22: predicate.j_node_and_user_rematch 0.82% : 0.000004s : 22: predicate.less_batch_normalization 1.77% : 0.000008s : 62: predicate.list_to_tuple_eliminator_ 1.78% : 0.000008s : 66: predicate.load_eliminater 0.42% : 0.000002s : 4: predicate.loop_unroll_after_grad 5.02% : 0.000024s : 121: predicate.loop_unroll_before_grad 1.78% : 0.000008s : 61: predicate.make_slice_get_slice_eliminator 1.40% : 0.000007s : 57: predicate.merge_addn 1.44% : 0.000007s : 57: predicate.minmaximum_grad 0.44% : 0.000002s : 4: predicate.mutable_eliminate 0.14% : 0.000001s : 4: predicate.opt_reshape 2.16% : 0.000010s : 66: predicate.partial_eliminate 1.54% : 0.000007s : 57: predicate.print_const_string_wrapper 1.88% : 0.000009s : 57: predicate.reduce_eliminate 1.75% : 0.000008s : 62: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000002s : 22: predicate.remove_not_recompute_node 2.42% : 0.000011s : 114: predicate.replace_applicator 1.19% : 0.000006s : 64: predicate.replace_old_param 0.10% : 0.000000s : 4: predicate.reset_defer_inline 1.57% : 0.000007s : 57: predicate.reshape_eliminate 1.50% : 0.000007s : 57: predicate.row_tensor_add_zeros_like 0.25% : 0.000001s : 4: predicate.row_tensor_eliminate 1.52% : 0.000007s : 57: predicate.same_eliminate 0.43% : 0.000002s : 22: predicate.set_cell_output_no_recompute 0.31% : 0.000001s : 8: predicate.special_op_eliminate 0.75% : 0.000003s : 22: predicate.specialize_transform 1.76% : 0.000008s : 57: predicate.split_environ_get_set_with_tuple_value 1.55% : 0.000007s : 57: predicate.stack_unstack_eliminate 0.13% : 0.000001s : 4: predicate.switch_call_monad_eliminater 3.00% : 0.000014s : 83: predicate.switch_defer_inline 2.63% : 0.000012s : 83: predicate.switch_layer_defer_inline 7.12% : 0.000033s : 218: predicate.switch_simplify 1.54% : 0.000007s : 57: predicate.tile_eliminate 1.45% : 0.000007s : 57: predicate.transpose_eliminate 1.84% : 0.000009s : 57: predicate.tuple_list_convert_item_index_to_positive 1.67% : 0.000008s : 57: predicate.tuple_list_get_item_depend_reorder 2.83% : 0.000013s : 70: predicate.tuple_list_get_item_eliminator 1.88% : 0.000009s : 57: predicate.tuple_list_set_item_eliminator 1.66% : 0.000008s : 62: predicate.tuple_to_list_eliminator_ 1.73% : 0.000008s : 66: predicate.updatestate_pure_node_eliminater 2.73% : 0.000013s : 88: predicate.updatestate_useless_node_eliminater 1.93% : 0.000009s : 57: predicate.value_based_eliminate 0.13% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.24% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003601 51 65.18% : 0.002347s : 23: func_graph_cloner_run.FuncGraphClonerGraph 34.82% : 0.001254s : 28: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.125011 91 0.04% : 0.000050s : 1: add_recomputation 0.17% : 0.000216s : 1: auto_monad 0.02% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.33% : 0.000414s : 1: bootstrap 0.02% : 0.000025s : 1: cconv 0.01% : 0.000010s : 1: convert_after_rewriter 0.02% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: environ_conv 0.26% : 0.000331s : 1: event_method 0.01% : 0.000010s : 1: execute 0.01% : 0.000007s : 1: expand_dump_flag 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000014s : 1: graph_reusing 18.61% : 0.023264s : 1: jit_opt_a 0.13% : 0.000161s : 1: jit_opt_after_cconv 0.05% : 0.000063s : 1: jit_opt_b 0.37% : 0.000463s : 1: loop_unroll 0.58% : 0.000719s : 1: mutable_eliminate 2.91% : 0.003633s : 39: opt.transform.jit_opt_a 0.05% : 0.000059s : 4: opt.transform.jit_opt_after_cconv 0.03% : 0.000035s : 4: opt.transform.jit_opt_b 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000016s : 1: opt.transform.mutable_eliminate 0.02% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.03% : 0.000040s : 4: opt.transform.symbol_engine_opt 0.40% : 0.000501s : 1: opt_after_jit_grad 0.01% : 0.000007s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.01% : 0.000007s : 1: pre_auto_parallel 0.04% : 0.000053s : 1: py_interpret_to_execute 0.01% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000020s : 1: remove_dup_value 4.57% : 0.005717s : 2: renormalize.infer 2.17% : 0.002710s : 2: renormalize.specialize 0.01% : 0.000008s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000043s : 1: rewriter_after_opt_a 0.13% : 0.000164s : 1: rewriter_before_opt_a 0.08% : 0.000101s : 1: symbol_engine_optimizer 5.56% : 0.006948s : 1: task_emit 63.21% : 0.079018s : 1: type_inference 0.05% : 0.000063s : 1: validate TotalTime = 0.0707601, [33] [bootstrap]: 0.00065085 [type_inference]: 0.0443758 [event_method]: 0.00023872 [auto_monad]: 0.0001617 [graph_reusing]: 1.008e-05 [pre_auto_parallel]: 4.57998e-06 [py_interpret_to_execute]: 4.935e-05 [rewriter_before_opt_a]: 0.00015713 [expand_dump_flag]: 4.63999e-06 [jit_opt_a]: 0.0141176, [3] [Cycle 1]: 0.00789204, [27] [switch_simplify]: 0.00020802 [loop_unroll]: 6.46e-05 [a_1]: 0.00128612 [with_stream_mark]: 2.713e-05 [recompute_prepare]: 6.669e-05 [updatestate_depend_eliminate]: 9.16002e-06 [updatestate_assign_eliminate]: 7.09001e-06 [updatestate_loads_eliminate]: 6.86999e-06 [parameter_eliminate]: 2.78998e-06 [specialize_transform]: 1.585e-05 [updatestate_useless_node_eliminater]: 1.42e-05 [accelerated_algorithm]: 1.429e-05 [meta_shard_fg_expand]: 4.40999e-06 [get_grad_eliminate_]: 1.444e-05 [merge_forward]: 8.70001e-06 [cell_reuse_recompute_pass]: 8.89995e-07 [cell_reuse_handle_not_recompute_node_pass]: 2.845e-05 [j_node_and_user_rematch]: 2.465e-05 [meta_fg_expand]: 0.00162904 [replace_old_param]: 6.355e-05 [inline_without_move]: 5.655e-05 [renormalize]: 0.00368007 [add_forward_monad_depend]: 1.048e-05 [auto_monad_grad]: 6.14001e-06 [auto_monad_eliminator]: 5.671e-05 [cse]: 0.00029303 [replace_applicator]: 7.582e-05 [Cycle 2]: 0.00236766, [27] [switch_simplify]: 4.394e-05 [loop_unroll]: 4.205e-05 [a_1]: 0.00112901 [with_stream_mark]: 1.293e-05 [recompute_prepare]: 8.45999e-06 [updatestate_depend_eliminate]: 3.55998e-06 [updatestate_assign_eliminate]: 2.73e-06 [updatestate_loads_eliminate]: 2.76e-06 [parameter_eliminate]: 1.14e-06 [specialize_transform]: 7.01001e-06 [updatestate_useless_node_eliminater]: 6.53e-06 [accelerated_algorithm]: 6.38e-06 [meta_shard_fg_expand]: 1.94e-06 [get_grad_eliminate_]: 6.19001e-06 [merge_forward]: 2.97002e-06 [cell_reuse_recompute_pass]: 7.7e-07 [cell_reuse_handle_not_recompute_node_pass]: 1.279e-05 [j_node_and_user_rematch]: 9.02e-06 [meta_fg_expand]: 0.00017366 [replace_old_param]: 1.397e-05 [inline_without_move]: 6.93e-06 [renormalize]: 0.00066701 [add_forward_monad_depend]: 4.62e-06 [auto_monad_grad]: 1.12e-06 [auto_monad_eliminator]: 1.101e-05 [cse]: 2.28e-05 [replace_applicator]: 1.322e-05 [Cycle 3]: 0.00037026, [27] [switch_simplify]: 7.23999e-06 [loop_unroll]: 6.61999e-06 [a_1]: 0.00011937 [with_stream_mark]: 9.09e-06 [recompute_prepare]: 6.26e-06 [updatestate_depend_eliminate]: 3.33e-06 [updatestate_assign_eliminate]: 2.89001e-06 [updatestate_loads_eliminate]: 2.48e-06 [parameter_eliminate]: 8.70001e-07 [specialize_transform]: 6.35002e-06 [updatestate_useless_node_eliminater]: 6.22001e-06 [accelerated_algorithm]: 6.14001e-06 [meta_shard_fg_expand]: 1.50999e-06 [get_grad_eliminate_]: 5.93998e-06 [merge_forward]: 2.96001e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.418e-05 [j_node_and_user_rematch]: 9.05001e-06 [meta_fg_expand]: 1.99999e-06 [replace_old_param]: 9.30001e-06 [inline_without_move]: 6.11998e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.04e-06 [auto_monad_grad]: 8.50006e-07 [auto_monad_eliminator]: 5.91e-06 [cse]: 1.485e-05 [replace_applicator]: 6.32001e-06 [py_interpret_to_execute_after_opt_a]: 1.137e-05 [rewriter_after_opt_a]: 3.74e-05 [convert_after_rewriter]: 6.93e-06 [order_py_execute_after_rewriter]: 5.37001e-06 [mutable_eliminate]: 0.00053838 [jit_opt_b]: 5.617e-05, [1] [Cycle 1]: 4.953e-05, [2] [frontend_op_eliminate]: 2.016e-05 [inline_after_opt_a]: 1.811e-05 [cconv]: 2.148e-05 [loop_unroll]: 0.00043308 [jit_opt_after_cconv]: 0.00015395, [1] [Cycle 1]: 0.00014779, [11] [c_1]: 2.653e-05 [parameter_eliminate]: 2.27999e-06 [updatestate_depend_eliminate]: 5.72999e-06 [updatestate_assign_eliminate]: 2.82002e-06 [updatestate_loads_eliminate]: 2.48998e-06 [cse]: 2.213e-05 [call_graph_tuple_transform]: 2.026e-05 [tuple_list_get_item_eliminator]: 6.81001e-06 [none_parameter_eliminate]: 1.44998e-06 [renormalize]: 4.69998e-07 [switch_simplify]: 6.87002e-06 [remove_dup_value]: 1.642e-05 [partial_unused_args_eliminate]: 1.94999e-06 [environ_conv]: 6.16998e-06 [add_recomputation]: 4.55e-05 [cse_after_recomputation]: 2.637e-05, [1] [Cycle 1]: 2.074e-05, [1] [cse]: 1.455e-05 [auto_monad_reorder]: 1.782e-05 [get_jit_bprop_graph]: 1.71002e-06 [rewriter_after_jit_bprop_graph]: 4.61002e-06 [opt_after_jit_grad]: 0.00047764 [symbol_engine_optimizer]: 9.88e-05, [1] [Cycle 1]: 9.257e-05, [6] [build]: 3.28e-06 [elim_shapecalc]: 2.636e-05 [elim_not_effective]: 1.543e-05 [opt_reshape]: 7.53999e-06 [fold_const_symbol]: 1.054e-05 [renormalize]: 3.69997e-07 [validate]: 3.773e-05 [backend_pass]: 1.04e-06 [task_emit]: 0.00874915 [execute]: 7.12002e-06 Sums bootstrap : 0.000651s : 0.98% type_inference : 0.044376s : 66.79% event_method : 0.000239s : 0.36% auto_monad : 0.000162s : 0.24% graph_reusing : 0.000010s : 0.02% pre_auto_parallel : 0.000005s : 0.01% py_interpret_to_execute : 0.000049s : 0.07% rewriter_before_opt_a : 0.000157s : 0.24% expand_dump_flag : 0.000005s : 0.01% jit_opt_a.switch_simplify : 0.000259s : 0.39% jit_opt_a.loop_unroll : 0.000113s : 0.17% jit_opt_a.a_1 : 0.002535s : 3.81% jit_opt_a.with_stream_mark : 0.000049s : 0.07% jit_opt_a.recompute_prepare : 0.000081s : 0.12% jit_opt_a.updatestate_depend_eliminate : 0.000016s : 0.02% jit_opt_a.updatestate_assign_eliminate : 0.000013s : 0.02% jit_opt_a.updatestate_loads_eliminate : 0.000012s : 0.02% jit_opt_a.parameter_eliminate : 0.000005s : 0.01% jit_opt_a.specialize_transform : 0.000029s : 0.04% jit_opt_a.updatestate_useless_node_eliminater : 0.000027s : 0.04% jit_opt_a.accelerated_algorithm : 0.000027s : 0.04% jit_opt_a.meta_shard_fg_expand : 0.000008s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000027s : 0.04% jit_opt_a.merge_forward : 0.000015s : 0.02% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000055s : 0.08% jit_opt_a.j_node_and_user_rematch : 0.000043s : 0.06% jit_opt_a.meta_fg_expand : 0.001805s : 2.72% jit_opt_a.replace_old_param : 0.000087s : 0.13% jit_opt_a.inline_without_move : 0.000070s : 0.10% jit_opt_a.renormalize : 0.004347s : 6.54% jit_opt_a.add_forward_monad_depend : 0.000016s : 0.02% jit_opt_a.auto_monad_grad : 0.000008s : 0.01% jit_opt_a.auto_monad_eliminator : 0.000074s : 0.11% jit_opt_a.cse : 0.000331s : 0.50% jit_opt_a.replace_applicator : 0.000095s : 0.14% py_interpret_to_execute_after_opt_a : 0.000011s : 0.02% rewriter_after_opt_a : 0.000037s : 0.06% convert_after_rewriter : 0.000007s : 0.01% order_py_execute_after_rewriter : 0.000005s : 0.01% mutable_eliminate : 0.000538s : 0.81% jit_opt_b.frontend_op_eliminate : 0.000020s : 0.03% jit_opt_b.inline_after_opt_a : 0.000018s : 0.03% cconv : 0.000021s : 0.03% loop_unroll : 0.000433s : 0.65% jit_opt_after_cconv.c_1 : 0.000027s : 0.04% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.cse : 0.000022s : 0.03% jit_opt_after_cconv.call_graph_tuple_transform : 0.000020s : 0.03% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000007s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000007s : 0.01% remove_dup_value : 0.000016s : 0.02% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000006s : 0.01% add_recomputation : 0.000045s : 0.07% cse_after_recomputation.cse : 0.000015s : 0.02% auto_monad_reorder : 0.000018s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000478s : 0.72% symbol_engine_optimizer.build : 0.000003s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000026s : 0.04% symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000038s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.008749s : 13.17% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.000675 128 0.29% : 0.000002s : 2: substitution.elim_not_effective 0.21% : 0.000001s : 2: substitution.fold_const_symbol 0.75% : 0.000005s : 4: substitution.graph_param_transform 66.27% : 0.000448s : 21: substitution.inline 2.31% : 0.000016s : 2: substitution.inline_without_move 1.17% : 0.000008s : 12: substitution.j_node_and_user_rematch 1.45% : 0.000010s : 7: substitution.minmaximum_grad 1.41% : 0.000010s : 11: substitution.partial_eliminate 1.42% : 0.000010s : 12: substitution.remove_not_recompute_node 3.49% : 0.000024s : 9: substitution.replace_applicator 1.66% : 0.000011s : 14: substitution.replace_old_param 7.12% : 0.000048s : 1: substitution.set_cell_output_no_recompute 2.25% : 0.000015s : 5: substitution.switch_simplify 3.07% : 0.000021s : 7: substitution.tuple_list_convert_item_index_to_positive 2.09% : 0.000014s : 7: substitution.tuple_list_get_item_depend_reorder 5.04% : 0.000034s : 12: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.044244 2 93.08% : 0.041181s : 1: type_inference.infer 6.92% : 0.003063s : 1: type_inference.specialize ------[replace.] 0.000259 31 54.70% : 0.000142s : 21: replace.inline 26.59% : 0.000069s : 5: replace.switch_simplify 18.70% : 0.000048s : 5: replace.tuple_list_get_item_eliminator ------[match.] 0.000459 31 95.04% : 0.000436s : 21: match.inline 2.76% : 0.000013s : 5: match.switch_simplify 2.19% : 0.000010s : 5: match.tuple_list_get_item_eliminator ------[predicate.] 0.000438 3262 1.54% : 0.000007s : 56: predicate.accumulaten_eliminater 0.32% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 1.46% : 0.000006s : 56: predicate.addn_check_dump 1.62% : 0.000007s : 56: predicate.addn_zero_filter 2.18% : 0.000010s : 56: predicate.arithmetic_simplify 1.59% : 0.000007s : 56: predicate.cast_eliminate 0.17% : 0.000001s : 4: predicate.check_bprop_eliminate 1.45% : 0.000006s : 56: predicate.compare_switch_simplify 1.52% : 0.000007s : 56: predicate.depend_value_elim 1.45% : 0.000006s : 56: predicate.dict_get_item_const_eliminator 1.57% : 0.000007s : 56: predicate.dict_get_item_eliminator 1.50% : 0.000007s : 56: predicate.dict_set_item_eliminator 0.26% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.11% : 0.000000s : 4: predicate.elim_not_effective 0.19% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.45% : 0.000006s : 56: predicate.environ_add_const_eliminate 1.47% : 0.000006s : 56: predicate.environ_get_add_eliminate 1.47% : 0.000006s : 56: predicate.environ_get_depend_swap 1.55% : 0.000007s : 56: predicate.environ_get_eliminate 1.45% : 0.000006s : 56: predicate.environ_get_set_eliminate 0.08% : 0.000000s : 4: predicate.fold_const_symbol 0.77% : 0.000003s : 21: predicate.get_grad_eliminate 0.08% : 0.000000s : 4: predicate.graph_param_transform 4.29% : 0.000019s : 90: predicate.inline 1.68% : 0.000007s : 46: predicate.inline_without_move 0.36% : 0.000002s : 21: predicate.j_node_and_user_rematch 0.81% : 0.000004s : 21: predicate.less_batch_normalization 1.78% : 0.000008s : 61: predicate.list_to_tuple_eliminator_ 1.89% : 0.000008s : 65: predicate.load_eliminater 0.44% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.85% : 0.000017s : 120: predicate.loop_unroll_before_grad 1.65% : 0.000007s : 60: predicate.make_slice_get_slice_eliminator 1.46% : 0.000006s : 56: predicate.merge_addn 1.56% : 0.000007s : 56: predicate.minmaximum_grad 0.44% : 0.000002s : 4: predicate.mutable_eliminate 0.14% : 0.000001s : 4: predicate.opt_reshape 2.26% : 0.000010s : 65: predicate.partial_eliminate 1.57% : 0.000007s : 56: predicate.print_const_string_wrapper 2.10% : 0.000009s : 56: predicate.reduce_eliminate 1.70% : 0.000007s : 61: predicate.redundant_stop_gradient_eliminater 0.42% : 0.000002s : 21: predicate.remove_not_recompute_node 2.48% : 0.000011s : 113: predicate.replace_applicator 0.93% : 0.000004s : 46: predicate.replace_old_param 0.09% : 0.000000s : 4: predicate.reset_defer_inline 1.55% : 0.000007s : 56: predicate.reshape_eliminate 1.50% : 0.000007s : 56: predicate.row_tensor_add_zeros_like 0.25% : 0.000001s : 4: predicate.row_tensor_eliminate 1.63% : 0.000007s : 56: predicate.same_eliminate 0.45% : 0.000002s : 21: predicate.set_cell_output_no_recompute 0.32% : 0.000001s : 8: predicate.special_op_eliminate 0.78% : 0.000003s : 21: predicate.specialize_transform 1.69% : 0.000007s : 56: predicate.split_environ_get_set_with_tuple_value 1.62% : 0.000007s : 56: predicate.stack_unstack_eliminate 0.13% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.98% : 0.000013s : 82: predicate.switch_defer_inline 2.74% : 0.000012s : 82: predicate.switch_layer_defer_inline 7.41% : 0.000032s : 216: predicate.switch_simplify 1.52% : 0.000007s : 56: predicate.tile_eliminate 1.62% : 0.000007s : 56: predicate.transpose_eliminate 1.86% : 0.000008s : 56: predicate.tuple_list_convert_item_index_to_positive 1.75% : 0.000008s : 56: predicate.tuple_list_get_item_depend_reorder 2.73% : 0.000012s : 69: predicate.tuple_list_get_item_eliminator 1.86% : 0.000008s : 56: predicate.tuple_list_set_item_eliminator 1.63% : 0.000007s : 61: predicate.tuple_to_list_eliminator_ 1.81% : 0.000008s : 65: predicate.updatestate_pure_node_eliminater 2.69% : 0.000012s : 86: predicate.updatestate_useless_node_eliminater 2.00% : 0.000009s : 56: predicate.value_based_eliminate 0.13% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.19% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002871 41 62.79% : 0.001803s : 16: func_graph_cloner_run.FuncGraphClonerGraph 37.21% : 0.001068s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.078631 91 0.06% : 0.000048s : 1: add_recomputation 0.22% : 0.000169s : 1: auto_monad 0.03% : 0.000020s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.87% : 0.000683s : 1: bootstrap 0.03% : 0.000024s : 1: cconv 0.01% : 0.000009s : 1: convert_after_rewriter 0.04% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: environ_conv 0.31% : 0.000247s : 1: event_method 0.01% : 0.000011s : 1: execute 0.01% : 0.000007s : 1: expand_dump_flag 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000013s : 1: graph_reusing 17.96% : 0.014120s : 1: jit_opt_a 0.20% : 0.000157s : 1: jit_opt_after_cconv 0.07% : 0.000059s : 1: jit_opt_b 0.56% : 0.000440s : 1: loop_unroll 0.69% : 0.000546s : 1: mutable_eliminate 4.33% : 0.003402s : 39: opt.transform.jit_opt_a 0.07% : 0.000057s : 4: opt.transform.jit_opt_after_cconv 0.04% : 0.000032s : 4: opt.transform.jit_opt_b 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000015s : 1: opt.transform.mutable_eliminate 0.03% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000040s : 4: opt.transform.symbol_engine_opt 0.62% : 0.000486s : 1: opt_after_jit_grad 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.01% : 0.000007s : 1: pre_auto_parallel 0.07% : 0.000053s : 1: py_interpret_to_execute 0.02% : 0.000014s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000019s : 1: remove_dup_value 3.02% : 0.002377s : 2: renormalize.infer 2.49% : 0.001955s : 2: renormalize.specialize 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000040s : 1: rewriter_after_opt_a 0.20% : 0.000160s : 1: rewriter_before_opt_a 0.13% : 0.000102s : 1: symbol_engine_optimizer 11.14% : 0.008759s : 1: task_emit 56.46% : 0.044398s : 1: type_inference 0.08% : 0.000060s : 1: validate TotalTime = 0.231646, [24] [bootstrap]: 0.00057334 [type_inference]: 0.177554 [event_method]: 5.766e-05 [auto_monad]: 0.00017616 [graph_reusing]: 9.86e-06 [inline]: 2.79999e-06 [add_attr]: 0.00410953, [1] [add_attr_with_inline]: 0.00410024, [1] [Cycle 1]: 0.00010357, [2] [tag_attr]: 5.168e-05 [meta_addattr_fg_expand]: 1.388e-05 [parallel-infer-symbol]: 3.68e-06 [pre_auto_parallel]: 6.868e-05 [insert-virtual-dataset]: 2.53998e-06 [parallel-infer-symbol-second]: 6.80011e-07 [dataset_repeat_opt]: 2.17999e-06 [pipeline_split]: 1.59998e-06 [optimize]: 0.038284, [53] [py_interpret_to_execute]: 5.89e-06 [rewriter_before_opt_a]: 0.00046165 [opt_a]: 0.0354018, [3] [Cycle 1]: 0.0305278, [45] [expand_dump_flag]: 5.79e-06 [switch_simplify]: 0.00017588 [loop_unroll]: 7.074e-05 [a_1]: 0.00160055 [with_stream_mark]: 3.099e-05 [recompute_prepare]: 2.571e-05 [updatestate_depend_eliminate]: 1.025e-05 [updatestate_assign_eliminate]: 8.43001e-06 [updatestate_loads_eliminate]: 8.05999e-06 [parameter_eliminate]: 2.69001e-06 [a_2]: 0.0002666 [accelerated_algorithm]: 1.873e-05 [shard]: 2.51e-06 [meta_shard_fg_expand]: 6.21998e-06 [shard_inline]: 1.703e-05 [merge_send_recv]: 1.903e-05 [auto_parallel]: 1.353e-05 [parallel]: 3.204e-05 [flash_sp]: 1.176e-05 [merge_comm]: 9.84001e-06 [allreduce_fusion]: 9.36e-06 [matmul_add_comm_reduction]: 3.019e-05 [allreduce_slice_to_reducescatter]: 1.35999e-06 [virtual_shard_identity]: 1.944e-05 [virtual_dataset]: 1.641e-05 [get_grad_eliminate_]: 1.686e-05 [virtual_output]: 1.719e-05 [merge_forward]: 1.036e-05 [cell_reuse_recompute_pass]: 1.32999e-06 [offload_activation]: 1.787e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.926e-05 [merge_recompute_call_nodes]: 1.50001e-06 [before_grad]: 2.832e-05 [set_forward_comm_id_for_comm_node_pass]: 1.015e-05 [meta_fg_expand]: 0.00235088 [flash_sp_send_recv_attached]: 5.35001e-06 [receive_attached]: 2.31e-06 [after_resolve]: 9.824e-05 [a_after_grad]: 0.00011329 [renormalize]: 0.0240576 [add_forward_monad_depend]: 1.546e-05 [auto_monad_grad]: 8.01001e-06 [auto_monad_eliminator]: 6.468e-05 [cse]: 0.00047773 [a_3]: 0.00043326 [Cycle 2]: 0.00407136, [45] [expand_dump_flag]: 3.45e-06 [switch_simplify]: 5.773e-05 [loop_unroll]: 5.283e-05 [a_1]: 0.00148755 [with_stream_mark]: 2.445e-05 [recompute_prepare]: 1.226e-05 [updatestate_depend_eliminate]: 5.15999e-06 [updatestate_assign_eliminate]: 4.23999e-06 [updatestate_loads_eliminate]: 3.58999e-06 [parameter_eliminate]: 2.15002e-06 [a_2]: 0.00012783 [accelerated_algorithm]: 9.20999e-06 [shard]: 2.29001e-06 [meta_shard_fg_expand]: 3.11999e-06 [shard_inline]: 8.89e-06 [merge_send_recv]: 1.004e-05 [auto_parallel]: 1.022e-05 [parallel]: 9.82999e-06 [flash_sp]: 3.9e-06 [merge_comm]: 3.97e-06 [allreduce_fusion]: 4.07998e-06 [matmul_add_comm_reduction]: 1.031e-05 [allreduce_slice_to_reducescatter]: 1.38002e-06 [virtual_shard_identity]: 1.073e-05 [virtual_dataset]: 7.97e-06 [get_grad_eliminate_]: 8.04002e-06 [virtual_output]: 7.77e-06 [merge_forward]: 4.35e-06 [cell_reuse_recompute_pass]: 1.32e-06 [offload_activation]: 1.175e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.435e-05 [merge_recompute_call_nodes]: 1.59e-06 [before_grad]: 1.265e-05 [set_forward_comm_id_for_comm_node_pass]: 4.06001e-06 [meta_fg_expand]: 0.00014886 [flash_sp_send_recv_attached]: 2.01003e-06 [receive_attached]: 2.43e-06 [after_resolve]: 1.503e-05 [a_after_grad]: 1.267e-05 [renormalize]: 0.00157359 [add_forward_monad_depend]: 5.31998e-06 [auto_monad_grad]: 2.06998e-06 [auto_monad_eliminator]: 1.533e-05 [cse]: 3.646e-05 [a_3]: 6.495e-05 [Cycle 3]: 0.00078351, [45] [expand_dump_flag]: 1.76e-06 [switch_simplify]: 9.87999e-06 [loop_unroll]: 8.48001e-06 [a_1]: 0.00018752 [with_stream_mark]: 1.195e-05 [recompute_prepare]: 8.22e-06 [updatestate_depend_eliminate]: 4.23999e-06 [updatestate_assign_eliminate]: 3.53e-06 [updatestate_loads_eliminate]: 3.21001e-06 [parameter_eliminate]: 1.22999e-06 [a_2]: 0.00010216 [accelerated_algorithm]: 8.47998e-06 [shard]: 1.11002e-06 [meta_shard_fg_expand]: 1.76998e-06 [shard_inline]: 8.27e-06 [merge_send_recv]: 6.28e-06 [auto_parallel]: 7.35e-06 [parallel]: 5.29998e-06 [flash_sp]: 9.20001e-07 [merge_comm]: 3.48e-06 [allreduce_fusion]: 3.78999e-06 [matmul_add_comm_reduction]: 6.53e-06 [allreduce_slice_to_reducescatter]: 1.20999e-06 [virtual_shard_identity]: 1.001e-05 [virtual_dataset]: 8.15e-06 [get_grad_eliminate_]: 7.62002e-06 [virtual_output]: 8.59e-06 [merge_forward]: 4.00998e-06 [cell_reuse_recompute_pass]: 1.74e-06 [offload_activation]: 8.59002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.559e-05 [merge_recompute_call_nodes]: 1.02998e-06 [before_grad]: 1.146e-05 [set_forward_comm_id_for_comm_node_pass]: 3.83001e-06 [meta_fg_expand]: 2.61999e-06 [flash_sp_send_recv_attached]: 8.00006e-07 [receive_attached]: 1.45999e-06 [after_resolve]: 1.077e-05 [a_after_grad]: 1.312e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.38002e-06 [auto_monad_grad]: 8.50006e-07 [auto_monad_eliminator]: 8.52998e-06 [cse]: 2.303e-05 [a_3]: 5.109e-05 [py_interpret_to_execute_after_opt_a]: 6.25002e-06 [slice_cell_reuse_recomputed_activation]: 2.16e-06 [rewriter_after_opt_a]: 2.622e-05 [convert_after_rewriter]: 1.27e-06 [order_py_execute_after_rewriter]: 1.16997e-06 [mutable_eliminate]: 0.00076018 [opt_b]: 0.00029632, [1] [Cycle 1]: 0.00028865, [7] [b_1]: 0.00019351 [b_2]: 9.82999e-06 [updatestate_depend_eliminate]: 8.32998e-06 [updatestate_assign_eliminate]: 3.56999e-06 [updatestate_loads_eliminate]: 2.98e-06 [renormalize]: 4.80009e-07 [cse]: 3.13e-05 [optimize_parallel_all_gather_comm]: 1.895e-05 [overlap_param_gather]: 1.97001e-06 [cconv]: 2.823e-05 [loop_unroll]: 0.0004677 [opt_after_cconv]: 0.00012588, [1] [Cycle 1]: 0.00011988, [7] [c_1]: 4.212e-05 [parameter_eliminate]: 3.24001e-06 [updatestate_depend_eliminate]: 6.49999e-06 [updatestate_assign_eliminate]: 3.16001e-06 [updatestate_loads_eliminate]: 3.13e-06 [cse]: 2.958e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 4.377e-05 [tuple_transform]: 8.727e-05, [1] [Cycle 1]: 8.277e-05, [4] [d_1]: 5.262e-05 [none_parameter_eliminate]: 2.16003e-06 [renormalize]: 2.80008e-07 [switch_simplify]: 8.69e-06 [partial_unused_args_eliminate]: 2.39999e-06 [add_recomputation]: 4.491e-05 [cse_after_recomputation]: 2.975e-05, [1] [Cycle 1]: 2.539e-05, [1] [cse]: 2.013e-05 [environ_conv]: 9.94999e-06 [swap_dp_allreduce_reducescatter]: 5.73002e-06 [bias_add_comm_swap]: 2.57001e-06 [label_micro_interleaved_index]: 4.25e-06 [label_fine_grained_interleaved_index]: 2.89999e-06 [merge_cast_opt]: 1.37999e-06 [slice_recompute_activation]: 2.12999e-06 [micro_interleaved_order_control]: 2.26e-06 [assign_add_opt]: 1.22999e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.13001e-06 [full_micro_interleaved_order_control]: 2.31e-06 [reorder_send_recv_between_fp_bp]: 2.89001e-06 [comm_op_add_attrs]: 1.20999e-06 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.48002e-06 [interleave_parallel_branches]: 1.05999e-06 [overlap_opt_shard_in_pipeline]: 6.51e-06 [overlap_opt_shard_grad_in_pipeline]: 2.72001e-06 [control_data_broadcast_order]: 1.768e-05 [grouped_pairwise_exchange_alltoall]: 2.34999e-06 [offloading_packed_experts]: 4.52e-06 [overlap_recompute_and_grad_model_parallel]: 5.31998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.24e-06 [overlap_recompute_allgather_and_fa_grad]: 1.40999e-06 [overlap_recompute_comm]: 2.59999e-06 [overlap_grad_ring_attention]: 4.83001e-06 [overlap_grad_flash_sp]: 2.534e-05 [begin_end_overlap_inline]: 5.10016e-07 [split_matmul_comm_elemetwise]: 2.24001e-06 [split_layernorm_comm]: 1.67999e-06 [handle_group_info]: 9.50007e-07 [symbol_engine_optimizer]: 8.853e-05, [1] [Cycle 1]: 8.384e-05, [6] [build]: 3.09001e-06 [elim_shapecalc]: 1.315e-05 [elim_not_effective]: 1.572e-05 [opt_reshape]: 9.42001e-06 [fold_const_symbol]: 1.218e-05 [renormalize]: 1.79978e-07 [detach_backward]: 2.46e-06 [pipeline_parallel_scheduler]: 1.67999e-06 [auto_monad_reorder]: 2.053e-05 [get_jit_bprop_graph]: 2.17001e-06 [rewriter_after_jit_bprop_graph]: 4.50001e-06 [opt_after_jit_grad]: 0.00051124 [validate]: 6.194e-05 [backend_pass]: 9.79984e-07 [task_emit]: 0.00992495 [execute]: 1.028e-05 Sums bootstrap : 0.000573s : 0.25% type_inference : 0.177554s : 78.52% event_method : 0.000058s : 0.03% auto_monad : 0.000176s : 0.08% graph_reusing : 0.000010s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000052s : 0.02% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000069s : 0.03% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.00% optimize.rewriter_before_opt_a : 0.000462s : 0.20% optimize.opt_a.expand_dump_flag : 0.000011s : 0.00% optimize.opt_a.switch_simplify : 0.000243s : 0.11% optimize.opt_a.loop_unroll : 0.000132s : 0.06% optimize.opt_a.a_1 : 0.003276s : 1.45% optimize.opt_a.with_stream_mark : 0.000067s : 0.03% optimize.opt_a.recompute_prepare : 0.000046s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000020s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000016s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000015s : 0.01% optimize.opt_a.parameter_eliminate : 0.000006s : 0.00% optimize.opt_a.a_2 : 0.000497s : 0.22% optimize.opt_a.accelerated_algorithm : 0.000036s : 0.02% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000011s : 0.00% optimize.opt_a.shard_inline : 0.000034s : 0.02% optimize.opt_a.merge_send_recv : 0.000035s : 0.02% optimize.opt_a.auto_parallel : 0.000031s : 0.01% optimize.opt_a.parallel : 0.000047s : 0.02% optimize.opt_a.flash_sp : 0.000017s : 0.01% optimize.opt_a.merge_comm : 0.000017s : 0.01% optimize.opt_a.allreduce_fusion : 0.000017s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000047s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000004s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000040s : 0.02% optimize.opt_a.virtual_dataset : 0.000033s : 0.01% optimize.opt_a.get_grad_eliminate_ : 0.000033s : 0.01% optimize.opt_a.virtual_output : 0.000034s : 0.01% optimize.opt_a.merge_forward : 0.000019s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000038s : 0.02% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000059s : 0.03% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000052s : 0.02% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000018s : 0.01% optimize.opt_a.meta_fg_expand : 0.002502s : 1.11% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.00% optimize.opt_a.receive_attached : 0.000006s : 0.00% optimize.opt_a.after_resolve : 0.000124s : 0.05% optimize.opt_a.a_after_grad : 0.000139s : 0.06% optimize.opt_a.renormalize : 0.025631s : 11.34% optimize.opt_a.add_forward_monad_depend : 0.000022s : 0.01% optimize.opt_a.auto_monad_grad : 0.000011s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000089s : 0.04% optimize.opt_a.cse : 0.000537s : 0.24% optimize.opt_a.a_3 : 0.000549s : 0.24% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000026s : 0.01% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000760s : 0.34% optimize.opt_b.b_1 : 0.000194s : 0.09% optimize.opt_b.b_2 : 0.000010s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000031s : 0.01% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.01% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000028s : 0.01% optimize.loop_unroll : 0.000468s : 0.21% optimize.opt_after_cconv.c_1 : 0.000042s : 0.02% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000030s : 0.01% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000044s : 0.02% optimize.tuple_transform.d_1 : 0.000053s : 0.02% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000009s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000045s : 0.02% optimize.cse_after_recomputation.cse : 0.000020s : 0.01% optimize.environ_conv : 0.000010s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000007s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000003s : 0.00% optimize.control_data_broadcast_order : 0.000018s : 0.01% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.00% optimize.overlap_grad_flash_sp : 0.000025s : 0.01% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000013s : 0.01% optimize.symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.01% optimize.symbol_engine_optimizer.opt_reshape : 0.000009s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.01% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.01% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.00% opt_after_jit_grad : 0.000511s : 0.23% validate : 0.000062s : 0.03% backend_pass : 0.000001s : 0.00% task_emit : 0.009925s : 4.39% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000864 166 0.23% : 0.000002s : 2: substitution.elim_not_effective 1.09% : 0.000009s : 11: substitution.float_depend_g_call 0.40% : 0.000003s : 2: substitution.float_tuple_getitem_switch 0.17% : 0.000002s : 2: substitution.fold_const_symbol 0.86% : 0.000007s : 5: substitution.graph_param_transform 0.35% : 0.000003s : 2: substitution.incorporate_call 0.26% : 0.000002s : 2: substitution.incorporate_call_switch 66.56% : 0.000575s : 20: substitution.inline 2.45% : 0.000021s : 2: substitution.inline_without_move 1.13% : 0.000010s : 12: substitution.j_node_and_user_rematch 1.44% : 0.000012s : 7: substitution.minmaximum_grad 2.43% : 0.000021s : 11: substitution.partial_eliminate 1.25% : 0.000011s : 12: substitution.remove_not_recompute_node 3.25% : 0.000028s : 9: substitution.replace_applicator 1.56% : 0.000013s : 19: substitution.replace_old_param 0.34% : 0.000003s : 1: substitution.set_cell_output_no_recompute 2.98% : 0.000026s : 3: substitution.switch_simplify 2.68% : 0.000023s : 7: substitution.tuple_list_convert_item_index_to_positive 1.14% : 0.000010s : 7: substitution.tuple_list_get_item_const_eliminator 1.68% : 0.000015s : 7: substitution.tuple_list_get_item_depend_reorder 5.95% : 0.000051s : 16: substitution.tuple_list_get_item_eliminator 1.79% : 0.000015s : 7: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.177419 2 96.53% : 0.171259s : 1: type_inference.infer 3.47% : 0.006159s : 1: type_inference.specialize ------[replace.] 0.000313 30 60.94% : 0.000191s : 20: replace.inline 13.90% : 0.000044s : 3: replace.switch_simplify 25.15% : 0.000079s : 7: replace.tuple_list_get_item_eliminator ------[match.] 0.000608 30 92.63% : 0.000563s : 20: match.inline 3.76% : 0.000023s : 3: match.switch_simplify 3.61% : 0.000022s : 7: match.tuple_list_get_item_eliminator ------[predicate.] 0.000698 5043 1.06% : 0.000007s : 61: predicate.accumulaten_eliminater 0.30% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.46% : 0.000003s : 26: predicate.addn_check_dump 1.09% : 0.000008s : 61: predicate.addn_zero_filter 0.99% : 0.000007s : 61: predicate.adjust_all_reduce_mul_add 2.01% : 0.000014s : 87: predicate.arithmetic_simplify 1.07% : 0.000007s : 61: predicate.cast_eliminate 1.20% : 0.000008s : 65: predicate.check_bprop_eliminate 0.48% : 0.000003s : 26: predicate.compare_switch_simplify 0.09% : 0.000001s : 6: predicate.const_output_eliminate 0.46% : 0.000003s : 26: predicate.depend_value_elim 1.11% : 0.000008s : 61: predicate.dict_get_item_const_eliminator 1.24% : 0.000009s : 61: predicate.dict_get_item_eliminator 1.08% : 0.000008s : 61: predicate.dict_set_item_eliminator 0.34% : 0.000002s : 11: predicate.dumpgradient_eliminate 0.08% : 0.000001s : 5: predicate.elim_not_effective 0.16% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.18% : 0.000008s : 67: predicate.environ_add_const_eliminate 1.08% : 0.000008s : 67: predicate.environ_get_add_eliminate 1.11% : 0.000008s : 67: predicate.environ_get_depend_swap 1.55% : 0.000011s : 93: predicate.environ_get_eliminate 1.12% : 0.000008s : 67: predicate.environ_get_set_eliminate 1.60% : 0.000011s : 88: predicate.exchange_switch_depend_value 2.39% : 0.000017s : 88: predicate.float_depend_g_call 0.44% : 0.000003s : 26: predicate.float_environ_get_switch 0.59% : 0.000004s : 32: predicate.float_tuple_getitem_switch 0.06% : 0.000000s : 5: predicate.fold_const_symbol 0.56% : 0.000004s : 26: predicate.get_grad_eliminate 0.09% : 0.000001s : 5: predicate.graph_param_transform 0.50% : 0.000004s : 26: predicate.incorporate_call 0.42% : 0.000003s : 26: predicate.incorporate_call_switch 5.32% : 0.000037s : 213: predicate.inline 1.61% : 0.000011s : 65: predicate.inline_without_move 0.32% : 0.000002s : 26: predicate.j_node_and_user_rematch 0.66% : 0.000005s : 26: predicate.less_batch_normalization 1.45% : 0.000010s : 79: predicate.list_to_tuple_eliminator_ 2.46% : 0.000017s : 141: predicate.load_eliminater 0.44% : 0.000003s : 6: predicate.loop_unroll_after_grad 2.72% : 0.000019s : 135: predicate.loop_unroll_before_grad 1.26% : 0.000009s : 73: predicate.make_slice_get_slice_eliminator 0.48% : 0.000003s : 26: predicate.merge_addn 1.15% : 0.000008s : 65: predicate.micro_step_allgather_replace 1.15% : 0.000008s : 65: predicate.mini_step_allgather_replace 1.03% : 0.000007s : 61: predicate.minmaximum_grad 0.35% : 0.000002s : 6: predicate.mutable_eliminate 0.15% : 0.000001s : 5: predicate.opt_reshape 0.13% : 0.000001s : 6: predicate.parallel_virtual_node 2.22% : 0.000015s : 88: predicate.partial_defer_inline 1.48% : 0.000010s : 74: predicate.partial_eliminate 1.07% : 0.000007s : 61: predicate.print_const_string_wrapper 0.49% : 0.000003s : 26: predicate.reduce_all_const_elim 1.43% : 0.000010s : 61: predicate.reduce_eliminate 2.40% : 0.000017s : 141: predicate.redundant_stop_gradient_eliminater 0.35% : 0.000002s : 26: predicate.remove_not_recompute_node 1.99% : 0.000014s : 133: predicate.replace_applicator 0.77% : 0.000005s : 65: predicate.replace_old_param 0.10% : 0.000001s : 6: predicate.reset_defer_inline 1.16% : 0.000008s : 61: predicate.reshape_eliminate 1.18% : 0.000008s : 65: predicate.row_tensor_add_zeros_like 0.12% : 0.000001s : 6: predicate.row_tensor_eliminate 1.44% : 0.000010s : 65: predicate.same_eliminate 0.38% : 0.000003s : 26: predicate.set_cell_output_no_recompute 0.61% : 0.000004s : 26: predicate.shard_identity_eliminate 0.24% : 0.000002s : 11: predicate.special_op_eliminate 0.49% : 0.000003s : 26: predicate.specialize_transform 1.31% : 0.000009s : 65: predicate.split_environ_get_set_with_tuple_value 1.49% : 0.000010s : 65: predicate.stack_unstack_eliminate 0.14% : 0.000001s : 6: predicate.switch_call_monad_eliminater 1.79% : 0.000013s : 88: predicate.switch_defer_inline 2.93% : 0.000020s : 153: predicate.switch_layer_defer_inline 7.76% : 0.000054s : 260: predicate.switch_simplify 1.04% : 0.000007s : 61: predicate.tile_eliminate 1.08% : 0.000008s : 61: predicate.transpose_eliminate 1.28% : 0.000009s : 72: predicate.tuple_list_convert_item_index_to_positive 1.46% : 0.000010s : 72: predicate.tuple_list_get_item_const_eliminator 1.31% : 0.000009s : 72: predicate.tuple_list_get_item_depend_reorder 2.45% : 0.000017s : 105: predicate.tuple_list_get_item_eliminator 1.38% : 0.000010s : 72: predicate.tuple_list_get_set_item_eliminator 1.98% : 0.000014s : 98: predicate.tuple_list_set_item_eliminator 1.43% : 0.000010s : 79: predicate.tuple_to_list_eliminator_ 2.31% : 0.000016s : 141: predicate.updatestate_pure_node_eliminater 2.86% : 0.000020s : 167: predicate.updatestate_useless_node_eliminater 0.13% : 0.000001s : 6: predicate.value_based_eliminate 0.50% : 0.000003s : 26: predicate.virtual_dataset_eliminate 0.65% : 0.000005s : 26: predicate.virtual_output_eliminate 0.08% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.14% : 0.000001s : 6: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.006523 68 76.89% : 0.005015s : 44: func_graph_cloner_run.FuncGraphClonerGraph 23.11% : 0.001508s : 24: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.305076 237 0.00% : 0.000003s : 1: ForceFp32Comm 1.35% : 0.004114s : 1: add_attr 1.35% : 0.004104s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.02% : 0.000049s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.06% : 0.000183s : 1: auto_monad 0.01% : 0.000025s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.20% : 0.000608s : 1: bootstrap 0.01% : 0.000032s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.01% : 0.000021s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.01% : 0.000033s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000006s : 1: detach_backward 0.00% : 0.000013s : 1: environ_conv 0.02% : 0.000066s : 1: event_method 0.01% : 0.000018s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000007s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.16% : 0.000477s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.25% : 0.000769s : 1: mutable_eliminate 0.00% : 0.000008s : 1: offloading_packed_experts 0.01% : 0.000020s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000020s : 1: opt.transform.mutable_eliminate 1.70% : 0.005177s : 117: opt.transform.opt_a 0.01% : 0.000041s : 1: opt.transform.opt_after_cconv 0.01% : 0.000033s : 1: opt.transform.opt_after_jit_grad 0.05% : 0.000154s : 28: opt.transform.opt_b 0.02% : 0.000059s : 2: opt.transform.opt_trans_graph 0.02% : 0.000046s : 4: opt.transform.symbol_engine_opt 11.61% : 0.035405s : 1: opt_a 0.04% : 0.000129s : 1: opt_after_cconv 0.17% : 0.000521s : 1: opt_after_jit_grad 0.10% : 0.000300s : 1: opt_b 12.55% : 0.038290s : 1: optimize 0.01% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.01% : 0.000029s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000008s : 1: overlap_grad_ring_attention 0.00% : 0.000007s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000010s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.02% : 0.000073s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.02% : 0.000048s : 1: remove_dup_value 7.27% : 0.022173s : 2: renormalize.infer 1.13% : 0.003436s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.01% : 0.000029s : 1: rewriter_after_opt_a 0.15% : 0.000469s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.03% : 0.000092s : 1: symbol_engine_optimizer 3.26% : 0.009945s : 1: task_emit 0.03% : 0.000090s : 1: tuple_transform 58.21% : 0.177583s : 1: type_inference 0.03% : 0.000102s : 1: validate TotalTime = 0.0575914, [24] [bootstrap]: 0.00068018 [type_inference]: 0.0210503 [event_method]: 1.974e-05 [auto_monad]: 0.00012682 [graph_reusing]: 7.36999e-06 [inline]: 2.59001e-06 [add_attr]: 0.0042397, [1] [add_attr_with_inline]: 0.00422751, [1] [Cycle 1]: 7.841e-05, [2] [tag_attr]: 2.947e-05 [meta_addattr_fg_expand]: 7.05002e-06 [parallel-infer-symbol]: 3.6e-06 [pre_auto_parallel]: 3.994e-05 [insert-virtual-dataset]: 3.19001e-06 [parallel-infer-symbol-second]: 8.29983e-07 [dataset_repeat_opt]: 1.104e-05 [pipeline_split]: 1.87001e-06 [optimize]: 0.0166801, [53] [py_interpret_to_execute]: 5.64998e-06 [rewriter_before_opt_a]: 8.6e-05 [opt_a]: 0.0139096, [3] [Cycle 1]: 0.00919484, [45] [expand_dump_flag]: 8.43999e-06 [switch_simplify]: 5.31e-05 [loop_unroll]: 4.067e-05 [a_1]: 0.00133334 [with_stream_mark]: 2.951e-05 [recompute_prepare]: 2.649e-05 [updatestate_depend_eliminate]: 9.57999e-06 [updatestate_assign_eliminate]: 8.85999e-06 [updatestate_loads_eliminate]: 8.05e-06 [parameter_eliminate]: 2.84001e-06 [a_2]: 0.0002973 [accelerated_algorithm]: 5.951e-05 [shard]: 1.87999e-06 [meta_shard_fg_expand]: 4.27e-06 [shard_inline]: 1.662e-05 [merge_send_recv]: 1.807e-05 [auto_parallel]: 1.243e-05 [parallel]: 2.867e-05 [flash_sp]: 1.011e-05 [merge_comm]: 9.51e-06 [allreduce_fusion]: 8.94998e-06 [matmul_add_comm_reduction]: 3.575e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 2.331e-05 [virtual_dataset]: 1.768e-05 [get_grad_eliminate_]: 1.763e-05 [virtual_output]: 1.832e-05 [merge_forward]: 1.374e-05 [cell_reuse_recompute_pass]: 1.20999e-06 [offload_activation]: 2.086e-05 [cell_reuse_handle_not_recompute_node_pass]: 3.609e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 3.269e-05 [set_forward_comm_id_for_comm_node_pass]: 1.108e-05 [meta_fg_expand]: 0.00234819 [flash_sp_send_recv_attached]: 7.32002e-06 [receive_attached]: 2.34999e-06 [after_resolve]: 6.544e-05 [a_after_grad]: 0.0001037 [renormalize]: 0.00345581 [add_forward_monad_depend]: 1.056e-05 [auto_monad_grad]: 6.54001e-06 [auto_monad_eliminator]: 6.11e-05 [cse]: 0.00022111 [a_3]: 0.00036581 [Cycle 2]: 0.00362928, [45] [expand_dump_flag]: 3.37002e-06 [switch_simplify]: 5.016e-05 [loop_unroll]: 4.706e-05 [a_1]: 0.00159408 [with_stream_mark]: 1.607e-05 [recompute_prepare]: 1.395e-05 [updatestate_depend_eliminate]: 6.41e-06 [updatestate_assign_eliminate]: 4.80999e-06 [updatestate_loads_eliminate]: 4.85999e-06 [parameter_eliminate]: 1.52001e-06 [a_2]: 0.0001532 [accelerated_algorithm]: 1.68e-05 [shard]: 1.34e-06 [meta_shard_fg_expand]: 3.01001e-06 [shard_inline]: 1.189e-05 [merge_send_recv]: 9.22001e-06 [auto_parallel]: 1.003e-05 [parallel]: 6.66e-06 [flash_sp]: 3.64002e-06 [merge_comm]: 5.98998e-06 [allreduce_fusion]: 5.59998e-06 [matmul_add_comm_reduction]: 1.011e-05 [allreduce_slice_to_reducescatter]: 3.9002e-07 [virtual_shard_identity]: 1.264e-05 [virtual_dataset]: 1.147e-05 [get_grad_eliminate_]: 1.128e-05 [virtual_output]: 1.172e-05 [merge_forward]: 5.20999e-06 [cell_reuse_recompute_pass]: 9.10019e-07 [offload_activation]: 1.213e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.989e-05 [merge_recompute_call_nodes]: 1.00999e-06 [before_grad]: 1.845e-05 [set_forward_comm_id_for_comm_node_pass]: 6.36e-06 [meta_fg_expand]: 0.00010481 [flash_sp_send_recv_attached]: 9.49978e-07 [receive_attached]: 1.62999e-06 [after_resolve]: 1.792e-05 [a_after_grad]: 1.832e-05 [renormalize]: 0.00094871 [add_forward_monad_depend]: 4.94998e-06 [auto_monad_grad]: 1.96998e-06 [auto_monad_eliminator]: 1.743e-05 [cse]: 9.316e-05 [a_3]: 8.537e-05 [Cycle 3]: 0.00106887, [45] [expand_dump_flag]: 1.75001e-06 [switch_simplify]: 1.276e-05 [loop_unroll]: 1.135e-05 [a_1]: 0.00031152 [with_stream_mark]: 1.257e-05 [recompute_prepare]: 1.155e-05 [updatestate_depend_eliminate]: 5.48002e-06 [updatestate_assign_eliminate]: 4.94e-06 [updatestate_loads_eliminate]: 4.59002e-06 [parameter_eliminate]: 9.89996e-07 [a_2]: 0.0001478 [accelerated_algorithm]: 1.524e-05 [shard]: 1.02998e-06 [meta_shard_fg_expand]: 2.01998e-06 [shard_inline]: 1.119e-05 [merge_send_recv]: 7.89002e-06 [auto_parallel]: 8e-06 [parallel]: 4.60999e-06 [flash_sp]: 2.11e-06 [merge_comm]: 5.59998e-06 [allreduce_fusion]: 5.32999e-06 [matmul_add_comm_reduction]: 8.37998e-06 [allreduce_slice_to_reducescatter]: 4.30009e-07 [virtual_shard_identity]: 1.274e-05 [virtual_dataset]: 1.118e-05 [get_grad_eliminate_]: 1.085e-05 [virtual_output]: 1.114e-05 [merge_forward]: 5.30999e-06 [cell_reuse_recompute_pass]: 1.66e-06 [offload_activation]: 1.016e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.959e-05 [merge_recompute_call_nodes]: 6.89994e-07 [before_grad]: 1.883e-05 [set_forward_comm_id_for_comm_node_pass]: 5.74e-06 [meta_fg_expand]: 4.39002e-06 [flash_sp_send_recv_attached]: 9.70002e-07 [receive_attached]: 1.17e-06 [after_resolve]: 1.61e-05 [a_after_grad]: 1.851e-05 [renormalize]: 1.40019e-07 [add_forward_monad_depend]: 1.40999e-06 [auto_monad_grad]: 1.02e-06 [auto_monad_eliminator]: 1.212e-05 [cse]: 3.083e-05 [a_3]: 7.236e-05 [py_interpret_to_execute_after_opt_a]: 6.04999e-06 [slice_cell_reuse_recomputed_activation]: 2.00002e-06 [rewriter_after_opt_a]: 3.02e-05 [convert_after_rewriter]: 1.24e-06 [order_py_execute_after_rewriter]: 1.15999e-06 [mutable_eliminate]: 0.00065974 [opt_b]: 0.0004341, [1] [Cycle 1]: 0.00042711, [7] [b_1]: 0.00029931 [b_2]: 1.557e-05 [updatestate_depend_eliminate]: 8.40001e-06 [updatestate_assign_eliminate]: 5.08002e-06 [updatestate_loads_eliminate]: 4.55001e-06 [renormalize]: 2.80008e-07 [cse]: 5.162e-05 [optimize_parallel_all_gather_comm]: 2.211e-05 [overlap_param_gather]: 2.67001e-06 [cconv]: 2.672e-05 [loop_unroll]: 0.00047371 [opt_after_cconv]: 0.00017736, [1] [Cycle 1]: 0.00017089, [7] [c_1]: 7.127e-05 [parameter_eliminate]: 2.53003e-06 [updatestate_depend_eliminate]: 7.99002e-06 [updatestate_assign_eliminate]: 4.79998e-06 [updatestate_loads_eliminate]: 4.61997e-06 [cse]: 4.385e-05 [renormalize]: 5.60016e-07 [remove_dup_value]: 7.765e-05 [tuple_transform]: 0.00013124, [1] [Cycle 1]: 0.00012644, [4] [d_1]: 9.358e-05 [none_parameter_eliminate]: 2.14e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 1.153e-05 [partial_unused_args_eliminate]: 2.21e-06 [add_recomputation]: 6.862e-05 [cse_after_recomputation]: 4.774e-05, [1] [Cycle 1]: 4.25e-05, [1] [cse]: 3.661e-05 [environ_conv]: 1.062e-05 [swap_dp_allreduce_reducescatter]: 8.87e-06 [bias_add_comm_swap]: 2.88e-06 [label_micro_interleaved_index]: 4.67e-06 [label_fine_grained_interleaved_index]: 2.55002e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 2.01e-06 [micro_interleaved_order_control]: 2.90002e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 8.39995e-07 [remove_cast_before_assign_add]: 1.09003e-06 [full_micro_interleaved_order_control]: 2.14e-06 [reorder_send_recv_between_fp_bp]: 2.47001e-06 [comm_op_add_attrs]: 1.11002e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.43002e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 4.53999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.94999e-06 [control_data_broadcast_order]: 2.077e-05 [grouped_pairwise_exchange_alltoall]: 1.58002e-06 [offloading_packed_experts]: 5.97999e-06 [overlap_recompute_and_grad_model_parallel]: 6.54999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.42e-06 [overlap_recompute_comm]: 2.24999e-06 [overlap_grad_ring_attention]: 7.56999e-06 [overlap_grad_flash_sp]: 2.987e-05 [begin_end_overlap_inline]: 8.39995e-07 [split_matmul_comm_elemetwise]: 2.42001e-06 [split_layernorm_comm]: 1.67001e-06 [handle_group_info]: 1.05001e-06 [symbol_engine_optimizer]: 0.00013456, [1] [Cycle 1]: 0.00012785, [6] [build]: 2.406e-05 [elim_shapecalc]: 1.858e-05 [elim_not_effective]: 2.308e-05 [opt_reshape]: 1.406e-05 [fold_const_symbol]: 1.838e-05 [renormalize]: 1.79978e-07 [detach_backward]: 1.85001e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 2.739e-05 [get_jit_bprop_graph]: 1.64998e-06 [rewriter_after_jit_bprop_graph]: 4.13999e-06 [opt_after_jit_grad]: 0.00051524 [validate]: 6.186e-05 [backend_pass]: 9.29984e-07 [task_emit]: 0.0138074 [execute]: 9.85002e-06 Sums bootstrap : 0.000680s : 1.31% type_inference : 0.021050s : 40.56% event_method : 0.000020s : 0.04% auto_monad : 0.000127s : 0.24% graph_reusing : 0.000007s : 0.01% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000029s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000040s : 0.08% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000011s : 0.02% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000006s : 0.01% optimize.rewriter_before_opt_a : 0.000086s : 0.17% optimize.opt_a.expand_dump_flag : 0.000014s : 0.03% optimize.opt_a.switch_simplify : 0.000116s : 0.22% optimize.opt_a.loop_unroll : 0.000099s : 0.19% optimize.opt_a.a_1 : 0.003239s : 6.24% optimize.opt_a.with_stream_mark : 0.000058s : 0.11% optimize.opt_a.recompute_prepare : 0.000052s : 0.10% optimize.opt_a.updatestate_depend_eliminate : 0.000021s : 0.04% optimize.opt_a.updatestate_assign_eliminate : 0.000019s : 0.04% optimize.opt_a.updatestate_loads_eliminate : 0.000018s : 0.03% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000598s : 1.15% optimize.opt_a.accelerated_algorithm : 0.000092s : 0.18% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000009s : 0.02% optimize.opt_a.shard_inline : 0.000040s : 0.08% optimize.opt_a.merge_send_recv : 0.000035s : 0.07% optimize.opt_a.auto_parallel : 0.000030s : 0.06% optimize.opt_a.parallel : 0.000040s : 0.08% optimize.opt_a.flash_sp : 0.000016s : 0.03% optimize.opt_a.merge_comm : 0.000021s : 0.04% optimize.opt_a.allreduce_fusion : 0.000020s : 0.04% optimize.opt_a.matmul_add_comm_reduction : 0.000054s : 0.10% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000049s : 0.09% optimize.opt_a.virtual_dataset : 0.000040s : 0.08% optimize.opt_a.get_grad_eliminate_ : 0.000040s : 0.08% optimize.opt_a.virtual_output : 0.000041s : 0.08% optimize.opt_a.merge_forward : 0.000024s : 0.05% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000043s : 0.08% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000076s : 0.15% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000070s : 0.13% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000023s : 0.04% optimize.opt_a.meta_fg_expand : 0.002457s : 4.73% optimize.opt_a.flash_sp_send_recv_attached : 0.000009s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000099s : 0.19% optimize.opt_a.a_after_grad : 0.000141s : 0.27% optimize.opt_a.renormalize : 0.004405s : 8.49% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.03% optimize.opt_a.auto_monad_grad : 0.000010s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000091s : 0.17% optimize.opt_a.cse : 0.000345s : 0.66% optimize.opt_a.a_3 : 0.000524s : 1.01% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000030s : 0.06% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000660s : 1.27% optimize.opt_b.b_1 : 0.000299s : 0.58% optimize.opt_b.b_2 : 0.000016s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000052s : 0.10% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.04% optimize.overlap_param_gather : 0.000003s : 0.01% optimize.cconv : 0.000027s : 0.05% optimize.loop_unroll : 0.000474s : 0.91% optimize.opt_after_cconv.c_1 : 0.000071s : 0.14% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.cse : 0.000044s : 0.08% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000078s : 0.15% optimize.tuple_transform.d_1 : 0.000094s : 0.18% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000012s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000069s : 0.13% optimize.cse_after_recomputation.cse : 0.000037s : 0.07% optimize.environ_conv : 0.000011s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000002s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000021s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000008s : 0.01% optimize.overlap_grad_flash_sp : 0.000030s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000024s : 0.05% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000019s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000023s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000014s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000018s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000027s : 0.05% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000515s : 0.99% validate : 0.000062s : 0.12% backend_pass : 0.000001s : 0.00% task_emit : 0.013807s : 26.60% execute : 0.000010s : 0.02% Time group info: ------[substitution.] 0.000758 202 0.40% : 0.000003s : 6: substitution.elim_not_effective 0.56% : 0.000004s : 5: substitution.float_depend_g_call 2.20% : 0.000017s : 3: substitution.float_tuple_getitem_switch 0.32% : 0.000002s : 6: substitution.fold_const_symbol 1.26% : 0.000010s : 9: substitution.graph_param_transform 0.33% : 0.000002s : 2: substitution.incorporate_call 0.35% : 0.000003s : 2: substitution.incorporate_call_switch 52.28% : 0.000396s : 12: substitution.inline 2.42% : 0.000018s : 2: substitution.inline_without_move 1.59% : 0.000012s : 22: substitution.j_node_and_user_rematch 5.24% : 0.000040s : 3: substitution.less_batch_normalization 1.99% : 0.000015s : 11: substitution.minmaximum_grad 2.42% : 0.000018s : 5: substitution.partial_eliminate 2.32% : 0.000018s : 22: substitution.remove_not_recompute_node 3.15% : 0.000024s : 9: substitution.replace_applicator 1.38% : 0.000010s : 12: substitution.replace_old_param 0.36% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.52% : 0.000012s : 4: substitution.transpose_eliminate 4.48% : 0.000034s : 11: substitution.tuple_list_convert_item_index_to_positive 2.31% : 0.000018s : 11: substitution.tuple_list_get_item_const_eliminator 2.75% : 0.000021s : 11: substitution.tuple_list_get_item_depend_reorder 7.50% : 0.000057s : 22: substitution.tuple_list_get_item_eliminator 2.89% : 0.000022s : 11: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.020971 2 91.57% : 0.019202s : 1: type_inference.infer 8.43% : 0.001768s : 1: type_inference.specialize ------[replace.] 0.000157 20 64.87% : 0.000102s : 12: replace.inline 35.13% : 0.000055s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.000403 20 96.55% : 0.000389s : 12: match.inline 3.45% : 0.000014s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.000747 5558 1.05% : 0.000008s : 64: predicate.accumulaten_eliminater 0.41% : 0.000003s : 9: predicate.ad_related_special_op_eliminate 0.58% : 0.000004s : 35: predicate.addn_check_dump 1.07% : 0.000008s : 64: predicate.addn_zero_filter 0.99% : 0.000007s : 64: predicate.adjust_all_reduce_mul_add 2.08% : 0.000016s : 99: predicate.arithmetic_simplify 1.07% : 0.000008s : 64: predicate.cast_eliminate 1.26% : 0.000009s : 71: predicate.check_bprop_eliminate 0.60% : 0.000004s : 35: predicate.compare_switch_simplify 0.12% : 0.000001s : 10: predicate.const_output_eliminate 0.61% : 0.000005s : 35: predicate.depend_value_elim 1.12% : 0.000008s : 64: predicate.dict_get_item_const_eliminator 1.24% : 0.000009s : 64: predicate.dict_get_item_eliminator 1.05% : 0.000008s : 64: predicate.dict_set_item_eliminator 0.46% : 0.000003s : 19: predicate.dumpgradient_eliminate 0.11% : 0.000001s : 9: predicate.elim_not_effective 0.24% : 0.000002s : 9: predicate.elim_shapecalc_of_broadcastargs 1.21% : 0.000009s : 74: predicate.environ_add_const_eliminate 1.19% : 0.000009s : 74: predicate.environ_get_add_eliminate 1.17% : 0.000009s : 74: predicate.environ_get_depend_swap 1.84% : 0.000014s : 109: predicate.environ_get_eliminate 1.17% : 0.000009s : 74: predicate.environ_get_set_eliminate 1.40% : 0.000010s : 84: predicate.exchange_switch_depend_value 2.01% : 0.000015s : 84: predicate.float_depend_g_call 0.62% : 0.000005s : 35: predicate.float_environ_get_switch 0.77% : 0.000006s : 45: predicate.float_tuple_getitem_switch 0.10% : 0.000001s : 9: predicate.fold_const_symbol 0.67% : 0.000005s : 35: predicate.get_grad_eliminate 0.11% : 0.000001s : 9: predicate.graph_param_transform 0.62% : 0.000005s : 35: predicate.incorporate_call 0.57% : 0.000004s : 35: predicate.incorporate_call_switch 5.27% : 0.000039s : 238: predicate.inline 1.37% : 0.000010s : 58: predicate.inline_without_move 0.34% : 0.000003s : 35: predicate.j_node_and_user_rematch 0.86% : 0.000006s : 35: predicate.less_batch_normalization 1.66% : 0.000012s : 91: predicate.list_to_tuple_eliminator_ 2.49% : 0.000019s : 156: predicate.load_eliminater 0.42% : 0.000003s : 10: predicate.loop_unroll_after_grad 1.89% : 0.000014s : 104: predicate.loop_unroll_before_grad 1.45% : 0.000011s : 84: predicate.make_slice_get_slice_eliminator 0.64% : 0.000005s : 35: predicate.merge_addn 1.21% : 0.000009s : 71: predicate.micro_step_allgather_replace 1.22% : 0.000009s : 71: predicate.mini_step_allgather_replace 1.08% : 0.000008s : 64: predicate.minmaximum_grad 0.46% : 0.000003s : 10: predicate.mutable_eliminate 0.20% : 0.000001s : 9: predicate.opt_reshape 0.20% : 0.000002s : 10: predicate.parallel_virtual_node 1.84% : 0.000014s : 84: predicate.partial_defer_inline 1.56% : 0.000012s : 82: predicate.partial_eliminate 1.03% : 0.000008s : 64: predicate.print_const_string_wrapper 0.62% : 0.000005s : 35: predicate.reduce_all_const_elim 1.34% : 0.000010s : 64: predicate.reduce_eliminate 2.50% : 0.000019s : 156: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000003s : 35: predicate.remove_not_recompute_node 1.81% : 0.000014s : 143: predicate.replace_applicator 0.67% : 0.000005s : 58: predicate.replace_old_param 0.13% : 0.000001s : 10: predicate.reset_defer_inline 1.07% : 0.000008s : 64: predicate.reshape_eliminate 1.27% : 0.000010s : 71: predicate.row_tensor_add_zeros_like 0.21% : 0.000002s : 10: predicate.row_tensor_eliminate 1.41% : 0.000010s : 71: predicate.same_eliminate 0.41% : 0.000003s : 35: predicate.set_cell_output_no_recompute 0.70% : 0.000005s : 35: predicate.shard_identity_eliminate 0.39% : 0.000003s : 19: predicate.special_op_eliminate 0.69% : 0.000005s : 35: predicate.specialize_transform 1.37% : 0.000010s : 71: predicate.split_environ_get_set_with_tuple_value 1.24% : 0.000009s : 58: predicate.stack_unstack_eliminate 0.19% : 0.000001s : 10: predicate.switch_call_monad_eliminater 1.51% : 0.000011s : 84: predicate.switch_defer_inline 2.81% : 0.000021s : 155: predicate.switch_layer_defer_inline 4.33% : 0.000032s : 232: predicate.switch_simplify 1.03% : 0.000008s : 64: predicate.tile_eliminate 1.08% : 0.000008s : 64: predicate.transpose_eliminate 1.52% : 0.000011s : 83: predicate.tuple_list_convert_item_index_to_positive 1.61% : 0.000012s : 83: predicate.tuple_list_get_item_const_eliminator 1.42% : 0.000011s : 83: predicate.tuple_list_get_item_depend_reorder 2.71% : 0.000020s : 126: predicate.tuple_list_get_item_eliminator 1.59% : 0.000012s : 83: predicate.tuple_list_get_set_item_eliminator 2.20% : 0.000016s : 118: predicate.tuple_list_set_item_eliminator 1.54% : 0.000012s : 91: predicate.tuple_to_list_eliminator_ 2.44% : 0.000018s : 156: predicate.updatestate_pure_node_eliminater 3.13% : 0.000023s : 191: predicate.updatestate_useless_node_eliminater 0.20% : 0.000001s : 10: predicate.value_based_eliminate 0.67% : 0.000005s : 35: predicate.virtual_dataset_eliminate 0.70% : 0.000005s : 35: predicate.virtual_output_eliminate 0.17% : 0.000001s : 9: predicate.virtual_view_grad_eliminate 0.22% : 0.000002s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002134 25 48.29% : 0.001030s : 9: func_graph_cloner_run.FuncGraphClonerGraph 51.71% : 0.001103s : 16: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.088548 237 0.00% : 0.000004s : 1: ForceFp32Comm 4.79% : 0.004245s : 1: add_attr 4.78% : 0.004231s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.08% : 0.000073s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.15% : 0.000133s : 1: auto_monad 0.04% : 0.000032s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000006s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.81% : 0.000721s : 1: bootstrap 0.03% : 0.000030s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000024s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.06% : 0.000051s : 1: cse_after_recomputation 0.02% : 0.000014s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000014s : 1: environ_conv 0.03% : 0.000026s : 1: event_method 0.02% : 0.000016s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000006s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.01% : 0.000008s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.01% : 0.000008s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000005s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.54% : 0.000482s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.75% : 0.000668s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.03% : 0.000023s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000022s : 1: opt.transform.mutable_eliminate 5.84% : 0.005175s : 117: opt.transform.opt_a 0.08% : 0.000070s : 1: opt.transform.opt_after_cconv 0.05% : 0.000044s : 1: opt.transform.opt_after_jit_grad 0.32% : 0.000283s : 28: opt.transform.opt_b 0.11% : 0.000102s : 2: opt.transform.opt_trans_graph 0.08% : 0.000069s : 4: opt.transform.symbol_engine_opt 15.71% : 0.013913s : 1: opt_a 0.20% : 0.000181s : 1: opt_after_cconv 0.59% : 0.000525s : 1: opt_after_jit_grad 0.49% : 0.000438s : 1: opt_b 18.84% : 0.016686s : 1: optimize 0.03% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.04% : 0.000033s : 1: overlap_grad_flash_sp 0.01% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000010s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000004s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.05% : 0.000044s : 1: pre_auto_parallel 0.02% : 0.000016s : 1: py_interpret_to_execute 0.01% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.09% : 0.000083s : 1: remove_dup_value 2.80% : 0.002476s : 2: renormalize.infer 2.16% : 0.001911s : 2: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000009s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000033s : 1: rewriter_after_opt_a 0.10% : 0.000090s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000006s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.15% : 0.000137s : 1: symbol_engine_optimizer 15.62% : 0.013827s : 1: task_emit 0.15% : 0.000134s : 1: tuple_transform 23.80% : 0.021073s : 1: type_inference 0.12% : 0.000106s : 1: validate TotalTime = 0.0305207, [24] [bootstrap]: 0.00043174 [type_inference]: 0.00747083 [event_method]: 1.273e-05 [auto_monad]: 6.553e-05 [graph_reusing]: 6.07999e-06 [inline]: 2.27001e-06 [add_attr]: 0.00337219, [1] [add_attr_with_inline]: 0.00336304, [1] [Cycle 1]: 5.439e-05, [2] [tag_attr]: 1.594e-05 [meta_addattr_fg_expand]: 3.71999e-06 [parallel-infer-symbol]: 3.32997e-06 [pre_auto_parallel]: 2.812e-05 [insert-virtual-dataset]: 2.80002e-06 [parallel-infer-symbol-second]: 7.10017e-07 [dataset_repeat_opt]: 2.19001e-06 [pipeline_split]: 1.81e-06 [optimize]: 0.00457677, [53] [py_interpret_to_execute]: 5.21998e-06 [rewriter_before_opt_a]: 4.653e-05 [opt_a]: 0.00251168, [2] [Cycle 1]: 0.00189208, [45] [expand_dump_flag]: 3.25998e-06 [switch_simplify]: 3.081e-05 [loop_unroll]: 2.107e-05 [a_1]: 0.00046248 [with_stream_mark]: 1.637e-05 [recompute_prepare]: 9.92001e-06 [updatestate_depend_eliminate]: 3.81001e-06 [updatestate_assign_eliminate]: 3.77998e-06 [updatestate_loads_eliminate]: 3.14999e-06 [parameter_eliminate]: 1.96e-06 [a_2]: 8.467e-05 [accelerated_algorithm]: 7.41999e-06 [shard]: 2.60002e-06 [meta_shard_fg_expand]: 2.05002e-06 [shard_inline]: 6.64999e-06 [merge_send_recv]: 8.43999e-06 [auto_parallel]: 7.01001e-06 [parallel]: 1.891e-05 [flash_sp]: 8.12e-06 [merge_comm]: 3.46001e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 9.74999e-06 [allreduce_slice_to_reducescatter]: 7.89994e-07 [virtual_shard_identity]: 8.40999e-06 [virtual_dataset]: 6.94999e-06 [get_grad_eliminate_]: 7.42002e-06 [virtual_output]: 7.21999e-06 [merge_forward]: 4.36002e-06 [cell_reuse_recompute_pass]: 1.33002e-06 [offload_activation]: 1.032e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.409e-05 [merge_recompute_call_nodes]: 1.64e-06 [before_grad]: 1.143e-05 [set_forward_comm_id_for_comm_node_pass]: 3.55e-06 [meta_fg_expand]: 2.84999e-06 [flash_sp_send_recv_attached]: 2.58e-06 [receive_attached]: 2.51e-06 [after_resolve]: 1.194e-05 [a_after_grad]: 1.05e-05 [renormalize]: 0.00070391 [add_forward_monad_depend]: 5.94e-06 [auto_monad_grad]: 1.91e-06 [auto_monad_eliminator]: 1.535e-05 [cse]: 2.999e-05 [a_3]: 4.643e-05 [Cycle 2]: 0.00060947, [45] [expand_dump_flag]: 1.28002e-06 [switch_simplify]: 7.66001e-06 [loop_unroll]: 6.21e-06 [a_1]: 0.00013037 [with_stream_mark]: 1.135e-05 [recompute_prepare]: 6.06e-06 [updatestate_depend_eliminate]: 2.91e-06 [updatestate_assign_eliminate]: 2.17999e-06 [updatestate_loads_eliminate]: 2.59999e-06 [parameter_eliminate]: 9.70002e-07 [a_2]: 6.812e-05 [accelerated_algorithm]: 5.95002e-06 [shard]: 1.06002e-06 [meta_shard_fg_expand]: 1.33002e-06 [shard_inline]: 5.93998e-06 [merge_send_recv]: 4.35999e-06 [auto_parallel]: 5.07999e-06 [parallel]: 4.70999e-06 [flash_sp]: 3.00002e-06 [merge_comm]: 2.73998e-06 [allreduce_fusion]: 2.81999e-06 [matmul_add_comm_reduction]: 5.89e-06 [allreduce_slice_to_reducescatter]: 4.50003e-07 [virtual_shard_identity]: 6.63e-06 [virtual_dataset]: 5.69e-06 [get_grad_eliminate_]: 5.64e-06 [virtual_output]: 5.64998e-06 [merge_forward]: 2.41e-06 [cell_reuse_recompute_pass]: 1.40001e-06 [offload_activation]: 5.46e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.289e-05 [merge_recompute_call_nodes]: 7.00005e-07 [before_grad]: 8.95001e-06 [set_forward_comm_id_for_comm_node_pass]: 3.49001e-06 [meta_fg_expand]: 1.97001e-06 [flash_sp_send_recv_attached]: 7.7e-07 [receive_attached]: 9.00007e-07 [after_resolve]: 9.51e-06 [a_after_grad]: 8.92e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.18001e-06 [auto_monad_grad]: 8.70001e-07 [auto_monad_eliminator]: 6.62002e-06 [cse]: 1.271e-05 [a_3]: 3.485e-05 [py_interpret_to_execute_after_opt_a]: 3.94002e-06 [slice_cell_reuse_recomputed_activation]: 1.86998e-06 [rewriter_after_opt_a]: 1.659e-05 [convert_after_rewriter]: 1.19998e-06 [order_py_execute_after_rewriter]: 1.10001e-06 [mutable_eliminate]: 0.00056943 [opt_b]: 0.00022114, [1] [Cycle 1]: 0.00021443, [7] [b_1]: 0.00013549 [b_2]: 8.44998e-06 [updatestate_depend_eliminate]: 5.46e-06 [updatestate_assign_eliminate]: 2.65002e-06 [updatestate_loads_eliminate]: 2.64999e-06 [renormalize]: 5.3001e-07 [cse]: 1.719e-05 [optimize_parallel_all_gather_comm]: 1.682e-05 [overlap_param_gather]: 2.12999e-06 [cconv]: 2.715e-05 [loop_unroll]: 0.00046443 [opt_after_cconv]: 0.00010734, [1] [Cycle 1]: 0.0001016, [7] [c_1]: 3.376e-05 [parameter_eliminate]: 2.53e-06 [updatestate_depend_eliminate]: 5.14e-06 [updatestate_assign_eliminate]: 2.66999e-06 [updatestate_loads_eliminate]: 2.51998e-06 [cse]: 1.663e-05 [renormalize]: 3.89991e-07 [remove_dup_value]: 1.485e-05 [tuple_transform]: 8.465e-05, [1] [Cycle 1]: 7.97e-05, [4] [d_1]: 4.911e-05 [none_parameter_eliminate]: 1.74e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 7.41001e-06 [partial_unused_args_eliminate]: 2.09e-06 [add_recomputation]: 5.057e-05 [cse_after_recomputation]: 2.226e-05, [1] [Cycle 1]: 1.768e-05, [1] [cse]: 1.111e-05 [environ_conv]: 5.57999e-06 [swap_dp_allreduce_reducescatter]: 5.59e-06 [bias_add_comm_swap]: 2.69999e-06 [label_micro_interleaved_index]: 4.23999e-06 [label_fine_grained_interleaved_index]: 2.61e-06 [merge_cast_opt]: 1.34e-06 [slice_recompute_activation]: 2.14999e-06 [micro_interleaved_order_control]: 2.53e-06 [assign_add_opt]: 1.51002e-06 [ForceFp32Comm]: 8.59989e-07 [remove_cast_before_assign_add]: 1.09003e-06 [full_micro_interleaved_order_control]: 2.71999e-06 [reorder_send_recv_between_fp_bp]: 3.29001e-06 [comm_op_add_attrs]: 1.23002e-06 [add_comm_op_reuse_tag]: 1.02e-06 [interleave_split_concat_branches]: 1.30001e-06 [interleave_parallel_branches]: 1.60999e-06 [overlap_opt_shard_in_pipeline]: 1.21997e-06 [overlap_opt_shard_grad_in_pipeline]: 1.85001e-06 [control_data_broadcast_order]: 1.216e-05 [grouped_pairwise_exchange_alltoall]: 1.95001e-06 [offloading_packed_experts]: 3.8e-06 [overlap_recompute_and_grad_model_parallel]: 5.17999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.38002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.49e-06 [overlap_recompute_comm]: 2.77002e-06 [overlap_grad_ring_attention]: 4.42e-06 [overlap_grad_flash_sp]: 1.864e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.46998e-06 [split_layernorm_comm]: 1.85001e-06 [handle_group_info]: 1.16997e-06 [symbol_engine_optimizer]: 8.239e-05, [1] [Cycle 1]: 7.737e-05, [6] [build]: 2.53e-06 [elim_shapecalc]: 1.088e-05 [elim_not_effective]: 1.274e-05 [opt_reshape]: 7.80998e-06 [fold_const_symbol]: 1.006e-05 [renormalize]: 1.60013e-07 [detach_backward]: 1.97001e-06 [pipeline_parallel_scheduler]: 1.47999e-06 [auto_monad_reorder]: 1.632e-05 [get_jit_bprop_graph]: 1.27999e-06 [rewriter_after_jit_bprop_graph]: 3.69002e-06 [opt_after_jit_grad]: 0.00054043 [validate]: 3.785e-05 [backend_pass]: 1.40001e-06 [task_emit]: 0.0137172 [execute]: 9.14e-06 Sums bootstrap : 0.000432s : 1.65% type_inference : 0.007471s : 28.62% event_method : 0.000013s : 0.05% auto_monad : 0.000066s : 0.25% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000016s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.01% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000028s : 0.11% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000005s : 0.02% optimize.rewriter_before_opt_a : 0.000047s : 0.18% optimize.opt_a.expand_dump_flag : 0.000005s : 0.02% optimize.opt_a.switch_simplify : 0.000038s : 0.15% optimize.opt_a.loop_unroll : 0.000027s : 0.10% optimize.opt_a.a_1 : 0.000593s : 2.27% optimize.opt_a.with_stream_mark : 0.000028s : 0.11% optimize.opt_a.recompute_prepare : 0.000016s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000153s : 0.59% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.05% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.05% optimize.opt_a.merge_send_recv : 0.000013s : 0.05% optimize.opt_a.auto_parallel : 0.000012s : 0.05% optimize.opt_a.parallel : 0.000024s : 0.09% optimize.opt_a.flash_sp : 0.000011s : 0.04% optimize.opt_a.merge_comm : 0.000006s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.06% optimize.opt_a.virtual_dataset : 0.000013s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.05% optimize.opt_a.virtual_output : 0.000013s : 0.05% optimize.opt_a.merge_forward : 0.000007s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000027s : 0.10% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.03% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000021s : 0.08% optimize.opt_a.a_after_grad : 0.000019s : 0.07% optimize.opt_a.renormalize : 0.000704s : 2.70% optimize.opt_a.add_forward_monad_depend : 0.000007s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.08% optimize.opt_a.cse : 0.000043s : 0.16% optimize.opt_a.a_3 : 0.000081s : 0.31% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000017s : 0.06% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000569s : 2.18% optimize.opt_b.b_1 : 0.000135s : 0.52% optimize.opt_b.b_2 : 0.000008s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000027s : 0.10% optimize.loop_unroll : 0.000464s : 1.78% optimize.opt_after_cconv.c_1 : 0.000034s : 0.13% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.cse : 0.000017s : 0.06% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.06% optimize.tuple_transform.d_1 : 0.000049s : 0.19% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000051s : 0.19% optimize.cse_after_recomputation.cse : 0.000011s : 0.04% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000002s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000002s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000012s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000019s : 0.07% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.04% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.01% auto_monad_reorder : 0.000016s : 0.06% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000540s : 2.07% validate : 0.000038s : 0.14% backend_pass : 0.000001s : 0.01% task_emit : 0.013717s : 52.55% execute : 0.000009s : 0.04% Time group info: ------[substitution.] 0.000144 23 1.25% : 0.000002s : 2: substitution.elim_not_effective 0.86% : 0.000001s : 2: substitution.fold_const_symbol 3.89% : 0.000006s : 4: substitution.graph_param_transform 85.63% : 0.000123s : 3: substitution.inline 2.31% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.28% : 0.000005s : 4: substitution.remove_not_recompute_node 2.78% : 0.000004s : 4: substitution.replace_old_param ------[type_inference.] 0.007420 2 92.71% : 0.006879s : 1: type_inference.infer 7.29% : 0.000541s : 1: type_inference.specialize ------[replace.] 0.000028 3 100.00% : 0.000028s : 3: replace.inline ------[match.] 0.000121 3 100.00% : 0.000121s : 3: match.inline ------[predicate.] 0.000158 1047 0.82% : 0.000001s : 10: predicate.accumulaten_eliminater 0.94% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 8: predicate.addn_check_dump 0.84% : 0.000001s : 10: predicate.addn_zero_filter 0.99% : 0.000002s : 10: predicate.adjust_all_reduce_mul_add 2.18% : 0.000003s : 18: predicate.arithmetic_simplify 0.83% : 0.000001s : 10: predicate.cast_eliminate 0.78% : 0.000001s : 8: predicate.check_bprop_eliminate 0.63% : 0.000001s : 8: predicate.compare_switch_simplify 0.23% : 0.000000s : 4: predicate.const_output_eliminate 0.71% : 0.000001s : 8: predicate.depend_value_elim 0.78% : 0.000001s : 10: predicate.dict_get_item_const_eliminator 1.11% : 0.000002s : 10: predicate.dict_get_item_eliminator 0.85% : 0.000001s : 10: predicate.dict_set_item_eliminator 1.23% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.26% : 0.000000s : 4: predicate.elim_not_effective 0.46% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.28% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.03% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.12% : 0.000002s : 14: predicate.environ_get_depend_swap 1.84% : 0.000003s : 22: predicate.environ_get_eliminate 1.06% : 0.000002s : 14: predicate.environ_get_set_eliminate 0.99% : 0.000002s : 13: predicate.exchange_switch_depend_value 1.85% : 0.000003s : 13: predicate.float_depend_g_call 0.63% : 0.000001s : 8: predicate.float_environ_get_switch 1.02% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.25% : 0.000000s : 4: predicate.fold_const_symbol 0.94% : 0.000001s : 8: predicate.get_grad_eliminate 0.35% : 0.000001s : 4: predicate.graph_param_transform 0.69% : 0.000001s : 8: predicate.incorporate_call 0.58% : 0.000001s : 8: predicate.incorporate_call_switch 5.46% : 0.000009s : 47: predicate.inline 0.85% : 0.000001s : 8: predicate.inline_without_move 0.39% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.88% : 0.000001s : 8: predicate.less_batch_normalization 1.84% : 0.000003s : 18: predicate.list_to_tuple_eliminator_ 2.21% : 0.000003s : 28: predicate.load_eliminater 1.17% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.10% : 0.000003s : 23: predicate.loop_unroll_before_grad 1.80% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.73% : 0.000001s : 8: predicate.merge_addn 0.75% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.77% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.72% : 0.000001s : 10: predicate.minmaximum_grad 1.25% : 0.000002s : 4: predicate.mutable_eliminate 0.52% : 0.000001s : 4: predicate.opt_reshape 0.47% : 0.000001s : 4: predicate.parallel_virtual_node 1.44% : 0.000002s : 13: predicate.partial_defer_inline 1.16% : 0.000002s : 14: predicate.partial_eliminate 0.94% : 0.000001s : 10: predicate.print_const_string_wrapper 0.70% : 0.000001s : 8: predicate.reduce_all_const_elim 1.05% : 0.000002s : 10: predicate.reduce_eliminate 2.26% : 0.000004s : 28: predicate.redundant_stop_gradient_eliminater 0.70% : 0.000001s : 8: predicate.remove_not_recompute_node 1.31% : 0.000002s : 18: predicate.replace_applicator 0.71% : 0.000001s : 8: predicate.replace_old_param 0.32% : 0.000001s : 4: predicate.reset_defer_inline 1.00% : 0.000002s : 10: predicate.reshape_eliminate 0.74% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.58% : 0.000001s : 4: predicate.row_tensor_eliminate 0.87% : 0.000001s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 1.06% : 0.000002s : 8: predicate.shard_identity_eliminate 0.79% : 0.000001s : 8: predicate.special_op_eliminate 0.79% : 0.000001s : 8: predicate.specialize_transform 1.13% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.93% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.37% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.14% : 0.000002s : 13: predicate.switch_defer_inline 1.82% : 0.000003s : 21: predicate.switch_layer_defer_inline 4.75% : 0.000008s : 48: predicate.switch_simplify 0.79% : 0.000001s : 10: predicate.tile_eliminate 0.99% : 0.000002s : 10: predicate.transpose_eliminate 1.64% : 0.000003s : 18: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000002s : 18: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000002s : 18: predicate.tuple_list_get_item_depend_reorder 3.29% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.60% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000004s : 26: predicate.tuple_list_set_item_eliminator 1.61% : 0.000003s : 18: predicate.tuple_to_list_eliminator_ 2.04% : 0.000003s : 28: predicate.updatestate_pure_node_eliminater 2.80% : 0.000004s : 36: predicate.updatestate_useless_node_eliminater 0.51% : 0.000001s : 4: predicate.value_based_eliminate 0.89% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.84% : 0.000001s : 8: predicate.virtual_output_eliminate 0.30% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.52% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000349 6 5.43% : 0.000019s : 1: func_graph_cloner_run.FuncGraphClonerGraph 94.57% : 0.000330s : 5: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.040368 196 0.01% : 0.000004s : 1: ForceFp32Comm 8.37% : 0.003377s : 1: add_attr 8.34% : 0.003367s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.14% : 0.000055s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.18% : 0.000071s : 1: auto_monad 0.05% : 0.000020s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: backend_pass 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.15% : 0.000463s : 1: bootstrap 0.08% : 0.000031s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.06% : 0.000025s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.05% : 0.000018s : 1: event_method 0.04% : 0.000015s : 1: execute 0.02% : 0.000007s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000006s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.17% : 0.000473s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 1.43% : 0.000578s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000015s : 1: opt.transform.mutable_eliminate 2.46% : 0.000993s : 78: opt.transform.opt_a 0.08% : 0.000032s : 1: opt.transform.opt_after_cconv 0.06% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.28% : 0.000112s : 28: opt.transform.opt_b 0.13% : 0.000054s : 2: opt.transform.opt_trans_graph 0.09% : 0.000038s : 4: opt.transform.symbol_engine_opt 6.23% : 0.002515s : 1: opt_a 0.27% : 0.000111s : 1: opt_after_cconv 1.36% : 0.000550s : 1: opt_after_jit_grad 0.56% : 0.000225s : 1: opt_b 11.35% : 0.004581s : 1: optimize 0.05% : 0.000021s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.06% : 0.000023s : 1: overlap_grad_flash_sp 0.01% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.02% : 0.000006s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000032s : 1: pre_auto_parallel 0.02% : 0.000010s : 1: py_interpret_to_execute 0.02% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000018s : 1: remove_dup_value 0.88% : 0.000357s : 1: renormalize.infer 0.84% : 0.000338s : 1: renormalize.specialize 0.02% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000020s : 1: rewriter_after_opt_a 0.13% : 0.000051s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000006s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.21% : 0.000085s : 1: symbol_engine_optimizer 34.02% : 0.013734s : 1: task_emit 0.22% : 0.000088s : 1: tuple_transform 18.54% : 0.007485s : 1: type_inference 0.17% : 0.000069s : 1: validate TotalTime = 0.0877454, [24] [bootstrap]: 0.00052977 [type_inference]: 0.0178966 [event_method]: 1.962e-05 [auto_monad]: 9.777e-05 [graph_reusing]: 7.4e-06 [inline]: 2.26e-06 [add_attr]: 0.00336291, [1] [add_attr_with_inline]: 0.00335452, [1] [Cycle 1]: 6.031e-05, [2] [tag_attr]: 2.45e-05 [meta_addattr_fg_expand]: 6.66e-06 [parallel-infer-symbol]: 3.71001e-06 [pre_auto_parallel]: 3.645e-05 [insert-virtual-dataset]: 2.96001e-06 [parallel-infer-symbol-second]: 6.89994e-07 [dataset_repeat_opt]: 1.72001e-06 [pipeline_split]: 1.69e-06 [optimize]: 0.0141515, [53] [py_interpret_to_execute]: 4.45999e-06 [rewriter_before_opt_a]: 8.159e-05 [opt_a]: 0.0116702, [3] [Cycle 1]: 0.00714962, [45] [expand_dump_flag]: 3.62002e-06 [switch_simplify]: 4.889e-05 [loop_unroll]: 3.723e-05 [a_1]: 0.00100275 [with_stream_mark]: 2.467e-05 [recompute_prepare]: 2.271e-05 [updatestate_depend_eliminate]: 9.20001e-06 [updatestate_assign_eliminate]: 7.91001e-06 [updatestate_loads_eliminate]: 7.63001e-06 [parameter_eliminate]: 2.69999e-06 [a_2]: 0.00024066 [accelerated_algorithm]: 3.075e-05 [shard]: 2.01e-06 [meta_shard_fg_expand]: 3.90998e-06 [shard_inline]: 1.608e-05 [merge_send_recv]: 1.671e-05 [auto_parallel]: 1.174e-05 [parallel]: 1.859e-05 [flash_sp]: 9.46e-06 [merge_comm]: 9.47001e-06 [allreduce_fusion]: 9.17999e-06 [matmul_add_comm_reduction]: 2.913e-05 [allreduce_slice_to_reducescatter]: 5.89993e-07 [virtual_shard_identity]: 1.746e-05 [virtual_dataset]: 1.607e-05 [get_grad_eliminate_]: 1.56e-05 [virtual_output]: 1.659e-05 [merge_forward]: 9.58997e-06 [cell_reuse_recompute_pass]: 1.17e-06 [offload_activation]: 1.822e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.896e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 2.828e-05 [set_forward_comm_id_for_comm_node_pass]: 1.026e-05 [meta_fg_expand]: 0.00155496 [flash_sp_send_recv_attached]: 6.83998e-06 [receive_attached]: 2.79999e-06 [after_resolve]: 6.34e-05 [a_after_grad]: 8.393e-05 [renormalize]: 0.00279545 [add_forward_monad_depend]: 9.85002e-06 [auto_monad_grad]: 5.97999e-06 [auto_monad_eliminator]: 5.826e-05 [cse]: 0.00019146 [a_3]: 0.00035328 [Cycle 2]: 0.00338934, [45] [expand_dump_flag]: 2.12001e-06 [switch_simplify]: 4.942e-05 [loop_unroll]: 4.623e-05 [a_1]: 0.00156708 [with_stream_mark]: 1.451e-05 [recompute_prepare]: 1.367e-05 [updatestate_depend_eliminate]: 5.96e-06 [updatestate_assign_eliminate]: 5.02e-06 [updatestate_loads_eliminate]: 4.83001e-06 [parameter_eliminate]: 1.29e-06 [a_2]: 0.0001495 [accelerated_algorithm]: 1.604e-05 [shard]: 1.15001e-06 [meta_shard_fg_expand]: 2.78e-06 [shard_inline]: 1.204e-05 [merge_send_recv]: 9.29e-06 [auto_parallel]: 9.36002e-06 [parallel]: 5.64e-06 [flash_sp]: 3.48999e-06 [merge_comm]: 6.87002e-06 [allreduce_fusion]: 6.39001e-06 [matmul_add_comm_reduction]: 1.022e-05 [allreduce_slice_to_reducescatter]: 7.00005e-07 [virtual_shard_identity]: 1.259e-05 [virtual_dataset]: 1.142e-05 [get_grad_eliminate_]: 1.093e-05 [virtual_output]: 1.107e-05 [merge_forward]: 5.53002e-06 [cell_reuse_recompute_pass]: 9.49978e-07 [offload_activation]: 1.145e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.039e-05 [merge_recompute_call_nodes]: 8.39995e-07 [before_grad]: 1.797e-05 [set_forward_comm_id_for_comm_node_pass]: 5.94999e-06 [meta_fg_expand]: 8.808e-05 [flash_sp_send_recv_attached]: 1.27999e-06 [receive_attached]: 1.32e-06 [after_resolve]: 1.811e-05 [a_after_grad]: 1.842e-05 [renormalize]: 0.00078276 [add_forward_monad_depend]: 4.18001e-06 [auto_monad_grad]: 1.60999e-06 [auto_monad_eliminator]: 1.603e-05 [cse]: 8.253e-05 [a_3]: 8.25e-05 [Cycle 3]: 0.00111652, [45] [expand_dump_flag]: 9.99979e-07 [switch_simplify]: 1.279e-05 [loop_unroll]: 1.144e-05 [a_1]: 0.00034991 [with_stream_mark]: 1.239e-05 [recompute_prepare]: 1.257e-05 [updatestate_depend_eliminate]: 5.35999e-06 [updatestate_assign_eliminate]: 4.69998e-06 [updatestate_loads_eliminate]: 4.75001e-06 [parameter_eliminate]: 1.02998e-06 [a_2]: 0.00014688 [accelerated_algorithm]: 1.43e-05 [shard]: 1.19e-06 [meta_shard_fg_expand]: 2.24001e-06 [shard_inline]: 1.122e-05 [merge_send_recv]: 7.31999e-06 [auto_parallel]: 7.88999e-06 [parallel]: 4.53999e-06 [flash_sp]: 1.20001e-06 [merge_comm]: 5.49e-06 [allreduce_fusion]: 5.24e-06 [matmul_add_comm_reduction]: 8.29002e-06 [allreduce_slice_to_reducescatter]: 5.09986e-07 [virtual_shard_identity]: 1.214e-05 [virtual_dataset]: 1.123e-05 [get_grad_eliminate_]: 1.217e-05 [virtual_output]: 1.106e-05 [merge_forward]: 5.94e-06 [cell_reuse_recompute_pass]: 1.46998e-06 [offload_activation]: 1.155e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.109e-05 [merge_recompute_call_nodes]: 8.50006e-07 [before_grad]: 1.893e-05 [set_forward_comm_id_for_comm_node_pass]: 6.17999e-06 [meta_fg_expand]: 3.71999e-06 [flash_sp_send_recv_attached]: 8.80013e-07 [receive_attached]: 1.03001e-06 [after_resolve]: 1.614e-05 [a_after_grad]: 1.815e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 2.08002e-06 [auto_monad_grad]: 1.26002e-06 [auto_monad_eliminator]: 1.368e-05 [cse]: 3.832e-05 [a_3]: 7.553e-05 [py_interpret_to_execute_after_opt_a]: 5.46e-06 [slice_cell_reuse_recomputed_activation]: 2.29999e-06 [rewriter_after_opt_a]: 3.09e-05 [convert_after_rewriter]: 1.69998e-06 [order_py_execute_after_rewriter]: 1.60999e-06 [mutable_eliminate]: 0.00060272 [opt_b]: 0.00036081, [1] [Cycle 1]: 0.00035453, [7] [b_1]: 0.00024612 [b_2]: 1.358e-05 [updatestate_depend_eliminate]: 8.33001e-06 [updatestate_assign_eliminate]: 4.72e-06 [updatestate_loads_eliminate]: 5.02e-06 [renormalize]: 4.19997e-07 [cse]: 4.166e-05 [optimize_parallel_all_gather_comm]: 2.227e-05 [overlap_param_gather]: 2.04e-06 [cconv]: 2.381e-05 [loop_unroll]: 0.00045339 [opt_after_cconv]: 0.00016196, [1] [Cycle 1]: 0.0001559, [7] [c_1]: 6.437e-05 [parameter_eliminate]: 2.54001e-06 [updatestate_depend_eliminate]: 8.21002e-06 [updatestate_assign_eliminate]: 4.77e-06 [updatestate_loads_eliminate]: 4.90001e-06 [cse]: 3.766e-05 [renormalize]: 3.59985e-07 [remove_dup_value]: 4.42e-05 [tuple_transform]: 0.0001202, [1] [Cycle 1]: 0.00011551, [4] [d_1]: 8.277e-05 [none_parameter_eliminate]: 1.76e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 1.229e-05 [partial_unused_args_eliminate]: 1.84998e-06 [add_recomputation]: 6.456e-05 [cse_after_recomputation]: 3.608e-05, [1] [Cycle 1]: 3.145e-05, [1] [cse]: 2.509e-05 [environ_conv]: 9.76003e-06 [swap_dp_allreduce_reducescatter]: 9.12999e-06 [bias_add_comm_swap]: 2.63e-06 [label_micro_interleaved_index]: 4.65001e-06 [label_fine_grained_interleaved_index]: 2.49999e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.24001e-06 [micro_interleaved_order_control]: 2.88e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 1.06002e-06 [remove_cast_before_assign_add]: 1.27e-06 [full_micro_interleaved_order_control]: 2.61e-06 [reorder_send_recv_between_fp_bp]: 2.84999e-06 [comm_op_add_attrs]: 1.07998e-06 [add_comm_op_reuse_tag]: 1.10001e-06 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.12e-06 [overlap_opt_shard_in_pipeline]: 1.22999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.08002e-06 [control_data_broadcast_order]: 1.903e-05 [grouped_pairwise_exchange_alltoall]: 1.56998e-06 [offloading_packed_experts]: 5.38002e-06 [overlap_recompute_and_grad_model_parallel]: 6.09001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.12999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32e-06 [overlap_recompute_comm]: 2.74999e-06 [overlap_grad_ring_attention]: 5.82001e-06 [overlap_grad_flash_sp]: 2.838e-05 [begin_end_overlap_inline]: 5.69999e-07 [split_matmul_comm_elemetwise]: 2.24999e-06 [split_layernorm_comm]: 1.97001e-06 [handle_group_info]: 1.39e-06 [symbol_engine_optimizer]: 0.00011276, [1] [Cycle 1]: 0.0001081, [6] [build]: 1.131e-05 [elim_shapecalc]: 1.61e-05 [elim_not_effective]: 2.133e-05 [opt_reshape]: 1.281e-05 [fold_const_symbol]: 1.837e-05 [renormalize]: 1.90019e-07 [detach_backward]: 1.84e-06 [pipeline_parallel_scheduler]: 1.51002e-06 [auto_monad_reorder]: 2.633e-05 [get_jit_bprop_graph]: 1.50001e-06 [rewriter_after_jit_bprop_graph]: 3.50003e-06 [opt_after_jit_grad]: 0.00050767 [validate]: 5.748e-05 [backend_pass]: 8.80013e-07 [task_emit]: 0.0507932 [execute]: 9.20999e-06 Sums bootstrap : 0.000530s : 0.64% type_inference : 0.017897s : 21.54% event_method : 0.000020s : 0.02% auto_monad : 0.000098s : 0.12% graph_reusing : 0.000007s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.03% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.01% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000036s : 0.04% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.01% optimize.rewriter_before_opt_a : 0.000082s : 0.10% optimize.opt_a.expand_dump_flag : 0.000007s : 0.01% optimize.opt_a.switch_simplify : 0.000111s : 0.13% optimize.opt_a.loop_unroll : 0.000095s : 0.11% optimize.opt_a.a_1 : 0.002920s : 3.51% optimize.opt_a.with_stream_mark : 0.000052s : 0.06% optimize.opt_a.recompute_prepare : 0.000049s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000021s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000018s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000017s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000537s : 0.65% optimize.opt_a.accelerated_algorithm : 0.000061s : 0.07% optimize.opt_a.shard : 0.000004s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000009s : 0.01% optimize.opt_a.shard_inline : 0.000039s : 0.05% optimize.opt_a.merge_send_recv : 0.000033s : 0.04% optimize.opt_a.auto_parallel : 0.000029s : 0.03% optimize.opt_a.parallel : 0.000029s : 0.03% optimize.opt_a.flash_sp : 0.000014s : 0.02% optimize.opt_a.merge_comm : 0.000022s : 0.03% optimize.opt_a.allreduce_fusion : 0.000021s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000048s : 0.06% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000042s : 0.05% optimize.opt_a.virtual_dataset : 0.000039s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000039s : 0.05% optimize.opt_a.virtual_output : 0.000039s : 0.05% optimize.opt_a.merge_forward : 0.000021s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000041s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000070s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000065s : 0.08% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000022s : 0.03% optimize.opt_a.meta_fg_expand : 0.001647s : 1.98% optimize.opt_a.flash_sp_send_recv_attached : 0.000009s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000098s : 0.12% optimize.opt_a.a_after_grad : 0.000120s : 0.15% optimize.opt_a.renormalize : 0.003578s : 4.31% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.02% optimize.opt_a.auto_monad_grad : 0.000009s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000088s : 0.11% optimize.opt_a.cse : 0.000312s : 0.38% optimize.opt_a.a_3 : 0.000511s : 0.62% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000031s : 0.04% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000603s : 0.73% optimize.opt_b.b_1 : 0.000246s : 0.30% optimize.opt_b.b_2 : 0.000014s : 0.02% optimize.opt_b.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000042s : 0.05% optimize.optimize_parallel_all_gather_comm : 0.000022s : 0.03% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000024s : 0.03% optimize.loop_unroll : 0.000453s : 0.55% optimize.opt_after_cconv.c_1 : 0.000064s : 0.08% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.cse : 0.000038s : 0.05% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000044s : 0.05% optimize.tuple_transform.d_1 : 0.000083s : 0.10% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000012s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000065s : 0.08% optimize.cse_after_recomputation.cse : 0.000025s : 0.03% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000009s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000003s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000019s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000005s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000006s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000006s : 0.01% optimize.overlap_grad_flash_sp : 0.000028s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000016s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000021s : 0.03% optimize.symbol_engine_optimizer.opt_reshape : 0.000013s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000018s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000026s : 0.03% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000508s : 0.61% validate : 0.000057s : 0.07% backend_pass : 0.000001s : 0.00% task_emit : 0.050793s : 61.14% execute : 0.000009s : 0.01% Time group info: ------[substitution.] 0.000643 202 0.52% : 0.000003s : 6: substitution.elim_not_effective 0.64% : 0.000004s : 5: substitution.float_depend_g_call 0.73% : 0.000005s : 3: substitution.float_tuple_getitem_switch 0.39% : 0.000002s : 6: substitution.fold_const_symbol 1.43% : 0.000009s : 9: substitution.graph_param_transform 0.45% : 0.000003s : 2: substitution.incorporate_call 0.34% : 0.000002s : 2: substitution.incorporate_call_switch 55.38% : 0.000356s : 12: substitution.inline 2.74% : 0.000018s : 2: substitution.inline_without_move 1.93% : 0.000012s : 22: substitution.j_node_and_user_rematch 2.55% : 0.000016s : 3: substitution.less_batch_normalization 1.98% : 0.000013s : 11: substitution.minmaximum_grad 0.87% : 0.000006s : 5: substitution.partial_eliminate 2.45% : 0.000016s : 22: substitution.remove_not_recompute_node 3.12% : 0.000020s : 9: substitution.replace_applicator 1.66% : 0.000011s : 12: substitution.replace_old_param 0.44% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.95% : 0.000013s : 4: substitution.transpose_eliminate 4.83% : 0.000031s : 11: substitution.tuple_list_convert_item_index_to_positive 2.20% : 0.000014s : 11: substitution.tuple_list_get_item_const_eliminator 2.97% : 0.000019s : 11: substitution.tuple_list_get_item_depend_reorder 7.62% : 0.000049s : 22: substitution.tuple_list_get_item_eliminator 2.83% : 0.000018s : 11: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.017826 2 94.40% : 0.016827s : 1: type_inference.infer 5.60% : 0.000999s : 1: type_inference.specialize ------[replace.] 0.000149 20 64.13% : 0.000096s : 12: replace.inline 35.87% : 0.000053s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.000362 20 96.19% : 0.000349s : 12: match.inline 3.81% : 0.000014s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.000739 5558 1.05% : 0.000008s : 64: predicate.accumulaten_eliminater 0.38% : 0.000003s : 9: predicate.ad_related_special_op_eliminate 0.58% : 0.000004s : 35: predicate.addn_check_dump 1.07% : 0.000008s : 64: predicate.addn_zero_filter 0.98% : 0.000007s : 64: predicate.adjust_all_reduce_mul_add 2.12% : 0.000016s : 99: predicate.arithmetic_simplify 1.05% : 0.000008s : 64: predicate.cast_eliminate 1.25% : 0.000009s : 71: predicate.check_bprop_eliminate 0.60% : 0.000004s : 35: predicate.compare_switch_simplify 0.12% : 0.000001s : 10: predicate.const_output_eliminate 0.60% : 0.000004s : 35: predicate.depend_value_elim 1.14% : 0.000008s : 64: predicate.dict_get_item_const_eliminator 1.24% : 0.000009s : 64: predicate.dict_get_item_eliminator 1.03% : 0.000008s : 64: predicate.dict_set_item_eliminator 0.46% : 0.000003s : 19: predicate.dumpgradient_eliminate 0.11% : 0.000001s : 9: predicate.elim_not_effective 0.20% : 0.000001s : 9: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000009s : 74: predicate.environ_add_const_eliminate 1.17% : 0.000009s : 74: predicate.environ_get_add_eliminate 1.18% : 0.000009s : 74: predicate.environ_get_depend_swap 1.83% : 0.000014s : 109: predicate.environ_get_eliminate 1.18% : 0.000009s : 74: predicate.environ_get_set_eliminate 1.42% : 0.000010s : 84: predicate.exchange_switch_depend_value 1.98% : 0.000015s : 84: predicate.float_depend_g_call 0.59% : 0.000004s : 35: predicate.float_environ_get_switch 0.79% : 0.000006s : 45: predicate.float_tuple_getitem_switch 0.10% : 0.000001s : 9: predicate.fold_const_symbol 0.64% : 0.000005s : 35: predicate.get_grad_eliminate 0.11% : 0.000001s : 9: predicate.graph_param_transform 0.63% : 0.000005s : 35: predicate.incorporate_call 0.56% : 0.000004s : 35: predicate.incorporate_call_switch 5.32% : 0.000039s : 238: predicate.inline 1.33% : 0.000010s : 58: predicate.inline_without_move 0.34% : 0.000003s : 35: predicate.j_node_and_user_rematch 0.79% : 0.000006s : 35: predicate.less_batch_normalization 1.56% : 0.000012s : 91: predicate.list_to_tuple_eliminator_ 2.53% : 0.000019s : 156: predicate.load_eliminater 0.45% : 0.000003s : 10: predicate.loop_unroll_after_grad 1.87% : 0.000014s : 104: predicate.loop_unroll_before_grad 1.46% : 0.000011s : 84: predicate.make_slice_get_slice_eliminator 0.62% : 0.000005s : 35: predicate.merge_addn 1.25% : 0.000009s : 71: predicate.micro_step_allgather_replace 1.24% : 0.000009s : 71: predicate.mini_step_allgather_replace 1.03% : 0.000008s : 64: predicate.minmaximum_grad 0.49% : 0.000004s : 10: predicate.mutable_eliminate 0.19% : 0.000001s : 9: predicate.opt_reshape 0.20% : 0.000002s : 10: predicate.parallel_virtual_node 1.77% : 0.000013s : 84: predicate.partial_defer_inline 1.55% : 0.000011s : 82: predicate.partial_eliminate 1.03% : 0.000008s : 64: predicate.print_const_string_wrapper 0.61% : 0.000004s : 35: predicate.reduce_all_const_elim 1.34% : 0.000010s : 64: predicate.reduce_eliminate 2.49% : 0.000018s : 156: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000003s : 35: predicate.remove_not_recompute_node 1.87% : 0.000014s : 143: predicate.replace_applicator 0.63% : 0.000005s : 58: predicate.replace_old_param 0.14% : 0.000001s : 10: predicate.reset_defer_inline 1.09% : 0.000008s : 64: predicate.reshape_eliminate 1.27% : 0.000009s : 71: predicate.row_tensor_add_zeros_like 0.20% : 0.000001s : 10: predicate.row_tensor_eliminate 1.48% : 0.000011s : 71: predicate.same_eliminate 0.41% : 0.000003s : 35: predicate.set_cell_output_no_recompute 0.74% : 0.000005s : 35: predicate.shard_identity_eliminate 0.36% : 0.000003s : 19: predicate.special_op_eliminate 0.69% : 0.000005s : 35: predicate.specialize_transform 1.45% : 0.000011s : 71: predicate.split_environ_get_set_with_tuple_value 1.22% : 0.000009s : 58: predicate.stack_unstack_eliminate 0.19% : 0.000001s : 10: predicate.switch_call_monad_eliminater 1.55% : 0.000011s : 84: predicate.switch_defer_inline 2.77% : 0.000021s : 155: predicate.switch_layer_defer_inline 4.37% : 0.000032s : 232: predicate.switch_simplify 1.01% : 0.000007s : 64: predicate.tile_eliminate 1.07% : 0.000008s : 64: predicate.transpose_eliminate 1.55% : 0.000011s : 83: predicate.tuple_list_convert_item_index_to_positive 1.56% : 0.000012s : 83: predicate.tuple_list_get_item_const_eliminator 1.47% : 0.000011s : 83: predicate.tuple_list_get_item_depend_reorder 2.77% : 0.000020s : 126: predicate.tuple_list_get_item_eliminator 1.53% : 0.000011s : 83: predicate.tuple_list_get_set_item_eliminator 2.27% : 0.000017s : 118: predicate.tuple_list_set_item_eliminator 1.54% : 0.000011s : 91: predicate.tuple_to_list_eliminator_ 2.47% : 0.000018s : 156: predicate.updatestate_pure_node_eliminater 3.13% : 0.000023s : 191: predicate.updatestate_useless_node_eliminater 0.25% : 0.000002s : 10: predicate.value_based_eliminate 0.66% : 0.000005s : 35: predicate.virtual_dataset_eliminate 0.66% : 0.000005s : 35: predicate.virtual_output_eliminate 0.16% : 0.000001s : 9: predicate.virtual_view_grad_eliminate 0.28% : 0.000002s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001391 25 52.86% : 0.000735s : 9: func_graph_cloner_run.FuncGraphClonerGraph 47.14% : 0.000656s : 16: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.113969 237 0.00% : 0.000004s : 1: ForceFp32Comm 2.95% : 0.003367s : 1: add_attr 2.95% : 0.003358s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.06% : 0.000069s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.09% : 0.000103s : 1: auto_monad 0.03% : 0.000030s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000004s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.49% : 0.000558s : 1: bootstrap 0.02% : 0.000027s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000022s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000039s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000013s : 1: environ_conv 0.02% : 0.000025s : 1: event_method 0.01% : 0.000016s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 0.00% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.41% : 0.000462s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000006s : 1: micro_interleaved_order_control 0.54% : 0.000612s : 1: mutable_eliminate 0.01% : 0.000008s : 1: offloading_packed_experts 0.02% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000022s : 1: opt.transform.mutable_eliminate 4.12% : 0.004700s : 117: opt.transform.opt_a 0.06% : 0.000063s : 1: opt.transform.opt_after_cconv 0.03% : 0.000039s : 1: opt.transform.opt_after_jit_grad 0.20% : 0.000231s : 28: opt.transform.opt_b 0.08% : 0.000093s : 2: opt.transform.opt_trans_graph 0.06% : 0.000065s : 4: opt.transform.symbol_engine_opt 10.24% : 0.011674s : 1: opt_a 0.15% : 0.000165s : 1: opt_after_cconv 0.45% : 0.000517s : 1: opt_after_jit_grad 0.32% : 0.000364s : 1: opt_b 12.42% : 0.014156s : 1: optimize 0.02% : 0.000026s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.03% : 0.000032s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000009s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.04% : 0.000040s : 1: pre_auto_parallel 0.01% : 0.000008s : 1: py_interpret_to_execute 0.01% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000049s : 1: remove_dup_value 1.72% : 0.001960s : 2: renormalize.infer 1.41% : 0.001602s : 2: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000034s : 1: rewriter_after_opt_a 0.08% : 0.000085s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000012s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000116s : 1: symbol_engine_optimizer 44.59% : 0.050814s : 1: task_emit 0.11% : 0.000123s : 1: tuple_transform 15.72% : 0.017916s : 1: type_inference 0.09% : 0.000098s : 1: validate TotalTime = 5.17883, [24] [bootstrap]: 0.00090385 [type_inference]: 0.0550804 [event_method]: 5.985e-05 [auto_monad]: 0.00016204 [graph_reusing]: 6.28e-06 [inline]: 2.71e-06 [add_attr]: 0.00863623, [1] [add_attr_with_inline]: 0.00862113, [1] [Cycle 1]: 0.00017321, [2] [tag_attr]: 5.029e-05 [meta_addattr_fg_expand]: 1.868e-05 [parallel-infer-symbol]: 3.64002e-06 [pre_auto_parallel]: 6.65e-05 [insert-virtual-dataset]: 2.51e-06 [parallel-infer-symbol-second]: 7.49977e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.62001e-06 [optimize]: 0.00594303, [53] [py_interpret_to_execute]: 6.69001e-06 [rewriter_before_opt_a]: 0.00025841 [opt_a]: 0.00350965, [2] [Cycle 1]: 0.00289781, [45] [expand_dump_flag]: 3.52997e-06 [switch_simplify]: 8.863e-05 [loop_unroll]: 3.77e-05 [a_1]: 0.00079262 [with_stream_mark]: 1.669e-05 [recompute_prepare]: 8.10999e-06 [updatestate_depend_eliminate]: 1.651e-05 [updatestate_assign_eliminate]: 1.416e-05 [updatestate_loads_eliminate]: 3.41999e-06 [parameter_eliminate]: 1.96003e-06 [a_2]: 8.217e-05 [accelerated_algorithm]: 7.06999e-06 [shard]: 1.12999e-06 [meta_shard_fg_expand]: 5.80002e-06 [shard_inline]: 6.09001e-06 [merge_send_recv]: 5.436e-05 [auto_parallel]: 8.90999e-06 [parallel]: 0.00010333 [flash_sp]: 4.304e-05 [merge_comm]: 7.26999e-06 [allreduce_fusion]: 1.369e-05 [matmul_add_comm_reduction]: 2.061e-05 [allreduce_slice_to_reducescatter]: 1.179e-05 [virtual_shard_identity]: 9.24e-06 [virtual_dataset]: 7.5e-06 [get_grad_eliminate_]: 6.17001e-06 [virtual_output]: 6.21e-06 [merge_forward]: 3.83999e-06 [cell_reuse_recompute_pass]: 1.70001e-06 [offload_activation]: 2.3e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.492e-05 [merge_recompute_call_nodes]: 3.68e-06 [before_grad]: 9.32001e-06 [set_forward_comm_id_for_comm_node_pass]: 1.336e-05 [meta_fg_expand]: 3.33e-06 [flash_sp_send_recv_attached]: 4.13999e-06 [receive_attached]: 2.4e-05 [after_resolve]: 1.066e-05 [a_after_grad]: 1.253e-05 [renormalize]: 0.00098376 [add_forward_monad_depend]: 6.84999e-06 [auto_monad_grad]: 2.26998e-06 [auto_monad_eliminator]: 3.394e-05 [cse]: 6.957e-05 [a_3]: 4.229e-05 [Cycle 2]: 0.00060238, [45] [expand_dump_flag]: 1.19998e-06 [switch_simplify]: 6.86001e-06 [loop_unroll]: 6.59001e-06 [a_1]: 0.0001263 [with_stream_mark]: 9.57001e-06 [recompute_prepare]: 6.16e-06 [updatestate_depend_eliminate]: 2.63998e-06 [updatestate_assign_eliminate]: 2.37001e-06 [updatestate_loads_eliminate]: 2.44001e-06 [parameter_eliminate]: 1.02998e-06 [a_2]: 6.766e-05 [accelerated_algorithm]: 5.92001e-06 [shard]: 1.15999e-06 [meta_shard_fg_expand]: 1.49998e-06 [shard_inline]: 5.83997e-06 [merge_send_recv]: 5.22999e-06 [auto_parallel]: 6.19001e-06 [parallel]: 4.36002e-06 [flash_sp]: 6.43e-06 [merge_comm]: 3.13e-06 [allreduce_fusion]: 2.86999e-06 [matmul_add_comm_reduction]: 5.05001e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 7.11999e-06 [virtual_dataset]: 5.81003e-06 [get_grad_eliminate_]: 5.59e-06 [virtual_output]: 5.47999e-06 [merge_forward]: 2.83e-06 [cell_reuse_recompute_pass]: 1.60001e-06 [offload_activation]: 5.92001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.188e-05 [merge_recompute_call_nodes]: 6.89994e-07 [before_grad]: 8.26002e-06 [set_forward_comm_id_for_comm_node_pass]: 2.90998e-06 [meta_fg_expand]: 2.19001e-06 [flash_sp_send_recv_attached]: 8.70001e-07 [receive_attached]: 8.2e-07 [after_resolve]: 9.51e-06 [a_after_grad]: 8.96998e-06 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.30999e-06 [auto_monad_grad]: 6.19999e-07 [auto_monad_eliminator]: 6.18002e-06 [cse]: 1.24e-05 [a_3]: 3.368e-05 [py_interpret_to_execute_after_opt_a]: 4.53999e-06 [slice_cell_reuse_recomputed_activation]: 6.21e-06 [rewriter_after_opt_a]: 2.909e-05 [convert_after_rewriter]: 1.59e-06 [order_py_execute_after_rewriter]: 1.20001e-06 [mutable_eliminate]: 0.00056378 [opt_b]: 0.00018947, [1] [Cycle 1]: 0.00018293, [7] [b_1]: 0.0001157 [b_2]: 7.68999e-06 [updatestate_depend_eliminate]: 4.37e-06 [updatestate_assign_eliminate]: 2.41e-06 [updatestate_loads_eliminate]: 2.27001e-06 [renormalize]: 3.69997e-07 [cse]: 1.816e-05 [optimize_parallel_all_gather_comm]: 3.166e-05 [overlap_param_gather]: 1.078e-05 [cconv]: 2.305e-05 [loop_unroll]: 0.00044803 [opt_after_cconv]: 9.595e-05, [1] [Cycle 1]: 9.016e-05, [7] [c_1]: 2.831e-05 [parameter_eliminate]: 2.09e-06 [updatestate_depend_eliminate]: 5.37001e-06 [updatestate_assign_eliminate]: 2.54001e-06 [updatestate_loads_eliminate]: 2.16e-06 [cse]: 1.778e-05 [renormalize]: 4.19997e-07 [remove_dup_value]: 1.309e-05 [tuple_transform]: 7.018e-05, [1] [Cycle 1]: 6.526e-05, [4] [d_1]: 3.947e-05 [none_parameter_eliminate]: 1.72001e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 6.88e-06 [partial_unused_args_eliminate]: 1.63002e-06 [add_recomputation]: 7.646e-05 [cse_after_recomputation]: 2.063e-05, [1] [Cycle 1]: 1.652e-05, [1] [cse]: 1.11e-05 [environ_conv]: 1.179e-05 [swap_dp_allreduce_reducescatter]: 2.316e-05 [bias_add_comm_swap]: 1.517e-05 [label_micro_interleaved_index]: 1.412e-05 [label_fine_grained_interleaved_index]: 1.83002e-06 [merge_cast_opt]: 4.18001e-06 [slice_recompute_activation]: 7.00005e-07 [micro_interleaved_order_control]: 2.47001e-06 [assign_add_opt]: 4.70001e-06 [ForceFp32Comm]: 1.00999e-06 [remove_cast_before_assign_add]: 1.127e-05 [full_micro_interleaved_order_control]: 1.303e-05 [reorder_send_recv_between_fp_bp]: 3.09001e-06 [comm_op_add_attrs]: 1.06002e-06 [add_comm_op_reuse_tag]: 9.80013e-07 [interleave_split_concat_branches]: 1.09998e-06 [interleave_parallel_branches]: 1.038e-05 [overlap_opt_shard_in_pipeline]: 1.456e-05 [overlap_opt_shard_grad_in_pipeline]: 4.80999e-06 [control_data_broadcast_order]: 1.088e-05 [grouped_pairwise_exchange_alltoall]: 1.55001e-06 [offloading_packed_experts]: 4.03999e-06 [overlap_recompute_and_grad_model_parallel]: 1.766e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.09998e-06 [overlap_recompute_allgather_and_fa_grad]: 1.39e-06 [overlap_recompute_comm]: 2.24999e-06 [overlap_grad_ring_attention]: 2.627e-05 [overlap_grad_flash_sp]: 5.188e-05 [begin_end_overlap_inline]: 5.59987e-07 [split_matmul_comm_elemetwise]: 1.578e-05 [split_layernorm_comm]: 1.00001e-06 [handle_group_info]: 7.30011e-07 [symbol_engine_optimizer]: 7.22e-05, [1] [Cycle 1]: 6.778e-05, [6] [build]: 2.75002e-06 [elim_shapecalc]: 9.64e-06 [elim_not_effective]: 1.17e-05 [opt_reshape]: 6.79999e-06 [fold_const_symbol]: 9.53002e-06 [renormalize]: 1.39989e-07 [detach_backward]: 1.76e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 2.294e-05 [get_jit_bprop_graph]: 1.66e-06 [rewriter_after_jit_bprop_graph]: 2.33998e-06 [opt_after_jit_grad]: 0.00044761 [validate]: 6.094e-05 [backend_pass]: 6.09987e-07 [task_emit]: 5.1071 [execute]: 1.227e-05 Sums bootstrap : 0.000904s : 0.02% type_inference : 0.055080s : 1.07% event_method : 0.000060s : 0.00% auto_monad : 0.000162s : 0.00% graph_reusing : 0.000006s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000050s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000019s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000067s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.00% optimize.rewriter_before_opt_a : 0.000258s : 0.00% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000095s : 0.00% optimize.opt_a.loop_unroll : 0.000044s : 0.00% optimize.opt_a.a_1 : 0.000919s : 0.02% optimize.opt_a.with_stream_mark : 0.000026s : 0.00% optimize.opt_a.recompute_prepare : 0.000014s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000017s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000150s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.00% optimize.opt_a.shard : 0.000002s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000007s : 0.00% optimize.opt_a.shard_inline : 0.000012s : 0.00% optimize.opt_a.merge_send_recv : 0.000060s : 0.00% optimize.opt_a.auto_parallel : 0.000015s : 0.00% optimize.opt_a.parallel : 0.000108s : 0.00% optimize.opt_a.flash_sp : 0.000049s : 0.00% optimize.opt_a.merge_comm : 0.000010s : 0.00% optimize.opt_a.allreduce_fusion : 0.000017s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000012s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.00% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000029s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000004s : 0.00% optimize.opt_a.before_grad : 0.000018s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000016s : 0.00% optimize.opt_a.meta_fg_expand : 0.000006s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000005s : 0.00% optimize.opt_a.receive_attached : 0.000025s : 0.00% optimize.opt_a.after_resolve : 0.000020s : 0.00% optimize.opt_a.a_after_grad : 0.000021s : 0.00% optimize.opt_a.renormalize : 0.000984s : 0.02% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000040s : 0.00% optimize.opt_a.cse : 0.000082s : 0.00% optimize.opt_a.a_3 : 0.000076s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000006s : 0.00% optimize.rewriter_after_opt_a : 0.000029s : 0.00% optimize.convert_after_rewriter : 0.000002s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000564s : 0.01% optimize.opt_b.b_1 : 0.000116s : 0.00% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000032s : 0.00% optimize.overlap_param_gather : 0.000011s : 0.00% optimize.cconv : 0.000023s : 0.00% optimize.loop_unroll : 0.000448s : 0.01% optimize.opt_after_cconv.c_1 : 0.000028s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.00% optimize.tuple_transform.d_1 : 0.000039s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000076s : 0.00% optimize.cse_after_recomputation.cse : 0.000011s : 0.00% optimize.environ_conv : 0.000012s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000023s : 0.00% optimize.bias_add_comm_swap : 0.000015s : 0.00% optimize.label_micro_interleaved_index : 0.000014s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000002s : 0.00% optimize.merge_cast_opt : 0.000004s : 0.00% optimize.slice_recompute_activation : 0.000001s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000005s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000011s : 0.00% optimize.full_micro_interleaved_order_control : 0.000013s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000010s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000015s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000005s : 0.00% optimize.control_data_broadcast_order : 0.000011s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000018s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000026s : 0.00% optimize.overlap_grad_flash_sp : 0.000052s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000016s : 0.00% optimize.split_layernorm_comm : 0.000001s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000023s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000002s : 0.00% opt_after_jit_grad : 0.000448s : 0.01% validate : 0.000061s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 5.107103s : 98.80% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 0.000262 30 0.72% : 0.000002s : 2: substitution.elim_not_effective 0.50% : 0.000001s : 2: substitution.fold_const_symbol 1.97% : 0.000005s : 4: substitution.graph_param_transform 77.91% : 0.000204s : 6: substitution.inline 0.94% : 0.000002s : 4: substitution.j_node_and_user_rematch 6.00% : 0.000016s : 4: substitution.remove_not_recompute_node 1.46% : 0.000004s : 4: substitution.replace_old_param 10.49% : 0.000027s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.054982 2 97.75% : 0.053745s : 1: type_inference.infer 2.25% : 0.001238s : 1: type_inference.specialize ------[replace.] 0.000080 10 67.37% : 0.000054s : 6: replace.inline 32.63% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000226 10 88.64% : 0.000201s : 6: match.inline 11.36% : 0.000026s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000215 1408 0.93% : 0.000002s : 15: predicate.accumulaten_eliminater 0.68% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.70% : 0.000002s : 8: predicate.addn_check_dump 1.19% : 0.000003s : 15: predicate.addn_zero_filter 0.90% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.10% : 0.000005s : 23: predicate.arithmetic_simplify 0.99% : 0.000002s : 15: predicate.cast_eliminate 0.50% : 0.000001s : 8: predicate.check_bprop_eliminate 0.53% : 0.000001s : 8: predicate.compare_switch_simplify 0.17% : 0.000000s : 4: predicate.const_output_eliminate 0.56% : 0.000001s : 8: predicate.depend_value_elim 1.02% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.22% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.97% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.71% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.30% : 0.000001s : 4: predicate.elim_not_effective 0.33% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.22% : 0.000003s : 19: predicate.environ_add_const_eliminate 1.08% : 0.000002s : 19: predicate.environ_get_add_eliminate 1.15% : 0.000002s : 19: predicate.environ_get_depend_swap 1.70% : 0.000004s : 27: predicate.environ_get_eliminate 1.17% : 0.000003s : 19: predicate.environ_get_set_eliminate 1.63% : 0.000004s : 25: predicate.exchange_switch_depend_value 2.41% : 0.000005s : 25: predicate.float_depend_g_call 0.50% : 0.000001s : 8: predicate.float_environ_get_switch 0.86% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.16% : 0.000000s : 4: predicate.fold_const_symbol 0.72% : 0.000002s : 8: predicate.get_grad_eliminate 0.17% : 0.000000s : 4: predicate.graph_param_transform 0.49% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 5.70% : 0.000012s : 64: predicate.inline 0.65% : 0.000001s : 8: predicate.inline_without_move 0.31% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.95% : 0.000002s : 8: predicate.less_batch_normalization 1.81% : 0.000004s : 27: predicate.list_to_tuple_eliminator_ 2.51% : 0.000005s : 42: predicate.load_eliminater 0.84% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.91% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.65% : 0.000004s : 23: predicate.make_slice_get_slice_eliminator 0.54% : 0.000001s : 8: predicate.merge_addn 0.52% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.48% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.86% : 0.000002s : 15: predicate.minmaximum_grad 0.91% : 0.000002s : 4: predicate.mutable_eliminate 0.32% : 0.000001s : 4: predicate.opt_reshape 0.36% : 0.000001s : 4: predicate.parallel_virtual_node 2.04% : 0.000004s : 25: predicate.partial_defer_inline 1.52% : 0.000003s : 23: predicate.partial_eliminate 0.97% : 0.000002s : 15: predicate.print_const_string_wrapper 0.59% : 0.000001s : 8: predicate.reduce_all_const_elim 1.44% : 0.000003s : 15: predicate.reduce_eliminate 2.67% : 0.000006s : 42: predicate.redundant_stop_gradient_eliminater 0.55% : 0.000001s : 8: predicate.remove_not_recompute_node 1.30% : 0.000003s : 27: predicate.replace_applicator 0.39% : 0.000001s : 8: predicate.replace_old_param 0.25% : 0.000001s : 4: predicate.reset_defer_inline 1.04% : 0.000002s : 15: predicate.reshape_eliminate 0.53% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.34% : 0.000001s : 4: predicate.row_tensor_eliminate 0.64% : 0.000001s : 8: predicate.same_eliminate 0.40% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.73% : 0.000002s : 8: predicate.shard_identity_eliminate 0.86% : 0.000002s : 8: predicate.special_op_eliminate 0.62% : 0.000001s : 8: predicate.specialize_transform 0.65% : 0.000001s : 8: predicate.split_environ_get_set_with_tuple_value 0.82% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.69% : 0.000004s : 25: predicate.switch_defer_inline 2.20% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.67% : 0.000012s : 83: predicate.switch_simplify 1.05% : 0.000002s : 15: predicate.tile_eliminate 1.02% : 0.000002s : 15: predicate.transpose_eliminate 1.52% : 0.000003s : 23: predicate.tuple_list_convert_item_index_to_positive 1.49% : 0.000003s : 23: predicate.tuple_list_get_item_const_eliminator 1.44% : 0.000003s : 23: predicate.tuple_list_get_item_depend_reorder 3.11% : 0.000007s : 35: predicate.tuple_list_get_item_eliminator 1.50% : 0.000003s : 23: predicate.tuple_list_get_set_item_eliminator 2.23% : 0.000005s : 31: predicate.tuple_list_set_item_eliminator 1.86% : 0.000004s : 27: predicate.tuple_to_list_eliminator_ 2.53% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 2.99% : 0.000006s : 50: predicate.updatestate_useless_node_eliminater 0.26% : 0.000001s : 4: predicate.value_based_eliminate 0.85% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.60% : 0.000001s : 8: predicate.virtual_output_eliminate 0.22% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.33% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001030 16 56.86% : 0.000586s : 8: func_graph_cloner_run.FuncGraphClonerGraph 43.14% : 0.000444s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 5.195901 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.17% : 0.008642s : 1: add_attr 0.17% : 0.008626s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000081s : 1: add_recomputation 0.00% : 0.000007s : 1: assign_add_opt 0.00% : 0.000169s : 1: auto_monad 0.00% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000005s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000018s : 1: bias_add_comm_swap 0.02% : 0.000955s : 1: bootstrap 0.00% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000014s : 1: control_data_broadcast_order 0.00% : 0.000005s : 1: convert_after_rewriter 0.00% : 0.000024s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000016s : 1: environ_conv 0.00% : 0.000068s : 1: event_method 0.00% : 0.000030s : 1: execute 0.00% : 0.000016s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000003s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000013s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000005s : 1: label_fine_grained_interleaved_index 0.00% : 0.000017s : 1: label_micro_interleaved_index 0.01% : 0.000456s : 1: loop_unroll 0.00% : 0.000007s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.01% : 0.000571s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.03% : 0.001389s : 78: opt.transform.opt_a 0.00% : 0.000027s : 1: opt.transform.opt_after_cconv 0.00% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000097s : 28: opt.transform.opt_b 0.00% : 0.000044s : 2: opt.transform.opt_trans_graph 0.00% : 0.000034s : 4: opt.transform.symbol_engine_opt 0.07% : 0.003513s : 1: opt_a 0.00% : 0.000099s : 1: opt_after_cconv 0.01% : 0.000456s : 1: opt_after_jit_grad 0.00% : 0.000193s : 1: opt_b 0.11% : 0.005947s : 1: optimize 0.00% : 0.000035s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000055s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000030s : 1: overlap_grad_ring_attention 0.00% : 0.000008s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000018s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000014s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000021s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.00% : 0.000004s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000071s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000014s : 1: remove_cast_before_assign_add 0.00% : 0.000016s : 1: remove_dup_value 0.01% : 0.000488s : 1: renormalize.infer 0.01% : 0.000488s : 1: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000005s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000044s : 1: rewriter_after_opt_a 0.01% : 0.000264s : 1: rewriter_before_opt_a 0.00% : 0.000009s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000004s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000019s : 1: split_matmul_comm_elemetwise 0.00% : 0.000026s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000075s : 1: symbol_engine_optimizer 98.29% : 5.107144s : 1: task_emit 0.00% : 0.000073s : 1: tuple_transform 1.06% : 0.055106s : 1: type_inference 0.00% : 0.000084s : 1: validate TotalTime = 0.0229815, [24] [bootstrap]: 0.00045782 [type_inference]: 0.00796857 [event_method]: 1.238e-05 [auto_monad]: 5.903e-05 [graph_reusing]: 5.45001e-06 [inline]: 1.94999e-06 [add_attr]: 0.00316765, [1] [add_attr_with_inline]: 0.00315934, [1] [Cycle 1]: 4.735e-05, [2] [tag_attr]: 1.501e-05 [meta_addattr_fg_expand]: 3.78001e-06 [parallel-infer-symbol]: 3.58999e-06 [pre_auto_parallel]: 2.524e-05 [insert-virtual-dataset]: 2.85002e-06 [parallel-infer-symbol-second]: 6.19999e-07 [dataset_repeat_opt]: 2.09e-06 [pipeline_split]: 1.75001e-06 [optimize]: 0.00401061, [53] [py_interpret_to_execute]: 4.38999e-06 [rewriter_before_opt_a]: 4.524e-05 [opt_a]: 0.00215536, [2] [Cycle 1]: 0.00152332, [45] [expand_dump_flag]: 2.83e-06 [switch_simplify]: 2.957e-05 [loop_unroll]: 1.809e-05 [a_1]: 0.00036729 [with_stream_mark]: 1.45e-05 [recompute_prepare]: 7.56001e-06 [updatestate_depend_eliminate]: 3.65003e-06 [updatestate_assign_eliminate]: 4.12e-06 [updatestate_loads_eliminate]: 3.38999e-06 [parameter_eliminate]: 1.71e-06 [a_2]: 7.442e-05 [accelerated_algorithm]: 6.12001e-06 [shard]: 2.51998e-06 [meta_shard_fg_expand]: 1.75001e-06 [shard_inline]: 5.90002e-06 [merge_send_recv]: 8.35001e-06 [auto_parallel]: 5.81e-06 [parallel]: 1.806e-05 [flash_sp]: 7.74002e-06 [merge_comm]: 3.7e-06 [allreduce_fusion]: 3.03998e-06 [matmul_add_comm_reduction]: 9.11002e-06 [allreduce_slice_to_reducescatter]: 7.39994e-07 [virtual_shard_identity]: 7.93001e-06 [virtual_dataset]: 6.24999e-06 [get_grad_eliminate_]: 6.12001e-06 [virtual_output]: 6.13002e-06 [merge_forward]: 3.78999e-06 [cell_reuse_recompute_pass]: 1.20001e-06 [offload_activation]: 9.62999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.207e-05 [merge_recompute_call_nodes]: 1.44998e-06 [before_grad]: 1.075e-05 [set_forward_comm_id_for_comm_node_pass]: 3.41001e-06 [meta_fg_expand]: 2.91e-06 [flash_sp_send_recv_attached]: 2.27999e-06 [receive_attached]: 2.01998e-06 [after_resolve]: 1.079e-05 [a_after_grad]: 9.56998e-06 [renormalize]: 0.00051098 [add_forward_monad_depend]: 5.12e-06 [auto_monad_grad]: 1.99999e-06 [auto_monad_eliminator]: 1.516e-05 [cse]: 3.05e-05 [a_3]: 4.206e-05 [Cycle 2]: 0.00062289, [45] [expand_dump_flag]: 1.25999e-06 [switch_simplify]: 7.08998e-06 [loop_unroll]: 5.60001e-06 [a_1]: 0.00012444 [with_stream_mark]: 1.123e-05 [recompute_prepare]: 5.86e-06 [updatestate_depend_eliminate]: 2.73998e-06 [updatestate_assign_eliminate]: 2.14e-06 [updatestate_loads_eliminate]: 2.41e-06 [parameter_eliminate]: 8.80013e-07 [a_2]: 6.613e-05 [accelerated_algorithm]: 5.51e-06 [shard]: 1.11002e-06 [meta_shard_fg_expand]: 1.22999e-06 [shard_inline]: 5.76e-06 [merge_send_recv]: 2.07e-05 [auto_parallel]: 5.77999e-06 [parallel]: 4.24997e-06 [flash_sp]: 3.61999e-06 [merge_comm]: 2.88998e-06 [allreduce_fusion]: 2.69001e-06 [matmul_add_comm_reduction]: 5.77999e-06 [allreduce_slice_to_reducescatter]: 3.50003e-07 [virtual_shard_identity]: 7.14001e-06 [virtual_dataset]: 5.51e-06 [get_grad_eliminate_]: 5.42001e-06 [virtual_output]: 5.62999e-06 [merge_forward]: 2.68e-06 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 6.52001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.24e-05 [merge_recompute_call_nodes]: 7.10017e-07 [before_grad]: 8.84e-06 [set_forward_comm_id_for_comm_node_pass]: 3.34001e-06 [meta_fg_expand]: 1.76998e-06 [flash_sp_send_recv_attached]: 9.80013e-07 [receive_attached]: 1.07e-06 [after_resolve]: 9.74e-06 [a_after_grad]: 8.72e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.14003e-06 [auto_monad_grad]: 7.2e-07 [auto_monad_eliminator]: 6.56e-06 [cse]: 1.391e-05 [a_3]: 3.322e-05 [py_interpret_to_execute_after_opt_a]: 4.57998e-06 [slice_cell_reuse_recomputed_activation]: 2.09e-06 [rewriter_after_opt_a]: 1.558e-05 [convert_after_rewriter]: 1.38002e-06 [order_py_execute_after_rewriter]: 1.28002e-06 [mutable_eliminate]: 0.00049163 [opt_b]: 0.0001886, [1] [Cycle 1]: 0.00018281, [7] [b_1]: 0.00011442 [b_2]: 8.16002e-06 [updatestate_depend_eliminate]: 4.99e-06 [updatestate_assign_eliminate]: 2.14e-06 [updatestate_loads_eliminate]: 2.48e-06 [renormalize]: 5.8001e-07 [cse]: 1.722e-05 [optimize_parallel_all_gather_comm]: 1.57e-05 [overlap_param_gather]: 1.97999e-06 [cconv]: 2.341e-05 [loop_unroll]: 0.00041983 [opt_after_cconv]: 9.695e-05, [1] [Cycle 1]: 9.136e-05, [7] [c_1]: 2.976e-05 [parameter_eliminate]: 2.41998e-06 [updatestate_depend_eliminate]: 4.87e-06 [updatestate_assign_eliminate]: 2.34999e-06 [updatestate_loads_eliminate]: 2.25002e-06 [cse]: 1.719e-05 [renormalize]: 3.80009e-07 [remove_dup_value]: 1.339e-05 [tuple_transform]: 7.209e-05, [1] [Cycle 1]: 6.812e-05, [4] [d_1]: 4.244e-05 [none_parameter_eliminate]: 1.44998e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 6.44999e-06 [partial_unused_args_eliminate]: 1.76003e-06 [add_recomputation]: 4.566e-05 [cse_after_recomputation]: 2.146e-05, [1] [Cycle 1]: 1.711e-05, [1] [cse]: 1.174e-05 [environ_conv]: 4.82998e-06 [swap_dp_allreduce_reducescatter]: 4.90999e-06 [bias_add_comm_swap]: 2.74001e-06 [label_micro_interleaved_index]: 3.93001e-06 [label_fine_grained_interleaved_index]: 2.88e-06 [merge_cast_opt]: 1.37999e-06 [slice_recompute_activation]: 2.11e-06 [micro_interleaved_order_control]: 2.49001e-06 [assign_add_opt]: 1.25001e-06 [ForceFp32Comm]: 7.89994e-07 [remove_cast_before_assign_add]: 1.10999e-06 [full_micro_interleaved_order_control]: 2.27999e-06 [reorder_send_recv_between_fp_bp]: 2.62001e-06 [comm_op_add_attrs]: 1.09e-06 [add_comm_op_reuse_tag]: 1.09998e-06 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.32e-06 [overlap_opt_shard_in_pipeline]: 1.15999e-06 [overlap_opt_shard_grad_in_pipeline]: 2.15002e-06 [control_data_broadcast_order]: 1.227e-05 [grouped_pairwise_exchange_alltoall]: 1.86e-06 [offloading_packed_experts]: 3.79002e-06 [overlap_recompute_and_grad_model_parallel]: 4.53001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.10001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34003e-06 [overlap_recompute_comm]: 2.22999e-06 [overlap_grad_ring_attention]: 4.45e-06 [overlap_grad_flash_sp]: 1.84e-05 [begin_end_overlap_inline]: 8.49977e-07 [split_matmul_comm_elemetwise]: 2.56e-06 [split_layernorm_comm]: 2.18002e-06 [handle_group_info]: 1.20999e-06 [symbol_engine_optimizer]: 7.575e-05, [1] [Cycle 1]: 7.144e-05, [6] [build]: 3.18e-06 [elim_shapecalc]: 9.97999e-06 [elim_not_effective]: 1.277e-05 [opt_reshape]: 7.7e-06 [fold_const_symbol]: 9.44e-06 [renormalize]: 1.90019e-07 [detach_backward]: 2.22001e-06 [pipeline_parallel_scheduler]: 1.56998e-06 [auto_monad_reorder]: 1.661e-05 [get_jit_bprop_graph]: 1.52999e-06 [rewriter_after_jit_bprop_graph]: 3.45e-06 [opt_after_jit_grad]: 0.000456 [validate]: 3.464e-05 [backend_pass]: 1.16002e-06 [task_emit]: 0.00653115 [execute]: 8.78001e-06 Sums bootstrap : 0.000458s : 2.43% type_inference : 0.007969s : 42.28% event_method : 0.000012s : 0.07% auto_monad : 0.000059s : 0.31% graph_reusing : 0.000005s : 0.03% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000015s : 0.08% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000004s : 0.02% parallel-infer-symbol : 0.000004s : 0.02% pre_auto_parallel : 0.000025s : 0.13% insert-virtual-dataset : 0.000003s : 0.02% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000004s : 0.02% optimize.rewriter_before_opt_a : 0.000045s : 0.24% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000037s : 0.19% optimize.opt_a.loop_unroll : 0.000024s : 0.13% optimize.opt_a.a_1 : 0.000492s : 2.61% optimize.opt_a.with_stream_mark : 0.000026s : 0.14% optimize.opt_a.recompute_prepare : 0.000013s : 0.07% optimize.opt_a.updatestate_depend_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.03% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.03% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000141s : 0.75% optimize.opt_a.accelerated_algorithm : 0.000012s : 0.06% optimize.opt_a.shard : 0.000004s : 0.02% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.02% optimize.opt_a.shard_inline : 0.000012s : 0.06% optimize.opt_a.merge_send_recv : 0.000029s : 0.15% optimize.opt_a.auto_parallel : 0.000012s : 0.06% optimize.opt_a.parallel : 0.000022s : 0.12% optimize.opt_a.flash_sp : 0.000011s : 0.06% optimize.opt_a.merge_comm : 0.000007s : 0.03% optimize.opt_a.allreduce_fusion : 0.000006s : 0.03% optimize.opt_a.matmul_add_comm_reduction : 0.000015s : 0.08% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.08% optimize.opt_a.virtual_dataset : 0.000012s : 0.06% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.06% optimize.opt_a.virtual_output : 0.000012s : 0.06% optimize.opt_a.merge_forward : 0.000006s : 0.03% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.09% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000024s : 0.13% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000020s : 0.10% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.04% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.02% optimize.opt_a.receive_attached : 0.000003s : 0.02% optimize.opt_a.after_resolve : 0.000021s : 0.11% optimize.opt_a.a_after_grad : 0.000018s : 0.10% optimize.opt_a.renormalize : 0.000511s : 2.71% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.03% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000022s : 0.12% optimize.opt_a.cse : 0.000044s : 0.24% optimize.opt_a.a_3 : 0.000075s : 0.40% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000016s : 0.08% optimize.convert_after_rewriter : 0.000001s : 0.01% optimize.order_py_execute_after_rewriter : 0.000001s : 0.01% optimize.mutable_eliminate : 0.000492s : 2.61% optimize.opt_b.b_1 : 0.000114s : 0.61% optimize.opt_b.b_2 : 0.000008s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_b.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000017s : 0.09% optimize.optimize_parallel_all_gather_comm : 0.000016s : 0.08% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000023s : 0.12% optimize.loop_unroll : 0.000420s : 2.23% optimize.opt_after_cconv.c_1 : 0.000030s : 0.16% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.03% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000017s : 0.09% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.07% optimize.tuple_transform.d_1 : 0.000042s : 0.23% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000046s : 0.24% optimize.cse_after_recomputation.cse : 0.000012s : 0.06% optimize.environ_conv : 0.000005s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.02% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.02% optimize.merge_cast_opt : 0.000001s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.01% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.01% optimize.add_comm_op_reuse_tag : 0.000001s : 0.01% optimize.interleave_split_concat_branches : 0.000001s : 0.01% optimize.interleave_parallel_branches : 0.000001s : 0.01% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000012s : 0.07% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.01% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.01% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.02% optimize.overlap_grad_flash_sp : 0.000018s : 0.10% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.01% optimize.symbol_engine_optimizer.build : 0.000003s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000013s : 0.07% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.04% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.05% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000017s : 0.09% get_jit_bprop_graph : 0.000002s : 0.01% rewriter_after_jit_bprop_graph : 0.000003s : 0.02% opt_after_jit_grad : 0.000456s : 2.42% validate : 0.000035s : 0.18% backend_pass : 0.000001s : 0.01% task_emit : 0.006531s : 34.65% execute : 0.000009s : 0.05% Time group info: ------[substitution.] 0.000131 23 1.45% : 0.000002s : 2: substitution.elim_not_effective 0.97% : 0.000001s : 2: substitution.fold_const_symbol 4.31% : 0.000006s : 4: substitution.graph_param_transform 83.46% : 0.000109s : 3: substitution.inline 2.62% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.70% : 0.000005s : 4: substitution.remove_not_recompute_node 3.50% : 0.000005s : 4: substitution.replace_old_param ------[type_inference.] 0.007919 2 94.04% : 0.007447s : 1: type_inference.infer 5.96% : 0.000472s : 1: type_inference.specialize ------[replace.] 0.000024 3 100.00% : 0.000024s : 3: replace.inline ------[match.] 0.000107 3 100.00% : 0.000107s : 3: match.inline ------[predicate.] 0.000152 1047 1.10% : 0.000002s : 10: predicate.accumulaten_eliminater 0.97% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.63% : 0.000001s : 8: predicate.addn_check_dump 0.83% : 0.000001s : 10: predicate.addn_zero_filter 0.78% : 0.000001s : 10: predicate.adjust_all_reduce_mul_add 2.22% : 0.000003s : 18: predicate.arithmetic_simplify 0.81% : 0.000001s : 10: predicate.cast_eliminate 0.74% : 0.000001s : 8: predicate.check_bprop_eliminate 0.61% : 0.000001s : 8: predicate.compare_switch_simplify 0.26% : 0.000000s : 4: predicate.const_output_eliminate 0.66% : 0.000001s : 8: predicate.depend_value_elim 0.84% : 0.000001s : 10: predicate.dict_get_item_const_eliminator 1.24% : 0.000002s : 10: predicate.dict_get_item_eliminator 0.81% : 0.000001s : 10: predicate.dict_set_item_eliminator 1.22% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.32% : 0.000000s : 4: predicate.elim_not_effective 0.43% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.11% : 0.000002s : 14: predicate.environ_add_const_eliminate 1.05% : 0.000002s : 14: predicate.environ_get_add_eliminate 1.08% : 0.000002s : 14: predicate.environ_get_depend_swap 1.78% : 0.000003s : 22: predicate.environ_get_eliminate 1.10% : 0.000002s : 14: predicate.environ_get_set_eliminate 1.09% : 0.000002s : 13: predicate.exchange_switch_depend_value 1.92% : 0.000003s : 13: predicate.float_depend_g_call 0.65% : 0.000001s : 8: predicate.float_environ_get_switch 0.94% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.26% : 0.000000s : 4: predicate.fold_const_symbol 1.02% : 0.000002s : 8: predicate.get_grad_eliminate 0.33% : 0.000001s : 4: predicate.graph_param_transform 0.68% : 0.000001s : 8: predicate.incorporate_call 0.62% : 0.000001s : 8: predicate.incorporate_call_switch 5.63% : 0.000009s : 47: predicate.inline 0.88% : 0.000001s : 8: predicate.inline_without_move 0.41% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.89% : 0.000001s : 8: predicate.less_batch_normalization 1.68% : 0.000003s : 18: predicate.list_to_tuple_eliminator_ 2.13% : 0.000003s : 28: predicate.load_eliminater 1.09% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.05% : 0.000003s : 23: predicate.loop_unroll_before_grad 1.80% : 0.000003s : 18: predicate.make_slice_get_slice_eliminator 0.70% : 0.000001s : 8: predicate.merge_addn 0.84% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.68% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.76% : 0.000001s : 10: predicate.minmaximum_grad 1.20% : 0.000002s : 4: predicate.mutable_eliminate 0.43% : 0.000001s : 4: predicate.opt_reshape 0.67% : 0.000001s : 4: predicate.parallel_virtual_node 1.43% : 0.000002s : 13: predicate.partial_defer_inline 1.18% : 0.000002s : 14: predicate.partial_eliminate 0.84% : 0.000001s : 10: predicate.print_const_string_wrapper 0.70% : 0.000001s : 8: predicate.reduce_all_const_elim 1.40% : 0.000002s : 10: predicate.reduce_eliminate 2.20% : 0.000003s : 28: predicate.redundant_stop_gradient_eliminater 0.68% : 0.000001s : 8: predicate.remove_not_recompute_node 1.24% : 0.000002s : 18: predicate.replace_applicator 0.55% : 0.000001s : 8: predicate.replace_old_param 0.35% : 0.000001s : 4: predicate.reset_defer_inline 0.96% : 0.000001s : 10: predicate.reshape_eliminate 0.78% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.42% : 0.000001s : 4: predicate.row_tensor_eliminate 0.93% : 0.000001s : 8: predicate.same_eliminate 0.56% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.87% : 0.000001s : 8: predicate.shard_identity_eliminate 0.79% : 0.000001s : 8: predicate.special_op_eliminate 0.81% : 0.000001s : 8: predicate.specialize_transform 1.17% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 1.01% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.38% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.18% : 0.000002s : 13: predicate.switch_defer_inline 1.85% : 0.000003s : 21: predicate.switch_layer_defer_inline 4.66% : 0.000007s : 48: predicate.switch_simplify 0.78% : 0.000001s : 10: predicate.tile_eliminate 0.79% : 0.000001s : 10: predicate.transpose_eliminate 1.51% : 0.000002s : 18: predicate.tuple_list_convert_item_index_to_positive 1.55% : 0.000002s : 18: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000002s : 18: predicate.tuple_list_get_item_depend_reorder 3.15% : 0.000005s : 26: predicate.tuple_list_get_item_eliminator 1.70% : 0.000003s : 18: predicate.tuple_list_get_set_item_eliminator 2.40% : 0.000004s : 26: predicate.tuple_list_set_item_eliminator 1.65% : 0.000002s : 18: predicate.tuple_to_list_eliminator_ 2.17% : 0.000003s : 28: predicate.updatestate_pure_node_eliminater 3.02% : 0.000005s : 36: predicate.updatestate_useless_node_eliminater 0.43% : 0.000001s : 4: predicate.value_based_eliminate 0.83% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.82% : 0.000001s : 8: predicate.virtual_output_eliminate 0.33% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.54% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000228 6 6.86% : 0.000016s : 1: func_graph_cloner_run.FuncGraphClonerGraph 93.14% : 0.000212s : 5: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.031702 196 0.01% : 0.000004s : 1: ForceFp32Comm 10.01% : 0.003173s : 1: add_attr 9.98% : 0.003163s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.16% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.20% : 0.000064s : 1: auto_monad 0.06% : 0.000020s : 1: auto_monad_reorder 0.02% : 0.000006s : 1: backend_pass 0.01% : 0.000004s : 1: begin_end_overlap_inline 0.02% : 0.000006s : 1: bias_add_comm_swap 1.55% : 0.000491s : 1: bootstrap 0.08% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.05% : 0.000015s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.08% : 0.000024s : 1: cse_after_recomputation 0.02% : 0.000005s : 1: dataset_repeat_opt 0.02% : 0.000006s : 1: detach_backward 0.02% : 0.000008s : 1: environ_conv 0.06% : 0.000018s : 1: event_method 0.04% : 0.000014s : 1: execute 0.02% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.03% : 0.000009s : 1: graph_reusing 0.02% : 0.000005s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.02% : 0.000005s : 1: inline 0.02% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.02% : 0.000006s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.35% : 0.000428s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.02% : 0.000005s : 1: micro_interleaved_order_control 1.58% : 0.000500s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.04% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000014s : 1: opt.transform.mutable_eliminate 2.72% : 0.000861s : 78: opt.transform.opt_a 0.09% : 0.000028s : 1: opt.transform.opt_after_cconv 0.07% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.30% : 0.000095s : 28: opt.transform.opt_b 0.15% : 0.000047s : 2: opt.transform.opt_trans_graph 0.12% : 0.000037s : 4: opt.transform.symbol_engine_opt 6.81% : 0.002158s : 1: opt_a 0.32% : 0.000100s : 1: opt_after_cconv 1.47% : 0.000465s : 1: opt_after_jit_grad 0.61% : 0.000192s : 1: opt_b 12.67% : 0.004015s : 1: optimize 0.06% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.07% : 0.000022s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.02% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.02% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000003s : 1: parallel-infer-symbol-second 0.02% : 0.000005s : 1: partial_unused_args_eliminate 0.02% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.09% : 0.000029s : 1: pre_auto_parallel 0.02% : 0.000008s : 1: py_interpret_to_execute 0.02% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000017s : 1: remove_dup_value 0.87% : 0.000277s : 1: renormalize.infer 0.72% : 0.000227s : 1: renormalize.specialize 0.02% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000019s : 1: rewriter_after_opt_a 0.16% : 0.000049s : 1: rewriter_before_opt_a 0.02% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.02% : 0.000005s : 1: slice_recompute_activation 0.02% : 0.000005s : 1: split_layernorm_comm 0.02% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.25% : 0.000079s : 1: symbol_engine_optimizer 20.64% : 0.006544s : 1: task_emit 0.24% : 0.000075s : 1: tuple_transform 25.19% : 0.007985s : 1: type_inference 0.20% : 0.000065s : 1: validate TotalTime = 0.0440453, [24] [bootstrap]: 0.00050372 [type_inference]: 0.0124089 [event_method]: 1.943e-05 [auto_monad]: 9.796e-05 [graph_reusing]: 7.49002e-06 [inline]: 1.94e-06 [add_attr]: 0.00330913, [1] [add_attr_with_inline]: 0.00329917, [1] [Cycle 1]: 6.208e-05, [2] [tag_attr]: 2.462e-05 [meta_addattr_fg_expand]: 6.84999e-06 [parallel-infer-symbol]: 3.66001e-06 [pre_auto_parallel]: 3.836e-05 [insert-virtual-dataset]: 2.93e-06 [parallel-infer-symbol-second]: 8.2e-07 [dataset_repeat_opt]: 2.11998e-06 [pipeline_split]: 1.77999e-06 [optimize]: 0.0158429, [53] [py_interpret_to_execute]: 5.05999e-06 [rewriter_before_opt_a]: 8.055e-05 [opt_a]: 0.0132368, [3] [Cycle 1]: 0.00749807, [45] [expand_dump_flag]: 3.51999e-06 [switch_simplify]: 5.107e-05 [loop_unroll]: 3.735e-05 [a_1]: 0.00101685 [with_stream_mark]: 5.247e-05 [recompute_prepare]: 2.435e-05 [updatestate_depend_eliminate]: 9.62999e-06 [updatestate_assign_eliminate]: 8.43001e-06 [updatestate_loads_eliminate]: 7.67998e-06 [parameter_eliminate]: 2.89999e-06 [a_2]: 0.00024671 [accelerated_algorithm]: 3.213e-05 [shard]: 1.87999e-06 [meta_shard_fg_expand]: 3.70998e-06 [shard_inline]: 1.648e-05 [merge_send_recv]: 1.749e-05 [auto_parallel]: 1.176e-05 [parallel]: 1.863e-05 [flash_sp]: 9.59e-06 [merge_comm]: 9.54e-06 [allreduce_fusion]: 8.79e-06 [matmul_add_comm_reduction]: 2.885e-05 [allreduce_slice_to_reducescatter]: 8.2e-07 [virtual_shard_identity]: 1.714e-05 [virtual_dataset]: 1.638e-05 [get_grad_eliminate_]: 1.562e-05 [virtual_output]: 1.781e-05 [merge_forward]: 1.016e-05 [cell_reuse_recompute_pass]: 1.26997e-06 [offload_activation]: 1.871e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.942e-05 [merge_recompute_call_nodes]: 1.47001e-06 [before_grad]: 2.933e-05 [set_forward_comm_id_for_comm_node_pass]: 1.087e-05 [meta_fg_expand]: 0.00159858 [flash_sp_send_recv_attached]: 7.08e-06 [receive_attached]: 2.49999e-06 [after_resolve]: 6.54e-05 [a_after_grad]: 8.531e-05 [renormalize]: 0.00303887 [add_forward_monad_depend]: 8.97e-06 [auto_monad_grad]: 5.64998e-06 [auto_monad_eliminator]: 6.521e-05 [cse]: 0.00017152 [a_3]: 0.00037948 [Cycle 2]: 0.00448271, [45] [expand_dump_flag]: 2.10002e-06 [switch_simplify]: 5.459e-05 [loop_unroll]: 4.995e-05 [a_1]: 0.00173583 [with_stream_mark]: 1.641e-05 [recompute_prepare]: 1.7e-05 [updatestate_depend_eliminate]: 7.85e-06 [updatestate_assign_eliminate]: 6.77002e-06 [updatestate_loads_eliminate]: 5.72999e-06 [parameter_eliminate]: 1.15999e-06 [a_2]: 0.00020001 [accelerated_algorithm]: 2.002e-05 [shard]: 1.45999e-06 [meta_shard_fg_expand]: 2.68e-06 [shard_inline]: 1.56e-05 [merge_send_recv]: 1.033e-05 [auto_parallel]: 1.049e-05 [parallel]: 5.94e-06 [flash_sp]: 3.29001e-06 [merge_comm]: 7.15e-06 [allreduce_fusion]: 7.31001e-06 [matmul_add_comm_reduction]: 1.236e-05 [allreduce_slice_to_reducescatter]: 1.10001e-06 [virtual_shard_identity]: 1.683e-05 [virtual_dataset]: 1.444e-05 [get_grad_eliminate_]: 1.606e-05 [virtual_output]: 1.462e-05 [merge_forward]: 6.74001e-06 [cell_reuse_recompute_pass]: 1.22e-06 [offload_activation]: 1.321e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.477e-05 [merge_recompute_call_nodes]: 7.89994e-07 [before_grad]: 2.421e-05 [set_forward_comm_id_for_comm_node_pass]: 7.6e-06 [meta_fg_expand]: 8.28e-05 [flash_sp_send_recv_attached]: 1.09e-06 [receive_attached]: 1.39e-06 [after_resolve]: 2.063e-05 [a_after_grad]: 2.471e-05 [renormalize]: 0.00157614 [add_forward_monad_depend]: 5.86e-06 [auto_monad_grad]: 1.32e-06 [auto_monad_eliminator]: 1.991e-05 [cse]: 7.182e-05 [a_3]: 9.752e-05 [Cycle 3]: 0.00124066, [45] [expand_dump_flag]: 1.37e-06 [switch_simplify]: 1.537e-05 [loop_unroll]: 1.351e-05 [a_1]: 0.00037256 [with_stream_mark]: 1.352e-05 [recompute_prepare]: 1.367e-05 [updatestate_depend_eliminate]: 6.39001e-06 [updatestate_assign_eliminate]: 5.22e-06 [updatestate_loads_eliminate]: 5.23002e-06 [parameter_eliminate]: 1.05001e-06 [a_2]: 0.00017401 [accelerated_algorithm]: 1.71e-05 [shard]: 1.30999e-06 [meta_shard_fg_expand]: 2.78e-06 [shard_inline]: 1.565e-05 [merge_send_recv]: 9.19e-06 [auto_parallel]: 1.031e-05 [parallel]: 4.75001e-06 [flash_sp]: 1.18001e-06 [merge_comm]: 6.78e-06 [allreduce_fusion]: 6.31e-06 [matmul_add_comm_reduction]: 9.11002e-06 [allreduce_slice_to_reducescatter]: 4.10015e-07 [virtual_shard_identity]: 1.402e-05 [virtual_dataset]: 1.323e-05 [get_grad_eliminate_]: 1.291e-05 [virtual_output]: 1.284e-05 [merge_forward]: 5.94999e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 1.109e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.234e-05 [merge_recompute_call_nodes]: 7.10017e-07 [before_grad]: 2.15e-05 [set_forward_comm_id_for_comm_node_pass]: 6.89999e-06 [meta_fg_expand]: 4.48001e-06 [flash_sp_send_recv_attached]: 9.09989e-07 [receive_attached]: 1.03001e-06 [after_resolve]: 1.706e-05 [a_after_grad]: 2.145e-05 [renormalize]: 9.00181e-08 [add_forward_monad_depend]: 1.37e-06 [auto_monad_grad]: 8.80013e-07 [auto_monad_eliminator]: 1.356e-05 [cse]: 3.755e-05 [a_3]: 0.0001162 [py_interpret_to_execute_after_opt_a]: 6.03002e-06 [slice_cell_reuse_recomputed_activation]: 2.57001e-06 [rewriter_after_opt_a]: 3.319e-05 [convert_after_rewriter]: 1.28002e-06 [order_py_execute_after_rewriter]: 1.20999e-06 [mutable_eliminate]: 0.00057392 [opt_b]: 0.000413, [1] [Cycle 1]: 0.00040622, [7] [b_1]: 0.00028926 [b_2]: 1.613e-05 [updatestate_depend_eliminate]: 8.81002e-06 [updatestate_assign_eliminate]: 5.51e-06 [updatestate_loads_eliminate]: 5.45001e-06 [renormalize]: 7.00005e-07 [cse]: 4.524e-05 [optimize_parallel_all_gather_comm]: 2.944e-05 [overlap_param_gather]: 2.54999e-06 [cconv]: 2.343e-05 [loop_unroll]: 0.00046163 [opt_after_cconv]: 0.00017781, [1] [Cycle 1]: 0.00017228, [7] [c_1]: 7.385e-05 [parameter_eliminate]: 2.63e-06 [updatestate_depend_eliminate]: 8.80999e-06 [updatestate_assign_eliminate]: 5.57001e-06 [updatestate_loads_eliminate]: 5.19e-06 [cse]: 4.195e-05 [renormalize]: 5.40022e-07 [remove_dup_value]: 5.293e-05 [tuple_transform]: 0.00013687, [1] [Cycle 1]: 0.00013226, [4] [d_1]: 9.79e-05 [none_parameter_eliminate]: 1.81998e-06 [renormalize]: 1.69995e-07 [switch_simplify]: 1.361e-05 [partial_unused_args_eliminate]: 1.87001e-06 [add_recomputation]: 7.219e-05 [cse_after_recomputation]: 4.177e-05, [1] [Cycle 1]: 3.611e-05, [1] [cse]: 3.025e-05 [environ_conv]: 1.121e-05 [swap_dp_allreduce_reducescatter]: 1.002e-05 [bias_add_comm_swap]: 2.73998e-06 [label_micro_interleaved_index]: 4.48999e-06 [label_fine_grained_interleaved_index]: 2.81e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.16998e-06 [micro_interleaved_order_control]: 2.17999e-06 [assign_add_opt]: 1.27e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.08001e-06 [full_micro_interleaved_order_control]: 2.37001e-06 [reorder_send_recv_between_fp_bp]: 2.88e-06 [comm_op_add_attrs]: 1.32999e-06 [add_comm_op_reuse_tag]: 1.05999e-06 [interleave_split_concat_branches]: 1.22e-06 [interleave_parallel_branches]: 1.06002e-06 [overlap_opt_shard_in_pipeline]: 1.70001e-06 [overlap_opt_shard_grad_in_pipeline]: 1.76998e-06 [control_data_broadcast_order]: 2.108e-05 [grouped_pairwise_exchange_alltoall]: 1.55999e-06 [offloading_packed_experts]: 6.24001e-06 [overlap_recompute_and_grad_model_parallel]: 6.99001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.29e-06 [overlap_recompute_allgather_and_fa_grad]: 1.32999e-06 [overlap_recompute_comm]: 2.00002e-06 [overlap_grad_ring_attention]: 6.17001e-06 [overlap_grad_flash_sp]: 3.04e-05 [begin_end_overlap_inline]: 5.00004e-07 [split_matmul_comm_elemetwise]: 2.24001e-06 [split_layernorm_comm]: 1.94999e-06 [handle_group_info]: 1.04e-06 [symbol_engine_optimizer]: 0.00012743, [1] [Cycle 1]: 0.00012172, [6] [build]: 9.34998e-06 [elim_shapecalc]: 1.878e-05 [elim_not_effective]: 2.454e-05 [opt_reshape]: 1.806e-05 [fold_const_symbol]: 2.173e-05 [renormalize]: 1.80007e-07 [detach_backward]: 1.96998e-06 [pipeline_parallel_scheduler]: 1.45999e-06 [auto_monad_reorder]: 2.925e-05 [get_jit_bprop_graph]: 1.34998e-06 [rewriter_after_jit_bprop_graph]: 3.60998e-06 [opt_after_jit_grad]: 0.00053093 [validate]: 6.486e-05 [backend_pass]: 9.80013e-07 [task_emit]: 0.0109332 [execute]: 7.98999e-06 Sums bootstrap : 0.000504s : 1.28% type_inference : 0.012409s : 31.47% event_method : 0.000019s : 0.05% auto_monad : 0.000098s : 0.25% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000025s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000007s : 0.02% parallel-infer-symbol : 0.000004s : 0.01% pre_auto_parallel : 0.000038s : 0.10% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000081s : 0.20% optimize.opt_a.expand_dump_flag : 0.000007s : 0.02% optimize.opt_a.switch_simplify : 0.000121s : 0.31% optimize.opt_a.loop_unroll : 0.000101s : 0.26% optimize.opt_a.a_1 : 0.003125s : 7.93% optimize.opt_a.with_stream_mark : 0.000082s : 0.21% optimize.opt_a.recompute_prepare : 0.000055s : 0.14% optimize.opt_a.updatestate_depend_eliminate : 0.000024s : 0.06% optimize.opt_a.updatestate_assign_eliminate : 0.000020s : 0.05% optimize.opt_a.updatestate_loads_eliminate : 0.000019s : 0.05% optimize.opt_a.parameter_eliminate : 0.000005s : 0.01% optimize.opt_a.a_2 : 0.000621s : 1.57% optimize.opt_a.accelerated_algorithm : 0.000069s : 0.18% optimize.opt_a.shard : 0.000005s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000009s : 0.02% optimize.opt_a.shard_inline : 0.000048s : 0.12% optimize.opt_a.merge_send_recv : 0.000037s : 0.09% optimize.opt_a.auto_parallel : 0.000033s : 0.08% optimize.opt_a.parallel : 0.000029s : 0.07% optimize.opt_a.flash_sp : 0.000014s : 0.04% optimize.opt_a.merge_comm : 0.000023s : 0.06% optimize.opt_a.allreduce_fusion : 0.000022s : 0.06% optimize.opt_a.matmul_add_comm_reduction : 0.000050s : 0.13% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.01% optimize.opt_a.virtual_shard_identity : 0.000048s : 0.12% optimize.opt_a.virtual_dataset : 0.000044s : 0.11% optimize.opt_a.get_grad_eliminate_ : 0.000045s : 0.11% optimize.opt_a.virtual_output : 0.000045s : 0.11% optimize.opt_a.merge_forward : 0.000023s : 0.06% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.01% optimize.opt_a.offload_activation : 0.000043s : 0.11% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000077s : 0.19% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.01% optimize.opt_a.before_grad : 0.000075s : 0.19% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000025s : 0.06% optimize.opt_a.meta_fg_expand : 0.001686s : 4.28% optimize.opt_a.flash_sp_send_recv_attached : 0.000009s : 0.02% optimize.opt_a.receive_attached : 0.000005s : 0.01% optimize.opt_a.after_resolve : 0.000103s : 0.26% optimize.opt_a.a_after_grad : 0.000131s : 0.33% optimize.opt_a.renormalize : 0.004615s : 11.71% optimize.opt_a.add_forward_monad_depend : 0.000016s : 0.04% optimize.opt_a.auto_monad_grad : 0.000008s : 0.02% optimize.opt_a.auto_monad_eliminator : 0.000099s : 0.25% optimize.opt_a.cse : 0.000281s : 0.71% optimize.opt_a.a_3 : 0.000593s : 1.50% optimize.py_interpret_to_execute_after_opt_a : 0.000006s : 0.02% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.01% optimize.rewriter_after_opt_a : 0.000033s : 0.08% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000574s : 1.46% optimize.opt_b.b_1 : 0.000289s : 0.73% optimize.opt_b.b_2 : 0.000016s : 0.04% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000045s : 0.11% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.07% optimize.overlap_param_gather : 0.000003s : 0.01% optimize.cconv : 0.000023s : 0.06% optimize.loop_unroll : 0.000462s : 1.17% optimize.opt_after_cconv.c_1 : 0.000074s : 0.19% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000005s : 0.01% optimize.opt_after_cconv.cse : 0.000042s : 0.11% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000053s : 0.13% optimize.tuple_transform.d_1 : 0.000098s : 0.25% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000014s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000072s : 0.18% optimize.cse_after_recomputation.cse : 0.000030s : 0.08% optimize.environ_conv : 0.000011s : 0.03% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.03% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000002s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000021s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000006s : 0.02% optimize.overlap_recompute_and_grad_model_parallel : 0.000007s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000006s : 0.02% optimize.overlap_grad_flash_sp : 0.000030s : 0.08% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000009s : 0.02% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000019s : 0.05% optimize.symbol_engine_optimizer.elim_not_effective : 0.000025s : 0.06% optimize.symbol_engine_optimizer.opt_reshape : 0.000018s : 0.05% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000022s : 0.06% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000029s : 0.07% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000531s : 1.35% validate : 0.000065s : 0.16% backend_pass : 0.000001s : 0.00% task_emit : 0.010933s : 27.73% execute : 0.000008s : 0.02% Time group info: ------[substitution.] 0.000677 216 0.53% : 0.000004s : 7: substitution.elim_not_effective 0.59% : 0.000004s : 5: substitution.float_depend_g_call 0.78% : 0.000005s : 3: substitution.float_tuple_getitem_switch 0.43% : 0.000003s : 7: substitution.fold_const_symbol 1.53% : 0.000010s : 11: substitution.graph_param_transform 0.42% : 0.000003s : 2: substitution.incorporate_call 0.33% : 0.000002s : 2: substitution.incorporate_call_switch 54.89% : 0.000372s : 12: substitution.inline 2.60% : 0.000018s : 2: substitution.inline_without_move 1.99% : 0.000013s : 25: substitution.j_node_and_user_rematch 2.55% : 0.000017s : 3: substitution.less_batch_normalization 1.91% : 0.000013s : 11: substitution.minmaximum_grad 0.17% : 0.000001s : 1: substitution.opt_reshape 0.81% : 0.000006s : 5: substitution.partial_eliminate 2.61% : 0.000018s : 25: substitution.remove_not_recompute_node 3.08% : 0.000021s : 9: substitution.replace_applicator 1.55% : 0.000011s : 12: substitution.replace_old_param 1.77% : 0.000012s : 3: substitution.reshape_eliminate 0.44% : 0.000003s : 1: substitution.set_cell_output_no_recompute 1.73% : 0.000012s : 4: substitution.transpose_eliminate 4.56% : 0.000031s : 11: substitution.tuple_list_convert_item_index_to_positive 2.04% : 0.000014s : 11: substitution.tuple_list_get_item_const_eliminator 2.80% : 0.000019s : 11: substitution.tuple_list_get_item_depend_reorder 7.13% : 0.000048s : 22: substitution.tuple_list_get_item_eliminator 2.74% : 0.000019s : 11: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.012342 2 92.39% : 0.011403s : 1: type_inference.infer 7.61% : 0.000939s : 1: type_inference.specialize ------[replace.] 0.000152 20 64.77% : 0.000098s : 12: replace.inline 35.23% : 0.000054s : 8: replace.tuple_list_get_item_eliminator ------[match.] 0.000378 20 96.34% : 0.000364s : 12: match.inline 3.66% : 0.000014s : 8: match.tuple_list_get_item_eliminator ------[predicate.] 0.000847 6196 0.98% : 0.000008s : 70: predicate.accumulaten_eliminater 0.40% : 0.000003s : 11: predicate.ad_related_special_op_eliminate 0.60% : 0.000005s : 41: predicate.addn_check_dump 0.97% : 0.000008s : 70: predicate.addn_zero_filter 0.94% : 0.000008s : 70: predicate.adjust_all_reduce_mul_add 2.09% : 0.000018s : 111: predicate.arithmetic_simplify 0.99% : 0.000008s : 70: predicate.cast_eliminate 1.25% : 0.000011s : 79: predicate.check_bprop_eliminate 0.60% : 0.000005s : 41: predicate.compare_switch_simplify 0.12% : 0.000001s : 12: predicate.const_output_eliminate 0.62% : 0.000005s : 41: predicate.depend_value_elim 1.06% : 0.000009s : 70: predicate.dict_get_item_const_eliminator 1.29% : 0.000011s : 70: predicate.dict_get_item_eliminator 1.01% : 0.000009s : 70: predicate.dict_set_item_eliminator 0.43% : 0.000004s : 23: predicate.dumpgradient_eliminate 0.13% : 0.000001s : 11: predicate.elim_not_effective 0.20% : 0.000002s : 11: predicate.elim_shapecalc_of_broadcastargs 1.17% : 0.000010s : 82: predicate.environ_add_const_eliminate 1.15% : 0.000010s : 82: predicate.environ_get_add_eliminate 1.14% : 0.000010s : 82: predicate.environ_get_depend_swap 1.77% : 0.000015s : 123: predicate.environ_get_eliminate 1.14% : 0.000010s : 82: predicate.environ_get_set_eliminate 1.33% : 0.000011s : 90: predicate.exchange_switch_depend_value 1.88% : 0.000016s : 90: predicate.float_depend_g_call 0.60% : 0.000005s : 41: predicate.float_environ_get_switch 0.80% : 0.000007s : 53: predicate.float_tuple_getitem_switch 0.10% : 0.000001s : 11: predicate.fold_const_symbol 0.68% : 0.000006s : 41: predicate.get_grad_eliminate 0.11% : 0.000001s : 11: predicate.graph_param_transform 0.63% : 0.000005s : 41: predicate.incorporate_call 0.58% : 0.000005s : 41: predicate.incorporate_call_switch 5.24% : 0.000044s : 266: predicate.inline 1.33% : 0.000011s : 64: predicate.inline_without_move 0.37% : 0.000003s : 41: predicate.j_node_and_user_rematch 0.78% : 0.000007s : 41: predicate.less_batch_normalization 1.53% : 0.000013s : 101: predicate.list_to_tuple_eliminator_ 2.40% : 0.000020s : 172: predicate.load_eliminater 0.41% : 0.000003s : 12: predicate.loop_unroll_after_grad 1.73% : 0.000015s : 110: predicate.loop_unroll_before_grad 1.40% : 0.000012s : 94: predicate.make_slice_get_slice_eliminator 0.63% : 0.000005s : 41: predicate.merge_addn 1.18% : 0.000010s : 79: predicate.micro_step_allgather_replace 1.19% : 0.000010s : 79: predicate.mini_step_allgather_replace 0.99% : 0.000008s : 70: predicate.minmaximum_grad 0.45% : 0.000004s : 12: predicate.mutable_eliminate 0.22% : 0.000002s : 11: predicate.opt_reshape 0.21% : 0.000002s : 12: predicate.parallel_virtual_node 1.64% : 0.000014s : 90: predicate.partial_defer_inline 1.45% : 0.000012s : 90: predicate.partial_eliminate 1.02% : 0.000009s : 70: predicate.print_const_string_wrapper 0.62% : 0.000005s : 41: predicate.reduce_all_const_elim 1.25% : 0.000011s : 70: predicate.reduce_eliminate 2.42% : 0.000020s : 172: predicate.redundant_stop_gradient_eliminater 0.38% : 0.000003s : 41: predicate.remove_not_recompute_node 1.81% : 0.000015s : 157: predicate.replace_applicator 0.63% : 0.000005s : 64: predicate.replace_old_param 0.15% : 0.000001s : 12: predicate.reset_defer_inline 1.06% : 0.000009s : 70: predicate.reshape_eliminate 1.27% : 0.000011s : 79: predicate.row_tensor_add_zeros_like 0.21% : 0.000002s : 12: predicate.row_tensor_eliminate 1.31% : 0.000011s : 79: predicate.same_eliminate 0.42% : 0.000004s : 41: predicate.set_cell_output_no_recompute 0.67% : 0.000006s : 41: predicate.shard_identity_eliminate 0.40% : 0.000003s : 23: predicate.special_op_eliminate 0.70% : 0.000006s : 41: predicate.specialize_transform 1.32% : 0.000011s : 79: predicate.split_environ_get_set_with_tuple_value 1.18% : 0.000010s : 64: predicate.stack_unstack_eliminate 0.20% : 0.000002s : 12: predicate.switch_call_monad_eliminater 1.44% : 0.000012s : 90: predicate.switch_defer_inline 2.62% : 0.000022s : 169: predicate.switch_layer_defer_inline 4.17% : 0.000035s : 252: predicate.switch_simplify 1.00% : 0.000008s : 70: predicate.tile_eliminate 1.04% : 0.000009s : 70: predicate.transpose_eliminate 4.75% : 0.000040s : 93: predicate.tuple_list_convert_item_index_to_positive 1.50% : 0.000013s : 93: predicate.tuple_list_get_item_const_eliminator 1.40% : 0.000012s : 93: predicate.tuple_list_get_item_depend_reorder 2.59% : 0.000022s : 142: predicate.tuple_list_get_item_eliminator 1.50% : 0.000013s : 93: predicate.tuple_list_get_set_item_eliminator 2.14% : 0.000018s : 134: predicate.tuple_list_set_item_eliminator 1.50% : 0.000013s : 101: predicate.tuple_to_list_eliminator_ 2.38% : 0.000020s : 172: predicate.updatestate_pure_node_eliminater 3.07% : 0.000026s : 213: predicate.updatestate_useless_node_eliminater 0.20% : 0.000002s : 12: predicate.value_based_eliminate 0.67% : 0.000006s : 41: predicate.virtual_dataset_eliminate 0.69% : 0.000006s : 41: predicate.virtual_output_eliminate 0.17% : 0.000001s : 11: predicate.virtual_view_grad_eliminate 0.23% : 0.000002s : 12: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001312 25 56.16% : 0.000737s : 9: func_graph_cloner_run.FuncGraphClonerGraph 43.84% : 0.000575s : 16: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.073501 237 0.00% : 0.000004s : 1: ForceFp32Comm 4.51% : 0.003314s : 1: add_attr 4.49% : 0.003303s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.10% : 0.000077s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.14% : 0.000104s : 1: auto_monad 0.05% : 0.000033s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.73% : 0.000537s : 1: bootstrap 0.04% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.03% : 0.000024s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.06% : 0.000045s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000014s : 1: environ_conv 0.03% : 0.000025s : 1: event_method 0.02% : 0.000013s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000008s : 1: label_micro_interleaved_index 0.64% : 0.000471s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 0.79% : 0.000583s : 1: mutable_eliminate 0.01% : 0.000009s : 1: offloading_packed_experts 0.03% : 0.000024s : 1: opt.transform.loop_unroll_optimizer 0.03% : 0.000024s : 1: opt.transform.mutable_eliminate 7.03% : 0.005165s : 117: opt.transform.opt_a 0.10% : 0.000072s : 1: opt.transform.opt_after_cconv 0.06% : 0.000047s : 1: opt.transform.opt_after_jit_grad 0.38% : 0.000276s : 28: opt.transform.opt_b 0.15% : 0.000109s : 2: opt.transform.opt_trans_graph 0.11% : 0.000079s : 4: opt.transform.symbol_engine_opt 18.01% : 0.013240s : 1: opt_a 0.25% : 0.000181s : 1: opt_after_cconv 0.74% : 0.000541s : 1: opt_after_jit_grad 0.57% : 0.000417s : 1: opt_b 21.56% : 0.015848s : 1: optimize 0.05% : 0.000034s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.05% : 0.000034s : 1: overlap_grad_flash_sp 0.01% : 0.000005s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000009s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000005s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.06% : 0.000043s : 1: pre_auto_parallel 0.01% : 0.000009s : 1: py_interpret_to_execute 0.01% : 0.000009s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.08% : 0.000058s : 1: remove_dup_value 4.06% : 0.002981s : 2: renormalize.infer 2.20% : 0.001618s : 2: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000037s : 1: rewriter_after_opt_a 0.11% : 0.000084s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000130s : 1: symbol_engine_optimizer 14.90% : 0.010950s : 1: task_emit 0.19% : 0.000140s : 1: tuple_transform 16.91% : 0.012429s : 1: type_inference 0.14% : 0.000105s : 1: validate ... TotalTime = 32.3979, [24] [bootstrap]: 0.0009033 [type_inference]: 0.0550804 [event_method]: 5.999e-05 [auto_monad]: 0.00016193 [graph_reusing]: 6.65002e-06 [inline]: 3.23e-06 [add_attr]: 0.00863628, [1] [add_attr_with_inline]: 0.0086211, [1] [Cycle 1]: 0.00016674, [2] [tag_attr]: 4.989e-05 [meta_addattr_fg_expand]: 2.057e-05 [parallel-infer-symbol]: 3.7e-06 [pre_auto_parallel]: 6.643e-05 [insert-virtual-dataset]: 2.56998e-06 [parallel-infer-symbol-second]: 9.39996e-07 [dataset_repeat_opt]: 1.62001e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00594307, [53] [py_interpret_to_execute]: 6.73e-06 [rewriter_before_opt_a]: 0.00025822 [opt_a]: 0.0035097, [2] [Cycle 1]: 0.00289544, [45] [expand_dump_flag]: 3.53e-06 [switch_simplify]: 8.927e-05 [loop_unroll]: 3.735e-05 [a_1]: 0.00079157 [with_stream_mark]: 1.66e-05 [recompute_prepare]: 8.16002e-06 [updatestate_depend_eliminate]: 1.685e-05 [updatestate_assign_eliminate]: 1.352e-05 [updatestate_loads_eliminate]: 4.95999e-06 [parameter_eliminate]: 1.01002e-06 [a_2]: 8.037e-05 [accelerated_algorithm]: 6.93998e-06 [shard]: 5.12e-06 [meta_shard_fg_expand]: 2.10002e-06 [shard_inline]: 6.65002e-06 [merge_send_recv]: 5.47e-05 [auto_parallel]: 9.05001e-06 [parallel]: 0.00010146 [flash_sp]: 4.421e-05 [merge_comm]: 6.41998e-06 [allreduce_fusion]: 1.138e-05 [matmul_add_comm_reduction]: 2.046e-05 [allreduce_slice_to_reducescatter]: 1.168e-05 [virtual_shard_identity]: 9.20999e-06 [virtual_dataset]: 6.82002e-06 [get_grad_eliminate_]: 6.02999e-06 [virtual_output]: 6.38e-06 [merge_forward]: 4.05e-06 [cell_reuse_recompute_pass]: 1.42999e-06 [offload_activation]: 2.357e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.483e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.001e-05 [set_forward_comm_id_for_comm_node_pass]: 1.505e-05 [meta_fg_expand]: 3.4e-06 [flash_sp_send_recv_attached]: 3.44001e-06 [receive_attached]: 2.313e-05 [after_resolve]: 1.442e-05 [a_after_grad]: 1.014e-05 [renormalize]: 0.00098385 [add_forward_monad_depend]: 6.87002e-06 [auto_monad_grad]: 2.39001e-06 [auto_monad_eliminator]: 3.4e-05 [cse]: 6.16e-05 [a_3]: 4.763e-05 [Cycle 2]: 0.00060344, [45] [expand_dump_flag]: 1.88002e-06 [switch_simplify]: 6.83e-06 [loop_unroll]: 6.61e-06 [a_1]: 0.00012393 [with_stream_mark]: 1.147e-05 [recompute_prepare]: 5.86e-06 [updatestate_depend_eliminate]: 3.08e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.42001e-06 [parameter_eliminate]: 1.04998e-06 [a_2]: 6.833e-05 [accelerated_algorithm]: 6.09001e-06 [shard]: 8.89995e-07 [meta_shard_fg_expand]: 1.36998e-06 [shard_inline]: 5.67999e-06 [merge_send_recv]: 4.40999e-06 [auto_parallel]: 6.21998e-06 [parallel]: 4.60001e-06 [flash_sp]: 6.43e-06 [merge_comm]: 3.06999e-06 [allreduce_fusion]: 2.79999e-06 [matmul_add_comm_reduction]: 5.10999e-06 [allreduce_slice_to_reducescatter]: 4.09986e-07 [virtual_shard_identity]: 6.73998e-06 [virtual_dataset]: 5.94e-06 [get_grad_eliminate_]: 5.67001e-06 [virtual_output]: 5.60001e-06 [merge_forward]: 3.00998e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 6.03002e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.196e-05 [merge_recompute_call_nodes]: 7.10017e-07 [before_grad]: 8.78001e-06 [set_forward_comm_id_for_comm_node_pass]: 2.92002e-06 [meta_fg_expand]: 1.92001e-06 [flash_sp_send_recv_attached]: 8.90024e-07 [receive_attached]: 1.30001e-06 [after_resolve]: 9.36998e-06 [a_after_grad]: 8.92e-06 [renormalize]: 8.9989e-08 [add_forward_monad_depend]: 1.03001e-06 [auto_monad_grad]: 1.20999e-06 [auto_monad_eliminator]: 5.95002e-06 [cse]: 1.293e-05 [a_3]: 3.407e-05 [py_interpret_to_execute_after_opt_a]: 3.96001e-06 [slice_cell_reuse_recomputed_activation]: 4.90999e-06 [rewriter_after_opt_a]: 1.871e-05 [convert_after_rewriter]: 1.37999e-06 [order_py_execute_after_rewriter]: 1.30999e-06 [mutable_eliminate]: 0.00057546 [opt_b]: 0.00018969, [1] [Cycle 1]: 0.00018306, [7] [b_1]: 0.00011545 [b_2]: 7.56999e-06 [updatestate_depend_eliminate]: 4.42e-06 [updatestate_assign_eliminate]: 2.53e-06 [updatestate_loads_eliminate]: 2.12999e-06 [renormalize]: 3.80009e-07 [cse]: 1.78e-05 [optimize_parallel_all_gather_comm]: 2.866e-05 [overlap_param_gather]: 1.345e-05 [cconv]: 2.314e-05 [loop_unroll]: 0.00044749 [opt_after_cconv]: 9.588e-05, [1] [Cycle 1]: 9.006e-05, [7] [c_1]: 2.827e-05 [parameter_eliminate]: 2.24001e-06 [updatestate_depend_eliminate]: 5.27001e-06 [updatestate_assign_eliminate]: 2.44999e-06 [updatestate_loads_eliminate]: 2.21998e-06 [cse]: 1.8e-05 [renormalize]: 2.89991e-07 [remove_dup_value]: 1.301e-05 [tuple_transform]: 6.958e-05, [1] [Cycle 1]: 6.51e-05, [4] [d_1]: 4.025e-05 [none_parameter_eliminate]: 1.30001e-06 [renormalize]: 1.29978e-07 [switch_simplify]: 6.42001e-06 [partial_unused_args_eliminate]: 2.19999e-06 [add_recomputation]: 6.559e-05 [cse_after_recomputation]: 2.174e-05, [1] [Cycle 1]: 1.705e-05, [1] [cse]: 1.193e-05 [environ_conv]: 1.672e-05 [swap_dp_allreduce_reducescatter]: 2.781e-05 [bias_add_comm_swap]: 1.393e-05 [label_micro_interleaved_index]: 1.47e-05 [label_fine_grained_interleaved_index]: 5.09e-06 [merge_cast_opt]: 9.69972e-07 [slice_recompute_activation]: 2.22999e-06 [micro_interleaved_order_control]: 7.2e-06 [assign_add_opt]: 4.69998e-07 [ForceFp32Comm]: 4.41002e-06 [remove_cast_before_assign_add]: 7.38e-06 [full_micro_interleaved_order_control]: 2.082e-05 [reorder_send_recv_between_fp_bp]: 1.01002e-06 [comm_op_add_attrs]: 3.00002e-07 [add_comm_op_reuse_tag]: 3.41001e-06 [interleave_split_concat_branches]: 6.79982e-07 [interleave_parallel_branches]: 4.13999e-06 [overlap_opt_shard_in_pipeline]: 1.554e-05 [overlap_opt_shard_grad_in_pipeline]: 8.90024e-07 [control_data_broadcast_order]: 1.234e-05 [grouped_pairwise_exchange_alltoall]: 3.46999e-06 [offloading_packed_experts]: 2.66e-06 [overlap_recompute_and_grad_model_parallel]: 1.666e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.22999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.30999e-06 [overlap_recompute_comm]: 2.36e-06 [overlap_grad_ring_attention]: 2.615e-05 [overlap_grad_flash_sp]: 5.225e-05 [begin_end_overlap_inline]: 3.29979e-07 [split_matmul_comm_elemetwise]: 1.427e-05 [split_layernorm_comm]: 1.60001e-06 [handle_group_info]: 9.00007e-07 [symbol_engine_optimizer]: 7.235e-05, [1] [Cycle 1]: 6.789e-05, [6] [build]: 2.54999e-06 [elim_shapecalc]: 1.032e-05 [elim_not_effective]: 1.196e-05 [opt_reshape]: 6.48e-06 [fold_const_symbol]: 9.04e-06 [renormalize]: 1.50001e-07 [detach_backward]: 1.80001e-06 [pipeline_parallel_scheduler]: 1.37999e-06 [auto_monad_reorder]: 2.26e-05 [get_jit_bprop_graph]: 1.64998e-06 [rewriter_after_jit_bprop_graph]: 2.61e-06 [opt_after_jit_grad]: 0.00044755 [validate]: 5.4e-05 [backend_pass]: 9.30013e-07 [task_emit]: 32.326 [execute]: 1.216e-05 Sums bootstrap : 0.000903s : 0.00% type_inference : 0.055080s : 0.17% event_method : 0.000060s : 0.00% auto_monad : 0.000162s : 0.00% graph_reusing : 0.000007s : 0.00% inline : 0.000003s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000050s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000021s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000066s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000007s : 0.00% optimize.rewriter_before_opt_a : 0.000258s : 0.00% optimize.opt_a.expand_dump_flag : 0.000005s : 0.00% optimize.opt_a.switch_simplify : 0.000096s : 0.00% optimize.opt_a.loop_unroll : 0.000044s : 0.00% optimize.opt_a.a_1 : 0.000916s : 0.00% optimize.opt_a.with_stream_mark : 0.000028s : 0.00% optimize.opt_a.recompute_prepare : 0.000014s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000020s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000016s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000007s : 0.00% optimize.opt_a.parameter_eliminate : 0.000002s : 0.00% optimize.opt_a.a_2 : 0.000149s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.00% optimize.opt_a.shard : 0.000006s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000012s : 0.00% optimize.opt_a.merge_send_recv : 0.000059s : 0.00% optimize.opt_a.auto_parallel : 0.000015s : 0.00% optimize.opt_a.parallel : 0.000106s : 0.00% optimize.opt_a.flash_sp : 0.000051s : 0.00% optimize.opt_a.merge_comm : 0.000009s : 0.00% optimize.opt_a.allreduce_fusion : 0.000014s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000026s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000012s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000016s : 0.00% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000030s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000037s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000018s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000004s : 0.00% optimize.opt_a.receive_attached : 0.000024s : 0.00% optimize.opt_a.after_resolve : 0.000024s : 0.00% optimize.opt_a.a_after_grad : 0.000019s : 0.00% optimize.opt_a.renormalize : 0.000984s : 0.00% optimize.opt_a.add_forward_monad_depend : 0.000008s : 0.00% optimize.opt_a.auto_monad_grad : 0.000004s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000040s : 0.00% optimize.opt_a.cse : 0.000075s : 0.00% optimize.opt_a.a_3 : 0.000082s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000005s : 0.00% optimize.rewriter_after_opt_a : 0.000019s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000575s : 0.00% optimize.opt_b.b_1 : 0.000115s : 0.00% optimize.opt_b.b_2 : 0.000008s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000004s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000018s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.00% optimize.overlap_param_gather : 0.000013s : 0.00% optimize.cconv : 0.000023s : 0.00% optimize.loop_unroll : 0.000447s : 0.00% optimize.opt_after_cconv.c_1 : 0.000028s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.cse : 0.000018s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000013s : 0.00% optimize.tuple_transform.d_1 : 0.000040s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000001s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000006s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000066s : 0.00% optimize.cse_after_recomputation.cse : 0.000012s : 0.00% optimize.environ_conv : 0.000017s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000028s : 0.00% optimize.bias_add_comm_swap : 0.000014s : 0.00% optimize.label_micro_interleaved_index : 0.000015s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000005s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000007s : 0.00% optimize.assign_add_opt : 0.000000s : 0.00% optimize.ForceFp32Comm : 0.000004s : 0.00% optimize.remove_cast_before_assign_add : 0.000007s : 0.00% optimize.full_micro_interleaved_order_control : 0.000021s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000001s : 0.00% optimize.comm_op_add_attrs : 0.000000s : 0.00% optimize.add_comm_op_reuse_tag : 0.000003s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000004s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000016s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000001s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000003s : 0.00% optimize.offloading_packed_experts : 0.000003s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000017s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000026s : 0.00% optimize.overlap_grad_flash_sp : 0.000052s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000014s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000006s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000023s : 0.00% get_jit_bprop_graph : 0.000002s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000448s : 0.00% validate : 0.000054s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 32.326035s : 99.81% execute : 0.000012s : 0.00% Time group info: ------[substitution.] 0.000261 30 0.67% : 0.000002s : 2: substitution.elim_not_effective 0.38% : 0.000001s : 2: substitution.fold_const_symbol 2.24% : 0.000006s : 4: substitution.graph_param_transform 76.38% : 0.000199s : 6: substitution.inline 1.20% : 0.000003s : 4: substitution.j_node_and_user_rematch 6.21% : 0.000016s : 4: substitution.remove_not_recompute_node 2.59% : 0.000007s : 4: substitution.replace_old_param 10.33% : 0.000027s : 4: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.054983 2 97.75% : 0.053745s : 1: type_inference.infer 2.25% : 0.001238s : 1: type_inference.specialize ------[replace.] 0.000084 10 68.62% : 0.000058s : 6: replace.inline 31.38% : 0.000026s : 4: replace.tuple_list_get_item_eliminator ------[match.] 0.000221 10 88.61% : 0.000196s : 6: match.inline 11.39% : 0.000025s : 4: match.tuple_list_get_item_eliminator ------[predicate.] 0.000213 1408 1.05% : 0.000002s : 15: predicate.accumulaten_eliminater 0.68% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 0.45% : 0.000001s : 8: predicate.addn_check_dump 1.01% : 0.000002s : 15: predicate.addn_zero_filter 0.87% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.14% : 0.000005s : 23: predicate.arithmetic_simplify 0.98% : 0.000002s : 15: predicate.cast_eliminate 0.56% : 0.000001s : 8: predicate.check_bprop_eliminate 0.49% : 0.000001s : 8: predicate.compare_switch_simplify 0.19% : 0.000000s : 4: predicate.const_output_eliminate 0.60% : 0.000001s : 8: predicate.depend_value_elim 0.98% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.25% : 0.000003s : 15: predicate.dict_get_item_eliminator 0.96% : 0.000002s : 15: predicate.dict_set_item_eliminator 0.78% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.20% : 0.000000s : 4: predicate.elim_not_effective 0.36% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.37% : 0.000003s : 19: predicate.environ_add_const_eliminate 1.11% : 0.000002s : 19: predicate.environ_get_add_eliminate 1.21% : 0.000003s : 19: predicate.environ_get_depend_swap 1.69% : 0.000004s : 27: predicate.environ_get_eliminate 1.12% : 0.000002s : 19: predicate.environ_get_set_eliminate 1.55% : 0.000003s : 25: predicate.exchange_switch_depend_value 2.23% : 0.000005s : 25: predicate.float_depend_g_call 0.49% : 0.000001s : 8: predicate.float_environ_get_switch 0.77% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.18% : 0.000000s : 4: predicate.fold_const_symbol 0.64% : 0.000001s : 8: predicate.get_grad_eliminate 0.18% : 0.000000s : 4: predicate.graph_param_transform 0.49% : 0.000001s : 8: predicate.incorporate_call 0.42% : 0.000001s : 8: predicate.incorporate_call_switch 5.72% : 0.000012s : 64: predicate.inline 0.67% : 0.000001s : 8: predicate.inline_without_move 0.29% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.80% : 0.000002s : 8: predicate.less_batch_normalization 2.01% : 0.000004s : 27: predicate.list_to_tuple_eliminator_ 2.54% : 0.000005s : 42: predicate.load_eliminater 0.86% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.91% : 0.000006s : 46: predicate.loop_unroll_before_grad 1.61% : 0.000003s : 23: predicate.make_slice_get_slice_eliminator 0.55% : 0.000001s : 8: predicate.merge_addn 0.48% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.50% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.88% : 0.000002s : 15: predicate.minmaximum_grad 0.86% : 0.000002s : 4: predicate.mutable_eliminate 0.31% : 0.000001s : 4: predicate.opt_reshape 0.33% : 0.000001s : 4: predicate.parallel_virtual_node 2.13% : 0.000005s : 25: predicate.partial_defer_inline 1.55% : 0.000003s : 23: predicate.partial_eliminate 0.98% : 0.000002s : 15: predicate.print_const_string_wrapper 0.68% : 0.000001s : 8: predicate.reduce_all_const_elim 1.33% : 0.000003s : 15: predicate.reduce_eliminate 2.57% : 0.000005s : 42: predicate.redundant_stop_gradient_eliminater 0.53% : 0.000001s : 8: predicate.remove_not_recompute_node 1.34% : 0.000003s : 27: predicate.replace_applicator 0.49% : 0.000001s : 8: predicate.replace_old_param 0.19% : 0.000000s : 4: predicate.reset_defer_inline 1.05% : 0.000002s : 15: predicate.reshape_eliminate 0.55% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.70% : 0.000002s : 8: predicate.same_eliminate 0.38% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.76% : 0.000002s : 8: predicate.shard_identity_eliminate 0.75% : 0.000002s : 8: predicate.special_op_eliminate 0.63% : 0.000001s : 8: predicate.specialize_transform 0.75% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.90% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.26% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.80% : 0.000004s : 25: predicate.switch_defer_inline 2.35% : 0.000005s : 33: predicate.switch_layer_defer_inline 5.74% : 0.000012s : 83: predicate.switch_simplify 1.00% : 0.000002s : 15: predicate.tile_eliminate 0.97% : 0.000002s : 15: predicate.transpose_eliminate 1.62% : 0.000003s : 23: predicate.tuple_list_convert_item_index_to_positive 1.57% : 0.000003s : 23: predicate.tuple_list_get_item_const_eliminator 1.53% : 0.000003s : 23: predicate.tuple_list_get_item_depend_reorder 2.86% : 0.000006s : 35: predicate.tuple_list_get_item_eliminator 1.51% : 0.000003s : 23: predicate.tuple_list_get_set_item_eliminator 2.25% : 0.000005s : 31: predicate.tuple_list_set_item_eliminator 1.86% : 0.000004s : 27: predicate.tuple_to_list_eliminator_ 2.36% : 0.000005s : 42: predicate.updatestate_pure_node_eliminater 3.05% : 0.000007s : 50: predicate.updatestate_useless_node_eliminater 0.28% : 0.000001s : 4: predicate.value_based_eliminate 0.76% : 0.000002s : 8: predicate.virtual_dataset_eliminate 0.61% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.40% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001026 16 57.01% : 0.000585s : 8: func_graph_cloner_run.FuncGraphClonerGraph 42.99% : 0.000441s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 32.414888 196 0.00% : 0.000007s : 1: ForceFp32Comm 0.03% : 0.008642s : 1: add_attr 0.03% : 0.008625s : 1: add_attr_with_inline 0.00% : 0.000006s : 1: add_comm_op_reuse_tag 0.00% : 0.000070s : 1: add_recomputation 0.00% : 0.000003s : 1: assign_add_opt 0.00% : 0.000169s : 1: auto_monad 0.00% : 0.000027s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000017s : 1: bias_add_comm_swap 0.00% : 0.000955s : 1: bootstrap 0.00% : 0.000027s : 1: cconv 0.00% : 0.000003s : 1: comm_op_add_attrs 0.00% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000025s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000020s : 1: environ_conv 0.00% : 0.000068s : 1: event_method 0.00% : 0.000045s : 1: execute 0.00% : 0.000024s : 1: full_micro_interleaved_order_control 0.00% : 0.000005s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000006s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000006s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000007s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000008s : 1: label_fine_grained_interleaved_index 0.00% : 0.000018s : 1: label_micro_interleaved_index 0.00% : 0.000456s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000010s : 1: micro_interleaved_order_control 0.00% : 0.000583s : 1: mutable_eliminate 0.00% : 0.000006s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.00% : 0.001388s : 78: opt.transform.opt_a 0.00% : 0.000027s : 1: opt.transform.opt_after_cconv 0.00% : 0.000023s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000096s : 28: opt.transform.opt_b 0.00% : 0.000045s : 2: opt.transform.opt_trans_graph 0.00% : 0.000034s : 4: opt.transform.symbol_engine_opt 0.01% : 0.003513s : 1: opt_a 0.00% : 0.000099s : 1: opt_after_cconv 0.00% : 0.000456s : 1: opt_after_jit_grad 0.00% : 0.000193s : 1: opt_b 0.02% : 0.005947s : 1: optimize 0.00% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000056s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000030s : 1: overlap_grad_ring_attention 0.00% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000019s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000017s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000020s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000004s : 1: pipeline_split 0.00% : 0.000071s : 1: pre_auto_parallel 0.00% : 0.000010s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000011s : 1: remove_cast_before_assign_add 0.00% : 0.000016s : 1: remove_dup_value 0.00% : 0.000488s : 1: renormalize.infer 0.00% : 0.000488s : 1: renormalize.specialize 0.00% : 0.000004s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000022s : 1: rewriter_after_opt_a 0.00% : 0.000263s : 1: rewriter_before_opt_a 0.00% : 0.000008s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000017s : 1: split_matmul_comm_elemetwise 0.00% : 0.000031s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000075s : 1: symbol_engine_optimizer 99.73% : 32.326137s : 1: task_emit 0.00% : 0.000072s : 1: tuple_transform 0.17% : 0.055106s : 1: type_inference 0.00% : 0.000077s : 1: validate ..... TotalTime = 21.5411, [24] [bootstrap]: 0.00066788 [type_inference]: 0.0654269 [event_method]: 0.00015879 [auto_monad]: 0.00017352 [graph_reusing]: 1.015e-05 [inline]: 2.46e-06 [add_attr]: 0.00430556, [1] [add_attr_with_inline]: 0.00429637, [1] [Cycle 1]: 0.00011656, [2] [tag_attr]: 4.51e-05 [meta_addattr_fg_expand]: 1.373e-05 [parallel-infer-symbol]: 3.63e-06 [pre_auto_parallel]: 6.332e-05 [insert-virtual-dataset]: 2.51998e-06 [parallel-infer-symbol-second]: 7.59988e-07 [dataset_repeat_opt]: 2.10002e-06 [pipeline_split]: 2.04e-06 [optimize]: 0.0230752, [53] [py_interpret_to_execute]: 4.95999e-06 [rewriter_before_opt_a]: 0.00043873 [opt_a]: 0.0202494, [3] [Cycle 1]: 0.0155491, [45] [expand_dump_flag]: 5.05999e-06 [switch_simplify]: 0.0001723 [loop_unroll]: 8.072e-05 [a_1]: 0.00167346 [with_stream_mark]: 2.442e-05 [recompute_prepare]: 2.216e-05 [updatestate_depend_eliminate]: 9.44e-06 [updatestate_assign_eliminate]: 8.41002e-06 [updatestate_loads_eliminate]: 8.06001e-06 [parameter_eliminate]: 2.94001e-06 [a_2]: 0.00024786 [accelerated_algorithm]: 5.485e-05 [shard]: 1.77999e-06 [meta_shard_fg_expand]: 4.31002e-06 [shard_inline]: 1.703e-05 [merge_send_recv]: 1.766e-05 [auto_parallel]: 1.129e-05 [parallel]: 0.00011705 [flash_sp]: 1.071e-05 [merge_comm]: 1.095e-05 [allreduce_fusion]: 9.66998e-06 [matmul_add_comm_reduction]: 2.822e-05 [allreduce_slice_to_reducescatter]: 1.10001e-06 [virtual_shard_identity]: 1.856e-05 [virtual_dataset]: 1.563e-05 [get_grad_eliminate_]: 1.581e-05 [virtual_output]: 1.584e-05 [merge_forward]: 9.90002e-06 [cell_reuse_recompute_pass]: 1.14e-06 [offload_activation]: 1.747e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.981e-05 [merge_recompute_call_nodes]: 1.34e-06 [before_grad]: 2.815e-05 [set_forward_comm_id_for_comm_node_pass]: 9.64999e-06 [meta_fg_expand]: 0.0018122 [flash_sp_send_recv_attached]: 6.31e-06 [receive_attached]: 2.36e-06 [after_resolve]: 6.547e-05 [a_after_grad]: 8.74e-05 [renormalize]: 0.00979465 [add_forward_monad_depend]: 1.086e-05 [auto_monad_grad]: 5.69999e-06 [auto_monad_eliminator]: 6.299e-05 [cse]: 0.00033779 [a_3]: 0.00036765 [Cycle 2]: 0.0035782, [45] [expand_dump_flag]: 2.07999e-06 [switch_simplify]: 5.055e-05 [loop_unroll]: 4.862e-05 [a_1]: 0.00162156 [with_stream_mark]: 1.486e-05 [recompute_prepare]: 1.352e-05 [updatestate_depend_eliminate]: 6.68998e-06 [updatestate_assign_eliminate]: 5.99e-06 [updatestate_loads_eliminate]: 5.45001e-06 [parameter_eliminate]: 9.89996e-07 [a_2]: 0.00015777 [accelerated_algorithm]: 1.559e-05 [shard]: 9.60019e-07 [meta_shard_fg_expand]: 2.63998e-06 [shard_inline]: 1.187e-05 [merge_send_recv]: 8.73001e-06 [auto_parallel]: 9.00001e-06 [parallel]: 4.3e-06 [flash_sp]: 3.53999e-06 [merge_comm]: 7.58001e-06 [allreduce_fusion]: 6.64999e-06 [matmul_add_comm_reduction]: 1.124e-05 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 1.313e-05 [virtual_dataset]: 1.19e-05 [get_grad_eliminate_]: 1.254e-05 [virtual_output]: 1.176e-05 [merge_forward]: 6.06e-06 [cell_reuse_recompute_pass]: 8.10018e-07 [offload_activation]: 1.124e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.131e-05 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 1.92e-05 [set_forward_comm_id_for_comm_node_pass]: 7.36999e-06 [meta_fg_expand]: 7.162e-05 [flash_sp_send_recv_attached]: 9.89996e-07 [receive_attached]: 1.05999e-06 [after_resolve]: 1.708e-05 [a_after_grad]: 1.93e-05 [renormalize]: 0.00088808 [add_forward_monad_depend]: 4.37e-06 [auto_monad_grad]: 1.18001e-06 [auto_monad_eliminator]: 1.881e-05 [cse]: 9.961e-05 [a_3]: 8.596e-05 [Cycle 3]: 0.00110707, [45] [expand_dump_flag]: 1.24e-06 [switch_simplify]: 1.439e-05 [loop_unroll]: 1.221e-05 [a_1]: 0.00030985 [with_stream_mark]: 1.249e-05 [recompute_prepare]: 1.174e-05 [updatestate_depend_eliminate]: 6.44001e-06 [updatestate_assign_eliminate]: 5.57001e-06 [updatestate_loads_eliminate]: 5.69999e-06 [parameter_eliminate]: 9.00007e-07 [a_2]: 0.00015518 [accelerated_algorithm]: 1.491e-05 [shard]: 1.00999e-06 [meta_shard_fg_expand]: 2.26e-06 [shard_inline]: 1.135e-05 [merge_send_recv]: 8.47998e-06 [auto_parallel]: 8.80999e-06 [parallel]: 4.23001e-06 [flash_sp]: 9.79984e-07 [merge_comm]: 7.07002e-06 [allreduce_fusion]: 6.31e-06 [matmul_add_comm_reduction]: 9.95002e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 3.269e-05 [virtual_dataset]: 1.168e-05 [get_grad_eliminate_]: 1.13e-05 [virtual_output]: 1.121e-05 [merge_forward]: 6.59999e-06 [cell_reuse_recompute_pass]: 1.52999e-06 [offload_activation]: 1.14e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.187e-05 [merge_recompute_call_nodes]: 8.09989e-07 [before_grad]: 1.883e-05 [set_forward_comm_id_for_comm_node_pass]: 6.88e-06 [meta_fg_expand]: 4.25e-06 [flash_sp_send_recv_attached]: 9.30013e-07 [receive_attached]: 9.5999e-07 [after_resolve]: 1.459e-05 [a_after_grad]: 1.813e-05 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.32e-06 [auto_monad_grad]: 1.04998e-06 [auto_monad_eliminator]: 1.42e-05 [cse]: 3.483e-05 [a_3]: 7.863e-05 [py_interpret_to_execute_after_opt_a]: 5.10001e-06 [slice_cell_reuse_recomputed_activation]: 1.86e-06 [rewriter_after_opt_a]: 3.386e-05 [convert_after_rewriter]: 1.40999e-06 [order_py_execute_after_rewriter]: 1.34e-06 [mutable_eliminate]: 0.00049688 [opt_b]: 0.00037532, [1] [Cycle 1]: 0.00036876, [7] [b_1]: 0.00025763 [b_2]: 1.408e-05 [updatestate_depend_eliminate]: 9.00999e-06 [updatestate_assign_eliminate]: 5.57999e-06 [updatestate_loads_eliminate]: 5.79e-06 [renormalize]: 3.7998e-07 [cse]: 4.264e-05 [optimize_parallel_all_gather_comm]: 2.848e-05 [overlap_param_gather]: 2.06e-06 [cconv]: 2.11e-05 [loop_unroll]: 0.00045347 [opt_after_cconv]: 0.00016588, [1] [Cycle 1]: 0.0001599, [7] [c_1]: 6.188e-05 [parameter_eliminate]: 2.43e-06 [updatestate_depend_eliminate]: 8.96002e-06 [updatestate_assign_eliminate]: 5.87999e-06 [updatestate_loads_eliminate]: 5.54998e-06 [cse]: 4.116e-05 [renormalize]: 3.60014e-07 [remove_dup_value]: 4.248e-05 [tuple_transform]: 0.0001258, [1] [Cycle 1]: 0.00012123, [4] [d_1]: 8.808e-05 [none_parameter_eliminate]: 1.87999e-06 [renormalize]: 2.00002e-07 [switch_simplify]: 1.252e-05 [partial_unused_args_eliminate]: 2.16e-06 [add_recomputation]: 7.488e-05 [cse_after_recomputation]: 4.049e-05, [1] [Cycle 1]: 3.569e-05, [1] [cse]: 3.008e-05 [environ_conv]: 1.048e-05 [swap_dp_allreduce_reducescatter]: 1.036e-05 [bias_add_comm_swap]: 2.64999e-06 [label_micro_interleaved_index]: 4.86997e-06 [label_fine_grained_interleaved_index]: 2.89999e-06 [merge_cast_opt]: 1.51998e-06 [slice_recompute_activation]: 2.54999e-06 [micro_interleaved_order_control]: 2.79001e-06 [assign_add_opt]: 1.22e-06 [ForceFp32Comm]: 1.08001e-06 [remove_cast_before_assign_add]: 1.45999e-06 [full_micro_interleaved_order_control]: 2.34001e-06 [reorder_send_recv_between_fp_bp]: 2.59001e-06 [comm_op_add_attrs]: 1.04e-06 [add_comm_op_reuse_tag]: 1.07e-06 [interleave_split_concat_branches]: 1.14e-06 [interleave_parallel_branches]: 1.42e-06 [overlap_opt_shard_in_pipeline]: 3.006e-05 [overlap_opt_shard_grad_in_pipeline]: 1.77001e-06 [control_data_broadcast_order]: 2.196e-05 [grouped_pairwise_exchange_alltoall]: 1.53002e-06 [offloading_packed_experts]: 6.80002e-06 [overlap_recompute_and_grad_model_parallel]: 7.58001e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.41002e-06 [overlap_recompute_allgather_and_fa_grad]: 1.35999e-06 [overlap_recompute_comm]: 2.36998e-06 [overlap_grad_ring_attention]: 6.57002e-06 [overlap_grad_flash_sp]: 3.025e-05 [begin_end_overlap_inline]: 4.7998e-07 [split_matmul_comm_elemetwise]: 2.17001e-06 [split_layernorm_comm]: 1.87001e-06 [handle_group_info]: 1.35001e-06 [symbol_engine_optimizer]: 0.00011646, [1] [Cycle 1]: 0.00011173, [6] [build]: 1.12e-05 [elim_shapecalc]: 1.673e-05 [elim_not_effective]: 2.305e-05 [opt_reshape]: 1.292e-05 [fold_const_symbol]: 1.991e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.91998e-06 [pipeline_parallel_scheduler]: 1.42e-06 [auto_monad_reorder]: 2.774e-05 [get_jit_bprop_graph]: 1.28002e-06 [rewriter_after_jit_bprop_graph]: 3.58999e-06 [opt_after_jit_grad]: 0.00051255 [validate]: 8.912e-05 [backend_pass]: 1.09e-06 [task_emit]: 21.4463 [execute]: 1.084e-05 Sums bootstrap : 0.000668s : 0.00% type_inference : 0.065427s : 0.30% event_method : 0.000159s : 0.00% auto_monad : 0.000174s : 0.00% graph_reusing : 0.000010s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000045s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000014s : 0.00% parallel-infer-symbol : 0.000004s : 0.00% pre_auto_parallel : 0.000063s : 0.00% insert-virtual-dataset : 0.000003s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000439s : 0.00% optimize.opt_a.expand_dump_flag : 0.000008s : 0.00% optimize.opt_a.switch_simplify : 0.000237s : 0.00% optimize.opt_a.loop_unroll : 0.000142s : 0.00% optimize.opt_a.a_1 : 0.003605s : 0.02% optimize.opt_a.with_stream_mark : 0.000052s : 0.00% optimize.opt_a.recompute_prepare : 0.000047s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000023s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000020s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000019s : 0.00% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000561s : 0.00% optimize.opt_a.accelerated_algorithm : 0.000085s : 0.00% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000009s : 0.00% optimize.opt_a.shard_inline : 0.000040s : 0.00% optimize.opt_a.merge_send_recv : 0.000035s : 0.00% optimize.opt_a.auto_parallel : 0.000029s : 0.00% optimize.opt_a.parallel : 0.000126s : 0.00% optimize.opt_a.flash_sp : 0.000015s : 0.00% optimize.opt_a.merge_comm : 0.000026s : 0.00% optimize.opt_a.allreduce_fusion : 0.000023s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000049s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000064s : 0.00% optimize.opt_a.virtual_dataset : 0.000039s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000040s : 0.00% optimize.opt_a.virtual_output : 0.000039s : 0.00% optimize.opt_a.merge_forward : 0.000023s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000040s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000073s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000066s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000024s : 0.00% optimize.opt_a.meta_fg_expand : 0.001888s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.00% optimize.opt_a.receive_attached : 0.000004s : 0.00% optimize.opt_a.after_resolve : 0.000097s : 0.00% optimize.opt_a.a_after_grad : 0.000125s : 0.00% optimize.opt_a.renormalize : 0.010683s : 0.05% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.00% optimize.opt_a.auto_monad_grad : 0.000008s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000096s : 0.00% optimize.opt_a.cse : 0.000472s : 0.00% optimize.opt_a.a_3 : 0.000532s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000034s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000497s : 0.00% optimize.opt_b.b_1 : 0.000258s : 0.00% optimize.opt_b.b_2 : 0.000014s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000043s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000028s : 0.00% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000021s : 0.00% optimize.loop_unroll : 0.000453s : 0.00% optimize.opt_after_cconv.c_1 : 0.000062s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000009s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_after_cconv.cse : 0.000041s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000042s : 0.00% optimize.tuple_transform.d_1 : 0.000088s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000013s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000075s : 0.00% optimize.cse_after_recomputation.cse : 0.000030s : 0.00% optimize.environ_conv : 0.000010s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000010s : 0.00% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000003s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000030s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000022s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.00% optimize.overlap_grad_flash_sp : 0.000030s : 0.00% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000011s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000017s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000023s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000013s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000020s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000028s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000513s : 0.00% validate : 0.000089s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 21.446334s : 99.59% execute : 0.000011s : 0.00% Time group info: ------[substitution.] 0.000868 244 0.37% : 0.000003s : 7: substitution.elim_not_effective 0.93% : 0.000008s : 13: substitution.float_depend_g_call 0.52% : 0.000004s : 3: substitution.float_tuple_getitem_switch 0.36% : 0.000003s : 7: substitution.fold_const_symbol 1.07% : 0.000009s : 10: substitution.graph_param_transform 0.34% : 0.000003s : 2: substitution.incorporate_call 0.22% : 0.000002s : 2: substitution.incorporate_call_switch 58.45% : 0.000507s : 22: substitution.inline 1.86% : 0.000016s : 2: substitution.inline_without_move 1.39% : 0.000012s : 24: substitution.j_node_and_user_rematch 4.39% : 0.000038s : 3: substitution.less_batch_normalization 1.50% : 0.000013s : 11: substitution.minmaximum_grad 3.12% : 0.000027s : 13: substitution.partial_eliminate 1.85% : 0.000016s : 24: substitution.remove_not_recompute_node 2.57% : 0.000022s : 9: substitution.replace_applicator 1.17% : 0.000010s : 14: substitution.replace_old_param 0.31% : 0.000003s : 1: substitution.set_cell_output_no_recompute 2.75% : 0.000024s : 3: substitution.switch_simplify 1.31% : 0.000011s : 4: substitution.transpose_eliminate 3.16% : 0.000027s : 11: substitution.tuple_list_convert_item_index_to_positive 1.60% : 0.000014s : 11: substitution.tuple_list_get_item_const_eliminator 2.12% : 0.000018s : 11: substitution.tuple_list_get_item_depend_reorder 6.64% : 0.000058s : 26: substitution.tuple_list_get_item_eliminator 2.01% : 0.000017s : 11: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.065326 2 95.27% : 0.062236s : 1: type_inference.infer 4.73% : 0.003090s : 1: type_inference.specialize ------[replace.] 0.000278 37 58.55% : 0.000163s : 22: replace.inline 12.87% : 0.000036s : 3: replace.switch_simplify 28.57% : 0.000079s : 12: replace.tuple_list_get_item_eliminator ------[match.] 0.000544 37 91.20% : 0.000496s : 22: match.inline 4.00% : 0.000022s : 3: match.switch_simplify 4.80% : 0.000026s : 12: match.tuple_list_get_item_eliminator ------[predicate.] 0.000850 6287 1.06% : 0.000009s : 74: predicate.accumulaten_eliminater 0.33% : 0.000003s : 10: predicate.ad_related_special_op_eliminate 0.50% : 0.000004s : 35: predicate.addn_check_dump 1.10% : 0.000009s : 74: predicate.addn_zero_filter 1.03% : 0.000009s : 74: predicate.adjust_all_reduce_mul_add 2.03% : 0.000017s : 109: predicate.arithmetic_simplify 1.09% : 0.000009s : 74: predicate.cast_eliminate 1.08% : 0.000009s : 71: predicate.check_bprop_eliminate 0.52% : 0.000004s : 35: predicate.compare_switch_simplify 0.10% : 0.000001s : 10: predicate.const_output_eliminate 0.53% : 0.000004s : 35: predicate.depend_value_elim 1.16% : 0.000010s : 74: predicate.dict_get_item_const_eliminator 1.24% : 0.000011s : 74: predicate.dict_get_item_eliminator 1.07% : 0.000009s : 74: predicate.dict_set_item_eliminator 0.39% : 0.000003s : 20: predicate.dumpgradient_eliminate 0.12% : 0.000001s : 10: predicate.elim_not_effective 0.19% : 0.000002s : 10: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000010s : 84: predicate.environ_add_const_eliminate 1.20% : 0.000010s : 84: predicate.environ_get_add_eliminate 1.19% : 0.000010s : 84: predicate.environ_get_depend_swap 1.73% : 0.000015s : 119: predicate.environ_get_eliminate 1.19% : 0.000010s : 84: predicate.environ_get_set_eliminate 1.68% : 0.000014s : 108: predicate.exchange_switch_depend_value 2.29% : 0.000019s : 108: predicate.float_depend_g_call 0.52% : 0.000004s : 35: predicate.float_environ_get_switch 0.67% : 0.000006s : 45: predicate.float_tuple_getitem_switch 0.09% : 0.000001s : 10: predicate.fold_const_symbol 0.59% : 0.000005s : 35: predicate.get_grad_eliminate 0.10% : 0.000001s : 10: predicate.graph_param_transform 0.54% : 0.000005s : 35: predicate.incorporate_call 0.48% : 0.000004s : 35: predicate.incorporate_call_switch 5.53% : 0.000047s : 272: predicate.inline 1.24% : 0.000011s : 61: predicate.inline_without_move 0.30% : 0.000003s : 35: predicate.j_node_and_user_rematch 0.67% : 0.000006s : 35: predicate.less_batch_normalization 1.62% : 0.000014s : 106: predicate.list_to_tuple_eliminator_ 2.60% : 0.000022s : 180: predicate.load_eliminater 0.37% : 0.000003s : 10: predicate.loop_unroll_after_grad 2.47% : 0.000021s : 155: predicate.loop_unroll_before_grad 1.40% : 0.000012s : 94: predicate.make_slice_get_slice_eliminator 0.55% : 0.000005s : 35: predicate.merge_addn 1.07% : 0.000009s : 71: predicate.micro_step_allgather_replace 1.06% : 0.000009s : 71: predicate.mini_step_allgather_replace 1.08% : 0.000009s : 74: predicate.minmaximum_grad 0.38% : 0.000003s : 10: predicate.mutable_eliminate 0.18% : 0.000001s : 10: predicate.opt_reshape 0.20% : 0.000002s : 10: predicate.parallel_virtual_node 2.02% : 0.000017s : 108: predicate.partial_defer_inline 1.64% : 0.000014s : 96: predicate.partial_eliminate 1.06% : 0.000009s : 74: predicate.print_const_string_wrapper 0.53% : 0.000005s : 35: predicate.reduce_all_const_elim 1.42% : 0.000012s : 74: predicate.reduce_eliminate 2.59% : 0.000022s : 180: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000003s : 35: predicate.remove_not_recompute_node 1.73% : 0.000015s : 157: predicate.replace_applicator 0.59% : 0.000005s : 61: predicate.replace_old_param 0.11% : 0.000001s : 10: predicate.reset_defer_inline 1.08% : 0.000009s : 74: predicate.reshape_eliminate 1.10% : 0.000009s : 71: predicate.row_tensor_add_zeros_like 0.21% : 0.000002s : 10: predicate.row_tensor_eliminate 1.26% : 0.000011s : 71: predicate.same_eliminate 0.35% : 0.000003s : 35: predicate.set_cell_output_no_recompute 0.64% : 0.000005s : 35: predicate.shard_identity_eliminate 0.36% : 0.000003s : 20: predicate.special_op_eliminate 0.61% : 0.000005s : 35: predicate.specialize_transform 1.18% : 0.000010s : 71: predicate.split_environ_get_set_with_tuple_value 1.14% : 0.000010s : 61: predicate.stack_unstack_eliminate 0.17% : 0.000001s : 10: predicate.switch_call_monad_eliminater 1.78% : 0.000015s : 108: predicate.switch_defer_inline 2.83% : 0.000024s : 179: predicate.switch_layer_defer_inline 5.38% : 0.000046s : 314: predicate.switch_simplify 1.10% : 0.000009s : 74: predicate.tile_eliminate 1.15% : 0.000010s : 74: predicate.transpose_eliminate 1.50% : 0.000013s : 94: predicate.tuple_list_convert_item_index_to_positive 1.60% : 0.000014s : 94: predicate.tuple_list_get_item_const_eliminator 1.46% : 0.000012s : 94: predicate.tuple_list_get_item_depend_reorder 2.71% : 0.000023s : 141: predicate.tuple_list_get_item_eliminator 1.50% : 0.000013s : 94: predicate.tuple_list_get_set_item_eliminator 2.12% : 0.000018s : 129: predicate.tuple_list_set_item_eliminator 1.68% : 0.000014s : 106: predicate.tuple_to_list_eliminator_ 2.53% : 0.000022s : 180: predicate.updatestate_pure_node_eliminater 3.16% : 0.000027s : 215: predicate.updatestate_useless_node_eliminater 0.17% : 0.000001s : 10: predicate.value_based_eliminate 0.59% : 0.000005s : 35: predicate.virtual_dataset_eliminate 0.58% : 0.000005s : 35: predicate.virtual_output_eliminate 0.16% : 0.000001s : 10: predicate.virtual_view_grad_eliminate 0.20% : 0.000002s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002934 46 61.54% : 0.001806s : 20: func_graph_cloner_run.FuncGraphClonerGraph 38.46% : 0.001129s : 26: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 21.585310 237 0.00% : 0.000004s : 1: ForceFp32Comm 0.02% : 0.004310s : 1: add_attr 0.02% : 0.004301s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000079s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.00% : 0.000185s : 1: auto_monad 0.00% : 0.000032s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000006s : 1: bias_add_comm_swap 0.00% : 0.000704s : 1: bootstrap 0.00% : 0.000025s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000025s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000044s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000014s : 1: environ_conv 0.00% : 0.000170s : 1: event_method 0.00% : 0.000019s : 1: execute 0.00% : 0.000005s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000014s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000008s : 1: label_micro_interleaved_index 0.00% : 0.000463s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.00% : 0.000506s : 1: mutable_eliminate 0.00% : 0.000010s : 1: offloading_packed_experts 0.00% : 0.000021s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000022s : 1: opt.transform.mutable_eliminate 0.03% : 0.005664s : 117: opt.transform.opt_a 0.00% : 0.000061s : 1: opt.transform.opt_after_cconv 0.00% : 0.000043s : 1: opt.transform.opt_after_jit_grad 0.00% : 0.000245s : 28: opt.transform.opt_b 0.00% : 0.000099s : 2: opt.transform.opt_trans_graph 0.00% : 0.000069s : 4: opt.transform.symbol_engine_opt 0.09% : 0.020253s : 1: opt_a 0.00% : 0.000169s : 1: opt_after_cconv 0.00% : 0.000522s : 1: opt_after_jit_grad 0.00% : 0.000379s : 1: opt_b 0.11% : 0.023079s : 1: optimize 0.00% : 0.000032s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000033s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000010s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000034s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000010s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000068s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.00% : 0.000047s : 1: remove_dup_value 0.04% : 0.008626s : 2: renormalize.infer 0.01% : 0.002042s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000037s : 1: rewriter_after_opt_a 0.00% : 0.000447s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.00% : 0.000013s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000119s : 1: symbol_engine_optimizer 99.36% : 21.446360s : 1: task_emit 0.00% : 0.000129s : 1: tuple_transform 0.30% : 0.065446s : 1: type_inference 0.00% : 0.000119s : 1: validate .. TotalTime = 0.123207, [24] [bootstrap]: 0.00064204 [type_inference]: 0.0667019 [event_method]: 0.00011153 [auto_monad]: 0.00018318 [graph_reusing]: 1.019e-05 [inline]: 1.82999e-06 [add_attr]: 0.00384077, [1] [add_attr_with_inline]: 0.00383118, [1] [Cycle 1]: 9.145e-05, [2] [tag_attr]: 4.52e-05 [meta_addattr_fg_expand]: 1.345e-05 [parallel-infer-symbol]: 2.91e-06 [pre_auto_parallel]: 6.91e-05 [insert-virtual-dataset]: 2.44001e-06 [parallel-infer-symbol-second]: 7.2e-07 [dataset_repeat_opt]: 2.34999e-06 [pipeline_split]: 1.52001e-06 [optimize]: 0.0264771, [53] [py_interpret_to_execute]: 4.67e-06 [rewriter_before_opt_a]: 0.00045349 [opt_a]: 0.0234946, [3] [Cycle 1]: 0.0185414, [45] [expand_dump_flag]: 5.14e-06 [switch_simplify]: 0.00017193 [loop_unroll]: 8.025e-05 [a_1]: 0.00166398 [with_stream_mark]: 2.571e-05 [recompute_prepare]: 2.306e-05 [updatestate_depend_eliminate]: 9.67001e-06 [updatestate_assign_eliminate]: 8.1e-06 [updatestate_loads_eliminate]: 8.24002e-06 [parameter_eliminate]: 2.80002e-06 [a_2]: 0.00024124 [accelerated_algorithm]: 4.934e-05 [shard]: 1.60999e-06 [meta_shard_fg_expand]: 4.53001e-06 [shard_inline]: 1.627e-05 [merge_send_recv]: 1.649e-05 [auto_parallel]: 1.158e-05 [parallel]: 3.005e-05 [flash_sp]: 9.50001e-06 [merge_comm]: 9.86e-06 [allreduce_fusion]: 9.37999e-06 [matmul_add_comm_reduction]: 2.751e-05 [allreduce_slice_to_reducescatter]: 5.8001e-07 [virtual_shard_identity]: 1.791e-05 [virtual_dataset]: 1.615e-05 [get_grad_eliminate_]: 1.534e-05 [virtual_output]: 1.522e-05 [merge_forward]: 9.66e-06 [cell_reuse_recompute_pass]: 1.25999e-06 [offload_activation]: 1.748e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.918e-05 [merge_recompute_call_nodes]: 1.75001e-06 [before_grad]: 2.75e-05 [set_forward_comm_id_for_comm_node_pass]: 9.55001e-06 [meta_fg_expand]: 0.00166264 [flash_sp_send_recv_attached]: 6.36e-06 [receive_attached]: 2.49001e-06 [after_resolve]: 6.92e-05 [a_after_grad]: 8.654e-05 [renormalize]: 0.0129703 [add_forward_monad_depend]: 1.094e-05 [auto_monad_grad]: 6.86999e-06 [auto_monad_eliminator]: 6.874e-05 [cse]: 0.00039249 [a_3]: 0.00037668 [Cycle 2]: 0.00384879, [45] [expand_dump_flag]: 2.36998e-06 [switch_simplify]: 5.064e-05 [loop_unroll]: 4.713e-05 [a_1]: 0.00167184 [with_stream_mark]: 1.816e-05 [recompute_prepare]: 1.421e-05 [updatestate_depend_eliminate]: 7.45003e-06 [updatestate_assign_eliminate]: 6.19999e-06 [updatestate_loads_eliminate]: 5.55001e-06 [parameter_eliminate]: 1.15999e-06 [a_2]: 0.00015757 [accelerated_algorithm]: 1.738e-05 [shard]: 1.14e-06 [meta_shard_fg_expand]: 2.98998e-06 [shard_inline]: 1.181e-05 [merge_send_recv]: 8.98002e-06 [auto_parallel]: 1.013e-05 [parallel]: 5.10001e-06 [flash_sp]: 4.1e-06 [merge_comm]: 7.93001e-06 [allreduce_fusion]: 6.49999e-06 [matmul_add_comm_reduction]: 1.232e-05 [allreduce_slice_to_reducescatter]: 8.09989e-07 [virtual_shard_identity]: 1.362e-05 [virtual_dataset]: 1.202e-05 [get_grad_eliminate_]: 1.159e-05 [virtual_output]: 1.121e-05 [merge_forward]: 6.06e-06 [cell_reuse_recompute_pass]: 1.42e-06 [offload_activation]: 1.249e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.23e-05 [merge_recompute_call_nodes]: 9.20001e-07 [before_grad]: 1.952e-05 [set_forward_comm_id_for_comm_node_pass]: 7.4e-06 [meta_fg_expand]: 9.789e-05 [flash_sp_send_recv_attached]: 1.07e-06 [receive_attached]: 1.45999e-06 [after_resolve]: 1.862e-05 [a_after_grad]: 1.882e-05 [renormalize]: 0.00105235 [add_forward_monad_depend]: 4.39998e-06 [auto_monad_grad]: 1.30999e-06 [auto_monad_eliminator]: 1.865e-05 [cse]: 0.00011347 [a_3]: 8.642e-05 [Cycle 3]: 0.00108919, [45] [expand_dump_flag]: 1.25999e-06 [switch_simplify]: 1.389e-05 [loop_unroll]: 1.256e-05 [a_1]: 0.00031172 [with_stream_mark]: 1.324e-05 [recompute_prepare]: 1.199e-05 [updatestate_depend_eliminate]: 6.49999e-06 [updatestate_assign_eliminate]: 5.74e-06 [updatestate_loads_eliminate]: 5.87999e-06 [parameter_eliminate]: 1.05001e-06 [a_2]: 0.00015562 [accelerated_algorithm]: 1.499e-05 [shard]: 1.00999e-06 [meta_shard_fg_expand]: 2.17999e-06 [shard_inline]: 1.147e-05 [merge_send_recv]: 8.57e-06 [auto_parallel]: 9.00001e-06 [parallel]: 4.69002e-06 [flash_sp]: 9.50007e-07 [merge_comm]: 7.2e-06 [allreduce_fusion]: 6.41e-06 [matmul_add_comm_reduction]: 1.166e-05 [allreduce_slice_to_reducescatter]: 3.7998e-07 [virtual_shard_identity]: 1.355e-05 [virtual_dataset]: 1.166e-05 [get_grad_eliminate_]: 1.125e-05 [virtual_output]: 1.11e-05 [merge_forward]: 6.42001e-06 [cell_reuse_recompute_pass]: 1.66e-06 [offload_activation]: 1.15e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.1e-05 [merge_recompute_call_nodes]: 7.30011e-07 [before_grad]: 1.882e-05 [set_forward_comm_id_for_comm_node_pass]: 6.89001e-06 [meta_fg_expand]: 4.37e-06 [flash_sp_send_recv_attached]: 9.79984e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 1.518e-05 [a_after_grad]: 1.824e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.33002e-06 [auto_monad_grad]: 9.89996e-07 [auto_monad_eliminator]: 1.392e-05 [cse]: 3.488e-05 [a_3]: 7.563e-05 [py_interpret_to_execute_after_opt_a]: 4.72998e-06 [slice_cell_reuse_recomputed_activation]: 2.66999e-06 [rewriter_after_opt_a]: 3.017e-05 [convert_after_rewriter]: 1.21002e-06 [order_py_execute_after_rewriter]: 1.20999e-06 [mutable_eliminate]: 0.00061989 [opt_b]: 0.00038162, [1] [Cycle 1]: 0.00037485, [7] [b_1]: 0.00025727 [b_2]: 1.576e-05 [updatestate_depend_eliminate]: 9.04e-06 [updatestate_assign_eliminate]: 6.07001e-06 [updatestate_loads_eliminate]: 6.58e-06 [renormalize]: 5.19998e-07 [cse]: 4.507e-05 [optimize_parallel_all_gather_comm]: 2.57e-05 [overlap_param_gather]: 2.01e-06 [cconv]: 2.182e-05 [loop_unroll]: 0.00046327 [opt_after_cconv]: 0.00017926, [1] [Cycle 1]: 0.0001737, [7] [c_1]: 6.629e-05 [parameter_eliminate]: 2.56e-06 [updatestate_depend_eliminate]: 9.57001e-06 [updatestate_assign_eliminate]: 6.60002e-06 [updatestate_loads_eliminate]: 6.39001e-06 [cse]: 4.8e-05 [renormalize]: 5.19998e-07 [remove_dup_value]: 4.252e-05 [tuple_transform]: 0.00012657, [1] [Cycle 1]: 0.00012183, [4] [d_1]: 8.805e-05 [none_parameter_eliminate]: 1.84e-06 [renormalize]: 2.10013e-07 [switch_simplify]: 1.363e-05 [partial_unused_args_eliminate]: 1.82001e-06 [add_recomputation]: 7.626e-05 [cse_after_recomputation]: 4.484e-05, [1] [Cycle 1]: 4.002e-05, [1] [cse]: 3.419e-05 [environ_conv]: 1.04e-05 [swap_dp_allreduce_reducescatter]: 1.087e-05 [bias_add_comm_swap]: 2.46998e-06 [label_micro_interleaved_index]: 4.65001e-06 [label_fine_grained_interleaved_index]: 2.64001e-06 [merge_cast_opt]: 1.39e-06 [slice_recompute_activation]: 2.32001e-06 [micro_interleaved_order_control]: 2.33998e-06 [assign_add_opt]: 1.21002e-06 [ForceFp32Comm]: 7.60017e-07 [remove_cast_before_assign_add]: 1.14e-06 [full_micro_interleaved_order_control]: 2.41e-06 [reorder_send_recv_between_fp_bp]: 2.68998e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 1.01002e-06 [interleave_split_concat_branches]: 1.12999e-06 [interleave_parallel_branches]: 1.47999e-06 [overlap_opt_shard_in_pipeline]: 4.55999e-06 [overlap_opt_shard_grad_in_pipeline]: 1.86003e-06 [control_data_broadcast_order]: 2.468e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 7.08e-06 [overlap_recompute_and_grad_model_parallel]: 7.67998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.37999e-06 [overlap_recompute_comm]: 2.21e-06 [overlap_grad_ring_attention]: 6.81999e-06 [overlap_grad_flash_sp]: 3.251e-05 [begin_end_overlap_inline]: 4.90021e-07 [split_matmul_comm_elemetwise]: 2.12001e-06 [split_layernorm_comm]: 1.74e-06 [handle_group_info]: 1.15999e-06 [symbol_engine_optimizer]: 0.00012255, [1] [Cycle 1]: 0.0001182, [6] [build]: 1.175e-05 [elim_shapecalc]: 1.777e-05 [elim_not_effective]: 2.43e-05 [opt_reshape]: 1.438e-05 [fold_const_symbol]: 2.052e-05 [renormalize]: 1.90019e-07 [detach_backward]: 2.07001e-06 [pipeline_parallel_scheduler]: 1.72999e-06 [auto_monad_reorder]: 3.481e-05 [get_jit_bprop_graph]: 1.32999e-06 [rewriter_after_jit_bprop_graph]: 3.56999e-06 [opt_after_jit_grad]: 0.00051848 [validate]: 6.337e-05 [backend_pass]: 1.02998e-06 [task_emit]: 0.024303 [execute]: 7.61999e-06 Sums bootstrap : 0.000642s : 0.54% type_inference : 0.066702s : 56.49% event_method : 0.000112s : 0.09% auto_monad : 0.000183s : 0.16% graph_reusing : 0.000010s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000045s : 0.04% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000013s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000069s : 0.06% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.00% optimize.rewriter_before_opt_a : 0.000453s : 0.38% optimize.opt_a.expand_dump_flag : 0.000009s : 0.01% optimize.opt_a.switch_simplify : 0.000236s : 0.20% optimize.opt_a.loop_unroll : 0.000140s : 0.12% optimize.opt_a.a_1 : 0.003648s : 3.09% optimize.opt_a.with_stream_mark : 0.000057s : 0.05% optimize.opt_a.recompute_prepare : 0.000049s : 0.04% optimize.opt_a.updatestate_depend_eliminate : 0.000024s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000020s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000020s : 0.02% optimize.opt_a.parameter_eliminate : 0.000005s : 0.00% optimize.opt_a.a_2 : 0.000554s : 0.47% optimize.opt_a.accelerated_algorithm : 0.000082s : 0.07% optimize.opt_a.shard : 0.000004s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000010s : 0.01% optimize.opt_a.shard_inline : 0.000040s : 0.03% optimize.opt_a.merge_send_recv : 0.000034s : 0.03% optimize.opt_a.auto_parallel : 0.000031s : 0.03% optimize.opt_a.parallel : 0.000040s : 0.03% optimize.opt_a.flash_sp : 0.000015s : 0.01% optimize.opt_a.merge_comm : 0.000025s : 0.02% optimize.opt_a.allreduce_fusion : 0.000022s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000051s : 0.04% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000002s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000045s : 0.04% optimize.opt_a.virtual_dataset : 0.000040s : 0.03% optimize.opt_a.get_grad_eliminate_ : 0.000038s : 0.03% optimize.opt_a.virtual_output : 0.000038s : 0.03% optimize.opt_a.merge_forward : 0.000022s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000004s : 0.00% optimize.opt_a.offload_activation : 0.000041s : 0.04% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000072s : 0.06% optimize.opt_a.merge_recompute_call_nodes : 0.000003s : 0.00% optimize.opt_a.before_grad : 0.000066s : 0.06% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000024s : 0.02% optimize.opt_a.meta_fg_expand : 0.001765s : 1.49% optimize.opt_a.flash_sp_send_recv_attached : 0.000008s : 0.01% optimize.opt_a.receive_attached : 0.000005s : 0.00% optimize.opt_a.after_resolve : 0.000103s : 0.09% optimize.opt_a.a_after_grad : 0.000124s : 0.10% optimize.opt_a.renormalize : 0.014023s : 11.88% optimize.opt_a.add_forward_monad_depend : 0.000017s : 0.01% optimize.opt_a.auto_monad_grad : 0.000009s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000101s : 0.09% optimize.opt_a.cse : 0.000541s : 0.46% optimize.opt_a.a_3 : 0.000539s : 0.46% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000003s : 0.00% optimize.rewriter_after_opt_a : 0.000030s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000620s : 0.52% optimize.opt_b.b_1 : 0.000257s : 0.22% optimize.opt_b.b_2 : 0.000016s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000009s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000006s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000007s : 0.01% optimize.opt_b.renormalize : 0.000001s : 0.00% optimize.opt_b.cse : 0.000045s : 0.04% optimize.optimize_parallel_all_gather_comm : 0.000026s : 0.02% optimize.overlap_param_gather : 0.000002s : 0.00% optimize.cconv : 0.000022s : 0.02% optimize.loop_unroll : 0.000463s : 0.39% optimize.opt_after_cconv.c_1 : 0.000066s : 0.06% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000010s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000007s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.cse : 0.000048s : 0.04% optimize.opt_after_cconv.renormalize : 0.000001s : 0.00% optimize.remove_dup_value : 0.000043s : 0.04% optimize.tuple_transform.d_1 : 0.000088s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000014s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000076s : 0.06% optimize.cse_after_recomputation.cse : 0.000034s : 0.03% optimize.environ_conv : 0.000010s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000011s : 0.01% optimize.bias_add_comm_swap : 0.000002s : 0.00% optimize.label_micro_interleaved_index : 0.000005s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000002s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000005s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000025s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000007s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000008s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000007s : 0.01% optimize.overlap_grad_flash_sp : 0.000033s : 0.03% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000012s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000018s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000024s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000014s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000021s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000035s : 0.03% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000518s : 0.44% validate : 0.000063s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.024303s : 20.58% execute : 0.000008s : 0.01% Time group info: ------[substitution.] 0.000886 244 0.44% : 0.000004s : 7: substitution.elim_not_effective 0.94% : 0.000008s : 13: substitution.float_depend_g_call 0.43% : 0.000004s : 3: substitution.float_tuple_getitem_switch 0.34% : 0.000003s : 7: substitution.fold_const_symbol 1.08% : 0.000010s : 10: substitution.graph_param_transform 0.34% : 0.000003s : 2: substitution.incorporate_call 0.22% : 0.000002s : 2: substitution.incorporate_call_switch 60.58% : 0.000537s : 22: substitution.inline 1.85% : 0.000016s : 2: substitution.inline_without_move 1.41% : 0.000012s : 24: substitution.j_node_and_user_rematch 3.80% : 0.000034s : 3: substitution.less_batch_normalization 1.43% : 0.000013s : 11: substitution.minmaximum_grad 2.15% : 0.000019s : 13: substitution.partial_eliminate 1.87% : 0.000017s : 24: substitution.remove_not_recompute_node 2.68% : 0.000024s : 9: substitution.replace_applicator 1.13% : 0.000010s : 14: substitution.replace_old_param 0.27% : 0.000002s : 1: substitution.set_cell_output_no_recompute 2.51% : 0.000022s : 3: substitution.switch_simplify 1.35% : 0.000012s : 4: substitution.transpose_eliminate 3.18% : 0.000028s : 11: substitution.tuple_list_convert_item_index_to_positive 1.52% : 0.000013s : 11: substitution.tuple_list_get_item_const_eliminator 2.00% : 0.000018s : 11: substitution.tuple_list_get_item_depend_reorder 6.42% : 0.000057s : 26: substitution.tuple_list_get_item_eliminator 2.06% : 0.000018s : 11: substitution.tuple_list_get_set_item_eliminator ------[type_inference.] 0.066586 2 95.10% : 0.063321s : 1: type_inference.infer 4.90% : 0.003265s : 1: type_inference.specialize ------[replace.] 0.000290 37 58.55% : 0.000170s : 22: replace.inline 12.75% : 0.000037s : 3: replace.switch_simplify 28.70% : 0.000083s : 12: replace.tuple_list_get_item_eliminator ------[match.] 0.000572 37 91.95% : 0.000526s : 22: match.inline 3.53% : 0.000020s : 3: match.switch_simplify 4.51% : 0.000026s : 12: match.tuple_list_get_item_eliminator ------[predicate.] 0.000881 6287 1.10% : 0.000010s : 74: predicate.accumulaten_eliminater 0.37% : 0.000003s : 10: predicate.ad_related_special_op_eliminate 0.51% : 0.000004s : 35: predicate.addn_check_dump 1.11% : 0.000010s : 74: predicate.addn_zero_filter 1.00% : 0.000009s : 74: predicate.adjust_all_reduce_mul_add 2.33% : 0.000020s : 109: predicate.arithmetic_simplify 1.19% : 0.000010s : 74: predicate.cast_eliminate 1.06% : 0.000009s : 71: predicate.check_bprop_eliminate 0.51% : 0.000004s : 35: predicate.compare_switch_simplify 0.09% : 0.000001s : 10: predicate.const_output_eliminate 0.51% : 0.000005s : 35: predicate.depend_value_elim 1.14% : 0.000010s : 74: predicate.dict_get_item_const_eliminator 1.20% : 0.000011s : 74: predicate.dict_get_item_eliminator 1.06% : 0.000009s : 74: predicate.dict_set_item_eliminator 0.40% : 0.000004s : 20: predicate.dumpgradient_eliminate 0.11% : 0.000001s : 10: predicate.elim_not_effective 0.18% : 0.000002s : 10: predicate.elim_shapecalc_of_broadcastargs 1.19% : 0.000010s : 84: predicate.environ_add_const_eliminate 1.16% : 0.000010s : 84: predicate.environ_get_add_eliminate 1.17% : 0.000010s : 84: predicate.environ_get_depend_swap 1.70% : 0.000015s : 119: predicate.environ_get_eliminate 1.18% : 0.000010s : 84: predicate.environ_get_set_eliminate 1.63% : 0.000014s : 108: predicate.exchange_switch_depend_value 2.22% : 0.000020s : 108: predicate.float_depend_g_call 0.50% : 0.000004s : 35: predicate.float_environ_get_switch 0.67% : 0.000006s : 45: predicate.float_tuple_getitem_switch 0.09% : 0.000001s : 10: predicate.fold_const_symbol 0.57% : 0.000005s : 35: predicate.get_grad_eliminate 0.10% : 0.000001s : 10: predicate.graph_param_transform 0.51% : 0.000004s : 35: predicate.incorporate_call 0.48% : 0.000004s : 35: predicate.incorporate_call_switch 5.31% : 0.000047s : 272: predicate.inline 1.21% : 0.000011s : 61: predicate.inline_without_move 0.28% : 0.000002s : 35: predicate.j_node_and_user_rematch 0.87% : 0.000008s : 35: predicate.less_batch_normalization 1.69% : 0.000015s : 106: predicate.list_to_tuple_eliminator_ 2.58% : 0.000023s : 180: predicate.load_eliminater 0.38% : 0.000003s : 10: predicate.loop_unroll_after_grad 2.37% : 0.000021s : 155: predicate.loop_unroll_before_grad 1.41% : 0.000012s : 94: predicate.make_slice_get_slice_eliminator 0.56% : 0.000005s : 35: predicate.merge_addn 1.05% : 0.000009s : 71: predicate.micro_step_allgather_replace 1.03% : 0.000009s : 71: predicate.mini_step_allgather_replace 1.05% : 0.000009s : 74: predicate.minmaximum_grad 0.42% : 0.000004s : 10: predicate.mutable_eliminate 0.19% : 0.000002s : 10: predicate.opt_reshape 0.17% : 0.000001s : 10: predicate.parallel_virtual_node 2.04% : 0.000018s : 108: predicate.partial_defer_inline 1.60% : 0.000014s : 96: predicate.partial_eliminate 1.07% : 0.000009s : 74: predicate.print_const_string_wrapper 0.52% : 0.000005s : 35: predicate.reduce_all_const_elim 1.55% : 0.000014s : 74: predicate.reduce_eliminate 2.54% : 0.000022s : 180: predicate.redundant_stop_gradient_eliminater 0.31% : 0.000003s : 35: predicate.remove_not_recompute_node 1.67% : 0.000015s : 157: predicate.replace_applicator 0.58% : 0.000005s : 61: predicate.replace_old_param 0.11% : 0.000001s : 10: predicate.reset_defer_inline 1.16% : 0.000010s : 74: predicate.reshape_eliminate 1.18% : 0.000010s : 71: predicate.row_tensor_add_zeros_like 0.18% : 0.000002s : 10: predicate.row_tensor_eliminate 1.52% : 0.000013s : 71: predicate.same_eliminate 0.36% : 0.000003s : 35: predicate.set_cell_output_no_recompute 0.75% : 0.000007s : 35: predicate.shard_identity_eliminate 0.36% : 0.000003s : 20: predicate.special_op_eliminate 0.58% : 0.000005s : 35: predicate.specialize_transform 1.24% : 0.000011s : 71: predicate.split_environ_get_set_with_tuple_value 1.16% : 0.000010s : 61: predicate.stack_unstack_eliminate 0.18% : 0.000002s : 10: predicate.switch_call_monad_eliminater 1.74% : 0.000015s : 108: predicate.switch_defer_inline 2.75% : 0.000024s : 179: predicate.switch_layer_defer_inline 5.19% : 0.000046s : 314: predicate.switch_simplify 1.09% : 0.000010s : 74: predicate.tile_eliminate 1.15% : 0.000010s : 74: predicate.transpose_eliminate 1.49% : 0.000013s : 94: predicate.tuple_list_convert_item_index_to_positive 1.59% : 0.000014s : 94: predicate.tuple_list_get_item_const_eliminator 1.45% : 0.000013s : 94: predicate.tuple_list_get_item_depend_reorder 2.68% : 0.000024s : 141: predicate.tuple_list_get_item_eliminator 1.56% : 0.000014s : 94: predicate.tuple_list_get_set_item_eliminator 2.19% : 0.000019s : 129: predicate.tuple_list_set_item_eliminator 1.63% : 0.000014s : 106: predicate.tuple_to_list_eliminator_ 2.48% : 0.000022s : 180: predicate.updatestate_pure_node_eliminater 3.09% : 0.000027s : 215: predicate.updatestate_useless_node_eliminater 0.17% : 0.000001s : 10: predicate.value_based_eliminate 0.59% : 0.000005s : 35: predicate.virtual_dataset_eliminate 0.56% : 0.000005s : 35: predicate.virtual_output_eliminate 0.15% : 0.000001s : 10: predicate.virtual_view_grad_eliminate 0.19% : 0.000002s : 10: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003248 46 60.47% : 0.001964s : 20: func_graph_cloner_run.FuncGraphClonerGraph 39.53% : 0.001284s : 26: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.173700 237 0.00% : 0.000003s : 1: ForceFp32Comm 2.21% : 0.003845s : 1: add_attr 2.21% : 0.003835s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.05% : 0.000080s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.11% : 0.000192s : 1: auto_monad 0.02% : 0.000040s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000005s : 1: bias_add_comm_swap 0.39% : 0.000676s : 1: bootstrap 0.01% : 0.000025s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000028s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.03% : 0.000048s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.01% : 0.000014s : 1: environ_conv 0.07% : 0.000120s : 1: event_method 0.01% : 0.000013s : 1: execute 0.00% : 0.000006s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.01% : 0.000015s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000007s : 1: label_micro_interleaved_index 0.27% : 0.000472s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.36% : 0.000629s : 1: mutable_eliminate 0.01% : 0.000010s : 1: offloading_packed_experts 0.01% : 0.000022s : 1: opt.transform.loop_unroll_optimizer 0.01% : 0.000024s : 1: opt.transform.mutable_eliminate 3.27% : 0.005682s : 117: opt.transform.opt_a 0.04% : 0.000065s : 1: opt.transform.opt_after_cconv 0.03% : 0.000046s : 1: opt.transform.opt_after_jit_grad 0.14% : 0.000246s : 28: opt.transform.opt_b 0.06% : 0.000100s : 2: opt.transform.opt_trans_graph 0.04% : 0.000073s : 4: opt.transform.symbol_engine_opt 13.53% : 0.023501s : 1: opt_a 0.11% : 0.000183s : 1: opt_after_cconv 0.30% : 0.000527s : 1: opt_after_jit_grad 0.22% : 0.000385s : 1: opt_b 15.25% : 0.026481s : 1: optimize 0.02% : 0.000029s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.02% : 0.000036s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000011s : 1: overlap_grad_ring_attention 0.00% : 0.000006s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000008s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000005s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000011s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.04% : 0.000074s : 1: pre_auto_parallel 0.00% : 0.000008s : 1: py_interpret_to_execute 0.00% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000047s : 1: remove_dup_value 6.61% : 0.011476s : 2: renormalize.infer 1.45% : 0.002527s : 2: renormalize.specialize 0.00% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.02% : 0.000033s : 1: rewriter_after_opt_a 0.27% : 0.000461s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000004s : 1: split_layernorm_comm 0.00% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000014s : 1: swap_dp_allreduce_reducescatter 0.07% : 0.000125s : 1: symbol_engine_optimizer 14.00% : 0.024315s : 1: task_emit 0.07% : 0.000129s : 1: tuple_transform 38.41% : 0.066719s : 1: type_inference 0.06% : 0.000101s : 1: validate ........group_cases_23 have all been run, results of sub cases are below: case: (1,) {} pass. case: (0,) {} pass. case: ('pynative',) {} pass. case: ('KBK',) {} pass. case: ('pynative',) {} pass. case: ('KBK',) {} pass. case: ('GRAPH',) {} pass. case: ('KBK',) {} pass. ops group_cases_24 with 1 cases start to running, all cases are below: case: (, 'KBK') ops group_cases_24 total running memory: 4M, memory threshold: 51200M TotalTime = 1.90185, [24] [bootstrap]: 0.00086227 [type_inference]: 0.0350632 [event_method]: 1.729e-05 [auto_monad]: 0.0001635 [graph_reusing]: 6.05002e-06 [inline]: 1.80001e-06 [add_attr]: 0.00737372, [1] [add_attr_with_inline]: 0.00736337, [1] [Cycle 1]: 0.00013314, [2] [tag_attr]: 3.439e-05 [meta_addattr_fg_expand]: 1.742e-05 [parallel-infer-symbol]: 2.68e-06 [pre_auto_parallel]: 5.316e-05 [insert-virtual-dataset]: 2.46e-06 [parallel-infer-symbol-second]: 6.80011e-07 [dataset_repeat_opt]: 2.12999e-06 [pipeline_split]: 1.78002e-06 [optimize]: 0.00496642, [53] [py_interpret_to_execute]: 4.04002e-06 [rewriter_before_opt_a]: 9.048e-05 [opt_a]: 0.00281892, [2] [Cycle 1]: 0.00220135, [45] [expand_dump_flag]: 3.38e-06 [switch_simplify]: 0.00014132 [loop_unroll]: 2.622e-05 [a_1]: 0.00049384 [with_stream_mark]: 1.307e-05 [recompute_prepare]: 7.89002e-06 [updatestate_depend_eliminate]: 1.702e-05 [updatestate_assign_eliminate]: 1.606e-05 [updatestate_loads_eliminate]: 3.36001e-06 [parameter_eliminate]: 1.84e-06 [a_2]: 8.334e-05 [accelerated_algorithm]: 6.84999e-06 [shard]: 1.87001e-06 [meta_shard_fg_expand]: 1.99999e-06 [shard_inline]: 6.26998e-06 [merge_send_recv]: 5.73e-05 [auto_parallel]: 6.43998e-06 [parallel]: 9.447e-05 [flash_sp]: 4.224e-05 [merge_comm]: 4.17e-06 [allreduce_fusion]: 1.439e-05 [matmul_add_comm_reduction]: 2.219e-05 [allreduce_slice_to_reducescatter]: 1.255e-05 [virtual_shard_identity]: 8.65001e-06 [virtual_dataset]: 6.51999e-06 [get_grad_eliminate_]: 6.01e-06 [virtual_output]: 6.16e-06 [merge_forward]: 4.35e-06 [cell_reuse_recompute_pass]: 1.37e-06 [offload_activation]: 3.222e-05 [cell_reuse_handle_not_recompute_node_pass]: 2.592e-05 [merge_recompute_call_nodes]: 1.52001e-06 [before_grad]: 9.86998e-06 [set_forward_comm_id_for_comm_node_pass]: 1.722e-05 [meta_fg_expand]: 2.88998e-06 [flash_sp_send_recv_attached]: 2.51998e-06 [receive_attached]: 2.606e-05 [after_resolve]: 1.146e-05 [a_after_grad]: 9.36e-06 [renormalize]: 0.00058674 [add_forward_monad_depend]: 5.07e-06 [auto_monad_grad]: 1.76e-06 [auto_monad_eliminator]: 2.775e-05 [cse]: 5.545e-05 [a_3]: 4.494e-05 [Cycle 2]: 0.00060904, [45] [expand_dump_flag]: 1.05999e-06 [switch_simplify]: 7.29001e-06 [loop_unroll]: 5.76e-06 [a_1]: 0.00011933 [with_stream_mark]: 1.033e-05 [recompute_prepare]: 5.90002e-06 [updatestate_depend_eliminate]: 2.94001e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.46e-06 [parameter_eliminate]: 8.70001e-07 [a_2]: 7.098e-05 [accelerated_algorithm]: 5.87001e-06 [shard]: 1.05001e-06 [meta_shard_fg_expand]: 1.38002e-06 [shard_inline]: 5.86e-06 [merge_send_recv]: 4.61002e-06 [auto_parallel]: 5.26002e-06 [parallel]: 3.66001e-06 [flash_sp]: 3.26001e-06 [merge_comm]: 3.08998e-06 [allreduce_fusion]: 2.87002e-06 [matmul_add_comm_reduction]: 5.05999e-06 [allreduce_slice_to_reducescatter]: 4.00003e-07 [virtual_shard_identity]: 1.409e-05 [virtual_dataset]: 6.32001e-06 [get_grad_eliminate_]: 6.03998e-06 [virtual_output]: 5.87001e-06 [merge_forward]: 3.07002e-06 [cell_reuse_recompute_pass]: 1.30999e-06 [offload_activation]: 6.23998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.229e-05 [merge_recompute_call_nodes]: 6.89994e-07 [before_grad]: 9.00999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.18e-06 [meta_fg_expand]: 1.99999e-06 [flash_sp_send_recv_attached]: 8.30012e-07 [receive_attached]: 1.02e-06 [after_resolve]: 9.47999e-06 [a_after_grad]: 8.60999e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.32e-06 [auto_monad_grad]: 9.00007e-07 [auto_monad_eliminator]: 6.29999e-06 [cse]: 1.478e-05 [a_3]: 3.515e-05 [py_interpret_to_execute_after_opt_a]: 4.08001e-06 [slice_cell_reuse_recomputed_activation]: 2.12999e-06 [rewriter_after_opt_a]: 3.125e-05 [convert_after_rewriter]: 1.38002e-06 [order_py_execute_after_rewriter]: 1.39e-06 [mutable_eliminate]: 0.00049842 [opt_b]: 0.00019505, [1] [Cycle 1]: 0.00018953, [7] [b_1]: 0.00011987 [b_2]: 7.43e-06 [updatestate_depend_eliminate]: 5.31998e-06 [updatestate_assign_eliminate]: 2.55002e-06 [updatestate_loads_eliminate]: 2.42001e-06 [renormalize]: 4.10015e-07 [cse]: 1.928e-05 [optimize_parallel_all_gather_comm]: 2.924e-05 [overlap_param_gather]: 1.57e-05 [cconv]: 2.237e-05 [loop_unroll]: 0.00040645 [opt_after_cconv]: 9.809e-05, [1] [Cycle 1]: 9.299e-05, [7] [c_1]: 2.862e-05 [parameter_eliminate]: 2.23998e-06 [updatestate_depend_eliminate]: 5.20001e-06 [updatestate_assign_eliminate]: 2.81e-06 [updatestate_loads_eliminate]: 2.61e-06 [cse]: 1.954e-05 [renormalize]: 4.10015e-07 [remove_dup_value]: 1.472e-05 [tuple_transform]: 6.971e-05, [1] [Cycle 1]: 6.572e-05, [4] [d_1]: 3.991e-05 [none_parameter_eliminate]: 1.54e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 6.82002e-06 [partial_unused_args_eliminate]: 1.74e-06 [add_recomputation]: 6.295e-05 [cse_after_recomputation]: 2.266e-05, [1] [Cycle 1]: 1.85e-05, [1] [cse]: 1.319e-05 [environ_conv]: 1.462e-05 [swap_dp_allreduce_reducescatter]: 3.003e-05 [bias_add_comm_swap]: 1.517e-05 [label_micro_interleaved_index]: 1.688e-05 [label_fine_grained_interleaved_index]: 2.56e-06 [merge_cast_opt]: 1.70001e-06 [slice_recompute_activation]: 2.21998e-06 [micro_interleaved_order_control]: 2.68998e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 7.59988e-07 [remove_cast_before_assign_add]: 1.398e-05 [full_micro_interleaved_order_control]: 1.412e-05 [reorder_send_recv_between_fp_bp]: 2.71999e-06 [comm_op_add_attrs]: 1.00999e-06 [add_comm_op_reuse_tag]: 9.89996e-07 [interleave_split_concat_branches]: 1.13001e-06 [interleave_parallel_branches]: 1.302e-05 [overlap_opt_shard_in_pipeline]: 1.363e-05 [overlap_opt_shard_grad_in_pipeline]: 1.97001e-06 [control_data_broadcast_order]: 1.206e-05 [grouped_pairwise_exchange_alltoall]: 1.54998e-06 [offloading_packed_experts]: 3.99002e-06 [overlap_recompute_and_grad_model_parallel]: 1.696e-05 [overlap_grad_matmul_and_grad_allreduce]: 1.22e-06 [overlap_recompute_allgather_and_fa_grad]: 1.64998e-06 [overlap_recompute_comm]: 2.06e-06 [overlap_grad_ring_attention]: 2.552e-05 [overlap_grad_flash_sp]: 5.366e-05 [begin_end_overlap_inline]: 6.00005e-07 [split_matmul_comm_elemetwise]: 1.452e-05 [split_layernorm_comm]: 1.94e-06 [handle_group_info]: 1.25999e-06 [symbol_engine_optimizer]: 7.297e-05, [1] [Cycle 1]: 6.838e-05, [6] [build]: 2.37999e-06 [elim_shapecalc]: 9.87001e-06 [elim_not_effective]: 1.221e-05 [opt_reshape]: 6.54999e-06 [fold_const_symbol]: 9.47999e-06 [renormalize]: 2.10013e-07 [detach_backward]: 1.71e-06 [pipeline_parallel_scheduler]: 1.92001e-06 [auto_monad_reorder]: 2.391e-05 [get_jit_bprop_graph]: 1.04e-06 [rewriter_after_jit_bprop_graph]: 3.08998e-06 [opt_after_jit_grad]: 0.00046954 [validate]: 5.344e-05 [backend_pass]: 9.00007e-07 [task_emit]: 1.85251 [execute]: 1.05e-05 Sums bootstrap : 0.000862s : 0.05% type_inference : 0.035063s : 1.85% event_method : 0.000017s : 0.00% auto_monad : 0.000163s : 0.01% graph_reusing : 0.000006s : 0.00% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000034s : 0.00% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000017s : 0.00% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000053s : 0.00% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000004s : 0.00% optimize.rewriter_before_opt_a : 0.000090s : 0.00% optimize.opt_a.expand_dump_flag : 0.000004s : 0.00% optimize.opt_a.switch_simplify : 0.000149s : 0.01% optimize.opt_a.loop_unroll : 0.000032s : 0.00% optimize.opt_a.a_1 : 0.000613s : 0.03% optimize.opt_a.with_stream_mark : 0.000023s : 0.00% optimize.opt_a.recompute_prepare : 0.000014s : 0.00% optimize.opt_a.updatestate_depend_eliminate : 0.000020s : 0.00% optimize.opt_a.updatestate_assign_eliminate : 0.000019s : 0.00% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.00% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000154s : 0.01% optimize.opt_a.accelerated_algorithm : 0.000013s : 0.00% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.00% optimize.opt_a.shard_inline : 0.000012s : 0.00% optimize.opt_a.merge_send_recv : 0.000062s : 0.00% optimize.opt_a.auto_parallel : 0.000012s : 0.00% optimize.opt_a.parallel : 0.000098s : 0.01% optimize.opt_a.flash_sp : 0.000045s : 0.00% optimize.opt_a.merge_comm : 0.000007s : 0.00% optimize.opt_a.allreduce_fusion : 0.000017s : 0.00% optimize.opt_a.matmul_add_comm_reduction : 0.000027s : 0.00% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000013s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000023s : 0.00% optimize.opt_a.virtual_dataset : 0.000013s : 0.00% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.00% optimize.opt_a.virtual_output : 0.000012s : 0.00% optimize.opt_a.merge_forward : 0.000007s : 0.00% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000038s : 0.00% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000038s : 0.00% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000019s : 0.00% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000020s : 0.00% optimize.opt_a.meta_fg_expand : 0.000005s : 0.00% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000027s : 0.00% optimize.opt_a.after_resolve : 0.000021s : 0.00% optimize.opt_a.a_after_grad : 0.000018s : 0.00% optimize.opt_a.renormalize : 0.000587s : 0.03% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.00% optimize.opt_a.auto_monad_grad : 0.000003s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000034s : 0.00% optimize.opt_a.cse : 0.000070s : 0.00% optimize.opt_a.a_3 : 0.000080s : 0.00% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.00% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000031s : 0.00% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000498s : 0.03% optimize.opt_b.b_1 : 0.000120s : 0.01% optimize.opt_b.b_2 : 0.000007s : 0.00% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000019s : 0.00% optimize.optimize_parallel_all_gather_comm : 0.000029s : 0.00% optimize.overlap_param_gather : 0.000016s : 0.00% optimize.cconv : 0.000022s : 0.00% optimize.loop_unroll : 0.000406s : 0.02% optimize.opt_after_cconv.c_1 : 0.000029s : 0.00% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.00% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000020s : 0.00% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.00% optimize.tuple_transform.d_1 : 0.000040s : 0.00% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.00% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000063s : 0.00% optimize.cse_after_recomputation.cse : 0.000013s : 0.00% optimize.environ_conv : 0.000015s : 0.00% optimize.swap_dp_allreduce_reducescatter : 0.000030s : 0.00% optimize.bias_add_comm_swap : 0.000015s : 0.00% optimize.label_micro_interleaved_index : 0.000017s : 0.00% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000002s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000014s : 0.00% optimize.full_micro_interleaved_order_control : 0.000014s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000013s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000014s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000012s : 0.00% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.00% optimize.overlap_recompute_and_grad_model_parallel : 0.000017s : 0.00% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000002s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.00% optimize.overlap_grad_ring_attention : 0.000026s : 0.00% optimize.overlap_grad_flash_sp : 0.000054s : 0.00% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000015s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.00% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.00% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.00% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000009s : 0.00% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000024s : 0.00% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.00% opt_after_jit_grad : 0.000470s : 0.02% validate : 0.000053s : 0.00% backend_pass : 0.000001s : 0.00% task_emit : 1.852506s : 97.84% execute : 0.000010s : 0.00% Time group info: ------[substitution.] 0.000184 28 0.97% : 0.000002s : 2: substitution.elim_not_effective 0.76% : 0.000001s : 2: substitution.fold_const_symbol 2.83% : 0.000005s : 4: substitution.graph_param_transform 70.25% : 0.000130s : 6: substitution.inline 1.78% : 0.000003s : 4: substitution.j_node_and_user_rematch 9.47% : 0.000017s : 4: substitution.remove_not_recompute_node 2.22% : 0.000004s : 4: substitution.replace_old_param 11.72% : 0.000022s : 2: substitution.switch_simplify ------[type_inference.] 0.034981 2 97.01% : 0.033935s : 1: type_inference.infer 2.99% : 0.001046s : 1: type_inference.specialize ------[replace.] 0.000068 8 53.53% : 0.000036s : 6: replace.inline 46.47% : 0.000032s : 2: replace.switch_simplify ------[match.] 0.000147 8 86.16% : 0.000126s : 6: match.inline 13.84% : 0.000020s : 2: match.switch_simplify ------[predicate.] 0.000175 1228 0.94% : 0.000002s : 13: predicate.accumulaten_eliminater 1.08% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.54% : 0.000001s : 8: predicate.addn_check_dump 1.00% : 0.000002s : 13: predicate.addn_zero_filter 0.88% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.29% : 0.000004s : 21: predicate.arithmetic_simplify 0.93% : 0.000002s : 13: predicate.cast_eliminate 0.63% : 0.000001s : 8: predicate.check_bprop_eliminate 0.58% : 0.000001s : 8: predicate.compare_switch_simplify 0.22% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 1.00% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.17% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.93% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 4: predicate.elim_not_effective 0.39% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.12% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_depend_swap 1.75% : 0.000003s : 25: predicate.environ_get_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.40% : 0.000002s : 19: predicate.exchange_switch_depend_value 2.24% : 0.000004s : 19: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.78% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.22% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000001s : 8: predicate.get_grad_eliminate 0.34% : 0.000001s : 4: predicate.graph_param_transform 0.62% : 0.000001s : 8: predicate.incorporate_call 0.54% : 0.000001s : 8: predicate.incorporate_call_switch 5.83% : 0.000010s : 56: predicate.inline 0.79% : 0.000001s : 8: predicate.inline_without_move 0.35% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.83% : 0.000001s : 8: predicate.less_batch_normalization 1.66% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 2.28% : 0.000004s : 34: predicate.load_eliminater 0.97% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.56% : 0.000004s : 32: predicate.loop_unroll_before_grad 1.64% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.63% : 0.000001s : 8: predicate.merge_addn 0.55% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.60% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.84% : 0.000001s : 13: predicate.minmaximum_grad 1.09% : 0.000002s : 4: predicate.mutable_eliminate 0.36% : 0.000001s : 4: predicate.opt_reshape 0.44% : 0.000001s : 4: predicate.parallel_virtual_node 1.70% : 0.000003s : 19: predicate.partial_defer_inline 1.32% : 0.000002s : 17: predicate.partial_eliminate 0.93% : 0.000002s : 13: predicate.print_const_string_wrapper 0.60% : 0.000001s : 8: predicate.reduce_all_const_elim 1.14% : 0.000002s : 13: predicate.reduce_eliminate 2.42% : 0.000004s : 34: predicate.redundant_stop_gradient_eliminater 0.56% : 0.000001s : 8: predicate.remove_not_recompute_node 1.25% : 0.000002s : 21: predicate.replace_applicator 0.61% : 0.000001s : 8: predicate.replace_old_param 0.26% : 0.000000s : 4: predicate.reset_defer_inline 1.01% : 0.000002s : 13: predicate.reshape_eliminate 0.62% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.35% : 0.000001s : 4: predicate.row_tensor_eliminate 0.73% : 0.000001s : 8: predicate.same_eliminate 0.49% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.81% : 0.000001s : 8: predicate.shard_identity_eliminate 0.69% : 0.000001s : 8: predicate.special_op_eliminate 0.74% : 0.000001s : 8: predicate.specialize_transform 0.80% : 0.000001s : 8: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.33% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.56% : 0.000003s : 19: predicate.switch_defer_inline 2.16% : 0.000004s : 27: predicate.switch_layer_defer_inline 6.11% : 0.000011s : 67: predicate.switch_simplify 0.97% : 0.000002s : 13: predicate.tile_eliminate 0.93% : 0.000002s : 13: predicate.transpose_eliminate 1.63% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.50% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.79% : 0.000005s : 29: predicate.tuple_list_get_item_eliminator 1.48% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.31% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.57% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 2.43% : 0.000004s : 34: predicate.updatestate_pure_node_eliminater 2.88% : 0.000005s : 42: predicate.updatestate_useless_node_eliminater 0.35% : 0.000001s : 4: predicate.value_based_eliminate 0.73% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.72% : 0.000001s : 8: predicate.virtual_output_eliminate 0.25% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.42% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000756 12 60.02% : 0.000454s : 4: func_graph_cloner_run.FuncGraphClonerGraph 39.98% : 0.000302s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 1.916073 196 0.00% : 0.000004s : 1: ForceFp32Comm 0.39% : 0.007378s : 1: add_attr 0.38% : 0.007367s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.00% : 0.000067s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.01% : 0.000170s : 1: auto_monad 0.00% : 0.000028s : 1: auto_monad_reorder 0.00% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.00% : 0.000018s : 1: bias_add_comm_swap 0.05% : 0.000916s : 1: bootstrap 0.00% : 0.000026s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.00% : 0.000015s : 1: control_data_broadcast_order 0.00% : 0.000004s : 1: convert_after_rewriter 0.00% : 0.000026s : 1: cse_after_recomputation 0.00% : 0.000005s : 1: dataset_repeat_opt 0.00% : 0.000005s : 1: detach_backward 0.00% : 0.000018s : 1: environ_conv 0.00% : 0.000022s : 1: event_method 0.00% : 0.000023s : 1: execute 0.00% : 0.000017s : 1: full_micro_interleaved_order_control 0.00% : 0.000004s : 1: get_jit_bprop_graph 0.00% : 0.000010s : 1: graph_reusing 0.00% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.00% : 0.000005s : 1: inline 0.00% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000016s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.00% : 0.000006s : 1: label_fine_grained_interleaved_index 0.00% : 0.000020s : 1: label_micro_interleaved_index 0.02% : 0.000414s : 1: loop_unroll 0.00% : 0.000004s : 1: merge_cast_opt 0.00% : 0.000005s : 1: micro_interleaved_order_control 0.03% : 0.000506s : 1: mutable_eliminate 0.00% : 0.000007s : 1: offloading_packed_experts 0.00% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.00% : 0.000014s : 1: opt.transform.mutable_eliminate 0.06% : 0.001142s : 78: opt.transform.opt_a 0.00% : 0.000027s : 1: opt.transform.opt_after_cconv 0.00% : 0.000024s : 1: opt.transform.opt_after_jit_grad 0.01% : 0.000102s : 28: opt.transform.opt_b 0.00% : 0.000045s : 2: opt.transform.opt_trans_graph 0.00% : 0.000035s : 4: opt.transform.symbol_engine_opt 0.15% : 0.002822s : 1: opt_a 0.01% : 0.000101s : 1: opt_after_cconv 0.02% : 0.000478s : 1: opt_after_jit_grad 0.01% : 0.000198s : 1: opt_b 0.26% : 0.004970s : 1: optimize 0.00% : 0.000033s : 1: optimize_parallel_all_gather_comm 0.00% : 0.000004s : 1: order_py_execute_after_rewriter 0.00% : 0.000057s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.00% : 0.000029s : 1: overlap_grad_ring_attention 0.00% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000017s : 1: overlap_opt_shard_in_pipeline 0.00% : 0.000019s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.00% : 0.000020s : 1: overlap_recompute_and_grad_model_parallel 0.00% : 0.000005s : 1: overlap_recompute_comm 0.00% : 0.000006s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.00% : 0.000005s : 1: pipeline_parallel_scheduler 0.00% : 0.000005s : 1: pipeline_split 0.00% : 0.000057s : 1: pre_auto_parallel 0.00% : 0.000007s : 1: py_interpret_to_execute 0.00% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000017s : 1: remove_cast_before_assign_add 0.00% : 0.000018s : 1: remove_dup_value 0.02% : 0.000297s : 1: renormalize.infer 0.01% : 0.000283s : 1: renormalize.specialize 0.00% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.00% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.00% : 0.000035s : 1: rewriter_after_opt_a 0.00% : 0.000095s : 1: rewriter_before_opt_a 0.00% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.00% : 0.000005s : 1: slice_recompute_activation 0.00% : 0.000005s : 1: split_layernorm_comm 0.00% : 0.000017s : 1: split_matmul_comm_elemetwise 0.00% : 0.000033s : 1: swap_dp_allreduce_reducescatter 0.00% : 0.000076s : 1: symbol_engine_optimizer 96.68% : 1.852539s : 1: task_emit 0.00% : 0.000072s : 1: tuple_transform 1.83% : 0.035076s : 1: type_inference 0.00% : 0.000081s : 1: validate TotalTime = 0.0908978, [33] [bootstrap]: 0.00040978 [type_inference]: 0.0548187 [event_method]: 0.00027771 [auto_monad]: 0.00017069 [graph_reusing]: 1.072e-05 [pre_auto_parallel]: 3.68999e-06 [py_interpret_to_execute]: 5.236e-05 [rewriter_before_opt_a]: 0.00015395 [expand_dump_flag]: 4.43001e-06 [jit_opt_a]: 0.013959, [3] [Cycle 1]: 0.00784785, [27] [switch_simplify]: 0.00019247 [loop_unroll]: 6.321e-05 [a_1]: 0.00131001 [with_stream_mark]: 2.355e-05 [recompute_prepare]: 2.122e-05 [updatestate_depend_eliminate]: 8.26002e-06 [updatestate_assign_eliminate]: 7.33999e-06 [updatestate_loads_eliminate]: 6.86001e-06 [parameter_eliminate]: 2.66999e-06 [specialize_transform]: 1.637e-05 [updatestate_useless_node_eliminater]: 1.507e-05 [accelerated_algorithm]: 1.443e-05 [meta_shard_fg_expand]: 4.70999e-06 [get_grad_eliminate_]: 1.486e-05 [merge_forward]: 8.60999e-06 [cell_reuse_recompute_pass]: 1.09e-06 [cell_reuse_handle_not_recompute_node_pass]: 2.85e-05 [j_node_and_user_rematch]: 2.508e-05 [meta_fg_expand]: 0.00171416 [replace_old_param]: 6.454e-05 [inline_without_move]: 5.888e-05 [renormalize]: 0.00366026 [add_forward_monad_depend]: 1.294e-05 [auto_monad_grad]: 5.99999e-06 [auto_monad_eliminator]: 5.686e-05 [cse]: 0.00023116 [replace_applicator]: 7.331e-05 [Cycle 2]: 0.00236281, [27] [switch_simplify]: 4.525e-05 [loop_unroll]: 4.261e-05 [a_1]: 0.00115526 [with_stream_mark]: 1.122e-05 [recompute_prepare]: 8.05e-06 [updatestate_depend_eliminate]: 3.55e-06 [updatestate_assign_eliminate]: 2.80002e-06 [updatestate_loads_eliminate]: 2.63e-06 [parameter_eliminate]: 9.89996e-07 [specialize_transform]: 6.91001e-06 [updatestate_useless_node_eliminater]: 6.61999e-06 [accelerated_algorithm]: 6.73e-06 [meta_shard_fg_expand]: 1.67001e-06 [get_grad_eliminate_]: 6.08998e-06 [merge_forward]: 3.08e-06 [cell_reuse_recompute_pass]: 8.00006e-07 [cell_reuse_handle_not_recompute_node_pass]: 1.165e-05 [j_node_and_user_rematch]: 9.36e-06 [meta_fg_expand]: 0.00015063 [replace_old_param]: 1.485e-05 [inline_without_move]: 7.03e-06 [renormalize]: 0.00066045 [add_forward_monad_depend]: 4.36002e-06 [auto_monad_grad]: 1.17e-06 [auto_monad_eliminator]: 1.069e-05 [cse]: 2.221e-05 [replace_applicator]: 1.315e-05 [Cycle 3]: 0.00037501, [27] [switch_simplify]: 7.61001e-06 [loop_unroll]: 7.01001e-06 [a_1]: 0.00012175 [with_stream_mark]: 8.54998e-06 [recompute_prepare]: 6.25002e-06 [updatestate_depend_eliminate]: 3.55998e-06 [updatestate_assign_eliminate]: 2.73998e-06 [updatestate_loads_eliminate]: 2.54999e-06 [parameter_eliminate]: 9.09989e-07 [specialize_transform]: 6.37001e-06 [updatestate_useless_node_eliminater]: 6.36e-06 [accelerated_algorithm]: 6.69001e-06 [meta_shard_fg_expand]: 1.35001e-06 [get_grad_eliminate_]: 6.11e-06 [merge_forward]: 2.94999e-06 [cell_reuse_recompute_pass]: 1.42e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.365e-05 [j_node_and_user_rematch]: 9.67999e-06 [meta_fg_expand]: 1.99e-06 [replace_old_param]: 9.56e-06 [inline_without_move]: 6.01998e-06 [renormalize]: 5.9983e-08 [add_forward_monad_depend]: 1.09e-06 [auto_monad_grad]: 7.99977e-07 [auto_monad_eliminator]: 5.76998e-06 [cse]: 1.477e-05 [replace_applicator]: 6.60997e-06 [py_interpret_to_execute_after_opt_a]: 9.25001e-06 [rewriter_after_opt_a]: 3.33e-05 [convert_after_rewriter]: 6.94999e-06 [order_py_execute_after_rewriter]: 5.25001e-06 [mutable_eliminate]: 0.00046429 [jit_opt_b]: 5.727e-05, [1] [Cycle 1]: 5.098e-05, [2] [frontend_op_eliminate]: 2.046e-05 [inline_after_opt_a]: 1.893e-05 [cconv]: 2.143e-05 [loop_unroll]: 0.00042298 [jit_opt_after_cconv]: 0.00015758, [1] [Cycle 1]: 0.0001512, [11] [c_1]: 2.762e-05 [parameter_eliminate]: 2.26e-06 [updatestate_depend_eliminate]: 5.81003e-06 [updatestate_assign_eliminate]: 3.53e-06 [updatestate_loads_eliminate]: 2.64001e-06 [cse]: 2.173e-05 [call_graph_tuple_transform]: 2.111e-05 [tuple_list_get_item_eliminator]: 7.17002e-06 [none_parameter_eliminate]: 1.47999e-06 [renormalize]: 3.50003e-07 [switch_simplify]: 6.95998e-06 [remove_dup_value]: 1.65e-05 [partial_unused_args_eliminate]: 2.57001e-06 [environ_conv]: 7e-06 [add_recomputation]: 4.169e-05 [cse_after_recomputation]: 2.551e-05, [1] [Cycle 1]: 2.024e-05, [1] [cse]: 1.461e-05 [auto_monad_reorder]: 1.823e-05 [get_jit_bprop_graph]: 1.35001e-06 [rewriter_after_jit_bprop_graph]: 4.23999e-06 [opt_after_jit_grad]: 0.00046123 [symbol_engine_optimizer]: 9.55e-05, [1] [Cycle 1]: 8.972e-05, [6] [build]: 3.01001e-06 [elim_shapecalc]: 9.60001e-06 [elim_not_effective]: 1.572e-05 [opt_reshape]: 7.51001e-06 [fold_const_symbol]: 1.075e-05 [renormalize]: 4.00003e-07 [validate]: 3.972e-05 [backend_pass]: 1.15999e-06 [task_emit]: 0.0189428 [execute]: 6.86999e-06 Sums bootstrap : 0.000410s : 0.47% type_inference : 0.054819s : 63.21% event_method : 0.000278s : 0.32% auto_monad : 0.000171s : 0.20% graph_reusing : 0.000011s : 0.01% pre_auto_parallel : 0.000004s : 0.00% py_interpret_to_execute : 0.000052s : 0.06% rewriter_before_opt_a : 0.000154s : 0.18% expand_dump_flag : 0.000004s : 0.01% jit_opt_a.switch_simplify : 0.000245s : 0.28% jit_opt_a.loop_unroll : 0.000113s : 0.13% jit_opt_a.a_1 : 0.002587s : 2.98% jit_opt_a.with_stream_mark : 0.000043s : 0.05% jit_opt_a.recompute_prepare : 0.000036s : 0.04% jit_opt_a.updatestate_depend_eliminate : 0.000015s : 0.02% jit_opt_a.updatestate_assign_eliminate : 0.000013s : 0.01% jit_opt_a.updatestate_loads_eliminate : 0.000012s : 0.01% jit_opt_a.parameter_eliminate : 0.000005s : 0.01% jit_opt_a.specialize_transform : 0.000030s : 0.03% jit_opt_a.updatestate_useless_node_eliminater : 0.000028s : 0.03% jit_opt_a.accelerated_algorithm : 0.000028s : 0.03% jit_opt_a.meta_shard_fg_expand : 0.000008s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000027s : 0.03% jit_opt_a.merge_forward : 0.000015s : 0.02% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000054s : 0.06% jit_opt_a.j_node_and_user_rematch : 0.000044s : 0.05% jit_opt_a.meta_fg_expand : 0.001867s : 2.15% jit_opt_a.replace_old_param : 0.000089s : 0.10% jit_opt_a.inline_without_move : 0.000072s : 0.08% jit_opt_a.renormalize : 0.004321s : 4.98% jit_opt_a.add_forward_monad_depend : 0.000018s : 0.02% jit_opt_a.auto_monad_grad : 0.000008s : 0.01% jit_opt_a.auto_monad_eliminator : 0.000073s : 0.08% jit_opt_a.cse : 0.000268s : 0.31% jit_opt_a.replace_applicator : 0.000093s : 0.11% py_interpret_to_execute_after_opt_a : 0.000009s : 0.01% rewriter_after_opt_a : 0.000033s : 0.04% convert_after_rewriter : 0.000007s : 0.01% order_py_execute_after_rewriter : 0.000005s : 0.01% mutable_eliminate : 0.000464s : 0.54% jit_opt_b.frontend_op_eliminate : 0.000020s : 0.02% jit_opt_b.inline_after_opt_a : 0.000019s : 0.02% cconv : 0.000021s : 0.02% loop_unroll : 0.000423s : 0.49% jit_opt_after_cconv.c_1 : 0.000028s : 0.03% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000004s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000022s : 0.03% jit_opt_after_cconv.call_graph_tuple_transform : 0.000021s : 0.02% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000007s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000007s : 0.01% remove_dup_value : 0.000017s : 0.02% partial_unused_args_eliminate : 0.000003s : 0.00% environ_conv : 0.000007s : 0.01% add_recomputation : 0.000042s : 0.05% cse_after_recomputation.cse : 0.000015s : 0.02% auto_monad_reorder : 0.000018s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.00% opt_after_jit_grad : 0.000461s : 0.53% symbol_engine_optimizer.build : 0.000003s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.01% symbol_engine_optimizer.elim_not_effective : 0.000016s : 0.02% symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.01% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000040s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.018943s : 21.84% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.000630 128 0.35% : 0.000002s : 2: substitution.elim_not_effective 0.22% : 0.000001s : 2: substitution.fold_const_symbol 0.84% : 0.000005s : 4: substitution.graph_param_transform 69.25% : 0.000436s : 21: substitution.inline 2.64% : 0.000017s : 2: substitution.inline_without_move 1.31% : 0.000008s : 12: substitution.j_node_and_user_rematch 1.59% : 0.000010s : 7: substitution.minmaximum_grad 3.77% : 0.000024s : 11: substitution.partial_eliminate 1.45% : 0.000009s : 12: substitution.remove_not_recompute_node 3.61% : 0.000023s : 9: substitution.replace_applicator 1.55% : 0.000010s : 14: substitution.replace_old_param 0.55% : 0.000003s : 1: substitution.set_cell_output_no_recompute 2.28% : 0.000014s : 5: substitution.switch_simplify 3.04% : 0.000019s : 7: substitution.tuple_list_convert_item_index_to_positive 2.27% : 0.000014s : 7: substitution.tuple_list_get_item_depend_reorder 5.28% : 0.000033s : 12: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.054713 2 94.85% : 0.051895s : 1: type_inference.infer 5.15% : 0.002818s : 1: type_inference.specialize ------[replace.] 0.000248 31 57.06% : 0.000141s : 21: replace.inline 22.88% : 0.000057s : 5: replace.switch_simplify 20.05% : 0.000050s : 5: replace.tuple_list_get_item_eliminator ------[match.] 0.000447 31 95.03% : 0.000424s : 21: match.inline 2.66% : 0.000012s : 5: match.switch_simplify 2.31% : 0.000010s : 5: match.tuple_list_get_item_eliminator ------[predicate.] 0.000442 3262 1.52% : 0.000007s : 56: predicate.accumulaten_eliminater 0.33% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 1.51% : 0.000007s : 56: predicate.addn_check_dump 1.50% : 0.000007s : 56: predicate.addn_zero_filter 2.19% : 0.000010s : 56: predicate.arithmetic_simplify 1.51% : 0.000007s : 56: predicate.cast_eliminate 0.13% : 0.000001s : 4: predicate.check_bprop_eliminate 1.48% : 0.000007s : 56: predicate.compare_switch_simplify 1.48% : 0.000007s : 56: predicate.depend_value_elim 1.53% : 0.000007s : 56: predicate.dict_get_item_const_eliminator 1.61% : 0.000007s : 56: predicate.dict_get_item_eliminator 1.48% : 0.000007s : 56: predicate.dict_set_item_eliminator 0.25% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.12% : 0.000001s : 4: predicate.elim_not_effective 0.16% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.47% : 0.000007s : 56: predicate.environ_add_const_eliminate 1.49% : 0.000007s : 56: predicate.environ_get_add_eliminate 1.52% : 0.000007s : 56: predicate.environ_get_depend_swap 1.48% : 0.000007s : 56: predicate.environ_get_eliminate 1.49% : 0.000007s : 56: predicate.environ_get_set_eliminate 0.09% : 0.000000s : 4: predicate.fold_const_symbol 0.77% : 0.000003s : 21: predicate.get_grad_eliminate 0.21% : 0.000001s : 4: predicate.graph_param_transform 4.41% : 0.000019s : 90: predicate.inline 1.70% : 0.000008s : 46: predicate.inline_without_move 0.34% : 0.000002s : 21: predicate.j_node_and_user_rematch 0.81% : 0.000004s : 21: predicate.less_batch_normalization 1.75% : 0.000008s : 61: predicate.list_to_tuple_eliminator_ 1.84% : 0.000008s : 65: predicate.load_eliminater 0.40% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.79% : 0.000017s : 120: predicate.loop_unroll_before_grad 1.68% : 0.000007s : 60: predicate.make_slice_get_slice_eliminator 1.49% : 0.000007s : 56: predicate.merge_addn 1.48% : 0.000007s : 56: predicate.minmaximum_grad 0.43% : 0.000002s : 4: predicate.mutable_eliminate 0.16% : 0.000001s : 4: predicate.opt_reshape 2.30% : 0.000010s : 65: predicate.partial_eliminate 1.46% : 0.000006s : 56: predicate.print_const_string_wrapper 1.89% : 0.000008s : 56: predicate.reduce_eliminate 1.71% : 0.000008s : 61: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000002s : 21: predicate.remove_not_recompute_node 2.48% : 0.000011s : 113: predicate.replace_applicator 0.94% : 0.000004s : 46: predicate.replace_old_param 0.09% : 0.000000s : 4: predicate.reset_defer_inline 1.52% : 0.000007s : 56: predicate.reshape_eliminate 1.62% : 0.000007s : 56: predicate.row_tensor_add_zeros_like 0.23% : 0.000001s : 4: predicate.row_tensor_eliminate 1.58% : 0.000007s : 56: predicate.same_eliminate 0.44% : 0.000002s : 21: predicate.set_cell_output_no_recompute 0.31% : 0.000001s : 8: predicate.special_op_eliminate 0.74% : 0.000003s : 21: predicate.specialize_transform 1.69% : 0.000007s : 56: predicate.split_environ_get_set_with_tuple_value 1.48% : 0.000007s : 56: predicate.stack_unstack_eliminate 0.12% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.97% : 0.000013s : 82: predicate.switch_defer_inline 2.74% : 0.000012s : 82: predicate.switch_layer_defer_inline 7.57% : 0.000033s : 216: predicate.switch_simplify 1.50% : 0.000007s : 56: predicate.tile_eliminate 1.47% : 0.000007s : 56: predicate.transpose_eliminate 1.84% : 0.000008s : 56: predicate.tuple_list_convert_item_index_to_positive 1.77% : 0.000008s : 56: predicate.tuple_list_get_item_depend_reorder 2.94% : 0.000013s : 69: predicate.tuple_list_get_item_eliminator 1.93% : 0.000009s : 56: predicate.tuple_list_set_item_eliminator 1.87% : 0.000008s : 61: predicate.tuple_to_list_eliminator_ 1.83% : 0.000008s : 65: predicate.updatestate_pure_node_eliminater 2.76% : 0.000012s : 86: predicate.updatestate_useless_node_eliminater 1.82% : 0.000008s : 56: predicate.value_based_eliminate 0.11% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.27% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002733 41 61.10% : 0.001670s : 16: func_graph_cloner_run.FuncGraphClonerGraph 38.90% : 0.001063s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.098751 91 0.05% : 0.000045s : 1: add_recomputation 0.18% : 0.000178s : 1: auto_monad 0.02% : 0.000021s : 1: auto_monad_reorder 0.00% : 0.000004s : 1: backend_pass 0.43% : 0.000426s : 1: bootstrap 0.02% : 0.000024s : 1: cconv 0.01% : 0.000009s : 1: convert_after_rewriter 0.03% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000009s : 1: environ_conv 0.29% : 0.000285s : 1: event_method 0.01% : 0.000010s : 1: execute 0.01% : 0.000007s : 1: expand_dump_flag 0.00% : 0.000003s : 1: get_jit_bprop_graph 0.01% : 0.000013s : 1: graph_reusing 14.14% : 0.013962s : 1: jit_opt_a 0.16% : 0.000160s : 1: jit_opt_after_cconv 0.06% : 0.000060s : 1: jit_opt_b 0.44% : 0.000430s : 1: loop_unroll 0.48% : 0.000472s : 1: mutable_eliminate 3.44% : 0.003399s : 39: opt.transform.jit_opt_a 0.06% : 0.000059s : 4: opt.transform.jit_opt_after_cconv 0.03% : 0.000033s : 4: opt.transform.jit_opt_b 0.01% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000015s : 1: opt.transform.mutable_eliminate 0.03% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000040s : 4: opt.transform.symbol_engine_opt 0.48% : 0.000469s : 1: opt_after_jit_grad 0.01% : 0.000007s : 1: order_py_execute_after_rewriter 0.00% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000006s : 1: pre_auto_parallel 0.06% : 0.000056s : 1: py_interpret_to_execute 0.01% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000019s : 1: remove_dup_value 2.37% : 0.002337s : 2: renormalize.infer 1.99% : 0.001969s : 2: renormalize.specialize 0.01% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000036s : 1: rewriter_after_opt_a 0.16% : 0.000157s : 1: rewriter_before_opt_a 0.10% : 0.000098s : 1: symbol_engine_optimizer 19.19% : 0.018953s : 1: task_emit 55.52% : 0.054830s : 1: type_inference 0.06% : 0.000058s : 1: validate TotalTime = 0.0326016, [24] [bootstrap]: 0.00046536 [type_inference]: 0.0153727 [event_method]: 1.761e-05 [auto_monad]: 8.173e-05 [graph_reusing]: 6.24001e-06 [inline]: 1.71e-06 [add_attr]: 0.00313698, [1] [add_attr_with_inline]: 0.00312903, [1] [Cycle 1]: 5.85e-05, [2] [tag_attr]: 1.927e-05 [meta_addattr_fg_expand]: 5.95002e-06 [parallel-infer-symbol]: 2.88e-06 [pre_auto_parallel]: 3.083e-05 [insert-virtual-dataset]: 2.83e-06 [parallel-infer-symbol-second]: 8.70001e-07 [dataset_repeat_opt]: 1.96e-06 [pipeline_split]: 1.96e-06 [optimize]: 0.00445663, [53] [py_interpret_to_execute]: 4.48001e-06 [rewriter_before_opt_a]: 6.872e-05 [opt_a]: 0.00250728, [2] [Cycle 1]: 0.001887, [45] [expand_dump_flag]: 3.11999e-06 [switch_simplify]: 9.281e-05 [loop_unroll]: 2.695e-05 [a_1]: 0.00050138 [with_stream_mark]: 1.35e-05 [recompute_prepare]: 8.02998e-06 [updatestate_depend_eliminate]: 3.73001e-06 [updatestate_assign_eliminate]: 3.93001e-06 [updatestate_loads_eliminate]: 3.21001e-06 [parameter_eliminate]: 1.79e-06 [a_2]: 8.25e-05 [accelerated_algorithm]: 7.38999e-06 [shard]: 1.84998e-06 [meta_shard_fg_expand]: 1.96998e-06 [shard_inline]: 6.44001e-06 [merge_send_recv]: 7.93001e-06 [auto_parallel]: 6.02999e-06 [parallel]: 2.614e-05 [flash_sp]: 7.45003e-06 [merge_comm]: 3.71999e-06 [allreduce_fusion]: 3.38999e-06 [matmul_add_comm_reduction]: 9.17999e-06 [allreduce_slice_to_reducescatter]: 7.59988e-07 [virtual_shard_identity]: 8.07998e-06 [virtual_dataset]: 6.48998e-06 [get_grad_eliminate_]: 6.46999e-06 [virtual_output]: 6.41998e-06 [merge_forward]: 4e-06 [cell_reuse_recompute_pass]: 1.30001e-06 [offload_activation]: 9.72999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.169e-05 [merge_recompute_call_nodes]: 1.45999e-06 [before_grad]: 1.017e-05 [set_forward_comm_id_for_comm_node_pass]: 3.6e-06 [meta_fg_expand]: 2.73e-06 [flash_sp_send_recv_attached]: 2.47001e-06 [receive_attached]: 2.22001e-06 [after_resolve]: 1.133e-05 [a_after_grad]: 9.64e-06 [renormalize]: 0.00063388 [add_forward_monad_depend]: 5.18002e-06 [auto_monad_grad]: 2.29001e-06 [auto_monad_eliminator]: 1.481e-05 [cse]: 3.227e-05 [a_3]: 4.779e-05 [Cycle 2]: 0.0006112, [45] [expand_dump_flag]: 9.20001e-07 [switch_simplify]: 7.48e-06 [loop_unroll]: 6.19001e-06 [a_1]: 0.00012538 [with_stream_mark]: 1.103e-05 [recompute_prepare]: 6.42001e-06 [updatestate_depend_eliminate]: 2.94999e-06 [updatestate_assign_eliminate]: 2.46e-06 [updatestate_loads_eliminate]: 2.63e-06 [parameter_eliminate]: 9.20001e-07 [a_2]: 7.231e-05 [accelerated_algorithm]: 6.22001e-06 [shard]: 1.03001e-06 [meta_shard_fg_expand]: 1.19998e-06 [shard_inline]: 6.07001e-06 [merge_send_recv]: 4.43001e-06 [auto_parallel]: 5.19e-06 [parallel]: 4.27998e-06 [flash_sp]: 3.28e-06 [merge_comm]: 2.99999e-06 [allreduce_fusion]: 2.81e-06 [matmul_add_comm_reduction]: 4.95999e-06 [allreduce_slice_to_reducescatter]: 3.30008e-07 [virtual_shard_identity]: 7.15e-06 [virtual_dataset]: 6.02001e-06 [get_grad_eliminate_]: 5.86e-06 [virtual_output]: 5.64e-06 [merge_forward]: 2.82002e-06 [cell_reuse_recompute_pass]: 1.44e-06 [offload_activation]: 5.72999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.146e-05 [merge_recompute_call_nodes]: 7.50006e-07 [before_grad]: 8.61002e-06 [set_forward_comm_id_for_comm_node_pass]: 3.21999e-06 [meta_fg_expand]: 1.94999e-06 [flash_sp_send_recv_attached]: 7.59988e-07 [receive_attached]: 8.79983e-07 [after_resolve]: 9.99999e-06 [a_after_grad]: 9.46e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.08001e-06 [auto_monad_grad]: 7.50006e-07 [auto_monad_eliminator]: 6.29001e-06 [cse]: 1.489e-05 [a_3]: 3.761e-05 [py_interpret_to_execute_after_opt_a]: 1.543e-05 [slice_cell_reuse_recomputed_activation]: 2.31e-06 [rewriter_after_opt_a]: 1.834e-05 [convert_after_rewriter]: 1.34e-06 [order_py_execute_after_rewriter]: 1.30001e-06 [mutable_eliminate]: 0.00046076 [opt_b]: 0.00020889, [1] [Cycle 1]: 0.00020298, [7] [b_1]: 0.0001302 [b_2]: 8.53001e-06 [updatestate_depend_eliminate]: 5.32001e-06 [updatestate_assign_eliminate]: 2.51e-06 [updatestate_loads_eliminate]: 2.44001e-06 [renormalize]: 3.7998e-07 [cse]: 2.018e-05 [optimize_parallel_all_gather_comm]: 1.66e-05 [overlap_param_gather]: 2.07999e-06 [cconv]: 2.341e-05 [loop_unroll]: 0.00048128 [opt_after_cconv]: 0.00010304, [1] [Cycle 1]: 9.743e-05, [7] [c_1]: 3.093e-05 [parameter_eliminate]: 2.36e-06 [updatestate_depend_eliminate]: 5.20001e-06 [updatestate_assign_eliminate]: 2.68998e-06 [updatestate_loads_eliminate]: 2.46998e-06 [cse]: 2.065e-05 [renormalize]: 3.50003e-07 [remove_dup_value]: 1.522e-05 [tuple_transform]: 7.302e-05, [1] [Cycle 1]: 6.86e-05, [4] [d_1]: 4.248e-05 [none_parameter_eliminate]: 1.55999e-06 [renormalize]: 1.60013e-07 [switch_simplify]: 6.81999e-06 [partial_unused_args_eliminate]: 1.79e-06 [add_recomputation]: 4.572e-05 [cse_after_recomputation]: 2.377e-05, [1] [Cycle 1]: 1.916e-05, [1] [cse]: 1.364e-05 [environ_conv]: 6.09999e-06 [swap_dp_allreduce_reducescatter]: 4.88001e-06 [bias_add_comm_swap]: 2.78e-06 [label_micro_interleaved_index]: 4.15e-06 [label_fine_grained_interleaved_index]: 2.60002e-06 [merge_cast_opt]: 1.66002e-06 [slice_recompute_activation]: 2.02999e-06 [micro_interleaved_order_control]: 2.11e-06 [assign_add_opt]: 1.19998e-06 [ForceFp32Comm]: 7.7e-07 [remove_cast_before_assign_add]: 1.14003e-06 [full_micro_interleaved_order_control]: 2.31998e-06 [reorder_send_recv_between_fp_bp]: 2.90998e-06 [comm_op_add_attrs]: 1.05001e-06 [add_comm_op_reuse_tag]: 1.00001e-06 [interleave_split_concat_branches]: 1.14003e-06 [interleave_parallel_branches]: 1.13001e-06 [overlap_opt_shard_in_pipeline]: 3.58e-06 [overlap_opt_shard_grad_in_pipeline]: 1.74e-06 [control_data_broadcast_order]: 1.235e-05 [grouped_pairwise_exchange_alltoall]: 1.52001e-06 [offloading_packed_experts]: 3.57997e-06 [overlap_recompute_and_grad_model_parallel]: 4.90999e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.13001e-06 [overlap_recompute_allgather_and_fa_grad]: 1.38002e-06 [overlap_recompute_comm]: 2.36998e-06 [overlap_grad_ring_attention]: 3.95e-06 [overlap_grad_flash_sp]: 1.712e-05 [begin_end_overlap_inline]: 4.80009e-07 [split_matmul_comm_elemetwise]: 2.54999e-06 [split_layernorm_comm]: 1.74e-06 [handle_group_info]: 1.08001e-06 [symbol_engine_optimizer]: 7.144e-05, [1] [Cycle 1]: 6.731e-05, [6] [build]: 2.18998e-06 [elim_shapecalc]: 9.74e-06 [elim_not_effective]: 1.197e-05 [opt_reshape]: 7.12002e-06 [fold_const_symbol]: 9.64999e-06 [renormalize]: 2.50002e-07 [detach_backward]: 1.60001e-06 [pipeline_parallel_scheduler]: 1.41998e-06 [auto_monad_reorder]: 1.877e-05 [get_jit_bprop_graph]: 1.00999e-06 [rewriter_after_jit_bprop_graph]: 3.31001e-06 [opt_after_jit_grad]: 0.00045788 [validate]: 3.496e-05 [backend_pass]: 8.89995e-07 [task_emit]: 0.00830113 [execute]: 7.71001e-06 Sums bootstrap : 0.000465s : 1.63% type_inference : 0.015373s : 53.90% event_method : 0.000018s : 0.06% auto_monad : 0.000082s : 0.29% graph_reusing : 0.000006s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000019s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000031s : 0.11% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000004s : 0.02% optimize.rewriter_before_opt_a : 0.000069s : 0.24% optimize.opt_a.expand_dump_flag : 0.000004s : 0.01% optimize.opt_a.switch_simplify : 0.000100s : 0.35% optimize.opt_a.loop_unroll : 0.000033s : 0.12% optimize.opt_a.a_1 : 0.000627s : 2.20% optimize.opt_a.with_stream_mark : 0.000025s : 0.09% optimize.opt_a.recompute_prepare : 0.000014s : 0.05% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000155s : 0.54% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.05% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000013s : 0.04% optimize.opt_a.merge_send_recv : 0.000012s : 0.04% optimize.opt_a.auto_parallel : 0.000011s : 0.04% optimize.opt_a.parallel : 0.000030s : 0.11% optimize.opt_a.flash_sp : 0.000011s : 0.04% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.05% optimize.opt_a.virtual_dataset : 0.000013s : 0.04% optimize.opt_a.get_grad_eliminate_ : 0.000012s : 0.04% optimize.opt_a.virtual_output : 0.000012s : 0.04% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000015s : 0.05% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000023s : 0.08% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000019s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000021s : 0.07% optimize.opt_a.a_after_grad : 0.000019s : 0.07% optimize.opt_a.renormalize : 0.000634s : 2.22% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.07% optimize.opt_a.cse : 0.000047s : 0.17% optimize.opt_a.a_3 : 0.000085s : 0.30% optimize.py_interpret_to_execute_after_opt_a : 0.000015s : 0.05% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000018s : 0.06% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000461s : 1.62% optimize.opt_b.b_1 : 0.000130s : 0.46% optimize.opt_b.b_2 : 0.000009s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000017s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000023s : 0.08% optimize.loop_unroll : 0.000481s : 1.69% optimize.opt_after_cconv.c_1 : 0.000031s : 0.11% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000021s : 0.07% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000015s : 0.05% optimize.tuple_transform.d_1 : 0.000042s : 0.15% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.02% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000046s : 0.16% optimize.cse_after_recomputation.cse : 0.000014s : 0.05% optimize.environ_conv : 0.000006s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000005s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000002s : 0.01% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000002s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000004s : 0.01% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000012s : 0.04% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000002s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000017s : 0.06% optimize.begin_end_overlap_inline : 0.000000s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000003s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.04% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.02% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.03% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000001s : 0.00% auto_monad_reorder : 0.000019s : 0.07% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000458s : 1.61% validate : 0.000035s : 0.12% backend_pass : 0.000001s : 0.00% task_emit : 0.008301s : 29.11% execute : 0.000008s : 0.03% Time group info: ------[substitution.] 0.000152 28 1.18% : 0.000002s : 2: substitution.elim_not_effective 0.83% : 0.000001s : 2: substitution.fold_const_symbol 3.64% : 0.000006s : 4: substitution.graph_param_transform 80.88% : 0.000123s : 6: substitution.inline 2.00% : 0.000003s : 4: substitution.j_node_and_user_rematch 2.79% : 0.000004s : 4: substitution.remove_not_recompute_node 2.80% : 0.000004s : 4: substitution.replace_old_param 5.88% : 0.000009s : 2: substitution.switch_simplify ------[type_inference.] 0.015323 2 93.66% : 0.014351s : 1: type_inference.infer 6.34% : 0.000972s : 1: type_inference.specialize ------[replace.] 0.000068 8 56.08% : 0.000038s : 6: replace.inline 43.92% : 0.000030s : 2: replace.switch_simplify ------[match.] 0.000127 8 94.17% : 0.000119s : 6: match.inline 5.83% : 0.000007s : 2: match.switch_simplify ------[predicate.] 0.000179 1228 0.91% : 0.000002s : 13: predicate.accumulaten_eliminater 0.88% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.61% : 0.000001s : 8: predicate.addn_check_dump 0.95% : 0.000002s : 13: predicate.addn_zero_filter 0.84% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.05% : 0.000004s : 21: predicate.arithmetic_simplify 0.97% : 0.000002s : 13: predicate.cast_eliminate 0.64% : 0.000001s : 8: predicate.check_bprop_eliminate 0.55% : 0.000001s : 8: predicate.compare_switch_simplify 0.20% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 1.00% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.15% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.90% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.97% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.25% : 0.000000s : 4: predicate.elim_not_effective 0.37% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.20% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.15% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.18% : 0.000002s : 17: predicate.environ_get_depend_swap 1.72% : 0.000003s : 25: predicate.environ_get_eliminate 1.10% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.38% : 0.000002s : 19: predicate.exchange_switch_depend_value 2.19% : 0.000004s : 19: predicate.float_depend_g_call 0.56% : 0.000001s : 8: predicate.float_environ_get_switch 0.79% : 0.000001s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.76% : 0.000001s : 8: predicate.get_grad_eliminate 0.23% : 0.000000s : 4: predicate.graph_param_transform 0.60% : 0.000001s : 8: predicate.incorporate_call 0.52% : 0.000001s : 8: predicate.incorporate_call_switch 5.86% : 0.000011s : 56: predicate.inline 0.77% : 0.000001s : 8: predicate.inline_without_move 0.35% : 0.000001s : 8: predicate.j_node_and_user_rematch 1.04% : 0.000002s : 8: predicate.less_batch_normalization 1.62% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 2.29% : 0.000004s : 34: predicate.load_eliminater 1.08% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.55% : 0.000005s : 32: predicate.loop_unroll_before_grad 1.59% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.62% : 0.000001s : 8: predicate.merge_addn 0.62% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.77% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.82% : 0.000001s : 13: predicate.minmaximum_grad 1.06% : 0.000002s : 4: predicate.mutable_eliminate 0.35% : 0.000001s : 4: predicate.opt_reshape 0.38% : 0.000001s : 4: predicate.parallel_virtual_node 1.84% : 0.000003s : 19: predicate.partial_defer_inline 1.26% : 0.000002s : 17: predicate.partial_eliminate 0.99% : 0.000002s : 13: predicate.print_const_string_wrapper 0.61% : 0.000001s : 8: predicate.reduce_all_const_elim 1.22% : 0.000002s : 13: predicate.reduce_eliminate 2.24% : 0.000004s : 34: predicate.redundant_stop_gradient_eliminater 0.56% : 0.000001s : 8: predicate.remove_not_recompute_node 1.28% : 0.000002s : 21: predicate.replace_applicator 0.61% : 0.000001s : 8: predicate.replace_old_param 0.29% : 0.000001s : 4: predicate.reset_defer_inline 0.97% : 0.000002s : 13: predicate.reshape_eliminate 0.73% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.45% : 0.000001s : 4: predicate.row_tensor_eliminate 0.71% : 0.000001s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.89% : 0.000002s : 8: predicate.shard_identity_eliminate 0.67% : 0.000001s : 8: predicate.special_op_eliminate 0.68% : 0.000001s : 8: predicate.specialize_transform 0.93% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.84% : 0.000002s : 8: predicate.stack_unstack_eliminate 0.60% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.51% : 0.000003s : 19: predicate.switch_defer_inline 2.14% : 0.000004s : 27: predicate.switch_layer_defer_inline 5.73% : 0.000010s : 67: predicate.switch_simplify 0.91% : 0.000002s : 13: predicate.tile_eliminate 1.00% : 0.000002s : 13: predicate.transpose_eliminate 1.51% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.58% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.89% : 0.000005s : 29: predicate.tuple_list_get_item_eliminator 1.58% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.32% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.61% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 2.24% : 0.000004s : 34: predicate.updatestate_pure_node_eliminater 2.99% : 0.000005s : 42: predicate.updatestate_useless_node_eliminater 0.38% : 0.000001s : 4: predicate.value_based_eliminate 0.69% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 8: predicate.virtual_output_eliminate 0.24% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.47% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000777 12 60.38% : 0.000469s : 4: func_graph_cloner_run.FuncGraphClonerGraph 39.62% : 0.000308s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.042116 196 0.01% : 0.000004s : 1: ForceFp32Comm 7.46% : 0.003141s : 1: add_attr 7.44% : 0.003132s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.12% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.21% : 0.000087s : 1: auto_monad 0.05% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000005s : 1: backend_pass 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.18% : 0.000499s : 1: bootstrap 0.06% : 0.000027s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.06% : 0.000027s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.05% : 0.000022s : 1: event_method 0.03% : 0.000012s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000005s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.16% : 0.000490s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.11% : 0.000469s : 1: mutable_eliminate 0.02% : 0.000007s : 1: offloading_packed_experts 0.03% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000016s : 1: opt.transform.mutable_eliminate 2.60% : 0.001095s : 78: opt.transform.opt_a 0.07% : 0.000030s : 1: opt.transform.opt_after_cconv 0.06% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.26% : 0.000108s : 28: opt.transform.opt_b 0.11% : 0.000047s : 2: opt.transform.opt_trans_graph 0.08% : 0.000035s : 4: opt.transform.symbol_engine_opt 5.96% : 0.002510s : 1: opt_a 0.25% : 0.000106s : 1: opt_after_cconv 1.11% : 0.000467s : 1: opt_after_jit_grad 0.50% : 0.000212s : 1: opt_b 10.59% : 0.004461s : 1: optimize 0.05% : 0.000020s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.05% : 0.000020s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000004s : 1: overlap_opt_shard_grad_in_pipeline 0.02% : 0.000007s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.08% : 0.000035s : 1: pre_auto_parallel 0.02% : 0.000008s : 1: py_interpret_to_execute 0.05% : 0.000019s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.04% : 0.000019s : 1: remove_dup_value 0.74% : 0.000310s : 1: renormalize.infer 0.75% : 0.000317s : 1: renormalize.specialize 0.01% : 0.000006s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000022s : 1: rewriter_after_opt_a 0.17% : 0.000073s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000008s : 1: swap_dp_allreduce_reducescatter 0.18% : 0.000074s : 1: symbol_engine_optimizer 19.73% : 0.008311s : 1: task_emit 0.18% : 0.000076s : 1: tuple_transform 36.53% : 0.015386s : 1: type_inference 0.15% : 0.000062s : 1: validate TotalTime = 0.0590366, [33] [bootstrap]: 0.00028772 [type_inference]: 0.0358148 [event_method]: 0.00021682 [auto_monad]: 0.00013134 [graph_reusing]: 8.2e-06 [pre_auto_parallel]: 3.34001e-06 [py_interpret_to_execute]: 5.018e-05 [rewriter_before_opt_a]: 0.0001439 [expand_dump_flag]: 3.6e-06 [jit_opt_a]: 0.0128116, [3] [Cycle 1]: 0.00724799, [27] [switch_simplify]: 0.00018178 [loop_unroll]: 6.289e-05 [a_1]: 0.00132459 [with_stream_mark]: 2.173e-05 [recompute_prepare]: 2.087e-05 [updatestate_depend_eliminate]: 8.55001e-06 [updatestate_assign_eliminate]: 6.38e-06 [updatestate_loads_eliminate]: 6.09001e-06 [parameter_eliminate]: 1.99999e-06 [specialize_transform]: 1.573e-05 [updatestate_useless_node_eliminater]: 1.448e-05 [accelerated_algorithm]: 1.425e-05 [meta_shard_fg_expand]: 3.95e-06 [get_grad_eliminate_]: 1.435e-05 [merge_forward]: 7.91001e-06 [cell_reuse_recompute_pass]: 8.70001e-07 [cell_reuse_handle_not_recompute_node_pass]: 2.677e-05 [j_node_and_user_rematch]: 2.456e-05 [meta_fg_expand]: 0.0016007 [replace_old_param]: 6.342e-05 [inline_without_move]: 5.873e-05 [renormalize]: 0.00314234 [add_forward_monad_depend]: 9.40001e-06 [auto_monad_grad]: 5.32001e-06 [auto_monad_eliminator]: 5.249e-05 [cse]: 0.00027067 [replace_applicator]: 7.446e-05 [Cycle 2]: 0.00230488, [27] [switch_simplify]: 4.472e-05 [loop_unroll]: 4.174e-05 [a_1]: 0.00118228 [with_stream_mark]: 1.18e-05 [recompute_prepare]: 8.84e-06 [updatestate_depend_eliminate]: 3.5e-06 [updatestate_assign_eliminate]: 2.73998e-06 [updatestate_loads_eliminate]: 2.69999e-06 [parameter_eliminate]: 9.80013e-07 [specialize_transform]: 7e-06 [updatestate_useless_node_eliminater]: 6.48e-06 [accelerated_algorithm]: 7.41001e-06 [meta_shard_fg_expand]: 1.64e-06 [get_grad_eliminate_]: 6.09999e-06 [merge_forward]: 3.00002e-06 [cell_reuse_recompute_pass]: 8.89995e-07 [cell_reuse_handle_not_recompute_node_pass]: 1.223e-05 [j_node_and_user_rematch]: 9.44e-06 [meta_fg_expand]: 0.00013324 [replace_old_param]: 1.439e-05 [inline_without_move]: 7.05e-06 [renormalize]: 0.0005915 [add_forward_monad_depend]: 4.25999e-06 [auto_monad_grad]: 1.17e-06 [auto_monad_eliminator]: 1.077e-05 [cse]: 2.087e-05 [replace_applicator]: 1.377e-05 [Cycle 3]: 0.00037949, [27] [switch_simplify]: 7.73999e-06 [loop_unroll]: 6.79999e-06 [a_1]: 0.00012405 [with_stream_mark]: 8.94e-06 [recompute_prepare]: 6.72002e-06 [updatestate_depend_eliminate]: 3.16999e-06 [updatestate_assign_eliminate]: 2.61999e-06 [updatestate_loads_eliminate]: 2.58998e-06 [parameter_eliminate]: 9.79984e-07 [specialize_transform]: 6.48998e-06 [updatestate_useless_node_eliminater]: 6.24001e-06 [accelerated_algorithm]: 6.83998e-06 [meta_shard_fg_expand]: 1.38002e-06 [get_grad_eliminate_]: 6.11998e-06 [merge_forward]: 3.03e-06 [cell_reuse_recompute_pass]: 1.55001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.406e-05 [j_node_and_user_rematch]: 9.39e-06 [meta_fg_expand]: 2.16e-06 [replace_old_param]: 9.99999e-06 [inline_without_move]: 6.28998e-06 [renormalize]: 8.00064e-08 [add_forward_monad_depend]: 1.15999e-06 [auto_monad_grad]: 7.09988e-07 [auto_monad_eliminator]: 6.16e-06 [cse]: 1.491e-05 [replace_applicator]: 6.32001e-06 [py_interpret_to_execute_after_opt_a]: 8.65001e-06 [rewriter_after_opt_a]: 3.031e-05 [convert_after_rewriter]: 6.71999e-06 [order_py_execute_after_rewriter]: 5.22999e-06 [mutable_eliminate]: 0.00047235 [jit_opt_b]: 5.661e-05, [1] [Cycle 1]: 5.011e-05, [2] [frontend_op_eliminate]: 1.935e-05 [inline_after_opt_a]: 1.916e-05 [cconv]: 1.706e-05 [loop_unroll]: 0.00042588 [jit_opt_after_cconv]: 0.0001559, [1] [Cycle 1]: 0.00014952, [11] [c_1]: 2.786e-05 [parameter_eliminate]: 2.22001e-06 [updatestate_depend_eliminate]: 5.84999e-06 [updatestate_assign_eliminate]: 3.45e-06 [updatestate_loads_eliminate]: 2.63003e-06 [cse]: 2.268e-05 [call_graph_tuple_transform]: 1.952e-05 [tuple_list_get_item_eliminator]: 7.1e-06 [none_parameter_eliminate]: 9.00007e-07 [renormalize]: 4.09986e-07 [switch_simplify]: 6.86001e-06 [remove_dup_value]: 1.328e-05 [partial_unused_args_eliminate]: 1.52999e-06 [environ_conv]: 5.44e-06 [add_recomputation]: 3.068e-05 [cse_after_recomputation]: 4.023e-05, [1] [Cycle 1]: 2.015e-05, [1] [cse]: 1.451e-05 [auto_monad_reorder]: 1.348e-05 [get_jit_bprop_graph]: 1.40999e-06 [rewriter_after_jit_bprop_graph]: 4.42998e-06 [opt_after_jit_grad]: 0.00047387 [symbol_engine_optimizer]: 8.156e-05, [1] [Cycle 1]: 7.548e-05, [6] [build]: 3.01999e-06 [elim_shapecalc]: 9.54999e-06 [elim_not_effective]: 1.507e-05 [opt_reshape]: 7.70998e-06 [fold_const_symbol]: 1.04e-05 [renormalize]: 3.39991e-07 [validate]: 2.98e-05 [backend_pass]: 7.7e-07 [task_emit]: 0.00750337 [execute]: 4.97999e-06 Sums bootstrap : 0.000288s : 0.52% type_inference : 0.035815s : 64.69% event_method : 0.000217s : 0.39% auto_monad : 0.000131s : 0.24% graph_reusing : 0.000008s : 0.01% pre_auto_parallel : 0.000003s : 0.01% py_interpret_to_execute : 0.000050s : 0.09% rewriter_before_opt_a : 0.000144s : 0.26% expand_dump_flag : 0.000004s : 0.01% jit_opt_a.switch_simplify : 0.000234s : 0.42% jit_opt_a.loop_unroll : 0.000111s : 0.20% jit_opt_a.a_1 : 0.002631s : 4.75% jit_opt_a.with_stream_mark : 0.000042s : 0.08% jit_opt_a.recompute_prepare : 0.000036s : 0.07% jit_opt_a.updatestate_depend_eliminate : 0.000015s : 0.03% jit_opt_a.updatestate_assign_eliminate : 0.000012s : 0.02% jit_opt_a.updatestate_loads_eliminate : 0.000011s : 0.02% jit_opt_a.parameter_eliminate : 0.000004s : 0.01% jit_opt_a.specialize_transform : 0.000029s : 0.05% jit_opt_a.updatestate_useless_node_eliminater : 0.000027s : 0.05% jit_opt_a.accelerated_algorithm : 0.000028s : 0.05% jit_opt_a.meta_shard_fg_expand : 0.000007s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000027s : 0.05% jit_opt_a.merge_forward : 0.000014s : 0.03% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000053s : 0.10% jit_opt_a.j_node_and_user_rematch : 0.000043s : 0.08% jit_opt_a.meta_fg_expand : 0.001736s : 3.14% jit_opt_a.replace_old_param : 0.000088s : 0.16% jit_opt_a.inline_without_move : 0.000072s : 0.13% jit_opt_a.renormalize : 0.003734s : 6.74% jit_opt_a.add_forward_monad_depend : 0.000015s : 0.03% jit_opt_a.auto_monad_grad : 0.000007s : 0.01% jit_opt_a.auto_monad_eliminator : 0.000069s : 0.13% jit_opt_a.cse : 0.000306s : 0.55% jit_opt_a.replace_applicator : 0.000095s : 0.17% py_interpret_to_execute_after_opt_a : 0.000009s : 0.02% rewriter_after_opt_a : 0.000030s : 0.05% convert_after_rewriter : 0.000007s : 0.01% order_py_execute_after_rewriter : 0.000005s : 0.01% mutable_eliminate : 0.000472s : 0.85% jit_opt_b.frontend_op_eliminate : 0.000019s : 0.03% jit_opt_b.inline_after_opt_a : 0.000019s : 0.03% cconv : 0.000017s : 0.03% loop_unroll : 0.000426s : 0.77% jit_opt_after_cconv.c_1 : 0.000028s : 0.05% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000023s : 0.04% jit_opt_after_cconv.call_graph_tuple_transform : 0.000020s : 0.04% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000007s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000007s : 0.01% remove_dup_value : 0.000013s : 0.02% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000005s : 0.01% add_recomputation : 0.000031s : 0.06% cse_after_recomputation.cse : 0.000015s : 0.03% auto_monad_reorder : 0.000013s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000474s : 0.86% symbol_engine_optimizer.build : 0.000003s : 0.01% symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.02% symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.03% symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.02% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000030s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.007503s : 13.55% execute : 0.000005s : 0.01% Time group info: ------[substitution.] 0.000599 128 0.32% : 0.000002s : 2: substitution.elim_not_effective 0.19% : 0.000001s : 2: substitution.fold_const_symbol 0.65% : 0.000004s : 4: substitution.graph_param_transform 70.87% : 0.000425s : 21: substitution.inline 2.75% : 0.000016s : 2: substitution.inline_without_move 1.25% : 0.000008s : 12: substitution.j_node_and_user_rematch 1.67% : 0.000010s : 7: substitution.minmaximum_grad 1.73% : 0.000010s : 11: substitution.partial_eliminate 1.54% : 0.000009s : 12: substitution.remove_not_recompute_node 3.83% : 0.000023s : 9: substitution.replace_applicator 1.54% : 0.000009s : 14: substitution.replace_old_param 0.44% : 0.000003s : 1: substitution.set_cell_output_no_recompute 2.10% : 0.000013s : 5: substitution.switch_simplify 3.33% : 0.000020s : 7: substitution.tuple_list_convert_item_index_to_positive 2.31% : 0.000014s : 7: substitution.tuple_list_get_item_depend_reorder 5.48% : 0.000033s : 12: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.035708 2 92.94% : 0.033188s : 1: type_inference.infer 7.06% : 0.002519s : 1: type_inference.specialize ------[replace.] 0.000266 31 60.55% : 0.000161s : 21: replace.inline 20.85% : 0.000055s : 5: replace.switch_simplify 18.60% : 0.000049s : 5: replace.tuple_list_get_item_eliminator ------[match.] 0.000432 31 95.34% : 0.000412s : 21: match.inline 2.33% : 0.000010s : 5: match.switch_simplify 2.33% : 0.000010s : 5: match.tuple_list_get_item_eliminator ------[predicate.] 0.000446 3262 1.48% : 0.000007s : 56: predicate.accumulaten_eliminater 0.31% : 0.000001s : 4: predicate.ad_related_special_op_eliminate 1.46% : 0.000007s : 56: predicate.addn_check_dump 1.59% : 0.000007s : 56: predicate.addn_zero_filter 2.24% : 0.000010s : 56: predicate.arithmetic_simplify 1.60% : 0.000007s : 56: predicate.cast_eliminate 0.14% : 0.000001s : 4: predicate.check_bprop_eliminate 1.44% : 0.000006s : 56: predicate.compare_switch_simplify 1.46% : 0.000006s : 56: predicate.depend_value_elim 1.54% : 0.000007s : 56: predicate.dict_get_item_const_eliminator 1.48% : 0.000007s : 56: predicate.dict_get_item_eliminator 1.48% : 0.000007s : 56: predicate.dict_set_item_eliminator 0.22% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.09% : 0.000000s : 4: predicate.elim_not_effective 0.17% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.45% : 0.000006s : 56: predicate.environ_add_const_eliminate 1.43% : 0.000006s : 56: predicate.environ_get_add_eliminate 1.46% : 0.000007s : 56: predicate.environ_get_depend_swap 1.52% : 0.000007s : 56: predicate.environ_get_eliminate 1.51% : 0.000007s : 56: predicate.environ_get_set_eliminate 0.09% : 0.000000s : 4: predicate.fold_const_symbol 0.73% : 0.000003s : 21: predicate.get_grad_eliminate 0.09% : 0.000000s : 4: predicate.graph_param_transform 4.41% : 0.000020s : 90: predicate.inline 1.74% : 0.000008s : 46: predicate.inline_without_move 0.34% : 0.000002s : 21: predicate.j_node_and_user_rematch 0.83% : 0.000004s : 21: predicate.less_batch_normalization 1.80% : 0.000008s : 61: predicate.list_to_tuple_eliminator_ 1.87% : 0.000008s : 65: predicate.load_eliminater 0.43% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.82% : 0.000017s : 120: predicate.loop_unroll_before_grad 1.75% : 0.000008s : 60: predicate.make_slice_get_slice_eliminator 1.46% : 0.000007s : 56: predicate.merge_addn 1.54% : 0.000007s : 56: predicate.minmaximum_grad 0.44% : 0.000002s : 4: predicate.mutable_eliminate 0.16% : 0.000001s : 4: predicate.opt_reshape 2.28% : 0.000010s : 65: predicate.partial_eliminate 1.48% : 0.000007s : 56: predicate.print_const_string_wrapper 1.97% : 0.000009s : 56: predicate.reduce_eliminate 1.67% : 0.000007s : 61: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000002s : 21: predicate.remove_not_recompute_node 2.53% : 0.000011s : 113: predicate.replace_applicator 0.93% : 0.000004s : 46: predicate.replace_old_param 0.09% : 0.000000s : 4: predicate.reset_defer_inline 1.63% : 0.000007s : 56: predicate.reshape_eliminate 1.57% : 0.000007s : 56: predicate.row_tensor_add_zeros_like 0.21% : 0.000001s : 4: predicate.row_tensor_eliminate 1.56% : 0.000007s : 56: predicate.same_eliminate 0.45% : 0.000002s : 21: predicate.set_cell_output_no_recompute 0.28% : 0.000001s : 8: predicate.special_op_eliminate 0.76% : 0.000003s : 21: predicate.specialize_transform 1.82% : 0.000008s : 56: predicate.split_environ_get_set_with_tuple_value 1.57% : 0.000007s : 56: predicate.stack_unstack_eliminate 0.14% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.98% : 0.000013s : 82: predicate.switch_defer_inline 2.69% : 0.000012s : 82: predicate.switch_layer_defer_inline 7.24% : 0.000032s : 216: predicate.switch_simplify 1.51% : 0.000007s : 56: predicate.tile_eliminate 1.54% : 0.000007s : 56: predicate.transpose_eliminate 2.01% : 0.000009s : 56: predicate.tuple_list_convert_item_index_to_positive 1.86% : 0.000008s : 56: predicate.tuple_list_get_item_depend_reorder 2.81% : 0.000013s : 69: predicate.tuple_list_get_item_eliminator 1.92% : 0.000009s : 56: predicate.tuple_list_set_item_eliminator 1.85% : 0.000008s : 61: predicate.tuple_to_list_eliminator_ 1.80% : 0.000008s : 65: predicate.updatestate_pure_node_eliminater 2.66% : 0.000012s : 86: predicate.updatestate_useless_node_eliminater 1.92% : 0.000009s : 56: predicate.value_based_eliminate 0.11% : 0.000000s : 4: predicate.virtual_view_grad_eliminate 0.22% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002571 41 62.22% : 0.001600s : 16: func_graph_cloner_run.FuncGraphClonerGraph 37.78% : 0.000971s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.066336 91 0.05% : 0.000034s : 1: add_recomputation 0.21% : 0.000138s : 1: auto_monad 0.02% : 0.000016s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: backend_pass 0.45% : 0.000298s : 1: bootstrap 0.03% : 0.000020s : 1: cconv 0.01% : 0.000009s : 1: convert_after_rewriter 0.06% : 0.000042s : 1: cse_after_recomputation 0.01% : 0.000008s : 1: environ_conv 0.34% : 0.000224s : 1: event_method 0.01% : 0.000008s : 1: execute 0.01% : 0.000006s : 1: expand_dump_flag 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000011s : 1: graph_reusing 19.32% : 0.012814s : 1: jit_opt_a 0.24% : 0.000159s : 1: jit_opt_after_cconv 0.09% : 0.000059s : 1: jit_opt_b 0.65% : 0.000433s : 1: loop_unroll 0.72% : 0.000480s : 1: mutable_eliminate 5.17% : 0.003432s : 39: opt.transform.jit_opt_a 0.09% : 0.000058s : 4: opt.transform.jit_opt_after_cconv 0.05% : 0.000032s : 4: opt.transform.jit_opt_b 0.02% : 0.000015s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000015s : 1: opt.transform.mutable_eliminate 0.04% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000039s : 4: opt.transform.symbol_engine_opt 0.73% : 0.000481s : 1: opt_after_jit_grad 0.01% : 0.000007s : 1: order_py_execute_after_rewriter 0.01% : 0.000004s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pre_auto_parallel 0.08% : 0.000055s : 1: py_interpret_to_execute 0.02% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000016s : 1: remove_dup_value 2.97% : 0.001970s : 2: renormalize.infer 2.64% : 0.001749s : 2: renormalize.specialize 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000033s : 1: rewriter_after_opt_a 0.22% : 0.000147s : 1: rewriter_before_opt_a 0.13% : 0.000084s : 1: symbol_engine_optimizer 11.32% : 0.007512s : 1: task_emit 54.01% : 0.035825s : 1: type_inference 0.07% : 0.000046s : 1: validate TotalTime = 0.0731011, [24] [bootstrap]: 0.00047638 [type_inference]: 0.0522443 [event_method]: 1.98e-05 [auto_monad]: 8.567e-05 [graph_reusing]: 6.54999e-06 [inline]: 1.92001e-06 [add_attr]: 0.00330765, [1] [add_attr_with_inline]: 0.0032999, [1] [Cycle 1]: 8.059e-05, [2] [tag_attr]: 4.339e-05 [meta_addattr_fg_expand]: 5.92999e-06 [parallel-infer-symbol]: 3.16001e-06 [pre_auto_parallel]: 3.31e-05 [insert-virtual-dataset]: 2.44001e-06 [parallel-infer-symbol-second]: 7.49977e-07 [dataset_repeat_opt]: 2.04999e-06 [pipeline_split]: 1.64e-06 [optimize]: 0.00524063, [53] [py_interpret_to_execute]: 5.15999e-06 [rewriter_before_opt_a]: 7e-05 [opt_a]: 0.00316889, [2] [Cycle 1]: 0.00245899, [45] [expand_dump_flag]: 3.56001e-06 [switch_simplify]: 9.365e-05 [loop_unroll]: 2.759e-05 [a_1]: 0.00054526 [with_stream_mark]: 1.379e-05 [recompute_prepare]: 9.27999e-06 [updatestate_depend_eliminate]: 4.63999e-06 [updatestate_assign_eliminate]: 4.39002e-06 [updatestate_loads_eliminate]: 4.18999e-06 [parameter_eliminate]: 1.71998e-06 [a_2]: 0.00010071 [accelerated_algorithm]: 8.12e-06 [shard]: 2.03002e-06 [meta_shard_fg_expand]: 2.04e-06 [shard_inline]: 7.50998e-06 [merge_send_recv]: 9.53997e-06 [auto_parallel]: 6.89999e-06 [parallel]: 1.717e-05 [flash_sp]: 8.18001e-06 [merge_comm]: 4.32003e-06 [allreduce_fusion]: 4.26001e-06 [matmul_add_comm_reduction]: 1.017e-05 [allreduce_slice_to_reducescatter]: 6.09987e-07 [virtual_shard_identity]: 9.34998e-06 [virtual_dataset]: 7.85e-06 [get_grad_eliminate_]: 7.27002e-06 [virtual_output]: 7.48e-06 [merge_forward]: 5.02999e-06 [cell_reuse_recompute_pass]: 1.39e-06 [offload_activation]: 1.098e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.514e-05 [merge_recompute_call_nodes]: 1.45001e-06 [before_grad]: 1.182e-05 [set_forward_comm_id_for_comm_node_pass]: 4.43999e-06 [meta_fg_expand]: 3.73001e-06 [flash_sp_send_recv_attached]: 2.47001e-06 [receive_attached]: 2.44001e-06 [after_resolve]: 1.239e-05 [a_after_grad]: 1.171e-05 [renormalize]: 0.00110653 [add_forward_monad_depend]: 5.02999e-06 [auto_monad_grad]: 1.66e-06 [auto_monad_eliminator]: 1.787e-05 [cse]: 3.677e-05 [a_3]: 5.553e-05 [Cycle 2]: 0.0007005, [45] [expand_dump_flag]: 1.12e-06 [switch_simplify]: 9.02999e-06 [loop_unroll]: 7.4e-06 [a_1]: 0.00015597 [with_stream_mark]: 1.203e-05 [recompute_prepare]: 7.48999e-06 [updatestate_depend_eliminate]: 3.68999e-06 [updatestate_assign_eliminate]: 3.16999e-06 [updatestate_loads_eliminate]: 3.81999e-06 [parameter_eliminate]: 9.70002e-07 [a_2]: 9.085e-05 [accelerated_algorithm]: 7.42002e-06 [shard]: 9.60019e-07 [meta_shard_fg_expand]: 1.64e-06 [shard_inline]: 7.33999e-06 [merge_send_recv]: 5.20001e-06 [auto_parallel]: 6.03002e-06 [parallel]: 4.17e-06 [flash_sp]: 2.91e-06 [merge_comm]: 3.68e-06 [allreduce_fusion]: 3.55e-06 [matmul_add_comm_reduction]: 6.15002e-06 [allreduce_slice_to_reducescatter]: 3.89991e-07 [virtual_shard_identity]: 8.22998e-06 [virtual_dataset]: 7.01999e-06 [get_grad_eliminate_]: 7.04001e-06 [virtual_output]: 6.67002e-06 [merge_forward]: 3.99002e-06 [cell_reuse_recompute_pass]: 1.57001e-06 [offload_activation]: 6.58e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.48e-05 [merge_recompute_call_nodes]: 6.70028e-07 [before_grad]: 1.072e-05 [set_forward_comm_id_for_comm_node_pass]: 3.91999e-06 [meta_fg_expand]: 2.69001e-06 [flash_sp_send_recv_attached]: 7.50006e-07 [receive_attached]: 9.89996e-07 [after_resolve]: 1.088e-05 [a_after_grad]: 1.06e-05 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.15999e-06 [auto_monad_grad]: 8.00006e-07 [auto_monad_eliminator]: 7.97e-06 [cse]: 1.784e-05 [a_3]: 4.411e-05 [py_interpret_to_execute_after_opt_a]: 4.61002e-06 [slice_cell_reuse_recomputed_activation]: 2.14999e-06 [rewriter_after_opt_a]: 2.041e-05 [convert_after_rewriter]: 1.40001e-06 [order_py_execute_after_rewriter]: 1.58002e-06 [mutable_eliminate]: 0.00047862 [opt_b]: 0.00029519, [1] [Cycle 1]: 0.00028903, [7] [b_1]: 0.00020475 [b_2]: 9.73002e-06 [updatestate_depend_eliminate]: 6.76999e-06 [updatestate_assign_eliminate]: 3.25e-06 [updatestate_loads_eliminate]: 3.34001e-06 [renormalize]: 4.39992e-07 [cse]: 2.38e-05 [optimize_parallel_all_gather_comm]: 1.884e-05 [overlap_param_gather]: 2.58e-06 [cconv]: 2.206e-05 [loop_unroll]: 0.00044994 [opt_after_cconv]: 0.00011414, [1] [Cycle 1]: 0.00010859, [7] [c_1]: 3.652e-05 [parameter_eliminate]: 2.54001e-06 [updatestate_depend_eliminate]: 6.28e-06 [updatestate_assign_eliminate]: 3.37997e-06 [updatestate_loads_eliminate]: 3.17002e-06 [cse]: 2.381e-05 [renormalize]: 4.69998e-07 [remove_dup_value]: 1.908e-05 [tuple_transform]: 8.305e-05, [1] [Cycle 1]: 7.846e-05, [4] [d_1]: 5.091e-05 [none_parameter_eliminate]: 1.57001e-06 [renormalize]: 1.39989e-07 [switch_simplify]: 8.2e-06 [partial_unused_args_eliminate]: 2.29999e-06 [add_recomputation]: 5.451e-05 [cse_after_recomputation]: 2.559e-05, [1] [Cycle 1]: 2.131e-05, [1] [cse]: 1.616e-05 [environ_conv]: 6.05002e-06 [swap_dp_allreduce_reducescatter]: 5.89999e-06 [bias_add_comm_swap]: 3.21999e-06 [label_micro_interleaved_index]: 4.35e-06 [label_fine_grained_interleaved_index]: 2.73e-06 [merge_cast_opt]: 1.41998e-06 [slice_recompute_activation]: 2.04999e-06 [micro_interleaved_order_control]: 2.79001e-06 [assign_add_opt]: 1.20999e-06 [ForceFp32Comm]: 7.2e-07 [remove_cast_before_assign_add]: 1.10999e-06 [full_micro_interleaved_order_control]: 2.36e-06 [reorder_send_recv_between_fp_bp]: 2.81e-06 [comm_op_add_attrs]: 1.33002e-06 [add_comm_op_reuse_tag]: 1.19e-06 [interleave_split_concat_branches]: 1.47999e-06 [interleave_parallel_branches]: 1.07e-06 [overlap_opt_shard_in_pipeline]: 1.14e-06 [overlap_opt_shard_grad_in_pipeline]: 1.89e-06 [control_data_broadcast_order]: 1.431e-05 [grouped_pairwise_exchange_alltoall]: 1.61998e-06 [offloading_packed_experts]: 4.07e-06 [overlap_recompute_and_grad_model_parallel]: 5.19e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.33002e-06 [overlap_recompute_comm]: 2.81e-06 [overlap_grad_ring_attention]: 5.00999e-06 [overlap_grad_flash_sp]: 1.911e-05 [begin_end_overlap_inline]: 5.09986e-07 [split_matmul_comm_elemetwise]: 2.41e-06 [split_layernorm_comm]: 1.68002e-06 [handle_group_info]: 9.60019e-07 [symbol_engine_optimizer]: 8.043e-05, [1] [Cycle 1]: 7.638e-05, [6] [build]: 2.63e-06 [elim_shapecalc]: 1.108e-05 [elim_not_effective]: 1.48e-05 [opt_reshape]: 8.35001e-06 [fold_const_symbol]: 1.192e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.96e-06 [pipeline_parallel_scheduler]: 1.60001e-06 [auto_monad_reorder]: 2.084e-05 [get_jit_bprop_graph]: 1.22e-06 [rewriter_after_jit_bprop_graph]: 3.66999e-06 [opt_after_jit_grad]: 0.00048004 [validate]: 3.929e-05 [backend_pass]: 9.39996e-07 [task_emit]: 0.0109183 [execute]: 6.62002e-06 Sums bootstrap : 0.000476s : 0.69% type_inference : 0.052244s : 75.87% event_method : 0.000020s : 0.03% auto_monad : 0.000086s : 0.12% graph_reusing : 0.000007s : 0.01% inline : 0.000002s : 0.00% add_attr.add_attr_with_inline.tag_attr : 0.000043s : 0.06% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.01% parallel-infer-symbol : 0.000003s : 0.00% pre_auto_parallel : 0.000033s : 0.05% insert-virtual-dataset : 0.000002s : 0.00% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.00% pipeline_split : 0.000002s : 0.00% optimize.py_interpret_to_execute : 0.000005s : 0.01% optimize.rewriter_before_opt_a : 0.000070s : 0.10% optimize.opt_a.expand_dump_flag : 0.000005s : 0.01% optimize.opt_a.switch_simplify : 0.000103s : 0.15% optimize.opt_a.loop_unroll : 0.000035s : 0.05% optimize.opt_a.a_1 : 0.000701s : 1.02% optimize.opt_a.with_stream_mark : 0.000026s : 0.04% optimize.opt_a.recompute_prepare : 0.000017s : 0.02% optimize.opt_a.updatestate_depend_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_assign_eliminate : 0.000008s : 0.01% optimize.opt_a.updatestate_loads_eliminate : 0.000008s : 0.01% optimize.opt_a.parameter_eliminate : 0.000003s : 0.00% optimize.opt_a.a_2 : 0.000192s : 0.28% optimize.opt_a.accelerated_algorithm : 0.000016s : 0.02% optimize.opt_a.shard : 0.000003s : 0.00% optimize.opt_a.meta_shard_fg_expand : 0.000004s : 0.01% optimize.opt_a.shard_inline : 0.000015s : 0.02% optimize.opt_a.merge_send_recv : 0.000015s : 0.02% optimize.opt_a.auto_parallel : 0.000013s : 0.02% optimize.opt_a.parallel : 0.000021s : 0.03% optimize.opt_a.flash_sp : 0.000011s : 0.02% optimize.opt_a.merge_comm : 0.000008s : 0.01% optimize.opt_a.allreduce_fusion : 0.000008s : 0.01% optimize.opt_a.matmul_add_comm_reduction : 0.000016s : 0.02% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000018s : 0.03% optimize.opt_a.virtual_dataset : 0.000015s : 0.02% optimize.opt_a.get_grad_eliminate_ : 0.000014s : 0.02% optimize.opt_a.virtual_output : 0.000014s : 0.02% optimize.opt_a.merge_forward : 0.000009s : 0.01% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% optimize.opt_a.offload_activation : 0.000018s : 0.03% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000030s : 0.04% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.00% optimize.opt_a.before_grad : 0.000023s : 0.03% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000008s : 0.01% optimize.opt_a.meta_fg_expand : 0.000006s : 0.01% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.00% optimize.opt_a.receive_attached : 0.000003s : 0.00% optimize.opt_a.after_resolve : 0.000023s : 0.03% optimize.opt_a.a_after_grad : 0.000022s : 0.03% optimize.opt_a.renormalize : 0.001107s : 1.61% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.01% optimize.opt_a.auto_monad_grad : 0.000002s : 0.00% optimize.opt_a.auto_monad_eliminator : 0.000026s : 0.04% optimize.opt_a.cse : 0.000055s : 0.08% optimize.opt_a.a_3 : 0.000100s : 0.14% optimize.py_interpret_to_execute_after_opt_a : 0.000005s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.00% optimize.rewriter_after_opt_a : 0.000020s : 0.03% optimize.convert_after_rewriter : 0.000001s : 0.00% optimize.order_py_execute_after_rewriter : 0.000002s : 0.00% optimize.mutable_eliminate : 0.000479s : 0.70% optimize.opt_b.b_1 : 0.000205s : 0.30% optimize.opt_b.b_2 : 0.000010s : 0.01% optimize.opt_b.updatestate_depend_eliminate : 0.000007s : 0.01% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_b.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000024s : 0.03% optimize.optimize_parallel_all_gather_comm : 0.000019s : 0.03% optimize.overlap_param_gather : 0.000003s : 0.00% optimize.cconv : 0.000022s : 0.03% optimize.loop_unroll : 0.000450s : 0.65% optimize.opt_after_cconv.c_1 : 0.000037s : 0.05% optimize.opt_after_cconv.parameter_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% optimize.opt_after_cconv.cse : 0.000024s : 0.03% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000019s : 0.03% optimize.tuple_transform.d_1 : 0.000051s : 0.07% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.00% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000008s : 0.01% optimize.partial_unused_args_eliminate : 0.000002s : 0.00% optimize.add_recomputation : 0.000055s : 0.08% optimize.cse_after_recomputation.cse : 0.000016s : 0.02% optimize.environ_conv : 0.000006s : 0.01% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.01% optimize.bias_add_comm_swap : 0.000003s : 0.00% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.00% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.00% optimize.micro_interleaved_order_control : 0.000003s : 0.00% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000001s : 0.00% optimize.full_micro_interleaved_order_control : 0.000002s : 0.00% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.00% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.00% optimize.control_data_broadcast_order : 0.000014s : 0.02% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.00% optimize.offloading_packed_experts : 0.000004s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000005s : 0.01% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.00% optimize.overlap_grad_ring_attention : 0.000005s : 0.01% optimize.overlap_grad_flash_sp : 0.000019s : 0.03% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.00% optimize.split_layernorm_comm : 0.000002s : 0.00% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000003s : 0.00% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000011s : 0.02% optimize.symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% optimize.symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000012s : 0.02% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.00% pipeline_parallel_scheduler : 0.000002s : 0.00% auto_monad_reorder : 0.000021s : 0.03% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000480s : 0.70% validate : 0.000039s : 0.06% backend_pass : 0.000001s : 0.00% task_emit : 0.010918s : 15.86% execute : 0.000007s : 0.01% Time group info: ------[substitution.] 0.000165 35 1.36% : 0.000002s : 3: substitution.elim_not_effective 1.04% : 0.000002s : 3: substitution.fold_const_symbol 3.71% : 0.000006s : 5: substitution.graph_param_transform 79.99% : 0.000132s : 6: substitution.inline 2.33% : 0.000004s : 6: substitution.j_node_and_user_rematch 3.74% : 0.000006s : 6: substitution.remove_not_recompute_node 2.31% : 0.000004s : 4: substitution.replace_old_param 5.52% : 0.000009s : 2: substitution.switch_simplify ------[type_inference.] 0.052191 2 96.51% : 0.050370s : 1: type_inference.infer 3.49% : 0.001821s : 1: type_inference.specialize ------[replace.] 0.000070 8 55.60% : 0.000039s : 6: replace.inline 44.40% : 0.000031s : 2: replace.switch_simplify ------[match.] 0.000135 8 94.37% : 0.000128s : 6: match.inline 5.63% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000211 1454 1.01% : 0.000002s : 15: predicate.accumulaten_eliminater 0.95% : 0.000002s : 5: predicate.ad_related_special_op_eliminate 0.59% : 0.000001s : 10: predicate.addn_check_dump 0.94% : 0.000002s : 15: predicate.addn_zero_filter 0.82% : 0.000002s : 15: predicate.adjust_all_reduce_mul_add 2.10% : 0.000004s : 25: predicate.arithmetic_simplify 0.96% : 0.000002s : 15: predicate.cast_eliminate 0.64% : 0.000001s : 10: predicate.check_bprop_eliminate 0.62% : 0.000001s : 10: predicate.compare_switch_simplify 0.21% : 0.000000s : 5: predicate.const_output_eliminate 0.62% : 0.000001s : 10: predicate.depend_value_elim 0.94% : 0.000002s : 15: predicate.dict_get_item_const_eliminator 1.05% : 0.000002s : 15: predicate.dict_get_item_eliminator 0.89% : 0.000002s : 15: predicate.dict_set_item_eliminator 1.01% : 0.000002s : 10: predicate.dumpgradient_eliminate 0.31% : 0.000001s : 5: predicate.elim_not_effective 0.41% : 0.000001s : 5: predicate.elim_shapecalc_of_broadcastargs 1.43% : 0.000003s : 20: predicate.environ_add_const_eliminate 1.14% : 0.000002s : 20: predicate.environ_get_add_eliminate 1.15% : 0.000002s : 20: predicate.environ_get_depend_swap 1.84% : 0.000004s : 30: predicate.environ_get_eliminate 1.11% : 0.000002s : 20: predicate.environ_get_set_eliminate 1.25% : 0.000003s : 21: predicate.exchange_switch_depend_value 2.02% : 0.000004s : 21: predicate.float_depend_g_call 0.61% : 0.000001s : 10: predicate.float_environ_get_switch 0.83% : 0.000002s : 15: predicate.float_tuple_getitem_switch 0.23% : 0.000000s : 5: predicate.fold_const_symbol 0.70% : 0.000001s : 10: predicate.get_grad_eliminate 0.24% : 0.000001s : 5: predicate.graph_param_transform 0.65% : 0.000001s : 10: predicate.incorporate_call 0.58% : 0.000001s : 10: predicate.incorporate_call_switch 5.87% : 0.000012s : 66: predicate.inline 0.82% : 0.000002s : 10: predicate.inline_without_move 0.34% : 0.000001s : 10: predicate.j_node_and_user_rematch 0.88% : 0.000002s : 10: predicate.less_batch_normalization 1.80% : 0.000004s : 25: predicate.list_to_tuple_eliminator_ 2.37% : 0.000005s : 40: predicate.load_eliminater 1.00% : 0.000002s : 5: predicate.loop_unroll_after_grad 2.21% : 0.000005s : 34: predicate.loop_unroll_before_grad 1.79% : 0.000004s : 25: predicate.make_slice_get_slice_eliminator 0.69% : 0.000001s : 10: predicate.merge_addn 0.60% : 0.000001s : 10: predicate.micro_step_allgather_replace 0.63% : 0.000001s : 10: predicate.mini_step_allgather_replace 0.92% : 0.000002s : 15: predicate.minmaximum_grad 1.11% : 0.000002s : 5: predicate.mutable_eliminate 0.42% : 0.000001s : 5: predicate.opt_reshape 0.40% : 0.000001s : 5: predicate.parallel_virtual_node 1.64% : 0.000003s : 21: predicate.partial_defer_inline 1.29% : 0.000003s : 20: predicate.partial_eliminate 0.85% : 0.000002s : 15: predicate.print_const_string_wrapper 0.61% : 0.000001s : 10: predicate.reduce_all_const_elim 1.22% : 0.000003s : 15: predicate.reduce_eliminate 2.31% : 0.000005s : 40: predicate.redundant_stop_gradient_eliminater 0.65% : 0.000001s : 10: predicate.remove_not_recompute_node 1.23% : 0.000003s : 25: predicate.replace_applicator 0.55% : 0.000001s : 10: predicate.replace_old_param 0.28% : 0.000001s : 5: predicate.reset_defer_inline 1.01% : 0.000002s : 15: predicate.reshape_eliminate 0.64% : 0.000001s : 10: predicate.row_tensor_add_zeros_like 0.40% : 0.000001s : 5: predicate.row_tensor_eliminate 0.75% : 0.000002s : 10: predicate.same_eliminate 0.44% : 0.000001s : 10: predicate.set_cell_output_no_recompute 0.77% : 0.000002s : 10: predicate.shard_identity_eliminate 0.80% : 0.000002s : 10: predicate.special_op_eliminate 0.76% : 0.000002s : 10: predicate.specialize_transform 0.91% : 0.000002s : 10: predicate.split_environ_get_set_with_tuple_value 0.78% : 0.000002s : 10: predicate.stack_unstack_eliminate 0.35% : 0.000001s : 5: predicate.switch_call_monad_eliminater 1.41% : 0.000003s : 21: predicate.switch_defer_inline 2.00% : 0.000004s : 31: predicate.switch_layer_defer_inline 5.41% : 0.000011s : 74: predicate.switch_simplify 0.92% : 0.000002s : 15: predicate.tile_eliminate 1.05% : 0.000002s : 15: predicate.transpose_eliminate 1.66% : 0.000004s : 25: predicate.tuple_list_convert_item_index_to_positive 1.74% : 0.000004s : 25: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 25: predicate.tuple_list_get_item_depend_reorder 3.01% : 0.000006s : 35: predicate.tuple_list_get_item_eliminator 1.57% : 0.000003s : 25: predicate.tuple_list_get_set_item_eliminator 2.29% : 0.000005s : 35: predicate.tuple_list_set_item_eliminator 1.63% : 0.000003s : 25: predicate.tuple_to_list_eliminator_ 2.23% : 0.000005s : 40: predicate.updatestate_pure_node_eliminater 2.98% : 0.000006s : 50: predicate.updatestate_useless_node_eliminater 0.36% : 0.000001s : 5: predicate.value_based_eliminate 0.74% : 0.000002s : 10: predicate.virtual_dataset_eliminate 0.67% : 0.000001s : 10: predicate.virtual_output_eliminate 0.27% : 0.000001s : 5: predicate.virtual_view_grad_eliminate 0.60% : 0.000001s : 5: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.001102 15 59.16% : 0.000652s : 7: func_graph_cloner_run.FuncGraphClonerGraph 40.84% : 0.000450s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.084280 196 0.00% : 0.000004s : 1: ForceFp32Comm 3.93% : 0.003312s : 1: add_attr 3.92% : 0.003303s : 1: add_attr_with_inline 0.00% : 0.000004s : 1: add_comm_op_reuse_tag 0.07% : 0.000058s : 1: add_recomputation 0.00% : 0.000004s : 1: assign_add_opt 0.11% : 0.000091s : 1: auto_monad 0.03% : 0.000024s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.00% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 0.60% : 0.000507s : 1: bootstrap 0.03% : 0.000025s : 1: cconv 0.00% : 0.000004s : 1: comm_op_add_attrs 0.02% : 0.000018s : 1: control_data_broadcast_order 0.01% : 0.000005s : 1: convert_after_rewriter 0.03% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.01% : 0.000009s : 1: environ_conv 0.03% : 0.000026s : 1: event_method 0.01% : 0.000011s : 1: execute 0.01% : 0.000006s : 1: full_micro_interleaved_order_control 0.01% : 0.000005s : 1: get_jit_bprop_graph 0.01% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.00% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.00% : 0.000004s : 1: interleave_parallel_branches 0.00% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000006s : 1: label_fine_grained_interleaved_index 0.01% : 0.000007s : 1: label_micro_interleaved_index 0.54% : 0.000458s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000006s : 1: micro_interleaved_order_control 0.58% : 0.000488s : 1: mutable_eliminate 0.01% : 0.000007s : 1: offloading_packed_experts 0.02% : 0.000016s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000017s : 1: opt.transform.mutable_eliminate 1.49% : 0.001255s : 78: opt.transform.opt_a 0.04% : 0.000035s : 1: opt.transform.opt_after_cconv 0.03% : 0.000029s : 1: opt.transform.opt_after_jit_grad 0.19% : 0.000161s : 28: opt.transform.opt_b 0.07% : 0.000057s : 2: opt.transform.opt_trans_graph 0.05% : 0.000043s : 4: opt.transform.symbol_engine_opt 3.76% : 0.003172s : 1: opt_a 0.14% : 0.000118s : 1: opt_after_cconv 0.58% : 0.000490s : 1: opt_after_jit_grad 0.35% : 0.000299s : 1: opt_b 6.22% : 0.005245s : 1: optimize 0.03% : 0.000022s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.03% : 0.000023s : 1: overlap_grad_flash_sp 0.00% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.01% : 0.000008s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.00% : 0.000004s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000006s : 1: overlap_param_gather 0.00% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.01% : 0.000008s : 1: overlap_recompute_and_grad_model_parallel 0.01% : 0.000006s : 1: overlap_recompute_comm 0.01% : 0.000007s : 1: parallel-infer-symbol 0.00% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000005s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.04% : 0.000037s : 1: pre_auto_parallel 0.01% : 0.000009s : 1: py_interpret_to_execute 0.01% : 0.000008s : 1: py_interpret_to_execute_after_opt_a 0.00% : 0.000004s : 1: remove_cast_before_assign_add 0.03% : 0.000023s : 1: remove_dup_value 0.74% : 0.000626s : 1: renormalize.infer 0.56% : 0.000472s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.01% : 0.000007s : 1: rewriter_after_jit_bprop_graph 0.03% : 0.000024s : 1: rewriter_after_opt_a 0.09% : 0.000074s : 1: rewriter_before_opt_a 0.01% : 0.000005s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000004s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.01% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.10% : 0.000083s : 1: symbol_engine_optimizer 12.97% : 0.010930s : 1: task_emit 0.10% : 0.000086s : 1: tuple_transform 62.01% : 0.052258s : 1: type_inference 0.08% : 0.000069s : 1: validate TotalTime = 0.0787724, [33] [bootstrap]: 0.00027301 [type_inference]: 0.0543661 [event_method]: 0.00034384 [auto_monad]: 0.00013968 [graph_reusing]: 8.37e-06 [pre_auto_parallel]: 3.43999e-06 [py_interpret_to_execute]: 4.897e-05 [rewriter_before_opt_a]: 0.00014872 [expand_dump_flag]: 3.64002e-06 [jit_opt_a]: 0.0156609, [3] [Cycle 1]: 0.00990933, [27] [switch_simplify]: 0.00017475 [loop_unroll]: 6.357e-05 [a_1]: 0.00136707 [with_stream_mark]: 2.126e-05 [recompute_prepare]: 2.212e-05 [updatestate_depend_eliminate]: 8.33001e-06 [updatestate_assign_eliminate]: 7.42002e-06 [updatestate_loads_eliminate]: 6.72002e-06 [parameter_eliminate]: 1.86e-06 [specialize_transform]: 1.733e-05 [updatestate_useless_node_eliminater]: 1.508e-05 [accelerated_algorithm]: 1.525e-05 [meta_shard_fg_expand]: 4.09002e-06 [get_grad_eliminate_]: 1.538e-05 [merge_forward]: 8.29002e-06 [cell_reuse_recompute_pass]: 7.7e-07 [cell_reuse_handle_not_recompute_node_pass]: 2.896e-05 [j_node_and_user_rematch]: 2.604e-05 [meta_fg_expand]: 0.00219656 [replace_old_param]: 8.432e-05 [inline_without_move]: 8.15e-05 [renormalize]: 0.00508837 [add_forward_monad_depend]: 8.40001e-06 [auto_monad_grad]: 5.76998e-06 [auto_monad_eliminator]: 5.272e-05 [cse]: 0.00029951 [replace_applicator]: 7.577e-05 [Cycle 2]: 0.00244785, [27] [switch_simplify]: 4.471e-05 [loop_unroll]: 4.167e-05 [a_1]: 0.00116438 [with_stream_mark]: 1.163e-05 [recompute_prepare]: 8.87e-06 [updatestate_depend_eliminate]: 3.68999e-06 [updatestate_assign_eliminate]: 2.83e-06 [updatestate_loads_eliminate]: 2.55002e-06 [parameter_eliminate]: 1.02998e-06 [specialize_transform]: 7.31001e-06 [updatestate_useless_node_eliminater]: 6.59001e-06 [accelerated_algorithm]: 7.03998e-06 [meta_shard_fg_expand]: 1.72001e-06 [get_grad_eliminate_]: 6.17999e-06 [merge_forward]: 2.86999e-06 [cell_reuse_recompute_pass]: 9.70002e-07 [cell_reuse_handle_not_recompute_node_pass]: 1.212e-05 [j_node_and_user_rematch]: 9.42001e-06 [meta_fg_expand]: 0.00014732 [replace_old_param]: 1.422e-05 [inline_without_move]: 7.03e-06 [renormalize]: 0.00073593 [add_forward_monad_depend]: 4.68001e-06 [auto_monad_grad]: 1.08001e-06 [auto_monad_eliminator]: 1.079e-05 [cse]: 2.13e-05 [replace_applicator]: 1.426e-05 [Cycle 3]: 0.00040263, [27] [switch_simplify]: 7.65e-06 [loop_unroll]: 6.67002e-06 [a_1]: 0.00012586 [with_stream_mark]: 8.87e-06 [recompute_prepare]: 6.94001e-06 [updatestate_depend_eliminate]: 3.18e-06 [updatestate_assign_eliminate]: 2.74001e-06 [updatestate_loads_eliminate]: 2.46998e-06 [parameter_eliminate]: 9.89996e-07 [specialize_transform]: 6.55002e-06 [updatestate_useless_node_eliminater]: 6.61999e-06 [accelerated_algorithm]: 7.08e-06 [meta_shard_fg_expand]: 1.35001e-06 [get_grad_eliminate_]: 6.26998e-06 [merge_forward]: 2.96001e-06 [cell_reuse_recompute_pass]: 1.56998e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.424e-05 [j_node_and_user_rematch]: 9.37001e-06 [meta_fg_expand]: 2.24001e-06 [replace_old_param]: 2.415e-05 [inline_without_move]: 6.63e-06 [renormalize]: 1.00001e-07 [add_forward_monad_depend]: 1.30001e-06 [auto_monad_grad]: 7.79983e-07 [auto_monad_eliminator]: 6.39999e-06 [cse]: 1.664e-05 [replace_applicator]: 6.89001e-06 [py_interpret_to_execute_after_opt_a]: 9.80002e-06 [rewriter_after_opt_a]: 3.078e-05 [convert_after_rewriter]: 6.96001e-06 [order_py_execute_after_rewriter]: 5.49e-06 [mutable_eliminate]: 0.00047932 [jit_opt_b]: 5.731e-05, [1] [Cycle 1]: 5.125e-05, [2] [frontend_op_eliminate]: 2.02e-05 [inline_after_opt_a]: 1.934e-05 [cconv]: 1.643e-05 [loop_unroll]: 0.00043013 [jit_opt_after_cconv]: 0.00015695, [1] [Cycle 1]: 0.00015091, [11] [c_1]: 2.823e-05 [parameter_eliminate]: 2.37999e-06 [updatestate_depend_eliminate]: 5.69999e-06 [updatestate_assign_eliminate]: 2.81999e-06 [updatestate_loads_eliminate]: 2.54001e-06 [cse]: 2.247e-05 [call_graph_tuple_transform]: 2.049e-05 [tuple_list_get_item_eliminator]: 7.11999e-06 [none_parameter_eliminate]: 1.10999e-06 [renormalize]: 4.00003e-07 [switch_simplify]: 6.91001e-06 [remove_dup_value]: 1.436e-05 [partial_unused_args_eliminate]: 1.74e-06 [environ_conv]: 5.00999e-06 [add_recomputation]: 3.271e-05 [cse_after_recomputation]: 2.552e-05, [1] [Cycle 1]: 1.99e-05, [1] [cse]: 1.4e-05 [auto_monad_reorder]: 1.274e-05 [get_jit_bprop_graph]: 1.25001e-06 [rewriter_after_jit_bprop_graph]: 4.15e-06 [opt_after_jit_grad]: 0.00047058 [symbol_engine_optimizer]: 7.851e-05, [1] [Cycle 1]: 7.271e-05, [6] [build]: 2.66e-06 [elim_shapecalc]: 9.14998e-06 [elim_not_effective]: 1.457e-05 [opt_reshape]: 7.06001e-06 [fold_const_symbol]: 9.89001e-06 [renormalize]: 4.2998e-07 [validate]: 2.919e-05 [backend_pass]: 8.2e-07 [task_emit]: 0.00571844 [execute]: 6.07001e-06 Sums bootstrap : 0.000273s : 0.36% type_inference : 0.054366s : 72.41% event_method : 0.000344s : 0.46% auto_monad : 0.000140s : 0.19% graph_reusing : 0.000008s : 0.01% pre_auto_parallel : 0.000003s : 0.00% py_interpret_to_execute : 0.000049s : 0.07% rewriter_before_opt_a : 0.000149s : 0.20% expand_dump_flag : 0.000004s : 0.00% jit_opt_a.switch_simplify : 0.000227s : 0.30% jit_opt_a.loop_unroll : 0.000112s : 0.15% jit_opt_a.a_1 : 0.002657s : 3.54% jit_opt_a.with_stream_mark : 0.000042s : 0.06% jit_opt_a.recompute_prepare : 0.000038s : 0.05% jit_opt_a.updatestate_depend_eliminate : 0.000015s : 0.02% jit_opt_a.updatestate_assign_eliminate : 0.000013s : 0.02% jit_opt_a.updatestate_loads_eliminate : 0.000012s : 0.02% jit_opt_a.parameter_eliminate : 0.000004s : 0.01% jit_opt_a.specialize_transform : 0.000031s : 0.04% jit_opt_a.updatestate_useless_node_eliminater : 0.000028s : 0.04% jit_opt_a.accelerated_algorithm : 0.000029s : 0.04% jit_opt_a.meta_shard_fg_expand : 0.000007s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000028s : 0.04% jit_opt_a.merge_forward : 0.000014s : 0.02% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.00% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000055s : 0.07% jit_opt_a.j_node_and_user_rematch : 0.000045s : 0.06% jit_opt_a.meta_fg_expand : 0.002346s : 3.12% jit_opt_a.replace_old_param : 0.000123s : 0.16% jit_opt_a.inline_without_move : 0.000095s : 0.13% jit_opt_a.renormalize : 0.005824s : 7.76% jit_opt_a.add_forward_monad_depend : 0.000014s : 0.02% jit_opt_a.auto_monad_grad : 0.000008s : 0.01% jit_opt_a.auto_monad_eliminator : 0.000070s : 0.09% jit_opt_a.cse : 0.000337s : 0.45% jit_opt_a.replace_applicator : 0.000097s : 0.13% py_interpret_to_execute_after_opt_a : 0.000010s : 0.01% rewriter_after_opt_a : 0.000031s : 0.04% convert_after_rewriter : 0.000007s : 0.01% order_py_execute_after_rewriter : 0.000005s : 0.01% mutable_eliminate : 0.000479s : 0.64% jit_opt_b.frontend_op_eliminate : 0.000020s : 0.03% jit_opt_b.inline_after_opt_a : 0.000019s : 0.03% cconv : 0.000016s : 0.02% loop_unroll : 0.000430s : 0.57% jit_opt_after_cconv.c_1 : 0.000028s : 0.04% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000022s : 0.03% jit_opt_after_cconv.call_graph_tuple_transform : 0.000020s : 0.03% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000007s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000007s : 0.01% remove_dup_value : 0.000014s : 0.02% partial_unused_args_eliminate : 0.000002s : 0.00% environ_conv : 0.000005s : 0.01% add_recomputation : 0.000033s : 0.04% cse_after_recomputation.cse : 0.000014s : 0.02% auto_monad_reorder : 0.000013s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000004s : 0.01% opt_after_jit_grad : 0.000471s : 0.63% symbol_engine_optimizer.build : 0.000003s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.01% symbol_engine_optimizer.elim_not_effective : 0.000015s : 0.02% symbol_engine_optimizer.opt_reshape : 0.000007s : 0.01% symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.01% symbol_engine_optimizer.renormalize : 0.000000s : 0.00% validate : 0.000029s : 0.04% backend_pass : 0.000001s : 0.00% task_emit : 0.005718s : 7.62% execute : 0.000006s : 0.01% Time group info: ------[substitution.] 0.000609 134 0.32% : 0.000002s : 2: substitution.elim_not_effective 0.13% : 0.000001s : 2: substitution.fold_const_symbol 0.75% : 0.000005s : 4: substitution.graph_param_transform 69.99% : 0.000426s : 21: substitution.inline 3.49% : 0.000021s : 3: substitution.inline_without_move 1.30% : 0.000008s : 13: substitution.j_node_and_user_rematch 1.66% : 0.000010s : 7: substitution.minmaximum_grad 1.58% : 0.000010s : 11: substitution.partial_eliminate 1.60% : 0.000010s : 13: substitution.remove_not_recompute_node 3.76% : 0.000023s : 9: substitution.replace_applicator 1.89% : 0.000012s : 17: substitution.replace_old_param 0.41% : 0.000003s : 1: substitution.set_cell_output_no_recompute 2.03% : 0.000012s : 5: substitution.switch_simplify 3.28% : 0.000020s : 7: substitution.tuple_list_convert_item_index_to_positive 2.28% : 0.000014s : 7: substitution.tuple_list_get_item_depend_reorder 5.52% : 0.000034s : 12: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.054272 2 94.63% : 0.051358s : 1: type_inference.infer 5.37% : 0.002914s : 1: type_inference.specialize ------[replace.] 0.000246 31 58.12% : 0.000143s : 21: replace.inline 21.96% : 0.000054s : 5: replace.switch_simplify 19.92% : 0.000049s : 5: replace.tuple_list_get_item_eliminator ------[match.] 0.000435 31 95.29% : 0.000415s : 21: match.inline 2.24% : 0.000010s : 5: match.switch_simplify 2.47% : 0.000011s : 5: match.tuple_list_get_item_eliminator ------[predicate.] 0.000482 3351 6.43% : 0.000031s : 57: predicate.accumulaten_eliminater 0.31% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 1.44% : 0.000007s : 57: predicate.addn_check_dump 1.51% : 0.000007s : 57: predicate.addn_zero_filter 2.07% : 0.000010s : 57: predicate.arithmetic_simplify 1.45% : 0.000007s : 57: predicate.cast_eliminate 0.13% : 0.000001s : 4: predicate.check_bprop_eliminate 1.34% : 0.000006s : 57: predicate.compare_switch_simplify 1.47% : 0.000007s : 57: predicate.depend_value_elim 1.44% : 0.000007s : 57: predicate.dict_get_item_const_eliminator 1.59% : 0.000008s : 57: predicate.dict_get_item_eliminator 1.41% : 0.000007s : 57: predicate.dict_set_item_eliminator 0.18% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.10% : 0.000000s : 4: predicate.elim_not_effective 0.16% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.41% : 0.000007s : 57: predicate.environ_add_const_eliminate 1.41% : 0.000007s : 57: predicate.environ_get_add_eliminate 1.39% : 0.000007s : 57: predicate.environ_get_depend_swap 1.45% : 0.000007s : 57: predicate.environ_get_eliminate 1.42% : 0.000007s : 57: predicate.environ_get_set_eliminate 0.07% : 0.000000s : 4: predicate.fold_const_symbol 0.68% : 0.000003s : 22: predicate.get_grad_eliminate 0.08% : 0.000000s : 4: predicate.graph_param_transform 3.95% : 0.000019s : 91: predicate.inline 2.16% : 0.000010s : 64: predicate.inline_without_move 0.33% : 0.000002s : 22: predicate.j_node_and_user_rematch 0.77% : 0.000004s : 22: predicate.less_batch_normalization 1.70% : 0.000008s : 62: predicate.list_to_tuple_eliminator_ 1.75% : 0.000008s : 66: predicate.load_eliminater 0.36% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.52% : 0.000017s : 121: predicate.loop_unroll_before_grad 1.60% : 0.000008s : 61: predicate.make_slice_get_slice_eliminator 1.41% : 0.000007s : 57: predicate.merge_addn 1.43% : 0.000007s : 57: predicate.minmaximum_grad 0.36% : 0.000002s : 4: predicate.mutable_eliminate 0.15% : 0.000001s : 4: predicate.opt_reshape 2.09% : 0.000010s : 66: predicate.partial_eliminate 1.43% : 0.000007s : 57: predicate.print_const_string_wrapper 1.85% : 0.000009s : 57: predicate.reduce_eliminate 1.65% : 0.000008s : 62: predicate.redundant_stop_gradient_eliminater 0.39% : 0.000002s : 22: predicate.remove_not_recompute_node 2.37% : 0.000011s : 114: predicate.replace_applicator 1.10% : 0.000005s : 64: predicate.replace_old_param 0.11% : 0.000001s : 4: predicate.reset_defer_inline 1.66% : 0.000008s : 57: predicate.reshape_eliminate 1.43% : 0.000007s : 57: predicate.row_tensor_add_zeros_like 0.19% : 0.000001s : 4: predicate.row_tensor_eliminate 1.49% : 0.000007s : 57: predicate.same_eliminate 0.41% : 0.000002s : 22: predicate.set_cell_output_no_recompute 0.28% : 0.000001s : 8: predicate.special_op_eliminate 0.72% : 0.000003s : 22: predicate.specialize_transform 1.69% : 0.000008s : 57: predicate.split_environ_get_set_with_tuple_value 1.43% : 0.000007s : 57: predicate.stack_unstack_eliminate 0.12% : 0.000001s : 4: predicate.switch_call_monad_eliminater 2.90% : 0.000014s : 83: predicate.switch_defer_inline 2.48% : 0.000012s : 83: predicate.switch_layer_defer_inline 6.79% : 0.000033s : 218: predicate.switch_simplify 1.51% : 0.000007s : 57: predicate.tile_eliminate 1.49% : 0.000007s : 57: predicate.transpose_eliminate 1.89% : 0.000009s : 57: predicate.tuple_list_convert_item_index_to_positive 1.69% : 0.000008s : 57: predicate.tuple_list_get_item_depend_reorder 2.65% : 0.000013s : 70: predicate.tuple_list_get_item_eliminator 1.83% : 0.000009s : 57: predicate.tuple_list_set_item_eliminator 1.63% : 0.000008s : 62: predicate.tuple_to_list_eliminator_ 1.65% : 0.000008s : 66: predicate.updatestate_pure_node_eliminater 2.56% : 0.000012s : 88: predicate.updatestate_useless_node_eliminater 1.81% : 0.000009s : 57: predicate.value_based_eliminate 0.11% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.18% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.003271 51 64.85% : 0.002121s : 23: func_graph_cloner_run.FuncGraphClonerGraph 35.15% : 0.001150s : 28: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.088254 91 0.04% : 0.000036s : 1: add_recomputation 0.17% : 0.000146s : 1: auto_monad 0.02% : 0.000015s : 1: auto_monad_reorder 0.00% : 0.000003s : 1: backend_pass 0.32% : 0.000284s : 1: bootstrap 0.02% : 0.000019s : 1: cconv 0.01% : 0.000009s : 1: convert_after_rewriter 0.03% : 0.000028s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: environ_conv 0.40% : 0.000352s : 1: event_method 0.01% : 0.000010s : 1: execute 0.01% : 0.000006s : 1: expand_dump_flag 0.00% : 0.000003s : 1: get_jit_bprop_graph 0.01% : 0.000011s : 1: graph_reusing 17.75% : 0.015663s : 1: jit_opt_a 0.18% : 0.000160s : 1: jit_opt_after_cconv 0.07% : 0.000060s : 1: jit_opt_b 0.50% : 0.000437s : 1: loop_unroll 0.55% : 0.000487s : 1: mutable_eliminate 3.99% : 0.003523s : 39: opt.transform.jit_opt_a 0.07% : 0.000059s : 4: opt.transform.jit_opt_after_cconv 0.04% : 0.000033s : 4: opt.transform.jit_opt_b 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000015s : 1: opt.transform.mutable_eliminate 0.03% : 0.000027s : 1: opt.transform.opt_after_jit_grad 0.04% : 0.000038s : 4: opt.transform.symbol_engine_opt 0.54% : 0.000478s : 1: opt_after_jit_grad 0.01% : 0.000008s : 1: order_py_execute_after_rewriter 0.00% : 0.000004s : 1: partial_unused_args_eliminate 0.01% : 0.000006s : 1: pre_auto_parallel 0.06% : 0.000052s : 1: py_interpret_to_execute 0.01% : 0.000012s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000017s : 1: remove_dup_value 4.05% : 0.003573s : 2: renormalize.infer 2.53% : 0.002237s : 2: renormalize.specialize 0.01% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.04% : 0.000034s : 1: rewriter_after_opt_a 0.17% : 0.000152s : 1: rewriter_before_opt_a 0.09% : 0.000081s : 1: symbol_engine_optimizer 6.49% : 0.005727s : 1: task_emit 61.61% : 0.054377s : 1: type_inference 0.05% : 0.000045s : 1: validate TotalTime = 0.0309601, [24] [bootstrap]: 0.00045612 [type_inference]: 0.015333 [event_method]: 1.863e-05 [auto_monad]: 8.278e-05 [graph_reusing]: 6.67002e-06 [inline]: 2.14999e-06 [add_attr]: 0.00308396, [1] [add_attr_with_inline]: 0.00307635, [1] [Cycle 1]: 5.547e-05, [2] [tag_attr]: 1.954e-05 [meta_addattr_fg_expand]: 5.72999e-06 [parallel-infer-symbol]: 3.49001e-06 [pre_auto_parallel]: 3.195e-05 [insert-virtual-dataset]: 2.83e-06 [parallel-infer-symbol-second]: 7.49977e-07 [dataset_repeat_opt]: 2.15002e-06 [pipeline_split]: 1.66e-06 [optimize]: 0.00440483, [53] [py_interpret_to_execute]: 4.39002e-06 [rewriter_before_opt_a]: 6.811e-05 [opt_a]: 0.0025031, [2] [Cycle 1]: 0.00187716, [45] [expand_dump_flag]: 3.45e-06 [switch_simplify]: 9.206e-05 [loop_unroll]: 2.782e-05 [a_1]: 0.00050557 [with_stream_mark]: 1.4e-05 [recompute_prepare]: 8.34002e-06 [updatestate_depend_eliminate]: 3.61999e-06 [updatestate_assign_eliminate]: 3.56001e-06 [updatestate_loads_eliminate]: 3.11001e-06 [parameter_eliminate]: 1.90001e-06 [a_2]: 8.289e-05 [accelerated_algorithm]: 7.3e-06 [shard]: 1.76e-06 [meta_shard_fg_expand]: 1.85001e-06 [shard_inline]: 6.29001e-06 [merge_send_recv]: 8.09002e-06 [auto_parallel]: 5.97999e-06 [parallel]: 1.716e-05 [flash_sp]: 7.58001e-06 [merge_comm]: 3.62002e-06 [allreduce_fusion]: 3.45e-06 [matmul_add_comm_reduction]: 9.36e-06 [allreduce_slice_to_reducescatter]: 6.40022e-07 [virtual_shard_identity]: 7.63999e-06 [virtual_dataset]: 6.61e-06 [get_grad_eliminate_]: 6.52001e-06 [virtual_output]: 6.79001e-06 [merge_forward]: 3.81999e-06 [cell_reuse_recompute_pass]: 1.14998e-06 [offload_activation]: 1.007e-05 [cell_reuse_handle_not_recompute_node_pass]: 1.25e-05 [merge_recompute_call_nodes]: 1.60999e-06 [before_grad]: 1.015e-05 [set_forward_comm_id_for_comm_node_pass]: 3.56001e-06 [meta_fg_expand]: 3.21001e-06 [flash_sp_send_recv_attached]: 2.47001e-06 [receive_attached]: 2.39999e-06 [after_resolve]: 1.08e-05 [a_after_grad]: 9.54999e-06 [renormalize]: 0.00062791 [add_forward_monad_depend]: 5.15999e-06 [auto_monad_grad]: 1.84e-06 [auto_monad_eliminator]: 1.504e-05 [cse]: 3.28e-05 [a_3]: 4.68e-05 [Cycle 2]: 0.00061699, [45] [expand_dump_flag]: 9.89996e-07 [switch_simplify]: 8.03999e-06 [loop_unroll]: 6.51999e-06 [a_1]: 0.00012537 [with_stream_mark]: 1.108e-05 [recompute_prepare]: 6.49999e-06 [updatestate_depend_eliminate]: 2.94001e-06 [updatestate_assign_eliminate]: 2.64999e-06 [updatestate_loads_eliminate]: 2.87002e-06 [parameter_eliminate]: 1.00001e-06 [a_2]: 7.459e-05 [accelerated_algorithm]: 6.29999e-06 [shard]: 1.04e-06 [meta_shard_fg_expand]: 1.29998e-06 [shard_inline]: 6.06e-06 [merge_send_recv]: 4.47e-06 [auto_parallel]: 5.10999e-06 [parallel]: 4.2e-06 [flash_sp]: 2.97002e-06 [merge_comm]: 3.06999e-06 [allreduce_fusion]: 2.84999e-06 [matmul_add_comm_reduction]: 5.05999e-06 [allreduce_slice_to_reducescatter]: 3.80009e-07 [virtual_shard_identity]: 7.01001e-06 [virtual_dataset]: 6.10002e-06 [get_grad_eliminate_]: 6.12001e-06 [virtual_output]: 5.82001e-06 [merge_forward]: 2.73e-06 [cell_reuse_recompute_pass]: 1.35999e-06 [offload_activation]: 5.79999e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.296e-05 [merge_recompute_call_nodes]: 7.09988e-07 [before_grad]: 8.70999e-06 [set_forward_comm_id_for_comm_node_pass]: 3.16001e-06 [meta_fg_expand]: 1.99e-06 [flash_sp_send_recv_attached]: 7.7e-07 [receive_attached]: 9.30013e-07 [after_resolve]: 9.87999e-06 [a_after_grad]: 9.26002e-06 [renormalize]: 6.99947e-08 [add_forward_monad_depend]: 1.22e-06 [auto_monad_grad]: 9.80013e-07 [auto_monad_eliminator]: 6.12001e-06 [cse]: 1.463e-05 [a_3]: 3.748e-05 [py_interpret_to_execute_after_opt_a]: 3.61001e-06 [slice_cell_reuse_recomputed_activation]: 2.11e-06 [rewriter_after_opt_a]: 1.924e-05 [convert_after_rewriter]: 1.42e-06 [order_py_execute_after_rewriter]: 1.27999e-06 [mutable_eliminate]: 0.0004733 [opt_b]: 0.00020868, [1] [Cycle 1]: 0.00020319, [7] [b_1]: 0.00013203 [b_2]: 7.62998e-06 [updatestate_depend_eliminate]: 5.15999e-06 [updatestate_assign_eliminate]: 2.71999e-06 [updatestate_loads_eliminate]: 2.43e-06 [renormalize]: 4.69998e-07 [cse]: 1.971e-05 [optimize_parallel_all_gather_comm]: 1.548e-05 [overlap_param_gather]: 2.21e-06 [cconv]: 2.165e-05 [loop_unroll]: 0.00042006 [opt_after_cconv]: 0.000103, [1] [Cycle 1]: 9.743e-05, [7] [c_1]: 3.164e-05 [parameter_eliminate]: 2.32001e-06 [updatestate_depend_eliminate]: 5.15001e-06 [updatestate_assign_eliminate]: 2.66e-06 [updatestate_loads_eliminate]: 2.37001e-06 [cse]: 2.03e-05 [renormalize]: 4.30009e-07 [remove_dup_value]: 1.594e-05 [tuple_transform]: 7.305e-05, [1] [Cycle 1]: 6.884e-05, [4] [d_1]: 4.235e-05 [none_parameter_eliminate]: 1.54e-06 [renormalize]: 1.59984e-07 [switch_simplify]: 6.85002e-06 [partial_unused_args_eliminate]: 2.26e-06 [add_recomputation]: 4.592e-05 [cse_after_recomputation]: 2.293e-05, [1] [Cycle 1]: 1.867e-05, [1] [cse]: 1.354e-05 [environ_conv]: 5.44e-06 [swap_dp_allreduce_reducescatter]: 5.91e-06 [bias_add_comm_swap]: 2.88e-06 [label_micro_interleaved_index]: 3.95998e-06 [label_fine_grained_interleaved_index]: 2.56e-06 [merge_cast_opt]: 1.30999e-06 [slice_recompute_activation]: 2.10002e-06 [micro_interleaved_order_control]: 2.73e-06 [assign_add_opt]: 1.19e-06 [ForceFp32Comm]: 9.20001e-07 [remove_cast_before_assign_add]: 1.59e-06 [full_micro_interleaved_order_control]: 2.22999e-06 [reorder_send_recv_between_fp_bp]: 2.74999e-06 [comm_op_add_attrs]: 1.03001e-06 [add_comm_op_reuse_tag]: 9.79984e-07 [interleave_split_concat_branches]: 1.12e-06 [interleave_parallel_branches]: 1.22e-06 [overlap_opt_shard_in_pipeline]: 1.29e-06 [overlap_opt_shard_grad_in_pipeline]: 2.37999e-06 [control_data_broadcast_order]: 1.271e-05 [grouped_pairwise_exchange_alltoall]: 1.52999e-06 [offloading_packed_experts]: 3.33998e-06 [overlap_recompute_and_grad_model_parallel]: 4.42998e-06 [overlap_grad_matmul_and_grad_allreduce]: 1.17999e-06 [overlap_recompute_allgather_and_fa_grad]: 1.34e-06 [overlap_recompute_comm]: 2.75002e-06 [overlap_grad_ring_attention]: 3.97998e-06 [overlap_grad_flash_sp]: 1.705e-05 [begin_end_overlap_inline]: 5.19998e-07 [split_matmul_comm_elemetwise]: 2.11e-06 [split_layernorm_comm]: 1.73002e-06 [handle_group_info]: 9.80013e-07 [symbol_engine_optimizer]: 7.243e-05, [1] [Cycle 1]: 6.847e-05, [6] [build]: 2.38002e-06 [elim_shapecalc]: 9.17999e-06 [elim_not_effective]: 1.225e-05 [opt_reshape]: 6.86001e-06 [fold_const_symbol]: 1.005e-05 [renormalize]: 2.00002e-07 [detach_backward]: 1.78002e-06 [pipeline_parallel_scheduler]: 1.52001e-06 [auto_monad_reorder]: 1.86e-05 [get_jit_bprop_graph]: 1.07e-06 [rewriter_after_jit_bprop_graph]: 3.25998e-06 [opt_after_jit_grad]: 0.00045229 [validate]: 3.468e-05 [backend_pass]: 8.70001e-07 [task_emit]: 0.00681455 [execute]: 6.76999e-06 Sums bootstrap : 0.000456s : 1.69% type_inference : 0.015333s : 56.97% event_method : 0.000019s : 0.07% auto_monad : 0.000083s : 0.31% graph_reusing : 0.000007s : 0.02% inline : 0.000002s : 0.01% add_attr.add_attr_with_inline.tag_attr : 0.000020s : 0.07% add_attr.add_attr_with_inline.meta_addattr_fg_expand : 0.000006s : 0.02% parallel-infer-symbol : 0.000003s : 0.01% pre_auto_parallel : 0.000032s : 0.12% insert-virtual-dataset : 0.000003s : 0.01% parallel-infer-symbol-second : 0.000001s : 0.00% dataset_repeat_opt : 0.000002s : 0.01% pipeline_split : 0.000002s : 0.01% optimize.py_interpret_to_execute : 0.000004s : 0.02% optimize.rewriter_before_opt_a : 0.000068s : 0.25% optimize.opt_a.expand_dump_flag : 0.000004s : 0.02% optimize.opt_a.switch_simplify : 0.000100s : 0.37% optimize.opt_a.loop_unroll : 0.000034s : 0.13% optimize.opt_a.a_1 : 0.000631s : 2.34% optimize.opt_a.with_stream_mark : 0.000025s : 0.09% optimize.opt_a.recompute_prepare : 0.000015s : 0.06% optimize.opt_a.updatestate_depend_eliminate : 0.000007s : 0.02% optimize.opt_a.updatestate_assign_eliminate : 0.000006s : 0.02% optimize.opt_a.updatestate_loads_eliminate : 0.000006s : 0.02% optimize.opt_a.parameter_eliminate : 0.000003s : 0.01% optimize.opt_a.a_2 : 0.000157s : 0.59% optimize.opt_a.accelerated_algorithm : 0.000014s : 0.05% optimize.opt_a.shard : 0.000003s : 0.01% optimize.opt_a.meta_shard_fg_expand : 0.000003s : 0.01% optimize.opt_a.shard_inline : 0.000012s : 0.05% optimize.opt_a.merge_send_recv : 0.000013s : 0.05% optimize.opt_a.auto_parallel : 0.000011s : 0.04% optimize.opt_a.parallel : 0.000021s : 0.08% optimize.opt_a.flash_sp : 0.000011s : 0.04% optimize.opt_a.merge_comm : 0.000007s : 0.02% optimize.opt_a.allreduce_fusion : 0.000006s : 0.02% optimize.opt_a.matmul_add_comm_reduction : 0.000014s : 0.05% optimize.opt_a.allreduce_slice_to_reducescatter : 0.000001s : 0.00% optimize.opt_a.virtual_shard_identity : 0.000015s : 0.05% optimize.opt_a.virtual_dataset : 0.000013s : 0.05% optimize.opt_a.get_grad_eliminate_ : 0.000013s : 0.05% optimize.opt_a.virtual_output : 0.000013s : 0.05% optimize.opt_a.merge_forward : 0.000007s : 0.02% optimize.opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% optimize.opt_a.offload_activation : 0.000016s : 0.06% optimize.opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000025s : 0.09% optimize.opt_a.merge_recompute_call_nodes : 0.000002s : 0.01% optimize.opt_a.before_grad : 0.000019s : 0.07% optimize.opt_a.set_forward_comm_id_for_comm_node_pass : 0.000007s : 0.02% optimize.opt_a.meta_fg_expand : 0.000005s : 0.02% optimize.opt_a.flash_sp_send_recv_attached : 0.000003s : 0.01% optimize.opt_a.receive_attached : 0.000003s : 0.01% optimize.opt_a.after_resolve : 0.000021s : 0.08% optimize.opt_a.a_after_grad : 0.000019s : 0.07% optimize.opt_a.renormalize : 0.000628s : 2.33% optimize.opt_a.add_forward_monad_depend : 0.000006s : 0.02% optimize.opt_a.auto_monad_grad : 0.000003s : 0.01% optimize.opt_a.auto_monad_eliminator : 0.000021s : 0.08% optimize.opt_a.cse : 0.000047s : 0.18% optimize.opt_a.a_3 : 0.000084s : 0.31% optimize.py_interpret_to_execute_after_opt_a : 0.000004s : 0.01% optimize.slice_cell_reuse_recomputed_activation : 0.000002s : 0.01% optimize.rewriter_after_opt_a : 0.000019s : 0.07% optimize.convert_after_rewriter : 0.000001s : 0.01% optimize.order_py_execute_after_rewriter : 0.000001s : 0.00% optimize.mutable_eliminate : 0.000473s : 1.76% optimize.opt_b.b_1 : 0.000132s : 0.49% optimize.opt_b.b_2 : 0.000008s : 0.03% optimize.opt_b.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_b.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_b.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_b.renormalize : 0.000000s : 0.00% optimize.opt_b.cse : 0.000020s : 0.07% optimize.optimize_parallel_all_gather_comm : 0.000015s : 0.06% optimize.overlap_param_gather : 0.000002s : 0.01% optimize.cconv : 0.000022s : 0.08% optimize.loop_unroll : 0.000420s : 1.56% optimize.opt_after_cconv.c_1 : 0.000032s : 0.12% optimize.opt_after_cconv.parameter_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.updatestate_depend_eliminate : 0.000005s : 0.02% optimize.opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% optimize.opt_after_cconv.updatestate_loads_eliminate : 0.000002s : 0.01% optimize.opt_after_cconv.cse : 0.000020s : 0.08% optimize.opt_after_cconv.renormalize : 0.000000s : 0.00% optimize.remove_dup_value : 0.000016s : 0.06% optimize.tuple_transform.d_1 : 0.000042s : 0.16% optimize.tuple_transform.none_parameter_eliminate : 0.000002s : 0.01% optimize.tuple_transform.renormalize : 0.000000s : 0.00% optimize.tuple_transform.switch_simplify : 0.000007s : 0.03% optimize.partial_unused_args_eliminate : 0.000002s : 0.01% optimize.add_recomputation : 0.000046s : 0.17% optimize.cse_after_recomputation.cse : 0.000014s : 0.05% optimize.environ_conv : 0.000005s : 0.02% optimize.swap_dp_allreduce_reducescatter : 0.000006s : 0.02% optimize.bias_add_comm_swap : 0.000003s : 0.01% optimize.label_micro_interleaved_index : 0.000004s : 0.01% optimize.label_fine_grained_interleaved_index : 0.000003s : 0.01% optimize.merge_cast_opt : 0.000001s : 0.00% optimize.slice_recompute_activation : 0.000002s : 0.01% optimize.micro_interleaved_order_control : 0.000003s : 0.01% optimize.assign_add_opt : 0.000001s : 0.00% optimize.ForceFp32Comm : 0.000001s : 0.00% optimize.remove_cast_before_assign_add : 0.000002s : 0.01% optimize.full_micro_interleaved_order_control : 0.000002s : 0.01% optimize.reorder_send_recv_between_fp_bp : 0.000003s : 0.01% optimize.comm_op_add_attrs : 0.000001s : 0.00% optimize.add_comm_op_reuse_tag : 0.000001s : 0.00% optimize.interleave_split_concat_branches : 0.000001s : 0.00% optimize.interleave_parallel_branches : 0.000001s : 0.00% optimize.overlap_opt_shard_in_pipeline : 0.000001s : 0.00% optimize.overlap_opt_shard_grad_in_pipeline : 0.000002s : 0.01% optimize.control_data_broadcast_order : 0.000013s : 0.05% optimize.grouped_pairwise_exchange_alltoall : 0.000002s : 0.01% optimize.offloading_packed_experts : 0.000003s : 0.01% optimize.overlap_recompute_and_grad_model_parallel : 0.000004s : 0.02% optimize.overlap_grad_matmul_and_grad_allreduce : 0.000001s : 0.00% optimize.overlap_recompute_allgather_and_fa_grad : 0.000001s : 0.00% optimize.overlap_recompute_comm : 0.000003s : 0.01% optimize.overlap_grad_ring_attention : 0.000004s : 0.01% optimize.overlap_grad_flash_sp : 0.000017s : 0.06% optimize.begin_end_overlap_inline : 0.000001s : 0.00% optimize.split_matmul_comm_elemetwise : 0.000002s : 0.01% optimize.split_layernorm_comm : 0.000002s : 0.01% optimize.handle_group_info : 0.000001s : 0.00% optimize.symbol_engine_optimizer.build : 0.000002s : 0.01% optimize.symbol_engine_optimizer.elim_shapecalc : 0.000009s : 0.03% optimize.symbol_engine_optimizer.elim_not_effective : 0.000012s : 0.05% optimize.symbol_engine_optimizer.opt_reshape : 0.000007s : 0.03% optimize.symbol_engine_optimizer.fold_const_symbol : 0.000010s : 0.04% optimize.symbol_engine_optimizer.renormalize : 0.000000s : 0.00% detach_backward : 0.000002s : 0.01% pipeline_parallel_scheduler : 0.000002s : 0.01% auto_monad_reorder : 0.000019s : 0.07% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000003s : 0.01% opt_after_jit_grad : 0.000452s : 1.68% validate : 0.000035s : 0.13% backend_pass : 0.000001s : 0.00% task_emit : 0.006815s : 25.32% execute : 0.000007s : 0.03% Time group info: ------[substitution.] 0.000155 28 1.22% : 0.000002s : 2: substitution.elim_not_effective 1.02% : 0.000002s : 2: substitution.fold_const_symbol 3.70% : 0.000006s : 4: substitution.graph_param_transform 80.64% : 0.000125s : 6: substitution.inline 1.93% : 0.000003s : 4: substitution.j_node_and_user_rematch 3.41% : 0.000005s : 4: substitution.remove_not_recompute_node 2.24% : 0.000003s : 4: substitution.replace_old_param 5.83% : 0.000009s : 2: substitution.switch_simplify ------[type_inference.] 0.015283 2 93.54% : 0.014296s : 1: type_inference.infer 6.46% : 0.000987s : 1: type_inference.specialize ------[replace.] 0.000069 8 55.77% : 0.000038s : 6: replace.inline 44.23% : 0.000030s : 2: replace.switch_simplify ------[match.] 0.000129 8 94.07% : 0.000121s : 6: match.inline 5.93% : 0.000008s : 2: match.switch_simplify ------[predicate.] 0.000180 1228 0.92% : 0.000002s : 13: predicate.accumulaten_eliminater 0.90% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 0.55% : 0.000001s : 8: predicate.addn_check_dump 0.91% : 0.000002s : 13: predicate.addn_zero_filter 0.84% : 0.000002s : 13: predicate.adjust_all_reduce_mul_add 2.30% : 0.000004s : 21: predicate.arithmetic_simplify 0.94% : 0.000002s : 13: predicate.cast_eliminate 0.62% : 0.000001s : 8: predicate.check_bprop_eliminate 0.54% : 0.000001s : 8: predicate.compare_switch_simplify 0.21% : 0.000000s : 4: predicate.const_output_eliminate 0.57% : 0.000001s : 8: predicate.depend_value_elim 0.99% : 0.000002s : 13: predicate.dict_get_item_const_eliminator 1.10% : 0.000002s : 13: predicate.dict_get_item_eliminator 0.95% : 0.000002s : 13: predicate.dict_set_item_eliminator 0.98% : 0.000002s : 8: predicate.dumpgradient_eliminate 0.32% : 0.000001s : 4: predicate.elim_not_effective 0.38% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.26% : 0.000002s : 17: predicate.environ_add_const_eliminate 1.13% : 0.000002s : 17: predicate.environ_get_add_eliminate 1.18% : 0.000002s : 17: predicate.environ_get_depend_swap 1.79% : 0.000003s : 25: predicate.environ_get_eliminate 1.21% : 0.000002s : 17: predicate.environ_get_set_eliminate 1.40% : 0.000003s : 19: predicate.exchange_switch_depend_value 2.41% : 0.000004s : 19: predicate.float_depend_g_call 0.55% : 0.000001s : 8: predicate.float_environ_get_switch 0.87% : 0.000002s : 12: predicate.float_tuple_getitem_switch 0.21% : 0.000000s : 4: predicate.fold_const_symbol 0.83% : 0.000002s : 8: predicate.get_grad_eliminate 0.24% : 0.000000s : 4: predicate.graph_param_transform 0.57% : 0.000001s : 8: predicate.incorporate_call 0.49% : 0.000001s : 8: predicate.incorporate_call_switch 5.86% : 0.000011s : 56: predicate.inline 0.77% : 0.000001s : 8: predicate.inline_without_move 0.34% : 0.000001s : 8: predicate.j_node_and_user_rematch 0.78% : 0.000001s : 8: predicate.less_batch_normalization 1.54% : 0.000003s : 21: predicate.list_to_tuple_eliminator_ 2.40% : 0.000004s : 34: predicate.load_eliminater 0.91% : 0.000002s : 4: predicate.loop_unroll_after_grad 2.68% : 0.000005s : 32: predicate.loop_unroll_before_grad 1.70% : 0.000003s : 21: predicate.make_slice_get_slice_eliminator 0.58% : 0.000001s : 8: predicate.merge_addn 0.60% : 0.000001s : 8: predicate.micro_step_allgather_replace 0.61% : 0.000001s : 8: predicate.mini_step_allgather_replace 0.87% : 0.000002s : 13: predicate.minmaximum_grad 1.19% : 0.000002s : 4: predicate.mutable_eliminate 0.42% : 0.000001s : 4: predicate.opt_reshape 0.45% : 0.000001s : 4: predicate.parallel_virtual_node 1.83% : 0.000003s : 19: predicate.partial_defer_inline 1.28% : 0.000002s : 17: predicate.partial_eliminate 0.92% : 0.000002s : 13: predicate.print_const_string_wrapper 0.62% : 0.000001s : 8: predicate.reduce_all_const_elim 1.19% : 0.000002s : 13: predicate.reduce_eliminate 2.25% : 0.000004s : 34: predicate.redundant_stop_gradient_eliminater 0.50% : 0.000001s : 8: predicate.remove_not_recompute_node 1.27% : 0.000002s : 21: predicate.replace_applicator 0.58% : 0.000001s : 8: predicate.replace_old_param 0.23% : 0.000000s : 4: predicate.reset_defer_inline 1.05% : 0.000002s : 13: predicate.reshape_eliminate 0.65% : 0.000001s : 8: predicate.row_tensor_add_zeros_like 0.36% : 0.000001s : 4: predicate.row_tensor_eliminate 0.75% : 0.000001s : 8: predicate.same_eliminate 0.48% : 0.000001s : 8: predicate.set_cell_output_no_recompute 0.76% : 0.000001s : 8: predicate.shard_identity_eliminate 0.90% : 0.000002s : 8: predicate.special_op_eliminate 0.69% : 0.000001s : 8: predicate.specialize_transform 0.83% : 0.000002s : 8: predicate.split_environ_get_set_with_tuple_value 0.76% : 0.000001s : 8: predicate.stack_unstack_eliminate 0.34% : 0.000001s : 4: predicate.switch_call_monad_eliminater 1.52% : 0.000003s : 19: predicate.switch_defer_inline 2.05% : 0.000004s : 27: predicate.switch_layer_defer_inline 5.61% : 0.000010s : 67: predicate.switch_simplify 0.95% : 0.000002s : 13: predicate.tile_eliminate 0.92% : 0.000002s : 13: predicate.transpose_eliminate 1.58% : 0.000003s : 21: predicate.tuple_list_convert_item_index_to_positive 1.62% : 0.000003s : 21: predicate.tuple_list_get_item_const_eliminator 1.48% : 0.000003s : 21: predicate.tuple_list_get_item_depend_reorder 2.92% : 0.000005s : 29: predicate.tuple_list_get_item_eliminator 1.54% : 0.000003s : 21: predicate.tuple_list_get_set_item_eliminator 2.21% : 0.000004s : 29: predicate.tuple_list_set_item_eliminator 1.66% : 0.000003s : 21: predicate.tuple_to_list_eliminator_ 2.25% : 0.000004s : 34: predicate.updatestate_pure_node_eliminater 3.05% : 0.000005s : 42: predicate.updatestate_useless_node_eliminater 0.39% : 0.000001s : 4: predicate.value_based_eliminate 0.72% : 0.000001s : 8: predicate.virtual_dataset_eliminate 0.68% : 0.000001s : 8: predicate.virtual_output_eliminate 0.30% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.44% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.000765 12 59.32% : 0.000454s : 4: func_graph_cloner_run.FuncGraphClonerGraph 40.68% : 0.000311s : 8: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.040372 196 0.01% : 0.000005s : 1: ForceFp32Comm 7.65% : 0.003088s : 1: add_attr 7.63% : 0.003080s : 1: add_attr_with_inline 0.01% : 0.000004s : 1: add_comm_op_reuse_tag 0.12% : 0.000050s : 1: add_recomputation 0.01% : 0.000004s : 1: assign_add_opt 0.22% : 0.000088s : 1: auto_monad 0.06% : 0.000022s : 1: auto_monad_reorder 0.01% : 0.000006s : 1: backend_pass 0.01% : 0.000003s : 1: begin_end_overlap_inline 0.01% : 0.000006s : 1: bias_add_comm_swap 1.21% : 0.000488s : 1: bootstrap 0.06% : 0.000025s : 1: cconv 0.01% : 0.000004s : 1: comm_op_add_attrs 0.04% : 0.000016s : 1: control_data_broadcast_order 0.01% : 0.000004s : 1: convert_after_rewriter 0.06% : 0.000026s : 1: cse_after_recomputation 0.01% : 0.000005s : 1: dataset_repeat_opt 0.01% : 0.000005s : 1: detach_backward 0.02% : 0.000009s : 1: environ_conv 0.06% : 0.000024s : 1: event_method 0.03% : 0.000012s : 1: execute 0.01% : 0.000005s : 1: full_micro_interleaved_order_control 0.01% : 0.000004s : 1: get_jit_bprop_graph 0.02% : 0.000010s : 1: graph_reusing 0.01% : 0.000004s : 1: grouped_pairwise_exchange_alltoall 0.01% : 0.000004s : 1: handle_group_info 0.01% : 0.000005s : 1: inline 0.01% : 0.000006s : 1: insert-virtual-dataset 0.01% : 0.000004s : 1: interleave_parallel_branches 0.01% : 0.000004s : 1: interleave_split_concat_branches 0.01% : 0.000005s : 1: label_fine_grained_interleaved_index 0.02% : 0.000007s : 1: label_micro_interleaved_index 1.06% : 0.000428s : 1: loop_unroll 0.01% : 0.000004s : 1: merge_cast_opt 0.01% : 0.000005s : 1: micro_interleaved_order_control 1.19% : 0.000482s : 1: mutable_eliminate 0.02% : 0.000006s : 1: offloading_packed_experts 0.03% : 0.000013s : 1: opt.transform.loop_unroll_optimizer 0.04% : 0.000015s : 1: opt.transform.mutable_eliminate 2.73% : 0.001103s : 78: opt.transform.opt_a 0.08% : 0.000030s : 1: opt.transform.opt_after_cconv 0.06% : 0.000025s : 1: opt.transform.opt_after_jit_grad 0.27% : 0.000109s : 28: opt.transform.opt_b 0.12% : 0.000047s : 2: opt.transform.opt_trans_graph 0.09% : 0.000035s : 4: opt.transform.symbol_engine_opt 6.21% : 0.002506s : 1: opt_a 0.26% : 0.000106s : 1: opt_after_cconv 1.14% : 0.000461s : 1: opt_after_jit_grad 0.53% : 0.000212s : 1: opt_b 10.92% : 0.004409s : 1: optimize 0.05% : 0.000019s : 1: optimize_parallel_all_gather_comm 0.01% : 0.000004s : 1: order_py_execute_after_rewriter 0.05% : 0.000020s : 1: overlap_grad_flash_sp 0.01% : 0.000004s : 1: overlap_grad_matmul_and_grad_allreduce 0.02% : 0.000007s : 1: overlap_grad_ring_attention 0.01% : 0.000005s : 1: overlap_opt_shard_grad_in_pipeline 0.01% : 0.000005s : 1: overlap_opt_shard_in_pipeline 0.01% : 0.000005s : 1: overlap_param_gather 0.01% : 0.000004s : 1: overlap_recompute_allgather_and_fa_grad 0.02% : 0.000007s : 1: overlap_recompute_and_grad_model_parallel 0.02% : 0.000006s : 1: overlap_recompute_comm 0.02% : 0.000007s : 1: parallel-infer-symbol 0.01% : 0.000004s : 1: parallel-infer-symbol-second 0.01% : 0.000005s : 1: partial_unused_args_eliminate 0.01% : 0.000006s : 1: pipeline_parallel_scheduler 0.01% : 0.000005s : 1: pipeline_split 0.09% : 0.000036s : 1: pre_auto_parallel 0.02% : 0.000008s : 1: py_interpret_to_execute 0.02% : 0.000007s : 1: py_interpret_to_execute_after_opt_a 0.01% : 0.000004s : 1: remove_cast_before_assign_add 0.05% : 0.000019s : 1: remove_dup_value 0.77% : 0.000309s : 1: renormalize.infer 0.77% : 0.000312s : 1: renormalize.specialize 0.01% : 0.000005s : 1: reorder_send_recv_between_fp_bp 0.02% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.06% : 0.000023s : 1: rewriter_after_opt_a 0.18% : 0.000073s : 1: rewriter_before_opt_a 0.05% : 0.000019s : 1: slice_cell_reuse_recomputed_activation 0.01% : 0.000005s : 1: slice_recompute_activation 0.01% : 0.000005s : 1: split_layernorm_comm 0.01% : 0.000005s : 1: split_matmul_comm_elemetwise 0.02% : 0.000009s : 1: swap_dp_allreduce_reducescatter 0.19% : 0.000075s : 1: symbol_engine_optimizer 16.91% : 0.006825s : 1: task_emit 0.19% : 0.000076s : 1: tuple_transform 38.01% : 0.015347s : 1: type_inference 0.15% : 0.000062s : 1: validate TotalTime = 0.0580422, [33] [bootstrap]: 0.00024583 [type_inference]: 0.0351718 [event_method]: 0.0002104 [auto_monad]: 0.0001331 [graph_reusing]: 8.74e-06 [pre_auto_parallel]: 3.74002e-06 [py_interpret_to_execute]: 4.193e-05 [rewriter_before_opt_a]: 0.00014342 [expand_dump_flag]: 3.60003e-06 [jit_opt_a]: 0.0125275, [3] [Cycle 1]: 0.0070392, [27] [switch_simplify]: 0.00017269 [loop_unroll]: 6.364e-05 [a_1]: 0.0012948 [with_stream_mark]: 2.027e-05 [recompute_prepare]: 2.055e-05 [updatestate_depend_eliminate]: 7.82e-06 [updatestate_assign_eliminate]: 6.62002e-06 [updatestate_loads_eliminate]: 6.23e-06 [parameter_eliminate]: 1.94999e-06 [specialize_transform]: 1.572e-05 [updatestate_useless_node_eliminater]: 1.484e-05 [accelerated_algorithm]: 1.461e-05 [meta_shard_fg_expand]: 3.57002e-06 [get_grad_eliminate_]: 1.421e-05 [merge_forward]: 7.71001e-06 [cell_reuse_recompute_pass]: 9.20001e-07 [cell_reuse_handle_not_recompute_node_pass]: 2.647e-05 [j_node_and_user_rematch]: 2.43e-05 [meta_fg_expand]: 0.00156442 [replace_old_param]: 6.289e-05 [inline_without_move]: 5.879e-05 [renormalize]: 0.00299432 [add_forward_monad_depend]: 8.82999e-06 [auto_monad_grad]: 5.26998e-06 [auto_monad_eliminator]: 7.224e-05 [cse]: 0.00027483 [replace_applicator]: 7.298e-05 [Cycle 2]: 0.00222478, [27] [switch_simplify]: 4.519e-05 [loop_unroll]: 4.209e-05 [a_1]: 0.00115265 [with_stream_mark]: 1.159e-05 [recompute_prepare]: 8.91002e-06 [updatestate_depend_eliminate]: 3.55003e-06 [updatestate_assign_eliminate]: 2.83998e-06 [updatestate_loads_eliminate]: 2.59001e-06 [parameter_eliminate]: 1.15001e-06 [specialize_transform]: 7.01001e-06 [updatestate_useless_node_eliminater]: 6.94001e-06 [accelerated_algorithm]: 6.81001e-06 [meta_shard_fg_expand]: 1.64e-06 [get_grad_eliminate_]: 6.22001e-06 [merge_forward]: 2.99999e-06 [cell_reuse_recompute_pass]: 1.09e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.206e-05 [j_node_and_user_rematch]: 9.25001e-06 [meta_fg_expand]: 0.00012322 [replace_old_param]: 1.382e-05 [inline_without_move]: 7.29001e-06 [renormalize]: 0.00055298 [add_forward_monad_depend]: 4.25999e-06 [auto_monad_grad]: 1.19e-06 [auto_monad_eliminator]: 1.035e-05 [cse]: 2.135e-05 [replace_applicator]: 1.337e-05 [Cycle 3]: 0.00037793, [27] [switch_simplify]: 7.16001e-06 [loop_unroll]: 6.50997e-06 [a_1]: 0.00012416 [with_stream_mark]: 8.55001e-06 [recompute_prepare]: 6.36e-06 [updatestate_depend_eliminate]: 3.26001e-06 [updatestate_assign_eliminate]: 2.81e-06 [updatestate_loads_eliminate]: 2.49999e-06 [parameter_eliminate]: 1.02e-06 [specialize_transform]: 6.34001e-06 [updatestate_useless_node_eliminater]: 6.33e-06 [accelerated_algorithm]: 6.68e-06 [meta_shard_fg_expand]: 1.49e-06 [get_grad_eliminate_]: 6.12001e-06 [merge_forward]: 3.01001e-06 [cell_reuse_recompute_pass]: 1.35001e-06 [cell_reuse_handle_not_recompute_node_pass]: 1.4e-05 [j_node_and_user_rematch]: 9.20999e-06 [meta_fg_expand]: 2.14e-06 [replace_old_param]: 9.41e-06 [inline_without_move]: 5.92999e-06 [renormalize]: 7.00238e-08 [add_forward_monad_depend]: 1.22e-06 [auto_monad_grad]: 6.79982e-07 [auto_monad_eliminator]: 5.89e-06 [cse]: 1.548e-05 [replace_applicator]: 6.56e-06 [py_interpret_to_execute_after_opt_a]: 8.85001e-06 [rewriter_after_opt_a]: 2.928e-05 [convert_after_rewriter]: 6.64999e-06 [order_py_execute_after_rewriter]: 5.09e-06 [mutable_eliminate]: 0.00046396 [jit_opt_b]: 5.603e-05, [1] [Cycle 1]: 5.019e-05, [2] [frontend_op_eliminate]: 1.95e-05 [inline_after_opt_a]: 1.934e-05 [cconv]: 1.667e-05 [loop_unroll]: 0.00043631 [jit_opt_after_cconv]: 0.00016854, [1] [Cycle 1]: 0.00016233, [11] [c_1]: 2.76e-05 [parameter_eliminate]: 2.38998e-06 [updatestate_depend_eliminate]: 6.06e-06 [updatestate_assign_eliminate]: 2.78e-06 [updatestate_loads_eliminate]: 2.63003e-06 [cse]: 2.433e-05 [call_graph_tuple_transform]: 2.066e-05 [tuple_list_get_item_eliminator]: 6.51e-06 [none_parameter_eliminate]: 1.25001e-06 [renormalize]: 4.40021e-07 [switch_simplify]: 7.26001e-06 [remove_dup_value]: 1.291e-05 [partial_unused_args_eliminate]: 1.45001e-06 [environ_conv]: 5.03002e-06 [add_recomputation]: 3.162e-05 [cse_after_recomputation]: 2.683e-05, [1] [Cycle 1]: 2.167e-05, [1] [cse]: 1.59e-05 [auto_monad_reorder]: 1.154e-05 [get_jit_bprop_graph]: 1.29e-06 [rewriter_after_jit_bprop_graph]: 4.50001e-06 [opt_after_jit_grad]: 0.00046651 [symbol_engine_optimizer]: 8.207e-05, [1] [Cycle 1]: 7.64e-05, [6] [build]: 2.53e-06 [elim_shapecalc]: 9.81e-06 [elim_not_effective]: 1.444e-05 [opt_reshape]: 7.90998e-06 [fold_const_symbol]: 1.093e-05 [renormalize]: 5.59987e-07 [validate]: 2.935e-05 [backend_pass]: 8.89995e-07 [task_emit]: 0.00747251 [execute]: 4.52e-06 Sums bootstrap : 0.000246s : 0.45% type_inference : 0.035172s : 64.72% event_method : 0.000210s : 0.39% auto_monad : 0.000133s : 0.24% graph_reusing : 0.000009s : 0.02% pre_auto_parallel : 0.000004s : 0.01% py_interpret_to_execute : 0.000042s : 0.08% rewriter_before_opt_a : 0.000143s : 0.26% expand_dump_flag : 0.000004s : 0.01% jit_opt_a.switch_simplify : 0.000225s : 0.41% jit_opt_a.loop_unroll : 0.000112s : 0.21% jit_opt_a.a_1 : 0.002572s : 4.73% jit_opt_a.with_stream_mark : 0.000040s : 0.07% jit_opt_a.recompute_prepare : 0.000036s : 0.07% jit_opt_a.updatestate_depend_eliminate : 0.000015s : 0.03% jit_opt_a.updatestate_assign_eliminate : 0.000012s : 0.02% jit_opt_a.updatestate_loads_eliminate : 0.000011s : 0.02% jit_opt_a.parameter_eliminate : 0.000004s : 0.01% jit_opt_a.specialize_transform : 0.000029s : 0.05% jit_opt_a.updatestate_useless_node_eliminater : 0.000028s : 0.05% jit_opt_a.accelerated_algorithm : 0.000028s : 0.05% jit_opt_a.meta_shard_fg_expand : 0.000007s : 0.01% jit_opt_a.get_grad_eliminate_ : 0.000027s : 0.05% jit_opt_a.merge_forward : 0.000014s : 0.03% jit_opt_a.cell_reuse_recompute_pass : 0.000003s : 0.01% jit_opt_a.cell_reuse_handle_not_recompute_node_pass : 0.000053s : 0.10% jit_opt_a.j_node_and_user_rematch : 0.000043s : 0.08% jit_opt_a.meta_fg_expand : 0.001690s : 3.11% jit_opt_a.replace_old_param : 0.000086s : 0.16% jit_opt_a.inline_without_move : 0.000072s : 0.13% jit_opt_a.renormalize : 0.003547s : 6.53% jit_opt_a.add_forward_monad_depend : 0.000014s : 0.03% jit_opt_a.auto_monad_grad : 0.000007s : 0.01% jit_opt_a.auto_monad_eliminator : 0.000088s : 0.16% jit_opt_a.cse : 0.000312s : 0.57% jit_opt_a.replace_applicator : 0.000093s : 0.17% py_interpret_to_execute_after_opt_a : 0.000009s : 0.02% rewriter_after_opt_a : 0.000029s : 0.05% convert_after_rewriter : 0.000007s : 0.01% order_py_execute_after_rewriter : 0.000005s : 0.01% mutable_eliminate : 0.000464s : 0.85% jit_opt_b.frontend_op_eliminate : 0.000020s : 0.04% jit_opt_b.inline_after_opt_a : 0.000019s : 0.04% cconv : 0.000017s : 0.03% loop_unroll : 0.000436s : 0.80% jit_opt_after_cconv.c_1 : 0.000028s : 0.05% jit_opt_after_cconv.parameter_eliminate : 0.000002s : 0.00% jit_opt_after_cconv.updatestate_depend_eliminate : 0.000006s : 0.01% jit_opt_after_cconv.updatestate_assign_eliminate : 0.000003s : 0.01% jit_opt_after_cconv.updatestate_loads_eliminate : 0.000003s : 0.00% jit_opt_after_cconv.cse : 0.000024s : 0.04% jit_opt_after_cconv.call_graph_tuple_transform : 0.000021s : 0.04% jit_opt_after_cconv.tuple_list_get_item_eliminator : 0.000007s : 0.01% jit_opt_after_cconv.none_parameter_eliminate : 0.000001s : 0.00% jit_opt_after_cconv.renormalize : 0.000000s : 0.00% jit_opt_after_cconv.switch_simplify : 0.000007s : 0.01% remove_dup_value : 0.000013s : 0.02% partial_unused_args_eliminate : 0.000001s : 0.00% environ_conv : 0.000005s : 0.01% add_recomputation : 0.000032s : 0.06% cse_after_recomputation.cse : 0.000016s : 0.03% auto_monad_reorder : 0.000012s : 0.02% get_jit_bprop_graph : 0.000001s : 0.00% rewriter_after_jit_bprop_graph : 0.000005s : 0.01% opt_after_jit_grad : 0.000467s : 0.86% symbol_engine_optimizer.build : 0.000003s : 0.00% symbol_engine_optimizer.elim_shapecalc : 0.000010s : 0.02% symbol_engine_optimizer.elim_not_effective : 0.000014s : 0.03% symbol_engine_optimizer.opt_reshape : 0.000008s : 0.01% symbol_engine_optimizer.fold_const_symbol : 0.000011s : 0.02% symbol_engine_optimizer.renormalize : 0.000001s : 0.00% validate : 0.000029s : 0.05% backend_pass : 0.000001s : 0.00% task_emit : 0.007473s : 13.75% execute : 0.000005s : 0.01% Time group info: ------[substitution.] 0.000588 128 0.27% : 0.000002s : 2: substitution.elim_not_effective 0.20% : 0.000001s : 2: substitution.fold_const_symbol 0.71% : 0.000004s : 4: substitution.graph_param_transform 70.62% : 0.000415s : 21: substitution.inline 2.75% : 0.000016s : 2: substitution.inline_without_move 1.25% : 0.000007s : 12: substitution.j_node_and_user_rematch 1.67% : 0.000010s : 7: substitution.minmaximum_grad 1.61% : 0.000010s : 11: substitution.partial_eliminate 1.61% : 0.000009s : 12: substitution.remove_not_recompute_node 3.88% : 0.000023s : 9: substitution.replace_applicator 1.69% : 0.000010s : 14: substitution.replace_old_param 0.47% : 0.000003s : 1: substitution.set_cell_output_no_recompute 2.04% : 0.000012s : 5: substitution.switch_simplify 3.35% : 0.000020s : 7: substitution.tuple_list_convert_item_index_to_positive 2.39% : 0.000014s : 7: substitution.tuple_list_get_item_depend_reorder 5.48% : 0.000032s : 12: substitution.tuple_list_get_item_eliminator ------[type_inference.] 0.035082 2 93.16% : 0.032681s : 1: type_inference.infer 6.84% : 0.002401s : 1: type_inference.specialize ------[replace.] 0.000244 31 58.28% : 0.000142s : 21: replace.inline 21.76% : 0.000053s : 5: replace.switch_simplify 19.97% : 0.000049s : 5: replace.tuple_list_get_item_eliminator ------[match.] 0.000423 31 95.41% : 0.000404s : 21: match.inline 2.17% : 0.000009s : 5: match.switch_simplify 2.42% : 0.000010s : 5: match.tuple_list_get_item_eliminator ------[predicate.] 0.000446 3262 1.53% : 0.000007s : 56: predicate.accumulaten_eliminater 0.36% : 0.000002s : 4: predicate.ad_related_special_op_eliminate 1.43% : 0.000006s : 56: predicate.addn_check_dump 1.60% : 0.000007s : 56: predicate.addn_zero_filter 2.12% : 0.000009s : 56: predicate.arithmetic_simplify 1.63% : 0.000007s : 56: predicate.cast_eliminate 0.14% : 0.000001s : 4: predicate.check_bprop_eliminate 1.53% : 0.000007s : 56: predicate.compare_switch_simplify 1.49% : 0.000007s : 56: predicate.depend_value_elim 1.49% : 0.000007s : 56: predicate.dict_get_item_const_eliminator 1.58% : 0.000007s : 56: predicate.dict_get_item_eliminator 1.49% : 0.000007s : 56: predicate.dict_set_item_eliminator 0.22% : 0.000001s : 4: predicate.dumpgradient_eliminate 0.14% : 0.000001s : 4: predicate.elim_not_effective 0.15% : 0.000001s : 4: predicate.elim_shapecalc_of_broadcastargs 1.44% : 0.000006s : 56: predicate.environ_add_const_eliminate 1.42% : 0.000006s : 56: predicate.environ_get_add_eliminate 1.49% : 0.000007s : 56: predicate.environ_get_depend_swap 1.55% : 0.000007s : 56: predicate.environ_get_eliminate 1.48% : 0.000007s : 56: predicate.environ_get_set_eliminate 0.08% : 0.000000s : 4: predicate.fold_const_symbol 0.73% : 0.000003s : 21: predicate.get_grad_eliminate 0.10% : 0.000000s : 4: predicate.graph_param_transform 4.30% : 0.000019s : 90: predicate.inline 1.75% : 0.000008s : 46: predicate.inline_without_move 0.35% : 0.000002s : 21: predicate.j_node_and_user_rematch 0.82% : 0.000004s : 21: predicate.less_batch_normalization 1.82% : 0.000008s : 61: predicate.list_to_tuple_eliminator_ 1.85% : 0.000008s : 65: predicate.load_eliminater 0.35% : 0.000002s : 4: predicate.loop_unroll_after_grad 3.85% : 0.000017s : 120: predicate.loop_unroll_before_grad 1.74% : 0.000008s : 60: predicate.make_slice_get_slice_eliminator 1.43% : 0.000006s : 56: predicate.merge_addn 1.47% : 0.000007s : 56: predicate.minmaximum_grad 0.41% : 0.000002s : 4: predicate.mutable_eliminate 0.17% : 0.000001s : 4: predicate.opt_reshape 2.27% : 0.000010s : 65: predicate.partial_eliminate 1.53% : 0.000007s : 56: predicate.print_const_string_wrapper 1.93% : 0.000009s : 56: predicate.reduce_eliminate 1.74% : 0.000008s : 61: predicate.redundant_stop_gradient_eliminater 0.40% : 0.000002s : 21: predicate.remove_not_recompute_node 2.47% : 0.000011s : 113: predicate.replace_applicator 0.90% : 0.000004s : 46: predicate.replace_old_param 0.12% : 0.000001s : 4: predicate.reset_defer_inline 1.59% : 0.000007s : 56: predicate.reshape_eliminate 1.58% : 0.000007s : 56: predicate.row_tensor_add_zeros_like 0.21% : 0.000001s : 4: predicate.row_tensor_eliminate 1.62% : 0.000007s : 56: predicate.same_eliminate 0.43% : 0.000002s : 21: predicate.set_cell_output_no_recompute 0.30% : 0.000001s : 8: predicate.special_op_eliminate 0.78% : 0.000003s : 21: predicate.specialize_transform 1.78% : 0.000008s : 56: predicate.split_environ_get_set_with_tuple_value 1.60% : 0.000007s : 56: predicate.stack_unstack_eliminate 0.13% : 0.000001s : 4: predicate.switch_call_monad_eliminater 3.07% : 0.000014s : 82: predicate.switch_defer_inline 2.68% : 0.000012s : 82: predicate.switch_layer_defer_inline 7.18% : 0.000032s : 216: predicate.switch_simplify 1.63% : 0.000007s : 56: predicate.tile_eliminate 1.48% : 0.000007s : 56: predicate.transpose_eliminate 1.96% : 0.000009s : 56: predicate.tuple_list_convert_item_index_to_positive 1.76% : 0.000008s : 56: predicate.tuple_list_get_item_depend_reorder 2.88% : 0.000013s : 69: predicate.tuple_list_get_item_eliminator 1.91% : 0.000009s : 56: predicate.tuple_list_set_item_eliminator 1.82% : 0.000008s : 61: predicate.tuple_to_list_eliminator_ 1.79% : 0.000008s : 65: predicate.updatestate_pure_node_eliminater 2.79% : 0.000012s : 86: predicate.updatestate_useless_node_eliminater 1.87% : 0.000008s : 56: predicate.value_based_eliminate 0.12% : 0.000001s : 4: predicate.virtual_view_grad_eliminate 0.20% : 0.000001s : 4: predicate.zero_like_fill_zero ------[func_graph_cloner_run.] 0.002473 41 64.38% : 0.001592s : 16: func_graph_cloner_run.FuncGraphClonerGraph 35.62% : 0.000881s : 25: func_graph_cloner_run.FuncGraphSpecializer ------[meta_graph.] 0.000000 0 ------[manager.] 0.000000 0 ------[pynative] 0.000000 0 ------[others.] 0.065057 91 0.05% : 0.000035s : 1: add_recomputation 0.21% : 0.000139s : 1: auto_monad 0.02% : 0.000014s : 1: auto_monad_reorder 0.01% : 0.000004s : 1: backend_pass 0.40% : 0.000257s : 1: bootstrap 0.03% : 0.000019s : 1: cconv 0.01% : 0.000009s : 1: convert_after_rewriter 0.04% : 0.000029s : 1: cse_after_recomputation 0.01% : 0.000007s : 1: environ_conv 0.33% : 0.000217s : 1: event_method 0.01% : 0.000008s : 1: execute 0.01% : 0.000006s : 1: expand_dump_flag 0.01% : 0.000003s : 1: get_jit_bprop_graph 0.02% : 0.000012s : 1: graph_reusing 19.26% : 0.012530s : 1: jit_opt_a 0.26% : 0.000172s : 1: jit_opt_after_cconv 0.09% : 0.000059s : 1: jit_opt_b 0.68% : 0.000444s : 1: loop_unroll 0.72% : 0.000471s : 1: mutable_eliminate 5.17% : 0.003360s : 39: opt.transform.jit_opt_a 0.09% : 0.000059s : 4: opt.transform.jit_opt_after_cconv 0.05% : 0.000032s : 4: opt.transform.jit_opt_b 0.02% : 0.000014s : 1: opt.transform.loop_unroll_optimizer 0.02% : 0.000015s : 1: opt.transform.mutable_eliminate 0.04% : 0.000026s : 1: opt.transform.opt_after_jit_grad 0.06% : 0.000040s : 4: opt.transform.symbol_engine_opt 0.73% : 0.000474s : 1: opt_after_jit_grad 0.01% : 0.000007s : 1: order_py_execute_after_rewriter 0.01% : 0.000003s : 1: partial_unused_args_eliminate 0.01% : 0.000006s : 1: pre_auto_parallel 0.07% : 0.000046s : 1: py_interpret_to_execute 0.02% : 0.000011s : 1: py_interpret_to_execute_after_opt_a 0.02% : 0.000015s : 1: remove_dup_value 2.88% : 0.001873s : 2: renormalize.infer 2.55% : 0.001661s : 2: renormalize.specialize 0.01% : 0.000006s : 1: rewriter_after_jit_bprop_graph 0.05% : 0.000032s : 1: rewriter_after_opt_a 0.23% : 0.000147s : 1: rewriter_before_opt_a 0.13% : 0.000085s : 1: symbol_engine_optimizer 11.50% : 0.007481s : 1: task_emit 54.08% : 0.035182s : 1: type_inference 0.07% : 0.000046s : 1: validate group_cases_24 have all been run, results of sub cases are below: case: ('KBK',) {} pass. . [hook] pytest_runtest_teardown:test_ops_group_case_ascend910b_level0 tests/st/ops/allcases_onecard/test_ops_group_cases.py::test_ops_group_case_ascend910b_level0,max_mem:2.0M =============================== warnings summary =============================== ../../../../../../../../../usr/local/Ascend/cann-8.5.0/python/site-packages/tbe/dsl/classifier/transdata/transdata_classifier.py:222 /usr/local/Ascend/cann-8.5.0/python/site-packages/tbe/dsl/classifier/transdata/transdata_classifier.py:222: DeprecationWarning: invalid escape sequence \B """ ../../../../../../../../../usr/local/Ascend/cann-8.5.0/python/site-packages/tbe/dsl/unify_schedule/vector/transdata/common/graph/transdata_graph_info.py:143 /usr/local/Ascend/cann-8.5.0/python/site-packages/tbe/dsl/unify_schedule/vector/transdata/common/graph/transdata_graph_info.py:143: DeprecationWarning: invalid escape sequence \c """ ../../../../../../../../../usr/local/Ascend/cann-8.5.0/python/site-packages/tbe/dsl/unify_schedule/vector/transdata/common/graph/transdata_graph_info.py:170 /usr/local/Ascend/cann-8.5.0/python/site-packages/tbe/dsl/unify_schedule/vector/transdata/common/graph/transdata_graph_info.py:170: DeprecationWarning: invalid escape sequence \c """ ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:549: UserWarning: The value of the smallest subnormal for type is zero. setattr(self, word, getattr(machar, word).flat[0]) ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for type is zero. return self._float_to_str(self.smallest_subnormal) ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2.py:57 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2.py:57: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("batchnorm_fold2") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad.py:56 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad.py:56: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("batchnorm_fold2_grad") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad_reduce.py:48 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/batchnorm_fold2_grad_reduce.py:48: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("batchnorm_fold2_grad_reduce") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/correction_mul.py:51 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/correction_mul.py:51: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("correction_mul") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/correction_mul_grad.py:51 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/correction_mul_grad.py:51: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("correction_mul_grad") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/correction_mul_grad.py:143 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/correction_mul_grad.py:143: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("correction_mul_grad_reduce") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer.py:50 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer.py:50: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_learned_scale_quant_perlayer") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad.py:92 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad.py:92: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_learned_scale_quant_perlayer_grad_d") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad_reduce.py:49 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perlayer_grad_reduce.py:49: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_learned_scale_quant_perlayer_grad_d_reduce") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel.py:50 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel.py:50: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_learned_scale_quant_perchannel") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad.py:91 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad.py:91: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_learned_scale_quant_perchannel_grad_d") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad_reduce.py:48 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_learned_scale_quant_perchannel_grad_reduce.py:48: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_learned_scale_quant_perchannel_grad_d_reduce") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel.py:52 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel.py:52: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_quant_perchannel") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel_grad.py:81 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perchannel_grad.py:81: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_quant_perchannel_grad") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer.py:54 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer.py:54: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_quant_per_layer") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer_grad.py:81 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/fake_quant_perlayer_grad.py:81: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("fake_quant_per_layer_grad") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/minmax_update_perchannel.py:50 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/minmax_update_perchannel.py:50: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("minmax_update_perchannel") ../../../../../../../anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/minmax_update_perlayer.py:50 /home/jenkins/anaconda3/envs/ci39/lib/python3.9/site-packages/mindspore/ops/_op_impl/_custom_op/minmax_update_perlayer.py:50: DeprecationWarning: te_fusion.fusion_manager.fusion_manager.register is deprecated,please replace it with tbe.common.register.register_op_compute @fusion_manager.register("minmax_update_perlayer") -- Docs: https://docs.pytest.org/en/stable/warnings.html ================= 1 passed, 25 warnings in 1114.91s (0:18:34) ==================