#!/usr/bin/env python
# coding=utf-8
"""
Function:
Collection class. This file mainly involves the collect function.
Copyright Information:
Huawei Technologies Co., Ltd. All Rights Reserved © 2020
"""
import os

from ms_interface import utils
from ms_interface.constant import Constant


class Collection:
    """Collect the files needed to analyse an AI Core error.

    Files are searched for under ``report_path`` and copied into
    sub-directories of ``<output_path>/collection`` (``plog``, ``compile``,
    ``graph`` and ``dump``).

    Attributes:
        report_path: Real path of the directory that is searched.
        output_path: Real path of the directory that receives the copies.
        collect_level: 0 by default; set to 1 by ``collect_plog_file`` when a
            ``[AIC_INFO] dev_func:`` record is found under ``report_path``.
    """

    # Extracts the file path from a "<path>:<line>:<text>" grep hit.
    # The extension is a real alternation: the former "[log|txt]" was a
    # character class that accepted any single one of the characters l,o,g,|,t,x.
    _LOG_PATH_REGEXP = r"(/[_\-/0-9a-zA-Z.]{1,}\.(?:log|txt)):"

    def __init__(self, report_path: str, output_path: str) -> None:
        self.report_path = os.path.realpath(report_path)
        self.output_path = os.path.realpath(output_path)
        self.collect_level = 0

    def check_argument_valid(self) -> None:
        """Validate the report directory and the output directory."""
        utils.check_path_valid(self.report_path, isdir=True)
        utils.check_path_valid(self.output_path, isdir=True, output=True)

    def get_node_and_kernel_name_l1(self) -> tuple:
        """Read the kernel name and node name from ``[AIC_INFO]`` plog records.

        Returns:
            A ``(kernel_name, node_name)`` tuple. The kernel name is the part
            of the first ``dev_func`` match before any ``__``; in the node
            name every ``/`` and ``.`` is replaced by ``_``.

        Raises:
            utils.AicErrException: If either record cannot be found in the
                collected plog directory.
        """
        plog_dir = os.path.join(self.output_path, 'collection', 'plog')

        # Get the kernel name.
        kernel_name_cmd = ['grep', r'\[AIC_INFO\] dev_func:', '-inrE', plog_dir]
        kernel_name_regexp = r"dev_func:([a-zA-Z0-9_]{0,})$"
        kernel_name_ret = utils.get_inquire_result(kernel_name_cmd, kernel_name_regexp)
        if not kernel_name_ret:
            utils.print_error_log("Failed to get \"[AIC_INFO] dev_func:\" in plog. Cannot run L1 test.")
            # Callers unpack the result as a 2-tuple, so returning None here
            # would only surface later as a TypeError. Fail explicitly.
            raise utils.AicErrException(Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
        # split() returns the whole string as its only item when "__" is absent.
        kernel_name = kernel_name_ret[0].split('__')[0]

        # Get the node name.
        node_name_cmd = ['grep', r'\[AIC_INFO\] node_name:', '-inrE', plog_dir]
        node_name_regexp = r".+?node_name:(.*?),"
        node_name_ret = utils.get_inquire_result(node_name_cmd, node_name_regexp)
        if not node_name_ret:
            utils.print_error_log("Failed to get node name in plog. Cannot run L1 test.")
            raise utils.AicErrException(Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
        node_name = node_name_ret[0].replace('/', '_').replace('.', '_')
        return kernel_name, node_name

    def get_kernel_name_l0(self) -> tuple:
        """Read the kernel name from "Aicore kernel execute failed" records.

        The ``fault kernel info ext`` field is used as the kernel name when it
        is present and not ``"none"``; otherwise ``fault kernel_name`` is used.

        Returns:
            A ``(kernel_name, node_name)`` tuple where ``node_name`` is
            ``exception_info.<stream_id>.<task_id>``.

        Raises:
            utils.AicErrException: If no matching record is found.
        """
        plog_dir = os.path.join(self.output_path, 'collection', 'plog')
        kernel_name_cmd = ['grep', 'Aicore kernel execute failed', '-inrE', plog_dir]

        # First try the record layout that carries the extended kernel info.
        ext_regexp = r" stream_id=(\d+),.*?task_id=(\d+),.*?fault kernel_name=(.*?),.*?" \
                     r"fault kernel info ext=(.*?),"
        ext_ret = utils.get_inquire_result(kernel_name_cmd, ext_regexp)
        if ext_ret and ext_ret[0][3] != "none":
            stream_id = ext_ret[0][0]
            task_id = ext_ret[0][1]
            kernel_name = ext_ret[0][3]
            return kernel_name, f"exception_info.{stream_id}.{task_id}"

        # Fall back to the plain kernel name field.
        basic_regexp = r" stream_id=(\d+),.*?task_id=(\d+),.*?fault kernel_name=(.*?),"
        basic_ret = utils.get_inquire_result(kernel_name_cmd, basic_regexp)
        if not basic_ret:
            utils.print_error_log("Failed to get \"Aicore kernel execute failed\" in plog.")
            raise utils.AicErrException(Constant.MS_AICERR_INVALID_SLOG_DATA_ERROR)
        stream_id = basic_ret[0][0]
        task_id = basic_ret[0][1]
        kernel_name = basic_ret[0][2]
        utils.print_info_log(f"AicoreError Found, Stream id: {stream_id}, task_id: {task_id}, "
                             f"kernel_name {kernel_name}")
        return kernel_name, f"exception_info.{stream_id}.{task_id}"

    def _get_node_and_kernel_name(self) -> tuple:
        """Return ``(kernel_name, node_name)`` using the L1 or L0 parser per ``collect_level``."""
        if self.collect_level == 1:
            return self.get_node_and_kernel_name_l1()
        return self.get_kernel_name_l0()

    def _find_log_files(self, pattern: str):
        """Grep ``report_path`` for ``pattern`` and return the result of ``utils.get_inquire_result``.

        NOTE(review): presumably a list of the log/txt file paths that contain
        a hit -- confirm against ``utils.get_inquire_result``.
        """
        find_path_cmd = ['grep', pattern, '-inrE', self.report_path]
        return utils.get_inquire_result(find_path_cmd, self._LOG_PATH_REGEXP)

    @staticmethod
    def _copy_first_to_exception_dump(found_files, already_copied, dest_path: str) -> None:
        """Copy the first (sorted) found file to ``exception_dump`` unless it was already copied."""
        first_file = sorted(found_files)[0]
        if first_file not in already_copied:
            utils.copy_src_to_dest([first_file, ], os.path.join(dest_path, "exception_dump"))

    def collect_plog_file(self) -> str:
        """Copy the plog files related to the AI Core error and set ``collect_level``.

        Files containing the "there is an ... aicore/aivec ... error" record
        go to ``plog/aicore_error``. The first file (in sorted order)
        containing ``[AIC_INFO] dev_func:`` and the first one containing
        ``exception info dump args data`` go to ``plog/exception_dump`` when
        they are not already among the aicore error files.

        Returns:
            The destination directory ``<output_path>/collection/plog``.

        Raises:
            utils.AicErrException: If no aicore/aivec error record is found.
        """
        aicore_error_files = self._find_log_files(
            'there is an .*aicore.* error|there is an .*aivec.* error')
        if not aicore_error_files:
            utils.print_error_log(f"Aicore error log 'there is an' cannot be found in {self.report_path}.")
            raise utils.AicErrException(Constant.MS_AICERR_INVALID_PATH_ERROR)

        dest_path = os.path.join(self.output_path, 'collection', 'plog')
        utils.check_path_valid(dest_path, isdir=True, output=True)
        utils.copy_src_to_dest(aicore_error_files, os.path.join(dest_path, "aicore_error"))

        # The presence of "[AIC_INFO] dev_func:" switches the collection to level 1.
        dev_func_files = self._find_log_files(r"\[AIC_INFO\] dev_func:")
        if dev_func_files:
            self.collect_level = 1
            self._copy_first_to_exception_dump(dev_func_files, aicore_error_files, dest_path)
        else:
            utils.print_info_log(f"'[AIC_INFO] dev_func:' cannot be found in {self.report_path}. "
                                 "Only run L0 parse")
        utils.print_info_log(f"Debug Level is {self.collect_level}")

        dump_args_files = self._find_log_files("exception info dump args data")
        if dump_args_files:
            self._copy_first_to_exception_dump(dump_args_files, aicore_error_files, dest_path)
        return dest_path

    def collect_kernel_file(self, kernel_name):
        """Copy the ``.json``/``.o``/``.cce`` files that mention ``kernel_name``.

        Any ``_mix_aic`` / ``_mix_aiv`` part is removed from the kernel name
        before ``report_path`` is searched.

        Returns:
            The destination directory ``<output_path>/collection/compile``, or
            ``None`` when the search yields no candidate path at all.
        """
        kernel_name = kernel_name.replace("_mix_aic", "").replace("_mix_aiv", "")
        utils.print_info_log(f"kernel_name is {kernel_name}")
        find_path_cmd = ['grep', kernel_name, '-inrE', self.report_path]
        regexp = r"([_\-/0-9a-zA-Z.]{1,}\.json|[_\-/0-9a-zA-Z.]{1,}\.o|[_\-/0-9a-zA-Z.]{1,}\.cce)"
        kernel_file_list = utils.get_inquire_result(find_path_cmd, regexp)
        if not kernel_file_list:
            utils.print_error_log(f"The {kernel_name} file path cannot be found in {self.report_path}.")
            return None

        # Keep only the candidates that actually exist on disk.
        original_files = [kernel_file for kernel_file in kernel_file_list if os.path.exists(kernel_file)]
        if not original_files:
            utils.print_error_log(
                f"Kernel file cannot be collected, the kernel file cannot be found in {self.report_path}.")
        dest_path = os.path.join(self.output_path, "collection", "compile")
        utils.check_path_valid(dest_path, isdir=True, output=True)
        utils.copy_src_to_dest(original_files, dest_path)
        return dest_path

    def collect_ge_graph(self) -> str:
        """Copy the ``ge_proto_*_Build.txt`` graph files found under ``report_path``.

        A missing graph file only produces a warning; the copy step is still
        invoked with whatever the search returned.

        Returns:
            The destination directory ``<output_path>/collection/graph``.
        """
        find_path_cmd = ['find', self.report_path, '-name', "ge_proto_*_Build.txt"]
        regexp = r"([_\-/0-9a-zA-Z.]{1,}_Build.txt)"
        graph_file_list = utils.get_inquire_result(find_path_cmd, regexp)
        if not graph_file_list:
            utils.print_warn_log(
                f"Graph file cannot be collected, the graph file cannot be found in {self.report_path}.")
        dest_path = os.path.join(self.output_path, "collection", "graph")
        utils.check_path_valid(dest_path, isdir=True, output=True)
        utils.copy_src_to_dest(graph_file_list, dest_path)
        return dest_path

    def collect_data_dump(self, node_name) -> str:
        """Copy one dump file whose name matches ``*<node_name>.*``.

        When several candidates exist, each base name is grepped for in the
        collected plog directory and a candidate found there is preferred;
        without any hit the first candidate is used.

        Returns:
            The destination directory ``<output_path>/collection/dump``.

        Raises:
            utils.AicErrException: If no dump file is found.
        """
        dest_path = os.path.join(self.output_path, "collection", "dump")
        find_path_cmd = ['find', self.report_path, '-name', f"*{node_name}.*"]
        regexp = r"[_\.\-/0-9a-zA-Z.]{1,}"
        original_files = utils.get_inquire_result(find_path_cmd, regexp)
        if not original_files:
            utils.print_error_log(
                f"Dump file cannot be collected, the dump file cannot be found in {self.report_path}.")
            raise utils.AicErrException(Constant.MS_AICERR_INVALID_PATH_ERROR)

        # If more than one dump file is found, match them against the plog.
        if len(original_files) > 1:
            plog_dir = os.path.join(self.output_path, 'collection', 'plog')
            candidates = original_files
            for dump_file in candidates:
                data_dump_cmd = ['grep', os.path.basename(dump_file), '-nr', plog_dir]
                data_dump_ret, _ = utils.execute_command(data_dump_cmd)
                if data_dump_ret != 0:
                    continue
                utils.print_info_log(f"Find dump file {os.path.basename(dump_file)}.")
                # NOTE(review): every hit overwrites the previous one, so the
                # LAST candidate found in the plog wins -- confirm this is intended.
                original_files = [dump_file]

        # If nothing was found in the plog, default to the first candidate.
        if len(original_files) > 1:
            original_files = original_files[0:1]
        utils.check_path_valid(dest_path, isdir=True, output=True)
        utils.copy_src_to_dest(original_files, dest_path)
        return dest_path

    def collect(self) -> None:
        """
        collect info

        Runs the whole collection in order: plog files, kernel/node name
        parsing, compile (kernel) files, GE graph files and dump files.
        """
        self.check_argument_valid()
        collect_path = os.path.join(self.output_path, 'collection')
        utils.check_path_valid(collect_path, isdir=True, output=True)
        utils.print_info_log('******************Collection******************')

        # collect plog
        utils.print_info_log(f'Start to collect {Constant.DIR_PLOG} file.')
        plog_dest_path = self.collect_plog_file()
        utils.print_info_log(f'The {Constant.DIR_PLOG} file is saved in {plog_dest_path}.')

        # get kernel_name
        utils.print_info_log('Start to parse ai core error by plog file.')
        kernel_name, node_name = self._get_node_and_kernel_name()
        utils.print_info_log(f'The ai core error occurs in kernel: {kernel_name}, node_name: {node_name}.')

        # collect compile
        utils.print_info_log('Start to collect compile file.')
        kernel_dest_path = self.collect_kernel_file(kernel_name)
        utils.print_info_log(f"The ops file is saved in {kernel_dest_path}.")

        # collect_ge_proto_graph
        proto_dest_path = self.collect_ge_graph()
        utils.print_info_log(f"The graph file is saved in {proto_dest_path}.")

        # collect dump
        utils.print_info_log('Start to collect dump file.')
        dump_dest_path = self.collect_data_dump(node_name)
        utils.print_info_log(f'The dump file is saved in {dump_dest_path}.')