#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (c) Huawei Technologies Co., Ltd. 2022. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
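ascendc_npuchk_report.py: parse AscendC *_npuchk.log files, resolve the
backtrace addresses to source locations and print an error report with
per-error-type statistics.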
"""
import os
import sys
import glob
import subprocess


def get_error_type(info_input):
    """Return the error tag found in a log line, or None if there is none.

    The tag is the text between the first '[Error' marker and the next ']',
    without the brackets (for example 'ErrorRead1' for '[ErrorRead1]').

    Args:
        info_input: a single line of log text.

    Returns:
        The tag string starting with 'Error', or None when the line has no
        '[Error' marker or the marker is never closed by ']'.
    """
    _, marker, remainder = info_input.partition('[Error')
    if not marker:
        return None
    # Re-attach 'Error' (the marker minus its opening bracket) before
    # searching for the closing bracket.
    tag, closing, _ = ('Error' + remainder).partition(']')
    return tag if closing else None


def parse_log(file, stack):
    """Collect error records and their backtraces from one npuchk log file.

    The file is scanned line by line:
      * a line containing an '[Error...]' tag adds two entries to the current
        record: the stripped line and the most recent '### ' line seen so far;
      * a line starting with '### ' is remembered as the current intrinsic;
      * a line containing '# BackTrace #' (not at column 0) opens a backtrace;
      * inside a backtrace, frame lines are reduced to 'binfile:addr', where
        binfile is the text before the first '(' and addr is the text between
        the first '+' and the following ')'. Lines containing '.so' and lines
        without '+' are skipped;
      * the first line that does not start with two spaces closes the
        backtrace and stores the record.

    Records are de-duplicated: the key is the concatenation of the error tags
    and frame addresses, and only the first record for a key is kept.

    Args:
        file: path of the log file to read.
        stack: dict updated in place, mapping the de-duplication key to the
            record list [error line, intrinsic line, 'binfile:addr', ...].
    """
    cce_intri = ''
    bs_start = False
    err_info = []
    key = ''
    with open(file, 'r') as fd:
        for line in fd:
            err_type_line = get_error_type(line)
            if err_type_line is not None:
                err_info.append(line.strip())
                err_info.append(cce_intri)
                key += err_type_line
                continue
            if line.startswith('### '):
                cce_intri = line.strip()
                continue
            if not bs_start and line.find('# BackTrace #') > 0:
                bs_start = True
                continue
            if bs_start and not line.startswith('  '):
                # First non-indented line after the frames: record complete.
                bs_start = False
                if stack.get(key) is None:
                    stack[key] = err_info
                err_info = []
                key = ''
                continue
            if bs_start:
                if line.find('.so') > 0:
                    continue
                line = line.strip()
                binfile = line.split('(')[0]
                if line.find('+') < 0:
                    continue
                addr = line.split('+')[1].split(')')[0]
                err_info.append(binfile + ':' + addr)
                key += addr
    # A backtrace that runs up to the end of the file has no terminating
    # line; store it here so the last record is not lost.
    if bs_start and stack.get(key) is None:
        stack[key] = err_info


def execute_cmd(cmds, timeout=10):
    """Run an external command and return its standard output.

    Anything the command writes to stderr is printed. If the command does
    not finish within the timeout it is killed and its stderr is printed
    after an 'Error:' header.

    Args:
        cmds: argument list handed to subprocess.Popen (no shell involved).
        timeout: seconds to wait for the command to finish; defaults to 10.

    Returns:
        The command's stdout with surrounding whitespace stripped, or '' if
        the command timed out.
    """
    proc = subprocess.Popen(cmds, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, encoding='utf-8')
    try:
        outs, errs = proc.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        # The child is not killed automatically on timeout: kill it and
        # drain the pipes so it is reaped.
        proc.kill()
        outs, errs = proc.communicate()
        print("Error:\n", errs)
        return ''
    if errs:
        print(errs)
    return outs.strip()


def addr_to_line(bin_file, addr):
    """Resolve an address in a binary to '<function> at <file:line>'.

    Runs 'addr2line -f -e <bin_file> <addr>', whose first output line is the
    function name and second line the source location, then passes the
    function name through 'c++filt' for demangling.

    Args:
        bin_file: path of the binary passed to addr2line.
        addr: address string passed to addr2line.

    Returns:
        '<function> at <location>'; either part is empty when the tools
        produced no corresponding output.
    """
    resolved = execute_cmd(['addr2line', '-f', '-e', bin_file, addr]).split('\n')
    # str.split always yields at least one element, so resolved[0] is safe.
    func_name = execute_cmd(['c++filt', resolved[0]])
    location = resolved[1] if len(resolved) > 1 else ''
    return '{} at {}'.format(func_name, location)


if __name__ == "__main__":
    # Usage 1: parse a single log file
    #     python3 ascendc_npuchk_report.py xxxx/xxx_npuchk.log
    # Usage 2: no arguments; search recursively below the current directory
    #          for *_npuchk.log files and parse them
    #     python3 ascendc_npuchk_report.py
    # Usage 3: give a log directory and a CPU binary directory; search
    #          recursively below the log directory for *_npuchk.log files
    #          and parse them
    #     python3 ascendc_npuchk_report.py log_path bin_path

    # Human-readable rule text for each error tag; printed in the report.
    err_details = {
        'ErrorRead1': '非法内存读取数据: 整段内存未经过AscendC框架的alloc_buf申请或者已free',
        'ErrorRead2': '[可疑问题]读取无效数据：读取的内存部分/全部从未被写过，读取的数据可能是无效数据',
        'ErrorRead3': '读取越界, 长度超出经AscendC框架的alloc_buf申请实际有效的数据(开始/结尾)',
        'ErrorRead4': '读取地址非32字节对齐',
        'ErrorWrite1': '非法内存写入数据: 未经过AscendC框架的alloc_buf申请过或者已经free了',
        'ErrorWrite2': '写入越界, 长度超出经AscendC框架的alloc_buf申请实际有效的数据(开始/结尾)',
        'ErrorWrite3': '[可疑问题]重复写入，前一次写入的内存没有被读取走，重复写入',
        'ErrorWrite4': '写入地址非32字节对齐',
        'ErrorSync1': '写入存在同步问题, pipe内缺少pipe barrier/pipe间缺少set/wait',
        'ErrorSync2': '读取存在同步问题, pipe内缺少pipe barrier/pipe间缺少set/wait',
        'ErrorSync3': 'set/wait使用不配对, 缺少set或者wait',
        'ErrorSync4': '出现set/wait的eventID重复, 比如mte2: set0/set0, vector: wait0/wait0',
        'ErrorLeak': '内存泄露，存在申请内存未释放问题，详细见*_npuchk.log日志分析',
        'ErrorFree': '内存重复释放，调用free_buf释放过，再次调用free_buf',
        'ErrorBuffer1': 'tensor的que类型与初始化时不一致',
        'ErrorBuffer2': 'VECIN/VECOUT/VECCALC的操作不合规',
        'ErrorBuffer0': 'tensor内存未使用Ascendc框架的bufInit',
        'ErrorBuffer3': 'tensor的操作内存不合法, 可能原因: 内存未alloc/内存越界',
        'ErrorManager': '操作未初始化的内存'
    }
    stats = {}
    cpu_bin_path = None
    if len(sys.argv) == 2:
        all_npuchk_files = [sys.argv[1]]
    elif len(sys.argv) > 2:
        # recursive=True only has an effect when the pattern contains '**';
        # without it, logs in sub-directories of log_path were never found.
        all_npuchk_files = glob.glob(
            os.path.join(sys.argv[1], '**', '*_npuchk.log'), recursive=True)
        cpu_bin_path = os.path.realpath(sys.argv[2])
    else:
        all_npuchk_files = glob.glob('**/*_npuchk.log', recursive=True)

    err_stack = {}
    for file_var in all_npuchk_files:
        parse_log(file_var, err_stack)

    for cur_err_info in err_stack.values():
        # A reportable record holds the error line, the intrinsic line and
        # at least one backtrace frame; anything shorter is skipped.
        if len(cur_err_info) <= 2:
            continue
        err_type = get_error_type(cur_err_info[0])
        stack_info = [
            cur_err_info[0],
            'Rule: ' + str(err_details.get(err_type)),
            cur_err_info[1],
        ]
        for frame in cur_err_info[2:]:
            info = frame.split(':')
            if len(info) < 2:
                stack_info.append('  ' + info[0])
                continue
            # Binaries whose name contains 'flash_attention_score_cpu' are
            # looked up inside the user-supplied binary directory.
            if "flash_attention_score_cpu" in info[0] and cpu_bin_path:
                info[0] = os.path.join(cpu_bin_path, info[0])
            stack_info.append('  ' + addr_to_line(info[0], info[1]))
        stack_info.append('')
        report = '\n'.join(stack_info)
        # Records whose report mentions 'PostMessage' are neither printed
        # nor counted.
        if report.find('PostMessage') > 0:
            continue
        stats[err_type] = stats.get(err_type, 0) + 1
        print(report)

    print('---------------------- ERROR STATISTICS ----------------------')
    for err_key, count in stats.items():
        print('{}, {}, {}'.format(count, err_key, err_details.get(err_key)))