#include #include #include #include #include #include #include "replay_def.h" #include "code_gen.h" #include "replay_fun.h" #include "register/op_check.h" #define __ASCENDC_REPLAY_CODE__ using namespace std; using namespace optiling; using namespace AscendCReplay; extern "C" void __KERNEL_FUN__ (__ARGS_DEF__, const char *); extern "C" int elf_append(char *elf, uint32_t elfSize, char *jit, int kernum, int blknum[], char *atext[], int alen[], int atlen, const char* kernelname[]); #define KERNEL_N 1 #define ARG_N (__ARG_NUM__) #define MAX_L (1024 * 1024 * 100) #define MAX_E (1024 * 1024) int __KERNEL_FUN___replay___OPS_PRODUCT__(ReplayFuncParam& param, const int core_type) { // gen type 1 : direct call codes 0: load .o file if (param.gentype < 0 || param.gentype > 1) { printf("Error: call replay gen type is %d, should only be 1 or 0\n", param.gentype); return 0; } else if (param.gentype == 1 && param.objptr == nullptr) { printf("Error: call replay with direct call mode, but code obj addr is null\n"); return 0; } else if (param.gentype == 0 && param.output_kernel_file == nullptr) { printf("Error: call replay with object file mode, but object file path is null\n"); return 0; } // core_type 0:MIX 1:CUBE 2:VEC if (core_type < 0 || core_type > 2) { printf("Error: call replay core type is %d !\n", core_type); return 0; } g_coreType = __CORE_TYPE__; g_taskRation = param.task_ration; g_tilingKey = param.tiling_key; unsigned char *buf, *jit; char *kernel[KERNEL_N * 32]; int len[KERNEL_N * 32]; int blknum[KERNEL_N]; int max; block_num = param.block_dim; g_ubBase = block_num; uint8_t *code = (uint8_t *)malloc(MAX_L); uint8_t *pos = code; struct timespec tp1, tp2; clock_gettime(CLOCK_MONOTONIC, &tp1); if (block_num > 32) { printf("Error: block_num > 32\n"); return 0; } //__OP_FOPEN__ for (int i = 0; i < KERNEL_N; i++) { for (int j = 0; j < ARG_N; j++) AddArg(j, ARG_STEP * (j + 1)); for (block_idx = 0; block_idx < block_num; block_idx++) { //__OP_SET_KERNEL__ int code_idx = i * block_num + block_idx; #ifdef FP_CEILING SetCtrlFloatEnable(); #else SetCtrlFloatDisable(); #endif CodeInit(pos, false); __KERNEL_FUN__(__KERNEL_ARGS__, param.tiling_data); CodeEnd(); kernel[code_idx] = (char *)pos; len[code_idx] = CodeLen(); pos += len[code_idx]; printf("kernel %d core %ld code generated len %d\n", i, block_idx, len[code_idx]); } blknum[i] = block_num; } //__OP_FCLOSE__ clock_gettime(CLOCK_MONOTONIC, &tp2); buf = (unsigned char *)malloc(MAX_E); int fd = open(param.entry_file, O_RDONLY); if (fd < 0) { printf("[error]: cannot find entry.o : %s\n", param.entry_file); return 0; } uint32_t bufSize = read(fd, buf, MAX_E); if (bufSize <= 0) { printf("[error]: entry.o : %s is too small ! \n", param.entry_file); } close(fd); jit = (unsigned char *)malloc(MAX_L); printf("total code generated %ld\n", pos - code); int sz = elf_append((char *)buf, bufSize, (char *)jit, KERNEL_N, blknum, kernel, len, pos - code, ¶m.kernel_name); if (tp1.tv_sec != tp2.tv_sec) { printf("%ld NS\n", tp2.tv_nsec + 1000000000 - tp1.tv_nsec); } else { printf("%ld NS\n", tp2.tv_nsec - tp1.tv_nsec); } printf("new elf size %d\n", sz); if (param.gentype == 0) { fd = open(param.output_kernel_file, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); (void)write(fd, jit, sz); close(fd); free(jit); } else if (param.gentype == 1) { *param.objptr = (char*)jit; } free(buf); free(code); return sz; } REG_REPLAY_FUNC(__OPTYPE__, __OPS_PRODUCT__, __KERNEL_FUN___replay___OPS_PRODUCT__);