# PEM configuration: one key/value pair per line, one blank line between tables.
# NOTE(review): line breaks were reconstructed from a whitespace-collapsed copy
# of this file; each `#` comment is assumed to end where the next `key =`
# begins, and only entries carrying their own `#` are treated as commented out
# (ar_option, mte_rs, store_buf_port, the "4 way" vec_ic_* group). Confirm those
# spots against the authoritative copy. Key names (including misspelled ones)
# are kept exactly as the consumer reads them.
title = "PEM Config"

[Project]
name = "1951lite"
version_name = 0 # 0: mini(1951 lite) | 1: lite(nashville) | 2: tiny | 3: cloud(1982)

[LOG]
disable_list = []
enable_list = ["instr_log", "instr_popped_log", "icache_log", "dcache_log", "ccu_log"]
# Level encoding: trace = 0, debug = 1, info = 2, warn = 3, error = 4, critical = 5, off = 6
file_print_level = 2
screen_print_level = 3
flush_level = 2
rotating_file_size = 134217728 # 0x8000000 # ~130MB
rotating_file_number = 2
separate_vf_log = false
perf_event_log = false

[STAT]
path = "./"
window_size = 10
stride_size = 1
summary_log_enable = 1
aicore_trace_enable = 1
ld_est_enable = 0
# Order: [w_cube, w_vec0, w_vec1, f1, f2, f3, f4, f5, th_up, th_low]
ld_est_param = [0.6, 0.2, 0.0, 0.1, 0.2, 0.3, 0.1, 0.3, 0.4, 0.2]

[ARCH]
cube_core_num = 1
vec_core_num = 1
core_ostd_num = 1 # 2 early end 1 normal mode
vector_core_mode = 1 # 0 1981 1 1911
inorder_acc = 0
wait_flag_dev_en = 0
func_switch = 1
max_sim_time = 30000000
ub_overlap_chk = 1
separate_arch = 0
simt_flag_en = 0
sub_core = ["mixcore0"]
coupling_core = []

[DDR]
addr_begin = 16777216
min_read_latency = 521 # average latency
read_latency_diver = 180
min_write_latency = 449
write_latency_diver = 180
min_dbid_latency = 20
dbid_latency_diver = 20
bandwidth_limit = 20 # 20-26
max_credit_num = 4096

[L2]
addr_begin = 0 # 0x0
size = 16777216 # 16M
bandwidth_limit = 198 # 198-148
read_bandwidth_limit = 155 # 155-148
write_bandwidth_limit = 125 # 125-101
max_credit_num = 4096
min_read_latency = 216
read_latency_diver = 4
min_write_latency = 238
write_latency_diver = 4
min_dbid_latency = 20
dbid_latency_diver = 20

[BIU]
port_num = 2
icache_port_num = 5
dcache_port_num = 5
simt_dcache_port_num = 5
read_queue_size = 184
write_queue_size = 90 # 74
total_queue_size = 274 # 194
high_prior_read_port = 0
high_prior_write_port = 1
icache_port_width = 128
mte_data_size = 512
mte_port_width = 128
mte_read_port_num = 2
mte_write_port_num = 1
bus_port_width = 128
bus_read_port_num = 2
bus_write_port_num = 1
bus_chi_enable = 0
write_data_split_en = 0 # if port_width smaller than bandwidth, it should be 1, otherwise it should be 0
write_buf_depth = 4
read_buf_depth = 4
ifu_read_queue_size = 4
enable_memmap = 1
mem_entry_num = 64
mem_entry_size = 131072 # 0x20000
phy_mem_entry_num = 256
atomic_switch = 1
aw_latency = 4
su_read_latency = 1

[PARAM_BUFFER]
num_slots = 32
num_sreg_per_slot = 64

[RVEC]
pv_mode = "exec_instr"
enable_peekvf_prefetch = false
lsu_dst_cflt_en = false
csw_en = 0
vc_vcb_mode = 1 # 0 v300 1 v310
vag_ad_en = 1 # 0 1911l 1 1982
area_reduce = false

[IFU]
ibuf_size = 32
fetch_size = 16
dispatch_size = 2
# icache
icache_prefetch_uint = 2048 # 2K
ic_addr_width = 48 # [47:0]
aic_asso_num = 4 # 2---->4
aiv_asso_num = 2
aic_ic_size = 32768 # 0x8000 #32k
aiv_ic_size = 16384 # 0x4000 #16k
ic_line_size = 128 # 1024/8
aic_ic_entry_num = 64 # (ic_size/ic_line_size)/ic_asso_num
aic_ic_line_num = 256 # (ic_size/ic_line_size)
aiv_ic_entry_num = 64 # (ic_size/ic_line_size)/ic_asso_num
# NOTE(review): with aiv_ic_size = 16384 and ic_line_size = 128 the formula
# below gives 128, not 256 - confirm which value the model expects.
aiv_ic_line_num = 256 # (ic_size/ic_line_size)
ic_prefetch_en = 1
ic_prefetch_num = 3
ic_merge_en = 0
ic_max_otsd_num = 16 # 16--->8
ic_max_preload_num = 15
aic_ic_idx_addr_lsb = 7 # VA[12:7]for tag idx
aic_ic_idx_addr_mask = 63 # 0x7f-->0x3f
aic_ic_tag_addr_lsb = 13 # VA[47:13]for va tag
aic_ic_tag_addr_mask = 34359738367 # 0x1ffffffff----->0x7ffffffff # 64 - (cacheline size + set(asso) num size)
aiv_ic_idx_addr_lsb = 7 # VA[12:7]for tag idx
aiv_ic_idx_addr_mask = 63 # 0x7f-->0x3f
aiv_ic_tag_addr_lsb = 13 # VA[47:13]for va tag
aiv_ic_tag_addr_mask = 34359738367 # 0x1ffffffff----->0x7ffffffff # 64 - (cacheline size + set(asso) num size)
# vec ifu
ifu_new_arch = false # false: 1951 lite/nsv true:v310 1982
vf_queue_depth = 32
peek_queue_depth = 10
instr_buffer_size = 32
prefetch_trigger_times = 64
peekvf_prefetch_trigger_times = 64
num_sreg_to_copy = 32
para_write_size = 32
icache_fetch_size = 8
max_instr_to_idu = 6
icache_line_size = 128
vf_early_fetch = true
# vec icache
vec_ic_size = 8192
vec_ic_biu_bw = 128 # 1911 128 / Cpro 32
vec_ic_line_size = 128
vec_ic_way_num = 2
vec_ic_entry_num = 32 # same with set_num : line_num / way_num
vec_ic_ecc = 0
vec_ic_data_ram_width = 32
vec_ic_offset_addr_mask = 127 # 0x7f aligned addr with cacheline
vec_ic_otsd_num = 8 # misq size
vec_ic_pipe_length = 1
hit_info_buffer_size = 1
# calculate
vec_ic_line_num = 64 # size / (line_size)
vec_ic_data_ram_num = 4 # line_size / data_ram_width
# 2 way
vec_ic_idx_addr_lsb = 7 # [6:0]cacheline addr for tag ram
vec_ic_idx_addr_mask = 31 # [11:7] set_index
vec_ic_tag_addr_lsb = 12 # [47:12] Tag info stored in ram
vec_ic_tag_addr_mask = 68719476735 # 0xFFFFFFFFF
# 4 way
#vec_ic_idx_addr_lsb = 6 # [5:0]cacheline addr for tag ram
#vec_ic_idx_addr_mask = 63 # [12:6] set_index
#vec_ic_tag_addr_lsb = 13 # [47:13] Tag info stored in ram
#vec_ic_tag_addr_mask = 34359738367 # 0x7FFFFFFFF
vec_ic_per_way_data_ram_bank_num = 2
vec_ic_data_ram_bank_line_size = 8

[CCU]
aic_issque_size = 16
aic_issque_ostd_num = 15 # 32--->15
aiv_issque_size = 32 # 32--->64
aiv_issque_otsd_num = 2 # 32--->31
mte1_issque_size = 32
mte1_issque_otsd_num = 31 # 32--->31
fixp_issque_size = 32
fixp_issque_otsd_num = 31 # 32--->31
aic_mte23_issque_size = 32
aic_mte23_issque_otsd_num = 31
aiv_mte23_issque_size = 32
aiv_mte23_issque_otsd_num = 31
event_queue_size = 32
I1_slot_num = 2
# ar_option = 0 # 0 unable 1:RR 2:WRR
cube_weight = 2
mte1_weight = 3
fixp_weight = 1
mte2_weight = 3
vec_weight = 2
mte3_weight = 1
# mte_rs = [ 0, 1, 1, 0 ]
vec_rs = 1
cube_rs = 1

[SCALAR_Buf]
base_address = 262144 # 0x40000 # 256K
total_size = 16384 # 0x4000
wrap_en = 0
sys_va_base_config = 1 # 0: config by model spr for isg 1: config by spec
sys_va_base = 0 # sys va base address
stack_phy_base_config = 1 # 0: config by model spr for isg 1: config by spec
stack_phy_base = 34603008 # (stack_va_base_addr[48:25] != sys_va_addr[48:25]) == > stack in ddr 0x2100000
mem_init_befor_start_en = 0
stack_buffer_mode = 1
memory_map_option = 0 # 0:1911L/1951L = 15M 1:nsv = 1M
# asu
idu_to_asu_ostd = 1 # a.k.a. issue queue depth

[DCACHE]
aic_dc_set_size = 128 # 128-->64
aic_dc_line_num = 512 # 512-->256
aiv_dc_set_size = 128
aiv_dc_line_num = 512 # dc_line_num = dc_set_size * dc_way_size
aic_dc_size = 32 # dc_size = dc_line_size * dc_way_size * dc_set_size;
aiv_dc_size = 32 # dc_size = dc_line_size * dc_way_size * dc_set_size;
aic_dc_line_size = 64
aiv_dc_line_size = 64
aic_dc_way_size = 4
aiv_dc_way_size = 4
dc_max_read_otsd_num = 8
dc_max_write_otsd_num = 4
dc_mshr_main_entry_num = 8
dc_mshr_sub_entry_num = 8
dc_stb_main_entry_num = 8
dc_stb_sub_entry_num = 8
dc_stb_timeout_cycles = 32
dc_dstb_buf_size = 8
dc_dstb_entry_ary_size = 1
dc_req_que_size = 4
dc_ub_write_allocate = 0 # 0: write without-allocate 1: write with-allocate ### UB: fixed write-through
dc_ddr_write_allocate = 1 # 0: write without-allocate 1: write with-allocate ### DDR: fixed write-back
dc_ub_cacheable = 0 # NOTE: cacheable=0 => without-allocate
dc_lock_cacheline = 1
dc_lock_cacheline_num = 2
aic_dc_idx_addr_lsb = 6 # [5:0] cacheline addr for tag ram
aiv_dc_idx_addr_lsb = 6
aic_dc_idx_addr_mask = 127 # set(asso) num size 0x7f
aiv_dc_idx_addr_mask = 127 # set(asso) num size 0x7f
aic_dc_tag_addr_lsb = 13 # [12:6] cacheline addr for va tag
aiv_dc_tag_addr_lsb = 13 # [12:6] cacheline addr for va tag
dc_tag_addr_mask = 34359738367 # [47:13] Tag info stored in ram
dc_set_flag_en = 1

[MTE]
# cube and vec
cubecore_ue_mte1 = [] # not used
cubecore_ue_mte2 = [] # not used
cubecore_ue_mte3 = [] # not used
cubecore_ue_fixp = [] # not used
veccore_ue_mte1 = [] # not used
veccore_ue_mte2 = [] # not used
veccore_ue_mte3 = [] # not used
veccore_ue_fixp = [] # not used
cubecore_intf = [] # not used
veccore_intf = [] # not used
# mixcore
mixcore_ue_mte1 = ["UE3DV2", "UE2D", "UESET", "UEDMA", "UEWINOA", "UEWINOB"]
mixcore_ue_mte2 = ["UE2D", "UESET", "UEDMA", "ND2NZ", "AIPP", "UNZIP", "MVF"]
mixcore_ue_mte3 = ["UEDMA", "L1OUT"]
mixcore_ue_fixp = ["NZ2ND"]
mixcore_intf = ["L1RIF", "L0CRIF", "L1WIF", "L0AWIF", "L0BWIF", "L1WR_FIXP", "UBRIF", "UBWIF"]
# store_buf_port = ["L10", "L0C0"]
store_buf_size = 64 # same with write_otsd
brif_wrr_weight = [2, 1, 1, 1, 1]
bwif_wrr_weight = [2, 1, 1, 1, 1]
read_otsd = 128 # 64
write_otsd = 64 # 48
biu_burst = [512, 256, 128]
scramble_granularity = 0 # 0:disable 1:128B 2:256B 3:512B 4:1024B
intlv_granularity = 0 # 0:disable 1:128B 2:256B 3:512B 4:1024B
write_data_buffer_depth = 0
write_data_buffer_latency = 5
per_channel_en = 1
wr_otsd_waterline = 0 # write outstanding waterline, 0 means no waterline
# mte1
l1_to_l0a_bandwidth = 512
l1_to_l0b_bandwidth = 256
l1_to_l0c_bandwidth = 64
l1_to_smask_bandwidth = 32
l1_to_fb_post_bandwidth = 128
l1_to_fb_pre_bandwidth = 32
l1_to_bt_bandwidth = 64
l1_to_pt_bandwidth = 128
l1_to_sp_bandwidth = 128
l1_to_ub_bandwidth = 128
set2d_to_l0a_bandwidth = 512
set2d_to_l0b_bandwidth = 256
l0c_read_bandwidth = 512
l1_to_biu_bandwidth = 128
l1_to_l0a_wino_bandwidth = 256
l1_to_l0b_wino_bandwidth = 256
winoa_fetch_num = 8
# mte2
biu_to_l1_bandwidth = 256
biu_to_l0a_bandwidth = 256
biu_to_l0b_bandwidth = 256
biu_to_ub_bandwidth = 128
set2d_to_l1_bandwidth = 256
# mte3
ub_to_biu_bandwidth = 128
ub_to_l1_bandwidth = 128
# move align
move_align_v2 = 0
# move bt
move_bt_fb_v2 = 1
# nd2nz
nd2nz_buf_depth = 256
nd2nz_buf_num = 8
nd2nz_to_l1_bandwdith = 256
# fixp
nz2nd_trans_buf_size_ = 16
nz2nd_trans_buf_tmp_size_ = 8
nz2nd_trans_buf_depth_ = 8
fixp_pre_out_bandwidth = 256
fixp_wr_l1_bandwidth = 256
fixp_wr_ub_bandwidth = 128
fixp_wr_out_bandwidth = 128
fixp_rd_l0c_bandwidth = 512
compute_resource_parsim = 128
fixp_instr_fifo_depth = 4
bwif_acc_fixp_latency = 5
n_direction_pad = 0
fixp_l0c_req_ost_ = 256
fixp_status_report_en = 0
fixp_bitmask_en = 0
fixp_read_col_num = 16
fixp_ndummy_en = 0
fixp_nz2nd_row_merge_opt = false
nz2nd_row_merge_n_num = 32
# aipp
min_h_res = 8
max_h_res = 4096
byte_per_pixel_in_l1 = 32
y_dat_buf_size = 64
uv_dat_buf_size = 64
rgb_dat_buf_size = 128
uv_upsample_buf_size = 4096
sync_buf_size = 96
csc_buf_size = 24
dtc_buf_size = 48
cpadding_buf_size = 256
pixels_per_trans = 8
img_dat_channels = 3
aipp_dat_buf_bubble = 3
aipp_max_dtc_lat = 5
aipp_dtc_u8_fp16_lat = 5
aipp_dtc_u8_s8_lat = 1
aipp_dtc_chl_offset = 0
chickenBit_en = 0
dma_buffer_size_dc = 8192
dma_buffer_size_uc = 16384
dma_y_ping_buf_addr_uc = 0 # 0x0
dma_y_pong_buf_addr_uc = 16384 # 0x4000
dma_uv_ping_buf_addr_dc = 0 # 0x0
dma_uv_pong_buf_addr_dc = 8192 # 0x2000
dma_y_ping_buf_addr_dc = 16384 # 0x4000
dma_y_pong_buf_addr_dc = 24576 # 0x6000
aipp_print_img_en = 1
# waipp
dtc_normalized = 1
rd_proc_bytes = 32
waipp_print_img_en = 1
# unzip
unzip_fm_size = 512
fm_size = 512
unzip_pkt_size = 8
unzip_head_size = 8
unzip_dict_size = 34
unzip_low_sparse_dict_size = 36
unzip_str_dict_size = 1
unzip_out_len = 64
unzip_seg_size = 64
unzip_buffer_size = 32768
unzip_entry_size = 2
max_fetch_idx_num = 32
max_uzp_uop_crdt = 1
unzip_delay_time = 4
unzip_bypass_delay_time = 2
unzip_buffer_depth = 1
unzip_write_band_width = 256
unzip_engine_num = 4
uzp_to_l1_bus_width = 256
uzp_to_l0b_bus_width = 128
# nd dma
nddma_toml_en = 0
shared_one_nacache = 3
nddma_aixs_num = [64, 128, 2]
nddma_dst_stride = [256, 2, 1]
nddma_src_stride = [2, 750, 1]
nddma_add_stride = []
src_addr = 67108864
dst_addr = 0
element_size = 4
# ndcache
ndcache_size = 32
split_axis_ndcache_size = 32768 # for split axis algorithm
ndcache_line_size = 128
ndcache_fifo_depth = 8192
cache_line_hash_idx = 1
cache_bank_hash_en = 1
biu_rd_que_depth = 32
tag_ram_bank_num = 4
ndcache_rd_ostd = 64
ndcache_tag_ram_clr = 0
src_addr_cls_num_per_cycle = 1
ndcache_req_buf_depth = 8
dcache_dual_port = 0
miss_fifo_buf = 5
wr_cache_backpressure = 1

[VEC]
su_vec_depth = 4
aiv_stat_en_list = ["pem.veccore0.vec"]

[CUBE]
cube_dummy_cycle_number = 8
cube_spec_npe = 256
FP_partial_columns = 4 # only 4(full), 2, 1 columns is supported
m_frac_size = 16
n_frac_size = 16
global_sync_pulse_phase_type = 1
vdrop_tick = 48
fsm_ver = 1 # ver0: v100~v200; ver1: v210 and later projects
mmad_fsm_n2_mode = 0
fsm_m_pri = 1
cube_stage_num = 22
hset_l0c_check_stage = 14
hset_l0ab_check_stage = 3
hset_bt_check_stage = 3
wino_en = 1
depthwise_en = 0
group_conv_en = 0
s8s4_en = 0
u8s8_en = 0
u8_en = 0
cube_ctrl_base_config = 0 # 0: config by model spr; 1: config by spec toml
CUBE_CTRL_0 = 1743011328 # default: 1743011328
CUBE_CTRL_1 = 11162892 # default: 11162892
aic_stat_en_list = ["pem.cubecore0.cube"]
bt_fp16_compactly = 0

[L0A]
total_size = 65536 # 64 KB
wrap_en = 0
layoutzN_en = 1

[L0B]
total_size = 65536 # 64 KB
wrap_en = 0

[L0C]
total_size = 131072 # 128KB
wrap_en = 0
bank_count = 32
cube_unit_flag_rd_lat = 12 # cube wr req sent to rd unit flag set latency.
l0c_vec_read_dat_latency = 7 # arbitration to data back latency.

[SMASK]
total_size = 256
wrap_en = 0

[L1]
total_size = 1048576 # 0x100000
wrap_en = 0
buffer_bg_offset = 19
buffer_bg_num = 2
buffer_line_size = 16
buffer_bank_count = 16
buffer_bank_width = 32
core_access_width = 512
dmac_access_width = 512
read_port_num = 1
write_port_num = 2

[UB]
total_size = 262144 # 256KB
wrap_en = 0
buffer_line_size = 32
subbank_line_size = 32
buffer_bank_count = 64
bank_id_offset = 16 # bank ID bit offset in address
bank_group_number = 16
bank_num_in_group = 4
# port: LSU_R, LSU_W, GSU, VEC_SU_R, VEC_SU_W, AIC_SU_R, AIC_SU_W, AIV_SU_R, AIV_SU_W, MTE_R, MTE_W, EXT_R, EXT_W
port_num = [16, 8, 8, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4]
# buffer_depth: VEC_SU_R, VEC_SU_W, AIC_SU_R, AIC_SU_W, AIV_SU_R, AIV_SU_W, MTE_R, MTE_W
buffer_depth = [2, 7, 2, 7, 2, 7, 2, 2]
# UB read access latency (not include UB_ARB): LSU_R, GSU_R, VEC_SU_R, AIC_SU_R, AIV_SU_R, MTE_R, SFU_R, SIMT_R
read_latency = [4, 6, 7, 9, 9, 7, 6, 5]
# UB full write access latency (not include UB_ARB): LSU_W, GSU_W, VEC_SU_W, AIC_SU_W, AIV_SU_W, MTE_W, SFU_W, SIMT_W
write_latency = [2, 2, 2, 4, 4, 2, 2, 4]
ecc = 1
ecc_byte = 32
bank_conflict_chk = 1

[FB]
total_size = 6144 # 6 K
wrap_en = 0

[BT]
total_size = 1024 # 1 K
wrap_en = 0

[PT]
total_size = 131072 # 128K
wrap_en = 0

[SPIDX]
total_size = 16384 # 16 K
wrap_en = 0

[REG]
vreg_length = 256
phy_vreg_num = 52
phy_preg_num = 16
vir_vreg_num = 0 # for ooo vrat size 0:no pre-mapping !0:just num pre-mapping
vir_preg_num = 0 # for ooo prat size 0:no pre-mapping !0:just num pre-mapping
is_reset_en = 1 # 0:reserved state 1:reset state

[IDU]
# dispatch number settings
max_total_disp_num = 6 # max of instr can be dispatched in a tick (IDU FIFO depth = this * 2)
max_asu_disp_num = 1
max_vag_disp_num = 1
max_ld_disp_num = 2
max_st_disp_num = 1
max_ex_disp_num = 3
delay_isu2idu = 1
delay_ooo2idu = 1
check_sreg_hazard_en = 1

[ISU]
exq_depth = 26
ldq_depth = 26
stq_depth = 18
ex_fifo_depth = 0
ld_fifo_depth = 0
st_fifo_depth = 0
exq_iss_width = 2
ldq_ld_iss_width = 2
ldq_ga_iss_width = 1
stq_iss_width = 1
sqzn_fifo_depth = 8 # actual usable fifo depth is 8-1=7
exq_output_pipe = 0
exq_decode_and_congruence = 2
exq_early_free_reg_enable = 1 # early-free-register switch
exq_early_free_reg_time = 4 # tick to free reg before WB, =4
exq_early_free_valu_thres = 3 # valu latency threshold, =3
ld1_qsize = 4
ld2_qsize = 4
st_qsize = 4
gsu_intf_depth = 1
ex_qsize = 4
ooo_ldq_inf_size = 2
ooo_stq_inf_size = 1
ooo_exq_inf_size = 3
ex_iss_num = 2
ex_wreg_rd_pt_num = 1
ex_vreg_rd_pt_num = 4
ex_preg_rd_pt_num = 4
ex_wreg_wr_pt_num = 1
ex_vreg_wr_pt_num = 4
ex_preg_wr_pt_num = 2
iss_alloc_inv_en = 1 # issue alloc_vld=0 (not mapped) instr
max_stalled_tick = 100 # max stalled/stuck tick, for debugging
ldq_max_inorder_iss_num = 2 # max inorder instr can be issued in one tick
stq_max_inorder_iss_num = 1
exq_max_inorder_iss_num = 1
b2b_fwd_min_intvl = 1 # back-to-back forwarding min interval
b2b_fwd_delta_time = 2
wb2byps_fwd_min_intvl = 2 # wb-to-bypass forwarding min interval
wb2byps_fwd_delta_time = 1
# NOTE(review): the next two keys spell the prefix differently ("regrwbpys" vs
# "regrwbyps"); both are kept as found - confirm which spelling the consumer reads.
regrwbpys_fwd_min_intvl = 3 # register-rw-bypass min interval
regrwbyps_fwd_delta_time = 0

[RAT]
vrat_rport_num = 15
vrat_wport_num = 5
prat_rport_num = 15
prat_wport_num = 5

[OOO]
idu_to_ooo_ostd = 5
ooo_to_ldq_ostd = 2
ooo_to_stq_ostd = 1
ooo_to_exq_ostd = 3
# vf_bc mode
is_vf_bc_en = 0 # is vf_bc mode need remap rat
pre_vrat_num = 16 # vf_bc mode rat remap num
pre_prat_num = 8 # vf_bc mode rat remap num

[GSU]
instr_buf_depth = [2, 1, 1] # [GATHER, SCATTER, TSU]
timing_buffer_depth = 4
scatter_timing_buffer_depth = 2
pipe_stage_latency = [1, 4, 1, 0, 1, 2, 5, 2, 1] # [INTF, VX, I0, I1, I2RD, I2WR, MX, RDB, WB]
ub_access_para = [0, 4, 5, 7, 8, 17] # [bank_addr_lsb_, bank_addr_msb_, bank_id_lsb_, bank_id_msb_, align_addr_lsb_, align_addr_msb_]
input_parallelism = 128
paralleism = 64
bank_num = 8
coaleas_en = 0

[SFU]
nchw_pipe_latency = [2, 13, 2, 4, 2]
ub_access_para = [16, 17, 5, 8]
nchw_max_rd_blk = 16
nchw_max_wr_blk = 8
tran_max_rd_blk = 16
tran_max_wr_blk = 8
dma_max_rd_blk = 8
dma_max_wr_blk = 8
vms4_max_rd_rp_num = 16
vms4_max_wr_rp_num = 8
vms4_ibuf_crdt = 15
vms4_ibuf_latency = 8
max_rd_ub_crdt = 4
vms4_retire_latency = 10
no_access_latency = 6
vbs32_latency = 18
vbs32_thrput = 16

[TSU]
dir0_b8_chn_num = 16
dir0_b16_chn_num = 16
dir0_b32_chn_num = 8
dir1_b8_chn_num = 8
dir1_b16_chn_num = 8
dir1_b32_chn_num = 8
dir0_b8_hw_num = 4
dir0_b16_hw_num = 4
dir0_b32_hw_num = 8
dir1_b8_hw_num = 16
dir1_b16_hw_num = 16
dir1_b32_hw_num = 8
v4dtrans_uop_intlv_num = 16
v4dtrans_engine_latcy = 2
v4dtrans_retire_latcy = 8

[LSU]
ldu_rd_reg_latency = 1
stu_rd_reg_latency = 2
ldu_time_buf_depth = 5
stu_time_buf_depth = 6

[MVF]
mvf_dcache_en = 2
dc_cache_size = 192 # 192KB
dc_cacheline_size = 128
dc_bank_num = 4 # 32*4=128
dc_cacheline_num = 1536 # 1536=192*1024/128
dc_tag_bank_size = 4
dc_way_size = 4
dc_set_size = 128 # 96 = 192*1024/128/4/4 or 128 = 256*1024/128/4/4
dc_tag_bank_lsb = 7 # [8:7]
dc_tag_bank_mask = 3 # 0x3
dc_addr_coalescer_lab = 7 # [MAB:7]
dc_addr_coalescer_mask = 67108863 # 0x3ffffff
dc_tag_index_lsb = 9 # [15:9]
dc_tag_index_mask = 127 # 0x7f
dc_tag_addr_lsb = 16 # [31:16]
dc_tag_addr_mask = 65535 # 0xffff
dc_max_miss_fifo_size = 200 # tmp
dc_max_rd_biu_ostd = 32 # tmp
dc_free_cacheline_reclaim_num = 32 # tmp
dc_max_process_ostd = 32 # tmp
ub_index_window = 256

[SIMT]

[SIMT.SYS]
num_cores = 2

[SIMT.ARCH]
total_reg_num = 65536
total_phy_p_reg = 32 # we have 16. But each element is 256 bit, / (32threads x 4issue) = 2 entries
total_phy_v_reg = 26 # we have 52, but this time it's the reverse. Each registers are 256B, while there are 32thread x 4B x 4 issue = 512B is required per entry
total_sb = 32
max_warp_num = 64
thread_num = 32 # per warp
issue_num = 4
stack_base = 327680
warp_stack_size = 4096
debug = false
area_reduce = false
valu_option = 0 # 0:single_valu, 1:double_valu, 2:lnexp_mul

[SIMT.SCH]
l1_sch_slot_num = 4
l2_sch_slot_num = 12
l2_copy_scb_latency = 6
gto_arb_switch = 1
l2_with_ibuf = false

[SIMT.REG]
max_r_reg_num = 127 # to reserve the RZ encoding
max_p_reg_num = 7 # to reserve the PT encoding
max_s_reg_num = 32 # SREG can only be used as the source register within a vector function
sram_bank_num = 2 # TBC
rreg_rd_latency = [1, 1, 1] # [LSU,EXU,DVG]
preg_rd_latency = [1, 1, 1] # [LSU,EXU,DVG]
sreg_rd_latency = [1, 1, 1] # [LSU,EXU,DVG]
rreg_wr_latency = [1, 1, 1] # [LSU,EXU,DVG]
preg_wr_latency = [1, 1, 1] # [LSU,EXU,DVG]
sreg_wr_latency = [1, 1, 1] # [LSU,EXU,DVG]

[SIMT.IFU]
task_start_latency = 10
task_gen_latency = 3
scb_cnt_bitwidth = 6 # [0-63]
ibuf_fifo_depth = 8
ibuf_read_bandwidth = 64
ibuf_fifo_width = 16
ibuf_fifo_write_enable = 4
lsu_req_pre_buf_size = 32
agu_arbiter_mode = 0
ib_dynamic_alloc = 0
pcg_out_latency = 1
dcache_arb_mode = 1 # 0:rr 1:greedy->sp

[SIMT.DVG]
stack_entry_num = 16 # 16 for option1, option2 will write overflow stack to ddr, model will modify this option later
dvg_update_latency = 1
stack_base = 655360
warp_stack_size = 4096

[SIMT.LSU]
agu_que_depth = 3
wpayload_buf_depth = 64
wpayload_size_per_entry = 128
exclusive_switch = 1 # 0: atomic_load, 1: LDEX/STEX
atom_unit_fifo = 64

[SIMT.EXU]
rd_reg_latency = 10 # TBC
wr_reg_latency = 10 # TBC
mufu_option = 1

[SIMT.DCACHE]
line_size = 128 # bytes
sector_size = 128
way_num = 4
bank_num = 4
set_num = 128 # per bank
coalesce_depth = 32
cache_req_buf_depth = 16
bpsq_depth = 16
miss_fifo_len = 256
miss_handle_otsd = 256
biu_rd_ostd = 64
biu_wr_ostd = 64
reclaim_thrd = 40
reclaim_option = 1 # 0(random) , 1(LRU)
freelist_option = 1 # 0(fifo) , 1(bitmap)
random_min = 0
random_max = 512
max_ref_cnt = 31
max_fref_cnt = 7
ub_sz = 262144
max_shmem_sz = 65536
stk_wrmode = 0 # 0(writeback) , 1(writethrough)
stg_wrmode = 1 # 0(writeback) , 1(writethrough)
area_cut_mode = 0 # for scramble, 0(normal), 1(half area), 2(quarter area)
ld_st_stall_en = 0 # enable LD and ST switch stall cycle
partial_st_stall_en = 0 # enable extra partial write stall cycle
missfifo_num = 4
rwdb_entry_size = 128
bpsq_rwdb_ent_num = 14
mrob_rwdb_ent_num = 14