/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
; generated by igemm_codegen.py (2c97a12e9f49260c2ce1b87c12cf7e05adcaf403)
;
.include "igemm_fwd_gtcx35_nhwc_bf16_utils.inc"

;----------------------------------------------------------
; starting of kernel igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me
; tensor_layout              : 'nhwc'
; gemm_m_per_block           : 128
; gemm_n_per_block           : 128
; gemm_k_per_block           : 16
; wave_tile_m                : 32
; wave_step_m                : 1
; wave_repeat_m              : 2
; wave_tile_n                : 32
; wave_step_n                : 1
; wave_repeat_n              : 2
; wave_tile_k                : 4
; tensor_a_thread_lengths    : [1, 1, 8, 1]
; tensor_a_cluster_lengths   : [1, 16, 1, 16]
; tensor_b_thread_lengths    : [1, 1, 8, 1]
; tensor_b_cluster_lengths   : [1, 16, 1, 16]
; direction                  : 'fwd'
; precision                  : 'bf16'
; nxb                        : 0
; nxe                        : 1
; merge_e                    : 1
; vector_c                   : 1
; 
; block_size                 : 256
; lds_total                  : 8192
; lds_buffer_num             : 1
; 
.set k_p_in, 0
.set k_p_wei, 8
.set k_p_out, 16
.set k_hi, 24
.set k_wi, 28
.set k_n, 32
.set k_k, 36
.set k_c, 40
.set k_ho, 44
.set k_wo, 48
.set k_stride_h, 52
.set k_stride_w, 56
.set k_dilation_h, 60
.set k_dilation_w, 64
.set k_pad_h, 68
.set k_pad_w, 72
.set k_y, 76
.set k_x, 80
.set k_group, 84
.set k_magic_0, 88
.set k_magic_1, 92
.set k_magic_2, 96
.set k_magic_3, 100
.set k_magic_4, 104
.set k_magic_5, 108
.set k_shift_pack_0, 112
.set k_shift_pack_1, 116
.set k_gemm_k_global_split, 120
.set k__pack_0, 124
.set k_end, 128
.set k_gload_in_c_stride, 2

.set s_ka, 0
.set s_bx, 2
.set s_by, 3
.set s_p_in, 4
.set s_p_wei, 8
.set s_p_out, 12
.set s_hi, 16
.set s_wi, 17
.set s_n, 18
.set s_k, 19
.set s_c, 20
.set s_ho, 21
.set s_wo, 22
.set s_stride_h, 23
.set s_stride_w, 24
.set s_dilation_h, 25
.set s_dilation_w, 26
.set s_pad_h, 27
.set s_pad_w, 28
.set s_y, 29
.set s_x, 30
.set s_group, 31
.set s_in_stride_wi, 32
.set s_in_stride_n, 33
.set s_wei_stride_k0, 34
.set s_wei_stride_k, 35
.set s_out_stride_wo, 36
.set s_out_stride_n, 37
.set s_block_gtc_ig, 38
.set s_block_gtc_ik, 39
.set s_block_gtc_inb, 40
.set s_move_slice_k_stride_gemm_k, 41
.set s_knum, 3
.set s_dim_br, 42
.set s_dim_mp, 43
.set s_dim_mr, 44
.set s_dim_np, 45
.set s_gemm_k_diff_c, 31
.set s_move_slice_k_y, 46
.set s_move_slice_k_x, 47
.set s_move_slice_k_c, 48
.set s_diff_in_os_acc_y_x_c, 38
.set s_diff_in_os_ovf_c_acc_x, 29
.set s_diff_in_os_ovf_x_acc_y, 42
.set s_diff_in_iwi_acc_x, 43
.set s_diff_in_iwi_ovf_x, 45
.set s_diff_in_ihi_acc_y, 28
.set s_y_x_c, 49
.set s_kitr, 1
.set s_in_offset, 50
.set s_wei_offset, 51
.set s_magic_0, 6
.set s_magic_1, 7
.set s_magic_2, 14
.set s_magic_3, 15
.set s_magic_4, 10
.set s_magic_5, 11
.set s_shift_pack_0, 57
.set s_shift_pack_1, 58
.set s_tmp, 60
.set s_end, 66

.set v_c, 0  ; coalescing:16, needed:0, resuable:62
.set v_a, 0
.set v_b, 8
.set v_gld_a, 16
.set v_gld_b, 24
.set v_sst_a_os, 32
.set v_sld_a_os, 33
.set v_sst_b_os, 34
.set v_sld_b_os, 35
.set v_in_os, 36
.set v_in_ihi_list, 44
.set v_in_iwi_list, 52
.set v_in_flag, 60
.set v_in_flag_n, 68
.set v_wei_os, 69
.set v_out_os, 70
.set v_gtc_ic, 71
.set v_gtc_iec, 72
.set v_gtc_iy, 73
.set v_gtc_ix, 74
.set v_in_inb, 75
.set v_in_in, 76
.set v_wei_ik, 77
.set v_co_sst, 76
.set v_co_sld, 78
.set v_out_flag, 77
.set v_out_inb, 75
.set v_gemm_in, 79
.set v_gemm_im, 80
.set v_co_sub_m_index, 80
.set v_co_sub_n_index, 79
.set v_tmp, 82
.set v_wei_tmp_pack, 88
.set v_wei_flag, 89
.set v_end, 164

.set a_c, 100
.set a_end, 164

.text
.globl igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me
.p2align 8
.type igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me,@function
igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me:
    s_load_dwordx2  s[s_p_in+0:s_p_in+1],    s[s_ka+0:s_ka+1],    0+k_p_in
    s_load_dwordx2  s[s_p_wei+0:s_p_wei+1],   s[s_ka+0:s_ka+1],    0+k_p_wei
    s_load_dwordx2  s[s_p_out+0:s_p_out+1],   s[s_ka+0:s_ka+1],    0+k_p_out
    s_load_dwordx8 s[s_hi+0:s_hi+7],    s[s_ka+0:s_ka+1],    0+k_hi
    s_load_dwordx8 s[s_stride_w+0:s_stride_w+7],    s[s_ka+0:s_ka+1],    0+k_stride_w
    s_load_dwordx2 s[s_magic_0+0:s_magic_0+1],  s[s_ka+0:s_ka+1],  0+k_magic_0
    s_load_dwordx2 s[s_magic_2+0:s_magic_2+1],  s[s_ka+0:s_ka+1],  0+k_magic_2
    s_load_dwordx2 s[s_magic_4+0:s_magic_4+1], s[s_ka+0:s_ka+1],  0+k_magic_4
    s_load_dword s[s_shift_pack_0], s[s_ka+0:s_ka+1],  0+k_shift_pack_0
    s_load_dword s[s_shift_pack_1], s[s_ka+0:s_ka+1],  0+k_shift_pack_1
    ; in(e, c, nb0, nb1) thread_lengths: 1x1x8x1, cluster_length: 1x16x1x16, k_pack:1
    v_mov_b32 v[v_tmp], v0
    v_and_b32 v[v_gtc_iec], 15, v[v_tmp]
    v_lshrrev_b32 v[v_tmp], 4, v[v_tmp]
    v_and_b32 v[v_in_inb], 15, v[v_tmp]
    ; wei(e, c, k0, k1) thread_length: 1x1x8x1, cluster_length: 1x16x1x16, k_pack:1
    v_lshrrev_b32 v[v_tmp], 4, v0
    v_and_b32 v[v_wei_ik], 15, v[v_tmp]

    s_mov_b32 s[s_tmp], 16777215
    s_waitcnt lgkmcnt(0)

    ; calculate index
    s_lshr_b32 s[s_move_slice_k_y], s[s_y], 24
    s_lshr_b32 s[s_move_slice_k_x], s[s_x], 24
    s_lshr_b32 s[s_move_slice_k_c], s[s_c], 24
    s_and_b32 s[s_y], s[s_tmp], s[s_y]
    s_and_b32 s[s_x], s[s_tmp], s[s_x]
    s_and_b32 s[s_c], s[s_tmp], s[s_c]
    s_mul_i32 s[s_tmp], s[s_c], s[s_x]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080000 ; offset:0, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_gtc_iy,v_gtc_iec,s_magic_4,s_tmp+3,s_tmp,v_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_1], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_vs v_gtc_ic,v_gtc_ix,v_tmp+4,s_magic_5,s_tmp+3,s_c,v_tmp
    s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group]
    s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi]
    s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2]
    s_mul_i32 s[s_tmp], s[s_x], s[s_c]
    s_mul_i32 s[s_wei_stride_k], s[s_tmp], s[s_y]
    s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k], 4
    s_mov_b32 s[s_y_x_c], s[s_wei_stride_k]
    s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group]
    s_mul_i32 s[s_tmp+1], s[s_wo], s[s_out_stride_wo]
    s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+1]
    s_mul_i32  s[s_tmp], s[s_n], s[s_in_stride_n]
    s_mul_i32  s[s_tmp+1], s[s_n], s[s_out_stride_n]
    s_lshl_b32 s[s_tmp+4], s[s_tmp], 1
    s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1
    s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4]
    s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4]
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]
    s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5]
    s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5]
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
    s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]
    s_add_u32 s[s_tmp], 15, s[s_wei_stride_k]
    s_lshr_b32 s[s_tmp], s[s_tmp], 4
    s_lshl_b32 s[s_knum], s[s_tmp], 4
    s_mul_i32 s[s_dim_br], s[s_ho], s[s_wo]
    s_mul_i32 s[s_dim_mr], s[s_n], s[s_dim_br]
    v_mul_u32_u24 v[v_sst_a_os], s[s_dilation_h], v[v_gtc_iy]
    s_add_u32 s[s_tmp], 127, s[s_dim_mr]
    v_subrev_u32 v[v_sst_a_os], s[s_pad_h], v[v_sst_a_os]
    s_lshr_b32 s[s_tmp+1], s[s_tmp], 7
    v_mul_u32_u24 v[v_sld_a_os], s[s_dilation_w], v[v_gtc_ix]
    s_lshl_b32 s[s_dim_mp], s[s_tmp+1], 7
    v_subrev_u32 v[v_sld_a_os], s[s_pad_w], v[v_sld_a_os]
    s_add_u32 s[s_tmp], 127, s[s_k]
    s_lshr_b32 s[s_tmp+1], s[s_tmp], 7
    s_lshl_b32 s[s_dim_np], s[s_tmp+1], 7

    ; gemm_m_per_block:128, gemm_n_per_block:128, source_access_order:0
    s_lshr_b32 s[s_tmp], s[s_dim_mp], 7
    s_lshr_b32 s[s_tmp+1], s[s_dim_np], 7
    s_mul_i32 s[0], s[s_tmp+1], s[s_tmp]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080018 ; offset:24, width:8
    .mdiv_u32_rem_ss s_tmp+4,s_block_gtc_ig,s_bx,s_magic_3,s_tmp+3,0,s_tmp
    s_mov_b32 s[s_bx], s[s_tmp+4]
    s_lshr_b32 s[0], s[s_dim_np], 7
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080000 ; offset:0, width:8
    .mdiv_u32_rem_ss s_tmp+4,s_tmp+5,s_bx,s_magic_0,s_tmp+3,0,s_tmp
    ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im
    s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+4], 7
    s_lshl_b32 s[s_block_gtc_inb], s[s_tmp+5], 7
    v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_in_inb]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_in_iwi_list,v_in_ihi_list,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp
    v_mul_lo_u32 v[v_in_ihi_list], s[s_stride_h], v[v_in_ihi_list]
    v_add_u32 v[v_in_ihi_list], v[v_in_ihi_list], v[v_sst_a_os]
    v_mul_lo_u32 v[v_in_iwi_list], s[s_stride_w], v[v_in_iwi_list]
    v_add_u32 v[v_in_iwi_list], v[v_in_iwi_list], v[v_sld_a_os]

    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_tmp+1], 0, 1, vcc
    v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp]
    v_lshlrev_b32 v[v_in_flag_n], 0, v[v_tmp]
    s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1
    ; calculate wei offset
    s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k]
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2]
    s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp]
    s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_ik], v[v_wei_ik]
    v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_tmp+5]
    v_add_lshl_u32 v[v_wei_os], v[v_tmp], v[v_gtc_iec], 1
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5]
    v_cndmask_b32 v[v_wei_flag], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_wei_flag], 0, v[v_wei_flag], vcc
    v_mov_b32 v[v_wei_tmp_pack], v[v_wei_flag]
    s_mov_b32 s[s_tmp], 16
    v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5]
    v_cndmask_b32 v[v_wei_flag+1], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_wei_flag+1], 0, v[v_wei_flag+1], vcc
    v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+1], 1, v[v_wei_tmp_pack]
    v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5]
    v_cndmask_b32 v[v_wei_flag+2], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_wei_flag+2], 0, v[v_wei_flag+2], vcc
    v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+2], 2, v[v_wei_tmp_pack]
    v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5]
    v_cndmask_b32 v[v_wei_flag+3], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_wei_flag+3], 0, v[v_wei_flag+3], vcc
    v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+3], 3, v[v_wei_tmp_pack]
    v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5]
    v_cndmask_b32 v[v_wei_flag+4], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_wei_flag+4], 0, v[v_wei_flag+4], vcc
    v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+4], 4, v[v_wei_tmp_pack]
    v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5]
    v_cndmask_b32 v[v_wei_flag+5], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_wei_flag+5], 0, v[v_wei_flag+5], vcc
    v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+5], 5, v[v_wei_tmp_pack]
    v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5]
    v_cndmask_b32 v[v_wei_flag+6], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_wei_flag+6], 0, v[v_wei_flag+6], vcc
    v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+6], 6, v[v_wei_tmp_pack]
    v_add_u32 v[v_tmp+5], s[s_tmp], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp+5]
    v_cndmask_b32 v[v_wei_flag+7], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_wei_flag+7], 0, v[v_wei_flag+7], vcc
    v_lshl_or_b32 v[v_wei_tmp_pack], v[v_wei_flag+7], 7, v[v_wei_tmp_pack]

    s_lshl_b32 s[s_wei_stride_k0], s[s_wei_stride_k0], 1

    s_mul_i32 s[s_wei_offset+0], 2, s[s_wei_stride_k0]
    s_mul_i32 s[s_wei_offset+1], 3, s[s_wei_stride_k0]
    s_mul_i32 s[s_wei_offset+2], 4, s[s_wei_stride_k0]
    s_mul_i32 s[s_wei_offset+3], 5, s[s_wei_stride_k0]
    s_mul_i32 s[s_wei_offset+4], 6, s[s_wei_stride_k0]
    s_mul_i32 s[s_wei_offset+5], 7, s[s_wei_stride_k0]
    .v_clear_nc v_gld_b, 8
    s_mov_b32 s[s_p_wei+2], 0xffffffff
    s_mov_b32 s[s_p_wei+3], 0x27000
    ; load weight
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag]
    buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1]
    buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2]
    buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3]
    buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+4]
    buffer_load_short_d16 v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+5]
    buffer_load_short_d16 v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+6]
    buffer_load_short_d16 v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+7]
    buffer_load_short_d16 v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0
    s_mov_b64 exec, -1

    ; calculate in offset
    s_mov_b32 s[s_in_offset], 0
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_c]
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]

    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    s_lshl_b32 s[s_in_stride_wi], s[s_in_stride_wi], 1
    v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1
    v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list]
    v_add_u32 v[v_tmp], v[v_in_iwi_list], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp]
    v_add_u32 v[v_in_os], v[v_tmp+4], v[v_tmp]
    v_bfe_u32 v[v_tmp+1], v[v_in_flag_n],  0, 1
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+1], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc

    s_mov_b32 s1, 16
    v_add_u32 v[v_tmp], s1, v[v_in_inb]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_in_iwi_list+1,v_in_ihi_list+1,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp
    v_mul_lo_u32 v[v_in_ihi_list+1], s[s_stride_h], v[v_in_ihi_list+1]
    v_add_u32 v[v_in_ihi_list+1], v[v_in_ihi_list+1], v[v_sst_a_os]
    v_mul_lo_u32 v[v_in_iwi_list+1], s[s_stride_w], v[v_in_iwi_list+1]
    v_add_u32 v[v_in_iwi_list+1], v[v_in_iwi_list+1], v[v_sld_a_os]

    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1
    v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+1]
    v_add_u32 v[v_tmp], v[v_in_iwi_list+1], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp]
    v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_tmp]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_tmp+1], 0, 1, vcc
    v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp]
    v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 1, v[v_in_flag_n]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1]
    v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1]
    v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc
    s_mov_b32 s1, 32
    v_add_u32 v[v_tmp], s1, v[v_in_inb]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_in_iwi_list+2,v_in_ihi_list+2,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp
    v_mul_lo_u32 v[v_in_ihi_list+2], s[s_stride_h], v[v_in_ihi_list+2]
    v_add_u32 v[v_in_ihi_list+2], v[v_in_ihi_list+2], v[v_sst_a_os]
    v_mul_lo_u32 v[v_in_iwi_list+2], s[s_stride_w], v[v_in_iwi_list+2]
    v_add_u32 v[v_in_iwi_list+2], v[v_in_iwi_list+2], v[v_sld_a_os]

    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1
    v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+2]
    v_add_u32 v[v_tmp], v[v_in_iwi_list+2], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp]
    v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_tmp]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_tmp+1], 0, 1, vcc
    v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp]
    v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 2, v[v_in_flag_n]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2]
    v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2]
    v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc
    s_mov_b32 s1, 48
    v_add_u32 v[v_tmp], s1, v[v_in_inb]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_in_iwi_list+3,v_in_ihi_list+3,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp
    v_mul_lo_u32 v[v_in_ihi_list+3], s[s_stride_h], v[v_in_ihi_list+3]
    v_add_u32 v[v_in_ihi_list+3], v[v_in_ihi_list+3], v[v_sst_a_os]
    v_mul_lo_u32 v[v_in_iwi_list+3], s[s_stride_w], v[v_in_iwi_list+3]
    v_add_u32 v[v_in_iwi_list+3], v[v_in_iwi_list+3], v[v_sld_a_os]

    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1
    v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+3]
    v_add_u32 v[v_tmp], v[v_in_iwi_list+3], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp]
    v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_tmp]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_tmp+1], 0, 1, vcc
    v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp]
    v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 3, v[v_in_flag_n]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3]
    v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3]
    v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc
    s_mov_b32 s1, 64
    v_add_u32 v[v_tmp], s1, v[v_in_inb]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_in_iwi_list+4,v_in_ihi_list+4,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp
    v_mul_lo_u32 v[v_in_ihi_list+4], s[s_stride_h], v[v_in_ihi_list+4]
    v_add_u32 v[v_in_ihi_list+4], v[v_in_ihi_list+4], v[v_sst_a_os]
    v_mul_lo_u32 v[v_in_iwi_list+4], s[s_stride_w], v[v_in_iwi_list+4]
    v_add_u32 v[v_in_iwi_list+4], v[v_in_iwi_list+4], v[v_sld_a_os]

    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1
    v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+4]
    v_add_u32 v[v_tmp], v[v_in_iwi_list+4], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp]
    v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_tmp]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_tmp+1], 0, 1, vcc
    v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp]
    v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 4, v[v_in_flag_n]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4]
    v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4]
    v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc
    s_mov_b32 s1, 80
    v_add_u32 v[v_tmp], s1, v[v_in_inb]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_in_iwi_list+5,v_in_ihi_list+5,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp
    v_mul_lo_u32 v[v_in_ihi_list+5], s[s_stride_h], v[v_in_ihi_list+5]
    v_add_u32 v[v_in_ihi_list+5], v[v_in_ihi_list+5], v[v_sst_a_os]
    v_mul_lo_u32 v[v_in_iwi_list+5], s[s_stride_w], v[v_in_iwi_list+5]
    v_add_u32 v[v_in_iwi_list+5], v[v_in_iwi_list+5], v[v_sld_a_os]

    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1
    v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+5]
    v_add_u32 v[v_tmp], v[v_in_iwi_list+5], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp]
    v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_tmp]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_tmp+1], 0, 1, vcc
    v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp]
    v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 5, v[v_in_flag_n]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5]
    v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5]
    v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc
    s_mov_b32 s1, 96
    v_add_u32 v[v_tmp], s1, v[v_in_inb]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_in_iwi_list+6,v_in_ihi_list+6,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp
    v_mul_lo_u32 v[v_in_ihi_list+6], s[s_stride_h], v[v_in_ihi_list+6]
    v_add_u32 v[v_in_ihi_list+6], v[v_in_ihi_list+6], v[v_sst_a_os]
    v_mul_lo_u32 v[v_in_iwi_list+6], s[s_stride_w], v[v_in_iwi_list+6]
    v_add_u32 v[v_in_iwi_list+6], v[v_in_iwi_list+6], v[v_sld_a_os]

    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1
    v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+6]
    v_add_u32 v[v_tmp], v[v_in_iwi_list+6], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp]
    v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_tmp]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_tmp+1], 0, 1, vcc
    v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp]
    v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 6, v[v_in_flag_n]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6]
    v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6]
    v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc
    s_mov_b32 s1, 112
    v_add_u32 v[v_tmp], s1, v[v_in_inb]
    v_add_u32 v[v_tmp+5], s[s_block_gtc_inb], v[v_tmp]
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080008 ; offset:8, width:8
    .mdiv_u32_rem_vs v_tmp+4,v_in_in,v_tmp+5,s_magic_1,s_tmp+3,s_dim_br,v_tmp
    s_bfe_u32 s[s_tmp+3], s[s_shift_pack_0], 0x00080010 ; offset:16, width:8
    .mdiv_u32_rem_vs v_in_iwi_list+7,v_in_ihi_list+7,v_tmp+4,s_magic_2,s_tmp+3,s_wo,v_tmp
    v_mul_lo_u32 v[v_in_ihi_list+7], s[s_stride_h], v[v_in_ihi_list+7]
    v_add_u32 v[v_in_ihi_list+7], v[v_in_ihi_list+7], v[v_sst_a_os]
    v_mul_lo_u32 v[v_in_iwi_list+7], s[s_stride_w], v[v_in_iwi_list+7]
    v_add_u32 v[v_in_iwi_list+7], v[v_in_iwi_list+7], v[v_sld_a_os]

    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n], v[v_in_in]
    v_add_lshl_u32 v[v_tmp+4], v[v_gtc_ic], v[v_tmp+1], 1
    v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi_list+7]
    v_add_u32 v[v_tmp], v[v_in_iwi_list+7], v[v_tmp]
    v_mul_lo_u32 v[v_tmp], s[s_in_stride_wi], v[v_tmp]
    v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_tmp]
    v_cmp_gt_u32 vcc, s[s_n], v[v_in_in]
    v_cndmask_b32 v[v_tmp], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_tmp+1], 0, 1, vcc
    v_and_b32 v[v_tmp], v[v_tmp+1], v[v_tmp]
    v_lshl_or_b32 v[v_in_flag_n], v[v_tmp], 7, v[v_in_flag_n]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7]
    v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7]
    v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc
    s_mov_b32 s[s_p_in+2], 0xffffffff
    s_mov_b32 s[s_p_in+3], 0x27000
    ; load input, nxe:1
    .v_clear_nc v_gld_a, 8
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+1]
    buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+2]
    buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+3]
    buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+4]
    buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+5]
    buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+6]
    buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+7]
    buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1

    v_mov_b32 v[v_tmp+5], v0
    ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1
    v_and_b32 v[v_gemm_in], 15, v[v_tmp+5]           ; block_n index 
    v_and_b32 v[v_gemm_im], 15, v[v_tmp+5]           ; block_m index 
    v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in]   ; shift left k_pack:4
    v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im]   ; shift left k_pack:4
    v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5]
    v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5]          ; block_n_per_wave index
    v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 6, v[v_gemm_in]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp + 1], 1, v[v_tmp+5]          ; block_m_per_wave index
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 1], 6, v[v_gemm_im]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5]  ; waves_per_n index
    v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 7, v[v_gemm_in]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5]  ; waves_per_m index
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 7, v[v_gemm_im]

    v_mov_b32 v[v_tmp+5], v0
    ; xdlops mapping, get dst matrix gemm index
    v_and_b32 v[v_tmp+0], 15, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 4, v[v_tmp+5]
    v_and_b32 v[v_tmp+1], 3, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 2, v[v_tmp+5]
    v_mov_b32 v[v_co_sst], v[v_tmp+0]
    v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1]
    v_and_b32 v[v_tmp+0], 1, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
    v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 5, v[v_co_sst]
    v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 5, v[v_co_sld]

    ; LDS store, in: e,c,nb0,nb1: 1x1x8x1, 1x16x1x16, k_pack:1, k_pack_gld_a:1, bf16
    v_lshlrev_b32 v[v_tmp+2], 2,  v[v_in_inb]
    v_lshrrev_b32 v[v_tmp+1], 2,  v[v_gtc_iec]
    v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2]
    v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec]
    v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2]
    v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp]

    v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im] ; LDS load in
    ; LDS store, wei: e,c,k: 1x1x8x1, 1x16x1x16, k_pack:1, k_pack_gld_b:1, bf16
    v_lshlrev_b32 v[v_tmp+2], 2,  v[v_wei_ik]
    v_lshrrev_b32 v[v_tmp+1], 2,  v[v_gtc_iec]
    v_lshl_or_b32 v[v_tmp], v[v_tmp+1], 9, v[v_tmp+2]
    v_and_b32 v[v_tmp+2], 3, v[v_gtc_iec]
    v_or_b32 v[v_tmp], v[v_tmp], v[v_tmp+2]
    v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp]
    v_add_u32 v[v_sst_b_os], 4096, v[v_sst_b_os]

    v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in] ; LDS load wei
    v_add_u32 v[v_sld_b_os], 4096, v[v_sld_b_os]
    v_mov_b32 v[v_gemm_in], v[v_co_sst]
    v_mov_b32 v[v_gemm_im], v[v_co_sld]
    ; init_co_lds_offset for xdlops
    v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im]
    v_and_b32 v[v_tmp],  3 v[v_tmp]   ; thread id of lanegroup_m_per_cluster
    v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp]
    v_lshrrev_b32 v[v_tmp+2], 5, v[v_gemm_im]  ; thread id of waves_per_m
    v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst]
    v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst]
    v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in]   ; implicit transpose with m granularity:4 while store
    v_lshl_or_b32 v[v_co_sst], v[v_tmp], 9, v[v_tmp+1]
    v_lshlrev_b32 v[v_co_sst], 1, v[v_co_sst]
    v_lshlrev_b32 v[v_co_sld], 3, v[0]
    ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4]
    ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2
    ; nd_stride:[4, 4, 1, 1, 2, 1, 2, 1]
    v_lshrrev_b32 v[v_co_sub_m_index], 7, v[0]   ; get tid along m
    v_and_b32 v[v_tmp+0], 3, v[v_co_sub_m_index]                   ; => x_mc
    v_lshlrev_b32 v[v_co_sub_m_index], 2, v[v_tmp+0]      ; => accumulate x_mc
    ; init_co_sub_n_index xdlops
    v_and_b32 v[v_co_sub_n_index], 127, v[0]

    v_add_u32 v[v_tmp], s[s_block_gtc_ik], v[v_co_sub_n_index]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    v_cndmask_b32 v[v_out_flag], 0, 1, vcc
    ; output offset
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_k]
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
    s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]

    s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 1
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp+3]
    s_addc_u32 s[s_p_out+1], s[s_p_out+1], 0

    s_lshl_b32 s[s_out_stride_wo], s[s_out_stride_wo], 1
    v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index]   ; total n*ho*wo
    v_mul_lo_u32 v[v_out_os], s[s_out_stride_wo], v[v_out_inb]
    v_lshlrev_b32 v[v_tmp], 1, v[v_co_sub_n_index]
    v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp]
    ; move slice stride
    v_bfe_u32 v[v_wei_flag], v[v_wei_tmp_pack], 0, 1
    s_mov_b32 s[s_move_slice_k_stride_gemm_k], 32
    v_bfe_u32 v[v_wei_flag+1], v[v_wei_tmp_pack], 1, 1
    
    s_mul_i32 s[s_tmp+5], s[s_wi], s[s_in_stride_wi]
    s_mul_i32 s[s_tmp], s[s_dilation_w], s[s_in_stride_wi]
    s_lshl_b32 s[s_tmp+1], s[s_c], 1
    s_sub_i32 s[s_diff_in_os_ovf_c_acc_x], s[s_tmp], s[s_tmp+1]
    s_mul_i32 s[s_diff_in_iwi_acc_x], s[s_move_slice_k_x], s[s_dilation_w]
    s_mul_i32 s[s_diff_in_iwi_ovf_x], s[s_x], s[s_dilation_w]
    s_mul_i32 s[s_diff_in_ihi_acc_y], s[s_move_slice_k_y], s[s_dilation_h]
    s_mul_i32 s[s_tmp+5], s[s_tmp+5], s[s_dilation_h]
    s_mul_i32 s[s_tmp+2], s[s_tmp], s[s_move_slice_k_x]
    s_lshl_b32 s[s_tmp+1], s[s_move_slice_k_c], 1
    s_mul_i32 s[s_tmp], s[s_diff_in_ihi_acc_y], s[s_tmp+5]
    s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_tmp], s[s_tmp+1]
    s_add_u32 s[s_diff_in_os_acc_y_x_c], s[s_diff_in_os_acc_y_x_c], s[s_tmp+2]
    s_mul_i32 s[s_tmp], s[s_diff_in_iwi_ovf_x], s[s_in_stride_wi]
    s_sub_i32 s[s_diff_in_os_ovf_x_acc_y], s[s_tmp+5], s[s_tmp]

    s_mov_b32 s[s_p_out+2], 0xffffffff
    v_bfe_u32 v[v_wei_flag+2], v[v_wei_tmp_pack], 2, 1
    s_mov_b32 s[s_p_out+3], 0x27000
    v_bfe_u32 v[v_wei_flag+3], v[v_wei_tmp_pack], 3, 1
    v_bfe_u32 v[v_wei_flag+4], v[v_wei_tmp_pack], 4, 1
    v_bfe_u32 v[v_wei_flag+5], v[v_wei_tmp_pack], 5, 1
    v_bfe_u32 v[v_wei_flag+6], v[v_wei_tmp_pack], 6, 1
    v_bfe_u32 v[v_wei_flag+7], v[v_wei_tmp_pack], 7, 1
    ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 1x1 step, k_pack:4
    s_waitcnt vmcnt(8)
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+0] 
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:128
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+2] offset:256
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:384
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+4] offset:512
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+5] offset:640
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+6] offset:768
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+7] offset:896

    s_waitcnt vmcnt(0)
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+0] 
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:128
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:256
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:384
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:512
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:640
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:768
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:896

    .v_clear_nc a_c, 64
    ; make sure acc WAR harzard, at least 1 nop for src_c
    s_sub_i32 s[s_kitr], s[s_knum], 16
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_mfma_end

    v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x]
    v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y]
    v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c]
    v_add_u32 v[v_gtc_iec], 16, v[v_gtc_iec]
    v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic]
    v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic]
    v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic]
    v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix]
    v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy]
    v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4]
    s_mov_b64 exec, -1
    v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix]
    v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix]
    v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix]
    v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy]
    v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5]
    v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4]
    s_mov_b64 exec, -1
    v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list]
    v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1]
    v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2]
    v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3]
    v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4]
    v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5]
    v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6]
    v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7]
    v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list]
    v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1]
    v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2]
    v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3]
    v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4]
    v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5]
    v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6]
    v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7]
    v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os]
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc
    v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag]
    v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1]
    v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2]
    v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3]
    v_and_b32 v[v_wei_flag+4], v[v_gtc_iy], v[v_wei_flag+4]
    v_and_b32 v[v_wei_flag+5], v[v_gtc_iy], v[v_wei_flag+5]
    v_and_b32 v[v_wei_flag+6], v[v_gtc_iy], v[v_wei_flag+6]
    v_and_b32 v[v_wei_flag+7], v[v_gtc_iy], v[v_wei_flag+7]
    v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
    v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1]
    v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1]
    v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc
    v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2]
    v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2]
    v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc
    v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3]
    v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3]
    v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc
    v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4]
    v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4]
    v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc
    v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5]
    v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5]
    v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc
    v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6]
    v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6]
    v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc
    v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7]
    v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7]
    v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc
    
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512
L_igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_mfma_body:
    ; do fma accumulate with unroll 16
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_16x16x4bf16_1k v[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], v[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag]
    buffer_load_short_d16 v[v_gld_b], v[v_wei_os], s[s_p_wei:s_p_wei+3], 0 offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+1]
    buffer_load_short_d16 v[v_gld_b+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_stride_k0] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+2]
    buffer_load_short_d16 v[v_gld_b+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+3]
    buffer_load_short_d16 v[v_gld_b+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+1] offen offset:0
    s_mov_b64 exec, -1
    ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0
    ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0
    s_waitcnt lgkmcnt(3)
    v_mfma_f32_16x16x4bf16_1k v[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], v[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+4]
    buffer_load_short_d16 v[v_gld_b+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+2] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+5]
    buffer_load_short_d16 v[v_gld_b+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+3] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+6]
    buffer_load_short_d16 v[v_gld_b+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+4] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_wei_flag+7]
    buffer_load_short_d16 v[v_gld_b+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_wei_offset+5] offen offset:0
    s_mov_b64 exec, -1
    ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1
    ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1
    s_waitcnt lgkmcnt(4)
    v_mfma_f32_16x16x4bf16_1k v[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], v[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16
    .v_clear_nc v_gld_a, 8
    v_cmpx_le_u32 vcc, 1, v[v_in_flag]
    buffer_load_short_d16 v[v_gld_a], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+1]
    buffer_load_short_d16 v[v_gld_a+1], v[v_in_os+1], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+2]
    buffer_load_short_d16 v[v_gld_a+2], v[v_in_os+2], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0
    v_mfma_f32_16x16x4bf16_1k v[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], v[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+3]
    buffer_load_short_d16 v[v_gld_a+3], v[v_in_os+3], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+4]
    buffer_load_short_d16 v[v_gld_a+4], v[v_in_os+4], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+5]
    buffer_load_short_d16 v[v_gld_a+5], v[v_in_os+5], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+6]
    buffer_load_short_d16 v[v_gld_a+6], v[v_in_os+6], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4bf16_1k v[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], v[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    v_cmpx_le_u32 vcc, 1, v[v_in_flag+7]
    buffer_load_short_d16 v[v_gld_a+7], v[v_in_os+7], s[s_p_in:s_p_in+3], s[s_in_offset] offen offset:0
    s_mov_b64 exec, -1
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4bf16_1k v[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], v[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16
    v_mov_b32 v[v_gtc_iy], s[s_diff_in_iwi_acc_x]
    v_mov_b32 v[v_tmp+5], s[s_diff_in_ihi_acc_y]
    v_mov_b32 v[v_tmp+4], s[s_diff_in_os_acc_y_x_c]
    v_add_u32 v[v_gtc_iec], 16, v[v_gtc_iec]
    v_add_u32 v[v_gtc_ic], s[s_move_slice_k_c], v[v_gtc_ic]
    v_cmpx_le_u32 vcc, s[s_c], v[v_gtc_ic]
    v_subrev_u32 v[v_gtc_ic], s[s_c], v[v_gtc_ic]
    v_add_u32 v[v_gtc_ix], 1, v[v_gtc_ix]
    v_add_u32 v[v_gtc_iy], s[s_dilation_w], v[v_gtc_iy]
    v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_c_acc_x], v[v_tmp+4]
    s_mov_b64 exec, -1
    v_add_u32 v[v_gtc_ix], s[s_move_slice_k_x], v[v_gtc_ix]
    v_cmpx_le_u32 vcc, s[s_x], v[v_gtc_ix]
    v_subrev_u32 v[v_gtc_ix], s[s_x], v[v_gtc_ix]
    v_subrev_u32 v[v_gtc_iy], s[s_diff_in_iwi_ovf_x], v[v_gtc_iy]
    v_add_u32 v[v_tmp+5], s[s_dilation_h], v[v_tmp+5]
    v_add_u32 v[v_tmp+4], s[s_diff_in_os_ovf_x_acc_y], v[v_tmp+4]
    s_mov_b64 exec, -1
    v_add_u32 v[v_in_iwi_list], v[v_gtc_iy], v[v_in_iwi_list]
    v_add_u32 v[v_in_iwi_list+1], v[v_gtc_iy], v[v_in_iwi_list+1]
    v_add_u32 v[v_in_iwi_list+2], v[v_gtc_iy], v[v_in_iwi_list+2]
    v_add_u32 v[v_in_iwi_list+3], v[v_gtc_iy], v[v_in_iwi_list+3]
    v_add_u32 v[v_in_iwi_list+4], v[v_gtc_iy], v[v_in_iwi_list+4]
    v_add_u32 v[v_in_iwi_list+5], v[v_gtc_iy], v[v_in_iwi_list+5]
    v_add_u32 v[v_in_iwi_list+6], v[v_gtc_iy], v[v_in_iwi_list+6]
    v_add_u32 v[v_in_iwi_list+7], v[v_gtc_iy], v[v_in_iwi_list+7]
    v_add_u32 v[v_in_ihi_list], v[v_tmp+5], v[v_in_ihi_list]
    v_add_u32 v[v_in_ihi_list+1], v[v_tmp+5], v[v_in_ihi_list+1]
    v_add_u32 v[v_in_ihi_list+2], v[v_tmp+5], v[v_in_ihi_list+2]
    v_add_u32 v[v_in_ihi_list+3], v[v_tmp+5], v[v_in_ihi_list+3]
    v_add_u32 v[v_in_ihi_list+4], v[v_tmp+5], v[v_in_ihi_list+4]
    v_add_u32 v[v_in_ihi_list+5], v[v_tmp+5], v[v_in_ihi_list+5]
    v_add_u32 v[v_in_ihi_list+6], v[v_tmp+5], v[v_in_ihi_list+6]
    ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4bf16_1k v[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], v[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16
    v_add_u32 v[v_in_ihi_list+7], v[v_tmp+5], v[v_in_ihi_list+7]
    v_add_u32 v[v_wei_os], s[s_move_slice_k_stride_gemm_k], v[v_wei_os]
    v_cmp_gt_u32 vcc, s[s_y_x_c], v[v_gtc_iec]
    v_cndmask_b32 v[v_gtc_iy], 0, 1, vcc
    v_and_b32 v[v_wei_flag], v[v_gtc_iy], v[v_wei_flag]
    v_and_b32 v[v_wei_flag+1], v[v_gtc_iy], v[v_wei_flag+1]
    v_and_b32 v[v_wei_flag+2], v[v_gtc_iy], v[v_wei_flag+2]
    v_and_b32 v[v_wei_flag+3], v[v_gtc_iy], v[v_wei_flag+3]
    v_and_b32 v[v_wei_flag+4], v[v_gtc_iy], v[v_wei_flag+4]
    v_and_b32 v[v_wei_flag+5], v[v_gtc_iy], v[v_wei_flag+5]
    v_and_b32 v[v_wei_flag+6], v[v_gtc_iy], v[v_wei_flag+6]
    v_and_b32 v[v_wei_flag+7], v[v_gtc_iy], v[v_wei_flag+7]
    v_add_u32 v[v_in_os], v[v_tmp+4], v[v_in_os]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 0, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list]
    v_cndmask_b32 v[v_in_flag], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
    v_add_u32 v[v_in_os+1], v[v_tmp+4], v[v_in_os+1]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 1, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+1]
    ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0
    v_mfma_f32_16x16x4bf16_1k v[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], v[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    v_cndmask_b32 v[v_in_flag+1], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+1]
    v_cndmask_b32 v[v_in_flag+1], 0, v[v_in_flag+1], vcc
    v_add_u32 v[v_in_os+2], v[v_tmp+4], v[v_in_os+2]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 2, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+2]
    v_cndmask_b32 v[v_in_flag+2], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+2]
    v_cndmask_b32 v[v_in_flag+2], 0, v[v_in_flag+2], vcc
    v_add_u32 v[v_in_os+3], v[v_tmp+4], v[v_in_os+3]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 3, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+3]
    v_cndmask_b32 v[v_in_flag+3], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+3]
    v_cndmask_b32 v[v_in_flag+3], 0, v[v_in_flag+3], vcc
    v_add_u32 v[v_in_os+4], v[v_tmp+4], v[v_in_os+4]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 4, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+4]
    v_cndmask_b32 v[v_in_flag+4], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+4]
    ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1
    ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1
    v_cndmask_b32 v[v_in_flag+4], 0, v[v_in_flag+4], vcc
    v_add_u32 v[v_in_os+5], v[v_tmp+4], v[v_in_os+5]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 5, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+5]
    v_cndmask_b32 v[v_in_flag+5], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+5]
    v_cndmask_b32 v[v_in_flag+5], 0, v[v_in_flag+5], vcc
    v_add_u32 v[v_in_os+6], v[v_tmp+4], v[v_in_os+6]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 6, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+6]
    v_cndmask_b32 v[v_in_flag+6], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+6]
    v_cndmask_b32 v[v_in_flag+6], 0, v[v_in_flag+6], vcc
    v_add_u32 v[v_in_os+7], v[v_tmp+4], v[v_in_os+7]
    v_bfe_u32 v[v_tmp+5], v[v_in_flag_n], 7, 1   ; extract flag_n
    v_and_b32 v[v_tmp+5], v[v_gtc_iy], v[v_tmp+5]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi_list+7]
    v_cndmask_b32 v[v_in_flag+7], 0, v[v_tmp+5], vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi_list+7]
    v_cndmask_b32 v[v_in_flag+7], 0, v[v_in_flag+7], vcc
    s_waitcnt lgkmcnt(0)
    s_barrier
    s_waitcnt vmcnt(8)
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+0]
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+1] offset:128
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+2] offset:256
    v_mfma_f32_16x16x4bf16_1k v[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], v[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+3] offset:384
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+4] offset:512
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+5] offset:640
    v_mfma_f32_16x16x4bf16_1k v[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], v[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+6] offset:768
    ds_write_b16 v[v_sst_b_os], v[v_gld_b+7] offset:896
    s_waitcnt vmcnt(0)
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+0]
    v_mfma_f32_16x16x4bf16_1k v[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], v[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+1] offset:128
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+2] offset:256
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+3] offset:384
    v_mfma_f32_16x16x4bf16_1k v[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], v[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+4] offset:512
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+5] offset:640
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+6] offset:768
    v_mfma_f32_16x16x4bf16_1k v[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], v[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    ds_write_b16 v[v_sst_a_os], v[v_gld_a+7] offset:896
    v_mfma_f32_16x16x4bf16_1k v[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], v[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16
    s_sub_i32 s[s_kitr], s[s_kitr], 16
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_mfma_finishing
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0
    v_mfma_f32_16x16x4bf16_1k v[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], v[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512
    v_mfma_f32_16x16x4bf16_1k v[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], v[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    s_branch L_igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_mfma_body
L_igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_mfma_finishing:
    v_mfma_f32_16x16x4bf16_1k v[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], v[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16

    v_mfma_f32_16x16x4bf16_1k v[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], v[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16

L_igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_mfma_end:
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:512
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:512
    ; k iteration : 0
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_16x16x4bf16_1k v[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], v[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0
    ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:1024 ; load i_k:1 into local buffer 1, repeat 0

    s_waitcnt lgkmcnt(3)
    v_mfma_f32_16x16x4bf16_1k v[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], v[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16
    ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1
    ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:1536 ; load i_k:1 into local buffer 1, repeat 1

    s_waitcnt lgkmcnt(4)
    v_mfma_f32_16x16x4bf16_1k v[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], v[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:2048 ; load i_k:2 into local buffer 0, repeat 0

    v_mfma_f32_16x16x4bf16_1k v[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], v[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1

    ; k iteration : 1
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4bf16_1k v[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], v[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:2560 ; load i_k:2 into local buffer 0, repeat 1

    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4bf16_1k v[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], v[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16
    ds_read_b64 v[v_a+4:v_a+4+1], v[v_sld_a_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0

    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4bf16_1k v[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], v[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16
    ds_read_b64 v[v_b+4:v_b+4+1], v[v_sld_b_os] offset:3072 ; load i_k:3 into local buffer 1, repeat 0

    v_mfma_f32_16x16x4bf16_1k v[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], v[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    ds_read_b64 v[v_b+6:v_b+6+1], v[v_sld_b_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1
    ds_read_b64 v[v_a+6:v_a+6+1], v[v_sld_a_os] offset:3584 ; load i_k:3 into local buffer 1, repeat 1

    ; k iteration : 14
    s_waitcnt lgkmcnt(6)
    v_mfma_f32_16x16x4bf16_1k v[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], v[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16

    s_waitcnt lgkmcnt(5)
    v_mfma_f32_16x16x4bf16_1k v[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], v[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16

    s_waitcnt lgkmcnt(4)
    v_mfma_f32_16x16x4bf16_1k v[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], v[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16

    v_mfma_f32_16x16x4bf16_1k v[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], v[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16
    ; k iteration : 15
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_16x16x4bf16_1k v[a_c+0:a_c+15], v[v_a+4:v_a+5], v[v_b+4:v_b+5], v[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16

    s_waitcnt lgkmcnt(1)
    v_mfma_f32_16x16x4bf16_1k v[a_c+16:a_c+31], v[v_a+4:v_a+5], v[v_b+6:v_b+7], v[a_c+16:a_c+31]     ; repeat:0x1, step:0x0, num_a_c:16

    s_waitcnt lgkmcnt(0)
    v_mfma_f32_16x16x4bf16_1k v[a_c+32:a_c+47], v[v_a+6:v_a+7], v[v_b+4:v_b+5], v[a_c+32:a_c+47]     ; repeat:1x0, step:0x0, num_a_c:16

    v_mfma_f32_16x16x4bf16_1k v[a_c+48:a_c+63], v[v_a+6:v_a+7], v[v_b+6:v_b+7], v[a_c+48:a_c+63]     ; repeat:1x1, step:0x0, num_a_c:16

    s_nop 9
    ; coalescing store, mapping:mt_m:128, mt_n:128, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:1, s_n:1 | 16x16x4, lanegroup_m_tcbw:4x4x1x2, lanegroup_n_tcbw:1x16x1x2
    ; coalescing_groups:4, num_dword_per_group:16
    ; init_co_sub_m_index xdlops, block_size:256, macro-tile:128x128 sub_m_index:[0, 4]
    ; g_mr:2, g_ms:1, g_mw:2, g_mb:1, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:1, l_mt:4 | n_mc:4, n_ml:1, n_mv:2
    ; nd_stride:[4, 1, 1, 2, 1, 2, 1]
    ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0
    s_barrier
    v_pack_b32_f16 v[v_c], v[a_c], v[a_c+1] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+1], v[a_c+2], v[a_c+3] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c:v_c+1]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_pack_b32_f16 v[v_c+4], v[a_c+4], v[a_c+5] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+5], v[a_c+6], v[a_c+7] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128   ; idword:16(0,16),  0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:1
    v_pack_b32_f16 v[v_c+8], v[a_c+16], v[a_c+17] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+9], v[a_c+18], v[a_c+19] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:512   ; idword:64(0,64),  0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_pack_b32_f16 v[v_c+12], v[a_c+20], v[a_c+21] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+13], v[a_c+22], v[a_c+23] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:640   ; idword:80(0,80),  0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:1
    s_mov_b32 s[s_tmp], 0   ; i_m:0(i_m0:0,i_m1:0)
    v_add_u32 v[v_out_inb], s[s_block_gtc_inb], v[v_co_sub_m_index]
    v_mov_b32 v[v_tmp], v[v_out_inb]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b64 v[v_c:v_c+1], v[v_co_sld] offset:0
    ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048
    ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096
    ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144
    v_cmpx_eq_u32 vcc, 1, v[v_out_flag]
    ;   store to global, m index start from 0, m0:0, m1:0
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b32 s[s_tmp], s[s_out_stride_wo]   ; i_m:1(i_m0:0,i_m1:1)
    v_add_u32 v[v_tmp], 1, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 2, s[s_out_stride_wo]   ; i_m:2(i_m0:0,i_m1:2)
    v_add_u32 v[v_tmp], 2, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 3, s[s_out_stride_wo]   ; i_m:3(i_m0:0,i_m1:3)
    v_add_u32 v[v_tmp], 3, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 8, s[s_out_stride_wo]   ; i_m:8(i_m0:0,i_m1:8)
    v_add_u32 v[v_tmp], 8, v[v_out_inb]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 9, s[s_out_stride_wo]   ; i_m:9(i_m0:0,i_m1:9)
    v_add_u32 v[v_tmp], 9, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 10, s[s_out_stride_wo]   ; i_m:10(i_m0:0,i_m1:10)
    v_add_u32 v[v_tmp], 10, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 11, s[s_out_stride_wo]   ; i_m:11(i_m0:0,i_m1:11)
    v_add_u32 v[v_tmp], 11, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 32, s[s_out_stride_wo]   ; i_m:32(i_m0:2,i_m1:0)
    v_add_u32 v[v_tmp], 32, v[v_out_inb]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 33, s[s_out_stride_wo]   ; i_m:33(i_m0:2,i_m1:1)
    v_add_u32 v[v_tmp], 33, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 34, s[s_out_stride_wo]   ; i_m:34(i_m0:2,i_m1:2)
    v_add_u32 v[v_tmp], 34, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 35, s[s_out_stride_wo]   ; i_m:35(i_m0:2,i_m1:3)
    v_add_u32 v[v_tmp], 35, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 40, s[s_out_stride_wo]   ; i_m:40(i_m0:2,i_m1:8)
    v_add_u32 v[v_tmp], 40, v[v_out_inb]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 41, s[s_out_stride_wo]   ; i_m:41(i_m0:2,i_m1:9)
    v_add_u32 v[v_tmp], 41, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 42, s[s_out_stride_wo]   ; i_m:42(i_m0:2,i_m1:10)
    v_add_u32 v[v_tmp], 42, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 43, s[s_out_stride_wo]   ; i_m:43(i_m0:2,i_m1:11)
    v_add_u32 v[v_tmp], 43, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1
    ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 16
    s_barrier
    v_pack_b32_f16 v[v_c], v[a_c+8], v[a_c+9] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+1], v[a_c+10], v[a_c+11] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c:v_c+1]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_pack_b32_f16 v[v_c+4], v[a_c+12], v[a_c+13] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+5], v[a_c+14], v[a_c+15] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128   ; idword:16(0,16),  0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:1
    v_pack_b32_f16 v[v_c+8], v[a_c+24], v[a_c+25] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+9], v[a_c+26], v[a_c+27] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:512   ; idword:64(0,64),  0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_pack_b32_f16 v[v_c+12], v[a_c+28], v[a_c+29] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+13], v[a_c+30], v[a_c+31] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:640   ; idword:80(0,80),  0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:1
    s_mul_i32 s[s_tmp], 16, s[s_out_stride_wo]   ; i_m:16(i_m0:1,i_m1:0)
    v_add_u32 v[v_tmp], 16, v[v_out_inb]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b64 v[v_c:v_c+1], v[v_co_sld] offset:0
    ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048
    ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096
    ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144
    v_cmpx_eq_u32 vcc, 1, v[v_out_flag]
    ;   store to global, m index start from 16, m0:1, m1:0
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 17, s[s_out_stride_wo]   ; i_m:17(i_m0:1,i_m1:1)
    v_add_u32 v[v_tmp], 17, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 18, s[s_out_stride_wo]   ; i_m:18(i_m0:1,i_m1:2)
    v_add_u32 v[v_tmp], 18, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 19, s[s_out_stride_wo]   ; i_m:19(i_m0:1,i_m1:3)
    v_add_u32 v[v_tmp], 19, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 24, s[s_out_stride_wo]   ; i_m:24(i_m0:1,i_m1:8)
    v_add_u32 v[v_tmp], 24, v[v_out_inb]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 25, s[s_out_stride_wo]   ; i_m:25(i_m0:1,i_m1:9)
    v_add_u32 v[v_tmp], 25, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 26, s[s_out_stride_wo]   ; i_m:26(i_m0:1,i_m1:10)
    v_add_u32 v[v_tmp], 26, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 27, s[s_out_stride_wo]   ; i_m:27(i_m0:1,i_m1:11)
    v_add_u32 v[v_tmp], 27, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 48, s[s_out_stride_wo]   ; i_m:48(i_m0:3,i_m1:0)
    v_add_u32 v[v_tmp], 48, v[v_out_inb]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 49, s[s_out_stride_wo]   ; i_m:49(i_m0:3,i_m1:1)
    v_add_u32 v[v_tmp], 49, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 50, s[s_out_stride_wo]   ; i_m:50(i_m0:3,i_m1:2)
    v_add_u32 v[v_tmp], 50, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 51, s[s_out_stride_wo]   ; i_m:51(i_m0:3,i_m1:3)
    v_add_u32 v[v_tmp], 51, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 56, s[s_out_stride_wo]   ; i_m:56(i_m0:3,i_m1:8)
    v_add_u32 v[v_tmp], 56, v[v_out_inb]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 57, s[s_out_stride_wo]   ; i_m:57(i_m0:3,i_m1:9)
    v_add_u32 v[v_tmp], 57, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 58, s[s_out_stride_wo]   ; i_m:58(i_m0:3,i_m1:10)
    v_add_u32 v[v_tmp], 58, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 59, s[s_out_stride_wo]   ; i_m:59(i_m0:3,i_m1:11)
    v_add_u32 v[v_tmp], 59, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1
    ; start group 2, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 64
    s_barrier
    v_pack_b32_f16 v[v_c], v[a_c+32], v[a_c+33] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+1], v[a_c+34], v[a_c+35] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c:v_c+1]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_pack_b32_f16 v[v_c+4], v[a_c+36], v[a_c+37] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+5], v[a_c+38], v[a_c+39] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128   ; idword:16(0,16),  0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:1
    v_pack_b32_f16 v[v_c+8], v[a_c+48], v[a_c+49] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+9], v[a_c+50], v[a_c+51] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:512   ; idword:64(0,64),  0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_pack_b32_f16 v[v_c+12], v[a_c+52], v[a_c+53] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+13], v[a_c+54], v[a_c+55] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:640   ; idword:80(0,80),  0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:1
    s_mul_i32 s[s_tmp], 64, s[s_out_stride_wo]   ; i_m:64(i_m0:4,i_m1:0)
    v_add_u32 v[v_tmp], 64, v[v_out_inb]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b64 v[v_c:v_c+1], v[v_co_sld] offset:0
    ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048
    ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096
    ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144
    v_cmpx_eq_u32 vcc, 1, v[v_out_flag]
    ;   store to global, m index start from 64, m0:4, m1:0
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 65, s[s_out_stride_wo]   ; i_m:65(i_m0:4,i_m1:1)
    v_add_u32 v[v_tmp], 65, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 66, s[s_out_stride_wo]   ; i_m:66(i_m0:4,i_m1:2)
    v_add_u32 v[v_tmp], 66, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 67, s[s_out_stride_wo]   ; i_m:67(i_m0:4,i_m1:3)
    v_add_u32 v[v_tmp], 67, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 72, s[s_out_stride_wo]   ; i_m:72(i_m0:4,i_m1:8)
    v_add_u32 v[v_tmp], 72, v[v_out_inb]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 73, s[s_out_stride_wo]   ; i_m:73(i_m0:4,i_m1:9)
    v_add_u32 v[v_tmp], 73, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 74, s[s_out_stride_wo]   ; i_m:74(i_m0:4,i_m1:10)
    v_add_u32 v[v_tmp], 74, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 75, s[s_out_stride_wo]   ; i_m:75(i_m0:4,i_m1:11)
    v_add_u32 v[v_tmp], 75, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 96, s[s_out_stride_wo]   ; i_m:96(i_m0:6,i_m1:0)
    v_add_u32 v[v_tmp], 96, v[v_out_inb]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 97, s[s_out_stride_wo]   ; i_m:97(i_m0:6,i_m1:1)
    v_add_u32 v[v_tmp], 97, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 98, s[s_out_stride_wo]   ; i_m:98(i_m0:6,i_m1:2)
    v_add_u32 v[v_tmp], 98, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 99, s[s_out_stride_wo]   ; i_m:99(i_m0:6,i_m1:3)
    v_add_u32 v[v_tmp], 99, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 104, s[s_out_stride_wo]   ; i_m:104(i_m0:6,i_m1:8)
    v_add_u32 v[v_tmp], 104, v[v_out_inb]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 105, s[s_out_stride_wo]   ; i_m:105(i_m0:6,i_m1:9)
    v_add_u32 v[v_tmp], 105, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 106, s[s_out_stride_wo]   ; i_m:106(i_m0:6,i_m1:10)
    v_add_u32 v[v_tmp], 106, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 107, s[s_out_stride_wo]   ; i_m:107(i_m0:6,i_m1:11)
    v_add_u32 v[v_tmp], 107, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1
    ; start group 3, i_g_mr:1, i_g_ms:0, i_g_mw:1, i_g_mb:0, i_g_mt:0, m index start from 80
    s_barrier
    v_pack_b32_f16 v[v_c], v[a_c+40], v[a_c+41] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+1], v[a_c+42], v[a_c+43] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c:v_c+1]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_pack_b32_f16 v[v_c+4], v[a_c+44], v[a_c+45] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+5], v[a_c+46], v[a_c+47] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c+4:v_c+4+1] offset:128   ; idword:16(0,16),  0x16 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:1
    v_pack_b32_f16 v[v_c+8], v[a_c+56], v[a_c+57] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+9], v[a_c+58], v[a_c+59] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c+8:v_c+8+1] offset:512   ; idword:64(0,64),  0x64 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_pack_b32_f16 v[v_c+12], v[a_c+60], v[a_c+61] op_sel:[1,1]
    v_pack_b32_f16 v[v_c+13], v[a_c+62], v[a_c+63] op_sel:[1,1]
    ds_write_b64 v[v_co_sst], v[v_c+12:v_c+12+1] offset:640   ; idword:80(0,80),  0x80 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:1
    s_mul_i32 s[s_tmp], 80, s[s_out_stride_wo]   ; i_m:80(i_m0:5,i_m1:0)
    v_add_u32 v[v_tmp], 80, v[v_out_inb]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b64 v[v_c:v_c+1], v[v_co_sld] offset:0
    ds_read_b64 v[v_c+2:v_c+2+1], v[v_co_sld] offset:2048
    ds_read_b64 v[v_c+4:v_c+4+1], v[v_co_sld] offset:4096
    ds_read_b64 v[v_c+6:v_c+6+1], v[v_co_sld] offset:6144
    v_cmpx_eq_u32 vcc, 1, v[v_out_flag]
    ;   store to global, m index start from 80, m0:5, m1:0
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 81, s[s_out_stride_wo]   ; i_m:81(i_m0:5,i_m1:1)
    v_add_u32 v[v_tmp], 81, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 82, s[s_out_stride_wo]   ; i_m:82(i_m0:5,i_m1:2)
    v_add_u32 v[v_tmp], 82, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 83, s[s_out_stride_wo]   ; i_m:83(i_m0:5,i_m1:3)
    v_add_u32 v[v_tmp], 83, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+1], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 88, s[s_out_stride_wo]   ; i_m:88(i_m0:5,i_m1:8)
    v_add_u32 v[v_tmp], 88, v[v_out_inb]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 89, s[s_out_stride_wo]   ; i_m:89(i_m0:5,i_m1:9)
    v_add_u32 v[v_tmp], 89, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+2], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 90, s[s_out_stride_wo]   ; i_m:90(i_m0:5,i_m1:10)
    v_add_u32 v[v_tmp], 90, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 91, s[s_out_stride_wo]   ; i_m:91(i_m0:5,i_m1:11)
    v_add_u32 v[v_tmp], 91, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 112, s[s_out_stride_wo]   ; i_m:112(i_m0:7,i_m1:0)
    v_add_u32 v[v_tmp], 112, v[v_out_inb]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 113, s[s_out_stride_wo]   ; i_m:113(i_m0:7,i_m1:1)
    v_add_u32 v[v_tmp], 113, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+4], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 114, s[s_out_stride_wo]   ; i_m:114(i_m0:7,i_m1:2)
    v_add_u32 v[v_tmp], 114, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 115, s[s_out_stride_wo]   ; i_m:115(i_m0:7,i_m1:3)
    v_add_u32 v[v_tmp], 115, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+5], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 120, s[s_out_stride_wo]   ; i_m:120(i_m0:7,i_m1:8)
    v_add_u32 v[v_tmp], 120, v[v_out_inb]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 121, s[s_out_stride_wo]   ; i_m:121(i_m0:7,i_m1:9)
    v_add_u32 v[v_tmp], 121, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+6], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 122, s[s_out_stride_wo]   ; i_m:122(i_m0:7,i_m1:10)
    v_add_u32 v[v_tmp], 122, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_m v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 123, s[s_out_stride_wo]   ; i_m:123(i_m0:7,i_m1:11)
    v_add_u32 v[v_tmp], 123, v[v_out_inb]
    v_cmp_gt_u32 vcc, s[s_dim_mr], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_store_short_d16_hi_m v[v_c+7], v[v_out_os], s[s_p_out:s_p_out+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1
L_igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me_out:
    s_endpgm
.rodata
.p2align 6
.amdhsa_kernel igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me
    .amdhsa_group_segment_fixed_size 8192
    .amdhsa_user_sgpr_kernarg_segment_ptr 1
    .amdhsa_system_sgpr_workgroup_id_x 1
    .amdhsa_system_sgpr_workgroup_id_y 1
    .amdhsa_system_vgpr_workitem_id 0
    .amdhsa_next_free_vgpr 164
    .amdhsa_next_free_sgpr 66
    .amdhsa_ieee_mode 1
    .amdhsa_dx10_clamp 1
    .amdhsa_float_round_mode_32 3
    .amdhsa_float_round_mode_16_64 3
    .amdhsa_tg_split 0
    .amdhsa_accum_offset 100
.end_amdhsa_kernel

.amdgpu_metadata
---
amdhsa.version: [ 1, 0 ]
amdhsa.kernels:
  - .name: igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me
    .symbol: igemm_fwd_gtcx35_nhwc_bf16_bx0_ex1_bt128x128x16_wt32x32x4_ws1x1_wr2x2_ta1x1x8x1_1x16x1x16_tb1x1x8x1_1x16x1x16_me.kd
    .sgpr_count: 72
    .vgpr_count: 164
    .kernarg_segment_align: 8
    .kernarg_segment_size: 128
    .group_segment_fixed_size: 8192
    .private_segment_fixed_size: 0
    .wavefront_size: 64
    .reqd_workgroup_size : [256, 1, 1]
    .max_flat_workgroup_size: 256
    .args:
    - { .name: p_in_     , .size: 8, .offset:   0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_wei_    , .size: 8, .offset:   8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_out_    , .size: 8, .offset:  16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
    - { .name: hi_       , .size: 4, .offset:  24, .value_kind: by_value, .value_type: i32}
    - { .name: wi_       , .size: 4, .offset:  28, .value_kind: by_value, .value_type: i32}
    - { .name: n_        , .size: 4, .offset:  32, .value_kind: by_value, .value_type: i32}
    - { .name: k_        , .size: 4, .offset:  36, .value_kind: by_value, .value_type: i32}
    - { .name: c_        , .size: 4, .offset:  40, .value_kind: by_value, .value_type: i32}
    - { .name: ho_       , .size: 4, .offset:  44, .value_kind: by_value, .value_type: i32}
    - { .name: wo_       , .size: 4, .offset:  48, .value_kind: by_value, .value_type: i32}
    - { .name: stride_h_ , .size: 4, .offset:  52, .value_kind: by_value, .value_type: i32}
    - { .name: stride_w_ , .size: 4, .offset:  56, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_h_, .size: 4, .offset:  60, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_w_, .size: 4, .offset:  64, .value_kind: by_value, .value_type: i32}
    - { .name: pad_h_    , .size: 4, .offset:  68, .value_kind: by_value, .value_type: i32}
    - { .name: pad_w_    , .size: 4, .offset:  72, .value_kind: by_value, .value_type: i32}
    - { .name: y_        , .size: 4, .offset:  76, .value_kind: by_value, .value_type: i32}
    - { .name: x_        , .size: 4, .offset:  80, .value_kind: by_value, .value_type: i32}
    - { .name: group_    , .size: 4, .offset:  84, .value_kind: by_value, .value_type: i32}
    - { .name: magic_0_  , .size: 4, .offset:  88, .value_kind: by_value, .value_type: i32}
    - { .name: magic_1_  , .size: 4, .offset:  92, .value_kind: by_value, .value_type: i32}
    - { .name: magic_2_  , .size: 4, .offset:  96, .value_kind: by_value, .value_type: i32}
    - { .name: magic_3_  , .size: 4, .offset: 100, .value_kind: by_value, .value_type: i32}
    - { .name: magic_4_  , .size: 4, .offset: 104, .value_kind: by_value, .value_type: i32}
    - { .name: magic_5_  , .size: 4, .offset: 108, .value_kind: by_value, .value_type: i32}
    - { .name: shift_pack_0_, .size: 4, .offset: 112, .value_kind: by_value, .value_type: i32}
    - { .name: shift_pack_1_, .size: 4, .offset: 116, .value_kind: by_value, .value_type: i32}
    - { .name: gemm_k_split_, .size: 4, .offset: 120, .value_kind: by_value, .value_type: i32}
    - { .name: __pack_0_ , .size: 4, .offset: 124, .value_kind: by_value, .value_type: i32}
...
.end_amdgpu_metadata
