; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
; RUN: llc -march=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck --check-prefix=GFX940 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s

; Every check prefix used below must be supplied by one of the llc invocations
; above; a prefix that no invocation passes to FileCheck is silently ignored.

; Kernel entry point that zero-fills a 64-byte [32 x i16] alloca in
; addrspace(5) with llvm.memset. The expected output is the per-target
; flat-scratch setup (where one is emitted) followed by four dwordx4 scratch
; stores.
define amdgpu_kernel void @zero_init_kernel() {
; GFX9-LABEL: zero_init_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: zero_init_kernel:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_kernel:
; GFX1010-PAL:       ; %bb.0:
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_kernel:
; GFX1030-PAL:       ; %bb.0:
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX1030-PAL-NEXT:    s_endpgm
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Same 64-byte zero-fill as zero_init_kernel, but in a callable (non-kernel)
; function: the expected stores are addressed relative to s32 and no
; flat-scratch setup is emitted.
define void @zero_init_foo() {
; GFX9-LABEL: zero_init_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Kernel taking the index as an i32 argument: volatile store of 15 to element
; %idx of a [32 x float] addrspace(5) alloca, then a volatile load from element
; (%idx & 15). The expected addresses are computed with scalar instructions.
define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    s_add_i32 s1, s1, 4
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Same indexed volatile store/load as store_load_sindex_kernel, but in an
; amdgpu_ps function that receives the index as an inreg argument.
define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Kernel whose index comes from llvm.amdgcn.workitem.id.x: volatile store of 15
; to element id of a [32 x float] addrspace(5) alloca, then a volatile load
; from element (31 - id). The expected addresses are computed in vector
; registers.
define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_add_u32_e32 v2, v1, v0
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v1, 4
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
; GFX10-NEXT:    scratch_store_dword v2, v3, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-PAL-NEXT:    v_add_u32_e32 v2, v1, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, v1, v0
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 4
; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Callable function taking the index as an ordinary i32 argument: volatile
; store of 15 to element %idx, then a volatile load from element (%idx & 15).
; The expected addresses combine the index in v0 with s32.
define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, s32
; GFX10-NEXT:    v_and_b32_e32 v2, 15, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX10-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
; GFX10-NEXT:    scratch_store_dword v0, v3, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s32
; GFX10-PAL-NEXT:    v_and_b32_e32 v2, 15, v0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v2, 2, v1
; GFX10-PAL-NEXT:    scratch_store_dword v0, v3, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Stores the float constant 10.0 (bit pattern 0x41200000) to element 1 of an
; incoming addrspace(5) pointer. Every target is expected to fold the 4-byte
; element offset into the scratch store's immediate offset.
define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: private_ptr_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: private_ptr_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: private_ptr_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  store float 1.000000e+01, float addrspace(5)* %gep, align 4
  ret void
}

define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-LABEL: zero_init_small_offset_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_small_offset_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
835; GFX9-PAL-NEXT: s_mov_b32 s2, s0 836; GFX9-PAL-NEXT: s_mov_b32 s3, s0 837; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 838; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 839; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 840; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 841; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 842; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 843; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 844; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 845; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 846; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 847; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 848; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 849; GFX9-PAL-NEXT: s_endpgm 850; 851; GFX940-LABEL: zero_init_small_offset_kernel: 852; GFX940: ; %bb.0: 853; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 854; GFX940-NEXT: s_waitcnt vmcnt(0) 855; GFX940-NEXT: s_mov_b32 s0, 0 856; GFX940-NEXT: s_mov_b32 s1, s0 857; GFX940-NEXT: s_mov_b32 s2, s0 858; GFX940-NEXT: s_mov_b32 s3, s0 859; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 860; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 861; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 862; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 863; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 864; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 865; GFX940-NEXT: s_endpgm 866; 867; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: 868; GFX1010-PAL: ; %bb.0: 869; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 870; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 871; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 872; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 873; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 874; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 875; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 876; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 877; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 878; GFX1010-PAL-NEXT: 
s_mov_b32 vcc_lo, 0 879; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 880; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 881; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 882; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 883; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 884; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 885; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 886; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 887; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 888; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 889; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 890; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272 891; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 892; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 893; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288 894; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 895; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 896; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304 897; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 898; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 899; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320 900; GFX1010-PAL-NEXT: s_endpgm 901; 902; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: 903; GFX1030-PAL: ; %bb.0: 904; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 905; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 906; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 907; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 908; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 909; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 910; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 911; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 912; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 913; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 914; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 915; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 916; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 917; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 918; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 919; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 920; GFX1030-PAL-NEXT: 
v_mov_b32_e32 v1, s1 921; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 922; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 923; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 924; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 925; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 926; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 927; GFX1030-PAL-NEXT: s_endpgm 928 %padding = alloca [64 x i32], align 4, addrspace(5) 929 %alloca = alloca [32 x i16], align 2, addrspace(5) 930 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 931 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 932 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 933 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 934 ret void 935} 936

; Non-kernel variant of the small-offset zero-init test. A 256-byte padding
; array ([64 x i32]) is read with a volatile load, then a 64-byte [32 x i16]
; array is zeroed through llvm.memset. Every checked target is expected to emit
; four scratch_store_dwordx4 instructions at s32 offsets 256, 272, 288 and 304.
define void @zero_init_small_offset_foo() {
; GFX9-LABEL: zero_init_small_offset_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_small_offset_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_small_offset_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_small_offset_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, off, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_small_offset_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; The padding access keeps the first 256 bytes of the frame occupied.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

1062 1063define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { 1064; GFX9-LABEL: store_load_sindex_small_offset_kernel: 1065; GFX9: ; %bb.0: ; %bb 1066; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 1067; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 1068; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1069; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1070; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1071; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1072; GFX9-NEXT: s_lshl_b32 s1, s0, 2 1073; GFX9-NEXT: s_and_b32 s0, s0, 15 1074; GFX9-NEXT: v_mov_b32_e32 v0, 15 1075; GFX9-NEXT: s_addk_i32 s1, 0x104 1076; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1077; GFX9-NEXT: scratch_store_dword off, v0, s1 1078; GFX9-NEXT: s_waitcnt vmcnt(0) 1079; GFX9-NEXT: s_addk_i32 s0, 0x104 1080; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 1081; GFX9-NEXT: s_waitcnt vmcnt(0) 1082; GFX9-NEXT: s_endpgm 1083; 1084; GFX10-LABEL: store_load_sindex_small_offset_kernel: 1085; GFX10: ; %bb.0: ; %bb 1086; GFX10-NEXT: s_add_u32 s2, s2, s5 1087; GFX10-NEXT: s_addc_u32 s3, s3, 0 1088; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1089; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1090; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 1091; GFX10-NEXT: scratch_load_dword v0, off, off 
offset:4 glc dlc 1092; GFX10-NEXT: s_waitcnt vmcnt(0) 1093; GFX10-NEXT: v_mov_b32_e32 v0, 15 1094; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1095; GFX10-NEXT: s_and_b32 s1, s0, 15 1096; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1097; GFX10-NEXT: s_lshl_b32 s1, s1, 2 1098; GFX10-NEXT: s_addk_i32 s0, 0x104 1099; GFX10-NEXT: s_addk_i32 s1, 0x104 1100; GFX10-NEXT: scratch_store_dword off, v0, s0 1101; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1102; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 1103; GFX10-NEXT: s_waitcnt vmcnt(0) 1104; GFX10-NEXT: s_endpgm 1105; 1106; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: 1107; GFX9-PAL: ; %bb.0: ; %bb 1108; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 1109; GFX9-PAL-NEXT: s_mov_b32 s4, s0 1110; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1111; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1112; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 1113; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1114; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1115; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 1116; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 1117; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1118; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1119; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1120; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1121; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1122; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 1123; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1124; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1125; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1126; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 1127; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1128; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1129; GFX9-PAL-NEXT: s_endpgm 1130; 1131; GFX940-LABEL: store_load_sindex_small_offset_kernel: 1132; GFX940: ; %bb.0: ; %bb 1133; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 1134; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 1135; GFX940-NEXT: s_waitcnt vmcnt(0) 1136; GFX940-NEXT: v_mov_b32_e32 v0, 15 1137; GFX940-NEXT: s_waitcnt lgkmcnt(0) 1138; GFX940-NEXT: 
s_lshl_b32 s1, s0, 2 1139; GFX940-NEXT: s_and_b32 s0, s0, 15 1140; GFX940-NEXT: s_addk_i32 s1, 0x104 1141; GFX940-NEXT: s_lshl_b32 s0, s0, 2 1142; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 1143; GFX940-NEXT: s_waitcnt vmcnt(0) 1144; GFX940-NEXT: s_addk_i32 s0, 0x104 1145; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 1146; GFX940-NEXT: s_waitcnt vmcnt(0) 1147; GFX940-NEXT: s_endpgm 1148; 1149; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: 1150; GFX1010-PAL: ; %bb.0: ; %bb 1151; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] 1152; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 1153; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1154; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1155; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1156; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 1157; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 1158; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 1159; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 1160; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 1161; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1162; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 1163; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1164; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 1165; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1166; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 1167; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 1168; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 1169; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 1170; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 1171; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 1172; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1173; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1174; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1175; GFX1010-PAL-NEXT: s_endpgm 1176; 1177; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: 1178; GFX1030-PAL: ; %bb.0: ; %bb 1179; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] 1180; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 1181; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1182; 
GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1183; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1184; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 1185; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 1186; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 1187; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 1188; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 1189; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1190; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1191; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 1192; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1193; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 1194; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 1195; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 1196; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 1197; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104 1198; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 1199; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1200; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1201; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1202; GFX1030-PAL-NEXT: s_endpgm 1203bb: 1204 %padding = alloca [64 x i32], align 4, addrspace(5) 1205 %i = alloca [32 x float], align 4, addrspace(5) 1206 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1207 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1208 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1209 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1210 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1211 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1212 %i9 = and i32 %idx, 15 1213 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1214 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1215 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1216 ret void 1217} 1218
; amdgpu_ps variant of the scalar-index small-offset test: %idx is passed as an
; inreg argument. After a volatile load from the 256-byte padding array, the
; value 15 is stored (volatile) to element %idx of a [32 x float] array and
; element (%idx & 15) is loaded back (volatile). The expected code adds 0x104
; to each scaled index to form the scratch address.
; NOTE(review): %i1 in the body has no visible use -- confirm before removing.
1219define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { 1220; GFX9-LABEL: 
store_load_sindex_small_offset_foo: 1221; GFX9: ; %bb.0: ; %bb 1222; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1223; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1224; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1225; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1226; GFX9-NEXT: s_waitcnt vmcnt(0) 1227; GFX9-NEXT: s_lshl_b32 s0, s2, 2 1228; GFX9-NEXT: s_addk_i32 s0, 0x104 1229; GFX9-NEXT: v_mov_b32_e32 v0, 15 1230; GFX9-NEXT: scratch_store_dword off, v0, s0 1231; GFX9-NEXT: s_waitcnt vmcnt(0) 1232; GFX9-NEXT: s_and_b32 s0, s2, 15 1233; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1234; GFX9-NEXT: s_addk_i32 s0, 0x104 1235; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 1236; GFX9-NEXT: s_waitcnt vmcnt(0) 1237; GFX9-NEXT: s_endpgm 1238; 1239; GFX10-LABEL: store_load_sindex_small_offset_foo: 1240; GFX10: ; %bb.0: ; %bb 1241; GFX10-NEXT: s_add_u32 s0, s0, s3 1242; GFX10-NEXT: s_addc_u32 s1, s1, 0 1243; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1244; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1245; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1246; GFX10-NEXT: s_waitcnt vmcnt(0) 1247; GFX10-NEXT: v_mov_b32_e32 v0, 15 1248; GFX10-NEXT: s_and_b32 s0, s2, 15 1249; GFX10-NEXT: s_lshl_b32 s1, s2, 2 1250; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1251; GFX10-NEXT: s_addk_i32 s1, 0x104 1252; GFX10-NEXT: s_addk_i32 s0, 0x104 1253; GFX10-NEXT: scratch_store_dword off, v0, s1 1254; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1255; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 1256; GFX10-NEXT: s_waitcnt vmcnt(0) 1257; GFX10-NEXT: s_endpgm 1258; 1259; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo: 1260; GFX9-PAL: ; %bb.0: ; %bb 1261; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1262; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1263; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1264; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1265; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1266; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1267; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1268; 
GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1269; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1270; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1271; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1272; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1273; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 1274; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1275; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1276; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1277; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1278; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 1279; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1280; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1281; GFX9-PAL-NEXT: s_endpgm 1282; 1283; GFX940-LABEL: store_load_sindex_small_offset_foo: 1284; GFX940: ; %bb.0: ; %bb 1285; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 1286; GFX940-NEXT: s_waitcnt vmcnt(0) 1287; GFX940-NEXT: s_lshl_b32 s1, s0, 2 1288; GFX940-NEXT: s_and_b32 s0, s0, 15 1289; GFX940-NEXT: s_addk_i32 s1, 0x104 1290; GFX940-NEXT: v_mov_b32_e32 v0, 15 1291; GFX940-NEXT: s_lshl_b32 s0, s0, 2 1292; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 1293; GFX940-NEXT: s_waitcnt vmcnt(0) 1294; GFX940-NEXT: s_addk_i32 s0, 0x104 1295; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 1296; GFX940-NEXT: s_waitcnt vmcnt(0) 1297; GFX940-NEXT: s_endpgm 1298; 1299; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo: 1300; GFX1010-PAL: ; %bb.0: ; %bb 1301; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1302; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1303; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1304; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1305; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1306; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1307; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1308; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1309; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1310; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1311; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 1312; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo 
offset:4 glc dlc 1313; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1314; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 1315; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 1316; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 1317; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 1318; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 1319; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 1320; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1321; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1322; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1323; GFX1010-PAL-NEXT: s_endpgm 1324; 1325; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo: 1326; GFX1030-PAL: ; %bb.0: ; %bb 1327; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1328; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1329; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1330; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1331; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1332; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1333; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1334; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1335; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1336; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1337; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1338; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 1339; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 1340; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 1341; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 1342; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104 1343; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104 1344; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 1345; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1346; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1347; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1348; GFX1030-PAL-NEXT: s_endpgm 1349bb: 1350 %padding = alloca [64 x i32], align 4, addrspace(5) 1351 %i = alloca [32 x float], align 4, addrspace(5) 1352 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1353 %pad_load = load volatile i32, i32 addrspace(5)* 
%pad_gep, align 4 1354 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1355 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 1356 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1357 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1358 %i9 = and i32 %idx, 15 1359 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1360 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1361 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1362 ret void 1363} 1364 1365define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { 1366; GFX9-LABEL: store_load_vindex_small_offset_kernel: 1367; GFX9: ; %bb.0: ; %bb 1368; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1369; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1370; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1371; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1372; GFX9-NEXT: s_waitcnt vmcnt(0) 1373; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1374; GFX9-NEXT: v_mov_b32_e32 v1, 0x104 1375; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 1376; GFX9-NEXT: v_mov_b32_e32 v3, 15 1377; GFX9-NEXT: scratch_store_dword v2, v3, off 1378; GFX9-NEXT: s_waitcnt vmcnt(0) 1379; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 1380; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1381; GFX9-NEXT: s_waitcnt vmcnt(0) 1382; GFX9-NEXT: s_endpgm 1383; 1384; GFX10-LABEL: store_load_vindex_small_offset_kernel: 1385; GFX10: ; %bb.0: ; %bb 1386; GFX10-NEXT: s_add_u32 s0, s0, s3 1387; GFX10-NEXT: s_addc_u32 s1, s1, 0 1388; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1389; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1390; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 1391; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1392; GFX10-NEXT: v_mov_b32_e32 v3, 15 1393; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 1394; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1395; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc 1396; GFX10-NEXT: 
s_waitcnt vmcnt(0) 1397; GFX10-NEXT: scratch_store_dword v2, v3, off 1398; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1399; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1400; GFX10-NEXT: s_waitcnt vmcnt(0) 1401; GFX10-NEXT: s_endpgm 1402; 1403; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel: 1404; GFX9-PAL: ; %bb.0: ; %bb 1405; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1406; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1407; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1408; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1409; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1410; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1411; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1412; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1413; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1414; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1415; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1416; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1417; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x104 1418; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 1419; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1420; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1421; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 1422; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1423; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1424; GFX9-PAL-NEXT: s_endpgm 1425; 1426; GFX940-LABEL: store_load_vindex_small_offset_kernel: 1427; GFX940: ; %bb.0: ; %bb 1428; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 1429; GFX940-NEXT: s_waitcnt vmcnt(0) 1430; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1431; GFX940-NEXT: v_mov_b32_e32 v1, 15 1432; GFX940-NEXT: scratch_store_dword v0, v1, off offset:260 sc0 sc1 1433; GFX940-NEXT: s_waitcnt vmcnt(0) 1434; GFX940-NEXT: v_sub_u32_e32 v0, 0x104, v0 1435; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 1436; GFX940-NEXT: s_waitcnt vmcnt(0) 1437; GFX940-NEXT: s_endpgm 1438; 1439; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: 1440; GFX1010-PAL: ; %bb.0: ; %bb 1441; GFX1010-PAL-NEXT: s_getpc_b64 
s[2:3] 1442; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1443; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1444; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1445; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1446; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1447; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1448; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1449; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1450; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x104 1451; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1452; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15 1453; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1454; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 1455; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1456; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc 1457; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1458; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off 1459; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1460; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1461; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1462; GFX1010-PAL-NEXT: s_endpgm 1463; 1464; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel: 1465; GFX1030-PAL: ; %bb.0: ; %bb 1466; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1467; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1468; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1469; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1470; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1471; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1472; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1473; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1474; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1475; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x104 1476; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1477; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15 1478; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 1479; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 1480; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc 1481; 
GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Callable-function variant of the small-offset vector-index test: the index
; is the function argument %idx. A [64 x i32] padding alloca (read once with a
; volatile load) precedes the [32 x float] array, so the array is addressed at
; a small non-zero offset from the stack pointer (s32 + 0x100 / offset:256 in
; the checks below). The body stores 15 (volatile) to element %idx and then
; loads (volatile) element (%idx & 15).
; Only prefixes enabled by a RUN line belong here; the assertions are
; generated by utils/update_llc_test_checks.py.
define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_small_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: scratch_load_dword v1, off, s32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100
; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi
; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
; GFX9-NEXT: scratch_store_dword v2, v3, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_small_offset_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-NEXT: v_and_b32_e32 v2, 15, v0
; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1
; GFX10-NEXT: scratch_load_dword v2, off, s32 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v0, v3, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x100
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi
; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_small_offset_foo:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT: v_mov_b32_e32 v2, 15
; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
; GFX940-NEXT: scratch_store_dword v1, v2, s32 offset:256 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
; GFX10-PAL: ; %bb.0: ; %bb
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1
; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX9-LABEL: zero_init_large_offset_kernel:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_mov_b32 s1, s0
; GFX9-NEXT: s_mov_b32 s2, s0
; GFX9-NEXT: s_mov_b32 s3, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zero_init_large_offset_kernel:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:
scratch_load_dword v0, off, off offset:16 glc dlc 1653; GFX10-NEXT: s_waitcnt vmcnt(0) 1654; GFX10-NEXT: s_mov_b32 s0, 0 1655; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1656; GFX10-NEXT: s_mov_b32 s1, s0 1657; GFX10-NEXT: s_mov_b32 s2, s0 1658; GFX10-NEXT: s_mov_b32 s3, s0 1659; GFX10-NEXT: v_mov_b32_e32 v0, s0 1660; GFX10-NEXT: v_mov_b32_e32 v1, s1 1661; GFX10-NEXT: v_mov_b32_e32 v2, s2 1662; GFX10-NEXT: v_mov_b32_e32 v3, s3 1663; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1664; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1665; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1666; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1667; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1668; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1669; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1670; GFX10-NEXT: s_endpgm 1671; 1672; GFX9-PAL-LABEL: zero_init_large_offset_kernel: 1673; GFX9-PAL: ; %bb.0: 1674; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1675; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1676; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1677; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1678; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1679; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1680; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1681; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1682; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1683; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:16 glc 1684; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1685; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1686; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1687; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1688; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1689; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1690; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1691; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1692; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1693; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1694; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1695; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1696; GFX9-PAL-NEXT: 
s_movk_i32 vcc_hi, 0x4010 1697; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1698; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1699; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1700; GFX9-PAL-NEXT: s_endpgm 1701; 1702; GFX940-LABEL: zero_init_large_offset_kernel: 1703; GFX940: ; %bb.0: 1704; GFX940-NEXT: scratch_load_dword v0, off, off offset:16 sc0 sc1 1705; GFX940-NEXT: s_waitcnt vmcnt(0) 1706; GFX940-NEXT: s_mov_b32 s0, 0 1707; GFX940-NEXT: s_mov_b32 s1, s0 1708; GFX940-NEXT: s_mov_b32 s2, s0 1709; GFX940-NEXT: s_mov_b32 s3, s0 1710; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 1711; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 1712; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 1713; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1714; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 1715; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1716; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 1717; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1718; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 1719; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1720; GFX940-NEXT: s_endpgm 1721; 1722; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: 1723; GFX1010-PAL: ; %bb.0: 1724; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1725; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1726; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1727; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1728; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1729; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1730; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1731; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1732; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1733; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1734; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 1735; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:16 glc dlc 1736; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1737; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 1738; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1739; 
GFX1010-PAL-NEXT: s_mov_b32 s3, s0 1740; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 1741; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 1742; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 1743; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 1744; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1745; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1746; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1747; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1748; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1749; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1750; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1751; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1752; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1753; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1754; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1755; GFX1010-PAL-NEXT: s_endpgm 1756; 1757; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: 1758; GFX1030-PAL: ; %bb.0: 1759; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1760; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1761; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1762; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1763; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1764; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1765; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1766; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1767; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1768; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc 1769; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1770; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 1771; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1772; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 1773; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1774; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 1775; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 1776; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 1777; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 1778; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 1779; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, 
v[0:3], vcc_lo
; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX1030-PAL-NEXT: s_endpgm
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Callable-function variant of the large-offset zero-init test. A
; [4096 x i32] padding alloca (read once with a volatile load, presumably so
; it is not optimised away) precedes the [32 x i16] alloca whose 64 bytes are
; cleared with memset. The checks below expect four dwordx4 scratch stores
; based at s32 + 0x4010.
define void @zero_init_large_offset_foo() {
; GFX9-LABEL: zero_init_large_offset_foo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s32 offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_mov_b32 s1, s0
; GFX9-NEXT: s_mov_b32 s2, s0
; GFX9-NEXT: s_mov_b32 s3, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi
; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_large_offset_foo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010
; GFX10-NEXT: s_mov_b32 s1, s0
; GFX10-NEXT: s_mov_b32 s2, s0
; GFX10-NEXT: s_mov_b32 s3, s0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_large_offset_foo:
; GFX9-PAL: ; %bb.0:
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_mov_b32 s0, 0
; GFX9-PAL-NEXT: s_mov_b32 s1, s0
; GFX9-PAL-NEXT: s_mov_b32 s2, s0
; GFX9-PAL-NEXT: s_mov_b32 s3, s0
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi
; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_large_offset_foo:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: scratch_load_dword v0, off, s32 offset:16 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_mov_b32 s0, 0
; GFX940-NEXT: s_mov_b32 s1, s0
; GFX940-NEXT: s_mov_b32 s2, s0
; GFX940-NEXT: s_mov_b32 s3, s0
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi
; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX1010-PAL-LABEL: zero_init_large_offset_foo:
; GFX1010-PAL: ; %bb.0:
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010
; GFX1010-PAL-NEXT: s_mov_b32 s1, s0
; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
; GFX1010-PAL-NEXT: s_mov_b32 s3, s0
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0
; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo
; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010
; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX1030-PAL-LABEL: zero_init_large_offset_foo:
; GFX1030-PAL: ; %bb.0:
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: s_mov_b32 s0, 0
; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010
; GFX1030-PAL-NEXT: s_mov_b32 s1, s0
; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
; GFX1030-PAL-NEXT: s_mov_b32 s3, s0
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0
; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo
; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31]
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_large_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_addk_i32 s1, 0x4004
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: scratch_store_dword off, v0, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_addk_i32 s0, 0x4004
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_large_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s2, s2, s5
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
1987; GFX10-NEXT: s_lshl_b32 s1, s1, 2 1988; GFX10-NEXT: s_addk_i32 s0, 0x4004 1989; GFX10-NEXT: s_addk_i32 s1, 0x4004 1990; GFX10-NEXT: scratch_store_dword off, v0, s0 1991; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1992; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 1993; GFX10-NEXT: s_waitcnt vmcnt(0) 1994; GFX10-NEXT: s_endpgm 1995; 1996; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: 1997; GFX9-PAL: ; %bb.0: ; %bb 1998; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 1999; GFX9-PAL-NEXT: s_mov_b32 s4, s0 2000; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2001; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2002; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2003; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2004; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2005; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 2006; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 2007; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2008; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2009; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 2010; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 2011; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 2012; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 2013; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 2014; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 2015; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2016; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 2017; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 2018; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2019; GFX9-PAL-NEXT: s_endpgm 2020; 2021; GFX940-LABEL: store_load_sindex_large_offset_kernel: 2022; GFX940: ; %bb.0: ; %bb 2023; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 2024; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 2025; GFX940-NEXT: s_waitcnt vmcnt(0) 2026; GFX940-NEXT: v_mov_b32_e32 v0, 15 2027; GFX940-NEXT: s_waitcnt lgkmcnt(0) 2028; GFX940-NEXT: s_lshl_b32 s1, s0, 2 2029; GFX940-NEXT: s_and_b32 s0, s0, 15 2030; GFX940-NEXT: s_addk_i32 s1, 0x4004 2031; GFX940-NEXT: s_lshl_b32 s0, s0, 2 2032; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 2033; 
GFX940-NEXT: s_waitcnt vmcnt(0) 2034; GFX940-NEXT: s_addk_i32 s0, 0x4004 2035; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 2036; GFX940-NEXT: s_waitcnt vmcnt(0) 2037; GFX940-NEXT: s_endpgm 2038; 2039; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: 2040; GFX1010-PAL: ; %bb.0: ; %bb 2041; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] 2042; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 2043; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2044; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2045; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2046; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 2047; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 2048; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 2049; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 2050; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2051; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2052; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 2053; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2054; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 2055; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2056; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 2057; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 2058; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 2059; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004 2060; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004 2061; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 2062; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2063; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2064; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2065; GFX1010-PAL-NEXT: s_endpgm 2066; 2067; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: 2068; GFX1030-PAL: ; %bb.0: ; %bb 2069; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] 2070; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 2071; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2072; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2073; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2074; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 2075; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 2076; GFX1030-PAL-NEXT: 
s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004
; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004
; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: s_endpgm
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; amdgpu_ps variant of the large-offset scalar-index test: %idx is passed as
; an `inreg` argument. After a [4096 x i32] padding alloca (read once with a
; volatile load), the body stores 15 (volatile) to element %idx of a
; [32 x float] alloca and loads (volatile) element (%idx & 15). The checks
; below expect scalar address arithmetic (s_lshl_b32 / s_and_b32 /
; s_addk_i32) with the array based at 0x4004.
define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_large_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_lshl_b32 s0, s2, 2
; GFX9-NEXT: s_addk_i32 s0, 0x4004
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_and_b32 s0, s2, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: s_addk_i32 s0, 0x4004
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_large_offset_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_and_b32 s0, s2, 15
; GFX10-NEXT: s_lshl_b32 s1, s2, 2
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_addk_i32 s1, 0x4004
; GFX10-NEXT: s_addk_i32 s0, 0x4004
; GFX10-NEXT: scratch_store_dword off, v0, s1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT: s_mov_b32 s2, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_sindex_large_offset_foo:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_lshl_b32 s1, s0, 2
; GFX940-NEXT: s_and_b32 s0, s0, 15
; GFX940-NEXT: s_addk_i32 s1, 0x4004
; GFX940-NEXT: v_mov_b32_e32 v0, 15
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_addk_i32 s0, 0x4004
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX1010-PAL: ; %bb.0: ; %bb
; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004
; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004
; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo:
; GFX1030-PAL: ; %bb.0: ; %bb
; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004
; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004
; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: s_endpgm
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX9-LABEL: store_load_vindex_large_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX9-NEXT: v_add_u32_e32 v2, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: scratch_store_dword v2, v3, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vindex_large_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 15
; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: scratch_store_dword v2, v3, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
;
GFX10-NEXT: s_waitcnt vmcnt(0) 2291; GFX10-NEXT: s_endpgm 2292; 2293; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel: 2294; GFX9-PAL: ; %bb.0: ; %bb 2295; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2296; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2297; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2298; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2299; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2300; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 2301; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2302; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2303; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 2304; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2305; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 2306; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2307; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 2308; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0 2309; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 2310; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2311; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0 2312; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 2313; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2314; GFX9-PAL-NEXT: s_endpgm 2315; 2316; GFX940-LABEL: store_load_vindex_large_offset_kernel: 2317; GFX940: ; %bb.0: ; %bb 2318; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 2319; GFX940-NEXT: s_waitcnt vmcnt(0) 2320; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2321; GFX940-NEXT: v_mov_b32_e32 v1, 15 2322; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 2323; GFX940-NEXT: scratch_store_dword v0, v1, vcc_hi sc0 sc1 2324; GFX940-NEXT: s_waitcnt vmcnt(0) 2325; GFX940-NEXT: v_sub_u32_e32 v0, 0x4004, v0 2326; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 2327; GFX940-NEXT: s_waitcnt vmcnt(0) 2328; GFX940-NEXT: s_endpgm 2329; 2330; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel: 2331; GFX1010-PAL: ; %bb.0: ; %bb 2332; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 2333; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2334; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2335; GFX1010-PAL-NEXT: s_waitcnt 
lgkmcnt(0) 2336; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2337; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 2338; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 2339; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2340; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2341; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 2342; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2343; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, 15 2344; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2345; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 2346; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 2347; GFX1010-PAL-NEXT: scratch_load_dword v1, off, vcc_lo offset:4 glc dlc 2348; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2349; GFX1010-PAL-NEXT: scratch_store_dword v2, v3, off 2350; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2351; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 2352; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2353; GFX1010-PAL-NEXT: s_endpgm 2354; 2355; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel: 2356; GFX1030-PAL: ; %bb.0: ; %bb 2357; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 2358; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2359; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2360; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2361; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2362; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 2363; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 2364; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2365; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2366; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 0x4004 2367; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 2368; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, 15 2369; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0 2370; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 2371; GFX1030-PAL-NEXT: scratch_load_dword v1, off, off offset:4 glc dlc 2372; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2373; GFX1030-PAL-NEXT: scratch_store_dword v2, v3, off 2374; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2375; 
GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 2376; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2377; GFX1030-PAL-NEXT: s_endpgm 2378bb: 2379 %padding = alloca [4096 x i32], align 4, addrspace(5) 2380 %i = alloca [32 x float], align 4, addrspace(5) 2381 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2382 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2383 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2384 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 2385 %i3 = zext i32 %i2 to i64 2386 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 2387 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2388 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2389 %i9 = sub nsw i32 31, %i2 2390 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2391 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2392 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2393 ret void 2394} 2395 2396define void @store_load_vindex_large_offset_foo(i32 %idx) { 2397; GFX9-LABEL: store_load_vindex_large_offset_foo: 2398; GFX9: ; %bb.0: ; %bb 2399; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2400; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc 2401; GFX9-NEXT: s_waitcnt vmcnt(0) 2402; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4004 2403; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 2404; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 2405; GFX9-NEXT: v_mov_b32_e32 v3, 15 2406; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 2407; GFX9-NEXT: scratch_store_dword v2, v3, off 2408; GFX9-NEXT: s_waitcnt vmcnt(0) 2409; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 2410; GFX9-NEXT: scratch_load_dword v0, v0, off glc 2411; GFX9-NEXT: s_waitcnt vmcnt(0) 2412; GFX9-NEXT: s_setpc_b64 s[30:31] 2413; 2414; GFX10-LABEL: store_load_vindex_large_offset_foo: 2415; GFX10: ; %bb.0: ; %bb 2416; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 2417; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2418; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4004 2419; GFX10-NEXT: v_and_b32_e32 v2, 15, v0 2420; GFX10-NEXT: v_mov_b32_e32 v1, vcc_lo 2421; GFX10-NEXT: v_mov_b32_e32 v3, 15 2422; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v1 2423; GFX10-NEXT: v_lshl_add_u32 v1, v2, 2, v1 2424; GFX10-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc 2425; GFX10-NEXT: s_waitcnt vmcnt(0) 2426; GFX10-NEXT: scratch_store_dword v0, v3, off 2427; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2428; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc 2429; GFX10-NEXT: s_waitcnt vmcnt(0) 2430; GFX10-NEXT: s_setpc_b64 s[30:31] 2431; 2432; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo: 2433; GFX9-PAL: ; %bb.0: ; %bb 2434; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2435; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc 2436; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2437; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4004 2438; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi 2439; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 2440; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 2441; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 2442; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 2443; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2444; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 2445; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 2446; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2447; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2448; 2449; GFX940-LABEL: store_load_vindex_large_offset_foo: 2450; GFX940: ; %bb.0: ; %bb 2451; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2452; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1 2453; GFX940-NEXT: s_waitcnt vmcnt(0) 2454; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 2455; GFX940-NEXT: v_mov_b32_e32 v2, 15 2456; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 2457; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 2458; GFX940-NEXT: scratch_store_dword v1, v2, vcc_hi sc0 sc1 2459; GFX940-NEXT: s_waitcnt vmcnt(0) 2460; GFX940-NEXT: 
v_lshlrev_b32_e32 v0, 2, v0 2461; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4004 2462; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 2463; GFX940-NEXT: s_waitcnt vmcnt(0) 2464; GFX940-NEXT: s_setpc_b64 s[30:31] 2465; 2466; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo: 2467; GFX10-PAL: ; %bb.0: ; %bb 2468; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2469; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2470; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 2471; GFX10-PAL-NEXT: v_and_b32_e32 v2, 15, v0 2472; GFX10-PAL-NEXT: v_mov_b32_e32 v1, vcc_lo 2473; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15 2474; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 2475; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v2, 2, v1 2476; GFX10-PAL-NEXT: scratch_load_dword v2, off, s32 offset:4 glc dlc 2477; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2478; GFX10-PAL-NEXT: scratch_store_dword v0, v3, off 2479; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2480; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc 2481; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2482; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 2499bb: 2500 %padding = alloca [4096 x i32], align 4, addrspace(5) 2501 %i = alloca [32 x float], align 4, addrspace(5) 2502 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0,
i32 undef 2503 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2504 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2505 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 2506 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2507 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2508 %i9 = and i32 %idx, 15 2509 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2510 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2511 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2512 ret void 2513} 2514 2515define amdgpu_kernel void @store_load_large_imm_offset_kernel() { 2516; GFX9-LABEL: store_load_large_imm_offset_kernel: 2517; GFX9: ; %bb.0: ; %bb 2518; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 2519; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 2520; GFX9-NEXT: v_mov_b32_e32 v0, 13 2521; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2522; GFX9-NEXT: s_movk_i32 s0, 0x3000 2523; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 2524; GFX9-NEXT: s_waitcnt vmcnt(0) 2525; GFX9-NEXT: s_add_i32 s0, s0, 4 2526; GFX9-NEXT: v_mov_b32_e32 v0, 15 2527; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 2528; GFX9-NEXT: s_waitcnt vmcnt(0) 2529; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 2530; GFX9-NEXT: s_waitcnt vmcnt(0) 2531; GFX9-NEXT: s_endpgm 2532; 2533; GFX10-LABEL: store_load_large_imm_offset_kernel: 2534; GFX10: ; %bb.0: ; %bb 2535; GFX10-NEXT: s_add_u32 s0, s0, s3 2536; GFX10-NEXT: s_addc_u32 s1, s1, 0 2537; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 2538; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 2539; GFX10-NEXT: v_mov_b32_e32 v0, 13 2540; GFX10-NEXT: v_mov_b32_e32 v1, 15 2541; GFX10-NEXT: s_movk_i32 s0, 0x3800 2542; GFX10-NEXT: s_add_i32 s0, s0, 4 2543; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 2544; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2545; GFX10-NEXT: scratch_store_dword 
off, v1, s0 offset:1664 2546; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2547; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 2548; GFX10-NEXT: s_waitcnt vmcnt(0) 2549; GFX10-NEXT: s_endpgm 2550; 2551; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel: 2552; GFX9-PAL: ; %bb.0: ; %bb 2553; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2554; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2555; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2556; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 2557; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2558; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 2559; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2560; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2561; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 2562; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2563; GFX9-PAL-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 2564; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2565; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4 2566; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 2567; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 2568; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2569; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 2570; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2571; GFX9-PAL-NEXT: s_endpgm 2572; 2573; GFX940-LABEL: store_load_large_imm_offset_kernel: 2574; GFX940: ; %bb.0: ; %bb 2575; GFX940-NEXT: v_mov_b32_e32 v0, 13 2576; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1 2577; GFX940-NEXT: s_waitcnt vmcnt(0) 2578; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 2579; GFX940-NEXT: v_mov_b32_e32 v1, 15 2580; GFX940-NEXT: scratch_store_dword v0, v1, off offset:3716 sc0 sc1 2581; GFX940-NEXT: s_waitcnt vmcnt(0) 2582; GFX940-NEXT: scratch_load_dword v0, v0, off offset:3716 sc0 sc1 2583; GFX940-NEXT: s_waitcnt vmcnt(0) 2584; GFX940-NEXT: s_endpgm 2585; 2586; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel: 2587; GFX1010-PAL: ; %bb.0: ; %bb 2588; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 2589; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2590; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 
2591; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2592; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2593; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 2594; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 2595; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2596; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2597; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13 2598; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15 2599; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800 2600; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2601; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4 2602; GFX1010-PAL-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 2603; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2604; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 2605; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2606; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 2607; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2608; GFX1010-PAL-NEXT: s_endpgm 2609; 2610; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel: 2611; GFX1030-PAL: ; %bb.0: ; %bb 2612; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 2613; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2614; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2615; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2616; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2617; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 2618; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 2619; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2620; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2621; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13 2622; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15 2623; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800 2624; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4 2625; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4 2626; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2627; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 2628; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2629; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 2630; 
GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2631; GFX1030-PAL-NEXT: s_endpgm 2632bb: 2633 %i = alloca [4096 x i32], align 4, addrspace(5) 2634 %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef 2635 store volatile i32 13, i32 addrspace(5)* %i1, align 4 2636 %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 2637 store volatile i32 15, i32 addrspace(5)* %i7, align 4 2638 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 2639 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 2640 ret void 2641} 2642 2643define void @store_load_large_imm_offset_foo() { 2644; GFX9-LABEL: store_load_large_imm_offset_foo: 2645; GFX9: ; %bb.0: ; %bb 2646; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2647; GFX9-NEXT: v_mov_b32_e32 v0, 13 2648; GFX9-NEXT: s_movk_i32 s0, 0x3000 2649; GFX9-NEXT: s_add_i32 vcc_hi, s32, 4 2650; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 2651; GFX9-NEXT: s_waitcnt vmcnt(0) 2652; GFX9-NEXT: s_add_i32 s0, s0, vcc_hi 2653; GFX9-NEXT: v_mov_b32_e32 v0, 15 2654; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 2655; GFX9-NEXT: s_waitcnt vmcnt(0) 2656; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 2657; GFX9-NEXT: s_waitcnt vmcnt(0) 2658; GFX9-NEXT: s_setpc_b64 s[30:31] 2659; 2660; GFX10-LABEL: store_load_large_imm_offset_foo: 2661; GFX10: ; %bb.0: ; %bb 2662; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2663; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2664; GFX10-NEXT: v_mov_b32_e32 v0, 13 2665; GFX10-NEXT: v_mov_b32_e32 v1, 15 2666; GFX10-NEXT: s_movk_i32 s0, 0x3800 2667; GFX10-NEXT: s_add_i32 vcc_lo, s32, 4 2668; GFX10-NEXT: s_add_i32 s0, s0, vcc_lo 2669; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 2670; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2671; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 2672; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2673; GFX10-NEXT: scratch_load_dword v0, off, s0 
offset:1664 glc dlc 2674; GFX10-NEXT: s_waitcnt vmcnt(0) 2675; GFX10-NEXT: s_setpc_b64 s[30:31] 2676; 2677; GFX9-PAL-LABEL: store_load_large_imm_offset_foo: 2678; GFX9-PAL: ; %bb.0: ; %bb 2679; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2680; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13 2681; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000 2682; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 4 2683; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 2684; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2685; GFX9-PAL-NEXT: s_add_i32 s0, s0, vcc_hi 2686; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 2687; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712 2688; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2689; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc 2690; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2691; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2692; 2693; GFX940-LABEL: store_load_large_imm_offset_foo: 2694; GFX940: ; %bb.0: ; %bb 2695; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2696; GFX940-NEXT: v_mov_b32_e32 v0, 13 2697; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1 2698; GFX940-NEXT: s_waitcnt vmcnt(0) 2699; GFX940-NEXT: v_mov_b32_e32 v0, 0x3000 2700; GFX940-NEXT: v_mov_b32_e32 v1, 15 2701; GFX940-NEXT: scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1 2702; GFX940-NEXT: s_waitcnt vmcnt(0) 2703; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1 2704; GFX940-NEXT: s_waitcnt vmcnt(0) 2705; GFX940-NEXT: s_setpc_b64 s[30:31] 2706; 2707; GFX10-PAL-LABEL: store_load_large_imm_offset_foo: 2708; GFX10-PAL: ; %bb.0: ; %bb 2709; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2710; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2711; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13 2712; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 2713; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800 2714; GFX10-PAL-NEXT: s_add_i32 vcc_lo, s32, 4 2715; GFX10-PAL-NEXT: s_add_i32 s0, s0, vcc_lo 2716; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4 2717; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 
0x0 2718; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664 2719; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2720; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc 2721; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2722; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 2736bb: 2737 %i = alloca [4096 x i32], align 4, addrspace(5) 2738 %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef 2739 store volatile i32 13, i32 addrspace(5)* %i1, align 4 2740 %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 2741 store volatile i32 15, i32 addrspace(5)* %i7, align 4 2742 %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 2743 %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 2744 ret void 2745} 2746 2747define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { 2748; GFX9-LABEL: store_load_vidx_sidx_offset: 2749; GFX9: ; %bb.0: ; %bb 2750; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 2751; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 2752; GFX9-NEXT: v_mov_b32_e32 v1, 4 2753; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2754; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2755; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 2756; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 2757; GFX9-NEXT: v_mov_b32_e32 v1, 15 2758; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 2759; GFX9-NEXT: s_waitcnt
vmcnt(0) 2760; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 glc 2761; GFX9-NEXT: s_waitcnt vmcnt(0) 2762; GFX9-NEXT: s_endpgm 2763; 2764; GFX10-LABEL: store_load_vidx_sidx_offset: 2765; GFX10: ; %bb.0: ; %bb 2766; GFX10-NEXT: s_add_u32 s2, s2, s5 2767; GFX10-NEXT: s_addc_u32 s3, s3, 0 2768; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2769; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2770; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 2771; GFX10-NEXT: v_mov_b32_e32 v1, 15 2772; GFX10-NEXT: s_waitcnt lgkmcnt(0) 2773; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 2774; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4 2775; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 2776; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2777; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc 2778; GFX10-NEXT: s_waitcnt vmcnt(0) 2779; GFX10-NEXT: s_endpgm 2780; 2781; GFX9-PAL-LABEL: store_load_vidx_sidx_offset: 2782; GFX9-PAL: ; %bb.0: ; %bb 2783; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 2784; GFX9-PAL-NEXT: s_mov_b32 s4, s0 2785; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2786; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4 2787; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2788; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2789; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2790; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 2791; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0 2792; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 2793; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 2794; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 2795; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 2796; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2797; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc 2798; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2799; GFX9-PAL-NEXT: s_endpgm 2800; 2801; GFX940-LABEL: store_load_vidx_sidx_offset: 2802; GFX940: ; %bb.0: ; %bb 2803; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 2804; GFX940-NEXT: v_mov_b32_e32 v1, 15 2805; GFX940-NEXT: s_waitcnt lgkmcnt(0) 2806; GFX940-NEXT: 
v_add_lshl_u32 v0, s0, v0, 2 2807; GFX940-NEXT: scratch_store_dword v0, v1, off offset:1028 sc0 sc1 2808; GFX940-NEXT: s_waitcnt vmcnt(0) 2809; GFX940-NEXT: scratch_load_dword v0, v0, off offset:1028 sc0 sc1 2810; GFX940-NEXT: s_waitcnt vmcnt(0) 2811; GFX940-NEXT: s_endpgm 2812; 2813; GFX10-PAL-LABEL: store_load_vidx_sidx_offset: 2814; GFX10-PAL: ; %bb.0: ; %bb 2815; GFX10-PAL-NEXT: s_getpc_b64 s[4:5] 2816; GFX10-PAL-NEXT: s_mov_b32 s4, s0 2817; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2818; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 2819; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2820; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3 2821; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0 2822; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 2823; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 2824; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2825; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 2826; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 2827; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0 2828; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4 2829; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024 2830; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2831; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024 glc dlc 2832; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2833; GFX10-PAL-NEXT: s_endpgm 2845bb: 2846 %alloca = alloca [32 x i32], align 4, addrspace(5) 2847 %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() 2848 %add1 = add nsw i32 %sidx, %vidx 2849 %add2 = add nsw i32 %add1, 256 2850 %gep = getelementptr inbounds
[32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2 2851 store volatile i32 15, i32 addrspace(5)* %gep, align 4 2852 %load = load volatile i32, i32 addrspace(5)* %gep, align 4 2853 ret void 2854} 2855 2856define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) { 2857; GFX9-LABEL: store_load_i64_aligned: 2858; GFX9: ; %bb.0: ; %bb 2859; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2860; GFX9-NEXT: v_mov_b32_e32 v1, 15 2861; GFX9-NEXT: v_mov_b32_e32 v2, 0 2862; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2863; GFX9-NEXT: s_waitcnt vmcnt(0) 2864; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 2865; GFX9-NEXT: s_waitcnt vmcnt(0) 2866; GFX9-NEXT: s_setpc_b64 s[30:31] 2867; 2868; GFX10-LABEL: store_load_i64_aligned: 2869; GFX10: ; %bb.0: ; %bb 2870; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2871; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2872; GFX10-NEXT: v_mov_b32_e32 v1, 15 2873; GFX10-NEXT: v_mov_b32_e32 v2, 0 2874; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2875; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2876; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 2877; GFX10-NEXT: s_waitcnt vmcnt(0) 2878; GFX10-NEXT: s_setpc_b64 s[30:31] 2879; 2880; GFX9-PAL-LABEL: store_load_i64_aligned: 2881; GFX9-PAL: ; %bb.0: ; %bb 2882; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2883; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 2884; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 2885; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2886; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2887; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 2888; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2889; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2890; 2891; GFX940-LABEL: store_load_i64_aligned: 2892; GFX940: ; %bb.0: ; %bb 2893; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2894; GFX940-NEXT: v_mov_b32_e32 v2, 15 2895; GFX940-NEXT: v_mov_b32_e32 v3, 0 2896; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 2897; GFX940-NEXT: s_waitcnt vmcnt(0) 
2898; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 2899; GFX940-NEXT: s_waitcnt vmcnt(0) 2900; GFX940-NEXT: s_setpc_b64 s[30:31] 2901; 2902; GFX10-PAL-LABEL: store_load_i64_aligned: 2903; GFX10-PAL: ; %bb.0: ; %bb 2904; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2905; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2906; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 2907; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 2908; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2909; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2910; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 2911; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2912; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 2923bb: 2924 store volatile i64 15, i64 addrspace(5)* %arg, align 8 2925 %load = load volatile i64, i64 addrspace(5)* %arg, align 8 2926 ret void 2927} 2928 2929define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) { 2930; GFX9-LABEL: store_load_i64_unaligned: 2931; GFX9: ; %bb.0: ; %bb 2932; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2933; GFX9-NEXT: v_mov_b32_e32 v1, 15 2934; GFX9-NEXT: v_mov_b32_e32 v2, 0 2935; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2936; GFX9-NEXT: s_waitcnt vmcnt(0) 2937; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 2938; GFX9-NEXT: s_waitcnt vmcnt(0) 2939; GFX9-NEXT: s_setpc_b64 s[30:31] 2940; 2941; GFX10-LABEL: store_load_i64_unaligned: 2942; GFX10: ; %bb.0: ; %bb 2943; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2944; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2945; GFX10-NEXT: v_mov_b32_e32 v1, 15 2946;
GFX10-NEXT: v_mov_b32_e32 v2, 0 2947; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2948; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2949; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 2950; GFX10-NEXT: s_waitcnt vmcnt(0) 2951; GFX10-NEXT: s_setpc_b64 s[30:31] 2952; 2953; GFX9-PAL-LABEL: store_load_i64_unaligned: 2954; GFX9-PAL: ; %bb.0: ; %bb 2955; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2956; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 2957; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0 2958; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2959; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2960; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc 2961; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2962; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 2963; 2964; GFX940-LABEL: store_load_i64_unaligned: 2965; GFX940: ; %bb.0: ; %bb 2966; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2967; GFX940-NEXT: v_mov_b32_e32 v2, 15 2968; GFX940-NEXT: v_mov_b32_e32 v3, 0 2969; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 2970; GFX940-NEXT: s_waitcnt vmcnt(0) 2971; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 2972; GFX940-NEXT: s_waitcnt vmcnt(0) 2973; GFX940-NEXT: s_setpc_b64 s[30:31] 2974; 2975; GFX10-PAL-LABEL: store_load_i64_unaligned: 2976; GFX10-PAL: ; %bb.0: ; %bb 2977; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2978; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2979; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15 2980; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0 2981; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off 2982; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2983; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc 2984; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 2985; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 2986; GCN-LABEL: store_load_i64_unaligned: 2987; GCN: ; %bb.0: ; %bb 2988; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2989; GCN-NEXT: v_mov_b32_e32 v2, 15 2990; GCN-NEXT: v_mov_b32_e32 v3, 0 2991; GCN-NEXT: scratch_store_dwordx2 v0, v[2:3], off 
sc0 sc1 2992; GCN-NEXT: s_waitcnt vmcnt(0) 2993; GCN-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 2994; GCN-NEXT: s_waitcnt vmcnt(0) 2995; GCN-NEXT: s_setpc_b64 s[30:31] 2996bb: 2997 store volatile i64 15, i64 addrspace(5)* %arg, align 1 2998 %load = load volatile i64, i64 addrspace(5)* %arg, align 1 2999 ret void 3000}

; Volatile <3 x i32> store followed by a volatile load through an
; addrspace(5) pointer argument, both with align 1. Every enabled prefix
; expects a single scratch_store_dwordx3 / scratch_load_dwordx3 pair.
define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v3i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v3i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 1
; GFX940-NEXT:    v_mov_b32_e32 v3, 2
; GFX940-NEXT:    v_mov_b32_e32 v4, 3
; GFX940-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
  ret void
}

; Volatile <4 x i32> store followed by a volatile load through an
; addrspace(5) pointer argument, both with align 1. Every enabled prefix
; expects a single scratch_store_dwordx4 / scratch_load_dwordx4 pair.
define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v4i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v4i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX9-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v4i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 1
; GFX940-NEXT:    v_mov_b32_e32 v3, 2
; GFX940-NEXT:    v_mov_b32_e32 v4, 3
; GFX940-NEXT:    v_mov_b32_e32 v5, 4
; GFX940-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, 4
; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
  ret void
}

; Volatile i8 store and load at %arg - 1. The GFX10 and GFX1030-PAL checks
; expect the -1 folded into the instruction's offset field; the GFX9,
; GFX9-PAL, GFX940 and GFX1010-PAL checks expect a separate add on the
; address register instead.
define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i32_negative_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    scratch_store_byte v0, v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i32_negative_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i32_negative_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_add_u32_e32 v0, -1, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 1
; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1
  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
  ret void
}

; Volatile i8 store and load at %arg - 4225. The checks expect each target
; to materialise the displacement differently: a single add of 0xffffef7f
; (GFX9, GFX9-PAL), an SGPR offset loaded with s_movk_i32 (GFX940), or an
; add combined with a small negative instruction offset (GFX10,
; GFX1010-PAL, GFX1030-PAL).
define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i32_large_negative_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-NEXT:    scratch_store_byte v0, v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i32_large_negative_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 1
; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-129
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i32_large_negative_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_movk_i32 s0, 0xef7f
; GFX940-NEXT:    v_mov_b32_e32 v1, 1
; GFX940-NEXT:    scratch_store_byte v0, v1, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_ubyte v0, v0, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225
  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
  ret void
}

; amdgpu_ps function with two [128 x <4 x i32>] allocas in addrspace(5).
; It does a volatile <4 x i32> store and load at element 60 of the second
; alloca, then hands the address of each alloca to an inline-asm "use".
; The checks expect the two addresses to be materialised as 16 and 0x810.
define amdgpu_ps void @large_offset() {
; GFX9-LABEL: large_offset:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    v_mov_b32_e32 v3, v0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 16
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x810
; GFX9-NEXT:    ;;#ASMSTART
; GFX9-NEXT:    ; use v0
; GFX9-NEXT:    ;;#ASMEND
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: large_offset:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s2
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_movk_i32 s0, 0x810
; GFX10-NEXT:    s_addk_i32 s0, 0x3c0
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 16
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x810
; GFX10-NEXT:    ;;#ASMSTART
; GFX10-NEXT:    ; use v0
; GFX10-NEXT:    ;;#ASMEND
; GFX10-NEXT:    ;;#ASMSTART
; GFX10-NEXT:    ; use v1
; GFX10-NEXT:    ;;#ASMEND
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: large_offset:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, v0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 16
; GFX9-PAL-NEXT:    ;;#ASMSTART
; GFX9-PAL-NEXT:    ; use v0
; GFX9-PAL-NEXT:    ;;#ASMEND
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x810
; GFX9-PAL-NEXT:    ;;#ASMSTART
; GFX9-PAL-NEXT:    ; use v0
; GFX9-PAL-NEXT:    ;;#ASMEND
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: large_offset:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    v_mov_b32_e32 v0, 0
; GFX940-NEXT:    v_mov_b32_e32 v1, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, v0
; GFX940-NEXT:    v_mov_b32_e32 v3, v0
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 16
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; use v0
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    v_mov_b32_e32 v0, 0x810
; GFX940-NEXT:    ;;#ASMSTART
; GFX940-NEXT:    ; use v0
; GFX940-NEXT:    ;;#ASMEND
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: large_offset:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s0
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x810
; GFX10-PAL-NEXT:    s_addk_i32 s0, 0x3c0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, v0
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 16
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x810
; GFX10-PAL-NEXT:    ;;#ASMSTART
; GFX10-PAL-NEXT:    ; use v0
; GFX10-PAL-NEXT:    ;;#ASMEND
; GFX10-PAL-NEXT:    ;;#ASMSTART
; GFX10-PAL-NEXT:    ; use v1
; GFX10-PAL-NEXT:    ;;#ASMEND
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5)
  %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5)
  %gep = getelementptr inbounds [128 x <4 x i32>], [128 x <4 x i32>] addrspace(5)* %alloca2, i32 0, i32 60
  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %gep, align 16
  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %gep, align 16
  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca) #0
  call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca2) #0
  ret void
}

declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
declare i32 @llvm.amdgcn.workitem.id.x()