; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
; RUN: llc -march=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s

; Zero-fills a 64-byte private (addrspace(5)) alloca with memset inside a kernel.
define amdgpu_kernel void @zero_init_kernel() {
; GFX9-LABEL: zero_init_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: zero_init_kernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: zero_init_kernel:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: zero_init_kernel:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: zero_init_kernel:
; GFX1010-PAL:       ; %bb.0:
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: zero_init_kernel:
; GFX1030-PAL:       ; %bb.0:
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX1030-PAL-NEXT:    s_endpgm
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Same 64-byte memset of a private alloca, but in a non-kernel function.
define void @zero_init_foo() {
; GFX9-LABEL: zero_init_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s0, 0
; GFX10-NEXT:    s_mov_b32 s1, s0
; GFX10-NEXT:    s_mov_b32 s2, s0
; GFX10-NEXT:    s_mov_b32 s3, s0
; GFX10-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_mov_b32 s0, 0
; GFX940-NEXT:    s_mov_b32 s1, s0
; GFX940-NEXT:    s_mov_b32 s2, s0
; GFX940-NEXT:    s_mov_b32 s3, s0
; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Kernel: volatile store of 15 to element %idx of a [32 x float] private
; alloca, then a volatile load from element (%idx & 15). %idx is a kernel
; argument.
define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-NEXT:    s_and_b32 s0, s0, 15
; GFX9-NEXT:    s_add_i32 s1, s1, 4
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    scratch_store_dword off, v0, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_and_b32 s1, s0, 15
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Same store/load pattern as @store_load_sindex_kernel, but in an amdgpu_ps
; function whose index arrives as an inreg argument.
define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s2, 15
; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_sindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-NEXT:    s_and_b32 s0, s2, 15
; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-NEXT:    s_add_i32 s1, s1, 4
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    scratch_store_dword off, v0, s1
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_sindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
; GFX940-NEXT:    s_and_b32 s0, s0, 15
; GFX940-NEXT:    s_add_i32 s1, s1, 4
; GFX940-NEXT:    v_mov_b32_e32 v0, 15
; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_add_i32 s0, s0, 4
; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_sindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX10-PAL-NEXT:    s_add_i32 s1, s1, 4
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Kernel: the element index is the workitem id (x). Volatile store of 15 to
; element id, then a volatile load from element (31 - id) of a [32 x float]
; private alloca.
define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-LABEL: store_load_vindex_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_add_u32_e32 v1, 4, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 4, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
; GFX10-NEXT:    scratch_store_dword v1, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 4, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vindex_kernel:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v1, 4, v0
; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, 4, v0
; GFX10-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Non-kernel function: volatile store of 15 to element %idx, then a volatile
; load from element (%idx & 15) of a [32 x float] private alloca. %idx is an
; ordinary (non-inreg) argument.
define void @store_load_vindex_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [32 x float], align 4, addrspace(5)
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Stores the float constant 10.0 (0x41200000) to element 1 of a private
; pointer passed in as an argument.
define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: private_ptr_foo:
; GFX9-PAL:       ; %bb.0:
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: private_ptr_foo:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: private_ptr_foo:
; GFX10-PAL:       ; %bb.0:
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
  store float 1.000000e+01, float addrspace(5)* %gep, align 4
  ret void
}

define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX9-LABEL: zero_init_small_offset_kernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_mov_b32 s0, 0
; GFX9-NEXT:    s_mov_b32 s1, s0
; GFX9-NEXT:    s_mov_b32 s2, s0
; GFX9-NEXT:    s_mov_b32 s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    v_mov_b32_e32 v2, s2
; GFX9-NEXT:    v_mov_b32_e32 v3, s3
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
788; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 789; GFX9-NEXT: s_mov_b32 vcc_hi, 0 790; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 791; GFX9-NEXT: s_endpgm 792; 793; GFX10-LABEL: zero_init_small_offset_kernel: 794; GFX10: ; %bb.0: 795; GFX10-NEXT: s_add_u32 s0, s0, s3 796; GFX10-NEXT: s_addc_u32 s1, s1, 0 797; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 798; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 799; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 800; GFX10-NEXT: s_waitcnt vmcnt(0) 801; GFX10-NEXT: s_mov_b32 s0, 0 802; GFX10-NEXT: s_mov_b32 s1, s0 803; GFX10-NEXT: s_mov_b32 s2, s0 804; GFX10-NEXT: s_mov_b32 s3, s0 805; GFX10-NEXT: v_mov_b32_e32 v0, s0 806; GFX10-NEXT: v_mov_b32_e32 v1, s1 807; GFX10-NEXT: v_mov_b32_e32 v2, s2 808; GFX10-NEXT: v_mov_b32_e32 v3, s3 809; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 810; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 811; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 812; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 813; GFX10-NEXT: s_endpgm 814; 815; GFX9-PAL-LABEL: zero_init_small_offset_kernel: 816; GFX9-PAL: ; %bb.0: 817; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 818; GFX9-PAL-NEXT: s_mov_b32 s2, s0 819; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 820; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 821; GFX9-PAL-NEXT: s_mov_b32 s0, 0 822; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 823; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 824; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 825; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 826; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 827; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 828; GFX9-PAL-NEXT: s_mov_b32 s1, s0 829; GFX9-PAL-NEXT: s_mov_b32 s2, s0 830; GFX9-PAL-NEXT: s_mov_b32 s3, s0 831; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 832; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 833; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 834; GFX9-PAL-NEXT: 
v_mov_b32_e32 v3, s3 835; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 836; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272 837; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 838; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288 839; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 840; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304 841; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 842; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320 843; GFX9-PAL-NEXT: s_endpgm 844; 845; GFX940-LABEL: zero_init_small_offset_kernel: 846; GFX940: ; %bb.0: 847; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 848; GFX940-NEXT: s_waitcnt vmcnt(0) 849; GFX940-NEXT: s_mov_b32 s0, 0 850; GFX940-NEXT: s_mov_b32 s1, s0 851; GFX940-NEXT: s_mov_b32 s2, s0 852; GFX940-NEXT: s_mov_b32 s3, s0 853; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 854; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 855; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 856; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288 857; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304 858; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320 859; GFX940-NEXT: s_endpgm 860; 861; GFX1010-PAL-LABEL: zero_init_small_offset_kernel: 862; GFX1010-PAL: ; %bb.0: 863; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 864; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 865; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 866; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 867; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 868; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 869; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 870; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 871; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 872; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 873; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 874; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 875; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 876; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 877; 
GFX1010-PAL-NEXT: s_mov_b32 s2, s0 878; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 879; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 880; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 881; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 882; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 883; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 884; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272 885; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 886; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 887; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288 888; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 889; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 890; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304 891; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 892; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 893; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320 894; GFX1010-PAL-NEXT: s_endpgm 895; 896; GFX1030-PAL-LABEL: zero_init_small_offset_kernel: 897; GFX1030-PAL: ; %bb.0: 898; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 899; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 900; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 901; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 902; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 903; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 904; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 905; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 906; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 907; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 908; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 909; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 910; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 911; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 912; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 913; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 914; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 915; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 916; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 917; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272 918; GFX1030-PAL-NEXT: 
scratch_store_dwordx4 off, v[0:3], off offset:288
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX1030-PAL-NEXT: s_endpgm
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

; Non-kernel (callable) variant of the small-offset zero-init test.
; %padding (64 x i32 = 256 bytes, read by a volatile load) is allocated ahead of
; the 64-byte %alloca that is zero-filled by memset; the checks expect the four
; dwordx4 stores at s32-relative offsets 256, 272, 288 and 304.
define void @zero_init_small_offset_foo() {
; GFX9-LABEL: zero_init_small_offset_foo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: scratch_load_dword v0, off, s32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_mov_b32 s1, s0
; GFX9-NEXT: s_mov_b32 s2, s0
; GFX9-NEXT: s_mov_b32 s3, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: zero_init_small_offset_foo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_mov_b32 s0, 0
; GFX10-NEXT: s_mov_b32 s1, s0
; GFX10-NEXT: s_mov_b32 s2, s0
; GFX10-NEXT: s_mov_b32 s3, s0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: v_mov_b32_e32 v3, s3
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: zero_init_small_offset_foo:
; GFX9-PAL: ; %bb.0:
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_mov_b32 s0, 0
; GFX9-PAL-NEXT: s_mov_b32 s1, s0
; GFX9-PAL-NEXT: s_mov_b32 s2, s0
; GFX9-PAL-NEXT: s_mov_b32 s3, s0
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: zero_init_small_offset_foo:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: scratch_load_dword v0, off, s32 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_mov_b32 s0, 0
; GFX940-NEXT: s_mov_b32 s1, s0
; GFX940-NEXT: s_mov_b32 s2, s0
; GFX940-NEXT: s_mov_b32 s3, s0
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: zero_init_small_offset_foo:
; GFX10-PAL: ; %bb.0:
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc
; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT: s_mov_b32 s0, 0
; GFX10-PAL-NEXT: s_mov_b32 s1, s0
; GFX10-PAL-NEXT: s_mov_b32 s2, s0
; GFX10-PAL-NEXT: s_mov_b32 s3, s0
; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %alloca = alloca [32 x i16], align 2, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
  call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
  ret void
}

define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: s_addk_i32 s1, 0x104
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: scratch_store_dword off, v0, s1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_addk_i32 s0, 0x104
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s2, s2, s5
; GFX10-NEXT: s_addc_u32 s3, s3, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s0, 15
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
;
GFX10-NEXT: s_lshl_b32 s1, s1, 2 1092; GFX10-NEXT: s_addk_i32 s0, 0x104 1093; GFX10-NEXT: s_addk_i32 s1, 0x104 1094; GFX10-NEXT: scratch_store_dword off, v0, s0 1095; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1096; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 1097; GFX10-NEXT: s_waitcnt vmcnt(0) 1098; GFX10-NEXT: s_endpgm 1099; 1100; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel: 1101; GFX9-PAL: ; %bb.0: ; %bb 1102; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 1103; GFX9-PAL-NEXT: s_mov_b32 s4, s0 1104; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1105; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1106; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 1107; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1108; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1109; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 1110; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 1111; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1112; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1113; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1114; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 1115; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 1116; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104 1117; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 1118; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 1119; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1120; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104 1121; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 1122; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1123; GFX9-PAL-NEXT: s_endpgm 1124; 1125; GFX940-LABEL: store_load_sindex_small_offset_kernel: 1126; GFX940: ; %bb.0: ; %bb 1127; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 1128; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 1129; GFX940-NEXT: s_waitcnt vmcnt(0) 1130; GFX940-NEXT: v_mov_b32_e32 v0, 15 1131; GFX940-NEXT: s_waitcnt lgkmcnt(0) 1132; GFX940-NEXT: s_lshl_b32 s1, s0, 2 1133; GFX940-NEXT: s_and_b32 s0, s0, 15 1134; GFX940-NEXT: s_addk_i32 s1, 0x104 1135; GFX940-NEXT: s_lshl_b32 s0, s0, 2 1136; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 1137; GFX940-NEXT: 
s_waitcnt vmcnt(0) 1138; GFX940-NEXT: s_addk_i32 s0, 0x104 1139; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 1140; GFX940-NEXT: s_waitcnt vmcnt(0) 1141; GFX940-NEXT: s_endpgm 1142; 1143; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel: 1144; GFX1010-PAL: ; %bb.0: ; %bb 1145; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5] 1146; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 1147; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1148; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1149; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1150; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 1151; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 1152; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 1153; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 1154; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 1155; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1156; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 1157; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1158; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 1159; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1160; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 1161; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 1162; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 1163; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104 1164; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104 1165; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 1166; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1167; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 1168; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1169; GFX1010-PAL-NEXT: s_endpgm 1170; 1171; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel: 1172; GFX1030-PAL: ; %bb.0: ; %bb 1173; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] 1174; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 1175; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1176; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1177; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1178; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 1179; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 1180; GFX1030-PAL-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104
; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104
; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; amdgpu_ps variant of the scalar-index test with a small frame offset.
; %idx is an inreg argument. %padding (64 x i32 = 256 bytes, read by a volatile
; load) is allocated ahead of the 32 x float array %i. The function does a
; volatile store of 15 to %i[%idx] followed by a volatile load from
; %i[%idx & 15]; the checks expect both addresses to be formed in an SGPR as
; (index << 2) + 0x104.
define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX9-LABEL: store_load_sindex_small_offset_foo:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_lshl_b32 s0, s2, 2
; GFX9-NEXT: s_addk_i32 s0, 0x104
; GFX9-NEXT: v_mov_b32_e32 v0, 15
; GFX9-NEXT: scratch_store_dword off, v0, s0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_and_b32 s0, s2, 15
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-NEXT: s_addk_i32 s0, 0x104
; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_sindex_small_offset_foo:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 15
; GFX10-NEXT: s_and_b32 s0, s2, 15
; GFX10-NEXT: s_lshl_b32 s1, s2, 2
; GFX10-NEXT: s_lshl_b32 s0, s0, 2
; GFX10-NEXT: s_addk_i32 s1, 0x104
; GFX10-NEXT: s_addk_i32 s0, 0x104
; GFX10-NEXT: scratch_store_dword off, v0, s1
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
;
; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX9-PAL: ; %bb.0: ; %bb
; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT: s_mov_b32 s2, s0
; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104
; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104
; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_sindex_small_offset_foo:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_lshl_b32 s1, s0, 2
; GFX940-NEXT: s_and_b32 s0, s0, 15
; GFX940-NEXT: s_addk_i32 s1, 0x104
; GFX940-NEXT: v_mov_b32_e32 v0, 15
; GFX940-NEXT: s_lshl_b32 s0, s0, 2
; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_addk_i32 s0, 0x104
; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX1010-PAL: ; %bb.0: ; %bb
; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2
; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104
; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104
; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT: s_endpgm
;
; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
; GFX1030-PAL: ; %bb.0: ; %bb
; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104
; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104
; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT: s_endpgm
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-LABEL: store_load_vindex_small_offset_kernel:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: s_mov_b32 vcc_hi, 0
; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0
; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: store_load_vindex_small_offset_kernel:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0
; GFX10-NEXT: scratch_store_dword v1, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
;
; GFX9-PAL-LABEL:
store_load_vindex_small_offset_kernel: 1396; GFX9-PAL: ; %bb.0: ; %bb 1397; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1398; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1399; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1400; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1401; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1402; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 1403; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1404; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1405; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1406; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1407; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 1408; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1409; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x104, v0 1410; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off 1411; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1412; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0 1413; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc 1414; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1415; GFX9-PAL-NEXT: s_endpgm 1416; 1417; GFX940-LABEL: store_load_vindex_small_offset_kernel: 1418; GFX940: ; %bb.0: ; %bb 1419; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 1420; GFX940-NEXT: s_waitcnt vmcnt(0) 1421; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1422; GFX940-NEXT: v_mov_b32_e32 v1, 15 1423; GFX940-NEXT: scratch_store_dword v0, v1, off offset:260 sc0 sc1 1424; GFX940-NEXT: s_waitcnt vmcnt(0) 1425; GFX940-NEXT: v_sub_u32_e32 v0, 0x104, v0 1426; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 1427; GFX940-NEXT: s_waitcnt vmcnt(0) 1428; GFX940-NEXT: s_endpgm 1429; 1430; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel: 1431; GFX1010-PAL: ; %bb.0: ; %bb 1432; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1433; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1434; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1435; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1436; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1437; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1438; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1439; 
GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1440; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1441; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1442; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 1443; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1444; GFX1010-PAL-NEXT: scratch_load_dword v3, off, vcc_lo offset:4 glc dlc 1445; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1446; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 1447; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 1448; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off 1449; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1450; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1451; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1452; GFX1010-PAL-NEXT: s_endpgm 1453; 1454; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel: 1455; GFX1030-PAL: ; %bb.0: ; %bb 1456; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1457; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1458; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1459; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1460; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1461; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1462; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1463; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1464; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1465; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1466; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 1467; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 1468; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1469; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x104, v0 1470; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x104, v0 1471; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off 1472; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1473; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc 1474; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1475; GFX1030-PAL-NEXT: s_endpgm 1476bb: 1477 %padding = alloca [64 x i32], align 4, addrspace(5) 1478 %i = alloca [32 x float], align 
4, addrspace(5) 1479 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 1480 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1481 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 1482 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 1483 %i3 = zext i32 %i2 to i64 1484 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 1485 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 1486 store volatile i32 15, i32 addrspace(5)* %i8, align 4 1487 %i9 = sub nsw i32 31, %i2 1488 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 1489 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1490 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1491 ret void 1492} 1493 1494define void @store_load_vindex_small_offset_foo(i32 %idx) { 1495; GFX9-LABEL: store_load_vindex_small_offset_foo: 1496; GFX9: ; %bb.0: ; %bb 1497; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1498; GFX9-NEXT: scratch_load_dword v1, off, s32 glc 1499; GFX9-NEXT: s_waitcnt vmcnt(0) 1500; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 1501; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi 1502; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1503; GFX9-NEXT: v_mov_b32_e32 v3, 15 1504; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 1505; GFX9-NEXT: scratch_store_dword v2, v3, off 1506; GFX9-NEXT: s_waitcnt vmcnt(0) 1507; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1508; GFX9-NEXT: scratch_load_dword v0, v0, off glc 1509; GFX9-NEXT: s_waitcnt vmcnt(0) 1510; GFX9-NEXT: s_setpc_b64 s[30:31] 1511; 1512; GFX10-LABEL: store_load_vindex_small_offset_foo: 1513; GFX10: ; %bb.0: ; %bb 1514; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1515; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1516; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 1517; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 1518; GFX10-NEXT: v_mov_b32_e32 v2, 15 1519; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, vcc_lo 1520; GFX10-NEXT: 
s_add_i32 vcc_lo, s32, 0x100 1521; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc 1522; GFX10-NEXT: s_waitcnt vmcnt(0) 1523; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, vcc_lo 1524; GFX10-NEXT: scratch_store_dword v0, v2, off 1525; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1526; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc 1527; GFX10-NEXT: s_waitcnt vmcnt(0) 1528; GFX10-NEXT: s_setpc_b64 s[30:31] 1529; 1530; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo: 1531; GFX9-PAL: ; %bb.0: ; %bb 1532; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1533; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc 1534; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1535; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x100 1536; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi 1537; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1 1538; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15 1539; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0 1540; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off 1541; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1542; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1 1543; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc 1544; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1545; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1546; 1547; GFX940-LABEL: store_load_vindex_small_offset_foo: 1548; GFX940: ; %bb.0: ; %bb 1549; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1550; GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 1551; GFX940-NEXT: s_waitcnt vmcnt(0) 1552; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 1553; GFX940-NEXT: v_mov_b32_e32 v2, 15 1554; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 1555; GFX940-NEXT: scratch_store_dword v1, v2, s32 offset:256 sc0 sc1 1556; GFX940-NEXT: s_waitcnt vmcnt(0) 1557; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 1558; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 1559; GFX940-NEXT: s_waitcnt vmcnt(0) 1560; GFX940-NEXT: s_setpc_b64 s[30:31] 1561; 1562; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo: 1563; GFX10-PAL: ; %bb.0: ; %bb 1564; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; IR under test. %padding is 64 x i32 (256 bytes) and is allocated ahead of %i,
; which is why the expected code above addresses %i at s32 + 0x100 (or with an
; immediate offset of 256) while the padding load uses s32 directly.
bb:
  %padding = alloca [64 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  ; The only use of %padding is this volatile load through an undef index.
  %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  ; Volatile store of the constant 15 to element %idx of %i.
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  ; The reload uses the index masked to its low four bits (%idx & 15).
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float],
[32 x float] addrspace(5)* %i, i32 0, i32 %i9 1604 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 1605 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 1606 ret void 1607} 1608 1609define amdgpu_kernel void @zero_init_large_offset_kernel() { 1610; GFX9-LABEL: zero_init_large_offset_kernel: 1611; GFX9: ; %bb.0: 1612; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 1613; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 1614; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1615; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:16 glc 1616; GFX9-NEXT: s_waitcnt vmcnt(0) 1617; GFX9-NEXT: s_mov_b32 s0, 0 1618; GFX9-NEXT: s_mov_b32 s1, s0 1619; GFX9-NEXT: s_mov_b32 s2, s0 1620; GFX9-NEXT: s_mov_b32 s3, s0 1621; GFX9-NEXT: v_mov_b32_e32 v0, s0 1622; GFX9-NEXT: v_mov_b32_e32 v1, s1 1623; GFX9-NEXT: v_mov_b32_e32 v2, s2 1624; GFX9-NEXT: v_mov_b32_e32 v3, s3 1625; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1626; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1627; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1628; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1629; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1630; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1631; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 1632; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1633; GFX9-NEXT: s_endpgm 1634; 1635; GFX10-LABEL: zero_init_large_offset_kernel: 1636; GFX10: ; %bb.0: 1637; GFX10-NEXT: s_add_u32 s0, s0, s3 1638; GFX10-NEXT: s_addc_u32 s1, s1, 0 1639; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 1640; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 1641; GFX10-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc 1642; GFX10-NEXT: s_waitcnt vmcnt(0) 1643; GFX10-NEXT: s_mov_b32 s0, 0 1644; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1645; GFX10-NEXT: s_mov_b32 s1, s0 1646; GFX10-NEXT: s_mov_b32 s2, s0 1647; GFX10-NEXT: s_mov_b32 s3, s0 1648; GFX10-NEXT: v_mov_b32_e32 v0, s0 1649; GFX10-NEXT: v_mov_b32_e32 v1, s1 1650; GFX10-NEXT: 
v_mov_b32_e32 v2, s2 1651; GFX10-NEXT: v_mov_b32_e32 v3, s3 1652; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1653; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1654; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1655; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1656; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1657; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 1658; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1659; GFX10-NEXT: s_endpgm 1660; 1661; GFX9-PAL-LABEL: zero_init_large_offset_kernel: 1662; GFX9-PAL: ; %bb.0: 1663; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 1664; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1665; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1666; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1667; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1668; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1669; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1670; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 1671; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1672; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:16 glc 1673; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1674; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1675; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1676; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1677; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1678; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1679; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1680; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1681; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1682; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1683; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1684; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1685; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1686; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1687; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010 1688; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1689; GFX9-PAL-NEXT: s_endpgm 1690; 1691; GFX940-LABEL: zero_init_large_offset_kernel: 1692; GFX940: ; %bb.0: 1693; GFX940-NEXT: scratch_load_dword v0, 
off, off offset:16 sc0 sc1 1694; GFX940-NEXT: s_waitcnt vmcnt(0) 1695; GFX940-NEXT: s_mov_b32 s0, 0 1696; GFX940-NEXT: s_mov_b32 s1, s0 1697; GFX940-NEXT: s_mov_b32 s2, s0 1698; GFX940-NEXT: s_mov_b32 s3, s0 1699; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 1700; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 1701; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 1702; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1703; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 1704; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1705; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 1706; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1707; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4010 1708; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1709; GFX940-NEXT: s_endpgm 1710; 1711; GFX1010-PAL-LABEL: zero_init_large_offset_kernel: 1712; GFX1010-PAL: ; %bb.0: 1713; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 1714; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1715; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1716; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 1717; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1718; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 1719; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 1720; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1721; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1722; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 1723; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 1724; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:16 glc dlc 1725; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1726; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 1727; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1728; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 1729; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 1730; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 1731; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 1732; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 1733; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1734; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1735; GFX1010-PAL-NEXT: 
s_waitcnt_depctr 0xffe3 1736; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1737; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1738; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1739; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1740; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1741; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1742; GFX1010-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1743; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1744; GFX1010-PAL-NEXT: s_endpgm 1745; 1746; GFX1030-PAL-LABEL: zero_init_large_offset_kernel: 1747; GFX1030-PAL: ; %bb.0: 1748; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 1749; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1750; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 1751; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 1752; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 1753; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 1754; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 1755; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1756; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1757; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:16 glc dlc 1758; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1759; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 1760; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1761; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 1762; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1763; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 1764; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 1765; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 1766; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 1767; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 1768; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1769; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1770; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1771; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1772; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1773; GFX1030-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 1774; GFX1030-PAL-NEXT: 
scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1775; GFX1030-PAL-NEXT: s_endpgm 1776 %padding = alloca [4096 x i32], align 4, addrspace(5) 1777 %alloca = alloca [32 x i16], align 2, addrspace(5) 1778 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1779 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1780 %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* 1781 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 1782 ret void 1783} 1784 1785define void @zero_init_large_offset_foo() { 1786; GFX9-LABEL: zero_init_large_offset_foo: 1787; GFX9: ; %bb.0: 1788; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1789; GFX9-NEXT: scratch_load_dword v0, off, s32 offset:16 glc 1790; GFX9-NEXT: s_waitcnt vmcnt(0) 1791; GFX9-NEXT: s_mov_b32 s0, 0 1792; GFX9-NEXT: s_mov_b32 s1, s0 1793; GFX9-NEXT: s_mov_b32 s2, s0 1794; GFX9-NEXT: s_mov_b32 s3, s0 1795; GFX9-NEXT: v_mov_b32_e32 v0, s0 1796; GFX9-NEXT: v_mov_b32_e32 v1, s1 1797; GFX9-NEXT: v_mov_b32_e32 v2, s2 1798; GFX9-NEXT: v_mov_b32_e32 v3, s3 1799; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 1800; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1801; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 1802; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1803; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 1804; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1805; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x4010 1806; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1807; GFX9-NEXT: s_waitcnt vmcnt(0) 1808; GFX9-NEXT: s_setpc_b64 s[30:31] 1809; 1810; GFX10-LABEL: zero_init_large_offset_foo: 1811; GFX10: ; %bb.0: 1812; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1813; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1814; GFX10-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc 1815; GFX10-NEXT: s_waitcnt vmcnt(0) 1816; GFX10-NEXT: s_mov_b32 s0, 0 1817; 
GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 1818; GFX10-NEXT: s_mov_b32 s1, s0 1819; GFX10-NEXT: s_mov_b32 s2, s0 1820; GFX10-NEXT: s_mov_b32 s3, s0 1821; GFX10-NEXT: v_mov_b32_e32 v0, s0 1822; GFX10-NEXT: v_mov_b32_e32 v1, s1 1823; GFX10-NEXT: v_mov_b32_e32 v2, s2 1824; GFX10-NEXT: v_mov_b32_e32 v3, s3 1825; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1826; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 1827; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1828; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 1829; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1830; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x4010 1831; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1832; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1833; GFX10-NEXT: s_setpc_b64 s[30:31] 1834; 1835; GFX9-PAL-LABEL: zero_init_large_offset_foo: 1836; GFX9-PAL: ; %bb.0: 1837; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1838; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc 1839; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1840; GFX9-PAL-NEXT: s_mov_b32 s0, 0 1841; GFX9-PAL-NEXT: s_mov_b32 s1, s0 1842; GFX9-PAL-NEXT: s_mov_b32 s2, s0 1843; GFX9-PAL-NEXT: s_mov_b32 s3, s0 1844; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 1845; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 1846; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 1847; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 1848; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 1849; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1850; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 1851; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1852; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 1853; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1854; GFX9-PAL-NEXT: s_add_i32 vcc_hi, s32, 0x4010 1855; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1856; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1857; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 1858; 1859; GFX940-LABEL: zero_init_large_offset_foo: 1860; 
GFX940: ; %bb.0: 1861; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1862; GFX940-NEXT: scratch_load_dword v0, off, s32 offset:16 sc0 sc1 1863; GFX940-NEXT: s_waitcnt vmcnt(0) 1864; GFX940-NEXT: s_mov_b32 s0, 0 1865; GFX940-NEXT: s_mov_b32 s1, s0 1866; GFX940-NEXT: s_mov_b32 s2, s0 1867; GFX940-NEXT: s_mov_b32 s3, s0 1868; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] 1869; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3] 1870; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 1871; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 1872; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 1873; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16 1874; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 1875; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32 1876; GFX940-NEXT: s_add_i32 vcc_hi, s32, 0x4010 1877; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48 1878; GFX940-NEXT: s_waitcnt vmcnt(0) 1879; GFX940-NEXT: s_setpc_b64 s[30:31] 1880; 1881; GFX1010-PAL-LABEL: zero_init_large_offset_foo: 1882; GFX1010-PAL: ; %bb.0: 1883; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1884; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1885; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc 1886; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 1887; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 1888; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 1889; GFX1010-PAL-NEXT: s_mov_b32 s1, s0 1890; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 1891; GFX1010-PAL-NEXT: s_mov_b32 s3, s0 1892; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0 1893; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1 1894; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2 1895; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3 1896; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1897; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1898; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 1899; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1900; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1901; 
GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 1902; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1903; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 1904; GFX1010-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 1905; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1906; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1907; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 1908; 1909; GFX1030-PAL-LABEL: zero_init_large_offset_foo: 1910; GFX1030-PAL: ; %bb.0: 1911; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1912; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1913; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s32 offset:16 glc dlc 1914; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 1915; GFX1030-PAL-NEXT: s_mov_b32 s0, 0 1916; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 1917; GFX1030-PAL-NEXT: s_mov_b32 s1, s0 1918; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 1919; GFX1030-PAL-NEXT: s_mov_b32 s3, s0 1920; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0 1921; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1 1922; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2 1923; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3 1924; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo 1925; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 1926; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16 1927; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 1928; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32 1929; GFX1030-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 1930; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48 1931; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 1932; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 1933 %padding = alloca [4096 x i32], align 4, addrspace(5) 1934 %alloca = alloca [32 x i16], align 2, addrspace(5) 1935 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 1936 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 1937 %cast = bitcast [32 x i16] 
addrspace(5)* %alloca to i8 addrspace(5)* 1938 call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) 1939 ret void 1940} 1941 1942define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { 1943; GFX9-LABEL: store_load_sindex_large_offset_kernel: 1944; GFX9: ; %bb.0: ; %bb 1945; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 1946; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 1947; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 1948; GFX9-NEXT: s_mov_b32 vcc_hi, 0 1949; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1950; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 1951; GFX9-NEXT: s_lshl_b32 s1, s0, 2 1952; GFX9-NEXT: s_and_b32 s0, s0, 15 1953; GFX9-NEXT: v_mov_b32_e32 v0, 15 1954; GFX9-NEXT: s_addk_i32 s1, 0x4004 1955; GFX9-NEXT: s_lshl_b32 s0, s0, 2 1956; GFX9-NEXT: scratch_store_dword off, v0, s1 1957; GFX9-NEXT: s_waitcnt vmcnt(0) 1958; GFX9-NEXT: s_addk_i32 s0, 0x4004 1959; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 1960; GFX9-NEXT: s_waitcnt vmcnt(0) 1961; GFX9-NEXT: s_endpgm 1962; 1963; GFX10-LABEL: store_load_sindex_large_offset_kernel: 1964; GFX10: ; %bb.0: ; %bb 1965; GFX10-NEXT: s_add_u32 s2, s2, s5 1966; GFX10-NEXT: s_addc_u32 s3, s3, 0 1967; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 1968; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 1969; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 1970; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 1971; GFX10-NEXT: s_waitcnt vmcnt(0) 1972; GFX10-NEXT: v_mov_b32_e32 v0, 15 1973; GFX10-NEXT: s_waitcnt lgkmcnt(0) 1974; GFX10-NEXT: s_and_b32 s1, s0, 15 1975; GFX10-NEXT: s_lshl_b32 s0, s0, 2 1976; GFX10-NEXT: s_lshl_b32 s1, s1, 2 1977; GFX10-NEXT: s_addk_i32 s0, 0x4004 1978; GFX10-NEXT: s_addk_i32 s1, 0x4004 1979; GFX10-NEXT: scratch_store_dword off, v0, s0 1980; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 1981; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 1982; GFX10-NEXT: s_waitcnt vmcnt(0) 1983; GFX10-NEXT: 
s_endpgm 1984; 1985; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel: 1986; GFX9-PAL: ; %bb.0: ; %bb 1987; GFX9-PAL-NEXT: s_getpc_b64 s[4:5] 1988; GFX9-PAL-NEXT: s_mov_b32 s4, s0 1989; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1990; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 1991; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 1992; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 1993; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff 1994; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3 1995; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 1996; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 1997; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 1998; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 1999; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 2000; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 2001; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 2002; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 2003; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 2004; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2005; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 2006; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 2007; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2008; GFX9-PAL-NEXT: s_endpgm 2009; 2010; GFX940-LABEL: store_load_sindex_large_offset_kernel: 2011; GFX940: ; %bb.0: ; %bb 2012; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 2013; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 2014; GFX940-NEXT: s_waitcnt vmcnt(0) 2015; GFX940-NEXT: v_mov_b32_e32 v0, 15 2016; GFX940-NEXT: s_waitcnt lgkmcnt(0) 2017; GFX940-NEXT: s_lshl_b32 s1, s0, 2 2018; GFX940-NEXT: s_and_b32 s0, s0, 15 2019; GFX940-NEXT: s_addk_i32 s1, 0x4004 2020; GFX940-NEXT: s_lshl_b32 s0, s0, 2 2021; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 2022; GFX940-NEXT: s_waitcnt vmcnt(0) 2023; GFX940-NEXT: s_addk_i32 s0, 0x4004 2024; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 2025; GFX940-NEXT: s_waitcnt vmcnt(0) 2026; GFX940-NEXT: s_endpgm 2027; 2028; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel: 2029; GFX1010-PAL: ; %bb.0: ; %bb 2030; GFX1010-PAL-NEXT: 
s_getpc_b64 s[4:5] 2031; GFX1010-PAL-NEXT: s_mov_b32 s4, s0 2032; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2033; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2034; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2035; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3 2036; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0 2037; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 2038; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 2039; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2040; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2041; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 2042; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2043; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 2044; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2045; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 2046; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 2047; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 2048; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004 2049; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004 2050; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 2051; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2052; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2053; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2054; GFX1010-PAL-NEXT: s_endpgm 2055; 2056; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel: 2057; GFX1030-PAL: ; %bb.0: ; %bb 2058; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5] 2059; GFX1030-PAL-NEXT: s_mov_b32 s4, s0 2060; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 2061; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2062; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff 2063; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3 2064; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0 2065; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 2066; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 2067; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0 2068; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2069; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2070; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 2071; 
GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2072; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 2073; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 2074; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 2075; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 2076; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004 2077; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 2078; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2079; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2080; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2081; GFX1030-PAL-NEXT: s_endpgm 2082bb: 2083 %padding = alloca [4096 x i32], align 4, addrspace(5) 2084 %i = alloca [32 x float], align 4, addrspace(5) 2085 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2086 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2087 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2088 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 2089 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2090 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2091 %i9 = and i32 %idx, 15 2092 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2093 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 2094 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 2095 ret void 2096} 2097 2098define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { 2099; GFX9-LABEL: store_load_sindex_large_offset_foo: 2100; GFX9: ; %bb.0: ; %bb 2101; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 2102; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 2103; GFX9-NEXT: s_mov_b32 vcc_hi, 0 2104; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2105; GFX9-NEXT: s_waitcnt vmcnt(0) 2106; GFX9-NEXT: s_lshl_b32 s0, s2, 2 2107; GFX9-NEXT: s_addk_i32 s0, 0x4004 2108; GFX9-NEXT: v_mov_b32_e32 v0, 15 2109; GFX9-NEXT: scratch_store_dword off, v0, s0 2110; GFX9-NEXT: s_waitcnt vmcnt(0) 2111; GFX9-NEXT: 
s_and_b32 s0, s2, 15 2112; GFX9-NEXT: s_lshl_b32 s0, s0, 2 2113; GFX9-NEXT: s_addk_i32 s0, 0x4004 2114; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 2115; GFX9-NEXT: s_waitcnt vmcnt(0) 2116; GFX9-NEXT: s_endpgm 2117; 2118; GFX10-LABEL: store_load_sindex_large_offset_foo: 2119; GFX10: ; %bb.0: ; %bb 2120; GFX10-NEXT: s_add_u32 s0, s0, s3 2121; GFX10-NEXT: s_addc_u32 s1, s1, 0 2122; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 2123; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 2124; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2125; GFX10-NEXT: s_waitcnt vmcnt(0) 2126; GFX10-NEXT: v_mov_b32_e32 v0, 15 2127; GFX10-NEXT: s_and_b32 s0, s2, 15 2128; GFX10-NEXT: s_lshl_b32 s1, s2, 2 2129; GFX10-NEXT: s_lshl_b32 s0, s0, 2 2130; GFX10-NEXT: s_addk_i32 s1, 0x4004 2131; GFX10-NEXT: s_addk_i32 s0, 0x4004 2132; GFX10-NEXT: scratch_store_dword off, v0, s1 2133; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 2134; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc 2135; GFX10-NEXT: s_waitcnt vmcnt(0) 2136; GFX10-NEXT: s_endpgm 2137; 2138; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo: 2139; GFX9-PAL: ; %bb.0: ; %bb 2140; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 2141; GFX9-PAL-NEXT: s_mov_b32 s2, s0 2142; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2143; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 2144; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 2145; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2146; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1 2147; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 2148; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 2149; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2150; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2 2151; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15 2152; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004 2153; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15 2154; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2 2155; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1 2156; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2157; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 2158; 
GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc 2159; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 2160; GFX9-PAL-NEXT: s_endpgm 2161; 2162; GFX940-LABEL: store_load_sindex_large_offset_foo: 2163; GFX940: ; %bb.0: ; %bb 2164; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 2165; GFX940-NEXT: s_waitcnt vmcnt(0) 2166; GFX940-NEXT: s_lshl_b32 s1, s0, 2 2167; GFX940-NEXT: s_and_b32 s0, s0, 15 2168; GFX940-NEXT: s_addk_i32 s1, 0x4004 2169; GFX940-NEXT: v_mov_b32_e32 v0, 15 2170; GFX940-NEXT: s_lshl_b32 s0, s0, 2 2171; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1 2172; GFX940-NEXT: s_waitcnt vmcnt(0) 2173; GFX940-NEXT: s_addk_i32 s0, 0x4004 2174; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1 2175; GFX940-NEXT: s_waitcnt vmcnt(0) 2176; GFX940-NEXT: s_endpgm 2177; 2178; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo: 2179; GFX1010-PAL: ; %bb.0: ; %bb 2180; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] 2181; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 2182; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2183; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) 2184; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2185; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1 2186; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 2187; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2188; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2189; GFX1010-PAL-NEXT: s_mov_b32 vcc_lo, 0 2190; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15 2191; GFX1010-PAL-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 glc dlc 2192; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2193; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15 2194; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2 2195; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2 2196; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004 2197; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004 2198; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0 2199; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2200; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2201; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 2202; 
GFX1010-PAL-NEXT: s_endpgm 2203; 2204; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo: 2205; GFX1030-PAL: ; %bb.0: ; %bb 2206; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] 2207; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 2208; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 2209; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) 2210; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff 2211; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1 2212; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 2213; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 2214; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 2215; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 2216; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2217; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15 2218; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15 2219; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2 2220; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2 2221; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 2222; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004 2223; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0 2224; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 2225; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc 2226; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 2227; GFX1030-PAL-NEXT: s_endpgm 2228bb: 2229 %padding = alloca [4096 x i32], align 4, addrspace(5) 2230 %i = alloca [32 x float], align 4, addrspace(5) 2231 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 2232 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 2233 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 2234 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 2235 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 2236 store volatile i32 15, i32 addrspace(5)* %i8, align 4 2237 %i9 = and i32 %idx, 15 2238 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 2239 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 
; Last volatile reload (through %i11) of the function that starts above this chunk.
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Kernel: a volatile load touches the 4096 x i32 %padding alloca, then the
; [32 x float] alloca %i is written (15) at index workitem.id.x and read back
; at index 31 - workitem.id.x. Presumably %padding exists to push %i to a large
; scratch offset (the expected code addresses it at 0x4004) - confirm.
define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX9-LABEL: store_load_vindex_large_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v0
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_large_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX10-NEXT:    scratch_store_dword v1, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x4004, v0
; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_large_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4004
; GFX940-NEXT:    scratch_store_dword v0, v1, vcc_hi sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, vcc_lo offset:4 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Non-kernel function: same %padding / %i alloca pair, but the index comes from
; the %idx argument. 15 is stored at element %idx and the element at
; (%idx & 15) is loaded back, both volatile.
define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_large_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    scratch_store_dword v2, v3, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_large_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_large_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, vcc_hi sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX940-NEXT:    scratch_load_dword v0, v0, vcc_hi sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, vcc_lo
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Kernel: one 4096 x i32 alloca. 13 is stored at an undef index, then 15 is
; stored to and reloaded from constant element 4000 (byte offset 16000 within
; the alloca), all volatile.
define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-LABEL: store_load_large_imm_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_large_imm_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-PAL-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_large_imm_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    v_mov_b32_e32 v0, 13
; GFX940-NEXT:    scratch_store_dword off, v0, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX1010-PAL:       ; %bb.0: ; %bb
; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x3800
; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
; GFX1010-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, vcc_lo offset:4
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-PAL-NEXT:    s_endpgm
;
; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
; GFX1030-PAL:       ; %bb.0: ; %bb
; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x3800
; GFX1030-PAL-NEXT:    s_add_i32 s0, s0, 4
; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, off offset:4
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-PAL-NEXT:    s_endpgm
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

; Non-kernel function with the same IR body as the kernel above: volatile
; store of 13 at an undef index, then volatile store/load of element 4000 of a
; 4096 x i32 alloca.
define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 4
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s0, vcc_hi
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_large_imm_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 4
; GFX10-NEXT:    s_add_i32 s0, s0, vcc_lo
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 4
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_add_i32 s0, s0, vcc_hi
; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_large_imm_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 13
; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 4
; GFX10-PAL-NEXT:    s_add_i32 s0, s0, vcc_lo
; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

; Kernel: the element index into a [32 x i32] alloca is
; %sidx + workitem.id.x + 256. 15 is stored to that element and loaded back
; from the same pointer, both volatile.
define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-LABEL: store_load_vidx_sidx_offset:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vidx_sidx_offset:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX9-PAL-NEXT:    s_mov_b32 s4, s0
; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vidx_sidx_offset:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:1028 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1028 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 4
; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_endpgm
bb:
  %alloca = alloca [32 x i32], align 4, addrspace(5)
  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  %add1 = add nsw i32 %sidx, %vidx
  %add2 = add nsw i32 %add1, 256
  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  store volatile i32 15, i32 addrspace(5)* %gep, align 4
  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
  ret void
}

; Volatile i64 store (15) and reload through the addrspace(5) pointer
; argument, both marked align 8.
define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_aligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_aligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_aligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_aligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_mov_b32_e32 v3, 0
; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_aligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 8
  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
  ret void
}

; Same i64 store/reload as the aligned variant, but both accesses are marked
; align 1. The expected code still uses single dwordx2 scratch accesses.
define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-PAL-LABEL: store_load_i64_unaligned:
; GFX9-PAL:       ; %bb.0: ; %bb
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    v_mov_b32_e32 v3, 0
; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-PAL-LABEL: store_load_i64_unaligned:
; GFX10-PAL:       ; %bb.0: ; %bb
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}

define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
2988; GFX9: ; %bb.0: ; %bb 2989; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2990; GFX9-NEXT: v_mov_b32_e32 v1, 1 2991; GFX9-NEXT: v_mov_b32_e32 v2, 2 2992; GFX9-NEXT: v_mov_b32_e32 v3, 3 2993; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off 2994; GFX9-NEXT: s_waitcnt vmcnt(0) 2995; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc 2996; GFX9-NEXT: s_waitcnt vmcnt(0) 2997; GFX9-NEXT: s_setpc_b64 s[30:31] 2998; 2999; GFX10-LABEL: store_load_v3i32_unaligned: 3000; GFX10: ; %bb.0: ; %bb 3001; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3002; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3003; GFX10-NEXT: v_mov_b32_e32 v1, 1 3004; GFX10-NEXT: v_mov_b32_e32 v2, 2 3005; GFX10-NEXT: v_mov_b32_e32 v3, 3 3006; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off 3007; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3008; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc 3009; GFX10-NEXT: s_waitcnt vmcnt(0) 3010; GFX10-NEXT: s_setpc_b64 s[30:31] 3011; 3012; GFX9-PAL-LABEL: store_load_v3i32_unaligned: 3013; GFX9-PAL: ; %bb.0: ; %bb 3014; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3015; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 3016; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 3017; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 3018; GFX9-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off 3019; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3020; GFX9-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc 3021; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3022; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3023; 3024; GFX940-LABEL: store_load_v3i32_unaligned: 3025; GFX940: ; %bb.0: ; %bb 3026; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3027; GFX940-NEXT: v_mov_b32_e32 v2, 1 3028; GFX940-NEXT: v_mov_b32_e32 v3, 2 3029; GFX940-NEXT: v_mov_b32_e32 v4, 3 3030; GFX940-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 3031; GFX940-NEXT: s_waitcnt vmcnt(0) 3032; GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 3033; GFX940-NEXT: s_waitcnt vmcnt(0) 3034; GFX940-NEXT: s_setpc_b64 s[30:31] 3035; 3036; 
GFX10-PAL-LABEL: store_load_v3i32_unaligned: 3037; GFX10-PAL: ; %bb.0: ; %bb 3038; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3039; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3040; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 3041; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 3042; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 3043; GFX10-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off 3044; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3045; GFX10-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc 3046; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3047; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3048; GCN-LABEL: store_load_v3i32_unaligned: 3049; GCN: ; %bb.0: ; %bb 3050; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3051; GCN-NEXT: v_mov_b32_e32 v2, 1 3052; GCN-NEXT: v_mov_b32_e32 v3, 2 3053; GCN-NEXT: v_mov_b32_e32 v4, 3 3054; GCN-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 3055; GCN-NEXT: s_waitcnt vmcnt(0) 3056; GCN-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 3057; GCN-NEXT: s_waitcnt vmcnt(0) 3058; GCN-NEXT: s_setpc_b64 s[30:31] 3059bb: 3060 store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1 3061 %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1 3062 ret void 3063} 3064 3065define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) { 3066; GFX9-LABEL: store_load_v4i32_unaligned: 3067; GFX9: ; %bb.0: ; %bb 3068; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3069; GFX9-NEXT: v_mov_b32_e32 v1, 1 3070; GFX9-NEXT: v_mov_b32_e32 v2, 2 3071; GFX9-NEXT: v_mov_b32_e32 v3, 3 3072; GFX9-NEXT: v_mov_b32_e32 v4, 4 3073; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off 3074; GFX9-NEXT: s_waitcnt vmcnt(0) 3075; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc 3076; GFX9-NEXT: s_waitcnt vmcnt(0) 3077; GFX9-NEXT: s_setpc_b64 s[30:31] 3078; 3079; GFX10-LABEL: store_load_v4i32_unaligned: 3080; GFX10: ; %bb.0: ; %bb 3081; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3082; GFX10-NEXT: s_waitcnt_vscnt 
null, 0x0 3083; GFX10-NEXT: v_mov_b32_e32 v1, 1 3084; GFX10-NEXT: v_mov_b32_e32 v2, 2 3085; GFX10-NEXT: v_mov_b32_e32 v3, 3 3086; GFX10-NEXT: v_mov_b32_e32 v4, 4 3087; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off 3088; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3089; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc 3090; GFX10-NEXT: s_waitcnt vmcnt(0) 3091; GFX10-NEXT: s_setpc_b64 s[30:31] 3092; 3093; GFX9-PAL-LABEL: store_load_v4i32_unaligned: 3094; GFX9-PAL: ; %bb.0: ; %bb 3095; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3096; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 3097; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2 3098; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3 3099; GFX9-PAL-NEXT: v_mov_b32_e32 v4, 4 3100; GFX9-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off 3101; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3102; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc 3103; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3104; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3105; 3106; GFX940-LABEL: store_load_v4i32_unaligned: 3107; GFX940: ; %bb.0: ; %bb 3108; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3109; GFX940-NEXT: v_mov_b32_e32 v2, 1 3110; GFX940-NEXT: v_mov_b32_e32 v3, 2 3111; GFX940-NEXT: v_mov_b32_e32 v4, 3 3112; GFX940-NEXT: v_mov_b32_e32 v5, 4 3113; GFX940-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 3114; GFX940-NEXT: s_waitcnt vmcnt(0) 3115; GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 3116; GFX940-NEXT: s_waitcnt vmcnt(0) 3117; GFX940-NEXT: s_setpc_b64 s[30:31] 3118; 3119; GFX10-PAL-LABEL: store_load_v4i32_unaligned: 3120; GFX10-PAL: ; %bb.0: ; %bb 3121; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3122; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3123; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1 3124; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2 3125; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3 3126; GFX10-PAL-NEXT: v_mov_b32_e32 v4, 4 3127; GFX10-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off 3128; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3129; GFX10-PAL-NEXT: 
scratch_load_dwordx4 v[0:3], v0, off glc dlc 3130; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3131; GFX10-PAL-NEXT: s_setpc_b64 s[30:31] 3132; GCN-LABEL: store_load_v4i32_unaligned: 3133; GCN: ; %bb.0: ; %bb 3134; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3135; GCN-NEXT: v_mov_b32_e32 v2, 1 3136; GCN-NEXT: v_mov_b32_e32 v3, 2 3137; GCN-NEXT: v_mov_b32_e32 v4, 3 3138; GCN-NEXT: v_mov_b32_e32 v5, 4 3139; GCN-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 3140; GCN-NEXT: s_waitcnt vmcnt(0) 3141; GCN-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 3142; GCN-NEXT: s_waitcnt vmcnt(0) 3143; GCN-NEXT: s_setpc_b64 s[30:31] 3144bb: 3145 store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1 3146 %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1 3147 ret void 3148} 3149 3150define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) { 3151; GFX9-LABEL: store_load_i32_negative_unaligned: 3152; GFX9: ; %bb.0: ; %bb 3153; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3154; GFX9-NEXT: v_add_u32_e32 v0, -1, v0 3155; GFX9-NEXT: v_mov_b32_e32 v1, 1 3156; GFX9-NEXT: scratch_store_byte v0, v1, off 3157; GFX9-NEXT: s_waitcnt vmcnt(0) 3158; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc 3159; GFX9-NEXT: s_waitcnt vmcnt(0) 3160; GFX9-NEXT: s_setpc_b64 s[30:31] 3161; 3162; GFX10-LABEL: store_load_i32_negative_unaligned: 3163; GFX10: ; %bb.0: ; %bb 3164; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3165; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3166; GFX10-NEXT: v_mov_b32_e32 v1, 1 3167; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-1 3168; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3169; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc 3170; GFX10-NEXT: s_waitcnt vmcnt(0) 3171; GFX10-NEXT: s_setpc_b64 s[30:31] 3172; 3173; GFX9-PAL-LABEL: store_load_i32_negative_unaligned: 3174; GFX9-PAL: ; %bb.0: ; %bb 3175; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3176; GFX9-PAL-NEXT: 
v_add_u32_e32 v0, -1, v0 3177; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 3178; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off 3179; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3180; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc 3181; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3182; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3183; 3184; GFX940-LABEL: store_load_i32_negative_unaligned: 3185; GFX940: ; %bb.0: ; %bb 3186; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3187; GFX940-NEXT: v_add_u32_e32 v0, -1, v0 3188; GFX940-NEXT: v_mov_b32_e32 v1, 1 3189; GFX940-NEXT: scratch_store_byte v0, v1, off sc0 sc1 3190; GFX940-NEXT: s_waitcnt vmcnt(0) 3191; GFX940-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1 3192; GFX940-NEXT: s_waitcnt vmcnt(0) 3193; GFX940-NEXT: s_setpc_b64 s[30:31] 3194; 3195; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned: 3196; GFX1010-PAL: ; %bb.0: ; %bb 3197; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3198; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3199; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, -1, v0 3200; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1 3201; GFX1010-PAL-NEXT: scratch_store_byte v0, v1, off 3202; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3203; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off glc dlc 3204; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 3205; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 3206; 3207; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned: 3208; GFX1030-PAL: ; %bb.0: ; %bb 3209; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3210; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3211; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1 3212; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-1 3213; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3214; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-1 glc dlc 3215; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 3216; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 3217bb: 3218 %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1 3219 store volatile i8 1, i8 addrspace(5)* %ptr, align 1 3220 
%load = load volatile i8, i8 addrspace(5)* %ptr, align 1 3221 ret void 3222} 3223 3224define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) { 3225; GFX9-LABEL: store_load_i32_large_negative_unaligned: 3226; GFX9: ; %bb.0: ; %bb 3227; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3228; GFX9-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 3229; GFX9-NEXT: v_mov_b32_e32 v1, 1 3230; GFX9-NEXT: scratch_store_byte v0, v1, off 3231; GFX9-NEXT: s_waitcnt vmcnt(0) 3232; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc 3233; GFX9-NEXT: s_waitcnt vmcnt(0) 3234; GFX9-NEXT: s_setpc_b64 s[30:31] 3235; 3236; GFX10-LABEL: store_load_i32_large_negative_unaligned: 3237; GFX10: ; %bb.0: ; %bb 3238; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3239; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3240; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 3241; GFX10-NEXT: v_mov_b32_e32 v1, 1 3242; GFX10-NEXT: scratch_store_byte v0, v1, off offset:-129 3243; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3244; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc dlc 3245; GFX10-NEXT: s_waitcnt vmcnt(0) 3246; GFX10-NEXT: s_setpc_b64 s[30:31] 3247; 3248; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned: 3249; GFX9-PAL: ; %bb.0: ; %bb 3250; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3251; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0 3252; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1 3253; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off 3254; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3255; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc 3256; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3257; GFX9-PAL-NEXT: s_setpc_b64 s[30:31] 3258; 3259; GFX940-LABEL: store_load_i32_large_negative_unaligned: 3260; GFX940: ; %bb.0: ; %bb 3261; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3262; GFX940-NEXT: s_movk_i32 s0, 0xef7f 3263; GFX940-NEXT: v_mov_b32_e32 v1, 1 3264; GFX940-NEXT: scratch_store_byte v0, v1, s0 sc0 sc1 3265; GFX940-NEXT: s_waitcnt vmcnt(0) 3266; GFX940-NEXT: 
scratch_load_ubyte v0, v0, s0 sc0 sc1 3267; GFX940-NEXT: s_waitcnt vmcnt(0) 3268; GFX940-NEXT: s_setpc_b64 s[30:31] 3269; 3270; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned: 3271; GFX1010-PAL: ; %bb.0: ; %bb 3272; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3273; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3274; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, 0xffffefff, v0 3275; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 1 3276; GFX1010-PAL-NEXT: scratch_store_byte v0, v1, off offset:-128 3277; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3278; GFX1010-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-128 glc dlc 3279; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) 3280; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31] 3281; 3282; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned: 3283; GFX1030-PAL: ; %bb.0: ; %bb 3284; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3285; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3286; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 3287; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 1 3288; GFX1030-PAL-NEXT: scratch_store_byte v0, v1, off offset:-129 3289; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3290; GFX1030-PAL-NEXT: scratch_load_ubyte v0, v0, off offset:-129 glc dlc 3291; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) 3292; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31] 3293bb: 3294 %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225 3295 store volatile i8 1, i8 addrspace(5)* %ptr, align 1 3296 %load = load volatile i8, i8 addrspace(5)* %ptr, align 1 3297 ret void 3298} 3299 3300define amdgpu_ps void @large_offset() { 3301; GFX9-LABEL: large_offset: 3302; GFX9: ; %bb.0: ; %bb 3303; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s2 3304; GFX9-NEXT: v_mov_b32_e32 v0, 0 3305; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 3306; GFX9-NEXT: v_mov_b32_e32 v1, v0 3307; GFX9-NEXT: v_mov_b32_e32 v2, v0 3308; GFX9-NEXT: v_mov_b32_e32 v3, v0 3309; GFX9-NEXT: s_mov_b32 vcc_hi, 0 3310; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi 
offset:3024 3311; GFX9-NEXT: s_waitcnt vmcnt(0) 3312; GFX9-NEXT: s_mov_b32 vcc_hi, 0 3313; GFX9-NEXT: scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc 3314; GFX9-NEXT: s_waitcnt vmcnt(0) 3315; GFX9-NEXT: v_mov_b32_e32 v0, 16 3316; GFX9-NEXT: ;;#ASMSTART 3317; GFX9-NEXT: ; use v0 3318; GFX9-NEXT: ;;#ASMEND 3319; GFX9-NEXT: v_mov_b32_e32 v0, 0x810 3320; GFX9-NEXT: ;;#ASMSTART 3321; GFX9-NEXT: ; use v0 3322; GFX9-NEXT: ;;#ASMEND 3323; GFX9-NEXT: s_endpgm 3324; 3325; GFX10-LABEL: large_offset: 3326; GFX10: ; %bb.0: ; %bb 3327; GFX10-NEXT: s_add_u32 s0, s0, s2 3328; GFX10-NEXT: s_addc_u32 s1, s1, 0 3329; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 3330; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 3331; GFX10-NEXT: v_mov_b32_e32 v0, 0 3332; GFX10-NEXT: s_movk_i32 s0, 0x810 3333; GFX10-NEXT: s_addk_i32 s0, 0x3c0 3334; GFX10-NEXT: v_mov_b32_e32 v1, v0 3335; GFX10-NEXT: v_mov_b32_e32 v2, v0 3336; GFX10-NEXT: v_mov_b32_e32 v3, v0 3337; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 3338; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 3339; GFX10-NEXT: scratch_load_dwordx4 v[0:3], off, s0 glc dlc 3340; GFX10-NEXT: s_waitcnt vmcnt(0) 3341; GFX10-NEXT: v_mov_b32_e32 v0, 16 3342; GFX10-NEXT: v_mov_b32_e32 v1, 0x810 3343; GFX10-NEXT: ;;#ASMSTART 3344; GFX10-NEXT: ; use v0 3345; GFX10-NEXT: ;;#ASMEND 3346; GFX10-NEXT: ;;#ASMSTART 3347; GFX10-NEXT: ; use v1 3348; GFX10-NEXT: ;;#ASMEND 3349; GFX10-NEXT: s_endpgm 3350; 3351; GFX9-PAL-LABEL: large_offset: 3352; GFX9-PAL: ; %bb.0: ; %bb 3353; GFX9-PAL-NEXT: s_getpc_b64 s[2:3] 3354; GFX9-PAL-NEXT: s_mov_b32 s2, s0 3355; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 3356; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0 3357; GFX9-PAL-NEXT: v_mov_b32_e32 v1, v0 3358; GFX9-PAL-NEXT: v_mov_b32_e32 v2, v0 3359; GFX9-PAL-NEXT: v_mov_b32_e32 v3, v0 3360; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) 3361; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff 3362; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0 3363; GFX9-PAL-NEXT: s_addc_u32 
flat_scratch_hi, s3, 0 3364; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 3365; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:3024 3366; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3367; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 3368; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, vcc_hi offset:3024 glc 3369; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) 3370; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 16 3371; GFX9-PAL-NEXT: ;;#ASMSTART 3372; GFX9-PAL-NEXT: ; use v0 3373; GFX9-PAL-NEXT: ;;#ASMEND 3374; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x810 3375; GFX9-PAL-NEXT: ;;#ASMSTART 3376; GFX9-PAL-NEXT: ; use v0 3377; GFX9-PAL-NEXT: ;;#ASMEND 3378; GFX9-PAL-NEXT: s_endpgm 3379; 3380; GFX940-LABEL: large_offset: 3381; GFX940: ; %bb.0: ; %bb 3382; GFX940-NEXT: v_mov_b32_e32 v0, 0 3383; GFX940-NEXT: v_mov_b32_e32 v1, v0 3384; GFX940-NEXT: v_mov_b32_e32 v2, v0 3385; GFX940-NEXT: v_mov_b32_e32 v3, v0 3386; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1 3387; GFX940-NEXT: s_waitcnt vmcnt(0) 3388; GFX940-NEXT: scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1 3389; GFX940-NEXT: s_waitcnt vmcnt(0) 3390; GFX940-NEXT: v_mov_b32_e32 v0, 16 3391; GFX940-NEXT: ;;#ASMSTART 3392; GFX940-NEXT: ; use v0 3393; GFX940-NEXT: ;;#ASMEND 3394; GFX940-NEXT: v_mov_b32_e32 v0, 0x810 3395; GFX940-NEXT: ;;#ASMSTART 3396; GFX940-NEXT: ; use v0 3397; GFX940-NEXT: ;;#ASMEND 3398; GFX940-NEXT: s_endpgm 3399; 3400; GFX10-PAL-LABEL: large_offset: 3401; GFX10-PAL: ; %bb.0: ; %bb 3402; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] 3403; GFX10-PAL-NEXT: s_mov_b32 s2, s0 3404; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 3405; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) 3406; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff 3407; GFX10-PAL-NEXT: s_add_u32 s2, s2, s0 3408; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 3409; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 3410; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 3411; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 0 3412; GFX10-PAL-NEXT: s_movk_i32 s0, 0x810 3413; 
GFX10-PAL-NEXT: s_addk_i32 s0, 0x3c0 3414; GFX10-PAL-NEXT: v_mov_b32_e32 v1, v0 3415; GFX10-PAL-NEXT: v_mov_b32_e32 v2, v0 3416; GFX10-PAL-NEXT: v_mov_b32_e32 v3, v0 3417; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 3418; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 3419; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s0 glc dlc 3420; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) 3421; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 16 3422; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x810 3423; GFX10-PAL-NEXT: ;;#ASMSTART 3424; GFX10-PAL-NEXT: ; use v0 3425; GFX10-PAL-NEXT: ;;#ASMEND 3426; GFX10-PAL-NEXT: ;;#ASMSTART 3427; GFX10-PAL-NEXT: ; use v1 3428; GFX10-PAL-NEXT: ;;#ASMEND 3429; GFX10-PAL-NEXT: s_endpgm 3430bb: 3431 %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5) 3432 %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5) 3433 %gep = getelementptr inbounds [128 x <4 x i32>], [128 x <4 x i32>] addrspace(5)* %alloca2, i32 0, i32 60 3434 store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(5)* %gep, align 16 3435 %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %gep, align 16 3436 call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca) #0 3437 call void asm sideeffect "; use $0", "s"([128 x <4 x i32>] addrspace(5)* %alloca2) #0 3438 ret void 3439} 3440 3441declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg) 3442declare i32 @llvm.amdgcn.workitem.id.x() 3443