; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s

; Checks kernel prologue code generation across three GPU generations:
; flat-scratch register initialization, scratch-wave-offset setup for stack
; objects (alloca), SP/FP (s32/s33) initialization around calls, and SGPR-held
; scratch offsets when a spill offset does not fit in the instruction encoding.

define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_empty:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_empty:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_empty:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_endpgm
entry:
  ret void
}

define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_stack:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_stack:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}

define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    s_getpc_b64 s[4:5]
; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT:    s_mov_b32 s32, 0
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_getpc_b64 s[4:5]
; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_mov_b32 s32, 0
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_getpc_b64 s[4:5]
; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX1010-NEXT:    s_endpgm
entry:
  tail call void @ex() #0
  ret void
}

define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    s_getpc_b64 s[4:5]
; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT:    s_movk_i32 s32, 0x400
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_getpc_b64 s[4:5]
; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT:    s_movk_i32 s32, 0x400
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_stack_and_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_movk_i32 s32, 0x200
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_getpc_b64 s[4:5]
; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX1010-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #0
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_empty:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_empty:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_empty:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_endpgm
entry:
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    s_getpc_b64 s[4:5]
; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT:    s_mov_b32 s32, 0
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_getpc_b64 s[4:5]
; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_mov_b32 s32, 0
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_getpc_b64 s[4:5]
; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX1010-NEXT:    s_endpgm
entry:
  tail call void @ex() #2
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    s_getpc_b64 s[4:5]
; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX803-NEXT:    s_movk_i32 s32, 0x400
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    s_getpc_b64 s[4:5]
; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX900-NEXT:    s_movk_i32 s32, 0x400
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_movk_i32 s32, 0x200
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_getpc_b64 s[4:5]
; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX1010-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #2
  ret void
}

define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX803-LABEL: test_sgpr_offset_kernel:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_mov_b32 s4, 0x40000
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    s_mov_b32 s4, 0x40000
; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_sgpr_offset_kernel:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_mov_b32 s6, 0x40000
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s6, 0x40000
; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_sgpr_offset_kernel:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s6, 0x20000
; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX1010-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-NEXT:    s_mov_b32 s6, 0x20000
; GFX1010-NEXT:    ;;#ASMSTART
; GFX1010-NEXT:    ;;#ASMEND
; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; NOTE(review): the two CHECK: lines below use a bare "CHECK" prefix, but no
  ; RUN: line in this file passes --check-prefix=CHECK, so FileCheck never
  ; evaluates them; the autogenerated GFX* assertions above cover the same
  ; behavior. Kept as documentation — confirm before relying on them.
  ; CHECK: s_add_u32 s6, s7, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

declare hidden void @ex() local_unnamed_addr #0

attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
attributes #2 = { nounwind "frame-pointer"="all" }