; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s

; Checks kernel prologue codegen across gfx803/gfx900/gfx1010: flat-scratch
; initialization, scratch (private segment) setup for stack objects and calls,
; the forced frame pointer ("frame-pointer"="all", attrs #2), and SGPR-held
; scratch offsets when a spill offset does not fit in the instruction.

define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_empty:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_empty:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_kern_empty:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_endpgm
entry:
  ret void
}

define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s4, s4, s7
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_mov_b32_e32 v0, 0
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_stack:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_kern_stack:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s4, s4, s7
; GFX1010-NEXT: s_addc_u32 s5, s5, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT: v_mov_b32_e32 v0, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}

define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_kern_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_mov_b32 s32, 0
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
entry:
  tail call void @ex() #0
  ret void
}

define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: v_mov_b32_e32 v3, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_movk_i32 s32, 0x400
; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_movk_i32 s32, 0x400
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_kern_stack_and_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_movk_i32 s32, 0x200
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #0
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_empty:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_empty:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_empty:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_endpgm
entry:
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s4, s4, s7
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_mov_b32_e32 v0, 0
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s4, s4, s7
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s5, s5, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT: v_mov_b32_e32 v0, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_mov_b32 s32, 0
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
entry:
  tail call void @ex() #2
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: v_mov_b32_e32 v3, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_movk_i32 s32, 0x400
; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_movk_i32 s32, 0x400
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_movk_i32 s32, 0x200
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #2
  ret void
}

define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX803-LABEL: test_sgpr_offset_kernel:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s4, s4, s7
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_mov_b32 s4, 0x40000
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX803-NEXT: ;;#ASMSTART
; GFX803-NEXT: ;;#ASMEND
; GFX803-NEXT: s_mov_b32 s4, 0x40000
; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_sgpr_offset_kernel:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_mov_b32 s6, 0x40000
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s6, 0x40000
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_sgpr_offset_kernel:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s4, s4, s7
; GFX1010-NEXT: s_addc_u32 s5, s5, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s6, 0x20000
; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
; GFX1010-NEXT: s_waitcnt vmcnt(0)
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
; GFX1010-NEXT: s_mov_b32 s6, 0x20000
; GFX1010-NEXT: ;;#ASMSTART
; GFX1010-NEXT: ;;#ASMEND
; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX1010-NEXT: s_waitcnt vmcnt(0)
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_endpgm
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; CHECK: s_add_u32 s6, s7, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

declare hidden void @ex() local_unnamed_addr #0

attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
attributes #2 = { nounwind "frame-pointer"="all" }