; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX1100 %s

; Kernel prologue and stack setup for amdgpu_kernel functions, checked on four
; subtargets. The first four kernels use attribute set #0; the test_force_fp_*
; kernels repeat the same bodies with "frame-pointer"="all" (attribute set #2),
; where the expected output additionally initializes s33 to 0.

; Empty kernel: only s_endpgm is expected.
define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_empty:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_empty:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_empty:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_kern_empty:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    s_endpgm
entry:
  ret void
}

; Kernel with one volatile store to a private (addrspace(5)) alloca, no call.
define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_stack:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_stack:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_kern_stack:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v0, 0
; GFX1100-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}

; Kernel that only calls the external function @ex; no stack object.
define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_mov_b32 s32, 0
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_mov_b32 s32, 0
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_kern_call:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
; GFX1100-NEXT:    s_mov_b32 s12, s13
; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT:    s_mov_b32 s13, s14
; GFX1100-NEXT:    s_mov_b32 s14, s15
; GFX1100-NEXT:    s_mov_b32 s32, 0
; GFX1100-NEXT:    s_getpc_b64 s[16:17]
; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT:    s_endpgm

entry:
  tail call void @ex() #0
  ret void
}

; Kernel with both a volatile private store and a call to @ex.
define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    v_mov_b32_e32 v3, 0
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_movk_i32 s32, 0x400
; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_movk_i32 s32, 0x400
; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_stack_and_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_movk_i32 s32, 0x200
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_kern_stack_and_call:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
; GFX1100-NEXT:    s_mov_b32 s12, s13
; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT:    s_mov_b32 s13, s14
; GFX1100-NEXT:    s_mov_b32 s14, s15
; GFX1100-NEXT:    s_mov_b32 s32, 16
; GFX1100-NEXT:    scratch_store_b32 off, v1, off offset:4 dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_getpc_b64 s[16:17]
; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT:    s_endpgm

entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #0
  ret void
}

; Same body as test_kern_empty, but with "frame-pointer"="all".
define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_empty:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_empty:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_empty:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_empty:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    s_mov_b32 s33, 0
; GFX1100-NEXT:    s_endpgm

entry:
  ret void
}

; Same body as test_kern_stack, but with "frame-pointer"="all"; the expected
; store addresses the slot relative to s33.
define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_stack:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v0, 0
; GFX1100-NEXT:    s_mov_b32 s33, 0
; GFX1100-NEXT:    scratch_store_b32 off, v0, s33 offset:4 dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}

; Same body as test_kern_call, but with "frame-pointer"="all".
define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_mov_b32 s32, 0
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_mov_b32 s32, 0
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_call:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
; GFX1100-NEXT:    s_mov_b32 s12, s13
; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT:    s_mov_b32 s13, s14
; GFX1100-NEXT:    s_mov_b32 s14, s15
; GFX1100-NEXT:    s_mov_b32 s32, 0
; GFX1100-NEXT:    s_mov_b32 s33, 0
; GFX1100-NEXT:    s_getpc_b64 s[16:17]
; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT:    s_endpgm
entry:
  tail call void @ex() #2
  ret void
}

; Same body as test_kern_stack_and_call, but with "frame-pointer"="all".
define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    v_mov_b32_e32 v3, 0
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_movk_i32 s32, 0x400
; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_movk_i32 s32, 0x400
; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_movk_i32 s32, 0x200
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_stack_and_call:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
; GFX1100-NEXT:    s_mov_b32 s33, 0
; GFX1100-NEXT:    s_mov_b32 s12, s13
; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT:    s_mov_b32 s13, s14
; GFX1100-NEXT:    s_mov_b32 s14, s15
; GFX1100-NEXT:    s_mov_b32 s32, 16
; GFX1100-NEXT:    scratch_store_b32 off, v1, s33 offset:4 dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_getpc_b64 s[16:17]
; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #2
  ret void
}

; Kernel limited to 8 VGPRs ("amdgpu-num-vgpr"="8") whose loaded value must be
; spilled across an inline asm that clobbers v0-v7. A 4092-byte alloca pushes
; the spill slot out far enough that the expected code materializes the spill
; offset in an SGPR instead of using an immediate offset.
define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX803-LABEL: test_sgpr_offset_kernel:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_mov_b32 s4, 0x40000
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    s_mov_b32 s4, 0x40000
; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_sgpr_offset_kernel:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_mov_b32 s4, 0x40000
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x40000
; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_sgpr_offset_kernel:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s4, 0x20000
; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX1010-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-NEXT:    s_mov_b32 s4, 0x20000
; GFX1010-NEXT:    ;;#ASMSTART
; GFX1010-NEXT:    ;;#ASMEND
; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_sgpr_offset_kernel:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    scratch_load_b32 v0, off, off offset:8 glc dlc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    s_movk_i32 s0, 0x1000
; GFX1100-NEXT:    scratch_store_b32 off, v0, s0 ; 4-byte Folded Spill
; GFX1100-NEXT:    s_movk_i32 s0, 0x1000
; GFX1100-NEXT:    ;;#ASMSTART
; GFX1100-NEXT:    ;;#ASMEND
; GFX1100-NEXT:    scratch_load_b32 v0, off, s0 ; 4-byte Folded Reload
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    scratch_store_b32 off, v0, off offset:8 dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT:    s_endpgm
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; The autogenerated checks above hold the spill offset in an SGPR:
  ;   0x40000 / 64 = 4096 (for wave64) in the GFX803 and GFX900 checks,
  ;   0x20000 / 32 = 4096 in the GFX1010 checks,
  ;   0x1000       = 4096 in the GFX1100 checks.
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

declare hidden void @ex() local_unnamed_addr #0

attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
attributes #2 = { nounwind "frame-pointer"="all" }