; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s

define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_empty:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_empty:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_kern_empty:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_endpgm
entry:
  ret void
}

define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_mov_b32_e32 v0, 0
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_stack:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_kern_stack:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: v_mov_b32_e32 v0, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}

define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_kern_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_mov_b32 s32, 0
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
entry:
  tail call void @ex() #0
  ret void
}

define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: v_mov_b32_e32 v3, 0
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_movk_i32 s32, 0x400
; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_movk_i32 s32, 0x400
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_kern_stack_and_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_movk_i32 s32, 0x200
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #0
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_empty:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_empty:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_empty:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_endpgm
entry:
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: v_mov_b32_e32 v0, 0
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: v_mov_b32_e32 v0, 0
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: v_mov_b32_e32 v0, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_mov_b32 s32, 0
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_mov_b32 s32, 0
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_mov_b32 s32, 0
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
entry:
  tail call void @ex() #2
  ret void
}

define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_i32 s12, s12, s17
; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT: s_add_u32 s0, s0, s17
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
; GFX803-NEXT: s_mov_b32 s33, 0
; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: s_mov_b32 s13, s15
; GFX803-NEXT: s_mov_b32 s12, s14
; GFX803-NEXT: v_mov_b32_e32 v3, 0
; GFX803-NEXT: v_or_b32_e32 v31, v0, v2
; GFX803-NEXT: s_mov_b32 s14, s16
; GFX803-NEXT: s_movk_i32 s32, 0x400
; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_getpc_b64 s[18:19]
; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_add_u32 s0, s0, s17
; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT: s_mov_b32 s33, 0
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: s_mov_b32 s13, s15
; GFX900-NEXT: s_mov_b32 s12, s14
; GFX900-NEXT: v_mov_b32_e32 v3, 0
; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT: s_mov_b32 s14, s16
; GFX900-NEXT: s_movk_i32 s32, 0x400
; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_getpc_b64 s[18:19]
; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s12, s12, s17
; GFX1010-NEXT: s_movk_i32 s32, 0x200
; GFX1010-NEXT: s_mov_b32 s33, 0
; GFX1010-NEXT: s_addc_u32 s13, s13, 0
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT: v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT: v_mov_b32_e32 v3, 0
; GFX1010-NEXT: s_add_u32 s0, s0, s17
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s13, s15
; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT: s_mov_b32 s12, s14
; GFX1010-NEXT: s_mov_b32 s14, s16
; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_getpc_b64 s[18:19]
; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT: s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #2
  ret void
}

define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX803-LABEL: test_sgpr_offset_kernel:
; GFX803: ; %bb.0: ; %entry
; GFX803-NEXT: s_add_u32 s0, s0, s7
; GFX803-NEXT: s_addc_u32 s1, s1, 0
; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_mov_b32 s4, 0x40000
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX803-NEXT: ;;#ASMSTART
; GFX803-NEXT: ;;#ASMEND
; GFX803-NEXT: s_mov_b32 s4, 0x40000
; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX803-NEXT: s_waitcnt vmcnt(0)
; GFX803-NEXT: s_endpgm
;
; GFX900-LABEL: test_sgpr_offset_kernel:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_add_u32 s0, s0, s7
; GFX900-NEXT: s_addc_u32 s1, s1, 0
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_mov_b32 s4, 0x40000
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX900-NEXT: ;;#ASMSTART
; GFX900-NEXT: ;;#ASMEND
; GFX900-NEXT: s_mov_b32 s4, 0x40000
; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX900-NEXT: s_waitcnt vmcnt(0)
; GFX900-NEXT: s_endpgm
;
; GFX1010-LABEL: test_sgpr_offset_kernel:
; GFX1010: ; %bb.0: ; %entry
; GFX1010-NEXT: s_add_u32 s0, s0, s7
; GFX1010-NEXT: s_addc_u32 s1, s1, 0
; GFX1010-NEXT: s_mov_b32 s4, 0x20000
; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
; GFX1010-NEXT: s_waitcnt vmcnt(0)
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
; GFX1010-NEXT: s_mov_b32 s4, 0x20000
; GFX1010-NEXT: ;;#ASMSTART
; GFX1010-NEXT: ;;#ASMEND
; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX1010-NEXT: s_waitcnt vmcnt(0)
; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT: s_endpgm
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; CHECK: s_add_u32 s6, s7, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

declare hidden void @ex() local_unnamed_addr #0

attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
attributes #2 = { nounwind "frame-pointer"="all" }