; RUN: llc -O0 -mtriple=amdgcn--amdhsa -march=amdgcn --amdhsa-code-object-version=2 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VMEM -check-prefix=GCN %s
; RUN: llc -O0 -mtriple=amdgcn--amdhsa -march=amdgcn --amdhsa-code-object-version=2 -amdgpu-spill-sgpr-to-vgpr=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VGPR -check-prefix=GCN %s
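; The two RUN lines cover the two SGPR spilling paths: with
; -amdgpu-spill-sgpr-to-vgpr=1 (VGPR prefix) the saved exec mask is kept
; resident in VGPR lanes via v_writelane_b32/v_readlane_b32; with
; -amdgpu-spill-sgpr-to-vgpr=0 (VMEM prefix) it is staged through a VGPR
; and then spilled to scratch memory with buffer_store_dword.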

; Verify registers used for tracking exec mask changes when all
; registers are spilled at the end of the block. The SGPR spill
; placement relative to the exec modifications is important.

; FIXME: This checks with SGPR to VGPR spilling disabled, but this may
; not work correctly in cases where no workitems take a branch.


; GCN-LABEL: {{^}}divergent_if_endif:
; VGPR: workitem_private_segment_byte_size = 12{{$}}


; GCN: {{^}}; %bb.0:
; GCN: s_mov_b32 m0, -1
; GCN: ds_read_b32 [[LOAD0:v[0-9]+]]

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v0, s{{[0-9]+}}

; Spill saved exec
; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]], [[CMP0]]
; GCN: s_mov_b64 exec, s[[[ANDEXEC_LO]]:[[ANDEXEC_HI]]]

; GCN: s_cbranch_execz [[ENDIF:.LBB[0-9]+_[0-9]+]]

; GCN: ; %bb.{{[0-9]+}}: ; %if
; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: s_mov_b32 m0, -1
; GCN: ds_read_b32 [[LOAD1:v[0-9]+]]
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)


; Spill val register
; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[LOAD1]], [[RELOAD_LOAD0]]
; GCN: buffer_store_dword [[VAL]], off, s[0:3], 0 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: [[ENDIF]]:

; Reload and restore exec mask
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1

; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]

; Restore val
; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %load0 = load volatile i32, i32 addrspace(3)* undef
  %cmp0 = icmp eq i32 %tid, 0
  br i1 %cmp0, label %if, label %endif

if:
  %load1 = load volatile i32, i32 addrspace(3)* undef
  %val = add i32 %load0, %load1
  br label %endif

endif:
  %tmp4 = phi i32 [ %val, %if ], [ 0, %entry ]
  store i32 %tmp4, i32 addrspace(1)* %out
  ret void
}

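; Same save/spill pattern around a divergent loop: both the value spilled
; inside the loop body and the saved exec mask must survive until the loop
; exits, where the mask is reloaded and exec restored with s_or_b64.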
; GCN-LABEL: {{^}}divergent_loop:
; VGPR: workitem_private_segment_byte_size = 16{{$}}

; GCN: {{^}}; %bb.0:
; GCN-DAG: s_mov_b32 m0, -1
; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}}
; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]]
; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v0, s{{[0-9]+}}

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec

; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill


; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]]
; GCN: s_mov_b64 exec, s[[[ANDEXEC_LO]]:[[ANDEXEC_HI]]]
; GCN-NEXT: s_cbranch_execz [[END:.LBB[0-9]+_[0-9]+]]


; GCN: [[LOOP:.LBB[0-9]+_[0-9]+]]:
; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
; GCN: s_cmp_lg_u32
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; VMEM: buffer_store_dword
; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN-NEXT: s_cbranch_scc1 [[LOOP]]

; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: [[END]]:
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1

; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %load0 = load volatile i32, i32 addrspace(3)* null
  %cmp0 = icmp eq i32 %tid, 0
  br i1 %cmp0, label %loop, label %end

loop:
  %i = phi i32 [ %i.inc, %loop ], [ 0, %entry ]
  %val = phi i32 [ %val.sub, %loop ], [ %load0, %entry ]
  %load1 = load volatile i32, i32 addrspace(3)* undef
  %i.inc = add i32 %i, 1
  %val.sub = sub i32 %val, %load1
  %cmp1 = icmp ne i32 %i, 256
  br i1 %cmp1, label %loop, label %end

end:
  %tmp4 = phi i32 [ %val.sub, %loop ], [ 0, %entry ]
  store i32 %tmp4, i32 addrspace(1)* %out
  ret void
}

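; Check exec mask handling across an if/else diamond: the mask saved at the
; branch is reloaded in the %Flow block, recombined with
; s_or_saveexec_b64/s_xor_b64 to switch execution to the else region, and
; the intermediate mask is spilled again before being restored with
; s_or_b64 at the endif.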
; GCN-LABEL: {{^}}divergent_if_else_endif:
; GCN: {{^}}; %bb.0:

; GCN-DAG: s_mov_b32 m0, -1
; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}}
; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]]

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_mov_b32 [[ZERO:s[0-9]+]], 0
; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v0, [[ZERO]]

; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec
; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]]
; GCN: s_xor_b64 s[[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]], s[[[ANDEXEC_LO]]:[[ANDEXEC_HI]]], s[[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]]

; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_mov_b64 exec, [[CMP0]]

; FIXME: It makes no sense to put this skip here
; GCN: s_cbranch_execz [[FLOW:.LBB[0-9]+_[0-9]+]]
; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]]

; GCN: [[FLOW]]: ; %Flow
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]

; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]]
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 1

; GCN: s_or_saveexec_b64 s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]], s[[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]]

; Regular spill value restored after exec modification
; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
; Followed by spill
; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_and_b64 s[[[FLOW_AND_EXEC_LO:[0-9]+]]:[[FLOW_AND_EXEC_HI:[0-9]+]]], exec, s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]]

; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_LO]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_HI]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_AND_EXEC_LO]], 0
; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_AND_EXEC_HI]], 1
; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_xor_b64 exec, exec, s[[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]]
; GCN-NEXT: s_cbranch_execz [[ENDIF:.LBB[0-9]+_[0-9]+]]


; GCN: ; %bb.{{[0-9]+}}: ; %if
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: ds_read_b32
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
; GCN-NEXT: s_branch [[ENDIF:.LBB[0-9]+_[0-9]+]]

; GCN: [[ELSE]]: ; %else
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[SUB]], off, s[0:3], 0 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN-NEXT: s_branch [[FLOW]]

; GCN: [[ENDIF]]:
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]]


; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1

; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]

; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %load0 = load volatile i32, i32 addrspace(3)* null
  %cmp0 = icmp eq i32 %tid, 0
  br i1 %cmp0, label %if, label %else

if:
  %load1 = load volatile i32, i32 addrspace(3)* undef
  %val0 = add i32 %load0, %load1
  br label %endif

else:
  %load2 = load volatile i32, i32 addrspace(3)* undef
  %val1 = sub i32 %load0, %load2
  br label %endif

endif:
  %result = phi i32 [ %val0, %if ], [ %val1, %else ]
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }