; RUN: llc -O0 -mtriple=amdgcn--amdhsa -march=amdgcn --amdhsa-code-object-version=2 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VMEM -check-prefix=GCN %s
; RUN: llc -O0 -mtriple=amdgcn--amdhsa -march=amdgcn --amdhsa-code-object-version=2 -amdgpu-spill-sgpr-to-vgpr=1 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VGPR -check-prefix=GCN %s

; Verify the registers used for tracking exec mask changes when all
; registers are spilled at the end of the block. The SGPR spill
; placement relative to the exec modifications is important.

; FIXME: This checks with SGPR-to-VGPR spilling disabled, but this may
; not work correctly in cases where no workitems take a branch.


; GCN-LABEL: {{^}}divergent_if_endif:
; VGPR: workitem_private_segment_byte_size = 12{{$}}


; GCN: {{^}}; %bb.0:
; GCN: s_mov_b32 m0, -1
; GCN: ds_read_b32 [[LOAD0:v[0-9]+]]

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, v0

; Spill saved exec
; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]]
; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}

; GCN: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]]

; GCN: ; %bb.{{[0-9]+}}: ; %if
; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: s_mov_b32 m0, -1
; GCN: ds_read_b32 [[LOAD1:v[0-9]+]]
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)


; Spill val register
; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[LOAD1]], [[RELOAD_LOAD0]]
; GCN: buffer_store_dword [[VAL]], off, s[0:3], 0 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; VMEM: [[ENDIF]]:

; Reload and restore exec mask
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1

; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}}

; Restore val
; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
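; The checks above encode the expected -O0 lowering of a divergent if
; when everything is spilled at block boundaries. As a rough sketch
; (register numbers, lane indices, and frame offsets are illustrative,
; not fixed by the test):
;
;   s_mov_b64 s[N:N+1], exec            ; save exec
;   <spill s[N:N+1] via v_writelane or scratch buffer>
;   s_and_b64 s[M:M+1], s[N:N+1], cond  ; mask exec by the condition
;   s_mov_b64 exec, s[M:M+1]
;   s_cbranch_execz endif               ; skip %if when no lanes remain
;   ...                                 ; %if body, result spilled
; endif:
;   <reload s[N:N+1]>
;   s_or_b64 exec, exec, s[N:N+1]       ; restore the saved exec mask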
define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %load0 = load volatile i32, i32 addrspace(3)* undef
  %cmp0 = icmp eq i32 %tid, 0
  br i1 %cmp0, label %if, label %endif

if:
  %load1 = load volatile i32, i32 addrspace(3)* undef
  %val = add i32 %load0, %load1
  br label %endif

endif:
  %tmp4 = phi i32 [ %val, %if ], [ 0, %entry ]
  store i32 %tmp4, i32 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}divergent_loop:
; VGPR: workitem_private_segment_byte_size = 16{{$}}

; GCN: {{^}}; %bb.0:
; GCN-DAG: s_mov_b32 m0, -1
; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}}
; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]]
; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, v0

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec

; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET:[0-9]+]] ; 4-byte Folded Spill


; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]
; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
; GCN-NEXT: s_cbranch_execz [[END:BB[0-9]+_[0-9]+]]


; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
; GCN: s_cmp_lg_u32
; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN-NEXT: s_cbranch_scc1 [[LOOP]]

; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: [[END]]:
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]

; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[V_EXEC_SPILL_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1

; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}}
; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload

; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %load0 = load volatile i32, i32 addrspace(3)* null
  %cmp0 = icmp eq i32 %tid, 0
  br i1 %cmp0, label %loop, label %end

loop:
  %i = phi i32 [ %i.inc, %loop ], [ 0, %entry ]
  %val = phi i32 [ %val.sub, %loop ], [ %load0, %entry ]
  %load1 = load volatile i32, i32 addrspace(3)* undef
  %i.inc = add i32 %i, 1
  %val.sub = sub i32 %val, %load1
  %cmp1 = icmp ne i32 %i, 256
  br i1 %cmp1, label %loop, label %end

end:
  %tmp4 = phi i32 [ %val.sub, %loop ], [ 0, %entry ]
  store i32 %tmp4, i32 addrspace(1)* %out
  ret void
}
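
; The diamond below is lowered so the %else lanes run first: exec is set
; to the inverted condition, the %if mask is kept in the xor with the
; saved exec, and a Flow block flips exec to the %if lanes. A rough
; sketch of what the checks that follow expect (register names and
; spill slots are illustrative):
;
;   s_mov_b64 s[S:S+1], exec                ; save exec
;   s_and_b64 s[A:A+1], s[S:S+1], cond      ; lanes taking %else
;   s_xor_b64 s[S:S+1], s[A:A+1], s[S:S+1]  ; lanes taking %if
;   <spill s[S:S+1]>
;   s_mov_b64 exec, cond                    ; run %else first
;   ...                                     ; %else body, branch to Flow
; Flow:
;   <reload s[S:S+1]>
;   s_or_saveexec_b64 s[T:T+1], s[S:S+1]
;   s_and_b64 s[U:U+1], exec, s[T:T+1]
;   <spill s[U:U+1]>
;   s_xor_b64 exec, exec, s[U:U+1]          ; switch to the %if lanes
;   s_cbranch_execz endif
;   ...                                     ; %if body
; endif:
;   <reload s[U:U+1]>
;   s_or_b64 exec, exec, s[U:U+1]           ; restore full exec
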
; GCN-LABEL: {{^}}divergent_if_else_endif:
; GCN: {{^}}; %bb.0:

; GCN-DAG: s_mov_b32 m0, -1
; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}}
; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]]

; Spill load
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_mov_b32 [[ZERO:s[0-9]+]], 0
; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[ZERO]], v0

; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]
; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}

; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0
; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1
; VMEM: buffer_store_dword v[[V_SAVEEXEC]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_mov_b64 exec, [[CMP0]]

; FIXME: It makes no sense to put this skip here
; GCN: s_cbranch_execz [[FLOW:BB[0-9]+_[0-9]+]]
; GCN-NEXT: s_branch [[ELSE:BB[0-9]+_[0-9]+]]

; GCN: [[FLOW]]: ; %Flow
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]

; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_OFFSET]]
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC]], 1

; GCN: s_or_saveexec_b64 s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]{{\]}}, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}

; Regular spill value restored after exec modification
; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
; Followed by spill
; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_and_b64 s{{\[}}[[FLOW_AND_EXEC_LO:[0-9]+]]:[[FLOW_AND_EXEC_HI:[0-9]+]]{{\]}}, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC]]{{\]}}

; Spill saved exec
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_LO]], [[FLOW_SAVEEXEC_LO_LANE:[0-9]+]]
; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[FLOW_AND_EXEC_HI]], [[FLOW_SAVEEXEC_HI_LANE:[0-9]+]]

; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC:[0-9]+]], s[[FLOW_AND_EXEC_LO]], 0
; VMEM: v_writelane_b32 v[[FLOW_V_SAVEEXEC]], s[[FLOW_AND_EXEC_HI]], 1
; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET:[0-9]+]] ; 4-byte Folded Spill

; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_AND_EXEC_LO]]:[[FLOW_AND_EXEC_HI]]{{\]}}
; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]]
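
; Result plumbing for the diamond below: %if stores its result directly
; to the common result slot reloaded in %endif, while %else stores to a
; separate slot that the Flow block reloads and re-spills into the
; common slot (the two Flow-side offsets are captured as distinct
; FileCheck variables, but presumably name the same stack slot).
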
; GCN: ; %bb.{{[0-9]+}}: ; %if
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: ds_read_b32
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
; GCN-NEXT: s_branch [[ENDIF]]

; GCN: [[ELSE]]: ; %else
; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
; GCN: buffer_store_dword [[SUB]], off, s[0:3], 0 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN-NEXT: s_branch [[FLOW]]

; GCN: [[ENDIF]]:
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]]


; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_OFFSET]] ; 4-byte Folded Reload
; VMEM: s_waitcnt vmcnt(0)
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 0
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1

; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}}

; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %load0 = load volatile i32, i32 addrspace(3)* null
  %cmp0 = icmp eq i32 %tid, 0
  br i1 %cmp0, label %if, label %else

if:
  %load1 = load volatile i32, i32 addrspace(3)* undef
  %val0 = add i32 %load0, %load1
  br label %endif

else:
  %load2 = load volatile i32, i32 addrspace(3)* undef
  %val1 = sub i32 %load0, %load2
  br label %endif

endif:
  %result = phi i32 [ %val0, %if ], [ %val1, %else ]
  store i32 %result, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }