; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s

; Exercises selection of LDS (addrspace(3)) store addresses that combine a
; negated or shifted workitem id with a constant addend. The assertions record,
; per target, whether the constant ends up in the DS instruction's immediate
; offset field or stays in the VALU sub/xor that computes the base address.
;
; The IR uses opaque pointers ("ptr addrspace(N)"); typed-pointer syntax is
; not accepted by LLVM 17 and later.

declare i32 @llvm.amdgcn.workitem.id.x() #0

@lds.obj = addrspace(3) global [256 x i32] undef, align 4

; Stores 123 to @lds.obj[(0 - tid) + 3]. The +3 element (12 byte) part is
; expected in the DS offset field, the negation as a separate sub.
define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
; CI-LABEL: write_ds_sub0_offset0_global:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v1 offset:12
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: write_ds_sub0_offset0_global:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:12
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: write_ds_sub0_offset0_global:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:12
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: write_ds_sub0_offset0_global:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:12
; GFX11-NEXT:    s_endpgm
entry:
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 0, %x.i
  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
  store i32 123, ptr addrspace(3) %arrayidx
  ret void
}

; Same LDS store as @write_ds_sub0_offset0_global, followed by a div.fmas of
; the kernel argument whose result is volatile-stored through a null
; addrspace(1) pointer.
define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #0 {
; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; CI:       ; %bb.0: ; %entry
; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    s_mov_b64 vcc, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s0
; CI-NEXT:    s_mov_b32 s0, 0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s1, s0
; CI-NEXT:    ds_write_b32 v0, v2 offset:12
; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT:    s_mov_b64 vcc, 0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v3, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    ds_write_b32 v3, v4 offset:12
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:12
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT:    global_store_dword v[0:1], v4, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    ds_store_b32 v2, v3 offset:12
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 0, %x.i
  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 3
  store i32 123, ptr addrspace(3) %arrayidx
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, ptr addrspace(1) null
  ret void
}

; Index is (-1 - tid) + 16383 elements. The assertions expect the (-1 - tid)
; part as a NOT and 65532 in the DS offset field; the div.fmas and volatile
; global store follow as in the previous kernel.
define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy.val) #0 {
; CI-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
; CI-NEXT:    s_mov_b64 vcc, 0
; CI-NEXT:    v_not_b32_e32 v0, v0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT:    s_mov_b32 s0, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s1, s0
; CI-NEXT:    ds_write_b32 v0, v2 offset:65532
; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT:    s_mov_b64 vcc, 0
; GFX9-NEXT:    v_not_b32_e32 v0, v0
; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    ds_write_b32 v3, v4 offset:65532
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT:    v_not_b32_e32 v0, v0
; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x7b
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:65532
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT:    global_store_dword v[0:1], v4, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    v_not_b32_e32 v0, v0
; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v2, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    ds_store_b32 v2, v3 offset:65532
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_div_fmas_f32 v4, s0, s0, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v4, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 -1, %x.i
  %tmp0 = getelementptr [256 x i32], ptr addrspace(3) @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, ptr addrspace(3) %tmp0, i32 16383
  store i32 123, ptr addrspace(3) %arrayidx
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, ptr addrspace(1) null
  ret void
}

; Byte store to (tid << 4) + 65535, built with zext + inttoptr. The assertions
; expect 65535 in the DS offset field.
define amdgpu_kernel void @add_x_shl_max_offset() #1 {
; CI-LABEL: add_x_shl_max_offset:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1 offset:65535
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_max_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1 offset:65535
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_max_offset:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_max_offset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 4, v0
; GFX11-NEXT:    ds_store_b8 v0, v1 offset:65535
; GFX11-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %shl = shl i32 %x.i, 4
  %add = add i32 %shl, 65535
  %z = zext i32 %add to i64
  %ptr = inttoptr i64 %z to ptr addrspace(3)
  store i8 13, ptr addrspace(3) %ptr, align 1
  ret void
}

; Byte store to (tid * -4) + 65535.
; this could have the offset transform, but sub became xor

define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    ds_store_b8 v0, v1
; GFX11-NEXT:    s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %.neg = mul i32 %x.i, -4
  %add = add i32 %.neg, 65535
  %z = zext i32 %add to i64
  %ptr = inttoptr i64 %z to ptr addrspace(3)
  store i8 13, ptr addrspace(3) %ptr, align 1
  ret void
}

; Byte store to 65535 + ((0 - tid) << 2), written as sub/shl/add.
; this could have the offset transform, but sub became xor

define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_xor_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    ds_store_b8 v0, v1
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 65535, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i8 13, ptr addrspace(3) %ptr
  ret void
}

; Same shape with an addend of 65536. The assertions expect the constant to
; stay in the sub and the DS store to carry no offset.
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x10000, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b8 v0, v1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x10000, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b8 v0, v1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x10000, v0
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x10000, v0
; GFX11-NEXT:    ds_store_b8 v0, v1
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 65536, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i8 13, ptr addrspace(3) %ptr
  ret void
}

; The negated, shifted id feeds two adds (123 and 456), each used by one
; volatile store. The assertions expect a single sub and both constants in
; DS offset fields.
define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_multi_use:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v1 offset:123
; CI-NEXT:    ds_write_b32 v0, v1 offset:456
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:456
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:456
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:123
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:456
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add0 = add i32 123, %shl
  %add1 = add i32 456, %shl
  %ptr0 = inttoptr i32 %add0 to ptr addrspace(3)
  store volatile i32 13, ptr addrspace(3) %ptr0
  %ptr1 = inttoptr i32 %add1 to ptr addrspace(3)
  store volatile i32 13, ptr addrspace(3) %ptr1
  ret void
}

; One add (123) whose pointer is used by two volatile stores. The assertions
; expect the same offset on both DS stores.
define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT:    v_mov_b32_e32 v1, 13
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write_b32 v0, v1 offset:123
; CI-NEXT:    ds_write_b32 v0, v1 offset:123
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 13
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX9-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:123
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 13 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:123
; GFX11-NEXT:    ds_store_b32 v0, v1 offset:123
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 123, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store volatile i32 13, ptr addrspace(3) %ptr
  store volatile i32 13, ptr addrspace(3) %ptr
  ret void
}

; i64 store with align 4 to 1019 + ((0 - tid) << 2). The expected lowering
; differs between targets: a paired two-dword DS write on some, two separate
; dword writes at offsets 1019 and 1023 on another.
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fb, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    ds_write_b32 v0, v1 offset:1023
; GFX10-NEXT:    ds_write_b32 v0, v2 offset:1019
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x3fb, v0
; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1019, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i64 123, ptr addrspace(3) %ptr, align 4
  ret void
}

; Same i64 store as @add_x_shl_neg_to_sub_misaligned_i64_max_offset, followed
; by a div.fmas of the kernel argument and a volatile store of its result
; through a null addrspace(1) pointer.
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dword s0, s[0:1], 0x0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT:    s_mov_b64 vcc, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    v_mov_b32_e32 v1, s0
; CI-NEXT:    s_mov_b32 s0, 0
; CI-NEXT:    v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT:    v_mov_b32_e32 v2, 0x7b
; CI-NEXT:    v_mov_b32_e32 v3, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, -1
; CI-NEXT:    s_mov_b32 s1, s0
; CI-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT:    s_mov_b64 vcc, 0
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v3, 0x3fb, v0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v5, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    ds_write2_b32 v3, v4, v5 offset1:1
; GFX9-NEXT:    global_store_dword v[0:1], v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_mov_b32 vcc_lo, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    ds_write_b32 v2, v3 offset:1023
; GFX10-NEXT:    ds_write_b32 v2, v4 offset:1019
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_div_fmas_f32 v5, s0, s0, s0
; GFX10-NEXT:    global_store_dword v[0:1], v5, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
; GFX11-NEXT:    v_dual_mov_b32 v3, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
; GFX11-NEXT:    v_mov_b32_e32 v4, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x3fb, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    ds_store_2addr_b32 v2, v3, v4 offset1:1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_div_fmas_f32 v5, s0, s0, s0
; GFX11-NEXT:    global_store_b32 v[0:1], v5, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1019, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i64 123, ptr addrspace(3) %ptr, align 4
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, ptr addrspace(1) null
  ret void
}

; i64 store with align 4 to 1020 + ((0 - tid) << 2), i.e. an addend one
; greater than in @add_x_shl_neg_to_sub_misaligned_i64_max_offset.
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; CI:       ; %bb.0:
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_sub_i32_e32 v0, vcc, 0x3fc, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0x7b
; CI-NEXT:    v_mov_b32_e32 v2, 0
; CI-NEXT:    s_mov_b32 m0, -1
; CI-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT:    s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0x3fc, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT:    ds_write2_b32 v0, v2, v1 offset0:127 offset1:128
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_dual_mov_b32 v1, 0x7b :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_sub_nc_u32_e32 v0, 0x3fc, v0
; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
; GFX11-NEXT:    s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1020, %shl
  %ptr = inttoptr i32 %add to ptr addrspace(3)
  store i64 123, ptr addrspace(3) %ptr, align 4
  ret void
}

declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)

; NOTE(review): the first three kernels are defined with #0 (readnone) even
; though they contain stores, and their workitem.id calls use #1, while the
; remaining kernels use #1 on the definition and #0 on the call. This looks
; like a swap; it is left unchanged here - confirm before editing.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind convergent }