; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s

; Overview: every kernel below stores to an addrspace(3) pointer whose address
; is built from a scaled (and usually negated) workitem id plus a constant.
; The expected assembly pins, for each of the three -mcpu values above, how the
; constant is split between the DS instruction's immediate offset operand and
; the VALU subtract/add that computes the base register.

declare i32 @llvm.amdgcn.workitem.id.x() #0

@lds.obj = addrspace(3) global [256 x i32] undef, align 4

; Stores 123 to @lds.obj at element index (0 - workitem.id.x) + 3.
; Expected on all three targets: the id is shifted left by 2 and subtracted
; from 0, while the +3 element displacement appears as "offset:12" on the
; ds_write_b32.
define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
; CI-LABEL: write_ds_sub0_offset0_global:
; CI: ; %bb.0: ; %entry
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b32 v0, v1 offset:12
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write_ds_sub0_offset0_global:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT: ds_write_b32 v0, v1 offset:12
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: write_ds_sub0_offset0_global:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: ds_write_b32 v0, v1 offset:12
; GFX10-NEXT: s_endpgm
entry:
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 0, %x.i
  %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
  store i32 123, i32 addrspace(3)* %arrayidx
  ret void
}

; Same LDS store as @write_ds_sub0_offset0_global, followed by a call to
; llvm.amdgcn.div.fmas.f32 (with an i1 false condition) whose result is stored
; volatile to a null addrspace(1) pointer. The expected code materializes a
; zero VCC for the div_fmas alongside the address subtract.
; NOTE(review): the "clamp_bit" name presumably refers to the subtract's clamp
; operand being handled correctly while VCC is in use - confirm against the
; change that introduced this test.
define amdgpu_kernel void @write_ds_sub0_offset0_global_clamp_bit(float %dummy.val) #0 {
; CI-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dword s0, s[0:1], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT: s_mov_b64 vcc, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: s_mov_b32 s0, 0
; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT: v_mov_b32_e32 v2, 0x7b
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s1, s0
; CI-NEXT: ds_write_b32 v0, v2 offset:12
; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 vcc, 0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v3, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_write_b32 v3, v4 offset:12
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: write_ds_sub0_offset0_global_clamp_bit:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_mov_b32 vcc_lo, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_write_b32 v2, v3 offset:12
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT: global_store_dword v[0:1], v4, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
entry:
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 0, %x.i
  %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 3
  store i32 123, i32 addrspace(3)* %arrayidx
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, float addrspace(1)* null
  ret void
}

; Stores 123 to @lds.obj at element index (-1 - workitem.id.x) + 16383, then
; performs the same div.fmas + volatile global store as the kernel above.
; Expected on all three targets: "-1 - x" becomes a v_not_b32, the result is
; shifted left by 2, and the 16383-element displacement appears as
; "offset:65532" (16383 * 4) on the ds_write_b32.
define amdgpu_kernel void @write_ds_sub_max_offset_global_clamp_bit(float %dummy.val) #0 {
; CI-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[0:1], 0x0
; CI-NEXT: s_mov_b64 vcc, 0
; CI-NEXT: v_not_b32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_mov_b32_e32 v2, 0x7b
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT: s_mov_b32 s0, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s1, s0
; CI-NEXT: ds_write_b32 v0, v2 offset:65532
; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 vcc, 0
; GFX9-NEXT: v_not_b32_e32 v0, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_write_b32 v3, v4 offset:65532
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: write_ds_sub_max_offset_global_clamp_bit:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT: v_not_b32_e32 v0, v0
; GFX10-NEXT: s_mov_b32 vcc_lo, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_write_b32 v2, v3 offset:65532
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0
; GFX10-NEXT: global_store_dword v[0:1], v4, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
  %sub1 = sub i32 -1, %x.i
  %tmp0 = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds.obj, i32 0, i32 %sub1
  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %tmp0, i32 16383
  store i32 123, i32 addrspace(3)* %arrayidx
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, float addrspace(1)* null
  ret void
}

; Byte store to the address (workitem.id.x << 4) + 65535, formed through
; zext to i64 and inttoptr. No negation is involved here.
; Expected on all three targets: only the shift stays in the VALU and the
; whole constant appears as "offset:65535" on the ds_write_b8.
define amdgpu_kernel void @add_x_shl_max_offset() #1 {
; CI-LABEL: add_x_shl_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; CI-NEXT: v_mov_b32_e32 v1, 13
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v0, v1 offset:65535
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_max_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 13
; GFX9-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_max_offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %shl = shl i32 %x.i, 4
  %add = add i32 %shl, 65535
  %z = zext i32 %add to i64
  %ptr = inttoptr i64 %z to i8 addrspace(3)*
  store i8 13, i8 addrspace(3)* %ptr, align 1
  ret void
}

; Byte store to the address (workitem.id.x * -4) + 65535, i.e. the negated
; scaled id is written as a multiply by a negative constant.
; Expected on all three targets: shift left by 2, subtract from 0, and
; "offset:65535" on the ds_write_b8.
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_alt() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT: v_mov_b32_e32 v1, 13
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v0, v1 offset:65535
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 13
; GFX9-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_alt:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT: s_endpgm
  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %.neg = mul i32 %x.i, -4
  %add = add i32 %.neg, 65535
  %z = zext i32 %add to i64
  %ptr = inttoptr i64 %z to i8 addrspace(3)*
  store i8 13, i8 addrspace(3)* %ptr, align 1
  ret void
}

; Same address as the "_alt" kernel above, but written as
; 65535 + ((0 - workitem.id.x) << 2) with the constant as the first add
; operand. The expected assembly is identical to the "_alt" kernel.
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_not_canonical() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT: v_mov_b32_e32 v1, 13
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v0, v1 offset:65535
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 13
; GFX9-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_not_canonical:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: ds_write_b8 v0, v1 offset:65535
; GFX10-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 65535, %shl
  %ptr = inttoptr i32 %add to i8 addrspace(3)*
  store i8 13, i8 addrspace(3)* %ptr
  ret void
}

; Same shape as the "_not_canonical" kernel, but with the constant raised by
; one to 65536.
; Expected on all three targets: the ds_write_b8 carries no offset operand and
; the constant 0x10000 is the left operand of the subtract instead.
; NOTE(review): presumably 65536 no longer fits the DS immediate offset field
; (65535 is the largest offset used by the sibling kernels) - confirm.
define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x10000, v0
; CI-NEXT: v_mov_b32_e32 v1, 13
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b8 v0, v1
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0x10000, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 13
; GFX9-NEXT: ds_write_b8 v0, v1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_max_offset_p1:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x10000, v0
; GFX10-NEXT: ds_write_b8 v0, v1
; GFX10-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 65536, %shl
  %ptr = inttoptr i32 %add to i8 addrspace(3)*
  store i8 13, i8 addrspace(3)* %ptr
  ret void
}

; The negated, shifted id feeds two different adds (constants 123 and 456),
; each used as the address of a volatile i32 store.
; Expected on all three targets: a single shift + subtract-from-0 shared by
; both stores, with the constants appearing as "offset:123" and "offset:456".
define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_multi_use:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT: v_mov_b32_e32 v1, 13
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b32 v0, v1 offset:123
; CI-NEXT: ds_write_b32 v0, v1 offset:456
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 13
; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
; GFX9-NEXT: ds_write_b32 v0, v1 offset:456
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
; GFX10-NEXT: ds_write_b32 v0, v1 offset:456
; GFX10-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add0 = add i32 123, %shl
  %add1 = add i32 456, %shl
  %ptr0 = inttoptr i32 %add0 to i32 addrspace(3)*
  store volatile i32 13, i32 addrspace(3)* %ptr0
  %ptr1 = inttoptr i32 %add1 to i32 addrspace(3)*
  store volatile i32 13, i32 addrspace(3)* %ptr1
  ret void
}

; A single address, 123 + ((0 - workitem.id.x) << 2), is the target of two
; volatile i32 stores.
; Expected on all three targets: one shift + subtract-from-0, and two
; ds_write_b32 instructions that both carry "offset:123".
define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
; CI-NEXT: v_mov_b32_e32 v1, 13
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write_b32 v0, v1 offset:123
; CI-NEXT: ds_write_b32 v0, v1 offset:123
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 13
; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
; GFX9-NEXT: ds_write_b32 v0, v1 offset:123
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_multi_use_same_offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 13
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
; GFX10-NEXT: ds_write_b32 v0, v1 offset:123
; GFX10-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 123, %shl
  %ptr = inttoptr i32 %add to i32 addrspace(3)*
  store volatile i32 13, i32 addrspace(3)* %ptr
  store volatile i32 13, i32 addrspace(3)* %ptr
  ret void
}

; i64 store of 123 with only 4-byte alignment to the address
; 1019 + ((0 - workitem.id.x) << 2).
; Expected for bonaire and gfx900: the constant (0x3fb) is the left operand of
; the subtract and the store is a ds_write2_b32 with "offset1:1".
; Expected for gfx1010: subtract from 0 and two separate ds_write_b32
; instructions at "offset:1023" (high dword, value 0) and "offset:1019"
; (low dword, value 0x7b).
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fb, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: ds_write_b32 v0, v1 offset:1023
; GFX10-NEXT: ds_write_b32 v0, v2 offset:1019
; GFX10-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1019, %shl
  %ptr = inttoptr i32 %add to i64 addrspace(3)*
  store i64 123, i64 addrspace(3)* %ptr, align 4
  ret void
}

; Same misaligned i64 LDS store as the kernel above, followed by the
; div.fmas call and volatile global store used by the other "clamp_bit"
; kernels. The DS addressing expected per target matches the kernel above;
; the remaining expected instructions set up VCC, the div_fmas and the
; global/buffer store.
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit(float %dummy.val) #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[0:1], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fb, v0
; CI-NEXT: s_mov_b64 vcc, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s0
; CI-NEXT: s_mov_b32 s0, 0
; CI-NEXT: v_div_fmas_f32 v1, v1, v1, v1
; CI-NEXT: v_mov_b32_e32 v2, 0x7b
; CI-NEXT: v_mov_b32_e32 v3, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_mov_b32 s1, s0
; CI-NEXT: ds_write2_b32 v0, v2, v3 offset1:1
; CI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX9-NEXT: s_mov_b64 vcc, 0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v3, 0x3fb, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0x7b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v5, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: ds_write2_b32 v3, v4, v5 offset1:1
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_clamp_bit:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_mov_b32 vcc_lo, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 0
; GFX10-NEXT: v_mov_b32_e32 v4, 0x7b
; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: ds_write_b32 v2, v3 offset:1023
; GFX10-NEXT: ds_write_b32 v2, v4 offset:1019
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_fmas_f32 v5, s0, s0, s0
; GFX10-NEXT: global_store_dword v[0:1], v5, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1019, %shl
  %ptr = inttoptr i32 %add to i64 addrspace(3)*
  store i64 123, i64 addrspace(3)* %ptr, align 4
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, float addrspace(1)* null
  ret void
}

; Same misaligned i64 LDS store, with the constant raised by one to 1020.
; Expected for bonaire and gfx900: the constant (0x3fc) is the left operand of
; the subtract and the store is a ds_write2_b32 with "offset1:1".
; Expected for gfx1010: subtract from 0, an extra add of 0x200, and a
; ds_write2_b32 with "offset0:127 offset1:128" (0x200 + 127 * 4 = 1020).
define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
; CI-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; CI: ; %bb.0:
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: v_sub_i32_e32 v0, vcc, 0x3fc, v0
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_sub_u32_e32 v0, 0x3fc, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0
; GFX10-NEXT: ds_write2_b32 v0, v2, v1 offset0:127 offset1:128
; GFX10-NEXT: s_endpgm
  %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
  %neg = sub i32 0, %x.i
  %shl = shl i32 %neg, 2
  %add = add i32 1020, %shl
  %ptr = inttoptr i32 %add to i64 addrspace(3)*
  store i64 123, i64 addrspace(3)* %ptr, align 4
  ret void
}

declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)

; NOTE(review): #0 carries readnone yet is attached to the first three
; kernels, which store to memory - confirm this is intentional for the test.
; #2 is not referenced anywhere in the portion of the file visible here.
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind convergent }