; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=VI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone

define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_x2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_x3:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s14, 0
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x3:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_x4:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s22, s10
; SI-NEXT:    s_mov_b32 s23, s11
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s20, s6
; SI-NEXT:    s_mov_b32 s21, s7
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
; SI-NEXT:    buffer_store_dword v0, off, s[20:23], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x4:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s22, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s23, s11
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s20, s6
; VI-NEXT:    s_mov_b32 s21, s7
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
; VI-NEXT:    buffer_store_dword v0, off, s[20:23], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_extra_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT:    v_and_b32_e32 v2, 0xff00, v0
; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; SI-NEXT:    v_and_b32_e32 v4, 0xff00, v1
; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT:    v_or_b32_e32 v1, v4, v1
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_extra_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_and_b32_e32 v4, 0xffffff00, v1
; VI-NEXT:    v_add_u16_e32 v1, 9, v1
; VI-NEXT:    v_add_u16_e32 v3, 9, v0
; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; VI-NEXT:    v_and_b32_e32 v2, 0xffffff00, v0
; VI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; VI-NEXT:    v_or_b32_e32 v1, v4, v1
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    v_add_u16_e32 v1, 0x900, v1
; VI-NEXT:    v_add_u16_e32 v2, 0x900, v2
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_e32 v1, v2, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
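; Note: %val is stored unmodified to %out0 and %out2 while the add result
; goes to %out1, so the checks below expect the loaded dword (v0) to be
; reused for two of the three stores.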
define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_x2_extra_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s14, 0
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT:    v_and_b32_e32 v2, 0xff00, v0
; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; SI-NEXT:    v_and_b32_e32 v4, 0xff00, v1
; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT:    v_or_b32_e32 v1, v4, v1
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2_extra_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_and_b32_e32 v4, 0xffffff00, v1
; VI-NEXT:    v_add_u16_e32 v1, 9, v1
; VI-NEXT:    v_add_u16_e32 v3, 9, v0
; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; VI-NEXT:    v_and_b32_e32 v2, 0xffffff00, v0
; VI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; VI-NEXT:    v_or_b32_e32 v1, v4, v1
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    v_add_u16_e32 v1, 0x900, v1
; VI-NEXT:    v_add_u16_e32 v2, 0x900, v2
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_e32 v1, v2, v1
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v3i8_align4:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align4:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
  store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v3i8_align2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:2
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    buffer_store_short v1, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:2
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
  store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
  ret void
}

define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v3i8_align1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:1
; VI-NEXT:    s_endpgm
  %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
  store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
  ret void
}

define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_volatile_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_volatile_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_volatile_store:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:3
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_volatile_store:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:3
; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:1
; VI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
  store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
  ret void
}