; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI

; Loads an i32 from %in, reinterprets it as <2 x i16>, and broadcasts element 1
; of that vector into all four lanes of the <4 x i16> stored to %out.
; XXX - Why the packing?
define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: scalar_to_vector_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tmp1 = load i32, i32 addrspace(1)* %in, align 4
  %bc = bitcast i32 %tmp1 to <2 x i16>
  %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
  ret void
}

; Same shuffle-and-store pattern as @scalar_to_vector_v2i32, except that the
; 32-bit source value is loaded as a float before being bitcast to <2 x i16>.
define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
; SI-LABEL: scalar_to_vector_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tmp1 = load float, float addrspace(1)* %in, align 4
  %bc = bitcast float %tmp1 to <2 x i16>
  %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
  ret void
}

; Loads a <2 x i8> from an undef address and widens it to <8 x i8> with two
; shuffles. The first mask (0, 1, 1, ...) only selects from the loaded vector,
; so the zeroinitializer operand is never read. In the second shuffle, mask
; index 9 refers to element 1 of the undef operand, so only lane 0 of the
; stored <8 x i8> carries a defined value.
define amdgpu_kernel void @scalar_to_vector_v4i16() {
; SI-LABEL: scalar_to_vector_v4i16:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_readfirstlane_b32 s0, v0
; SI-NEXT:    s_lshl_b32 s1, s0, 8
; SI-NEXT:    s_or_b32 s0, s1, s0
; SI-NEXT:    s_and_b32 s1, s0, 0xff00
; SI-NEXT:    s_lshr_b32 s4, s0, 8
; SI-NEXT:    s_or_b32 s1, s4, s1
; SI-NEXT:    s_lshl_b32 s4, s1, 16
; SI-NEXT:    s_or_b32 s1, s1, s4
; SI-NEXT:    s_or_b32 s0, s0, s4
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v4i16:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_and_b32_e32 v1, 0xffffff00, v0
; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
bb:
  %tmp = load <2 x i8>, <2 x i8> addrspace(1)* undef, align 1
  %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 0, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  store <8 x i8> %tmp2, <8 x i8> addrspace(1)* undef, align 8
  ret void
}

; Same two-shuffle widening as @scalar_to_vector_v4i16, except that the source
; is a half loaded (align 1) from an undef address and bitcast to <2 x i8>.
define amdgpu_kernel void @scalar_to_vector_v4f16() {
; SI-LABEL: scalar_to_vector_v4f16:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_readfirstlane_b32 s0, v0
; SI-NEXT:    s_lshl_b32 s1, s0, 8
; SI-NEXT:    s_or_b32 s0, s1, s0
; SI-NEXT:    s_and_b32 s1, s0, 0xff00
; SI-NEXT:    s_lshr_b32 s4, s0, 8
; SI-NEXT:    s_or_b32 s1, s4, s1
; SI-NEXT:    s_lshl_b32 s4, s1, 16
; SI-NEXT:    s_or_b32 s1, s1, s4
; SI-NEXT:    s_or_b32 s0, s0, s4
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v4f16:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_and_b32_e32 v1, 0xffffff00, v0
; VI-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
bb:
  %load = load half, half addrspace(1)* undef, align 1
  %tmp = bitcast half %load to <2 x i8>
  %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 0, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  store <8 x i8> %tmp2, <8 x i8> addrspace(1)* undef, align 8
  ret void
}

; Getting a SCALAR_TO_VECTOR seems to be
; tricky. These cases managed
; to produce one, but for some reason never made it to selection.


; define amdgpu_kernel void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
;   %tmp1 = load i32, i32 addrspace(1)* %in, align 4
;   %bc = bitcast i32 %tmp1 to <4 x i8>

;   %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
;   store <8 x i8> %tmp2, <8 x i8> addrspace(1)* %out, align 4
;   ret void
; }

; define amdgpu_kernel void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind {
;   %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0
;   %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1
;   %bc = bitcast <2 x i64> %newvec1 to <4 x i32>
;   %add = add <4 x i32> %bc, <i32 1, i32 2, i32 3, i32 4>
;   store <4 x i32> %add, <4 x i32> addrspace(1)* %out, align 16
;   ret void
; }

; define amdgpu_kernel void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind {
;   %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0
;   %bc = bitcast <4 x i32> %newvec0 to <8 x i16>
;   %add = add <8 x i16> %bc, <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>
;   store <8 x i16> %add, <8 x i16> addrspace(1)* %out, align 16
;   ret void
; }

; define amdgpu_kernel void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind {
;   %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
;   %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
;   %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
;   store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16
;   ret void
; }

; Inserts the zeroext i8 argument %val into lane 0 of an otherwise-undef
; <4 x i8>, bitcasts the vector to <2 x half>, and stores it to %out.
define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind {
; SI-LABEL: scalar_to_vector_test6:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_test6:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
  %bc = bitcast <4 x i8> %newvec0 to <2 x half>
  store <2 x half> %bc, <2 x half> addrspace(1)* %out
  ret void
}