; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s

; Codegen checks for insertelement with a dynamic (non-constant) index.
; Every kernel below inserts a constant into a vector at index %sel and stores
; the result through %out. The directives placed in front of each kernel pin
; the instruction pattern expected for that vector type.

; <4 x float>: one compare + select per element (indices 0..3), all compares
; reading the same index register, followed by a single dwordx4 store.
; GCN-LABEL: {{^}}float4_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @float4_inselt(<4 x float> addrspace(1)* %out, <4 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x float> %vec, float 1.000000e+00, i32 %sel
  store <4 x float> %v, <4 x float> addrspace(1)* %out
  ret void
}

; Insert into an undef <4 x float>: no compare or select is expected; 1.0 is
; materialized once and then copied with v_mov_b32.
; GCN-LABEL: {{^}}float4_inselt_undef:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-NOT: v_cmp_
; GCN-NOT: v_cndmask_
; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
define amdgpu_kernel void @float4_inselt_undef(<4 x float> addrspace(1)* %out, i32 %sel) {
entry:
  %v = insertelement <4 x float> undef, float 1.000000e+00, i32 %sel
  store <4 x float> %v, <4 x float> addrspace(1)* %out
  ret void
}

; <4 x i32>: same compare + select shape as the float case, with the integer
; immediate 1 as the inserted value.
; GCN-LABEL: {{^}}int4_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1, v{{[0-9]+}}, [[CC4]]
; GCN: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @int4_inselt(<4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x i32> %vec, i32 1, i32 %sel
  store <4 x i32> %v, <4 x i32> addrspace(1)* %out
  ret void
}

; <2 x float>: two compare + select pairs and a dwordx2 store.
; GCN-LABEL: {{^}}float2_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC2]]
; GCN: flat_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST]]:[[ELT_LAST]]]
define amdgpu_kernel void @float2_inselt(<2 x float> addrspace(1)* %out, <2 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <2 x float> %vec, float 1.000000e+00, i32 %sel
  store <2 x float> %v, <2 x float> addrspace(1)* %out
  ret void
}

; <8 x float>: eight compare + select pairs (indices 0..7) and two dwordx4
; stores. IDX is captured once and reused so that all eight compares are
; required to read the same index register.
; GCN-LABEL: {{^}}float8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 3
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC2:[^,]+]], [[IDX]], 2
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC3:[^,]+]], [[IDX]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC3]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC4:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST0:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC4]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC5:[^,]+]], [[IDX]], 7
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_LAST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC5]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC6:[^,]+]], [[IDX]], 6
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC6]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC7:[^,]+]], [[IDX]], 5
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, [[CC7]]
; GCN-DAG: v_cmp_ne_u32_e64 [[CC8:[^,]+]], [[IDX]], 4
; GCN-DAG: v_cndmask_b32_e32 v[[ELT_FIRST1:[0-9]+]], 1.0, v{{[0-9]+}}, [[CC8]]
; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST0]]:[[ELT_LAST0]]]
; GCN-DAG: flat_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[ELT_FIRST1]]:[[ELT_LAST1]]]
define amdgpu_kernel void @float8_inselt(<8 x float> addrspace(1)* %out, <8 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x float> %vec, float 1.000000e+00, i32 %sel
  store <8 x float> %v, <8 x float> addrspace(1)* %out
  ret void
}

; <16 x float>: the only expectation is that v_movreld_b32 is emitted.
; GCN-LABEL: {{^}}float16_inselt:
; GCN: v_movreld_b32
define amdgpu_kernel void @float16_inselt(<16 x float> addrspace(1)* %out, <16 x float> %vec, i32 %sel) {
entry:
  %v = insertelement <16 x float> %vec, float 1.000000e+00, i32 %sel
  store <16 x float> %v, <16 x float> addrspace(1)* %out
  ret void
}

; <4 x half>: no selects expected; the index is shifted left by 4, used as the
; amount of a 64-bit scalar shift, and an s_and_b32 with 0x3c00 follows.
; GCN-LABEL: {{^}}half4_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c00
define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel
  store <4 x half> %v, <4 x half> addrspace(1)* %out
  ret void
}

; <2 x half>: no selects expected; 0xffff is shifted by the scaled index and
; the result is the first source operand of a v_bfi_b32.
; GCN-LABEL: {{^}}half2_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @half2_inselt(<2 x half> addrspace(1)* %out, <2 x half> %vec, i32 %sel) {
entry:
  %v = insertelement <2 x half> %vec, half 1.000000e+00, i32 %sel
  store <2 x half> %v, <2 x half> addrspace(1)* %out
  ret void
}

; <8 x half>: compares against each index 0..7, eight selects, and four
; v_or_b32_sdwa instructions.
; GCN-LABEL: {{^}}half8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 1
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 2
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 3
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 4
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 5
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 6
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 7
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
define amdgpu_kernel void @half8_inselt(<8 x half> addrspace(1)* %out, <8 x half> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x half> %vec, half 1.000000e+00, i32 %sel
  store <8 x half> %v, <8 x half> addrspace(1)* %out
  ret void
}

; <2 x i16>: same shift + v_bfi_b32 shape as the <2 x half> case, with the
; immediate 1 as the inserted value operand.
; GCN-LABEL: {{^}}short2_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]]
; GCN: v_bfi_b32 v{{[0-9]+}}, [[V]], 1, v{{[0-9]+}}
define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {
entry:
  %v = insertelement <2 x i16> %vec, i16 1, i32 %sel
  store <2 x i16> %v, <2 x i16> addrspace(1)* %out
  ret void
}

; <4 x i16>: same scalar shift sequence as the <4 x half> case, ending in an
; s_and_b32 with 1.
; GCN-LABEL: {{^}}short4_inselt:
; GCN-NOT: v_cndmask_b32
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x i16> %vec, i16 1, i32 %sel
  store <4 x i16> %v, <4 x i16> addrspace(1)* %out
  ret void
}

; <8 x i8>: scalar shift sequence with the index shifted left by 3, ending in
; an s_and_b32 with 1.
; GCN-LABEL: {{^}}byte8_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN: s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3
; GCN: s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]]
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x i8> %vec, i8 1, i32 %sel
  store <8 x i8> %v, <8 x i8> addrspace(1)* %out
  ret void
}

; <16 x i8>: the compares against the first (0) and last (15) index are
; checked, along with sixteen selects and eight v_or_b32_sdwa instructions.
; GCN-LABEL: {{^}}byte16_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 0
; GCN-DAG: v_cmp_ne_u32_e64 {{[^,]+}}, {{s[0-9]+}}, 15
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_cndmask_b32_e32
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
; GCN-DAG: v_or_b32_sdwa
define amdgpu_kernel void @byte16_inselt(<16 x i8> addrspace(1)* %out, <16 x i8> %vec, i32 %sel) {
entry:
  %v = insertelement <16 x i8> %vec, i8 1, i32 %sel
  store <16 x i8> %v, <16 x i8> addrspace(1)* %out
  ret void
}

; <2 x double>: one v_cmp_eq per element (indices 1 and 0), each feeding two
; selects (an e32 form and an e64 form with a 0 operand).
; GCN-LABEL: {{^}}double2_inselt:
; GCN-NOT: v_movrel
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_eq_u32_e64 [[CC1:[^,]+]], [[IDX:s[0-9]+]], 1
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC1]]
; GCN-DAG: v_cmp_eq_u32_e64 [[CC2:[^,]+]], [[IDX]], 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, [[CC2]]
; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 0, [[CC2]]
define amdgpu_kernel void @double2_inselt(<2 x double> addrspace(1)* %out, <2 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <2 x double> %vec, double 1.000000e+00, i32 %sel
  store <2 x double> %v, <2 x double> addrspace(1)* %out
  ret void
}

; <8 x double>: no selects expected; two buffer_store_dword are followed by
; sixteen buffer_load_dword.
; GCN-LABEL: {{^}}double8_inselt:
; GCN-NOT: v_cndmask
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
define amdgpu_kernel void @double8_inselt(<8 x double> addrspace(1)* %out, <8 x double> %vec, i32 %sel) {
entry:
  %v = insertelement <8 x double> %vec, double 1.000000e+00, i32 %sel
  store <8 x double> %v, <8 x double> addrspace(1)* %out
  ret void
}

; <4 x i1>: one buffer_store_byte followed by four buffer_load_ubyte.
; GCN-LABEL: {{^}}bit4_inselt:
; GCN: buffer_store_byte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
define amdgpu_kernel void @bit4_inselt(<4 x i1> addrspace(1)* %out, <4 x i1> %vec, i32 %sel) {
entry:
  %v = insertelement <4 x i1> %vec, i1 1, i32 %sel
  store <4 x i1> %v, <4 x i1> addrspace(1)* %out
  ret void
}

; <128 x i1>: no buffer_ instructions; the compare + select for index 0 and
; for the last index are checked, the latter comparing against 0x7f held in a
; vector register.
; GCN-LABEL: {{^}}bit128_inselt:
; GCN-NOT: buffer_
; GCN-DAG: v_cmp_ne_u32_e64 [[CC1:[^,]+]], s{{[0-9]+}}, 0
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CC1]]
; GCN-DAG: v_mov_b32_e32 [[LASTIDX:v[0-9]+]], 0x7f
; GCN-DAG: v_cmp_ne_u32_e32 [[CCL:[^,]+]], s{{[0-9]+}}, [[LASTIDX]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}, [[CCL]]
define amdgpu_kernel void @bit128_inselt(<128 x i1> addrspace(1)* %out, <128 x i1> %vec, i32 %sel) {
entry:
  %v = insertelement <128 x i1> %vec, i1 1, i32 %sel
  store <128 x i1> %v, <128 x i1> addrspace(1)* %out
  ret void
}