; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s

; FIXME: Broken on evergreen
; FIXME: For some reason the 8 and 16 vectors are being stored as
; individual elements instead of 128-bit stores.

define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
; SI-LABEL: insertelement_v2f32_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v2f32_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, 0x40a00000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0
  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind {
; SI-LABEL: insertelement_v2f32_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v2f32_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0x40a00000
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1
  store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
; SI-LABEL: insertelement_v2i32_0:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v2i32_0:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, 0x3e7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i32> %a, i32 999, i32 0
  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
; SI-LABEL: insertelement_v2i32_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
; SI-NEXT:    s_mov_b32 s3, 0x100f000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v1, 0x3e7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: insertelement_v2i32_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x8
; VI-NEXT:    s_mov_b32 s3, 0x1100f000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v1, 0x3e7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %vecins = insertelement <2 x i32> %a, i32 999, i32 1
  store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16
  ret void
}

; FIXME: Why is the constant moved into the intermediate register and
; not just directly into the vector component?
127define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 128; SI-LABEL: insertelement_v4f32_0: 129; SI: ; %bb.0: 130; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 131; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 132; SI-NEXT: s_waitcnt lgkmcnt(0) 133; SI-NEXT: s_mov_b32 s4, 0x40a00000 134; SI-NEXT: s_mov_b32 s3, 0x100f000 135; SI-NEXT: s_mov_b32 s2, -1 136; SI-NEXT: v_mov_b32_e32 v0, s4 137; SI-NEXT: v_mov_b32_e32 v1, s5 138; SI-NEXT: v_mov_b32_e32 v2, s6 139; SI-NEXT: v_mov_b32_e32 v3, s7 140; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 141; SI-NEXT: s_endpgm 142; 143; VI-LABEL: insertelement_v4f32_0: 144; VI: ; %bb.0: 145; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 146; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 147; VI-NEXT: s_waitcnt lgkmcnt(0) 148; VI-NEXT: s_mov_b32 s4, 0x40a00000 149; VI-NEXT: s_mov_b32 s3, 0x1100f000 150; VI-NEXT: s_mov_b32 s2, -1 151; VI-NEXT: v_mov_b32_e32 v0, s4 152; VI-NEXT: v_mov_b32_e32 v1, s5 153; VI-NEXT: v_mov_b32_e32 v2, s6 154; VI-NEXT: v_mov_b32_e32 v3, s7 155; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 156; VI-NEXT: s_endpgm 157 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 158 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 159 ret void 160} 161 162define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 163; SI-LABEL: insertelement_v4f32_1: 164; SI: ; %bb.0: 165; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 166; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 167; SI-NEXT: s_waitcnt lgkmcnt(0) 168; SI-NEXT: s_mov_b32 s5, 0x40a00000 169; SI-NEXT: s_mov_b32 s3, 0x100f000 170; SI-NEXT: s_mov_b32 s2, -1 171; SI-NEXT: v_mov_b32_e32 v0, s4 172; SI-NEXT: v_mov_b32_e32 v1, s5 173; SI-NEXT: v_mov_b32_e32 v2, s6 174; SI-NEXT: v_mov_b32_e32 v3, s7 175; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 176; SI-NEXT: s_endpgm 177; 178; VI-LABEL: insertelement_v4f32_1: 179; VI: 
; %bb.0: 180; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 181; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 182; VI-NEXT: s_waitcnt lgkmcnt(0) 183; VI-NEXT: s_mov_b32 s5, 0x40a00000 184; VI-NEXT: s_mov_b32 s3, 0x1100f000 185; VI-NEXT: s_mov_b32 s2, -1 186; VI-NEXT: v_mov_b32_e32 v0, s4 187; VI-NEXT: v_mov_b32_e32 v1, s5 188; VI-NEXT: v_mov_b32_e32 v2, s6 189; VI-NEXT: v_mov_b32_e32 v3, s7 190; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 191; VI-NEXT: s_endpgm 192 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 193 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 194 ret void 195} 196 197define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 198; SI-LABEL: insertelement_v4f32_2: 199; SI: ; %bb.0: 200; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 201; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 202; SI-NEXT: s_waitcnt lgkmcnt(0) 203; SI-NEXT: s_mov_b32 s6, 0x40a00000 204; SI-NEXT: s_mov_b32 s3, 0x100f000 205; SI-NEXT: s_mov_b32 s2, -1 206; SI-NEXT: v_mov_b32_e32 v0, s4 207; SI-NEXT: v_mov_b32_e32 v1, s5 208; SI-NEXT: v_mov_b32_e32 v2, s6 209; SI-NEXT: v_mov_b32_e32 v3, s7 210; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 211; SI-NEXT: s_endpgm 212; 213; VI-LABEL: insertelement_v4f32_2: 214; VI: ; %bb.0: 215; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 216; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 217; VI-NEXT: s_waitcnt lgkmcnt(0) 218; VI-NEXT: s_mov_b32 s6, 0x40a00000 219; VI-NEXT: s_mov_b32 s3, 0x1100f000 220; VI-NEXT: s_mov_b32 s2, -1 221; VI-NEXT: v_mov_b32_e32 v0, s4 222; VI-NEXT: v_mov_b32_e32 v1, s5 223; VI-NEXT: v_mov_b32_e32 v2, s6 224; VI-NEXT: v_mov_b32_e32 v3, s7 225; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 226; VI-NEXT: s_endpgm 227 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 228 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 229 ret void 230} 231 232define amdgpu_kernel void 
@insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 233; SI-LABEL: insertelement_v4f32_3: 234; SI: ; %bb.0: 235; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 236; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 237; SI-NEXT: s_waitcnt lgkmcnt(0) 238; SI-NEXT: s_mov_b32 s7, 0x40a00000 239; SI-NEXT: s_mov_b32 s3, 0x100f000 240; SI-NEXT: s_mov_b32 s2, -1 241; SI-NEXT: v_mov_b32_e32 v0, s4 242; SI-NEXT: v_mov_b32_e32 v1, s5 243; SI-NEXT: v_mov_b32_e32 v2, s6 244; SI-NEXT: v_mov_b32_e32 v3, s7 245; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 246; SI-NEXT: s_endpgm 247; 248; VI-LABEL: insertelement_v4f32_3: 249; VI: ; %bb.0: 250; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 251; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 252; VI-NEXT: s_waitcnt lgkmcnt(0) 253; VI-NEXT: s_mov_b32 s7, 0x40a00000 254; VI-NEXT: s_mov_b32 s3, 0x1100f000 255; VI-NEXT: s_mov_b32 s2, -1 256; VI-NEXT: v_mov_b32_e32 v0, s4 257; VI-NEXT: v_mov_b32_e32 v1, s5 258; VI-NEXT: v_mov_b32_e32 v2, s6 259; VI-NEXT: v_mov_b32_e32 v3, s7 260; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 261; VI-NEXT: s_endpgm 262 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 263 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 264 ret void 265} 266 267define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { 268; SI-LABEL: insertelement_v4i32_0: 269; SI: ; %bb.0: 270; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 271; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 272; SI-NEXT: s_waitcnt lgkmcnt(0) 273; SI-NEXT: s_movk_i32 s4, 0x3e7 274; SI-NEXT: s_mov_b32 s3, 0x100f000 275; SI-NEXT: s_mov_b32 s2, -1 276; SI-NEXT: v_mov_b32_e32 v0, s4 277; SI-NEXT: v_mov_b32_e32 v1, s5 278; SI-NEXT: v_mov_b32_e32 v2, s6 279; SI-NEXT: v_mov_b32_e32 v3, s7 280; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 281; SI-NEXT: s_endpgm 282; 283; VI-LABEL: insertelement_v4i32_0: 284; VI: ; %bb.0: 285; VI-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 286; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 287; VI-NEXT: s_waitcnt lgkmcnt(0) 288; VI-NEXT: s_movk_i32 s4, 0x3e7 289; VI-NEXT: s_mov_b32 s3, 0x1100f000 290; VI-NEXT: s_mov_b32 s2, -1 291; VI-NEXT: v_mov_b32_e32 v0, s4 292; VI-NEXT: v_mov_b32_e32 v1, s5 293; VI-NEXT: v_mov_b32_e32 v2, s6 294; VI-NEXT: v_mov_b32_e32 v3, s7 295; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 296; VI-NEXT: s_endpgm 297 %vecins = insertelement <4 x i32> %a, i32 999, i32 0 298 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 299 ret void 300} 301 302define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { 303; SI-LABEL: insertelement_v3f32_1: 304; SI: ; %bb.0: 305; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 306; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 307; SI-NEXT: s_mov_b32 s3, 0x100f000 308; SI-NEXT: s_mov_b32 s2, -1 309; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 310; SI-NEXT: s_waitcnt lgkmcnt(0) 311; SI-NEXT: v_mov_b32_e32 v0, s4 312; SI-NEXT: v_mov_b32_e32 v2, s6 313; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 314; SI-NEXT: s_endpgm 315; 316; VI-LABEL: insertelement_v3f32_1: 317; VI: ; %bb.0: 318; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 319; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 320; VI-NEXT: s_mov_b32 s3, 0x1100f000 321; VI-NEXT: s_mov_b32 s2, -1 322; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 323; VI-NEXT: s_waitcnt lgkmcnt(0) 324; VI-NEXT: v_mov_b32_e32 v0, s4 325; VI-NEXT: v_mov_b32_e32 v2, s6 326; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 327; VI-NEXT: s_endpgm 328 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1 329 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 330 ret void 331} 332 333define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { 334; SI-LABEL: insertelement_v3f32_2: 335; SI: ; %bb.0: 336; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 337; SI-NEXT: 
s_load_dwordx4 s[4:7], s[4:5], 0x4 338; SI-NEXT: s_mov_b32 s3, 0x100f000 339; SI-NEXT: s_mov_b32 s2, -1 340; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000 341; SI-NEXT: s_waitcnt lgkmcnt(0) 342; SI-NEXT: v_mov_b32_e32 v0, s4 343; SI-NEXT: v_mov_b32_e32 v1, s5 344; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 345; SI-NEXT: s_endpgm 346; 347; VI-LABEL: insertelement_v3f32_2: 348; VI: ; %bb.0: 349; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 350; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 351; VI-NEXT: s_mov_b32 s3, 0x1100f000 352; VI-NEXT: s_mov_b32 s2, -1 353; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 354; VI-NEXT: s_waitcnt lgkmcnt(0) 355; VI-NEXT: v_mov_b32_e32 v0, s4 356; VI-NEXT: v_mov_b32_e32 v1, s5 357; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 358; VI-NEXT: s_endpgm 359 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2 360 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 361 ret void 362} 363 364define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { 365; GCN-LABEL: insertelement_v3f32_3: 366; GCN: ; %bb.0: 367; GCN-NEXT: s_endpgm 368 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3 369 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 370 ret void 371} 372 373define <4 x float> @insertelement_to_sgpr() nounwind { 374; GCN-LABEL: insertelement_to_sgpr: 375; GCN: ; %bb.0: 376; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 377; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 378; GCN-NEXT: s_waitcnt lgkmcnt(0) 379; GCN-NEXT: s_mov_b32 s12, 0 380; GCN-NEXT: s_mov_b32 s4, s12 381; GCN-NEXT: s_mov_b32 s5, s12 382; GCN-NEXT: s_mov_b32 s6, s12 383; GCN-NEXT: s_mov_b32 s7, s12 384; GCN-NEXT: s_mov_b32 s8, s12 385; GCN-NEXT: s_mov_b32 s9, s12 386; GCN-NEXT: s_mov_b32 s10, s12 387; GCN-NEXT: s_mov_b32 s11, s12 388; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[12:15] dmask:0x1 389; GCN-NEXT: s_waitcnt vmcnt(0) 390; GCN-NEXT: 
s_setpc_b64 s[30:31] 391 %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef 392 %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 393 %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0) 394 ret <4 x float> %tmp2 395} 396 397define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { 398; SI-LABEL: dynamic_insertelement_v2f32: 399; SI: ; %bb.0: 400; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 401; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 402; SI-NEXT: s_load_dword s4, s[4:5], 0x4 403; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 404; SI-NEXT: s_mov_b32 s3, 0x100f000 405; SI-NEXT: s_mov_b32 s2, -1 406; SI-NEXT: s_waitcnt lgkmcnt(0) 407; SI-NEXT: v_mov_b32_e32 v1, s7 408; SI-NEXT: s_cmp_lg_u32 s4, 1 409; SI-NEXT: s_cselect_b64 vcc, -1, 0 410; SI-NEXT: s_cmp_lg_u32 s4, 0 411; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 412; SI-NEXT: v_mov_b32_e32 v2, s6 413; SI-NEXT: s_cselect_b64 vcc, -1, 0 414; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 415; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 416; SI-NEXT: s_endpgm 417; 418; VI-LABEL: dynamic_insertelement_v2f32: 419; VI: ; %bb.0: 420; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 421; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 422; VI-NEXT: s_load_dword s4, s[4:5], 0x10 423; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 424; VI-NEXT: s_mov_b32 s3, 0x1100f000 425; VI-NEXT: s_mov_b32 s2, -1 426; VI-NEXT: s_waitcnt lgkmcnt(0) 427; VI-NEXT: v_mov_b32_e32 v1, s7 428; VI-NEXT: s_cmp_lg_u32 s4, 1 429; VI-NEXT: s_cselect_b64 vcc, -1, 0 430; VI-NEXT: s_cmp_lg_u32 s4, 0 431; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 432; VI-NEXT: v_mov_b32_e32 v2, s6 433; VI-NEXT: s_cselect_b64 vcc, -1, 0 434; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 435; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 436; VI-NEXT: s_endpgm 437 %vecins = insertelement <2 x float> %a, float 
5.000000e+00, i32 %b 438 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 439 ret void 440} 441 442define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { 443; SI-LABEL: dynamic_insertelement_v3f32: 444; SI: ; %bb.0: 445; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 446; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 447; SI-NEXT: s_load_dword s4, s[4:5], 0x8 448; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 449; SI-NEXT: s_mov_b32 s3, 0x100f000 450; SI-NEXT: s_mov_b32 s2, -1 451; SI-NEXT: s_waitcnt lgkmcnt(0) 452; SI-NEXT: v_mov_b32_e32 v1, s10 453; SI-NEXT: s_cmp_lg_u32 s4, 2 454; SI-NEXT: s_cselect_b64 vcc, -1, 0 455; SI-NEXT: s_cmp_lg_u32 s4, 1 456; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 457; SI-NEXT: s_cselect_b64 vcc, -1, 0 458; SI-NEXT: v_mov_b32_e32 v1, s9 459; SI-NEXT: s_cmp_lg_u32 s4, 0 460; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 461; SI-NEXT: v_mov_b32_e32 v3, s8 462; SI-NEXT: s_cselect_b64 vcc, -1, 0 463; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 464; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 465; SI-NEXT: s_endpgm 466; 467; VI-LABEL: dynamic_insertelement_v3f32: 468; VI: ; %bb.0: 469; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 470; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 471; VI-NEXT: s_load_dword s4, s[4:5], 0x20 472; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 473; VI-NEXT: s_mov_b32 s3, 0x1100f000 474; VI-NEXT: s_mov_b32 s2, -1 475; VI-NEXT: s_waitcnt lgkmcnt(0) 476; VI-NEXT: v_mov_b32_e32 v1, s10 477; VI-NEXT: s_cmp_lg_u32 s4, 2 478; VI-NEXT: s_cselect_b64 vcc, -1, 0 479; VI-NEXT: s_cmp_lg_u32 s4, 1 480; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 481; VI-NEXT: s_cselect_b64 vcc, -1, 0 482; VI-NEXT: v_mov_b32_e32 v1, s9 483; VI-NEXT: s_cmp_lg_u32 s4, 0 484; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 485; VI-NEXT: v_mov_b32_e32 v3, s8 486; VI-NEXT: s_cselect_b64 vcc, -1, 0 487; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 488; VI-NEXT: buffer_store_dwordx3 
v[0:2], off, s[0:3], 0 489; VI-NEXT: s_endpgm 490 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b 491 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 492 ret void 493} 494 495define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { 496; SI-LABEL: dynamic_insertelement_v4f32: 497; SI: ; %bb.0: 498; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 499; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 500; SI-NEXT: s_load_dword s4, s[4:5], 0x8 501; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 502; SI-NEXT: s_mov_b32 s3, 0x100f000 503; SI-NEXT: s_mov_b32 s2, -1 504; SI-NEXT: s_waitcnt lgkmcnt(0) 505; SI-NEXT: v_mov_b32_e32 v1, s11 506; SI-NEXT: s_cmp_lg_u32 s4, 3 507; SI-NEXT: s_cselect_b64 vcc, -1, 0 508; SI-NEXT: s_cmp_lg_u32 s4, 2 509; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 510; SI-NEXT: s_cselect_b64 vcc, -1, 0 511; SI-NEXT: v_mov_b32_e32 v1, s10 512; SI-NEXT: s_cmp_lg_u32 s4, 1 513; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 514; SI-NEXT: s_cselect_b64 vcc, -1, 0 515; SI-NEXT: v_mov_b32_e32 v1, s9 516; SI-NEXT: s_cmp_lg_u32 s4, 0 517; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 518; SI-NEXT: v_mov_b32_e32 v4, s8 519; SI-NEXT: s_cselect_b64 vcc, -1, 0 520; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 521; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 522; SI-NEXT: s_endpgm 523; 524; VI-LABEL: dynamic_insertelement_v4f32: 525; VI: ; %bb.0: 526; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 527; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 528; VI-NEXT: s_load_dword s4, s[4:5], 0x20 529; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 530; VI-NEXT: s_mov_b32 s3, 0x1100f000 531; VI-NEXT: s_mov_b32 s2, -1 532; VI-NEXT: s_waitcnt lgkmcnt(0) 533; VI-NEXT: v_mov_b32_e32 v1, s11 534; VI-NEXT: s_cmp_lg_u32 s4, 3 535; VI-NEXT: s_cselect_b64 vcc, -1, 0 536; VI-NEXT: s_cmp_lg_u32 s4, 2 537; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 538; VI-NEXT: s_cselect_b64 vcc, -1, 0 539; VI-NEXT: 
v_mov_b32_e32 v1, s10 540; VI-NEXT: s_cmp_lg_u32 s4, 1 541; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 542; VI-NEXT: s_cselect_b64 vcc, -1, 0 543; VI-NEXT: v_mov_b32_e32 v1, s9 544; VI-NEXT: s_cmp_lg_u32 s4, 0 545; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 546; VI-NEXT: v_mov_b32_e32 v4, s8 547; VI-NEXT: s_cselect_b64 vcc, -1, 0 548; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 549; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 550; VI-NEXT: s_endpgm 551 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b 552 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 553 ret void 554} 555 556define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { 557; SI-LABEL: dynamic_insertelement_v8f32: 558; SI: ; %bb.0: 559; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 560; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 561; SI-NEXT: s_load_dword s4, s[4:5], 0x10 562; SI-NEXT: v_mov_b32_e32 v4, 0x40a00000 563; SI-NEXT: s_mov_b32 s3, 0x100f000 564; SI-NEXT: s_mov_b32 s2, -1 565; SI-NEXT: s_waitcnt lgkmcnt(0) 566; SI-NEXT: v_mov_b32_e32 v0, s11 567; SI-NEXT: s_cmp_lg_u32 s4, 3 568; SI-NEXT: s_cselect_b64 vcc, -1, 0 569; SI-NEXT: s_cmp_lg_u32 s4, 2 570; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc 571; SI-NEXT: s_cselect_b64 vcc, -1, 0 572; SI-NEXT: v_mov_b32_e32 v0, s10 573; SI-NEXT: s_cmp_lg_u32 s4, 1 574; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc 575; SI-NEXT: s_cselect_b64 vcc, -1, 0 576; SI-NEXT: v_mov_b32_e32 v0, s9 577; SI-NEXT: s_cmp_lg_u32 s4, 0 578; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc 579; SI-NEXT: s_cselect_b64 vcc, -1, 0 580; SI-NEXT: v_mov_b32_e32 v0, s8 581; SI-NEXT: s_cmp_lg_u32 s4, 7 582; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 583; SI-NEXT: s_cselect_b64 vcc, -1, 0 584; SI-NEXT: v_mov_b32_e32 v5, s15 585; SI-NEXT: s_cmp_lg_u32 s4, 6 586; SI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc 587; SI-NEXT: s_cselect_b64 vcc, -1, 0 588; SI-NEXT: v_mov_b32_e32 v5, s14 589; 
SI-NEXT: s_cmp_lg_u32 s4, 5 590; SI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc 591; SI-NEXT: s_cselect_b64 vcc, -1, 0 592; SI-NEXT: v_mov_b32_e32 v5, s13 593; SI-NEXT: s_cmp_lg_u32 s4, 4 594; SI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc 595; SI-NEXT: v_mov_b32_e32 v8, s12 596; SI-NEXT: s_cselect_b64 vcc, -1, 0 597; SI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc 598; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 599; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 600; SI-NEXT: s_endpgm 601; 602; VI-LABEL: dynamic_insertelement_v8f32: 603; VI: ; %bb.0: 604; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 605; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 606; VI-NEXT: s_load_dword s4, s[4:5], 0x40 607; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000 608; VI-NEXT: s_mov_b32 s3, 0x1100f000 609; VI-NEXT: s_mov_b32 s2, -1 610; VI-NEXT: s_waitcnt lgkmcnt(0) 611; VI-NEXT: v_mov_b32_e32 v0, s11 612; VI-NEXT: s_cmp_lg_u32 s4, 3 613; VI-NEXT: s_cselect_b64 vcc, -1, 0 614; VI-NEXT: s_cmp_lg_u32 s4, 2 615; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc 616; VI-NEXT: s_cselect_b64 vcc, -1, 0 617; VI-NEXT: v_mov_b32_e32 v0, s10 618; VI-NEXT: s_cmp_lg_u32 s4, 1 619; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc 620; VI-NEXT: s_cselect_b64 vcc, -1, 0 621; VI-NEXT: v_mov_b32_e32 v0, s9 622; VI-NEXT: s_cmp_lg_u32 s4, 0 623; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc 624; VI-NEXT: s_cselect_b64 vcc, -1, 0 625; VI-NEXT: v_mov_b32_e32 v0, s8 626; VI-NEXT: s_cmp_lg_u32 s4, 7 627; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 628; VI-NEXT: s_cselect_b64 vcc, -1, 0 629; VI-NEXT: v_mov_b32_e32 v5, s15 630; VI-NEXT: s_cmp_lg_u32 s4, 6 631; VI-NEXT: v_cndmask_b32_e32 v7, v4, v5, vcc 632; VI-NEXT: s_cselect_b64 vcc, -1, 0 633; VI-NEXT: v_mov_b32_e32 v5, s14 634; VI-NEXT: s_cmp_lg_u32 s4, 5 635; VI-NEXT: v_cndmask_b32_e32 v6, v4, v5, vcc 636; VI-NEXT: s_cselect_b64 vcc, -1, 0 637; VI-NEXT: v_mov_b32_e32 v5, s13 638; VI-NEXT: s_cmp_lg_u32 s4, 4 639; VI-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc 640; VI-NEXT: 
v_mov_b32_e32 v8, s12 641; VI-NEXT: s_cselect_b64 vcc, -1, 0 642; VI-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc 643; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 644; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 645; VI-NEXT: s_endpgm 646 %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b 647 store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 648 ret void 649} 650 651define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { 652; SI-LABEL: dynamic_insertelement_v16f32: 653; SI: ; %bb.0: 654; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 655; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 656; SI-NEXT: s_load_dword s4, s[4:5], 0x20 657; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000 658; SI-NEXT: s_mov_b32 s3, 0x100f000 659; SI-NEXT: s_mov_b32 s2, -1 660; SI-NEXT: s_waitcnt lgkmcnt(0) 661; SI-NEXT: v_mov_b32_e32 v0, s8 662; SI-NEXT: v_mov_b32_e32 v1, s9 663; SI-NEXT: v_mov_b32_e32 v2, s10 664; SI-NEXT: v_mov_b32_e32 v3, s11 665; SI-NEXT: v_mov_b32_e32 v4, s12 666; SI-NEXT: v_mov_b32_e32 v5, s13 667; SI-NEXT: v_mov_b32_e32 v6, s14 668; SI-NEXT: v_mov_b32_e32 v7, s15 669; SI-NEXT: v_mov_b32_e32 v8, s16 670; SI-NEXT: v_mov_b32_e32 v9, s17 671; SI-NEXT: v_mov_b32_e32 v10, s18 672; SI-NEXT: v_mov_b32_e32 v11, s19 673; SI-NEXT: v_mov_b32_e32 v12, s20 674; SI-NEXT: v_mov_b32_e32 v13, s21 675; SI-NEXT: v_mov_b32_e32 v14, s22 676; SI-NEXT: v_mov_b32_e32 v15, s23 677; SI-NEXT: s_mov_b32 m0, s4 678; SI-NEXT: v_movreld_b32_e32 v0, v16 679; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 680; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 681; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 682; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 683; SI-NEXT: s_endpgm 684; 685; VI-LABEL: dynamic_insertelement_v16f32: 686; VI: ; %bb.0: 687; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 688; VI-NEXT: s_load_dwordx16 
s[8:23], s[4:5], 0x40 689; VI-NEXT: s_load_dword s4, s[4:5], 0x80 690; VI-NEXT: v_mov_b32_e32 v16, 0x40a00000 691; VI-NEXT: s_mov_b32 s3, 0x1100f000 692; VI-NEXT: s_mov_b32 s2, -1 693; VI-NEXT: s_waitcnt lgkmcnt(0) 694; VI-NEXT: v_mov_b32_e32 v0, s8 695; VI-NEXT: v_mov_b32_e32 v1, s9 696; VI-NEXT: v_mov_b32_e32 v2, s10 697; VI-NEXT: v_mov_b32_e32 v3, s11 698; VI-NEXT: v_mov_b32_e32 v4, s12 699; VI-NEXT: v_mov_b32_e32 v5, s13 700; VI-NEXT: v_mov_b32_e32 v6, s14 701; VI-NEXT: v_mov_b32_e32 v7, s15 702; VI-NEXT: v_mov_b32_e32 v8, s16 703; VI-NEXT: v_mov_b32_e32 v9, s17 704; VI-NEXT: v_mov_b32_e32 v10, s18 705; VI-NEXT: v_mov_b32_e32 v11, s19 706; VI-NEXT: v_mov_b32_e32 v12, s20 707; VI-NEXT: v_mov_b32_e32 v13, s21 708; VI-NEXT: v_mov_b32_e32 v14, s22 709; VI-NEXT: v_mov_b32_e32 v15, s23 710; VI-NEXT: s_mov_b32 m0, s4 711; VI-NEXT: v_movreld_b32_e32 v0, v16 712; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 713; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 714; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 715; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 716; VI-NEXT: s_endpgm 717 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b 718 store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 719 ret void 720} 721 722define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { 723; SI-LABEL: dynamic_insertelement_v2i32: 724; SI: ; %bb.0: 725; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 726; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 727; SI-NEXT: s_load_dword s4, s[4:5], 0x4 728; SI-NEXT: s_mov_b32 s3, 0x100f000 729; SI-NEXT: s_mov_b32 s2, -1 730; SI-NEXT: s_waitcnt lgkmcnt(0) 731; SI-NEXT: v_mov_b32_e32 v0, s7 732; SI-NEXT: s_cmp_lg_u32 s4, 1 733; SI-NEXT: s_cselect_b64 vcc, -1, 0 734; SI-NEXT: s_cmp_lg_u32 s4, 0 735; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 736; SI-NEXT: v_mov_b32_e32 v0, s6 737; SI-NEXT: s_cselect_b64 
vcc, -1, 0 738; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 739; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 740; SI-NEXT: s_endpgm 741; 742; VI-LABEL: dynamic_insertelement_v2i32: 743; VI: ; %bb.0: 744; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 745; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 746; VI-NEXT: s_load_dword s4, s[4:5], 0x10 747; VI-NEXT: s_mov_b32 s3, 0x1100f000 748; VI-NEXT: s_mov_b32 s2, -1 749; VI-NEXT: s_waitcnt lgkmcnt(0) 750; VI-NEXT: s_cmp_lg_u32 s4, 1 751; VI-NEXT: s_cselect_b32 s5, s7, 5 752; VI-NEXT: s_cmp_lg_u32 s4, 0 753; VI-NEXT: s_cselect_b32 s4, s6, 5 754; VI-NEXT: v_mov_b32_e32 v0, s4 755; VI-NEXT: v_mov_b32_e32 v1, s5 756; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 757; VI-NEXT: s_endpgm 758 %vecins = insertelement <2 x i32> %a, i32 5, i32 %b 759 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 760 ret void 761} 762 763define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { 764; SI-LABEL: dynamic_insertelement_v3i32: 765; SI: ; %bb.0: 766; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 767; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 768; SI-NEXT: s_load_dword s4, s[4:5], 0x8 769; SI-NEXT: s_mov_b32 s3, 0x100f000 770; SI-NEXT: s_mov_b32 s2, -1 771; SI-NEXT: s_waitcnt lgkmcnt(0) 772; SI-NEXT: v_mov_b32_e32 v0, s10 773; SI-NEXT: s_cmp_lg_u32 s4, 2 774; SI-NEXT: s_cselect_b64 vcc, -1, 0 775; SI-NEXT: s_cmp_lg_u32 s4, 1 776; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc 777; SI-NEXT: s_cselect_b64 vcc, -1, 0 778; SI-NEXT: v_mov_b32_e32 v0, s9 779; SI-NEXT: s_cmp_lg_u32 s4, 0 780; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 781; SI-NEXT: v_mov_b32_e32 v0, s8 782; SI-NEXT: s_cselect_b64 vcc, -1, 0 783; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 784; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 785; SI-NEXT: s_endpgm 786; 787; VI-LABEL: dynamic_insertelement_v3i32: 788; VI: ; %bb.0: 789; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 790; VI-NEXT: 
s_load_dwordx4 s[8:11], s[4:5], 0x10 791; VI-NEXT: s_load_dword s4, s[4:5], 0x20 792; VI-NEXT: s_mov_b32 s3, 0x1100f000 793; VI-NEXT: s_mov_b32 s2, -1 794; VI-NEXT: s_waitcnt lgkmcnt(0) 795; VI-NEXT: s_cmp_lg_u32 s4, 2 796; VI-NEXT: s_cselect_b32 s5, s10, 5 797; VI-NEXT: s_cmp_lg_u32 s4, 1 798; VI-NEXT: s_cselect_b32 s6, s9, 5 799; VI-NEXT: s_cmp_lg_u32 s4, 0 800; VI-NEXT: s_cselect_b32 s4, s8, 5 801; VI-NEXT: v_mov_b32_e32 v0, s4 802; VI-NEXT: v_mov_b32_e32 v1, s6 803; VI-NEXT: v_mov_b32_e32 v2, s5 804; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 805; VI-NEXT: s_endpgm 806 %vecins = insertelement <3 x i32> %a, i32 5, i32 %b 807 store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 808 ret void 809} 810 811define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { 812; SI-LABEL: dynamic_insertelement_v4i32: 813; SI: ; %bb.0: 814; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 815; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 816; SI-NEXT: s_load_dword s6, s[4:5], 0x8 817; SI-NEXT: s_load_dword s4, s[4:5], 0x11 818; SI-NEXT: s_mov_b32 s3, 0x100f000 819; SI-NEXT: s_mov_b32 s2, -1 820; SI-NEXT: s_waitcnt lgkmcnt(0) 821; SI-NEXT: v_mov_b32_e32 v0, s11 822; SI-NEXT: s_cmp_eq_u32 s6, 3 823; SI-NEXT: s_cselect_b64 vcc, -1, 0 824; SI-NEXT: v_mov_b32_e32 v4, s4 825; SI-NEXT: s_cmp_eq_u32 s6, 2 826; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc 827; SI-NEXT: s_cselect_b64 vcc, -1, 0 828; SI-NEXT: v_mov_b32_e32 v0, s10 829; SI-NEXT: s_cmp_eq_u32 s6, 1 830; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc 831; SI-NEXT: s_cselect_b64 vcc, -1, 0 832; SI-NEXT: v_mov_b32_e32 v0, s9 833; SI-NEXT: s_cmp_eq_u32 s6, 0 834; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc 835; SI-NEXT: v_mov_b32_e32 v0, s8 836; SI-NEXT: s_cselect_b64 vcc, -1, 0 837; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 838; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 839; SI-NEXT: s_endpgm 840; 841; VI-LABEL: 
dynamic_insertelement_v4i32: 842; VI: ; %bb.0: 843; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 844; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 845; VI-NEXT: s_load_dword s6, s[4:5], 0x20 846; VI-NEXT: s_load_dword s4, s[4:5], 0x44 847; VI-NEXT: s_mov_b32 s3, 0x1100f000 848; VI-NEXT: s_mov_b32 s2, -1 849; VI-NEXT: s_waitcnt lgkmcnt(0) 850; VI-NEXT: s_cmp_eq_u32 s6, 3 851; VI-NEXT: s_cselect_b32 s5, s4, s11 852; VI-NEXT: s_cmp_eq_u32 s6, 2 853; VI-NEXT: s_cselect_b32 s7, s4, s10 854; VI-NEXT: s_cmp_eq_u32 s6, 1 855; VI-NEXT: s_cselect_b32 s9, s4, s9 856; VI-NEXT: s_cmp_eq_u32 s6, 0 857; VI-NEXT: s_cselect_b32 s4, s4, s8 858; VI-NEXT: v_mov_b32_e32 v0, s4 859; VI-NEXT: v_mov_b32_e32 v1, s9 860; VI-NEXT: v_mov_b32_e32 v2, s7 861; VI-NEXT: v_mov_b32_e32 v3, s5 862; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 863; VI-NEXT: s_endpgm 864 %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b 865 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 866 ret void 867} 868 869define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { 870; SI-LABEL: dynamic_insertelement_v8i32: 871; SI: ; %bb.0: 872; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 873; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 874; SI-NEXT: s_load_dword s4, s[4:5], 0x10 875; SI-NEXT: s_mov_b32 s3, 0x100f000 876; SI-NEXT: s_mov_b32 s2, -1 877; SI-NEXT: s_waitcnt lgkmcnt(0) 878; SI-NEXT: v_mov_b32_e32 v0, s11 879; SI-NEXT: s_cmp_lg_u32 s4, 3 880; SI-NEXT: s_cselect_b64 vcc, -1, 0 881; SI-NEXT: s_cmp_lg_u32 s4, 2 882; SI-NEXT: v_cndmask_b32_e32 v3, 5, v0, vcc 883; SI-NEXT: s_cselect_b64 vcc, -1, 0 884; SI-NEXT: v_mov_b32_e32 v0, s10 885; SI-NEXT: s_cmp_lg_u32 s4, 1 886; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc 887; SI-NEXT: s_cselect_b64 vcc, -1, 0 888; SI-NEXT: v_mov_b32_e32 v0, s9 889; SI-NEXT: s_cmp_lg_u32 s4, 0 890; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 891; SI-NEXT: s_cselect_b64 vcc, -1, 0 892; SI-NEXT: v_mov_b32_e32 v0, 
s8 893; SI-NEXT: s_cmp_lg_u32 s4, 7 894; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 895; SI-NEXT: s_cselect_b64 vcc, -1, 0 896; SI-NEXT: v_mov_b32_e32 v4, s15 897; SI-NEXT: s_cmp_lg_u32 s4, 6 898; SI-NEXT: v_cndmask_b32_e32 v7, 5, v4, vcc 899; SI-NEXT: s_cselect_b64 vcc, -1, 0 900; SI-NEXT: v_mov_b32_e32 v4, s14 901; SI-NEXT: s_cmp_lg_u32 s4, 5 902; SI-NEXT: v_cndmask_b32_e32 v6, 5, v4, vcc 903; SI-NEXT: s_cselect_b64 vcc, -1, 0 904; SI-NEXT: v_mov_b32_e32 v4, s13 905; SI-NEXT: s_cmp_lg_u32 s4, 4 906; SI-NEXT: v_cndmask_b32_e32 v5, 5, v4, vcc 907; SI-NEXT: v_mov_b32_e32 v4, s12 908; SI-NEXT: s_cselect_b64 vcc, -1, 0 909; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 910; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 911; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 912; SI-NEXT: s_endpgm 913; 914; VI-LABEL: dynamic_insertelement_v8i32: 915; VI: ; %bb.0: 916; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 917; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 918; VI-NEXT: s_load_dword s4, s[4:5], 0x40 919; VI-NEXT: s_mov_b32 s3, 0x1100f000 920; VI-NEXT: s_mov_b32 s2, -1 921; VI-NEXT: s_waitcnt lgkmcnt(0) 922; VI-NEXT: s_cmp_lg_u32 s4, 3 923; VI-NEXT: s_cselect_b32 s5, s11, 5 924; VI-NEXT: s_cmp_lg_u32 s4, 2 925; VI-NEXT: s_cselect_b32 s6, s10, 5 926; VI-NEXT: s_cmp_lg_u32 s4, 1 927; VI-NEXT: s_cselect_b32 s7, s9, 5 928; VI-NEXT: s_cmp_lg_u32 s4, 0 929; VI-NEXT: s_cselect_b32 s8, s8, 5 930; VI-NEXT: s_cmp_lg_u32 s4, 7 931; VI-NEXT: s_cselect_b32 s9, s15, 5 932; VI-NEXT: s_cmp_lg_u32 s4, 6 933; VI-NEXT: s_cselect_b32 s10, s14, 5 934; VI-NEXT: s_cmp_lg_u32 s4, 5 935; VI-NEXT: s_cselect_b32 s11, s13, 5 936; VI-NEXT: s_cmp_lg_u32 s4, 4 937; VI-NEXT: s_cselect_b32 s4, s12, 5 938; VI-NEXT: v_mov_b32_e32 v0, s4 939; VI-NEXT: v_mov_b32_e32 v1, s11 940; VI-NEXT: v_mov_b32_e32 v2, s10 941; VI-NEXT: v_mov_b32_e32 v3, s9 942; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 943; VI-NEXT: s_nop 0 944; VI-NEXT: v_mov_b32_e32 v0, s8 945; VI-NEXT: 
v_mov_b32_e32 v1, s7 946; VI-NEXT: v_mov_b32_e32 v2, s6 947; VI-NEXT: v_mov_b32_e32 v3, s5 948; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 949; VI-NEXT: s_endpgm 950 %vecins = insertelement <8 x i32> %a, i32 5, i32 %b 951 store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 952 ret void 953} 954 955define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { 956; SI-LABEL: dynamic_insertelement_v16i32: 957; SI: ; %bb.0: 958; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 959; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 960; SI-NEXT: s_load_dword s4, s[4:5], 0x20 961; SI-NEXT: s_mov_b32 s3, 0x100f000 962; SI-NEXT: s_mov_b32 s2, -1 963; SI-NEXT: s_waitcnt lgkmcnt(0) 964; SI-NEXT: v_mov_b32_e32 v0, s8 965; SI-NEXT: v_mov_b32_e32 v1, s9 966; SI-NEXT: v_mov_b32_e32 v2, s10 967; SI-NEXT: v_mov_b32_e32 v3, s11 968; SI-NEXT: v_mov_b32_e32 v4, s12 969; SI-NEXT: v_mov_b32_e32 v5, s13 970; SI-NEXT: v_mov_b32_e32 v6, s14 971; SI-NEXT: v_mov_b32_e32 v7, s15 972; SI-NEXT: v_mov_b32_e32 v8, s16 973; SI-NEXT: v_mov_b32_e32 v9, s17 974; SI-NEXT: v_mov_b32_e32 v10, s18 975; SI-NEXT: v_mov_b32_e32 v11, s19 976; SI-NEXT: v_mov_b32_e32 v12, s20 977; SI-NEXT: v_mov_b32_e32 v13, s21 978; SI-NEXT: v_mov_b32_e32 v14, s22 979; SI-NEXT: v_mov_b32_e32 v15, s23 980; SI-NEXT: s_mov_b32 m0, s4 981; SI-NEXT: v_movreld_b32_e32 v0, 5 982; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 983; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 984; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 985; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 986; SI-NEXT: s_endpgm 987; 988; VI-LABEL: dynamic_insertelement_v16i32: 989; VI: ; %bb.0: 990; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 991; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 992; VI-NEXT: s_load_dword s4, s[4:5], 0x80 993; VI-NEXT: s_mov_b32 s3, 0x1100f000 994; VI-NEXT: s_mov_b32 s2, -1 995; VI-NEXT: s_waitcnt 
lgkmcnt(0) 996; VI-NEXT: v_mov_b32_e32 v0, s8 997; VI-NEXT: v_mov_b32_e32 v1, s9 998; VI-NEXT: v_mov_b32_e32 v2, s10 999; VI-NEXT: v_mov_b32_e32 v3, s11 1000; VI-NEXT: v_mov_b32_e32 v4, s12 1001; VI-NEXT: v_mov_b32_e32 v5, s13 1002; VI-NEXT: v_mov_b32_e32 v6, s14 1003; VI-NEXT: v_mov_b32_e32 v7, s15 1004; VI-NEXT: v_mov_b32_e32 v8, s16 1005; VI-NEXT: v_mov_b32_e32 v9, s17 1006; VI-NEXT: v_mov_b32_e32 v10, s18 1007; VI-NEXT: v_mov_b32_e32 v11, s19 1008; VI-NEXT: v_mov_b32_e32 v12, s20 1009; VI-NEXT: v_mov_b32_e32 v13, s21 1010; VI-NEXT: v_mov_b32_e32 v14, s22 1011; VI-NEXT: v_mov_b32_e32 v15, s23 1012; VI-NEXT: s_mov_b32 m0, s4 1013; VI-NEXT: v_movreld_b32_e32 v0, 5 1014; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 1015; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 1016; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1017; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1018; VI-NEXT: s_endpgm 1019 %vecins = insertelement <16 x i32> %a, i32 5, i32 %b 1020 store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 1021 ret void 1022} 1023 1024define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { 1025; SI-LABEL: dynamic_insertelement_v2i16: 1026; SI: ; %bb.0: 1027; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1028; SI-NEXT: s_load_dword s6, s[4:5], 0x2 1029; SI-NEXT: s_load_dword s4, s[4:5], 0x3 1030; SI-NEXT: s_mov_b32 s3, 0x100f000 1031; SI-NEXT: s_mov_b32 s2, -1 1032; SI-NEXT: s_waitcnt lgkmcnt(0) 1033; SI-NEXT: s_lshl_b32 s4, s4, 4 1034; SI-NEXT: s_lshl_b32 s4, 0xffff, s4 1035; SI-NEXT: s_andn2_b32 s5, s6, s4 1036; SI-NEXT: s_and_b32 s4, s4, 0x50005 1037; SI-NEXT: s_or_b32 s4, s4, s5 1038; SI-NEXT: v_mov_b32_e32 v0, s4 1039; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1040; SI-NEXT: s_endpgm 1041; 1042; VI-LABEL: dynamic_insertelement_v2i16: 1043; VI: ; %bb.0: 1044; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1045; VI-NEXT: 
s_load_dword s6, s[4:5], 0x8 1046; VI-NEXT: s_load_dword s4, s[4:5], 0xc 1047; VI-NEXT: s_mov_b32 s3, 0x1100f000 1048; VI-NEXT: s_mov_b32 s2, -1 1049; VI-NEXT: s_waitcnt lgkmcnt(0) 1050; VI-NEXT: s_lshl_b32 s4, s4, 4 1051; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 1052; VI-NEXT: s_andn2_b32 s5, s6, s4 1053; VI-NEXT: s_and_b32 s4, s4, 0x50005 1054; VI-NEXT: s_or_b32 s4, s4, s5 1055; VI-NEXT: v_mov_b32_e32 v0, s4 1056; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1057; VI-NEXT: s_endpgm 1058 %vecins = insertelement <2 x i16> %a, i16 5, i32 %b 1059 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 1060 ret void 1061} 1062 1063define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind { 1064; SI-LABEL: dynamic_insertelement_v3i16: 1065; SI: ; %bb.0: 1066; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1067; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 1068; SI-NEXT: s_load_dword s4, s[4:5], 0x4 1069; SI-NEXT: s_mov_b32 s3, 0x100f000 1070; SI-NEXT: s_mov_b32 s2, -1 1071; SI-NEXT: s_waitcnt lgkmcnt(0) 1072; SI-NEXT: s_lshl_b32 s8, s4, 4 1073; SI-NEXT: s_mov_b64 s[4:5], 0xffff 1074; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 1075; SI-NEXT: s_mov_b32 s8, 0x50005 1076; SI-NEXT: s_and_b32 s9, s5, s8 1077; SI-NEXT: s_and_b32 s8, s4, s8 1078; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] 1079; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 1080; SI-NEXT: v_mov_b32_e32 v0, s5 1081; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 1082; SI-NEXT: v_mov_b32_e32 v0, s4 1083; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1084; SI-NEXT: s_endpgm 1085; 1086; VI-LABEL: dynamic_insertelement_v3i16: 1087; VI: ; %bb.0: 1088; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1089; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 1090; VI-NEXT: s_load_dword s4, s[4:5], 0x10 1091; VI-NEXT: s_mov_b32 s3, 0x1100f000 1092; VI-NEXT: s_mov_b32 s2, -1 1093; VI-NEXT: s_waitcnt lgkmcnt(0) 1094; VI-NEXT: s_lshl_b32 s8, s4, 4 1095; VI-NEXT: s_mov_b64 
s[4:5], 0xffff 1096; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 1097; VI-NEXT: s_mov_b32 s8, 0x50005 1098; VI-NEXT: s_mov_b32 s9, s8 1099; VI-NEXT: s_andn2_b64 s[6:7], s[6:7], s[4:5] 1100; VI-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9] 1101; VI-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] 1102; VI-NEXT: v_mov_b32_e32 v0, s5 1103; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 1104; VI-NEXT: v_mov_b32_e32 v0, s4 1105; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1106; VI-NEXT: s_endpgm 1107 %vecins = insertelement <3 x i16> %a, i16 5, i32 %b 1108 store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8 1109 ret void 1110} 1111 1112define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { 1113; SI-LABEL: dynamic_insertelement_v2i8: 1114; SI: ; %bb.0: 1115; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1116; SI-NEXT: s_load_dword s6, s[4:5], 0xa 1117; SI-NEXT: s_load_dword s4, s[4:5], 0x13 1118; SI-NEXT: s_mov_b32 s3, 0x100f000 1119; SI-NEXT: s_mov_b32 s2, -1 1120; SI-NEXT: s_waitcnt lgkmcnt(0) 1121; SI-NEXT: s_lshl_b32 s4, s4, 3 1122; SI-NEXT: s_lshl_b32 s4, -1, s4 1123; SI-NEXT: s_andn2_b32 s5, s6, s4 1124; SI-NEXT: s_and_b32 s4, s4, 0x505 1125; SI-NEXT: s_or_b32 s4, s4, s5 1126; SI-NEXT: v_mov_b32_e32 v0, s4 1127; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1128; SI-NEXT: s_endpgm 1129; 1130; VI-LABEL: dynamic_insertelement_v2i8: 1131; VI: ; %bb.0: 1132; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1133; VI-NEXT: s_load_dword s6, s[4:5], 0x28 1134; VI-NEXT: s_load_dword s4, s[4:5], 0x4c 1135; VI-NEXT: s_mov_b32 s3, 0x1100f000 1136; VI-NEXT: s_mov_b32 s2, -1 1137; VI-NEXT: s_waitcnt lgkmcnt(0) 1138; VI-NEXT: s_lshl_b32 s4, s4, 3 1139; VI-NEXT: v_lshlrev_b16_e64 v0, s4, -1 1140; VI-NEXT: v_not_b32_e32 v1, v0 1141; VI-NEXT: v_and_b32_e32 v1, s6, v1 1142; VI-NEXT: v_and_b32_e32 v0, 0x505, v0 1143; VI-NEXT: v_or_b32_e32 v0, v0, v1 1144; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1145; 
VI-NEXT: s_endpgm 1146 %vecins = insertelement <2 x i8> %a, i8 5, i32 %b 1147 store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 1148 ret void 1149} 1150 1151; FIXME: post legalize i16 and i32 shifts aren't merged because of 1152; isTypeDesirableForOp in SimplifyDemandedBits 1153define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { 1154; SI-LABEL: dynamic_insertelement_v3i8: 1155; SI: ; %bb.0: 1156; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1157; SI-NEXT: s_load_dword s6, s[4:5], 0xa 1158; SI-NEXT: s_load_dword s4, s[4:5], 0x13 1159; SI-NEXT: s_mov_b32 s3, 0x100f000 1160; SI-NEXT: s_mov_b32 s2, -1 1161; SI-NEXT: s_waitcnt lgkmcnt(0) 1162; SI-NEXT: s_lshl_b32 s4, s4, 3 1163; SI-NEXT: s_lshl_b32 s4, 0xffff, s4 1164; SI-NEXT: s_andn2_b32 s5, s6, s4 1165; SI-NEXT: s_and_b32 s4, s4, 0x5050505 1166; SI-NEXT: s_or_b32 s4, s4, s5 1167; SI-NEXT: v_mov_b32_e32 v0, s4 1168; SI-NEXT: s_lshr_b32 s5, s4, 16 1169; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1170; SI-NEXT: v_mov_b32_e32 v0, s5 1171; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 1172; SI-NEXT: s_endpgm 1173; 1174; VI-LABEL: dynamic_insertelement_v3i8: 1175; VI: ; %bb.0: 1176; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1177; VI-NEXT: s_load_dword s6, s[4:5], 0x28 1178; VI-NEXT: s_load_dword s4, s[4:5], 0x4c 1179; VI-NEXT: s_mov_b32 s3, 0x1100f000 1180; VI-NEXT: s_mov_b32 s2, -1 1181; VI-NEXT: s_waitcnt lgkmcnt(0) 1182; VI-NEXT: s_lshl_b32 s4, s4, 3 1183; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 1184; VI-NEXT: s_andn2_b32 s5, s6, s4 1185; VI-NEXT: s_and_b32 s4, s4, 0x5050505 1186; VI-NEXT: s_or_b32 s4, s4, s5 1187; VI-NEXT: v_mov_b32_e32 v0, s4 1188; VI-NEXT: s_lshr_b32 s5, s4, 16 1189; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1190; VI-NEXT: v_mov_b32_e32 v0, s5 1191; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 1192; VI-NEXT: s_endpgm 1193 %vecins = insertelement <3 x i8> %a, i8 5, i32 %b 1194 store 
<3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4 1195 ret void 1196} 1197 1198define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { 1199; SI-LABEL: dynamic_insertelement_v4i8: 1200; SI: ; %bb.0: 1201; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1202; SI-NEXT: s_load_dword s6, s[4:5], 0xa 1203; SI-NEXT: s_load_dword s4, s[4:5], 0x13 1204; SI-NEXT: s_mov_b32 s3, 0x100f000 1205; SI-NEXT: s_mov_b32 s2, -1 1206; SI-NEXT: s_waitcnt lgkmcnt(0) 1207; SI-NEXT: s_lshl_b32 s4, s4, 3 1208; SI-NEXT: s_lshl_b32 s4, 0xffff, s4 1209; SI-NEXT: s_andn2_b32 s5, s6, s4 1210; SI-NEXT: s_and_b32 s4, s4, 0x5050505 1211; SI-NEXT: s_or_b32 s4, s4, s5 1212; SI-NEXT: v_mov_b32_e32 v0, s4 1213; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1214; SI-NEXT: s_endpgm 1215; 1216; VI-LABEL: dynamic_insertelement_v4i8: 1217; VI: ; %bb.0: 1218; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1219; VI-NEXT: s_load_dword s6, s[4:5], 0x28 1220; VI-NEXT: s_load_dword s4, s[4:5], 0x4c 1221; VI-NEXT: s_mov_b32 s3, 0x1100f000 1222; VI-NEXT: s_mov_b32 s2, -1 1223; VI-NEXT: s_waitcnt lgkmcnt(0) 1224; VI-NEXT: s_lshl_b32 s4, s4, 3 1225; VI-NEXT: s_lshl_b32 s4, 0xffff, s4 1226; VI-NEXT: s_andn2_b32 s5, s6, s4 1227; VI-NEXT: s_and_b32 s4, s4, 0x5050505 1228; VI-NEXT: s_or_b32 s4, s4, s5 1229; VI-NEXT: v_mov_b32_e32 v0, s4 1230; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1231; VI-NEXT: s_endpgm 1232 %vecins = insertelement <4 x i8> %a, i8 5, i32 %b 1233 store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4 1234 ret void 1235} 1236 1237define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind { 1238; SI-LABEL: s_dynamic_insertelement_v8i8: 1239; SI: ; %bb.0: 1240; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 1241; SI-NEXT: s_load_dword s6, s[4:5], 0x4 1242; SI-NEXT: s_mov_b32 s3, 0x100f000 1243; SI-NEXT: s_mov_b32 s2, -1 1244; SI-NEXT: s_waitcnt 
lgkmcnt(0) 1245; SI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 1246; SI-NEXT: s_mov_b32 s0, s8 1247; SI-NEXT: s_lshl_b32 s8, s6, 3 1248; SI-NEXT: s_mov_b64 s[6:7], 0xffff 1249; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 1250; SI-NEXT: s_mov_b32 s8, 0x5050505 1251; SI-NEXT: s_mov_b32 s1, s9 1252; SI-NEXT: s_and_b32 s9, s7, s8 1253; SI-NEXT: s_and_b32 s8, s6, s8 1254; SI-NEXT: s_waitcnt lgkmcnt(0) 1255; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] 1256; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 1257; SI-NEXT: v_mov_b32_e32 v0, s4 1258; SI-NEXT: v_mov_b32_e32 v1, s5 1259; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1260; SI-NEXT: s_endpgm 1261; 1262; VI-LABEL: s_dynamic_insertelement_v8i8: 1263; VI: ; %bb.0: 1264; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 1265; VI-NEXT: s_load_dword s6, s[4:5], 0x10 1266; VI-NEXT: s_mov_b32 s3, 0x1100f000 1267; VI-NEXT: s_mov_b32 s2, -1 1268; VI-NEXT: s_waitcnt lgkmcnt(0) 1269; VI-NEXT: s_load_dwordx2 s[4:5], s[10:11], 0x0 1270; VI-NEXT: s_mov_b32 s0, s8 1271; VI-NEXT: s_lshl_b32 s8, s6, 3 1272; VI-NEXT: s_mov_b64 s[6:7], 0xffff 1273; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 1274; VI-NEXT: s_mov_b32 s8, 0x5050505 1275; VI-NEXT: s_mov_b32 s1, s9 1276; VI-NEXT: s_and_b32 s9, s7, s8 1277; VI-NEXT: s_and_b32 s8, s6, s8 1278; VI-NEXT: s_waitcnt lgkmcnt(0) 1279; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] 1280; VI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 1281; VI-NEXT: v_mov_b32_e32 v0, s4 1282; VI-NEXT: v_mov_b32_e32 v1, s5 1283; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1284; VI-NEXT: s_endpgm 1285 %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4 1286 %vecins = insertelement <8 x i8> %a, i8 5, i32 %b 1287 store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8 1288 ret void 1289} 1290 1291define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { 1292; SI-LABEL: dynamic_insertelement_v16i8: 1293; SI: ; %bb.0: 1294; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x0 1295; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 1296; SI-NEXT: s_load_dword s4, s[4:5], 0x8 1297; SI-NEXT: s_mov_b32 s3, 0x100f000 1298; SI-NEXT: s_mov_b32 s2, -1 1299; SI-NEXT: s_waitcnt lgkmcnt(0) 1300; SI-NEXT: s_lshr_b32 s5, s11, 24 1301; SI-NEXT: s_cmp_lg_u32 s4, 15 1302; SI-NEXT: v_mov_b32_e32 v0, s5 1303; SI-NEXT: s_cselect_b64 vcc, -1, 0 1304; SI-NEXT: s_lshr_b32 s5, s11, 16 1305; SI-NEXT: s_cmp_lg_u32 s4, 14 1306; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1307; SI-NEXT: v_mov_b32_e32 v1, s5 1308; SI-NEXT: s_cselect_b64 vcc, -1, 0 1309; SI-NEXT: s_lshr_b32 s6, s11, 8 1310; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1311; SI-NEXT: s_movk_i32 s5, 0xff 1312; SI-NEXT: s_cmp_lg_u32 s4, 13 1313; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1314; SI-NEXT: v_and_b32_e32 v1, s5, v1 1315; SI-NEXT: v_or_b32_e32 v0, v1, v0 1316; SI-NEXT: v_mov_b32_e32 v1, s6 1317; SI-NEXT: s_cselect_b64 vcc, -1, 0 1318; SI-NEXT: s_cmp_lg_u32 s4, 12 1319; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1320; SI-NEXT: s_cselect_b64 vcc, -1, 0 1321; SI-NEXT: v_mov_b32_e32 v2, s11 1322; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1323; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1324; SI-NEXT: v_and_b32_e32 v2, s5, v2 1325; SI-NEXT: v_or_b32_e32 v1, v2, v1 1326; SI-NEXT: s_mov_b32 s6, 0xffff 1327; SI-NEXT: s_lshr_b32 s7, s10, 24 1328; SI-NEXT: s_cmp_lg_u32 s4, 11 1329; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1330; SI-NEXT: v_and_b32_e32 v1, s6, v1 1331; SI-NEXT: v_or_b32_e32 v3, v1, v0 1332; SI-NEXT: v_mov_b32_e32 v0, s7 1333; SI-NEXT: s_cselect_b64 vcc, -1, 0 1334; SI-NEXT: s_lshr_b32 s7, s10, 16 1335; SI-NEXT: s_cmp_lg_u32 s4, 10 1336; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1337; SI-NEXT: v_mov_b32_e32 v1, s7 1338; SI-NEXT: s_cselect_b64 vcc, -1, 0 1339; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1340; SI-NEXT: s_lshr_b32 s7, s10, 8 1341; SI-NEXT: s_cmp_lg_u32 s4, 9 1342; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1343; SI-NEXT: v_and_b32_e32 v1, s5, v1 1344; SI-NEXT: v_or_b32_e32 v0, v1, v0 1345; SI-NEXT: 
s_cselect_b64 vcc, -1, 0 1346; SI-NEXT: v_mov_b32_e32 v1, s7 1347; SI-NEXT: s_cmp_lg_u32 s4, 8 1348; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1349; SI-NEXT: s_cselect_b64 vcc, -1, 0 1350; SI-NEXT: v_mov_b32_e32 v2, s10 1351; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1352; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1353; SI-NEXT: v_and_b32_e32 v2, s5, v2 1354; SI-NEXT: v_or_b32_e32 v1, v2, v1 1355; SI-NEXT: s_lshr_b32 s7, s9, 24 1356; SI-NEXT: s_cmp_lg_u32 s4, 7 1357; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1358; SI-NEXT: v_and_b32_e32 v1, s6, v1 1359; SI-NEXT: v_or_b32_e32 v2, v1, v0 1360; SI-NEXT: v_mov_b32_e32 v0, s7 1361; SI-NEXT: s_cselect_b64 vcc, -1, 0 1362; SI-NEXT: s_lshr_b32 s7, s9, 16 1363; SI-NEXT: s_cmp_lg_u32 s4, 6 1364; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1365; SI-NEXT: v_mov_b32_e32 v1, s7 1366; SI-NEXT: s_cselect_b64 vcc, -1, 0 1367; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1368; SI-NEXT: s_lshr_b32 s7, s9, 8 1369; SI-NEXT: s_cmp_lg_u32 s4, 5 1370; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1371; SI-NEXT: v_and_b32_e32 v1, s5, v1 1372; SI-NEXT: v_or_b32_e32 v0, v1, v0 1373; SI-NEXT: s_cselect_b64 vcc, -1, 0 1374; SI-NEXT: v_mov_b32_e32 v1, s7 1375; SI-NEXT: s_cmp_lg_u32 s4, 4 1376; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1377; SI-NEXT: s_cselect_b64 vcc, -1, 0 1378; SI-NEXT: v_mov_b32_e32 v4, s9 1379; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1380; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1381; SI-NEXT: v_and_b32_e32 v4, s5, v4 1382; SI-NEXT: v_or_b32_e32 v1, v4, v1 1383; SI-NEXT: s_lshr_b32 s7, s8, 24 1384; SI-NEXT: s_cmp_lg_u32 s4, 3 1385; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1386; SI-NEXT: v_and_b32_e32 v1, s6, v1 1387; SI-NEXT: v_or_b32_e32 v1, v1, v0 1388; SI-NEXT: v_mov_b32_e32 v0, s7 1389; SI-NEXT: s_cselect_b64 vcc, -1, 0 1390; SI-NEXT: s_lshr_b32 s7, s8, 16 1391; SI-NEXT: s_cmp_lg_u32 s4, 2 1392; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1393; SI-NEXT: v_mov_b32_e32 v4, s7 1394; SI-NEXT: s_cselect_b64 vcc, -1, 0 1395; SI-NEXT: v_cndmask_b32_e32 
v4, 5, v4, vcc 1396; SI-NEXT: s_lshr_b32 s7, s8, 8 1397; SI-NEXT: s_cmp_lg_u32 s4, 1 1398; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1399; SI-NEXT: v_and_b32_e32 v4, s5, v4 1400; SI-NEXT: v_or_b32_e32 v0, v4, v0 1401; SI-NEXT: s_cselect_b64 vcc, -1, 0 1402; SI-NEXT: v_mov_b32_e32 v4, s7 1403; SI-NEXT: s_cmp_lg_u32 s4, 0 1404; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1405; SI-NEXT: v_mov_b32_e32 v5, s8 1406; SI-NEXT: s_cselect_b64 vcc, -1, 0 1407; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc 1408; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 1409; SI-NEXT: v_and_b32_e32 v5, s5, v5 1410; SI-NEXT: v_or_b32_e32 v4, v5, v4 1411; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1412; SI-NEXT: v_and_b32_e32 v4, s6, v4 1413; SI-NEXT: v_or_b32_e32 v0, v4, v0 1414; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1415; SI-NEXT: s_endpgm 1416; 1417; VI-LABEL: dynamic_insertelement_v16i8: 1418; VI: ; %bb.0: 1419; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1420; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 1421; VI-NEXT: s_load_dword s4, s[4:5], 0x20 1422; VI-NEXT: s_mov_b32 s3, 0x1100f000 1423; VI-NEXT: s_mov_b32 s2, -1 1424; VI-NEXT: s_waitcnt lgkmcnt(0) 1425; VI-NEXT: s_lshr_b32 s5, s11, 24 1426; VI-NEXT: s_cmp_lg_u32 s4, 15 1427; VI-NEXT: v_mov_b32_e32 v0, s5 1428; VI-NEXT: s_cselect_b64 vcc, -1, 0 1429; VI-NEXT: s_lshr_b32 s5, s11, 16 1430; VI-NEXT: s_cmp_lg_u32 s4, 14 1431; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1432; VI-NEXT: v_mov_b32_e32 v1, s5 1433; VI-NEXT: s_cselect_b64 vcc, -1, 0 1434; VI-NEXT: s_lshr_b32 s5, s11, 8 1435; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1436; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1437; VI-NEXT: s_cmp_lg_u32 s4, 13 1438; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1439; VI-NEXT: s_cselect_b64 vcc, -1, 0 1440; VI-NEXT: v_mov_b32_e32 v1, s5 1441; VI-NEXT: s_cmp_lg_u32 s4, 12 1442; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1443; VI-NEXT: s_cselect_b64 vcc, -1, 0 1444; VI-NEXT: v_mov_b32_e32 v2, s11 
1445; VI-NEXT: s_lshr_b32 s5, s10, 24 1446; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1447; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1448; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1449; VI-NEXT: s_cmp_lg_u32 s4, 11 1450; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1451; VI-NEXT: v_mov_b32_e32 v0, s5 1452; VI-NEXT: s_cselect_b64 vcc, -1, 0 1453; VI-NEXT: s_lshr_b32 s5, s10, 16 1454; VI-NEXT: s_cmp_lg_u32 s4, 10 1455; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1456; VI-NEXT: v_mov_b32_e32 v1, s5 1457; VI-NEXT: s_cselect_b64 vcc, -1, 0 1458; VI-NEXT: s_lshr_b32 s5, s10, 8 1459; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1460; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1461; VI-NEXT: s_cmp_lg_u32 s4, 9 1462; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1463; VI-NEXT: s_cselect_b64 vcc, -1, 0 1464; VI-NEXT: v_mov_b32_e32 v1, s5 1465; VI-NEXT: s_cmp_lg_u32 s4, 8 1466; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1467; VI-NEXT: s_cselect_b64 vcc, -1, 0 1468; VI-NEXT: v_mov_b32_e32 v2, s10 1469; VI-NEXT: s_lshr_b32 s5, s9, 24 1470; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1471; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1472; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1473; VI-NEXT: s_cmp_lg_u32 s4, 7 1474; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1475; VI-NEXT: v_mov_b32_e32 v0, s5 1476; VI-NEXT: s_cselect_b64 vcc, -1, 0 1477; VI-NEXT: s_lshr_b32 s5, s9, 16 1478; VI-NEXT: s_cmp_lg_u32 s4, 6 1479; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1480; VI-NEXT: v_mov_b32_e32 v1, s5 1481; VI-NEXT: s_cselect_b64 vcc, -1, 0 1482; VI-NEXT: s_lshr_b32 s5, s9, 8 1483; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1484; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1485; VI-NEXT: s_cmp_lg_u32 s4, 5 1486; VI-NEXT: v_or_b32_sdwa v0, 
v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1487; VI-NEXT: s_cselect_b64 vcc, -1, 0 1488; VI-NEXT: v_mov_b32_e32 v1, s5 1489; VI-NEXT: s_cmp_lg_u32 s4, 4 1490; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1491; VI-NEXT: s_cselect_b64 vcc, -1, 0 1492; VI-NEXT: v_mov_b32_e32 v4, s9 1493; VI-NEXT: s_lshr_b32 s5, s8, 24 1494; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1495; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1496; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1497; VI-NEXT: s_cmp_lg_u32 s4, 3 1498; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1499; VI-NEXT: v_mov_b32_e32 v0, s5 1500; VI-NEXT: s_cselect_b64 vcc, -1, 0 1501; VI-NEXT: s_lshr_b32 s5, s8, 16 1502; VI-NEXT: s_cmp_lg_u32 s4, 2 1503; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1504; VI-NEXT: v_mov_b32_e32 v4, s5 1505; VI-NEXT: s_cselect_b64 vcc, -1, 0 1506; VI-NEXT: s_lshr_b32 s5, s8, 8 1507; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1508; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1509; VI-NEXT: s_cmp_lg_u32 s4, 1 1510; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1511; VI-NEXT: s_cselect_b64 vcc, -1, 0 1512; VI-NEXT: v_mov_b32_e32 v4, s5 1513; VI-NEXT: s_cmp_lg_u32 s4, 0 1514; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1515; VI-NEXT: v_mov_b32_e32 v5, s8 1516; VI-NEXT: s_cselect_b64 vcc, -1, 0 1517; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 1518; VI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc 1519; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1520; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1521; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1522; VI-NEXT: s_endpgm 1523 %vecins = insertelement <16 x i8> %a, i8 5, i32 %b 1524 store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16 1525 ret void 1526} 1527 
1528; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that 1529; the compiler doesn't crash. 1530define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { 1531; SI-LABEL: insert_split_bb: 1532; SI: ; %bb.0: ; %entry 1533; SI-NEXT: s_load_dword s6, s[4:5], 0x4 1534; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1535; SI-NEXT: s_waitcnt lgkmcnt(0) 1536; SI-NEXT: s_cmp_lg_u32 s6, 0 1537; SI-NEXT: s_cbranch_scc0 BB30_2 1538; SI-NEXT: ; %bb.1: ; %else 1539; SI-NEXT: s_load_dword s7, s[2:3], 0x1 1540; SI-NEXT: s_mov_b64 s[4:5], 0 1541; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] 1542; SI-NEXT: s_waitcnt lgkmcnt(0) 1543; SI-NEXT: s_mov_b64 vcc, vcc 1544; SI-NEXT: s_cbranch_vccz BB30_3 1545; SI-NEXT: s_branch BB30_4 1546; SI-NEXT: BB30_2: 1547; SI-NEXT: BB30_3: ; %if 1548; SI-NEXT: s_load_dword s7, s[2:3], 0x0 1549; SI-NEXT: BB30_4: ; %endif 1550; SI-NEXT: s_waitcnt lgkmcnt(0) 1551; SI-NEXT: v_mov_b32_e32 v0, s6 1552; SI-NEXT: s_mov_b32 s3, 0x100f000 1553; SI-NEXT: s_mov_b32 s2, -1 1554; SI-NEXT: v_mov_b32_e32 v1, s7 1555; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1556; SI-NEXT: s_endpgm 1557; 1558; VI-LABEL: insert_split_bb: 1559; VI: ; %bb.0: ; %entry 1560; VI-NEXT: s_load_dword s6, s[4:5], 0x10 1561; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1562; VI-NEXT: s_waitcnt lgkmcnt(0) 1563; VI-NEXT: s_cmp_lg_u32 s6, 0 1564; VI-NEXT: s_cbranch_scc0 BB30_2 1565; VI-NEXT: ; %bb.1: ; %else 1566; VI-NEXT: s_load_dword s7, s[2:3], 0x4 1567; VI-NEXT: s_cbranch_execz BB30_3 1568; VI-NEXT: s_branch BB30_4 1569; VI-NEXT: BB30_2: 1570; VI-NEXT: BB30_3: ; %if 1571; VI-NEXT: s_waitcnt lgkmcnt(0) 1572; VI-NEXT: s_load_dword s7, s[2:3], 0x0 1573; VI-NEXT: BB30_4: ; %endif 1574; VI-NEXT: s_waitcnt lgkmcnt(0) 1575; VI-NEXT: v_mov_b32_e32 v0, s6 1576; VI-NEXT: s_mov_b32 s3, 0x1100f000 1577; VI-NEXT: s_mov_b32 s2, -1 1578; VI-NEXT: v_mov_b32_e32 v1, s7 1579; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1580; 
VI-NEXT: s_endpgm
entry:
  %0 = insertelement <2 x i32> undef, i32 %a, i32 0
  %1 = icmp eq i32 %a, 0
  br i1 %1, label %if, label %else

if:
  %2 = load i32, i32 addrspace(1)* %in
  %3 = insertelement <2 x i32> %0, i32 %2, i32 1
  br label %endif

else:
  %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %5 = load i32, i32 addrspace(1)* %4
  %6 = insertelement <2 x i32> %0, i32 %5, i32 1
  br label %endif

endif:
  %7 = phi <2 x i32> [%3, %if], [%6, %else]
  store <2 x i32> %7, <2 x i32> addrspace(1)* %out
  ret void
}

; Dynamic (variable-index) insert of the constant 8.0 into a <2 x double>
; kernel argument; the [8 x i32] padding arguments force the vector and the
; index %b to distinct kernarg offsets (0xc/0x18 for SI, 0x30/0x60 for VI).
; The CHECK bodies below each define are autogenerated by
; utils/update_llc_test_checks.py -- regenerate rather than hand-edit them.
define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xc
; SI-NEXT: s_load_dword s4, s[4:5], 0x18
; SI-NEXT: v_mov_b32_e32 v1, 0x40200000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: s_cmp_eq_u32 s4, 1
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: s_cmp_eq_u32 s4, 0
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x30
; VI-NEXT: s_load_dword s4, s[4:5], 0x60
; VI-NEXT: v_mov_b32_e32 v1, 0x40200000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: s_cmp_eq_u32 s4, 1
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: s_cmp_eq_u32 s4, 0
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
  store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
  ret void
}

; Dynamic insert of the constant i64 5 into a <2 x i64>; lowered as per-dword
; compare-and-select against the insert index (low dword gets 5, high dword
; gets 0), then a single 128-bit store despite the align 8 on the store.
define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v2i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x8
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s6, 1
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: s_cmp_eq_u32 s6, 0
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s6, s[4:5], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s6, 1
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: s_cmp_eq_u32 s6, 0
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
  store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
  ret void
}

; Odd-sized vector case: <3 x i64> needs a split store (dwordx4 for elements
; 0-1 plus a dwordx2 at offset 16 for element 2); the select-per-dword
; pattern covers all three possible insert indices.
define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v3i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xc
; SI-NEXT: s_load_dword s12, s[4:5], 0x10
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: v_mov_b32_e32 v4, s7
; SI-NEXT: s_cmp_eq_u32 s12, 1
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: s_cmp_eq_u32 s12, 0
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: s_cmp_eq_u32 s12, 2
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5]
; SI-NEXT: v_mov_b32_e32 v4, s6
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5]
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v3i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x30
; VI-NEXT: s_load_dword s12, s[4:5], 0x40
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: s_cmp_eq_u32 s12, 1
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: s_cmp_eq_u32 s12, 0
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[4:5]
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_cmp_eq_u32 s12, 2
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[4:5]
; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[4:5]
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[4:5]
; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
  store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
  ret void
}

; Dynamic insert of 8.0 into a <4 x double> (0x40200000 is the high dword of
; the double 8.0; the low dword is 0); still lowered with per-dword selects
; and two 128-bit stores.
define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
; SI-LABEL: dynamic_insertelement_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
; SI-NEXT: s_load_dword s4, s[4:5], 0x10
; SI-NEXT: v_mov_b32_e32 v4, 0x40200000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s11
; SI-NEXT: s_cmp_eq_u32 s4, 1
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v0, s10
; SI-NEXT: s_cmp_eq_u32 s4, 0
; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v0, s9
; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: s_cmp_eq_u32 s4, 3
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v5, s15
; SI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
; SI-NEXT: v_mov_b32_e32 v5, s14
; SI-NEXT: s_cmp_eq_u32 s4, 2
; SI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc
; SI-NEXT: v_mov_b32_e32 v5, s13
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
; SI-NEXT: v_mov_b32_e32 v4, s12
; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
; VI-NEXT: s_load_dword s4, s[4:5], 0x40
; VI-NEXT: v_mov_b32_e32 v4, 0x40200000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: s_cmp_eq_u32 s4, 1
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: s_cmp_eq_u32 s4, 0
; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_cmp_eq_u32 s4, 3
; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_mov_b32_e32 v5, s15
; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc
; VI-NEXT: v_mov_b32_e32 v5, s14
; VI-NEXT: s_cmp_eq_u32 s4, 2
; VI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
  store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
  ret void
}

; Largest case: <8 x double>. At this width the lowering switches strategy
; from chained selects to indexed register writes (m0 set from the scaled
; index, then a v_movreld pair writing the low/high dwords of 8.0), followed
; by four 128-bit stores.
define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
; SI-LABEL: dynamic_insertelement_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10
; SI-NEXT: s_load_dword s4, s[4:5], 0x20
; SI-NEXT: v_mov_b32_e32 v16, 0x40200000
; SI-NEXT: s_mov_b32 s3, 0x100f000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: s_lshl_b32 s4, s4, 1
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: v_mov_b32_e32 v2, s10
; SI-NEXT: v_mov_b32_e32 v3, s11
; SI-NEXT: v_mov_b32_e32 v4, s12
; SI-NEXT: v_mov_b32_e32 v5, s13
; SI-NEXT: v_mov_b32_e32 v6, s14
; SI-NEXT: v_mov_b32_e32 v7, s15
; SI-NEXT: v_mov_b32_e32 v8, s16
; SI-NEXT: v_mov_b32_e32 v9, s17
; SI-NEXT: v_mov_b32_e32 v10, s18
; SI-NEXT: v_mov_b32_e32 v11, s19
; SI-NEXT: v_mov_b32_e32 v12, s20
; SI-NEXT: v_mov_b32_e32 v13, s21
; SI-NEXT: v_mov_b32_e32 v14, s22
; SI-NEXT: v_mov_b32_e32 v15, s23
; SI-NEXT: s_mov_b32 m0, s4
; SI-NEXT: v_movreld_b32_e32 v0, 0
; SI-NEXT: v_movreld_b32_e32 v1, v16
; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: dynamic_insertelement_v8f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
; VI-NEXT: s_load_dword s4, s[4:5], 0x80
; VI-NEXT: v_mov_b32_e32 v16, 0x40200000
; VI-NEXT: s_mov_b32 s3, 0x1100f000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_lshl_b32 s4, s4, 1
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_mov_b32_e32 v5, s13
; VI-NEXT: v_mov_b32_e32 v6, s14
; VI-NEXT: v_mov_b32_e32 v7, s15
; VI-NEXT: v_mov_b32_e32 v8, s16
; VI-NEXT: v_mov_b32_e32 v9, s17
; VI-NEXT: v_mov_b32_e32 v10, s18
; VI-NEXT: v_mov_b32_e32 v11, s19
; VI-NEXT: v_mov_b32_e32 v12, s20
; VI-NEXT: v_mov_b32_e32 v13, s21
; VI-NEXT: v_mov_b32_e32 v14, s22
; VI-NEXT: v_mov_b32_e32 v15, s23
; VI-NEXT: s_mov_b32 m0, s4
; VI-NEXT: v_movreld_b32_e32 v0, 0
; VI-NEXT: v_movreld_b32_e32 v1, v16
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
  %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
  store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
  ret void
}

; NOTE(review): this declaration appears unused by the functions visible in
; this chunk -- presumably referenced elsewhere in the file; verify before
; removing.
declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }