1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s 3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s 4 5; FIXME: Broken on evergreen 6; FIXME: For some reason the 8 and 16 vectors are being stored as 7; individual elements instead of 128-bit stores. 8 9define amdgpu_kernel void @insertelement_v2f32_0(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind { 10; SI-LABEL: insertelement_v2f32_0: 11; SI: ; %bb.0: 12; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 13; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 14; SI-NEXT: s_mov_b32 s3, 0x100f000 15; SI-NEXT: s_mov_b32 s2, -1 16; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 17; SI-NEXT: s_waitcnt lgkmcnt(0) 18; SI-NEXT: v_mov_b32_e32 v1, s7 19; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 20; SI-NEXT: s_endpgm 21; 22; VI-LABEL: insertelement_v2f32_0: 23; VI: ; %bb.0: 24; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 25; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 26; VI-NEXT: s_mov_b32 s3, 0x1100f000 27; VI-NEXT: s_mov_b32 s2, -1 28; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 29; VI-NEXT: s_waitcnt lgkmcnt(0) 30; VI-NEXT: v_mov_b32_e32 v1, s7 31; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 32; VI-NEXT: s_endpgm 33 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 0 34 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16 35 ret void 36} 37 38define amdgpu_kernel void @insertelement_v2f32_1(<2 x float> addrspace(1)* %out, <2 x float> %a) nounwind { 39; SI-LABEL: insertelement_v2f32_1: 40; SI: ; %bb.0: 41; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 42; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 43; SI-NEXT: s_mov_b32 s3, 0x100f000 44; 
SI-NEXT: s_mov_b32 s2, -1 45; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 46; SI-NEXT: s_waitcnt lgkmcnt(0) 47; SI-NEXT: v_mov_b32_e32 v0, s6 48; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 49; SI-NEXT: s_endpgm 50; 51; VI-LABEL: insertelement_v2f32_1: 52; VI: ; %bb.0: 53; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 54; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 55; VI-NEXT: s_mov_b32 s3, 0x1100f000 56; VI-NEXT: s_mov_b32 s2, -1 57; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 58; VI-NEXT: s_waitcnt lgkmcnt(0) 59; VI-NEXT: v_mov_b32_e32 v0, s6 60; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 61; VI-NEXT: s_endpgm 62 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 1 63 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 16 64 ret void 65} 66 67define amdgpu_kernel void @insertelement_v2i32_0(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { 68; SI-LABEL: insertelement_v2i32_0: 69; SI: ; %bb.0: 70; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 71; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 72; SI-NEXT: s_mov_b32 s3, 0x100f000 73; SI-NEXT: s_mov_b32 s2, -1 74; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 75; SI-NEXT: s_waitcnt lgkmcnt(0) 76; SI-NEXT: v_mov_b32_e32 v1, s7 77; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 78; SI-NEXT: s_endpgm 79; 80; VI-LABEL: insertelement_v2i32_0: 81; VI: ; %bb.0: 82; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 83; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 84; VI-NEXT: s_mov_b32 s3, 0x1100f000 85; VI-NEXT: s_mov_b32 s2, -1 86; VI-NEXT: v_mov_b32_e32 v0, 0x3e7 87; VI-NEXT: s_waitcnt lgkmcnt(0) 88; VI-NEXT: v_mov_b32_e32 v1, s7 89; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 90; VI-NEXT: s_endpgm 91 %vecins = insertelement <2 x i32> %a, i32 999, i32 0 92 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16 93 ret void 94} 95 96define amdgpu_kernel void @insertelement_v2i32_1(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { 97; SI-LABEL: insertelement_v2i32_1: 98; SI: ; %bb.0: 
99; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 100; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 101; SI-NEXT: s_mov_b32 s3, 0x100f000 102; SI-NEXT: s_mov_b32 s2, -1 103; SI-NEXT: v_mov_b32_e32 v1, 0x3e7 104; SI-NEXT: s_waitcnt lgkmcnt(0) 105; SI-NEXT: v_mov_b32_e32 v0, s6 106; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 107; SI-NEXT: s_endpgm 108; 109; VI-LABEL: insertelement_v2i32_1: 110; VI: ; %bb.0: 111; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 112; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 113; VI-NEXT: s_mov_b32 s3, 0x1100f000 114; VI-NEXT: s_mov_b32 s2, -1 115; VI-NEXT: v_mov_b32_e32 v1, 0x3e7 116; VI-NEXT: s_waitcnt lgkmcnt(0) 117; VI-NEXT: v_mov_b32_e32 v0, s6 118; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 119; VI-NEXT: s_endpgm 120 %vecins = insertelement <2 x i32> %a, i32 999, i32 1 121 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 16 122 ret void 123} 124 125; FIXME: Why is the constant moved into the intermediate register and 126; not just directly into the vector component? 
127define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 128; SI-LABEL: insertelement_v4f32_0: 129; SI: ; %bb.0: 130; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 131; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 132; SI-NEXT: s_waitcnt lgkmcnt(0) 133; SI-NEXT: s_mov_b32 s0, 0x40a00000 134; SI-NEXT: s_mov_b32 s7, 0x100f000 135; SI-NEXT: s_mov_b32 s6, -1 136; SI-NEXT: v_mov_b32_e32 v0, s0 137; SI-NEXT: v_mov_b32_e32 v1, s1 138; SI-NEXT: v_mov_b32_e32 v2, s2 139; SI-NEXT: v_mov_b32_e32 v3, s3 140; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 141; SI-NEXT: s_endpgm 142; 143; VI-LABEL: insertelement_v4f32_0: 144; VI: ; %bb.0: 145; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 146; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 147; VI-NEXT: s_waitcnt lgkmcnt(0) 148; VI-NEXT: s_mov_b32 s0, 0x40a00000 149; VI-NEXT: s_mov_b32 s7, 0x1100f000 150; VI-NEXT: s_mov_b32 s6, -1 151; VI-NEXT: v_mov_b32_e32 v0, s0 152; VI-NEXT: v_mov_b32_e32 v1, s1 153; VI-NEXT: v_mov_b32_e32 v2, s2 154; VI-NEXT: v_mov_b32_e32 v3, s3 155; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 156; VI-NEXT: s_endpgm 157 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 158 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 159 ret void 160} 161 162define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 163; SI-LABEL: insertelement_v4f32_1: 164; SI: ; %bb.0: 165; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 166; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 167; SI-NEXT: s_waitcnt lgkmcnt(0) 168; SI-NEXT: s_mov_b32 s1, 0x40a00000 169; SI-NEXT: s_mov_b32 s7, 0x100f000 170; SI-NEXT: s_mov_b32 s6, -1 171; SI-NEXT: v_mov_b32_e32 v0, s0 172; SI-NEXT: v_mov_b32_e32 v1, s1 173; SI-NEXT: v_mov_b32_e32 v2, s2 174; SI-NEXT: v_mov_b32_e32 v3, s3 175; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 176; SI-NEXT: s_endpgm 177; 178; VI-LABEL: insertelement_v4f32_1: 179; VI: 
; %bb.0: 180; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 181; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 182; VI-NEXT: s_waitcnt lgkmcnt(0) 183; VI-NEXT: s_mov_b32 s1, 0x40a00000 184; VI-NEXT: s_mov_b32 s7, 0x1100f000 185; VI-NEXT: s_mov_b32 s6, -1 186; VI-NEXT: v_mov_b32_e32 v0, s0 187; VI-NEXT: v_mov_b32_e32 v1, s1 188; VI-NEXT: v_mov_b32_e32 v2, s2 189; VI-NEXT: v_mov_b32_e32 v3, s3 190; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 191; VI-NEXT: s_endpgm 192 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 193 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 194 ret void 195} 196 197define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 198; SI-LABEL: insertelement_v4f32_2: 199; SI: ; %bb.0: 200; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 201; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 202; SI-NEXT: s_waitcnt lgkmcnt(0) 203; SI-NEXT: s_mov_b32 s2, 0x40a00000 204; SI-NEXT: s_mov_b32 s7, 0x100f000 205; SI-NEXT: s_mov_b32 s6, -1 206; SI-NEXT: v_mov_b32_e32 v0, s0 207; SI-NEXT: v_mov_b32_e32 v1, s1 208; SI-NEXT: v_mov_b32_e32 v2, s2 209; SI-NEXT: v_mov_b32_e32 v3, s3 210; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 211; SI-NEXT: s_endpgm 212; 213; VI-LABEL: insertelement_v4f32_2: 214; VI: ; %bb.0: 215; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 216; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 217; VI-NEXT: s_waitcnt lgkmcnt(0) 218; VI-NEXT: s_mov_b32 s2, 0x40a00000 219; VI-NEXT: s_mov_b32 s7, 0x1100f000 220; VI-NEXT: s_mov_b32 s6, -1 221; VI-NEXT: v_mov_b32_e32 v0, s0 222; VI-NEXT: v_mov_b32_e32 v1, s1 223; VI-NEXT: v_mov_b32_e32 v2, s2 224; VI-NEXT: v_mov_b32_e32 v3, s3 225; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 226; VI-NEXT: s_endpgm 227 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 228 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 229 ret void 230} 231 232define amdgpu_kernel void 
@insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { 233; SI-LABEL: insertelement_v4f32_3: 234; SI: ; %bb.0: 235; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 236; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 237; SI-NEXT: s_waitcnt lgkmcnt(0) 238; SI-NEXT: s_mov_b32 s3, 0x40a00000 239; SI-NEXT: s_mov_b32 s7, 0x100f000 240; SI-NEXT: s_mov_b32 s6, -1 241; SI-NEXT: v_mov_b32_e32 v0, s0 242; SI-NEXT: v_mov_b32_e32 v1, s1 243; SI-NEXT: v_mov_b32_e32 v2, s2 244; SI-NEXT: v_mov_b32_e32 v3, s3 245; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 246; SI-NEXT: s_endpgm 247; 248; VI-LABEL: insertelement_v4f32_3: 249; VI: ; %bb.0: 250; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 251; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 252; VI-NEXT: s_waitcnt lgkmcnt(0) 253; VI-NEXT: s_mov_b32 s3, 0x40a00000 254; VI-NEXT: s_mov_b32 s7, 0x1100f000 255; VI-NEXT: s_mov_b32 s6, -1 256; VI-NEXT: v_mov_b32_e32 v0, s0 257; VI-NEXT: v_mov_b32_e32 v1, s1 258; VI-NEXT: v_mov_b32_e32 v2, s2 259; VI-NEXT: v_mov_b32_e32 v3, s3 260; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 261; VI-NEXT: s_endpgm 262 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 263 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 264 ret void 265} 266 267define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { 268; SI-LABEL: insertelement_v4i32_0: 269; SI: ; %bb.0: 270; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 271; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 272; SI-NEXT: s_waitcnt lgkmcnt(0) 273; SI-NEXT: s_movk_i32 s0, 0x3e7 274; SI-NEXT: s_mov_b32 s7, 0x100f000 275; SI-NEXT: s_mov_b32 s6, -1 276; SI-NEXT: v_mov_b32_e32 v0, s0 277; SI-NEXT: v_mov_b32_e32 v1, s1 278; SI-NEXT: v_mov_b32_e32 v2, s2 279; SI-NEXT: v_mov_b32_e32 v3, s3 280; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 281; SI-NEXT: s_endpgm 282; 283; VI-LABEL: insertelement_v4i32_0: 284; VI: ; %bb.0: 285; VI-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x10 286; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 287; VI-NEXT: s_waitcnt lgkmcnt(0) 288; VI-NEXT: s_movk_i32 s0, 0x3e7 289; VI-NEXT: s_mov_b32 s7, 0x1100f000 290; VI-NEXT: s_mov_b32 s6, -1 291; VI-NEXT: v_mov_b32_e32 v0, s0 292; VI-NEXT: v_mov_b32_e32 v1, s1 293; VI-NEXT: v_mov_b32_e32 v2, s2 294; VI-NEXT: v_mov_b32_e32 v3, s3 295; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 296; VI-NEXT: s_endpgm 297 %vecins = insertelement <4 x i32> %a, i32 999, i32 0 298 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 299 ret void 300} 301 302define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { 303; SI-LABEL: insertelement_v3f32_1: 304; SI: ; %bb.0: 305; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 306; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 307; SI-NEXT: s_mov_b32 s7, 0x100f000 308; SI-NEXT: s_mov_b32 s6, -1 309; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000 310; SI-NEXT: s_waitcnt lgkmcnt(0) 311; SI-NEXT: v_mov_b32_e32 v0, s0 312; SI-NEXT: v_mov_b32_e32 v2, s2 313; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 314; SI-NEXT: s_endpgm 315; 316; VI-LABEL: insertelement_v3f32_1: 317; VI: ; %bb.0: 318; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 319; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 320; VI-NEXT: s_mov_b32 s7, 0x1100f000 321; VI-NEXT: s_mov_b32 s6, -1 322; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000 323; VI-NEXT: s_waitcnt lgkmcnt(0) 324; VI-NEXT: v_mov_b32_e32 v0, s0 325; VI-NEXT: v_mov_b32_e32 v2, s2 326; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 327; VI-NEXT: s_endpgm 328 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1 329 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 330 ret void 331} 332 333define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { 334; SI-LABEL: insertelement_v3f32_2: 335; SI: ; %bb.0: 336; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 337; SI-NEXT: 
s_load_dwordx2 s[4:5], s[4:5], 0x0 338; SI-NEXT: s_mov_b32 s7, 0x100f000 339; SI-NEXT: s_mov_b32 s6, -1 340; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000 341; SI-NEXT: s_waitcnt lgkmcnt(0) 342; SI-NEXT: v_mov_b32_e32 v0, s0 343; SI-NEXT: v_mov_b32_e32 v1, s1 344; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 345; SI-NEXT: s_endpgm 346; 347; VI-LABEL: insertelement_v3f32_2: 348; VI: ; %bb.0: 349; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 350; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 351; VI-NEXT: s_mov_b32 s7, 0x1100f000 352; VI-NEXT: s_mov_b32 s6, -1 353; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000 354; VI-NEXT: s_waitcnt lgkmcnt(0) 355; VI-NEXT: v_mov_b32_e32 v0, s0 356; VI-NEXT: v_mov_b32_e32 v1, s1 357; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 358; VI-NEXT: s_endpgm 359 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2 360 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 361 ret void 362} 363 364define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { 365; GCN-LABEL: insertelement_v3f32_3: 366; GCN: ; %bb.0: 367; GCN-NEXT: s_endpgm 368 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3 369 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 370 ret void 371} 372 373define <4 x float> @insertelement_to_sgpr() nounwind { 374; GCN-LABEL: insertelement_to_sgpr: 375; GCN: ; %bb.0: 376; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 377; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 378; GCN-NEXT: s_waitcnt lgkmcnt(0) 379; GCN-NEXT: s_mov_b32 s4, 0 380; GCN-NEXT: image_gather4_lz v[0:3], v[0:1], s[4:11], s[4:7] dmask:0x1 381; GCN-NEXT: s_waitcnt vmcnt(0) 382; GCN-NEXT: s_setpc_b64 s[30:31] 383 %tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef 384 %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 385 %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 
0, i32 0, i32 0) 386 ret <4 x float> %tmp2 387} 388 389define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { 390; SI-LABEL: dynamic_insertelement_v2f32: 391; SI: ; %bb.0: 392; SI-NEXT: s_load_dword s6, s[4:5], 0x4 393; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 394; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 395; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 396; SI-NEXT: s_mov_b32 s3, 0x100f000 397; SI-NEXT: s_waitcnt lgkmcnt(0) 398; SI-NEXT: s_cmp_lg_u32 s6, 1 399; SI-NEXT: s_cselect_b64 vcc, -1, 0 400; SI-NEXT: v_mov_b32_e32 v1, s5 401; SI-NEXT: s_cmp_lg_u32 s6, 0 402; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 403; SI-NEXT: v_mov_b32_e32 v2, s4 404; SI-NEXT: s_cselect_b64 vcc, -1, 0 405; SI-NEXT: s_mov_b32 s2, -1 406; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 407; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 408; SI-NEXT: s_endpgm 409; 410; VI-LABEL: dynamic_insertelement_v2f32: 411; VI: ; %bb.0: 412; VI-NEXT: s_load_dword s6, s[4:5], 0x10 413; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 414; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 415; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 416; VI-NEXT: s_mov_b32 s3, 0x1100f000 417; VI-NEXT: s_waitcnt lgkmcnt(0) 418; VI-NEXT: s_cmp_lg_u32 s6, 1 419; VI-NEXT: s_cselect_b64 vcc, -1, 0 420; VI-NEXT: v_mov_b32_e32 v1, s5 421; VI-NEXT: s_cmp_lg_u32 s6, 0 422; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 423; VI-NEXT: v_mov_b32_e32 v2, s4 424; VI-NEXT: s_cselect_b64 vcc, -1, 0 425; VI-NEXT: s_mov_b32 s2, -1 426; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 427; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 428; VI-NEXT: s_endpgm 429 %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b 430 store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 431 ret void 432} 433 434define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { 435; SI-LABEL: dynamic_insertelement_v3f32: 
436; SI: ; %bb.0: 437; SI-NEXT: s_load_dword s8, s[4:5], 0x8 438; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 439; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 440; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 441; SI-NEXT: s_mov_b32 s3, 0x100f000 442; SI-NEXT: s_waitcnt lgkmcnt(0) 443; SI-NEXT: s_cmp_lg_u32 s8, 2 444; SI-NEXT: s_cselect_b64 vcc, -1, 0 445; SI-NEXT: v_mov_b32_e32 v1, s6 446; SI-NEXT: s_cmp_lg_u32 s8, 1 447; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 448; SI-NEXT: v_mov_b32_e32 v1, s5 449; SI-NEXT: s_cselect_b64 vcc, -1, 0 450; SI-NEXT: s_cmp_lg_u32 s8, 0 451; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 452; SI-NEXT: v_mov_b32_e32 v3, s4 453; SI-NEXT: s_cselect_b64 vcc, -1, 0 454; SI-NEXT: s_mov_b32 s2, -1 455; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 456; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 457; SI-NEXT: s_endpgm 458; 459; VI-LABEL: dynamic_insertelement_v3f32: 460; VI: ; %bb.0: 461; VI-NEXT: s_load_dword s8, s[4:5], 0x20 462; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 463; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 464; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 465; VI-NEXT: s_mov_b32 s3, 0x1100f000 466; VI-NEXT: s_waitcnt lgkmcnt(0) 467; VI-NEXT: s_cmp_lg_u32 s8, 2 468; VI-NEXT: s_cselect_b64 vcc, -1, 0 469; VI-NEXT: v_mov_b32_e32 v1, s6 470; VI-NEXT: s_cmp_lg_u32 s8, 1 471; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 472; VI-NEXT: v_mov_b32_e32 v1, s5 473; VI-NEXT: s_cselect_b64 vcc, -1, 0 474; VI-NEXT: s_cmp_lg_u32 s8, 0 475; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 476; VI-NEXT: v_mov_b32_e32 v3, s4 477; VI-NEXT: s_cselect_b64 vcc, -1, 0 478; VI-NEXT: s_mov_b32 s2, -1 479; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 480; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 481; VI-NEXT: s_endpgm 482 %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b 483 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 484 ret void 485} 486 487define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> 
addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { 488; SI-LABEL: dynamic_insertelement_v4f32: 489; SI: ; %bb.0: 490; SI-NEXT: s_load_dword s8, s[4:5], 0x8 491; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 492; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4 493; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000 494; SI-NEXT: s_mov_b32 s3, 0x100f000 495; SI-NEXT: s_waitcnt lgkmcnt(0) 496; SI-NEXT: s_cmp_lg_u32 s8, 3 497; SI-NEXT: s_cselect_b64 vcc, -1, 0 498; SI-NEXT: v_mov_b32_e32 v1, s7 499; SI-NEXT: s_cmp_lg_u32 s8, 2 500; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 501; SI-NEXT: v_mov_b32_e32 v1, s6 502; SI-NEXT: s_cselect_b64 vcc, -1, 0 503; SI-NEXT: s_cmp_lg_u32 s8, 1 504; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 505; SI-NEXT: v_mov_b32_e32 v1, s5 506; SI-NEXT: s_cselect_b64 vcc, -1, 0 507; SI-NEXT: s_cmp_lg_u32 s8, 0 508; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 509; SI-NEXT: v_mov_b32_e32 v4, s4 510; SI-NEXT: s_cselect_b64 vcc, -1, 0 511; SI-NEXT: s_mov_b32 s2, -1 512; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 513; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 514; SI-NEXT: s_endpgm 515; 516; VI-LABEL: dynamic_insertelement_v4f32: 517; VI: ; %bb.0: 518; VI-NEXT: s_load_dword s8, s[4:5], 0x20 519; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 520; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 521; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000 522; VI-NEXT: s_mov_b32 s3, 0x1100f000 523; VI-NEXT: s_waitcnt lgkmcnt(0) 524; VI-NEXT: s_cmp_lg_u32 s8, 3 525; VI-NEXT: s_cselect_b64 vcc, -1, 0 526; VI-NEXT: v_mov_b32_e32 v1, s7 527; VI-NEXT: s_cmp_lg_u32 s8, 2 528; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 529; VI-NEXT: v_mov_b32_e32 v1, s6 530; VI-NEXT: s_cselect_b64 vcc, -1, 0 531; VI-NEXT: s_cmp_lg_u32 s8, 1 532; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 533; VI-NEXT: v_mov_b32_e32 v1, s5 534; VI-NEXT: s_cselect_b64 vcc, -1, 0 535; VI-NEXT: s_cmp_lg_u32 s8, 0 536; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 537; VI-NEXT: v_mov_b32_e32 v4, s4 538; VI-NEXT: s_cselect_b64 vcc, 
-1, 0 539; VI-NEXT: s_mov_b32 s2, -1 540; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 541; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 542; VI-NEXT: s_endpgm 543 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b 544 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 545 ret void 546} 547 548define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { 549; SI-LABEL: dynamic_insertelement_v8f32: 550; SI: ; %bb.0: 551; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 552; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 553; SI-NEXT: s_load_dword s4, s[4:5], 0x10 554; SI-NEXT: v_mov_b32_e32 v8, 0x40a00000 555; SI-NEXT: s_mov_b32 s3, 0x100f000 556; SI-NEXT: s_mov_b32 s2, -1 557; SI-NEXT: s_waitcnt lgkmcnt(0) 558; SI-NEXT: v_mov_b32_e32 v0, s8 559; SI-NEXT: v_mov_b32_e32 v1, s9 560; SI-NEXT: v_mov_b32_e32 v2, s10 561; SI-NEXT: v_mov_b32_e32 v3, s11 562; SI-NEXT: v_mov_b32_e32 v4, s12 563; SI-NEXT: v_mov_b32_e32 v5, s13 564; SI-NEXT: v_mov_b32_e32 v6, s14 565; SI-NEXT: v_mov_b32_e32 v7, s15 566; SI-NEXT: s_mov_b32 m0, s4 567; SI-NEXT: v_movreld_b32_e32 v0, v8 568; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 569; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 570; SI-NEXT: s_endpgm 571; 572; VI-LABEL: dynamic_insertelement_v8f32: 573; VI: ; %bb.0: 574; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 575; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 576; VI-NEXT: s_load_dword s4, s[4:5], 0x40 577; VI-NEXT: v_mov_b32_e32 v8, 0x40a00000 578; VI-NEXT: s_mov_b32 s3, 0x1100f000 579; VI-NEXT: s_mov_b32 s2, -1 580; VI-NEXT: s_waitcnt lgkmcnt(0) 581; VI-NEXT: v_mov_b32_e32 v0, s8 582; VI-NEXT: v_mov_b32_e32 v1, s9 583; VI-NEXT: v_mov_b32_e32 v2, s10 584; VI-NEXT: v_mov_b32_e32 v3, s11 585; VI-NEXT: v_mov_b32_e32 v4, s12 586; VI-NEXT: v_mov_b32_e32 v5, s13 587; VI-NEXT: v_mov_b32_e32 v6, s14 588; VI-NEXT: v_mov_b32_e32 v7, s15 589; VI-NEXT: s_mov_b32 m0, s4 590; 
VI-NEXT: v_movreld_b32_e32 v0, v8 591; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 592; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 593; VI-NEXT: s_endpgm 594 %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b 595 store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 596 ret void 597} 598 599define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { 600; SI-LABEL: dynamic_insertelement_v16f32: 601; SI: ; %bb.0: 602; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 603; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 604; SI-NEXT: s_load_dword s4, s[4:5], 0x20 605; SI-NEXT: v_mov_b32_e32 v16, 0x40a00000 606; SI-NEXT: s_mov_b32 s3, 0x100f000 607; SI-NEXT: s_mov_b32 s2, -1 608; SI-NEXT: s_waitcnt lgkmcnt(0) 609; SI-NEXT: v_mov_b32_e32 v0, s8 610; SI-NEXT: v_mov_b32_e32 v1, s9 611; SI-NEXT: v_mov_b32_e32 v2, s10 612; SI-NEXT: v_mov_b32_e32 v3, s11 613; SI-NEXT: v_mov_b32_e32 v4, s12 614; SI-NEXT: v_mov_b32_e32 v5, s13 615; SI-NEXT: v_mov_b32_e32 v6, s14 616; SI-NEXT: v_mov_b32_e32 v7, s15 617; SI-NEXT: v_mov_b32_e32 v8, s16 618; SI-NEXT: v_mov_b32_e32 v9, s17 619; SI-NEXT: v_mov_b32_e32 v10, s18 620; SI-NEXT: v_mov_b32_e32 v11, s19 621; SI-NEXT: v_mov_b32_e32 v12, s20 622; SI-NEXT: v_mov_b32_e32 v13, s21 623; SI-NEXT: v_mov_b32_e32 v14, s22 624; SI-NEXT: v_mov_b32_e32 v15, s23 625; SI-NEXT: s_mov_b32 m0, s4 626; SI-NEXT: v_movreld_b32_e32 v0, v16 627; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 628; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 629; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 630; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 631; SI-NEXT: s_endpgm 632; 633; VI-LABEL: dynamic_insertelement_v16f32: 634; VI: ; %bb.0: 635; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 636; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 637; VI-NEXT: s_load_dword s4, s[4:5], 0x80 638; VI-NEXT: 
v_mov_b32_e32 v16, 0x40a00000 639; VI-NEXT: s_mov_b32 s3, 0x1100f000 640; VI-NEXT: s_mov_b32 s2, -1 641; VI-NEXT: s_waitcnt lgkmcnt(0) 642; VI-NEXT: v_mov_b32_e32 v0, s8 643; VI-NEXT: v_mov_b32_e32 v1, s9 644; VI-NEXT: v_mov_b32_e32 v2, s10 645; VI-NEXT: v_mov_b32_e32 v3, s11 646; VI-NEXT: v_mov_b32_e32 v4, s12 647; VI-NEXT: v_mov_b32_e32 v5, s13 648; VI-NEXT: v_mov_b32_e32 v6, s14 649; VI-NEXT: v_mov_b32_e32 v7, s15 650; VI-NEXT: v_mov_b32_e32 v8, s16 651; VI-NEXT: v_mov_b32_e32 v9, s17 652; VI-NEXT: v_mov_b32_e32 v10, s18 653; VI-NEXT: v_mov_b32_e32 v11, s19 654; VI-NEXT: v_mov_b32_e32 v12, s20 655; VI-NEXT: v_mov_b32_e32 v13, s21 656; VI-NEXT: v_mov_b32_e32 v14, s22 657; VI-NEXT: v_mov_b32_e32 v15, s23 658; VI-NEXT: s_mov_b32 m0, s4 659; VI-NEXT: v_movreld_b32_e32 v0, v16 660; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 661; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 662; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 663; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 664; VI-NEXT: s_endpgm 665 %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b 666 store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 667 ret void 668} 669 670define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { 671; SI-LABEL: dynamic_insertelement_v2i32: 672; SI: ; %bb.0: 673; SI-NEXT: s_load_dword s8, s[4:5], 0x4 674; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2 675; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 676; SI-NEXT: s_mov_b32 s3, 0x100f000 677; SI-NEXT: s_mov_b32 s2, -1 678; SI-NEXT: s_waitcnt lgkmcnt(0) 679; SI-NEXT: s_cmp_lg_u32 s8, 1 680; SI-NEXT: v_mov_b32_e32 v0, s7 681; SI-NEXT: s_cselect_b64 vcc, -1, 0 682; SI-NEXT: s_cmp_lg_u32 s8, 0 683; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 684; SI-NEXT: v_mov_b32_e32 v0, s6 685; SI-NEXT: s_cselect_b64 vcc, -1, 0 686; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 687; SI-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 688; SI-NEXT: s_endpgm 689; 690; VI-LABEL: dynamic_insertelement_v2i32: 691; VI: ; %bb.0: 692; VI-NEXT: s_load_dword s8, s[4:5], 0x10 693; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 694; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 695; VI-NEXT: s_mov_b32 s3, 0x1100f000 696; VI-NEXT: s_mov_b32 s2, -1 697; VI-NEXT: s_waitcnt lgkmcnt(0) 698; VI-NEXT: s_cmp_lg_u32 s8, 1 699; VI-NEXT: s_cselect_b32 s4, s7, 5 700; VI-NEXT: s_cmp_lg_u32 s8, 0 701; VI-NEXT: s_cselect_b32 s5, s6, 5 702; VI-NEXT: v_mov_b32_e32 v0, s5 703; VI-NEXT: v_mov_b32_e32 v1, s4 704; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 705; VI-NEXT: s_endpgm 706 %vecins = insertelement <2 x i32> %a, i32 5, i32 %b 707 store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 708 ret void 709} 710 711define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { 712; SI-LABEL: dynamic_insertelement_v3i32: 713; SI: ; %bb.0: 714; SI-NEXT: s_load_dword s8, s[4:5], 0x8 715; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 716; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 717; SI-NEXT: s_mov_b32 s7, 0x100f000 718; SI-NEXT: s_mov_b32 s6, -1 719; SI-NEXT: s_waitcnt lgkmcnt(0) 720; SI-NEXT: s_cmp_lg_u32 s8, 2 721; SI-NEXT: v_mov_b32_e32 v0, s2 722; SI-NEXT: s_cselect_b64 vcc, -1, 0 723; SI-NEXT: s_cmp_lg_u32 s8, 1 724; SI-NEXT: v_cndmask_b32_e32 v2, 5, v0, vcc 725; SI-NEXT: v_mov_b32_e32 v0, s1 726; SI-NEXT: s_cselect_b64 vcc, -1, 0 727; SI-NEXT: s_cmp_lg_u32 s8, 0 728; SI-NEXT: v_cndmask_b32_e32 v1, 5, v0, vcc 729; SI-NEXT: v_mov_b32_e32 v0, s0 730; SI-NEXT: s_cselect_b64 vcc, -1, 0 731; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 732; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 733; SI-NEXT: s_endpgm 734; 735; VI-LABEL: dynamic_insertelement_v3i32: 736; VI: ; %bb.0: 737; VI-NEXT: s_load_dword s8, s[4:5], 0x20 738; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 739; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 
740; VI-NEXT: s_mov_b32 s7, 0x1100f000 741; VI-NEXT: s_mov_b32 s6, -1 742; VI-NEXT: s_waitcnt lgkmcnt(0) 743; VI-NEXT: s_cmp_lg_u32 s8, 2 744; VI-NEXT: s_cselect_b32 s2, s2, 5 745; VI-NEXT: s_cmp_lg_u32 s8, 1 746; VI-NEXT: s_cselect_b32 s1, s1, 5 747; VI-NEXT: s_cmp_lg_u32 s8, 0 748; VI-NEXT: s_cselect_b32 s0, s0, 5 749; VI-NEXT: v_mov_b32_e32 v0, s0 750; VI-NEXT: v_mov_b32_e32 v1, s1 751; VI-NEXT: v_mov_b32_e32 v2, s2 752; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 753; VI-NEXT: s_endpgm 754 %vecins = insertelement <3 x i32> %a, i32 5, i32 %b 755 store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 756 ret void 757} 758 759define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind { 760; SI-LABEL: dynamic_insertelement_v4i32: 761; SI: ; %bb.0: 762; SI-NEXT: s_load_dword s6, s[4:5], 0x8 763; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 764; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 765; SI-NEXT: s_load_dword s4, s[4:5], 0x11 766; SI-NEXT: s_mov_b32 s3, 0x100f000 767; SI-NEXT: s_waitcnt lgkmcnt(0) 768; SI-NEXT: s_cmp_eq_u32 s6, 3 769; SI-NEXT: s_cselect_b64 vcc, -1, 0 770; SI-NEXT: v_mov_b32_e32 v0, s11 771; SI-NEXT: v_mov_b32_e32 v4, s4 772; SI-NEXT: s_cmp_eq_u32 s6, 2 773; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc 774; SI-NEXT: v_mov_b32_e32 v0, s10 775; SI-NEXT: s_cselect_b64 vcc, -1, 0 776; SI-NEXT: s_cmp_eq_u32 s6, 1 777; SI-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc 778; SI-NEXT: v_mov_b32_e32 v0, s9 779; SI-NEXT: s_cselect_b64 vcc, -1, 0 780; SI-NEXT: s_cmp_eq_u32 s6, 0 781; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc 782; SI-NEXT: v_mov_b32_e32 v0, s8 783; SI-NEXT: s_cselect_b64 vcc, -1, 0 784; SI-NEXT: s_mov_b32 s2, -1 785; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 786; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 787; SI-NEXT: s_endpgm 788; 789; VI-LABEL: dynamic_insertelement_v4i32: 790; VI: ; %bb.0: 791; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 
792; VI-NEXT: s_load_dword s8, s[4:5], 0x20 793; VI-NEXT: s_load_dword s9, s[4:5], 0x44 794; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 795; VI-NEXT: s_mov_b32 s7, 0x1100f000 796; VI-NEXT: s_mov_b32 s6, -1 797; VI-NEXT: s_waitcnt lgkmcnt(0) 798; VI-NEXT: s_cmp_eq_u32 s8, 3 799; VI-NEXT: s_cselect_b32 s3, s9, s3 800; VI-NEXT: s_cmp_eq_u32 s8, 2 801; VI-NEXT: s_cselect_b32 s2, s9, s2 802; VI-NEXT: s_cmp_eq_u32 s8, 1 803; VI-NEXT: s_cselect_b32 s1, s9, s1 804; VI-NEXT: s_cmp_eq_u32 s8, 0 805; VI-NEXT: s_cselect_b32 s0, s9, s0 806; VI-NEXT: v_mov_b32_e32 v0, s0 807; VI-NEXT: v_mov_b32_e32 v1, s1 808; VI-NEXT: v_mov_b32_e32 v2, s2 809; VI-NEXT: v_mov_b32_e32 v3, s3 810; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 811; VI-NEXT: s_endpgm 812 %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b 813 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 814 ret void 815} 816 817define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { 818; SI-LABEL: dynamic_insertelement_v8i32: 819; SI: ; %bb.0: 820; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 821; SI-NEXT: s_load_dword s6, s[4:5], 0x10 822; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 823; SI-NEXT: s_mov_b32 s3, 0x100f000 824; SI-NEXT: s_mov_b32 s2, -1 825; SI-NEXT: s_waitcnt lgkmcnt(0) 826; SI-NEXT: v_mov_b32_e32 v0, s8 827; SI-NEXT: v_mov_b32_e32 v1, s9 828; SI-NEXT: v_mov_b32_e32 v2, s10 829; SI-NEXT: v_mov_b32_e32 v3, s11 830; SI-NEXT: v_mov_b32_e32 v4, s12 831; SI-NEXT: v_mov_b32_e32 v5, s13 832; SI-NEXT: v_mov_b32_e32 v6, s14 833; SI-NEXT: v_mov_b32_e32 v7, s15 834; SI-NEXT: s_mov_b32 m0, s6 835; SI-NEXT: v_movreld_b32_e32 v0, 5 836; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 837; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 838; SI-NEXT: s_endpgm 839; 840; VI-LABEL: dynamic_insertelement_v8i32: 841; VI: ; %bb.0: 842; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 843; VI-NEXT: s_load_dword s6, s[4:5], 0x40 
844; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 845; VI-NEXT: s_mov_b32 s3, 0x1100f000 846; VI-NEXT: s_mov_b32 s2, -1 847; VI-NEXT: s_waitcnt lgkmcnt(0) 848; VI-NEXT: v_mov_b32_e32 v0, s8 849; VI-NEXT: v_mov_b32_e32 v1, s9 850; VI-NEXT: v_mov_b32_e32 v2, s10 851; VI-NEXT: v_mov_b32_e32 v3, s11 852; VI-NEXT: v_mov_b32_e32 v4, s12 853; VI-NEXT: v_mov_b32_e32 v5, s13 854; VI-NEXT: v_mov_b32_e32 v6, s14 855; VI-NEXT: v_mov_b32_e32 v7, s15 856; VI-NEXT: s_mov_b32 m0, s6 857; VI-NEXT: v_movreld_b32_e32 v0, 5 858; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 859; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 860; VI-NEXT: s_endpgm 861 %vecins = insertelement <8 x i32> %a, i32 5, i32 %b 862 store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 863 ret void 864} 865 866define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { 867; SI-LABEL: dynamic_insertelement_v16i32: 868; SI: ; %bb.0: 869; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 870; SI-NEXT: s_load_dword s6, s[4:5], 0x20 871; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 872; SI-NEXT: s_mov_b32 s3, 0x100f000 873; SI-NEXT: s_mov_b32 s2, -1 874; SI-NEXT: s_waitcnt lgkmcnt(0) 875; SI-NEXT: v_mov_b32_e32 v0, s8 876; SI-NEXT: v_mov_b32_e32 v1, s9 877; SI-NEXT: v_mov_b32_e32 v2, s10 878; SI-NEXT: v_mov_b32_e32 v3, s11 879; SI-NEXT: v_mov_b32_e32 v4, s12 880; SI-NEXT: v_mov_b32_e32 v5, s13 881; SI-NEXT: v_mov_b32_e32 v6, s14 882; SI-NEXT: v_mov_b32_e32 v7, s15 883; SI-NEXT: v_mov_b32_e32 v8, s16 884; SI-NEXT: v_mov_b32_e32 v9, s17 885; SI-NEXT: v_mov_b32_e32 v10, s18 886; SI-NEXT: v_mov_b32_e32 v11, s19 887; SI-NEXT: v_mov_b32_e32 v12, s20 888; SI-NEXT: v_mov_b32_e32 v13, s21 889; SI-NEXT: v_mov_b32_e32 v14, s22 890; SI-NEXT: v_mov_b32_e32 v15, s23 891; SI-NEXT: s_mov_b32 m0, s6 892; SI-NEXT: v_movreld_b32_e32 v0, 5 893; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 894; SI-NEXT: buffer_store_dwordx4 
v[8:11], off, s[0:3], 0 offset:32 895; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 896; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 897; SI-NEXT: s_endpgm 898; 899; VI-LABEL: dynamic_insertelement_v16i32: 900; VI: ; %bb.0: 901; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 902; VI-NEXT: s_load_dword s6, s[4:5], 0x80 903; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 904; VI-NEXT: s_mov_b32 s3, 0x1100f000 905; VI-NEXT: s_mov_b32 s2, -1 906; VI-NEXT: s_waitcnt lgkmcnt(0) 907; VI-NEXT: v_mov_b32_e32 v0, s8 908; VI-NEXT: v_mov_b32_e32 v1, s9 909; VI-NEXT: v_mov_b32_e32 v2, s10 910; VI-NEXT: v_mov_b32_e32 v3, s11 911; VI-NEXT: v_mov_b32_e32 v4, s12 912; VI-NEXT: v_mov_b32_e32 v5, s13 913; VI-NEXT: v_mov_b32_e32 v6, s14 914; VI-NEXT: v_mov_b32_e32 v7, s15 915; VI-NEXT: v_mov_b32_e32 v8, s16 916; VI-NEXT: v_mov_b32_e32 v9, s17 917; VI-NEXT: v_mov_b32_e32 v10, s18 918; VI-NEXT: v_mov_b32_e32 v11, s19 919; VI-NEXT: v_mov_b32_e32 v12, s20 920; VI-NEXT: v_mov_b32_e32 v13, s21 921; VI-NEXT: v_mov_b32_e32 v14, s22 922; VI-NEXT: v_mov_b32_e32 v15, s23 923; VI-NEXT: s_mov_b32 m0, s6 924; VI-NEXT: v_movreld_b32_e32 v0, 5 925; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 926; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 927; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 928; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 929; VI-NEXT: s_endpgm 930 %vecins = insertelement <16 x i32> %a, i32 5, i32 %b 931 store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 932 ret void 933} 934 935define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { 936; SI-LABEL: dynamic_insertelement_v2i16: 937; SI: ; %bb.0: 938; SI-NEXT: s_load_dword s6, s[4:5], 0x3 939; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 940; SI-NEXT: s_load_dword s4, s[4:5], 0x2 941; SI-NEXT: s_mov_b32 s3, 0x100f000 942; SI-NEXT: s_mov_b32 s2, -1 943; SI-NEXT: s_waitcnt 
lgkmcnt(0) 944; SI-NEXT: s_lshl_b32 s5, s6, 4 945; SI-NEXT: s_lshl_b32 s5, 0xffff, s5 946; SI-NEXT: s_andn2_b32 s4, s4, s5 947; SI-NEXT: s_and_b32 s5, s5, 0x50005 948; SI-NEXT: s_or_b32 s4, s5, s4 949; SI-NEXT: v_mov_b32_e32 v0, s4 950; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 951; SI-NEXT: s_endpgm 952; 953; VI-LABEL: dynamic_insertelement_v2i16: 954; VI: ; %bb.0: 955; VI-NEXT: s_load_dword s6, s[4:5], 0xc 956; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 957; VI-NEXT: s_load_dword s4, s[4:5], 0x8 958; VI-NEXT: s_mov_b32 s3, 0x1100f000 959; VI-NEXT: s_mov_b32 s2, -1 960; VI-NEXT: s_waitcnt lgkmcnt(0) 961; VI-NEXT: s_lshl_b32 s5, s6, 4 962; VI-NEXT: s_lshl_b32 s5, 0xffff, s5 963; VI-NEXT: s_andn2_b32 s4, s4, s5 964; VI-NEXT: s_and_b32 s5, s5, 0x50005 965; VI-NEXT: s_or_b32 s4, s5, s4 966; VI-NEXT: v_mov_b32_e32 v0, s4 967; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 968; VI-NEXT: s_endpgm 969 %vecins = insertelement <2 x i16> %a, i16 5, i32 %b 970 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 971 ret void 972} 973 974define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind { 975; SI-LABEL: dynamic_insertelement_v3i16: 976; SI: ; %bb.0: 977; SI-NEXT: s_load_dword s6, s[4:5], 0x4 978; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 979; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2 980; SI-NEXT: s_mov_b32 s3, 0x100f000 981; SI-NEXT: s_mov_b32 s2, -1 982; SI-NEXT: s_waitcnt lgkmcnt(0) 983; SI-NEXT: s_lshl_b32 s8, s6, 4 984; SI-NEXT: s_mov_b64 s[6:7], 0xffff 985; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 986; SI-NEXT: s_and_b32 s9, s7, 0x50005 987; SI-NEXT: s_and_b32 s8, s6, 0x50005 988; SI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] 989; SI-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] 990; SI-NEXT: v_mov_b32_e32 v0, s5 991; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 992; SI-NEXT: v_mov_b32_e32 v0, s4 993; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 994; SI-NEXT: s_endpgm 995; 996; 
VI-LABEL: dynamic_insertelement_v3i16: 997; VI: ; %bb.0: 998; VI-NEXT: s_load_dword s6, s[4:5], 0x10 999; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1000; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x8 1001; VI-NEXT: s_mov_b32 s3, 0x1100f000 1002; VI-NEXT: s_mov_b32 s2, -1 1003; VI-NEXT: s_waitcnt lgkmcnt(0) 1004; VI-NEXT: s_lshl_b32 s8, s6, 4 1005; VI-NEXT: s_mov_b64 s[6:7], 0xffff 1006; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], s8 1007; VI-NEXT: s_mov_b32 s8, 0x50005 1008; VI-NEXT: s_mov_b32 s9, s8 1009; VI-NEXT: s_andn2_b64 s[4:5], s[4:5], s[6:7] 1010; VI-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9] 1011; VI-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] 1012; VI-NEXT: v_mov_b32_e32 v0, s5 1013; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:4 1014; VI-NEXT: v_mov_b32_e32 v0, s4 1015; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1016; VI-NEXT: s_endpgm 1017 %vecins = insertelement <3 x i16> %a, i16 5, i32 %b 1018 store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8 1019 ret void 1020} 1021 1022define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { 1023; SI-LABEL: dynamic_insertelement_v2i8: 1024; SI: ; %bb.0: 1025; SI-NEXT: s_load_dword s6, s[4:5], 0x13 1026; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1027; SI-NEXT: s_load_dword s4, s[4:5], 0xa 1028; SI-NEXT: s_mov_b32 s3, 0x100f000 1029; SI-NEXT: s_mov_b32 s2, -1 1030; SI-NEXT: s_waitcnt lgkmcnt(0) 1031; SI-NEXT: s_lshl_b32 s5, s6, 3 1032; SI-NEXT: s_lshl_b32 s5, -1, s5 1033; SI-NEXT: s_andn2_b32 s4, s4, s5 1034; SI-NEXT: s_and_b32 s5, s5, 0x505 1035; SI-NEXT: s_or_b32 s4, s5, s4 1036; SI-NEXT: v_mov_b32_e32 v0, s4 1037; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1038; SI-NEXT: s_endpgm 1039; 1040; VI-LABEL: dynamic_insertelement_v2i8: 1041; VI: ; %bb.0: 1042; VI-NEXT: s_load_dword s6, s[4:5], 0x4c 1043; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1044; VI-NEXT: s_load_dword s4, s[4:5], 0x28 1045; VI-NEXT: s_mov_b32 s3, 
0x1100f000 1046; VI-NEXT: s_mov_b32 s2, -1 1047; VI-NEXT: s_waitcnt lgkmcnt(0) 1048; VI-NEXT: s_lshl_b32 s5, s6, 3 1049; VI-NEXT: v_lshlrev_b16_e64 v0, s5, -1 1050; VI-NEXT: v_not_b32_e32 v1, v0 1051; VI-NEXT: v_and_b32_e32 v1, s4, v1 1052; VI-NEXT: v_and_b32_e32 v0, 0x505, v0 1053; VI-NEXT: v_or_b32_e32 v0, v0, v1 1054; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1055; VI-NEXT: s_endpgm 1056 %vecins = insertelement <2 x i8> %a, i8 5, i32 %b 1057 store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 1058 ret void 1059} 1060 1061; FIXME: post legalize i16 and i32 shifts aren't merged because of 1062; isTypeDesirableForOp in SimplifyDemandedBits 1063define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { 1064; SI-LABEL: dynamic_insertelement_v3i8: 1065; SI: ; %bb.0: 1066; SI-NEXT: s_load_dword s6, s[4:5], 0x13 1067; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1068; SI-NEXT: s_load_dword s4, s[4:5], 0xa 1069; SI-NEXT: s_mov_b32 s3, 0x100f000 1070; SI-NEXT: s_mov_b32 s2, -1 1071; SI-NEXT: s_waitcnt lgkmcnt(0) 1072; SI-NEXT: s_lshl_b32 s5, s6, 3 1073; SI-NEXT: s_lshl_b32 s5, 0xffff, s5 1074; SI-NEXT: s_andn2_b32 s4, s4, s5 1075; SI-NEXT: s_and_b32 s5, s5, 0x5050505 1076; SI-NEXT: s_or_b32 s4, s5, s4 1077; SI-NEXT: s_lshr_b32 s5, s4, 16 1078; SI-NEXT: v_mov_b32_e32 v0, s4 1079; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 1080; SI-NEXT: v_mov_b32_e32 v0, s5 1081; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 1082; SI-NEXT: s_endpgm 1083; 1084; VI-LABEL: dynamic_insertelement_v3i8: 1085; VI: ; %bb.0: 1086; VI-NEXT: s_load_dword s6, s[4:5], 0x4c 1087; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1088; VI-NEXT: s_load_dword s4, s[4:5], 0x28 1089; VI-NEXT: s_mov_b32 s3, 0x1100f000 1090; VI-NEXT: s_mov_b32 s2, -1 1091; VI-NEXT: s_waitcnt lgkmcnt(0) 1092; VI-NEXT: s_lshl_b32 s5, s6, 3 1093; VI-NEXT: s_lshl_b32 s5, 0xffff, s5 1094; VI-NEXT: s_andn2_b32 s4, s4, s5 1095; VI-NEXT: 
s_and_b32 s5, s5, 0x5050505 1096; VI-NEXT: s_or_b32 s4, s5, s4 1097; VI-NEXT: s_lshr_b32 s5, s4, 16 1098; VI-NEXT: v_mov_b32_e32 v0, s4 1099; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 1100; VI-NEXT: v_mov_b32_e32 v0, s5 1101; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2 1102; VI-NEXT: s_endpgm 1103 %vecins = insertelement <3 x i8> %a, i8 5, i32 %b 1104 store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4 1105 ret void 1106} 1107 1108define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { 1109; SI-LABEL: dynamic_insertelement_v4i8: 1110; SI: ; %bb.0: 1111; SI-NEXT: s_load_dword s6, s[4:5], 0x13 1112; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1113; SI-NEXT: s_load_dword s4, s[4:5], 0xa 1114; SI-NEXT: s_mov_b32 s3, 0x100f000 1115; SI-NEXT: s_mov_b32 s2, -1 1116; SI-NEXT: s_waitcnt lgkmcnt(0) 1117; SI-NEXT: s_lshl_b32 s5, s6, 3 1118; SI-NEXT: s_lshl_b32 s5, 0xffff, s5 1119; SI-NEXT: s_andn2_b32 s4, s4, s5 1120; SI-NEXT: s_and_b32 s5, s5, 0x5050505 1121; SI-NEXT: s_or_b32 s4, s5, s4 1122; SI-NEXT: v_mov_b32_e32 v0, s4 1123; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1124; SI-NEXT: s_endpgm 1125; 1126; VI-LABEL: dynamic_insertelement_v4i8: 1127; VI: ; %bb.0: 1128; VI-NEXT: s_load_dword s6, s[4:5], 0x4c 1129; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1130; VI-NEXT: s_load_dword s4, s[4:5], 0x28 1131; VI-NEXT: s_mov_b32 s3, 0x1100f000 1132; VI-NEXT: s_mov_b32 s2, -1 1133; VI-NEXT: s_waitcnt lgkmcnt(0) 1134; VI-NEXT: s_lshl_b32 s5, s6, 3 1135; VI-NEXT: s_lshl_b32 s5, 0xffff, s5 1136; VI-NEXT: s_andn2_b32 s4, s4, s5 1137; VI-NEXT: s_and_b32 s5, s5, 0x5050505 1138; VI-NEXT: s_or_b32 s4, s5, s4 1139; VI-NEXT: v_mov_b32_e32 v0, s4 1140; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 1141; VI-NEXT: s_endpgm 1142 %vecins = insertelement <4 x i8> %a, i8 5, i32 %b 1143 store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4 1144 ret void 1145} 1146 1147define 
amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(4)* %a.ptr, i32 %b) nounwind { 1148; SI-LABEL: s_dynamic_insertelement_v8i8: 1149; SI: ; %bb.0: 1150; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1151; SI-NEXT: s_load_dword s8, s[4:5], 0x4 1152; SI-NEXT: s_mov_b32 s7, 0x100f000 1153; SI-NEXT: s_mov_b32 s6, -1 1154; SI-NEXT: s_waitcnt lgkmcnt(0) 1155; SI-NEXT: s_mov_b32 s4, s0 1156; SI-NEXT: s_mov_b32 s5, s1 1157; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 1158; SI-NEXT: s_lshl_b32 s8, s8, 3 1159; SI-NEXT: s_mov_b64 s[2:3], 0xffff 1160; SI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 1161; SI-NEXT: s_and_b32 s9, s3, 0x5050505 1162; SI-NEXT: s_and_b32 s8, s2, 0x5050505 1163; SI-NEXT: s_waitcnt lgkmcnt(0) 1164; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 1165; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] 1166; SI-NEXT: v_mov_b32_e32 v0, s0 1167; SI-NEXT: v_mov_b32_e32 v1, s1 1168; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1169; SI-NEXT: s_endpgm 1170; 1171; VI-LABEL: s_dynamic_insertelement_v8i8: 1172; VI: ; %bb.0: 1173; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1174; VI-NEXT: s_load_dword s8, s[4:5], 0x10 1175; VI-NEXT: s_mov_b32 s7, 0x1100f000 1176; VI-NEXT: s_mov_b32 s6, -1 1177; VI-NEXT: s_waitcnt lgkmcnt(0) 1178; VI-NEXT: s_mov_b32 s4, s0 1179; VI-NEXT: s_mov_b32 s5, s1 1180; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 1181; VI-NEXT: s_lshl_b32 s8, s8, 3 1182; VI-NEXT: s_mov_b64 s[2:3], 0xffff 1183; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 1184; VI-NEXT: s_and_b32 s9, s3, 0x5050505 1185; VI-NEXT: s_and_b32 s8, s2, 0x5050505 1186; VI-NEXT: s_waitcnt lgkmcnt(0) 1187; VI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] 1188; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] 1189; VI-NEXT: v_mov_b32_e32 v0, s0 1190; VI-NEXT: v_mov_b32_e32 v1, s1 1191; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1192; VI-NEXT: s_endpgm 1193 %a = load <8 x i8>, <8 x i8> addrspace(4)* %a.ptr, align 4 1194 %vecins = insertelement <8 x i8> %a, i8 5, 
i32 %b 1195 store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8 1196 ret void 1197} 1198 1199define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { 1200; SI-LABEL: dynamic_insertelement_v16i8: 1201; SI: ; %bb.0: 1202; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4 1203; SI-NEXT: s_load_dword s6, s[4:5], 0x8 1204; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1205; SI-NEXT: s_mov_b32 s3, 0x100f000 1206; SI-NEXT: s_mov_b32 s2, -1 1207; SI-NEXT: s_waitcnt lgkmcnt(0) 1208; SI-NEXT: s_lshr_b32 s4, s11, 24 1209; SI-NEXT: s_cmp_lg_u32 s6, 15 1210; SI-NEXT: v_mov_b32_e32 v0, s4 1211; SI-NEXT: s_cselect_b64 vcc, -1, 0 1212; SI-NEXT: s_lshr_b32 s4, s11, 16 1213; SI-NEXT: s_cmp_lg_u32 s6, 14 1214; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1215; SI-NEXT: v_mov_b32_e32 v1, s4 1216; SI-NEXT: s_cselect_b64 vcc, -1, 0 1217; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1218; SI-NEXT: s_lshr_b32 s4, s11, 8 1219; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1220; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 1221; SI-NEXT: s_cmp_lg_u32 s6, 13 1222; SI-NEXT: v_or_b32_e32 v0, v1, v0 1223; SI-NEXT: v_mov_b32_e32 v1, s4 1224; SI-NEXT: s_cselect_b64 vcc, -1, 0 1225; SI-NEXT: s_cmp_lg_u32 s6, 12 1226; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1227; SI-NEXT: v_mov_b32_e32 v2, s11 1228; SI-NEXT: s_cselect_b64 vcc, -1, 0 1229; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1230; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1231; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 1232; SI-NEXT: v_or_b32_e32 v1, v2, v1 1233; SI-NEXT: s_lshr_b32 s4, s10, 24 1234; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1235; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 1236; SI-NEXT: s_cmp_lg_u32 s6, 11 1237; SI-NEXT: v_or_b32_e32 v3, v1, v0 1238; SI-NEXT: v_mov_b32_e32 v0, s4 1239; SI-NEXT: s_cselect_b64 vcc, -1, 0 1240; SI-NEXT: s_lshr_b32 s4, s10, 16 1241; SI-NEXT: s_cmp_lg_u32 s6, 10 1242; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1243; SI-NEXT: v_mov_b32_e32 v1, s4 1244; SI-NEXT: s_cselect_b64 
vcc, -1, 0 1245; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1246; SI-NEXT: s_lshr_b32 s4, s10, 8 1247; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1248; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 1249; SI-NEXT: s_cmp_lg_u32 s6, 9 1250; SI-NEXT: v_or_b32_e32 v0, v1, v0 1251; SI-NEXT: v_mov_b32_e32 v1, s4 1252; SI-NEXT: s_cselect_b64 vcc, -1, 0 1253; SI-NEXT: s_cmp_lg_u32 s6, 8 1254; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1255; SI-NEXT: v_mov_b32_e32 v2, s10 1256; SI-NEXT: s_cselect_b64 vcc, -1, 0 1257; SI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1258; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1259; SI-NEXT: v_and_b32_e32 v2, 0xff, v2 1260; SI-NEXT: v_or_b32_e32 v1, v2, v1 1261; SI-NEXT: s_lshr_b32 s4, s9, 24 1262; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1263; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 1264; SI-NEXT: s_cmp_lg_u32 s6, 7 1265; SI-NEXT: v_or_b32_e32 v2, v1, v0 1266; SI-NEXT: v_mov_b32_e32 v0, s4 1267; SI-NEXT: s_cselect_b64 vcc, -1, 0 1268; SI-NEXT: s_lshr_b32 s4, s9, 16 1269; SI-NEXT: s_cmp_lg_u32 s6, 6 1270; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1271; SI-NEXT: v_mov_b32_e32 v1, s4 1272; SI-NEXT: s_cselect_b64 vcc, -1, 0 1273; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1274; SI-NEXT: s_lshr_b32 s4, s9, 8 1275; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1276; SI-NEXT: v_and_b32_e32 v1, 0xff, v1 1277; SI-NEXT: s_cmp_lg_u32 s6, 5 1278; SI-NEXT: v_or_b32_e32 v0, v1, v0 1279; SI-NEXT: v_mov_b32_e32 v1, s4 1280; SI-NEXT: s_cselect_b64 vcc, -1, 0 1281; SI-NEXT: s_cmp_lg_u32 s6, 4 1282; SI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1283; SI-NEXT: v_mov_b32_e32 v4, s9 1284; SI-NEXT: s_cselect_b64 vcc, -1, 0 1285; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1286; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v1 1287; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 1288; SI-NEXT: v_or_b32_e32 v1, v4, v1 1289; SI-NEXT: s_lshr_b32 s4, s8, 24 1290; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1291; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 1292; SI-NEXT: s_cmp_lg_u32 s6, 3 1293; SI-NEXT: v_or_b32_e32 v1, v1, v0 1294; SI-NEXT: 
v_mov_b32_e32 v0, s4 1295; SI-NEXT: s_cselect_b64 vcc, -1, 0 1296; SI-NEXT: s_lshr_b32 s4, s8, 16 1297; SI-NEXT: s_cmp_lg_u32 s6, 2 1298; SI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1299; SI-NEXT: v_mov_b32_e32 v4, s4 1300; SI-NEXT: s_cselect_b64 vcc, -1, 0 1301; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1302; SI-NEXT: s_lshr_b32 s4, s8, 8 1303; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 1304; SI-NEXT: v_and_b32_e32 v4, 0xff, v4 1305; SI-NEXT: s_cmp_lg_u32 s6, 1 1306; SI-NEXT: v_or_b32_e32 v0, v4, v0 1307; SI-NEXT: v_mov_b32_e32 v4, s4 1308; SI-NEXT: s_cselect_b64 vcc, -1, 0 1309; SI-NEXT: s_cmp_lg_u32 s6, 0 1310; SI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1311; SI-NEXT: v_mov_b32_e32 v5, s8 1312; SI-NEXT: s_cselect_b64 vcc, -1, 0 1313; SI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc 1314; SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4 1315; SI-NEXT: v_and_b32_e32 v5, 0xff, v5 1316; SI-NEXT: v_or_b32_e32 v4, v5, v4 1317; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1318; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4 1319; SI-NEXT: v_or_b32_e32 v0, v4, v0 1320; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1321; SI-NEXT: s_endpgm 1322; 1323; VI-LABEL: dynamic_insertelement_v16i8: 1324; VI: ; %bb.0: 1325; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10 1326; VI-NEXT: s_load_dword s6, s[4:5], 0x20 1327; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1328; VI-NEXT: s_mov_b32 s3, 0x1100f000 1329; VI-NEXT: s_mov_b32 s2, -1 1330; VI-NEXT: s_waitcnt lgkmcnt(0) 1331; VI-NEXT: s_lshr_b32 s4, s11, 24 1332; VI-NEXT: s_cmp_lg_u32 s6, 15 1333; VI-NEXT: v_mov_b32_e32 v0, s4 1334; VI-NEXT: s_cselect_b64 vcc, -1, 0 1335; VI-NEXT: s_lshr_b32 s4, s11, 16 1336; VI-NEXT: s_cmp_lg_u32 s6, 14 1337; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1338; VI-NEXT: v_mov_b32_e32 v1, s4 1339; VI-NEXT: s_cselect_b64 vcc, -1, 0 1340; VI-NEXT: s_lshr_b32 s4, s11, 8 1341; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1342; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1343; VI-NEXT: s_cmp_lg_u32 s6, 13 1344; VI-NEXT: v_or_b32_sdwa v0, v1, v0 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1345; VI-NEXT: v_mov_b32_e32 v1, s4 1346; VI-NEXT: s_cselect_b64 vcc, -1, 0 1347; VI-NEXT: s_cmp_lg_u32 s6, 12 1348; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1349; VI-NEXT: v_mov_b32_e32 v2, s11 1350; VI-NEXT: s_cselect_b64 vcc, -1, 0 1351; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1352; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1353; VI-NEXT: s_lshr_b32 s4, s10, 24 1354; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1355; VI-NEXT: s_cmp_lg_u32 s6, 11 1356; VI-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1357; VI-NEXT: v_mov_b32_e32 v0, s4 1358; VI-NEXT: s_cselect_b64 vcc, -1, 0 1359; VI-NEXT: s_lshr_b32 s4, s10, 16 1360; VI-NEXT: s_cmp_lg_u32 s6, 10 1361; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1362; VI-NEXT: v_mov_b32_e32 v1, s4 1363; VI-NEXT: s_cselect_b64 vcc, -1, 0 1364; VI-NEXT: s_lshr_b32 s4, s10, 8 1365; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1366; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1367; VI-NEXT: s_cmp_lg_u32 s6, 9 1368; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1369; VI-NEXT: v_mov_b32_e32 v1, s4 1370; VI-NEXT: s_cselect_b64 vcc, -1, 0 1371; VI-NEXT: s_cmp_lg_u32 s6, 8 1372; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1373; VI-NEXT: v_mov_b32_e32 v2, s10 1374; VI-NEXT: s_cselect_b64 vcc, -1, 0 1375; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1376; VI-NEXT: v_cndmask_b32_e32 v2, 5, v2, vcc 1377; VI-NEXT: s_lshr_b32 s4, s9, 24 1378; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1379; VI-NEXT: s_cmp_lg_u32 s6, 7 1380; VI-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1381; VI-NEXT: v_mov_b32_e32 v0, s4 1382; VI-NEXT: s_cselect_b64 vcc, -1, 0 1383; VI-NEXT: s_lshr_b32 s4, s9, 16 1384; VI-NEXT: s_cmp_lg_u32 s6, 6 1385; VI-NEXT: 
v_cndmask_b32_e32 v0, 5, v0, vcc 1386; VI-NEXT: v_mov_b32_e32 v1, s4 1387; VI-NEXT: s_cselect_b64 vcc, -1, 0 1388; VI-NEXT: s_lshr_b32 s4, s9, 8 1389; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1390; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1391; VI-NEXT: s_cmp_lg_u32 s6, 5 1392; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1393; VI-NEXT: v_mov_b32_e32 v1, s4 1394; VI-NEXT: s_cselect_b64 vcc, -1, 0 1395; VI-NEXT: s_cmp_lg_u32 s6, 4 1396; VI-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc 1397; VI-NEXT: v_mov_b32_e32 v4, s9 1398; VI-NEXT: s_cselect_b64 vcc, -1, 0 1399; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 1400; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1401; VI-NEXT: s_lshr_b32 s4, s8, 24 1402; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1403; VI-NEXT: s_cmp_lg_u32 s6, 3 1404; VI-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1405; VI-NEXT: v_mov_b32_e32 v0, s4 1406; VI-NEXT: s_cselect_b64 vcc, -1, 0 1407; VI-NEXT: s_lshr_b32 s4, s8, 16 1408; VI-NEXT: s_cmp_lg_u32 s6, 2 1409; VI-NEXT: v_cndmask_b32_e32 v0, 5, v0, vcc 1410; VI-NEXT: v_mov_b32_e32 v4, s4 1411; VI-NEXT: s_cselect_b64 vcc, -1, 0 1412; VI-NEXT: s_lshr_b32 s4, s8, 8 1413; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 1414; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1415; VI-NEXT: s_cmp_lg_u32 s6, 1 1416; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1417; VI-NEXT: v_mov_b32_e32 v4, s4 1418; VI-NEXT: s_cselect_b64 vcc, -1, 0 1419; VI-NEXT: s_cmp_lg_u32 s6, 0 1420; VI-NEXT: v_cndmask_b32_e32 v4, 5, v4, vcc 1421; VI-NEXT: v_mov_b32_e32 v5, s8 1422; VI-NEXT: s_cselect_b64 vcc, -1, 0 1423; VI-NEXT: v_lshlrev_b16_e32 v4, 8, v4 1424; VI-NEXT: v_cndmask_b32_e32 v5, 5, v5, vcc 1425; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 1426; VI-NEXT: v_or_b32_sdwa v0, v4, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD 1427; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1428; VI-NEXT: s_endpgm 1429 %vecins = insertelement <16 x i8> %a, i8 5, i32 %b 1430 store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16 1431 ret void 1432} 1433 1434; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that 1435; the compiler doesn't crash. 1436define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { 1437; SI-LABEL: insert_split_bb: 1438; SI: ; %bb.0: ; %entry 1439; SI-NEXT: s_load_dword s6, s[4:5], 0x4 1440; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1441; SI-NEXT: s_waitcnt lgkmcnt(0) 1442; SI-NEXT: s_cmp_lg_u32 s6, 0 1443; SI-NEXT: s_cbranch_scc0 .LBB30_4 1444; SI-NEXT: ; %bb.1: ; %else 1445; SI-NEXT: s_load_dword s7, s[2:3], 0x1 1446; SI-NEXT: s_mov_b64 s[4:5], 0 1447; SI-NEXT: s_andn2_b64 vcc, exec, s[4:5] 1448; SI-NEXT: s_waitcnt lgkmcnt(0) 1449; SI-NEXT: s_mov_b64 vcc, vcc 1450; SI-NEXT: s_cbranch_vccnz .LBB30_3 1451; SI-NEXT: .LBB30_2: ; %if 1452; SI-NEXT: s_load_dword s7, s[2:3], 0x0 1453; SI-NEXT: .LBB30_3: ; %endif 1454; SI-NEXT: s_waitcnt lgkmcnt(0) 1455; SI-NEXT: v_mov_b32_e32 v0, s6 1456; SI-NEXT: s_mov_b32 s3, 0x100f000 1457; SI-NEXT: s_mov_b32 s2, -1 1458; SI-NEXT: v_mov_b32_e32 v1, s7 1459; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1460; SI-NEXT: s_endpgm 1461; SI-NEXT: .LBB30_4: 1462; SI-NEXT: s_branch .LBB30_2 1463; 1464; VI-LABEL: insert_split_bb: 1465; VI: ; %bb.0: ; %entry 1466; VI-NEXT: s_load_dword s6, s[4:5], 0x10 1467; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1468; VI-NEXT: s_waitcnt lgkmcnt(0) 1469; VI-NEXT: s_cmp_lg_u32 s6, 0 1470; VI-NEXT: s_cbranch_scc0 .LBB30_4 1471; VI-NEXT: ; %bb.1: ; %else 1472; VI-NEXT: s_load_dword s7, s[2:3], 0x4 1473; VI-NEXT: s_cbranch_execnz .LBB30_3 1474; VI-NEXT: .LBB30_2: ; %if 1475; VI-NEXT: s_waitcnt lgkmcnt(0) 1476; VI-NEXT: s_load_dword s7, s[2:3], 0x0 1477; 
VI-NEXT: .LBB30_3: ; %endif 1478; VI-NEXT: s_waitcnt lgkmcnt(0) 1479; VI-NEXT: v_mov_b32_e32 v0, s6 1480; VI-NEXT: s_mov_b32 s3, 0x1100f000 1481; VI-NEXT: s_mov_b32 s2, -1 1482; VI-NEXT: v_mov_b32_e32 v1, s7 1483; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 1484; VI-NEXT: s_endpgm 1485; VI-NEXT: .LBB30_4: 1486; VI-NEXT: s_branch .LBB30_2 1487entry: 1488 %0 = insertelement <2 x i32> undef, i32 %a, i32 0 1489 %1 = icmp eq i32 %a, 0 1490 br i1 %1, label %if, label %else 1491 1492if: 1493 %2 = load i32, i32 addrspace(1)* %in 1494 %3 = insertelement <2 x i32> %0, i32 %2, i32 1 1495 br label %endif 1496 1497else: 1498 %4 = getelementptr i32, i32 addrspace(1)* %in, i32 1 1499 %5 = load i32, i32 addrspace(1)* %4 1500 %6 = insertelement <2 x i32> %0, i32 %5, i32 1 1501 br label %endif 1502 1503endif: 1504 %7 = phi <2 x i32> [%3, %if], [%6, %else] 1505 store <2 x i32> %7, <2 x i32> addrspace(1)* %out 1506 ret void 1507} 1508 1509define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind { 1510; SI-LABEL: dynamic_insertelement_v2f64: 1511; SI: ; %bb.0: 1512; SI-NEXT: s_load_dword s8, s[4:5], 0x18 1513; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xc 1514; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1515; SI-NEXT: v_mov_b32_e32 v1, 0x40200000 1516; SI-NEXT: s_mov_b32 s7, 0x100f000 1517; SI-NEXT: s_waitcnt lgkmcnt(0) 1518; SI-NEXT: s_cmp_eq_u32 s8, 1 1519; SI-NEXT: v_mov_b32_e32 v0, s3 1520; SI-NEXT: s_cselect_b64 vcc, -1, 0 1521; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 1522; SI-NEXT: v_mov_b32_e32 v0, s2 1523; SI-NEXT: s_cmp_eq_u32 s8, 0 1524; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc 1525; SI-NEXT: v_mov_b32_e32 v0, s1 1526; SI-NEXT: s_cselect_b64 vcc, -1, 0 1527; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 1528; SI-NEXT: v_mov_b32_e32 v0, s0 1529; SI-NEXT: s_mov_b32 s6, -1 1530; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 1531; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1532; 
SI-NEXT: s_endpgm 1533; 1534; VI-LABEL: dynamic_insertelement_v2f64: 1535; VI: ; %bb.0: 1536; VI-NEXT: s_load_dword s8, s[4:5], 0x60 1537; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x30 1538; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1539; VI-NEXT: v_mov_b32_e32 v1, 0x40200000 1540; VI-NEXT: s_mov_b32 s7, 0x1100f000 1541; VI-NEXT: s_waitcnt lgkmcnt(0) 1542; VI-NEXT: s_cmp_eq_u32 s8, 1 1543; VI-NEXT: v_mov_b32_e32 v0, s3 1544; VI-NEXT: s_cselect_b64 vcc, -1, 0 1545; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc 1546; VI-NEXT: v_mov_b32_e32 v0, s2 1547; VI-NEXT: s_cmp_eq_u32 s8, 0 1548; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc 1549; VI-NEXT: v_mov_b32_e32 v0, s1 1550; VI-NEXT: s_cselect_b64 vcc, -1, 0 1551; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc 1552; VI-NEXT: v_mov_b32_e32 v0, s0 1553; VI-NEXT: s_mov_b32 s6, -1 1554; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 1555; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1556; VI-NEXT: s_endpgm 1557 %vecins = insertelement <2 x double> %a, double 8.0, i32 %b 1558 store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 1559 ret void 1560} 1561 1562define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { 1563; SI-LABEL: dynamic_insertelement_v2i64: 1564; SI: ; %bb.0: 1565; SI-NEXT: s_load_dword s10, s[4:5], 0x8 1566; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 1567; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1568; SI-NEXT: s_mov_b32 s7, 0x100f000 1569; SI-NEXT: s_mov_b32 s6, -1 1570; SI-NEXT: s_waitcnt lgkmcnt(0) 1571; SI-NEXT: s_cmp_eq_u32 s10, 1 1572; SI-NEXT: v_mov_b32_e32 v0, s3 1573; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 1574; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[8:9] 1575; SI-NEXT: v_mov_b32_e32 v0, s2 1576; SI-NEXT: s_cmp_eq_u32 s10, 0 1577; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[8:9] 1578; SI-NEXT: v_mov_b32_e32 v0, s1 1579; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 1580; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3] 1581; SI-NEXT: 
; NOTE(review): The SI/VI check lines below are autogenerated by
; utils/update_llc_test_checks.py (see file header). Do not hand-edit them;
; if codegen changes, regenerate with the script so the assertions keep
; matching actual llc output for both -mcpu=kaveri (SI) and -mcpu=tonga (VI).
; NOTE(review): dynamic_insertelement_v2i64/v3i64/v4f64 use per-element
; s_cselect_b64 + v_cndmask selects for the dynamic index, while v8f64 uses
; m0 + v_movreld_b32 indexed writes — presumably a size-based lowering
; threshold; confirm against SIISelLowering before relying on it.
v_mov_b32_e32 v0, s0 1582; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[2:3] 1583; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1584; SI-NEXT: s_endpgm 1585; 1586; VI-LABEL: dynamic_insertelement_v2i64: 1587; VI: ; %bb.0: 1588; VI-NEXT: s_load_dword s10, s[4:5], 0x20 1589; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 1590; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 1591; VI-NEXT: s_mov_b32 s7, 0x1100f000 1592; VI-NEXT: s_mov_b32 s6, -1 1593; VI-NEXT: s_waitcnt lgkmcnt(0) 1594; VI-NEXT: s_cmp_eq_u32 s10, 1 1595; VI-NEXT: v_mov_b32_e32 v0, s3 1596; VI-NEXT: s_cselect_b64 s[8:9], -1, 0 1597; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[8:9] 1598; VI-NEXT: v_mov_b32_e32 v0, s2 1599; VI-NEXT: s_cmp_eq_u32 s10, 0 1600; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[8:9] 1601; VI-NEXT: v_mov_b32_e32 v0, s1 1602; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 1603; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3] 1604; VI-NEXT: v_mov_b32_e32 v0, s0 1605; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[2:3] 1606; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1607; VI-NEXT: s_endpgm 1608 %vecins = insertelement <2 x i64> %a, i64 5, i32 %b 1609 store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 1610 ret void 1611} 1612 1613define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { 1614; SI-LABEL: dynamic_insertelement_v3i64: 1615; SI: ; %bb.0: 1616; SI-NEXT: s_load_dword s12, s[4:5], 0x10 1617; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1618; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 1619; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xc 1620; SI-NEXT: s_mov_b32 s3, 0x100f000 1621; SI-NEXT: s_waitcnt lgkmcnt(0) 1622; SI-NEXT: s_cmp_eq_u32 s12, 1 1623; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 1624; SI-NEXT: v_mov_b32_e32 v0, s11 1625; SI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[6:7] 1626; SI-NEXT: v_mov_b32_e32 v0, s10 1627; SI-NEXT: s_cmp_eq_u32 s12, 0 1628; SI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[6:7] 1629; SI-NEXT: v_mov_b32_e32 v0, s9
1630; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 1631; SI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[6:7] 1632; SI-NEXT: v_mov_b32_e32 v0, s8 1633; SI-NEXT: s_cmp_eq_u32 s12, 2 1634; SI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[6:7] 1635; SI-NEXT: v_mov_b32_e32 v4, s5 1636; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 1637; SI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[6:7] 1638; SI-NEXT: v_mov_b32_e32 v4, s4 1639; SI-NEXT: s_mov_b32 s2, -1 1640; SI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[6:7] 1641; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 1642; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1643; SI-NEXT: s_endpgm 1644; 1645; VI-LABEL: dynamic_insertelement_v3i64: 1646; VI: ; %bb.0: 1647; VI-NEXT: s_load_dword s12, s[4:5], 0x40 1648; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1649; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x20 1650; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x30 1651; VI-NEXT: s_mov_b32 s3, 0x1100f000 1652; VI-NEXT: s_waitcnt lgkmcnt(0) 1653; VI-NEXT: s_cmp_eq_u32 s12, 1 1654; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 1655; VI-NEXT: v_mov_b32_e32 v0, s11 1656; VI-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[6:7] 1657; VI-NEXT: v_mov_b32_e32 v0, s10 1658; VI-NEXT: s_cmp_eq_u32 s12, 0 1659; VI-NEXT: v_cndmask_b32_e64 v2, v0, 5, s[6:7] 1660; VI-NEXT: v_mov_b32_e32 v0, s9 1661; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 1662; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[6:7] 1663; VI-NEXT: v_mov_b32_e32 v0, s8 1664; VI-NEXT: s_cmp_eq_u32 s12, 2 1665; VI-NEXT: v_cndmask_b32_e64 v0, v0, 5, s[6:7] 1666; VI-NEXT: v_mov_b32_e32 v4, s5 1667; VI-NEXT: s_cselect_b64 s[6:7], -1, 0 1668; VI-NEXT: v_cndmask_b32_e64 v5, v4, 0, s[6:7] 1669; VI-NEXT: v_mov_b32_e32 v4, s4 1670; VI-NEXT: s_mov_b32 s2, -1 1671; VI-NEXT: v_cndmask_b32_e64 v4, v4, 5, s[6:7] 1672; VI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 1673; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1674; VI-NEXT: s_endpgm 1675 %vecins = insertelement <3 x i64> %a, i64 5, i32 %b 1676 store <3 x i64> %vecins, <3 x
i64> addrspace(1)* %out, align 32 1677 ret void 1678} 1679 1680define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { 1681; SI-LABEL: dynamic_insertelement_v4f64: 1682; SI: ; %bb.0: 1683; SI-NEXT: s_load_dword s6, s[4:5], 0x10 1684; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8 1685; SI-NEXT: v_mov_b32_e32 v4, 0x40200000 1686; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1687; SI-NEXT: s_mov_b32 s3, 0x100f000 1688; SI-NEXT: s_waitcnt lgkmcnt(0) 1689; SI-NEXT: s_cmp_eq_u32 s6, 1 1690; SI-NEXT: v_mov_b32_e32 v0, s11 1691; SI-NEXT: s_cselect_b64 vcc, -1, 0 1692; SI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc 1693; SI-NEXT: v_mov_b32_e32 v0, s10 1694; SI-NEXT: s_cmp_eq_u32 s6, 0 1695; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc 1696; SI-NEXT: v_mov_b32_e32 v0, s9 1697; SI-NEXT: s_cselect_b64 vcc, -1, 0 1698; SI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc 1699; SI-NEXT: v_mov_b32_e32 v0, s8 1700; SI-NEXT: s_cmp_eq_u32 s6, 3 1701; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 1702; SI-NEXT: v_mov_b32_e32 v5, s15 1703; SI-NEXT: s_cselect_b64 vcc, -1, 0 1704; SI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc 1705; SI-NEXT: v_mov_b32_e32 v5, s14 1706; SI-NEXT: s_cmp_eq_u32 s6, 2 1707; SI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc 1708; SI-NEXT: v_mov_b32_e32 v5, s13 1709; SI-NEXT: s_cselect_b64 vcc, -1, 0 1710; SI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc 1711; SI-NEXT: v_mov_b32_e32 v4, s12 1712; SI-NEXT: s_mov_b32 s2, -1 1713; SI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 1714; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1715; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1716; SI-NEXT: s_endpgm 1717; 1718; VI-LABEL: dynamic_insertelement_v4f64: 1719; VI: ; %bb.0: 1720; VI-NEXT: s_load_dword s6, s[4:5], 0x40 1721; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20 1722; VI-NEXT: v_mov_b32_e32 v4, 0x40200000 1723; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1724; VI-NEXT: s_mov_b32 s3, 0x1100f000 1725; VI-NEXT: 
s_waitcnt lgkmcnt(0) 1726; VI-NEXT: s_cmp_eq_u32 s6, 1 1727; VI-NEXT: v_mov_b32_e32 v0, s11 1728; VI-NEXT: s_cselect_b64 vcc, -1, 0 1729; VI-NEXT: v_cndmask_b32_e32 v3, v0, v4, vcc 1730; VI-NEXT: v_mov_b32_e32 v0, s10 1731; VI-NEXT: s_cmp_eq_u32 s6, 0 1732; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, vcc 1733; VI-NEXT: v_mov_b32_e32 v0, s9 1734; VI-NEXT: s_cselect_b64 vcc, -1, 0 1735; VI-NEXT: v_cndmask_b32_e32 v1, v0, v4, vcc 1736; VI-NEXT: v_mov_b32_e32 v0, s8 1737; VI-NEXT: s_cmp_eq_u32 s6, 3 1738; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 1739; VI-NEXT: v_mov_b32_e32 v5, s15 1740; VI-NEXT: s_cselect_b64 vcc, -1, 0 1741; VI-NEXT: v_cndmask_b32_e32 v7, v5, v4, vcc 1742; VI-NEXT: v_mov_b32_e32 v5, s14 1743; VI-NEXT: s_cmp_eq_u32 s6, 2 1744; VI-NEXT: v_cndmask_b32_e64 v6, v5, 0, vcc 1745; VI-NEXT: v_mov_b32_e32 v5, s13 1746; VI-NEXT: s_cselect_b64 vcc, -1, 0 1747; VI-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc 1748; VI-NEXT: v_mov_b32_e32 v4, s12 1749; VI-NEXT: s_mov_b32 s2, -1 1750; VI-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 1751; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1752; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1753; VI-NEXT: s_endpgm 1754 %vecins = insertelement <4 x double> %a, double 8.0, i32 %b 1755 store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 1756 ret void 1757} 1758 1759define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 { 1760; SI-LABEL: dynamic_insertelement_v8f64: 1761; SI: ; %bb.0: 1762; SI-NEXT: s_load_dword s6, s[4:5], 0x20 1763; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x10 1764; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1765; SI-NEXT: v_mov_b32_e32 v16, 0x40200000 1766; SI-NEXT: s_mov_b32 s3, 0x100f000 1767; SI-NEXT: s_waitcnt lgkmcnt(0) 1768; SI-NEXT: s_lshl_b32 s4, s6, 1 1769; SI-NEXT: v_mov_b32_e32 v0, s8 1770; SI-NEXT: v_mov_b32_e32 v1, s9 1771; SI-NEXT: v_mov_b32_e32 v2, s10 1772; SI-NEXT: v_mov_b32_e32 v3, s11 1773; SI-NEXT: 
v_mov_b32_e32 v4, s12 1774; SI-NEXT: v_mov_b32_e32 v5, s13 1775; SI-NEXT: v_mov_b32_e32 v6, s14 1776; SI-NEXT: v_mov_b32_e32 v7, s15 1777; SI-NEXT: v_mov_b32_e32 v8, s16 1778; SI-NEXT: v_mov_b32_e32 v9, s17 1779; SI-NEXT: v_mov_b32_e32 v10, s18 1780; SI-NEXT: v_mov_b32_e32 v11, s19 1781; SI-NEXT: v_mov_b32_e32 v12, s20 1782; SI-NEXT: v_mov_b32_e32 v13, s21 1783; SI-NEXT: v_mov_b32_e32 v14, s22 1784; SI-NEXT: v_mov_b32_e32 v15, s23 1785; SI-NEXT: s_mov_b32 m0, s4 1786; SI-NEXT: v_movreld_b32_e32 v0, 0 1787; SI-NEXT: s_mov_b32 s2, -1 1788; SI-NEXT: v_movreld_b32_e32 v1, v16 1789; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 1790; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 1791; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1792; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1793; SI-NEXT: s_endpgm 1794; 1795; VI-LABEL: dynamic_insertelement_v8f64: 1796; VI: ; %bb.0: 1797; VI-NEXT: s_load_dword s6, s[4:5], 0x80 1798; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40 1799; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 1800; VI-NEXT: v_mov_b32_e32 v16, 0x40200000 1801; VI-NEXT: s_mov_b32 s3, 0x1100f000 1802; VI-NEXT: s_waitcnt lgkmcnt(0) 1803; VI-NEXT: s_lshl_b32 s4, s6, 1 1804; VI-NEXT: v_mov_b32_e32 v0, s8 1805; VI-NEXT: v_mov_b32_e32 v1, s9 1806; VI-NEXT: v_mov_b32_e32 v2, s10 1807; VI-NEXT: v_mov_b32_e32 v3, s11 1808; VI-NEXT: v_mov_b32_e32 v4, s12 1809; VI-NEXT: v_mov_b32_e32 v5, s13 1810; VI-NEXT: v_mov_b32_e32 v6, s14 1811; VI-NEXT: v_mov_b32_e32 v7, s15 1812; VI-NEXT: v_mov_b32_e32 v8, s16 1813; VI-NEXT: v_mov_b32_e32 v9, s17 1814; VI-NEXT: v_mov_b32_e32 v10, s18 1815; VI-NEXT: v_mov_b32_e32 v11, s19 1816; VI-NEXT: v_mov_b32_e32 v12, s20 1817; VI-NEXT: v_mov_b32_e32 v13, s21 1818; VI-NEXT: v_mov_b32_e32 v14, s22 1819; VI-NEXT: v_mov_b32_e32 v15, s23 1820; VI-NEXT: s_mov_b32 m0, s4 1821; VI-NEXT: v_movreld_b32_e32 v0, 0 1822; VI-NEXT: s_mov_b32 s2, -1 1823; VI-NEXT: v_movreld_b32_e32 v1, v16 1824; 
VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 1825; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 1826; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 1827; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1828; VI-NEXT: s_endpgm 1829 %vecins = insertelement <8 x double> %a, double 8.0, i32 %b 1830 store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 1831 ret void 1832} 1833 1834declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 1835 1836attributes #0 = { nounwind } 1837attributes #1 = { nounwind readnone } 1838