; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=SI
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=VI

define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s4, s5
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s4, s5
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: test_s_sext_i32_to_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dword s6, s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mul_i32 s4, s4, s5
; SI-NEXT:    s_add_i32 s4, s4, s6
; SI-NEXT:    s_ashr_i32 s5, s4, 31
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_s_sext_i32_to_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s6, s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mul_i32 s4, s4, s5
; VI-NEXT:    s_add_i32 s4, s4, s6
; VI-NEXT:    s_ashr_i32 s5, s4, 31
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
  %sext = sext i32 %add to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s4, s5
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s4, s5
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s5, s4, 31
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s5, s4, 31
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %sext = sext i32 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
; SI-LABEL: s_sext_i16_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i16_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %sext = sext i16 %a to i64
  store i64 %sext, i64 addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s4, s5
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s4, s5
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; The purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way through the legalizer/optimizer, so we can verify
; that it is selected correctly. In s_sext_i1_to_i16, the sign_extend node
; is optimized to a select very early.
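; (Illustrative sketch, not part of the checked output: that early fold
; rewrites the sign extension to IR along the lines of
;    %sext = select i1 %cmp, i16 -1, i16 0
; since sext of i1 true produces all ones.)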
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; SI-LABEL: s_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s4, s5
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    s_cmp_eq_u32 s6, s7
; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; SI-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s4, s5
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s6, s7
; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
; VI-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: v_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dword s6, s[0:1], 0xd
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
; SI-NEXT:    s_cmp_eq_u32 s5, s6
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dword s6, s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
; VI-NEXT:    s_cmp_eq_u32 s5, s6
; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; VI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, i16 addrspace(1)* %out
  ret void
}

; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register. To fix this we need to be able to
; optimize:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s5, s4, 24
; SI-NEXT:    s_bfe_i32 s6, s4, 0x80010
; SI-NEXT:    s_bfe_i32 s7, s4, 0x80008
; SI-NEXT:    s_sext_i32_i8 s4, s4
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e64 v0, 8, s4
; VI-NEXT:    s_ashr_i32 s5, s4, 24
; VI-NEXT:    s_bfe_i32 s6, s4, 0x80010
; VI-NEXT:    s_sext_i32_i8 s4, s4
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; FIXME: We need to optimize the same sequence as in the test above to
; avoid this shift.
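; (Illustrative assumption, not the implemented combine: a fold such as
;    (sign_extend_inreg (any_extend (srl (trunc x), 8)), i8)
;      -> (sign_extend_inreg (srl x, 8), i8)
; would let byte 1 select to a single v_bfe_i32 v1, v0, 8, 8, matching the
; SI output below, instead of v_lshrrev_b16 followed by v_bfe_i32.)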
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT:    v_bfe_i32 v2, v0, 16, 8
; SI-NEXT:    v_bfe_i32 v3, v0, 8, 8
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_ashrrev_i32_e32 v2, 24, v0
; VI-NEXT:    v_bfe_i32 v3, v0, 16, 8
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 8
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i32, i32 addrspace(1)* %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

; FIXME: s_bfe_i64, same on SI and VI
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
; SI-LABEL: s_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_ashr_i64 s[0:1], s[2:3], 48
; SI-NEXT:    s_ashr_i32 s1, s2, 16
; SI-NEXT:    s_sext_i32_i16 s2, s2
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    s_sext_i32_i16 s3, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_ashr_i32 s1, s2, 16
; VI-NEXT:    s_sext_i32_i16 s2, s2
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    s_ashr_i32 s0, s3, 16
; VI-NEXT:    s_sext_i32_i16 s3, s3
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i64, i64 addrspace(1)* %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, i32 addrspace(1)* %out
  store volatile i32 %elt1, i32 addrspace(1)* %out
  store volatile i32 %elt2, i32 addrspace(1)* %out
  store volatile i32 %elt3, i32 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }