; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=TONGA %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs | FileCheck -check-prefixes=GFX81 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefixes=GFX10 %s

; Codegen test for image sample intrinsics whose data result is half-typed
; (half, <2 x half>, <3 x half>, <4 x half>), each with and without an extra
; i32 member in the returned struct. Every function is compiled once per
; -mcpu value listed in the invocations above (tonga, gfx810, gfx900, gfx1010)
; and the full emitted assembly is checked. Every checked image_sample*
; instruction carries the d16 modifier.
;
; The assertion comments inside each function are generated output; per the
; note on the first line they come from utils/update_llc_test_checks.py.

; Scalar half result from the sample.2d intrinsic. The leading i32 operand is
; 1 and the checked instruction shows dmask:0x1.
define amdgpu_ps half @image_sample_2d_f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
; TONGA-LABEL: image_sample_2d_f16:
; TONGA: ; %bb.0: ; %main_body
; TONGA-NEXT: s_mov_b64 s[12:13], exec
; TONGA-NEXT: s_wqm_b64 exec, exec
; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
; TONGA-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: ; return to shader part epilog
;
; GFX81-LABEL: image_sample_2d_f16:
; GFX81: ; %bb.0: ; %main_body
; GFX81-NEXT: s_mov_b64 s[12:13], exec
; GFX81-NEXT: s_wqm_b64 exec, exec
; GFX81-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX81-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16
; GFX81-NEXT: s_waitcnt vmcnt(0)
; GFX81-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: image_sample_2d_f16:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: s_mov_b64 s[12:13], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
; GFX9-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_2d_f16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
  %tex = call half @llvm.amdgcn.image.sample.2d.f16.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  ret half %tex
}

; Same sample as image_sample_2d_f16, but the intrinsic returns {half, i32}
; and its second-to-last i32 operand is 1 instead of 0; the checked
; instruction gains the tfe modifier. The i32 member is stored through %out
; and the half member is returned.
define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, i32 addrspace(1)* inreg %out) {
; TONGA-LABEL: image_sample_2d_f16_tfe:
; TONGA: ; %bb.0: ; %main_body
; TONGA-NEXT: s_mov_b64 s[14:15], exec
; TONGA-NEXT: s_wqm_b64 exec, exec
; TONGA-NEXT: v_mov_b32_e32 v2, 0
; TONGA-NEXT: v_mov_b32_e32 v3, v2
; TONGA-NEXT: s_and_b64 exec, exec, s[14:15]
; TONGA-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
; TONGA-NEXT: v_mov_b32_e32 v0, s12
; TONGA-NEXT: v_mov_b32_e32 v1, s13
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: flat_store_dword v[0:1], v3
; TONGA-NEXT: v_mov_b32_e32 v0, v2
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: ; return to shader part epilog
;
; GFX81-LABEL: image_sample_2d_f16_tfe:
; GFX81: ; %bb.0: ; %main_body
; GFX81-NEXT: s_mov_b64 s[14:15], exec
; GFX81-NEXT: s_wqm_b64 exec, exec
; GFX81-NEXT: v_mov_b32_e32 v2, 0
; GFX81-NEXT: v_mov_b32_e32 v3, v2
; GFX81-NEXT: s_and_b64 exec, exec, s[14:15]
; GFX81-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
; GFX81-NEXT: v_mov_b32_e32 v0, s12
; GFX81-NEXT: v_mov_b32_e32 v1, s13
; GFX81-NEXT: s_waitcnt vmcnt(0)
; GFX81-NEXT: flat_store_dword v[0:1], v3
; GFX81-NEXT: v_mov_b32_e32 v0, v2
; GFX81-NEXT: s_waitcnt vmcnt(0)
; GFX81-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: image_sample_2d_f16_tfe:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: s_mov_b64 s[14:15], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v5, v4
; GFX9-NEXT: v_mov_b32_e32 v2, v4
; GFX9-NEXT: v_mov_b32_e32 v3, v5
; GFX9-NEXT: s_and_b64 exec, exec, s[14:15]
; GFX9-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, v2
; GFX9-NEXT: global_store_dword v4, v3, s[12:13]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_2d_f16_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s14, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v4, 0
; GFX10-NEXT: v_mov_b32_e32 v5, v4
; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: v_mov_b32_e32 v3, v5
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s14
; GFX10-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: global_store_dword v4, v3, s[12:13]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: ; return to shader part epilog
main_body:
  %tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
  %tex.vec = extractvalue {half, i32} %tex, 0
  %tex.err = extractvalue {half, i32} %tex, 1
  store i32 %tex.err, i32 addrspace(1)* %out, align 4
  ret half %tex.vec
}

; <2 x half> result from sample.c.d.1d (leading i32 operand 3, checked as
; dmask:0x3), bitcast to float for the return. The tonga checks show a
; two-register result recombined with a shift and an or; the other three
; targets show a single result register and no recombination.
define amdgpu_ps float @image_sample_c_d_1d_v2f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
; TONGA-LABEL: image_sample_c_d_1d_v2f16:
; TONGA: ; %bb.0: ; %main_body
; TONGA-NEXT: image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; TONGA-NEXT: ; return to shader part epilog
;
; GFX81-LABEL: image_sample_c_d_1d_v2f16:
; GFX81: ; %bb.0: ; %main_body
; GFX81-NEXT: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16
; GFX81-NEXT: s_waitcnt vmcnt(0)
; GFX81-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: image_sample_c_d_1d_v2f16:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_c_d_1d_v2f16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
  %tex = call <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  %r = bitcast <2 x half> %tex to float
  ret float %r
}

; Variant of image_sample_c_d_1d_v2f16 whose intrinsic returns
; {<2 x half>, i32} with the second-to-last i32 operand set to 1 (checked with
; the tfe modifier). The packed halves go to lane 0 of the <2 x float> return
; value and the i32 member, bitcast to float, goes to lane 1.
define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
; TONGA-LABEL: image_sample_c_d_1d_v2f16_tfe:
; TONGA: ; %bb.0: ; %main_body
; TONGA-NEXT: v_mov_b32_e32 v4, 0
; TONGA-NEXT: v_mov_b32_e32 v5, v4
; TONGA-NEXT: v_mov_b32_e32 v6, v4
; TONGA-NEXT: image_sample_c_d v[4:6], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; TONGA-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; TONGA-NEXT: v_mov_b32_e32 v1, v6
; TONGA-NEXT: ; return to shader part epilog
;
; GFX81-LABEL: image_sample_c_d_1d_v2f16_tfe:
; GFX81: ; %bb.0: ; %main_body
; GFX81-NEXT: v_mov_b32_e32 v4, 0
; GFX81-NEXT: v_mov_b32_e32 v5, v4
; GFX81-NEXT: image_sample_c_d v[4:5], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16
; GFX81-NEXT: s_waitcnt vmcnt(0)
; GFX81-NEXT: v_mov_b32_e32 v0, v4
; GFX81-NEXT: v_mov_b32_e32 v1, v5
; GFX81-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: image_sample_c_d_1d_v2f16_tfe:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v5, v4
; GFX9-NEXT: image_sample_c_d v[4:5], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_c_d_1d_v2f16_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
  %tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
  %tex.vec = extractvalue {<2 x half>, i32} %tex, 0
  %tex.err = extractvalue {<2 x half>, i32} %tex, 1
  %tex.vecf = bitcast <2 x half> %tex.vec to float
  %r.0 = insertelement <2 x float> undef, float %tex.vecf, i32 0
  %tex.errf = bitcast i32 %tex.err to float
  %r = insertelement <2 x float> %r.0, float %tex.errf, i32 1
  ret <2 x float> %r
}

; <3 x half> result from sample.b.2d (leading i32 operand 7, checked as
; dmask:0x7). The result is widened to <4 x half> by a shufflevector whose
; fourth mask index (3) selects from the undef second operand, then bitcast to
; <2 x float> for the return.
define amdgpu_ps <2 x float> @image_sample_b_2d_v3f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
; TONGA-LABEL: image_sample_b_2d_v3f16:
; TONGA: ; %bb.0: ; %main_body
; TONGA-NEXT: s_mov_b64 s[12:13], exec
; TONGA-NEXT: s_wqm_b64 exec, exec
; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
; TONGA-NEXT: image_sample_b v[0:2], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; TONGA-NEXT: v_mov_b32_e32 v1, v2
; TONGA-NEXT: ; return to shader part epilog
;
; GFX81-LABEL: image_sample_b_2d_v3f16:
; GFX81: ; %bb.0: ; %main_body
; GFX81-NEXT: s_mov_b64 s[12:13], exec
; GFX81-NEXT: s_wqm_b64 exec, exec
; GFX81-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX81-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
; GFX81-NEXT: s_waitcnt vmcnt(0)
; GFX81-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: image_sample_b_2d_v3f16:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: s_mov_b64 s[12:13], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
; GFX9-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_b_2d_v3f16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
  %tex = call <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  %tex_wide = shufflevector <3 x half> %tex, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %r = bitcast <4 x half> %tex_wide to <2 x float>
  ret <2 x float> %r
}

; Variant of image_sample_b_2d_v3f16 whose intrinsic returns
; {<3 x half>, i32} with the second-to-last i32 operand set to 1 (checked with
; the tfe modifier). In the <4 x float> return value, lanes 0 and 1 hold the
; widened half data, lane 2 holds the i32 member bitcast to float, and lane 3
; is never written (it stays undef).
define amdgpu_ps <4 x float> @image_sample_b_2d_v3f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
; TONGA-LABEL: image_sample_b_2d_v3f16_tfe:
; TONGA: ; %bb.0: ; %main_body
; TONGA-NEXT: s_mov_b64 s[12:13], exec
; TONGA-NEXT: s_wqm_b64 exec, exec
; TONGA-NEXT: v_mov_b32_e32 v3, 0
; TONGA-NEXT: v_mov_b32_e32 v4, v3
; TONGA-NEXT: v_mov_b32_e32 v5, v3
; TONGA-NEXT: v_mov_b32_e32 v6, v3
; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
; TONGA-NEXT: image_sample_b v[3:6], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; TONGA-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; TONGA-NEXT: v_mov_b32_e32 v1, v5
; TONGA-NEXT: v_mov_b32_e32 v2, v6
; TONGA-NEXT: ; return to shader part epilog
;
; GFX81-LABEL: image_sample_b_2d_v3f16_tfe:
; GFX81: ; %bb.0: ; %main_body
; GFX81-NEXT: s_mov_b64 s[12:13], exec
; GFX81-NEXT: s_wqm_b64 exec, exec
; GFX81-NEXT: v_mov_b32_e32 v3, 0
; GFX81-NEXT: v_mov_b32_e32 v4, v3
; GFX81-NEXT: v_mov_b32_e32 v5, v3
; GFX81-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX81-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
; GFX81-NEXT: s_waitcnt vmcnt(0)
; GFX81-NEXT: v_mov_b32_e32 v0, v3
; GFX81-NEXT: v_mov_b32_e32 v1, v4
; GFX81-NEXT: v_mov_b32_e32 v2, v5
; GFX81-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: image_sample_b_2d_v3f16_tfe:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: s_mov_b64 s[12:13], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_mov_b32_e32 v4, v3
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0x7 tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: v_mov_b32_e32 v2, v5
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_b_2d_v3f16_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
  %tex = call {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
  %tex.vec = extractvalue {<3 x half>, i32} %tex, 0
  %tex.vec_wide = shufflevector <3 x half> %tex.vec, <3 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tex.err = extractvalue {<3 x half>, i32} %tex, 1
  %tex.vecf = bitcast <4 x half> %tex.vec_wide to <2 x float>
  %tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0
  %tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1
  %r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0
  %r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1
  %tex.errf = bitcast i32 %tex.err to float
  %r = insertelement <4 x float> %r.1, float %tex.errf, i32 2
  ret <4 x float> %r
}

; <4 x half> result from sample.b.2d (leading i32 operand 15, checked as
; dmask:0xf), bitcast to <2 x float> for the return. The tonga checks show a
; four-register result recombined into two registers; the other three targets
; show a two-register result returned as-is.
define amdgpu_ps <2 x float> @image_sample_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
; TONGA-LABEL: image_sample_b_2d_v4f16:
; TONGA: ; %bb.0: ; %main_body
; TONGA-NEXT: s_mov_b64 s[12:13], exec
; TONGA-NEXT: s_wqm_b64 exec, exec
; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
; TONGA-NEXT: image_sample_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0xf d16
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; TONGA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; TONGA-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; TONGA-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; TONGA-NEXT: ; return to shader part epilog
;
; GFX81-LABEL: image_sample_b_2d_v4f16:
; GFX81: ; %bb.0: ; %main_body
; GFX81-NEXT: s_mov_b64 s[12:13], exec
; GFX81-NEXT: s_wqm_b64 exec, exec
; GFX81-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX81-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf d16
; GFX81-NEXT: s_waitcnt vmcnt(0)
; GFX81-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: image_sample_b_2d_v4f16:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: s_mov_b64 s[12:13], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
; GFX9-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_b_2d_v4f16:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
  %tex = call <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
  %r = bitcast <4 x half> %tex to <2 x float>
  ret <2 x float> %r
}

; Variant of image_sample_b_2d_v4f16 whose intrinsic returns
; {<4 x half>, i32} with the second-to-last i32 operand set to 1 (checked with
; the tfe modifier). In the <4 x float> return value, lanes 0 and 1 hold the
; half data, lane 2 holds the i32 member bitcast to float, and lane 3 is never
; written (it stays undef).
define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
; TONGA-LABEL: image_sample_b_2d_v4f16_tfe:
; TONGA: ; %bb.0: ; %main_body
; TONGA-NEXT: s_mov_b64 s[12:13], exec
; TONGA-NEXT: s_wqm_b64 exec, exec
; TONGA-NEXT: v_mov_b32_e32 v3, 0
; TONGA-NEXT: v_mov_b32_e32 v4, v3
; TONGA-NEXT: v_mov_b32_e32 v5, v3
; TONGA-NEXT: v_mov_b32_e32 v6, v3
; TONGA-NEXT: v_mov_b32_e32 v7, v3
; TONGA-NEXT: s_and_b64 exec, exec, s[12:13]
; TONGA-NEXT: image_sample_b v[3:7], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v6
; TONGA-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; TONGA-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; TONGA-NEXT: v_mov_b32_e32 v2, v7
; TONGA-NEXT: ; return to shader part epilog
;
; GFX81-LABEL: image_sample_b_2d_v4f16_tfe:
; GFX81: ; %bb.0: ; %main_body
; GFX81-NEXT: s_mov_b64 s[12:13], exec
; GFX81-NEXT: s_wqm_b64 exec, exec
; GFX81-NEXT: v_mov_b32_e32 v3, 0
; GFX81-NEXT: v_mov_b32_e32 v4, v3
; GFX81-NEXT: v_mov_b32_e32 v5, v3
; GFX81-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX81-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16
; GFX81-NEXT: s_waitcnt vmcnt(0)
; GFX81-NEXT: v_mov_b32_e32 v0, v3
; GFX81-NEXT: v_mov_b32_e32 v1, v4
; GFX81-NEXT: v_mov_b32_e32 v2, v5
; GFX81-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: image_sample_b_2d_v4f16_tfe:
; GFX9: ; %bb.0: ; %main_body
; GFX9-NEXT: s_mov_b64 s[12:13], exec
; GFX9-NEXT: s_wqm_b64 exec, exec
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_mov_b32_e32 v4, v3
; GFX9-NEXT: v_mov_b32_e32 v5, v3
; GFX9-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-NEXT: image_sample_b v[3:5], v[0:2], s[0:7], s[8:11] dmask:0xf tfe d16
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, v3
; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: v_mov_b32_e32 v2, v5
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: image_sample_b_2d_v4f16_tfe:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
  %tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
  %tex.vec = extractvalue {<4 x half>, i32} %tex, 0
  %tex.err = extractvalue {<4 x half>, i32} %tex, 1
  %tex.vecf = bitcast <4 x half> %tex.vec to <2 x float>
  %tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0
  %tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1
  %r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0
  %r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1
  %tex.errf = bitcast i32 %tex.err to float
  %r = insertelement <4 x float> %r.1, float %tex.errf, i32 2
  ret <4 x float> %r
}

; Intrinsic declarations; all of them carry attribute group #1.
; NOTE(review): the sample.2d.v3f16.f32, sample.2d.v4f16.f32 and
; sample.2d.v2f16i32.f32 declarations have no call site in this file.
declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <3 x half> @llvm.amdgcn.image.sample.2d.v3f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<2 x half>,i32} @llvm.amdgcn.image.sample.2d.v2f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <3 x half> @llvm.amdgcn.image.sample.b.2d.v3f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

; NOTE(review): only group #1 is referenced by the visible declarations;
; groups #0 and #2 are defined but have no visible use in this file.
attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind readnone }