1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI %s 3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s 4 5; half args should be promoted to float for SI and lower. 6 7define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { 8; SI-LABEL: load_f16_arg: 9; SI: ; %bb.0: 10; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11; SI-NEXT: s_load_dword s2, s[4:5], 0x2 12; SI-NEXT: s_waitcnt lgkmcnt(0) 13; SI-NEXT: v_mov_b32_e32 v0, s0 14; SI-NEXT: v_mov_b32_e32 v1, s1 15; SI-NEXT: v_mov_b32_e32 v2, s2 16; SI-NEXT: flat_store_short v[0:1], v2 17; SI-NEXT: s_endpgm 18; 19; VI-LABEL: load_f16_arg: 20; VI: ; %bb.0: 21; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 22; VI-NEXT: s_load_dword s2, s[4:5], 0x8 23; VI-NEXT: s_waitcnt lgkmcnt(0) 24; VI-NEXT: v_mov_b32_e32 v0, s0 25; VI-NEXT: v_mov_b32_e32 v1, s1 26; VI-NEXT: v_mov_b32_e32 v2, s2 27; VI-NEXT: flat_store_short v[0:1], v2 28; VI-NEXT: s_endpgm 29 store half %arg, half addrspace(1)* %out 30 ret void 31} 32 33define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { 34; SI-LABEL: load_v2f16_arg: 35; SI: ; %bb.0: 36; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 37; SI-NEXT: s_load_dword s2, s[4:5], 0x2 38; SI-NEXT: s_waitcnt lgkmcnt(0) 39; SI-NEXT: v_mov_b32_e32 v0, s0 40; SI-NEXT: v_mov_b32_e32 v1, s1 41; SI-NEXT: v_mov_b32_e32 v2, s2 42; SI-NEXT: flat_store_dword v[0:1], v2 43; SI-NEXT: s_endpgm 44; 45; VI-LABEL: load_v2f16_arg: 46; VI: ; %bb.0: 47; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 48; VI-NEXT: s_load_dword s2, s[4:5], 0x8 49; VI-NEXT: s_waitcnt lgkmcnt(0) 50; VI-NEXT: v_mov_b32_e32 v0, s0 51; VI-NEXT: v_mov_b32_e32 v1, s1 52; VI-NEXT: v_mov_b32_e32 v2, s2 53; VI-NEXT: flat_store_dword v[0:1], v2 54; VI-NEXT: s_endpgm 55 store <2 x half> %arg, <2 x half> addrspace(1)* %out 56 ret void 57} 58 59define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { 60; SI-LABEL: load_v3f16_arg: 61; SI: ; %bb.0: 62; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 63; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 64; SI-NEXT: s_waitcnt lgkmcnt(0) 65; SI-NEXT: s_add_u32 s4, s0, 4 66; SI-NEXT: s_addc_u32 s5, s1, 0 67; SI-NEXT: v_mov_b32_e32 v2, s4 68; SI-NEXT: v_mov_b32_e32 v4, s3 69; SI-NEXT: v_mov_b32_e32 v0, s0 70; SI-NEXT: v_mov_b32_e32 v3, s5 71; SI-NEXT: v_mov_b32_e32 v1, s1 72; SI-NEXT: v_mov_b32_e32 v5, s2 73; SI-NEXT: flat_store_short v[2:3], v4 74; SI-NEXT: flat_store_dword v[0:1], v5 75; SI-NEXT: s_endpgm 76; 77; VI-LABEL: load_v3f16_arg: 78; VI: ; %bb.0: 79; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 80; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 81; VI-NEXT: s_waitcnt lgkmcnt(0) 82; VI-NEXT: s_add_u32 s4, s0, 4 83; VI-NEXT: s_addc_u32 s5, s1, 0 84; VI-NEXT: v_mov_b32_e32 v2, s4 85; VI-NEXT: v_mov_b32_e32 v4, s3 86; VI-NEXT: v_mov_b32_e32 v0, s0 87; VI-NEXT: v_mov_b32_e32 v3, s5 88; VI-NEXT: v_mov_b32_e32 v1, s1 89; VI-NEXT: v_mov_b32_e32 v5, s2 90; VI-NEXT: flat_store_short v[2:3], v4 91; VI-NEXT: flat_store_dword v[0:1], v5 92; VI-NEXT: s_endpgm 93 store <3 x half> %arg, <3 x half> addrspace(1)* %out 94 ret void 95} 96 97 98; FIXME: Why not one load? 
99define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { 100; SI-LABEL: load_v4f16_arg: 101; SI: ; %bb.0: 102; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 103; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 104; SI-NEXT: s_waitcnt lgkmcnt(0) 105; SI-NEXT: v_mov_b32_e32 v0, s0 106; SI-NEXT: v_mov_b32_e32 v2, s2 107; SI-NEXT: v_mov_b32_e32 v1, s1 108; SI-NEXT: v_mov_b32_e32 v3, s3 109; SI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 110; SI-NEXT: s_endpgm 111; 112; VI-LABEL: load_v4f16_arg: 113; VI: ; %bb.0: 114; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 115; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 116; VI-NEXT: s_waitcnt lgkmcnt(0) 117; VI-NEXT: v_mov_b32_e32 v0, s0 118; VI-NEXT: v_mov_b32_e32 v2, s2 119; VI-NEXT: v_mov_b32_e32 v1, s1 120; VI-NEXT: v_mov_b32_e32 v3, s3 121; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 122; VI-NEXT: s_endpgm 123 store <4 x half> %arg, <4 x half> addrspace(1)* %out 124 ret void 125} 126 127define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { 128; SI-LABEL: load_v8f16_arg: 129; SI: ; %bb.0: 130; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 131; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 132; SI-NEXT: s_waitcnt lgkmcnt(0) 133; SI-NEXT: v_mov_b32_e32 v4, s6 134; SI-NEXT: v_mov_b32_e32 v0, s0 135; SI-NEXT: v_mov_b32_e32 v5, s7 136; SI-NEXT: v_mov_b32_e32 v1, s1 137; SI-NEXT: v_mov_b32_e32 v2, s2 138; SI-NEXT: v_mov_b32_e32 v3, s3 139; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 140; SI-NEXT: s_endpgm 141; 142; VI-LABEL: load_v8f16_arg: 143; VI: ; %bb.0: 144; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 145; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 146; VI-NEXT: s_waitcnt lgkmcnt(0) 147; VI-NEXT: v_mov_b32_e32 v4, s6 148; VI-NEXT: v_mov_b32_e32 v0, s0 149; VI-NEXT: v_mov_b32_e32 v5, s7 150; VI-NEXT: v_mov_b32_e32 v1, s1 151; VI-NEXT: v_mov_b32_e32 v2, s2 152; VI-NEXT: v_mov_b32_e32 v3, s3 153; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 154; VI-NEXT: s_endpgm 155 store <8 x half> %arg, <8 x half> addrspace(1)* %out 156 ret void 157} 158 159define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { 160; SI-LABEL: extload_v2f16_arg: 161; SI: ; %bb.0: 162; SI-NEXT: s_load_dword s2, s[4:5], 0x2 163; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 164; SI-NEXT: s_waitcnt lgkmcnt(0) 165; SI-NEXT: s_lshr_b32 s3, s2, 16 166; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 167; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 168; SI-NEXT: v_mov_b32_e32 v3, s1 169; SI-NEXT: v_mov_b32_e32 v2, s0 170; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 171; SI-NEXT: s_endpgm 172; 173; VI-LABEL: extload_v2f16_arg: 174; VI: ; %bb.0: 175; VI-NEXT: s_load_dword s2, s[4:5], 0x8 176; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 177; VI-NEXT: s_waitcnt lgkmcnt(0) 178; VI-NEXT: s_lshr_b32 s3, s2, 16 179; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 180; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 181; VI-NEXT: v_mov_b32_e32 v3, s1 182; VI-NEXT: v_mov_b32_e32 v2, s0 183; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 184; VI-NEXT: s_endpgm 185 %fpext = fpext <2 x half> %in to <2 x float> 186 store <2 x float> %fpext, <2 x float> addrspace(1)* %out 187 ret void 188} 189 190define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { 191; SI-LABEL: extload_f16_to_f32_arg: 192; SI: ; %bb.0: 193; SI-NEXT: s_load_dword s2, s[4:5], 0x2 194; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 195; SI-NEXT: s_waitcnt lgkmcnt(0) 196; SI-NEXT: v_cvt_f32_f16_e32 v2, s2 197; SI-NEXT: v_mov_b32_e32 v0, s0 198; SI-NEXT: 
v_mov_b32_e32 v1, s1 199; SI-NEXT: flat_store_dword v[0:1], v2 200; SI-NEXT: s_endpgm 201; 202; VI-LABEL: extload_f16_to_f32_arg: 203; VI: ; %bb.0: 204; VI-NEXT: s_load_dword s2, s[4:5], 0x8 205; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 206; VI-NEXT: s_waitcnt lgkmcnt(0) 207; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 208; VI-NEXT: v_mov_b32_e32 v0, s0 209; VI-NEXT: v_mov_b32_e32 v1, s1 210; VI-NEXT: flat_store_dword v[0:1], v2 211; VI-NEXT: s_endpgm 212 %ext = fpext half %arg to float 213 store float %ext, float addrspace(1)* %out 214 ret void 215} 216 217define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { 218; SI-LABEL: extload_v2f16_to_v2f32_arg: 219; SI: ; %bb.0: 220; SI-NEXT: s_load_dword s2, s[4:5], 0x2 221; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 222; SI-NEXT: s_waitcnt lgkmcnt(0) 223; SI-NEXT: s_lshr_b32 s3, s2, 16 224; SI-NEXT: v_cvt_f32_f16_e32 v1, s3 225; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 226; SI-NEXT: v_mov_b32_e32 v3, s1 227; SI-NEXT: v_mov_b32_e32 v2, s0 228; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 229; SI-NEXT: s_endpgm 230; 231; VI-LABEL: extload_v2f16_to_v2f32_arg: 232; VI: ; %bb.0: 233; VI-NEXT: s_load_dword s2, s[4:5], 0x8 234; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 235; VI-NEXT: s_waitcnt lgkmcnt(0) 236; VI-NEXT: s_lshr_b32 s3, s2, 16 237; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 238; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 239; VI-NEXT: v_mov_b32_e32 v3, s1 240; VI-NEXT: v_mov_b32_e32 v2, s0 241; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 242; VI-NEXT: s_endpgm 243 %ext = fpext <2 x half> %arg to <2 x float> 244 store <2 x float> %ext, <2 x float> addrspace(1)* %out 245 ret void 246} 247 248define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { 249; SI-LABEL: extload_v3f16_to_v3f32_arg: 250; SI: ; %bb.0: 251; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 252; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 253; SI-NEXT: s_waitcnt lgkmcnt(0) 254; SI-NEXT: s_lshr_b32 s4, s0, 16 255; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 256; SI-NEXT: v_cvt_f32_f16_e32 v1, s4 257; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 258; SI-NEXT: v_mov_b32_e32 v4, s3 259; SI-NEXT: v_mov_b32_e32 v3, s2 260; SI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 261; SI-NEXT: s_endpgm 262; 263; VI-LABEL: extload_v3f16_to_v3f32_arg: 264; VI: ; %bb.0: 265; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 266; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 267; VI-NEXT: s_waitcnt lgkmcnt(0) 268; VI-NEXT: s_lshr_b32 s4, s0, 16 269; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 270; VI-NEXT: v_cvt_f32_f16_e32 v1, s4 271; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 272; VI-NEXT: v_mov_b32_e32 v4, s3 273; VI-NEXT: v_mov_b32_e32 v3, s2 274; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 275; VI-NEXT: s_endpgm 276 %ext = fpext <3 x half> %arg to <3 x float> 277 store <3 x float> %ext, <3 x float> addrspace(1)* %out 278 ret void 279} 280 281define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { 282; SI-LABEL: extload_v4f16_to_v4f32_arg: 283; SI: ; %bb.0: 284; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 285; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 286; SI-NEXT: s_waitcnt lgkmcnt(0) 287; SI-NEXT: s_lshr_b32 s4, s1, 16 288; SI-NEXT: s_lshr_b32 s5, s0, 16 289; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 290; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 291; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 292; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 293; SI-NEXT: v_mov_b32_e32 v5, s3 294; SI-NEXT: v_mov_b32_e32 v4, s2 295; SI-NEXT: flat_store_dwordx4 v[4:5], 
v[0:3] 296; SI-NEXT: s_endpgm 297; 298; VI-LABEL: extload_v4f16_to_v4f32_arg: 299; VI: ; %bb.0: 300; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 301; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 302; VI-NEXT: s_waitcnt lgkmcnt(0) 303; VI-NEXT: s_lshr_b32 s4, s1, 16 304; VI-NEXT: s_lshr_b32 s5, s0, 16 305; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 306; VI-NEXT: v_cvt_f32_f16_e32 v3, s4 307; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 308; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 309; VI-NEXT: v_mov_b32_e32 v5, s3 310; VI-NEXT: v_mov_b32_e32 v4, s2 311; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 312; VI-NEXT: s_endpgm 313 %ext = fpext <4 x half> %arg to <4 x float> 314 store <4 x float> %ext, <4 x float> addrspace(1)* %out 315 ret void 316} 317 318define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { 319; SI-LABEL: extload_v8f16_to_v8f32_arg: 320; SI: ; %bb.0: 321; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 322; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 323; SI-NEXT: s_waitcnt lgkmcnt(0) 324; SI-NEXT: s_lshr_b32 s6, s1, 16 325; SI-NEXT: s_lshr_b32 s7, s0, 16 326; SI-NEXT: s_lshr_b32 s8, s3, 16 327; SI-NEXT: v_cvt_f32_f16_e32 v3, s6 328; SI-NEXT: s_lshr_b32 s6, s2, 16 329; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 330; SI-NEXT: v_cvt_f32_f16_e32 v5, s6 331; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 332; SI-NEXT: v_cvt_f32_f16_e32 v6, s3 333; SI-NEXT: v_cvt_f32_f16_e32 v4, s2 334; SI-NEXT: s_add_u32 s0, s4, 16 335; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 336; SI-NEXT: s_addc_u32 s1, s5, 0 337; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 338; SI-NEXT: v_mov_b32_e32 v9, s1 339; SI-NEXT: v_mov_b32_e32 v8, s0 340; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 341; SI-NEXT: s_nop 0 342; SI-NEXT: v_mov_b32_e32 v4, s4 343; SI-NEXT: v_mov_b32_e32 v5, s5 344; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 345; SI-NEXT: s_endpgm 346; 347; VI-LABEL: extload_v8f16_to_v8f32_arg: 348; VI: ; %bb.0: 349; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 350; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 351; VI-NEXT: s_waitcnt lgkmcnt(0) 352; VI-NEXT: s_lshr_b32 s6, s1, 16 353; VI-NEXT: s_lshr_b32 s7, s0, 16 354; VI-NEXT: s_lshr_b32 s8, s3, 16 355; VI-NEXT: v_cvt_f32_f16_e32 v3, s6 356; VI-NEXT: s_lshr_b32 s6, s2, 16 357; VI-NEXT: v_cvt_f32_f16_e32 v7, s8 358; VI-NEXT: v_cvt_f32_f16_e32 v5, s6 359; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 360; VI-NEXT: v_cvt_f32_f16_e32 v6, s3 361; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 362; VI-NEXT: s_add_u32 s0, s4, 16 363; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 364; VI-NEXT: s_addc_u32 s1, s5, 0 365; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 366; VI-NEXT: v_mov_b32_e32 v9, s1 367; VI-NEXT: v_mov_b32_e32 v8, s0 368; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 369; VI-NEXT: s_nop 0 370; VI-NEXT: v_mov_b32_e32 v4, s4 371; VI-NEXT: v_mov_b32_e32 v5, s5 372; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 373; VI-NEXT: s_endpgm 374 %ext = fpext <8 x half> %arg to <8 x float> 375 store <8 x float> %ext, <8 x float> addrspace(1)* %out 376 ret void 377} 378 379define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { 380; SI-LABEL: extload_f16_to_f64_arg: 381; SI: ; %bb.0: 382; SI-NEXT: s_load_dword s0, s[4:5], 0x2 383; SI-NEXT: s_waitcnt lgkmcnt(0) 384; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 385; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 386; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 387; SI-NEXT: s_waitcnt lgkmcnt(0) 388; SI-NEXT: v_mov_b32_e32 v3, s1 389; SI-NEXT: v_mov_b32_e32 v2, s0 390; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 391; SI-NEXT: s_endpgm 392; 393; VI-LABEL: 
extload_f16_to_f64_arg: 394; VI: ; %bb.0: 395; VI-NEXT: s_load_dword s0, s[4:5], 0x8 396; VI-NEXT: s_waitcnt lgkmcnt(0) 397; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 398; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 399; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 400; VI-NEXT: s_waitcnt lgkmcnt(0) 401; VI-NEXT: v_mov_b32_e32 v3, s1 402; VI-NEXT: v_mov_b32_e32 v2, s0 403; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 404; VI-NEXT: s_endpgm 405 %ext = fpext half %arg to double 406 store double %ext, double addrspace(1)* %out 407 ret void 408} 409 410define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { 411; SI-LABEL: extload_v2f16_to_v2f64_arg: 412; SI: ; %bb.0: 413; SI-NEXT: s_load_dword s0, s[4:5], 0x2 414; SI-NEXT: s_waitcnt lgkmcnt(0) 415; SI-NEXT: s_lshr_b32 s1, s0, 16 416; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 417; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 418; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 419; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 420; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 421; SI-NEXT: s_waitcnt lgkmcnt(0) 422; SI-NEXT: v_mov_b32_e32 v5, s1 423; SI-NEXT: v_mov_b32_e32 v4, s0 424; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 425; SI-NEXT: s_endpgm 426; 427; VI-LABEL: extload_v2f16_to_v2f64_arg: 428; VI: ; %bb.0: 429; VI-NEXT: s_load_dword s0, s[4:5], 0x8 430; VI-NEXT: s_waitcnt lgkmcnt(0) 431; VI-NEXT: s_lshr_b32 s1, s0, 16 432; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 433; VI-NEXT: v_cvt_f32_f16_e32 v1, s0 434; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 435; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 436; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 437; VI-NEXT: s_waitcnt lgkmcnt(0) 438; VI-NEXT: v_mov_b32_e32 v5, s1 439; VI-NEXT: v_mov_b32_e32 v4, s0 440; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 441; VI-NEXT: s_endpgm 442 %ext = fpext <2 x half> %arg to <2 x double> 443 store <2 x double> %ext, <2 x double> addrspace(1)* %out 444 ret void 445} 446 447define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { 448; SI-LABEL: extload_v3f16_to_v3f64_arg: 449; SI: ; %bb.0: 450; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 451; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 452; SI-NEXT: s_waitcnt lgkmcnt(0) 453; SI-NEXT: v_cvt_f32_f16_e32 v0, s1 454; SI-NEXT: s_lshr_b32 s4, s0, 16 455; SI-NEXT: v_cvt_f32_f16_e32 v1, s0 456; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 457; SI-NEXT: s_add_u32 s0, s2, 16 458; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 459; SI-NEXT: s_addc_u32 s1, s3, 0 460; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 461; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 462; SI-NEXT: v_mov_b32_e32 v7, s1 463; SI-NEXT: v_mov_b32_e32 v6, s0 464; SI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] 465; SI-NEXT: v_mov_b32_e32 v5, s3 466; SI-NEXT: v_mov_b32_e32 v4, s2 467; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 468; SI-NEXT: s_endpgm 469; 470; VI-LABEL: extload_v3f16_to_v3f64_arg: 471; VI: ; %bb.0: 472; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 473; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 474; VI-NEXT: s_waitcnt lgkmcnt(0) 475; VI-NEXT: v_cvt_f32_f16_e32 v1, s1 476; VI-NEXT: s_lshr_b32 s4, s0, 16 477; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 478; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 479; VI-NEXT: s_add_u32 s0, s2, 16 480; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 481; VI-NEXT: s_addc_u32 s1, s3, 0 482; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 483; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 484; VI-NEXT: v_mov_b32_e32 v7, s1 485; VI-NEXT: v_mov_b32_e32 v6, s0 486; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] 487; VI-NEXT: v_mov_b32_e32 v5, s3 488; VI-NEXT: 
v_mov_b32_e32 v4, s2 489; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 490; VI-NEXT: s_endpgm 491 %ext = fpext <3 x half> %arg to <3 x double> 492 store <3 x double> %ext, <3 x double> addrspace(1)* %out 493 ret void 494} 495 496define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { 497; SI-LABEL: extload_v4f16_to_v4f64_arg: 498; SI: ; %bb.0: 499; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 500; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 501; SI-NEXT: s_waitcnt lgkmcnt(0) 502; SI-NEXT: s_lshr_b32 s4, s1, 16 503; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 504; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 505; SI-NEXT: s_lshr_b32 s5, s0, 16 506; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 507; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 508; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 509; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 510; SI-NEXT: s_add_u32 s0, s2, 16 511; SI-NEXT: s_addc_u32 s1, s3, 0 512; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 513; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 514; SI-NEXT: v_mov_b32_e32 v9, s1 515; SI-NEXT: v_mov_b32_e32 v8, s0 516; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 517; SI-NEXT: s_nop 0 518; SI-NEXT: v_mov_b32_e32 v5, s3 519; SI-NEXT: v_mov_b32_e32 v4, s2 520; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 521; SI-NEXT: s_endpgm 522; 523; VI-LABEL: extload_v4f16_to_v4f64_arg: 524; VI: ; %bb.0: 525; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 526; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 527; VI-NEXT: s_waitcnt lgkmcnt(0) 528; VI-NEXT: s_lshr_b32 s5, s1, 16 529; VI-NEXT: v_cvt_f32_f16_e32 v4, s5 530; VI-NEXT: v_cvt_f32_f16_e32 v5, s1 531; VI-NEXT: s_lshr_b32 s4, s0, 16 532; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 533; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 534; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 535; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 536; VI-NEXT: s_add_u32 s0, s2, 16 537; VI-NEXT: s_addc_u32 s1, s3, 0 538; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 539; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 540; VI-NEXT: v_mov_b32_e32 v9, s1 541; VI-NEXT: v_mov_b32_e32 v8, s0 542; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 543; VI-NEXT: s_nop 0 544; VI-NEXT: v_mov_b32_e32 v5, s3 545; VI-NEXT: v_mov_b32_e32 v4, s2 546; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 547; VI-NEXT: s_endpgm 548 %ext = fpext <4 x half> %arg to <4 x double> 549 store <4 x double> %ext, <4 x double> addrspace(1)* %out 550 ret void 551} 552 553define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { 554; SI-LABEL: extload_v8f16_to_v8f64_arg: 555; SI: ; %bb.0: 556; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 557; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 558; SI-NEXT: s_waitcnt lgkmcnt(0) 559; SI-NEXT: s_lshr_b32 s6, s3, 16 560; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 561; SI-NEXT: v_cvt_f32_f16_e32 v12, s3 562; SI-NEXT: s_lshr_b32 s7, s2, 16 563; SI-NEXT: s_lshr_b32 s8, s1, 16 564; SI-NEXT: s_lshr_b32 s6, s0, 16 565; SI-NEXT: v_cvt_f32_f16_e32 v1, s7 566; SI-NEXT: v_cvt_f32_f16_e32 v8, s2 567; SI-NEXT: v_cvt_f32_f16_e32 v9, s0 568; SI-NEXT: s_add_u32 s0, s4, 48 569; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 570; SI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0 571; SI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 572; SI-NEXT: s_addc_u32 s1, s5, 0 573; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 574; SI-NEXT: v_mov_b32_e32 v17, s1 575; SI-NEXT: v_mov_b32_e32 v16, s0 576; SI-NEXT: s_add_u32 s0, s4, 32 577; SI-NEXT: v_cvt_f32_f16_e32 v2, s6 578; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 579; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 580; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 581; SI-NEXT: s_addc_u32 s1, 
s5, 0 582; SI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] 583; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 584; SI-NEXT: v_mov_b32_e32 v13, s1 585; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 586; SI-NEXT: v_mov_b32_e32 v12, s0 587; SI-NEXT: s_add_u32 s0, s4, 16 588; SI-NEXT: s_addc_u32 s1, s5, 0 589; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 590; SI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] 591; SI-NEXT: s_nop 0 592; SI-NEXT: v_mov_b32_e32 v9, s1 593; SI-NEXT: v_mov_b32_e32 v8, s0 594; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 595; SI-NEXT: s_nop 0 596; SI-NEXT: v_mov_b32_e32 v4, s4 597; SI-NEXT: v_mov_b32_e32 v5, s5 598; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 599; SI-NEXT: s_endpgm 600; 601; VI-LABEL: extload_v8f16_to_v8f64_arg: 602; VI: ; %bb.0: 603; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 604; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 605; VI-NEXT: s_waitcnt lgkmcnt(0) 606; VI-NEXT: s_lshr_b32 s6, s0, 16 607; VI-NEXT: s_lshr_b32 s8, s2, 16 608; VI-NEXT: s_lshr_b32 s9, s3, 16 609; VI-NEXT: v_cvt_f32_f16_e32 v0, s6 610; VI-NEXT: v_cvt_f32_f16_e32 v4, s8 611; VI-NEXT: v_cvt_f32_f16_e32 v5, s9 612; VI-NEXT: v_cvt_f32_f16_e32 v12, s3 613; VI-NEXT: s_lshr_b32 s7, s1, 16 614; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 615; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 616; VI-NEXT: v_cvt_f32_f16_e32 v8, s2 617; VI-NEXT: s_add_u32 s0, s4, 48 618; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4 619; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5 620; VI-NEXT: v_cvt_f32_f16_e32 v4, s1 621; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 622; VI-NEXT: s_addc_u32 s1, s5, 0 623; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 624; VI-NEXT: v_mov_b32_e32 v17, s1 625; VI-NEXT: v_mov_b32_e32 v16, s0 626; VI-NEXT: s_add_u32 s0, s4, 32 627; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 628; VI-NEXT: s_addc_u32 s1, s5, 0 629; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] 630; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 631; VI-NEXT: v_mov_b32_e32 v13, s1 632; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 633; VI-NEXT: v_mov_b32_e32 v12, s0 634; VI-NEXT: s_add_u32 s0, s4, 16 635; VI-NEXT: s_addc_u32 s1, s5, 0 636; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 637; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] 638; VI-NEXT: s_nop 0 639; VI-NEXT: v_mov_b32_e32 v9, s1 640; VI-NEXT: v_mov_b32_e32 v8, s0 641; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 642; VI-NEXT: s_nop 0 643; VI-NEXT: v_mov_b32_e32 v4, s4 644; VI-NEXT: v_mov_b32_e32 v5, s5 645; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 646; VI-NEXT: s_endpgm 647 %ext = fpext <8 x half> %arg to <8 x double> 648 store <8 x double> %ext, <8 x double> addrspace(1)* %out 649 ret void 650} 651 652define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { 653; GCN-LABEL: global_load_store_f16: 654; GCN: ; %bb.0: 655; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 656; GCN-NEXT: s_waitcnt lgkmcnt(0) 657; GCN-NEXT: v_mov_b32_e32 v0, s2 658; GCN-NEXT: v_mov_b32_e32 v1, s3 659; GCN-NEXT: flat_load_ushort v2, v[0:1] 660; GCN-NEXT: v_mov_b32_e32 v0, s0 661; GCN-NEXT: v_mov_b32_e32 v1, s1 662; GCN-NEXT: s_waitcnt vmcnt(0) 663; GCN-NEXT: flat_store_short v[0:1], v2 664; GCN-NEXT: s_endpgm 665 %val = load half, half addrspace(1)* %in 666 store half %val, half addrspace(1)* %out 667 ret void 668} 669 670define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 671; GCN-LABEL: global_load_store_v2f16: 672; GCN: ; %bb.0: 673; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 674; GCN-NEXT: s_waitcnt lgkmcnt(0) 675; GCN-NEXT: v_mov_b32_e32 v0, s2 676; GCN-NEXT: 
v_mov_b32_e32 v1, s3 677; GCN-NEXT: flat_load_dword v2, v[0:1] 678; GCN-NEXT: v_mov_b32_e32 v0, s0 679; GCN-NEXT: v_mov_b32_e32 v1, s1 680; GCN-NEXT: s_waitcnt vmcnt(0) 681; GCN-NEXT: flat_store_dword v[0:1], v2 682; GCN-NEXT: s_endpgm 683 %val = load <2 x half>, <2 x half> addrspace(1)* %in 684 store <2 x half> %val, <2 x half> addrspace(1)* %out 685 ret void 686} 687 688define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { 689; GCN-LABEL: global_load_store_v4f16: 690; GCN: ; %bb.0: 691; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 692; GCN-NEXT: s_waitcnt lgkmcnt(0) 693; GCN-NEXT: v_mov_b32_e32 v0, s0 694; GCN-NEXT: v_mov_b32_e32 v1, s1 695; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 696; GCN-NEXT: v_mov_b32_e32 v2, s2 697; GCN-NEXT: v_mov_b32_e32 v3, s3 698; GCN-NEXT: s_waitcnt vmcnt(0) 699; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 700; GCN-NEXT: s_endpgm 701 %val = load <4 x half>, <4 x half> addrspace(1)* %in 702 store <4 x half> %val, <4 x half> addrspace(1)* %out 703 ret void 704} 705 706define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { 707; GCN-LABEL: global_load_store_v8f16: 708; GCN: ; %bb.0: 709; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 710; GCN-NEXT: s_waitcnt lgkmcnt(0) 711; GCN-NEXT: v_mov_b32_e32 v0, s2 712; GCN-NEXT: v_mov_b32_e32 v1, s3 713; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 714; GCN-NEXT: v_mov_b32_e32 v4, s0 715; GCN-NEXT: v_mov_b32_e32 v5, s1 716; GCN-NEXT: s_waitcnt vmcnt(0) 717; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 718; GCN-NEXT: s_endpgm 719 %val = load <8 x half>, <8 x half> addrspace(1)* %in 720 store <8 x half> %val, <8 x half> addrspace(1)* %out 721 ret void 722} 723 724define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { 725; GCN-LABEL: global_extload_f16_to_f32: 726; GCN: ; %bb.0: 727; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 728; GCN-NEXT: s_waitcnt lgkmcnt(0) 729; GCN-NEXT: v_mov_b32_e32 v0, s2 730; GCN-NEXT: v_mov_b32_e32 v1, s3 731; GCN-NEXT: flat_load_ushort v0, v[0:1] 732; GCN-NEXT: v_mov_b32_e32 v1, s1 733; GCN-NEXT: s_waitcnt vmcnt(0) 734; GCN-NEXT: v_cvt_f32_f16_e32 v2, v0 735; GCN-NEXT: v_mov_b32_e32 v0, s0 736; GCN-NEXT: flat_store_dword v[0:1], v2 737; GCN-NEXT: s_endpgm 738 %val = load half, half addrspace(1)* %in 739 %cvt = fpext half %val to float 740 store float %cvt, float addrspace(1)* %out 741 ret void 742} 743 744define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 745; SI-LABEL: global_extload_v2f16_to_v2f32: 746; SI: ; %bb.0: 747; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 748; SI-NEXT: s_waitcnt lgkmcnt(0) 749; SI-NEXT: v_mov_b32_e32 v0, s2 750; SI-NEXT: v_mov_b32_e32 v1, s3 751; SI-NEXT: flat_load_dword v1, v[0:1] 752; SI-NEXT: v_mov_b32_e32 v2, s0 753; SI-NEXT: v_mov_b32_e32 v3, s1 754; SI-NEXT: s_waitcnt vmcnt(0) 755; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 756; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 757; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 758; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 759; SI-NEXT: s_endpgm 760; 761; VI-LABEL: global_extload_v2f16_to_v2f32: 762; VI: ; %bb.0: 763; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 764; VI-NEXT: s_waitcnt lgkmcnt(0) 765; VI-NEXT: v_mov_b32_e32 v0, s2 766; VI-NEXT: v_mov_b32_e32 v1, s3 767; VI-NEXT: flat_load_dword v1, v[0:1] 768; VI-NEXT: v_mov_b32_e32 v2, s0 769; VI-NEXT: v_mov_b32_e32 v3, s1 770; VI-NEXT: s_waitcnt 
vmcnt(0) 771; VI-NEXT: v_cvt_f32_f16_e32 v0, v1 772; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 773; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 774; VI-NEXT: s_endpgm 775 %val = load <2 x half>, <2 x half> addrspace(1)* %in 776 %cvt = fpext <2 x half> %val to <2 x float> 777 store <2 x float> %cvt, <2 x float> addrspace(1)* %out 778 ret void 779} 780 781define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { 782; SI-LABEL: global_extload_v3f16_to_v3f32: 783; SI: ; %bb.0: 784; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 785; SI-NEXT: s_waitcnt lgkmcnt(0) 786; SI-NEXT: v_mov_b32_e32 v0, s2 787; SI-NEXT: v_mov_b32_e32 v1, s3 788; SI-NEXT: flat_load_dwordx2 v[1:2], v[0:1] 789; SI-NEXT: v_mov_b32_e32 v3, s0 790; SI-NEXT: v_mov_b32_e32 v4, s1 791; SI-NEXT: s_waitcnt vmcnt(0) 792; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 793; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 794; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 795; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 796; SI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 797; SI-NEXT: s_endpgm 798; 799; VI-LABEL: global_extload_v3f16_to_v3f32: 800; VI: ; %bb.0: 801; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 802; VI-NEXT: s_waitcnt lgkmcnt(0) 803; VI-NEXT: v_mov_b32_e32 v0, s2 804; VI-NEXT: v_mov_b32_e32 v1, s3 805; VI-NEXT: flat_load_dwordx2 v[1:2], v[0:1] 806; VI-NEXT: v_mov_b32_e32 v3, s0 807; VI-NEXT: v_mov_b32_e32 v4, s1 808; VI-NEXT: s_waitcnt vmcnt(0) 809; VI-NEXT: v_cvt_f32_f16_e32 v0, v1 810; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 811; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 812; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 813; VI-NEXT: s_endpgm 814 %val = load <3 x half>, <3 x half> addrspace(1)* %in 815 %cvt = fpext <3 x half> %val to <3 x float> 816 store <3 x float> %cvt, <3 x float> addrspace(1)* %out 817 ret void 818} 819 820define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { 821; SI-LABEL: global_extload_v4f16_to_v4f32: 822; SI: ; %bb.0: 823; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 824; SI-NEXT: s_waitcnt lgkmcnt(0) 825; SI-NEXT: v_mov_b32_e32 v0, s2 826; SI-NEXT: v_mov_b32_e32 v1, s3 827; SI-NEXT: flat_load_dwordx2 v[3:4], v[0:1] 828; SI-NEXT: v_mov_b32_e32 v5, s1 829; SI-NEXT: s_waitcnt vmcnt(0) 830; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 831; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 832; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 833; SI-NEXT: v_cvt_f32_f16_e32 v0, v3 834; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 835; SI-NEXT: v_cvt_f32_f16_e32 v1, v4 836; SI-NEXT: v_mov_b32_e32 v4, s0 837; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 838; SI-NEXT: s_endpgm 839; 840; VI-LABEL: global_extload_v4f16_to_v4f32: 841; VI: ; %bb.0: 842; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 843; VI-NEXT: s_waitcnt lgkmcnt(0) 844; VI-NEXT: v_mov_b32_e32 v0, s2 845; VI-NEXT: v_mov_b32_e32 v1, s3 846; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 847; VI-NEXT: s_waitcnt vmcnt(0) 848; VI-NEXT: v_cvt_f32_f16_e32 v0, v4 849; VI-NEXT: v_cvt_f32_f16_e32 v2, v5 850; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 851; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 852; VI-NEXT: v_mov_b32_e32 v4, s0 853; VI-NEXT: v_mov_b32_e32 v5, s1 854; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 855; VI-NEXT: s_endpgm 856 %val = load <4 x half>, <4 x half> addrspace(1)* %in 857 %cvt = fpext <4 x half> %val to <4 x float> 858 store <4 
x float> %cvt, <4 x float> addrspace(1)* %out 859 ret void 860} 861 862define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { 863; SI-LABEL: global_extload_v8f16_to_v8f32: 864; SI: ; %bb.0: 865; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 866; SI-NEXT: s_waitcnt lgkmcnt(0) 867; SI-NEXT: v_mov_b32_e32 v0, s2 868; SI-NEXT: v_mov_b32_e32 v1, s3 869; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 870; SI-NEXT: s_add_u32 s2, s0, 16 871; SI-NEXT: s_addc_u32 s3, s1, 0 872; SI-NEXT: v_mov_b32_e32 v13, s1 873; SI-NEXT: v_mov_b32_e32 v12, s0 874; SI-NEXT: s_waitcnt vmcnt(0) 875; SI-NEXT: v_cvt_f32_f16_e32 v10, v3 876; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 877; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 878; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 879; SI-NEXT: v_cvt_f32_f16_e32 v6, v1 880; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 881; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 882; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 883; SI-NEXT: v_cvt_f32_f16_e32 v11, v3 884; SI-NEXT: v_cvt_f32_f16_e32 v9, v2 885; SI-NEXT: v_cvt_f32_f16_e32 v7, v1 886; SI-NEXT: v_cvt_f32_f16_e32 v5, v0 887; SI-NEXT: v_mov_b32_e32 v0, s2 888; SI-NEXT: v_mov_b32_e32 v1, s3 889; SI-NEXT: flat_store_dwordx4 v[0:1], v[8:11] 890; SI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] 891; SI-NEXT: s_endpgm 892; 893; VI-LABEL: global_extload_v8f16_to_v8f32: 894; VI: ; %bb.0: 895; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 896; VI-NEXT: s_waitcnt lgkmcnt(0) 897; VI-NEXT: v_mov_b32_e32 v0, s2 898; VI-NEXT: v_mov_b32_e32 v1, s3 899; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 900; VI-NEXT: s_add_u32 s2, s0, 16 901; VI-NEXT: s_addc_u32 s3, s1, 0 902; VI-NEXT: v_mov_b32_e32 v13, s1 903; VI-NEXT: v_mov_b32_e32 v12, s0 904; VI-NEXT: s_waitcnt vmcnt(0) 905; VI-NEXT: v_cvt_f32_f16_e32 v10, v3 906; VI-NEXT: v_cvt_f32_f16_e32 v8, v2 907; VI-NEXT: v_cvt_f32_f16_sdwa v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 908; VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 909; VI-NEXT: v_cvt_f32_f16_e32 v6, v1 910; VI-NEXT: v_cvt_f32_f16_e32 v4, v0 911; VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 912; VI-NEXT: v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 913; VI-NEXT: v_mov_b32_e32 v0, s2 914; VI-NEXT: v_mov_b32_e32 v1, s3 915; VI-NEXT: flat_store_dwordx4 v[0:1], v[8:11] 916; VI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] 917; VI-NEXT: s_endpgm 918 %val = load <8 x half>, <8 x half> addrspace(1)* %in 919 %cvt = fpext <8 x half> %val to <8 x float> 920 store <8 x float> %cvt, <8 x float> addrspace(1)* %out 921 ret void 922} 923 924define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { 925; SI-LABEL: global_extload_v16f16_to_v16f32: 926; SI: ; %bb.0: 927; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 928; SI-NEXT: s_waitcnt lgkmcnt(0) 929; SI-NEXT: s_add_u32 s4, s2, 16 930; SI-NEXT: v_mov_b32_e32 v5, s3 931; SI-NEXT: s_addc_u32 s5, s3, 0 932; SI-NEXT: v_mov_b32_e32 v0, s4 933; SI-NEXT: v_mov_b32_e32 v4, s2 934; SI-NEXT: v_mov_b32_e32 v1, s5 935; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 936; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 937; SI-NEXT: s_add_u32 s2, s0, 16 938; SI-NEXT: s_addc_u32 s3, s1, 0 939; SI-NEXT: v_mov_b32_e32 v14, s3 940; SI-NEXT: v_mov_b32_e32 v13, s2 941; SI-NEXT: s_add_u32 s2, s0, 48 942; SI-NEXT: s_addc_u32 s3, s1, 0 943; SI-NEXT: s_waitcnt vmcnt(1) 944; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 945; SI-NEXT: s_waitcnt 
vmcnt(0) 946; SI-NEXT: v_cvt_f32_f16_e32 v11, v7 947; SI-NEXT: v_cvt_f32_f16_e32 v9, v6 948; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 949; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 950; SI-NEXT: v_cvt_f32_f16_e32 v12, v7 951; SI-NEXT: v_cvt_f32_f16_e32 v10, v6 952; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 953; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 954; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 955; SI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] 956; SI-NEXT: v_cvt_f32_f16_e32 v6, v0 957; SI-NEXT: v_cvt_f32_f16_e32 v12, v3 958; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 959; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 960; SI-NEXT: v_cvt_f32_f16_e32 v10, v2 961; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 962; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 963; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 964; SI-NEXT: v_mov_b32_e32 v5, s1 965; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 966; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 967; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 968; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 969; SI-NEXT: v_mov_b32_e32 v4, s0 970; SI-NEXT: s_add_u32 s0, s0, 32 971; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 972; SI-NEXT: s_addc_u32 s1, s1, 0 973; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 974; SI-NEXT: v_mov_b32_e32 v15, s3 975; SI-NEXT: v_mov_b32_e32 v17, s1 976; SI-NEXT: v_mov_b32_e32 v14, s2 977; SI-NEXT: v_mov_b32_e32 v16, s0 978; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 979; SI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] 980; SI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] 981; SI-NEXT: s_endpgm 982; 983; VI-LABEL: global_extload_v16f16_to_v16f32: 984; VI: ; %bb.0: 985; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 986; VI-NEXT: s_waitcnt lgkmcnt(0) 987; VI-NEXT: v_mov_b32_e32 v0, s2 988; VI-NEXT: v_mov_b32_e32 v1, s3 989; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 990; VI-NEXT: s_add_u32 s2, s2, 16 991; VI-NEXT: s_addc_u32 s3, s3, 0 992; VI-NEXT: v_mov_b32_e32 v5, s3 993; VI-NEXT: v_mov_b32_e32 v4, s2 994; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 995; VI-NEXT: s_add_u32 s2, s0, 16 996; VI-NEXT: s_addc_u32 s3, s1, 0 997; VI-NEXT: v_mov_b32_e32 v19, s3 998; VI-NEXT: v_mov_b32_e32 v18, s2 999; VI-NEXT: s_add_u32 s2, s0, 48 1000; VI-NEXT: v_mov_b32_e32 v17, s1 1001; VI-NEXT: s_addc_u32 s3, s1, 0 1002; VI-NEXT: v_mov_b32_e32 v16, s0 1003; VI-NEXT: s_add_u32 s0, s0, 32 1004; VI-NEXT: s_addc_u32 s1, s1, 0 1005; VI-NEXT: v_mov_b32_e32 v21, s3 1006; VI-NEXT: v_mov_b32_e32 v20, s2 1007; VI-NEXT: s_waitcnt vmcnt(1) 1008; VI-NEXT: v_cvt_f32_f16_e32 v14, v3 1009; VI-NEXT: v_cvt_f32_f16_e32 v12, v2 1010; VI-NEXT: v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1011; VI-NEXT: v_cvt_f32_f16_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1012; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 1013; VI-NEXT: v_cvt_f32_f16_e32 v8, v0 1014; VI-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1015; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1016; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] 1017; VI-NEXT: s_waitcnt vmcnt(1) 1018; VI-NEXT: v_cvt_f32_f16_e32 v2, v5 1019; VI-NEXT: v_cvt_f32_f16_e32 v14, v7 1020; VI-NEXT: v_cvt_f32_f16_e32 v12, v6 1021; VI-NEXT: v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1022; VI-NEXT: v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1023; VI-NEXT: v_cvt_f32_f16_e32 v0, v4 1024; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1025; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1026; 
VI-NEXT: v_mov_b32_e32 v5, s1 1027; VI-NEXT: v_mov_b32_e32 v4, s0 1028; VI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] 1029; VI-NEXT: flat_store_dwordx4 v[20:21], v[12:15] 1030; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1031; VI-NEXT: s_endpgm 1032 %val = load <16 x half>, <16 x half> addrspace(1)* %in 1033 %cvt = fpext <16 x half> %val to <16 x float> 1034 store <16 x float> %cvt, <16 x float> addrspace(1)* %out 1035 ret void 1036} 1037 1038define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { 1039; GCN-LABEL: global_extload_f16_to_f64: 1040; GCN: ; %bb.0: 1041; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1042; GCN-NEXT: s_waitcnt lgkmcnt(0) 1043; GCN-NEXT: v_mov_b32_e32 v0, s2 1044; GCN-NEXT: v_mov_b32_e32 v1, s3 1045; GCN-NEXT: flat_load_ushort v0, v[0:1] 1046; GCN-NEXT: v_mov_b32_e32 v2, s0 1047; GCN-NEXT: v_mov_b32_e32 v3, s1 1048; GCN-NEXT: s_waitcnt vmcnt(0) 1049; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 1050; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 1051; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1052; GCN-NEXT: s_endpgm 1053 %val = load half, half addrspace(1)* %in 1054 %cvt = fpext half %val to double 1055 store double %cvt, double addrspace(1)* %out 1056 ret void 1057} 1058 1059define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 1060; SI-LABEL: global_extload_v2f16_to_v2f64: 1061; SI: ; %bb.0: 1062; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1063; SI-NEXT: s_waitcnt lgkmcnt(0) 1064; SI-NEXT: v_mov_b32_e32 v0, s2 1065; SI-NEXT: v_mov_b32_e32 v1, s3 1066; SI-NEXT: flat_load_dword v0, v[0:1] 1067; SI-NEXT: v_mov_b32_e32 v4, s0 1068; SI-NEXT: v_mov_b32_e32 v5, s1 1069; SI-NEXT: s_waitcnt vmcnt(0) 1070; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1071; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1072; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 1073; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 1074; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1075; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1076; SI-NEXT: s_endpgm 1077; 1078; VI-LABEL: global_extload_v2f16_to_v2f64: 1079; VI: ; %bb.0: 1080; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1081; VI-NEXT: s_waitcnt lgkmcnt(0) 1082; VI-NEXT: v_mov_b32_e32 v0, s2 1083; VI-NEXT: v_mov_b32_e32 v1, s3 1084; VI-NEXT: flat_load_dword v0, v[0:1] 1085; VI-NEXT: v_mov_b32_e32 v4, s0 1086; VI-NEXT: v_mov_b32_e32 v5, s1 1087; VI-NEXT: s_waitcnt vmcnt(0) 1088; VI-NEXT: v_cvt_f32_f16_e32 v1, v0 1089; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1090; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 1091; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1092; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1093; VI-NEXT: s_endpgm 1094 %val = load <2 x half>, <2 x half> addrspace(1)* %in 1095 %cvt = fpext <2 x half> %val to <2 x double> 1096 store <2 x double> %cvt, <2 x double> addrspace(1)* %out 1097 ret void 1098} 1099 1100define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { 1101; SI-LABEL: global_extload_v3f16_to_v3f64: 1102; SI: ; %bb.0: 1103; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1104; SI-NEXT: s_waitcnt lgkmcnt(0) 1105; SI-NEXT: v_mov_b32_e32 v0, s2 1106; SI-NEXT: v_mov_b32_e32 v1, s3 1107; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1108; SI-NEXT: s_add_u32 s2, s0, 16 1109; SI-NEXT: s_addc_u32 s3, s1, 0 1110; SI-NEXT: v_mov_b32_e32 v7, s3 1111; SI-NEXT: v_mov_b32_e32 v6, s2 1112; SI-NEXT: s_waitcnt vmcnt(0) 1113; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1114; SI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v0 1115; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 1116; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1117; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 1118; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 1119; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1120; SI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] 1121; SI-NEXT: v_mov_b32_e32 v5, s1 1122; SI-NEXT: v_mov_b32_e32 v4, s0 1123; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1124; SI-NEXT: s_endpgm 1125; 1126; VI-LABEL: global_extload_v3f16_to_v3f64: 1127; VI: ; %bb.0: 1128; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1129; VI-NEXT: s_waitcnt lgkmcnt(0) 1130; VI-NEXT: v_mov_b32_e32 v0, s2 1131; VI-NEXT: v_mov_b32_e32 v1, s3 1132; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1133; VI-NEXT: s_add_u32 s2, s0, 16 1134; VI-NEXT: s_addc_u32 s3, s1, 0 1135; VI-NEXT: v_mov_b32_e32 v5, s1 1136; VI-NEXT: v_mov_b32_e32 v4, s0 1137; VI-NEXT: s_waitcnt vmcnt(0) 1138; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 1139; VI-NEXT: v_cvt_f32_f16_e32 v2, v0 1140; VI-NEXT: v_cvt_f32_f16_sdwa v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1141; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 1142; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 1143; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 1144; VI-NEXT: v_mov_b32_e32 v9, s3 1145; VI-NEXT: v_mov_b32_e32 v8, s2 1146; VI-NEXT: flat_store_dwordx2 v[8:9], v[6:7] 1147; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1148; VI-NEXT: s_endpgm 1149 %val = load <3 x half>, <3 x half> addrspace(1)* %in 1150 %cvt = fpext <3 x half> %val to <3 x double> 1151 store <3 x double> %cvt, <3 x double> addrspace(1)* %out 1152 ret void 1153} 1154 1155define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { 1156; SI-LABEL: global_extload_v4f16_to_v4f64: 1157; SI: ; %bb.0: 1158; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1159; SI-NEXT: s_waitcnt lgkmcnt(0) 1160; SI-NEXT: v_mov_b32_e32 v0, s2 1161; SI-NEXT: v_mov_b32_e32 v1, s3 1162; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1163; SI-NEXT: s_add_u32 s2, s0, 16 1164; SI-NEXT: s_addc_u32 s3, s1, 0 1165; SI-NEXT: v_mov_b32_e32 v9, s1 1166; SI-NEXT: v_mov_b32_e32 v8, s0 1167; SI-NEXT: s_waitcnt vmcnt(0) 1168; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 1169; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 1170; SI-NEXT: v_cvt_f32_f16_e32 v1, v0 1171; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1172; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1173; SI-NEXT: v_cvt_f32_f16_e32 v10, v0 1174; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3 1175; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 1176; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 1177; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 1178; SI-NEXT: v_mov_b32_e32 v11, s3 1179; SI-NEXT: v_mov_b32_e32 v10, s2 1180; SI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 1181; SI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 1182; SI-NEXT: s_endpgm 1183; 1184; VI-LABEL: global_extload_v4f16_to_v4f64: 1185; VI: ; %bb.0: 1186; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1187; VI-NEXT: s_waitcnt lgkmcnt(0) 1188; VI-NEXT: v_mov_b32_e32 v0, s2 1189; VI-NEXT: v_mov_b32_e32 v1, s3 1190; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1191; VI-NEXT: s_add_u32 s2, s0, 16 1192; VI-NEXT: s_addc_u32 s3, s1, 0 1193; VI-NEXT: v_mov_b32_e32 v9, s1 1194; VI-NEXT: v_mov_b32_e32 v8, s0 1195; VI-NEXT: s_waitcnt vmcnt(0) 1196; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 1197; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1198; VI-NEXT: v_cvt_f32_f16_e32 v2, v0 1199; VI-NEXT: v_cvt_f32_f16_sdwa v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1200; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3 
1201; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 1202; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 1203; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 1204; VI-NEXT: v_mov_b32_e32 v11, s3 1205; VI-NEXT: v_mov_b32_e32 v10, s2 1206; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 1207; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 1208; VI-NEXT: s_endpgm 1209 %val = load <4 x half>, <4 x half> addrspace(1)* %in 1210 %cvt = fpext <4 x half> %val to <4 x double> 1211 store <4 x double> %cvt, <4 x double> addrspace(1)* %out 1212 ret void 1213} 1214 1215define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { 1216; SI-LABEL: global_extload_v8f16_to_v8f64: 1217; SI: ; %bb.0: 1218; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1219; SI-NEXT: s_waitcnt lgkmcnt(0) 1220; SI-NEXT: v_mov_b32_e32 v0, s2 1221; SI-NEXT: v_mov_b32_e32 v1, s3 1222; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1223; SI-NEXT: s_add_u32 s2, s0, 48 1224; SI-NEXT: s_addc_u32 s3, s1, 0 1225; SI-NEXT: v_mov_b32_e32 v7, s3 1226; SI-NEXT: v_mov_b32_e32 v6, s2 1227; SI-NEXT: s_add_u32 s2, s0, 32 1228; SI-NEXT: v_mov_b32_e32 v13, s1 1229; SI-NEXT: s_addc_u32 s3, s1, 0 1230; SI-NEXT: v_mov_b32_e32 v12, s0 1231; SI-NEXT: s_add_u32 s0, s0, 16 1232; SI-NEXT: v_mov_b32_e32 v15, s3 1233; SI-NEXT: s_addc_u32 s1, s1, 0 1234; SI-NEXT: v_mov_b32_e32 v14, s2 1235; SI-NEXT: s_waitcnt vmcnt(0) 1236; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 1237; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1238; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 1239; SI-NEXT: v_cvt_f32_f16_e32 v8, v2 1240; SI-NEXT: v_cvt_f32_f16_e32 v2, v4 1241; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 1242; SI-NEXT: v_cvt_f32_f16_e32 v10, v1 1243; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 1244; SI-NEXT: v_cvt_f32_f16_e32 v4, v0 1245; SI-NEXT: v_cvt_f32_f16_e32 v16, v5 1246; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 1247; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1248; SI-NEXT: v_cvt_f32_f16_e32 v17, v9 1249; SI-NEXT: v_cvt_f32_f16_e32 v18, v11 1250; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 1251; SI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] 1252; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 1253; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 1254; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 1255; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 1256; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 1257; SI-NEXT: v_mov_b32_e32 v17, s1 1258; SI-NEXT: v_mov_b32_e32 v16, s0 1259; SI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] 1260; SI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] 1261; SI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] 1262; SI-NEXT: s_endpgm 1263; 1264; VI-LABEL: global_extload_v8f16_to_v8f64: 1265; VI: ; %bb.0: 1266; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1267; VI-NEXT: s_waitcnt lgkmcnt(0) 1268; VI-NEXT: v_mov_b32_e32 v0, s2 1269; VI-NEXT: v_mov_b32_e32 v1, s3 1270; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1271; VI-NEXT: s_add_u32 s2, s0, 48 1272; VI-NEXT: s_addc_u32 s3, s1, 0 1273; VI-NEXT: v_mov_b32_e32 v8, s3 1274; VI-NEXT: v_mov_b32_e32 v7, s2 1275; VI-NEXT: s_add_u32 s2, s0, 32 1276; VI-NEXT: v_mov_b32_e32 v13, s1 1277; VI-NEXT: s_addc_u32 s3, s1, 0 1278; VI-NEXT: v_mov_b32_e32 v12, s0 1279; VI-NEXT: s_add_u32 s0, s0, 16 1280; VI-NEXT: v_mov_b32_e32 v15, s3 1281; VI-NEXT: s_addc_u32 s1, s1, 0 1282; VI-NEXT: v_mov_b32_e32 v14, s2 1283; VI-NEXT: s_waitcnt vmcnt(0) 1284; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 1285; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1286; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 1287; VI-NEXT: v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 1288; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 1289; VI-NEXT: v_cvt_f32_f16_e32 v11, v2 1290; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 1291; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 1292; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1293; VI-NEXT: v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1294; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 1295; VI-NEXT: flat_store_dwordx4 v[7:8], v[3:6] 1296; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v11 1297; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 1298; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 1299; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v17 1300; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 1301; VI-NEXT: v_mov_b32_e32 v17, s1 1302; VI-NEXT: v_mov_b32_e32 v16, s0 1303; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] 1304; VI-NEXT: flat_store_dwordx4 v[16:17], v[4:7] 1305; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] 1306; VI-NEXT: s_endpgm 1307 %val = load <8 x half>, <8 x half> addrspace(1)* %in 1308 %cvt = fpext <8 x half> %val to <8 x double> 1309 store <8 x double> %cvt, <8 x double> addrspace(1)* %out 1310 ret void 1311} 1312 1313define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { 1314; SI-LABEL: global_extload_v16f16_to_v16f64: 1315; SI: ; %bb.0: 1316; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1317; SI-NEXT: s_waitcnt lgkmcnt(0) 1318; SI-NEXT: v_mov_b32_e32 v0, s2 1319; SI-NEXT: v_mov_b32_e32 v1, s3 1320; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1321; SI-NEXT: s_add_u32 s2, s2, 16 1322; SI-NEXT: s_addc_u32 s3, s3, 0 1323; SI-NEXT: v_mov_b32_e32 v5, s3 1324; SI-NEXT: v_mov_b32_e32 v4, s2 1325; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 1326; SI-NEXT: s_add_u32 s2, s0, 48 1327; SI-NEXT: s_addc_u32 s3, s1, 0 1328; SI-NEXT: v_mov_b32_e32 v15, s3 1329; SI-NEXT: v_mov_b32_e32 v14, s2 1330; SI-NEXT: s_add_u32 s2, s0, 32 1331; SI-NEXT: s_addc_u32 s3, s1, 0 1332; SI-NEXT: v_mov_b32_e32 v17, s3 1333; SI-NEXT: v_mov_b32_e32 v16, s2 1334; SI-NEXT: s_add_u32 s2, s0, 16 1335; SI-NEXT: s_addc_u32 s3, s1, 0 1336; SI-NEXT: v_mov_b32_e32 v19, s3 1337; SI-NEXT: v_mov_b32_e32 v18, s2 1338; SI-NEXT: s_add_u32 s2, s0, 0x70 1339; SI-NEXT: s_addc_u32 s3, s1, 0 1340; SI-NEXT: v_mov_b32_e32 v13, s1 1341; SI-NEXT: v_mov_b32_e32 v12, s0 1342; SI-NEXT: s_waitcnt vmcnt(1) 1343; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 1344; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1345; SI-NEXT: v_cvt_f32_f16_e32 v10, v8 1346; SI-NEXT: s_waitcnt vmcnt(0) 1347; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 1348; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 1349; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 1350; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 1351; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1352; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 1353; SI-NEXT: v_cvt_f32_f16_e32 v21, v5 1354; SI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] 1355; SI-NEXT: v_mov_b32_e32 v15, s3 1356; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 1357; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 1358; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 1359; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 1360; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 1361; SI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] 1362; SI-NEXT: v_mov_b32_e32 v14, s2 1363; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 1364; SI-NEXT: v_cvt_f32_f16_e32 v9, v0 1365; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 1366; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1367; SI-NEXT: v_cvt_f32_f16_e32 v8, v8 1368; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 1369; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 1370; SI-NEXT: 
flat_store_dwordx4 v[18:19], v[0:3] 1371; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v6 1372; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 1373; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 1374; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 1375; SI-NEXT: s_add_u32 s2, s0, 0x60 1376; SI-NEXT: v_cvt_f32_f16_e32 v6, v6 1377; SI-NEXT: v_cvt_f32_f16_e32 v10, v11 1378; SI-NEXT: s_addc_u32 s3, s1, 0 1379; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4 1380; SI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] 1381; SI-NEXT: v_mov_b32_e32 v17, s3 1382; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 1383; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 1384; SI-NEXT: v_cvt_f32_f16_e32 v7, v20 1385; SI-NEXT: v_cvt_f32_f16_e32 v4, v4 1386; SI-NEXT: v_cvt_f32_f16_e32 v12, v5 1387; SI-NEXT: v_mov_b32_e32 v16, s2 1388; SI-NEXT: s_add_u32 s2, s0, 0x50 1389; SI-NEXT: s_addc_u32 s3, s1, 0 1390; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 1391; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 1392; SI-NEXT: s_add_u32 s0, s0, 64 1393; SI-NEXT: flat_store_dwordx4 v[14:15], v[0:3] 1394; SI-NEXT: s_addc_u32 s1, s1, 0 1395; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21 1396; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 1397; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 1398; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 1399; SI-NEXT: v_mov_b32_e32 v19, s3 1400; SI-NEXT: v_mov_b32_e32 v13, s1 1401; SI-NEXT: v_mov_b32_e32 v18, s2 1402; SI-NEXT: v_mov_b32_e32 v12, s0 1403; SI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] 1404; SI-NEXT: flat_store_dwordx4 v[18:19], v[0:3] 1405; SI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] 1406; SI-NEXT: s_endpgm 1407; 1408; VI-LABEL: global_extload_v16f16_to_v16f64: 1409; VI: ; %bb.0: 1410; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1411; VI-NEXT: s_waitcnt lgkmcnt(0) 1412; VI-NEXT: v_mov_b32_e32 v0, s2 1413; VI-NEXT: v_mov_b32_e32 v1, s3 1414; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1] 1415; VI-NEXT: s_add_u32 s2, s2, 16 1416; VI-NEXT: s_addc_u32 s3, s3, 0 1417; VI-NEXT: v_mov_b32_e32 v0, s2 1418; VI-NEXT: v_mov_b32_e32 v1, s3 1419; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1420; VI-NEXT: s_add_u32 s2, s0, 48 1421; VI-NEXT: s_addc_u32 s3, s1, 0 1422; VI-NEXT: v_mov_b32_e32 v14, s3 1423; VI-NEXT: v_mov_b32_e32 v13, s2 1424; VI-NEXT: s_add_u32 s2, s0, 32 1425; VI-NEXT: s_addc_u32 s3, s1, 0 1426; VI-NEXT: v_mov_b32_e32 v16, s3 1427; VI-NEXT: v_mov_b32_e32 v15, s2 1428; VI-NEXT: s_add_u32 s2, s0, 16 1429; VI-NEXT: s_addc_u32 s3, s1, 0 1430; VI-NEXT: v_mov_b32_e32 v18, s3 1431; VI-NEXT: v_mov_b32_e32 v17, s2 1432; VI-NEXT: s_add_u32 s2, s0, 0x70 1433; VI-NEXT: v_mov_b32_e32 v12, s1 1434; VI-NEXT: s_addc_u32 s3, s1, 0 1435; VI-NEXT: v_mov_b32_e32 v11, s0 1436; VI-NEXT: s_waitcnt vmcnt(1) 1437; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 1438; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1439; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8 1440; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9 1441; VI-NEXT: flat_store_dwordx4 v[13:14], v[7:10] 1442; VI-NEXT: s_nop 0 1443; VI-NEXT: v_cvt_f32_f16_e32 v7, v6 1444; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1445; VI-NEXT: s_waitcnt vmcnt(1) 1446; VI-NEXT: v_cvt_f32_f16_e32 v10, v0 1447; VI-NEXT: v_mov_b32_e32 v14, s3 1448; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 1449; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 1450; VI-NEXT: v_mov_b32_e32 v13, s2 1451; VI-NEXT: s_add_u32 s2, s0, 0x60 1452; VI-NEXT: s_addc_u32 s3, s1, 0 1453; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] 1454; VI-NEXT: v_mov_b32_e32 v16, s3 1455; VI-NEXT: v_cvt_f32_f16_e32 v6, v5 1456; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 1457; VI-NEXT: v_cvt_f32_f16_e32 v8, v4 1458; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1459; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 1460; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 1461; VI-NEXT: v_mov_b32_e32 v15, s2 1462; VI-NEXT: s_add_u32 s2, s0, 0x50 1463; VI-NEXT: s_addc_u32 s3, s1, 0 1464; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7] 1465; VI-NEXT: v_cvt_f32_f16_sdwa v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1466; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8 1467; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9 1468; VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1469; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 1470; VI-NEXT: s_add_u32 s0, s0, 64 1471; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7] 1472; VI-NEXT: v_cvt_f32_f16_sdwa v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1473; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v8 1474; VI-NEXT: v_cvt_f32_f16_e32 v8, v2 1475; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1476; VI-NEXT: v_cvt_f32_f16_e32 v7, v1 1477; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 1478; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 1479; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 1480; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 1481; VI-NEXT: flat_store_dwordx4 v[13:14], v[3:6] 1482; VI-NEXT: s_addc_u32 s1, s1, 0 1483; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 1484; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 1485; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 1486; VI-NEXT: v_mov_b32_e32 v20, s3 1487; VI-NEXT: v_mov_b32_e32 v13, s1 1488; VI-NEXT: v_mov_b32_e32 v19, s2 1489; VI-NEXT: v_mov_b32_e32 v12, s0 1490; VI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] 1491; VI-NEXT: flat_store_dwordx4 v[19:20], v[4:7] 1492; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] 1493; VI-NEXT: s_endpgm 1494 %val = load <16 x half>, <16 x half> addrspace(1)* %in 1495 %cvt = fpext <16 x half> %val to <16 x double> 1496 store <16 x double> %cvt, <16 x double> addrspace(1)* %out 1497 ret void 1498} 1499 1500define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { 1501; GCN-LABEL: global_truncstore_f32_to_f16: 1502; GCN: ; %bb.0: 1503; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1504; GCN-NEXT: s_waitcnt lgkmcnt(0) 1505; GCN-NEXT: v_mov_b32_e32 v0, s2 1506; GCN-NEXT: v_mov_b32_e32 v1, s3 1507; GCN-NEXT: flat_load_dword v0, v[0:1] 1508; GCN-NEXT: v_mov_b32_e32 v1, s1 1509; GCN-NEXT: s_waitcnt vmcnt(0) 1510; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0 1511; GCN-NEXT: v_mov_b32_e32 v0, s0 1512; GCN-NEXT: flat_store_short v[0:1], v2 1513; GCN-NEXT: s_endpgm 1514 %val = load float, float addrspace(1)* %in 1515 %cvt = fptrunc float %val to half 1516 store half %cvt, half addrspace(1)* %out 1517 ret void 1518} 1519 1520define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { 1521; SI-LABEL: global_truncstore_v2f32_to_v2f16: 1522; SI: ; %bb.0: 1523; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1524; SI-NEXT: s_waitcnt lgkmcnt(0) 1525; SI-NEXT: v_mov_b32_e32 v0, s2 1526; SI-NEXT: v_mov_b32_e32 v1, s3 1527; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1528; SI-NEXT: s_waitcnt vmcnt(0) 1529; SI-NEXT: v_cvt_f16_f32_e32 v2, v1 1530; SI-NEXT: v_cvt_f16_f32_e32 v3, v0 1531; SI-NEXT: v_mov_b32_e32 v0, s0 1532; SI-NEXT: v_mov_b32_e32 v1, s1 1533; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1534; SI-NEXT: v_or_b32_e32 v2, v3, v2 1535; SI-NEXT: flat_store_dword v[0:1], v2 1536; 
define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
; GCN-LABEL: global_truncstore_f32_to_f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dword v0, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
  %val = load float, float addrspace(1)* %in
  %cvt = fptrunc float %val to half
  store half %cvt, half addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v2f32_to_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v1
; SI-NEXT: v_cvt_f16_f32_e32 v3, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v2f32_to_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v3, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %val = load <2 x float>, <2 x float> addrspace(1)* %in
  %cvt = fptrunc <2 x float> %val to <2 x half>
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
  ret void
}

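; Note how the two prefixes pack the pair differently: SI shifts the high
; half into place with v_lshlrev_b32 before the v_or_b32, while VI writes
; it straight into the upper 16 bits with v_cvt_f16_f32_sdwa dst_sel:WORD_1.
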
define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v3f32_to_v3f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; SI-NEXT: s_add_u32 s2, s0, 4
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v1
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v4, v0
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: flat_store_short v[0:1], v2
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_or_b32_e32 v2, v4, v3
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v3f32_to_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 4
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v4, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_or_b32_e32 v3, v4, v3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
  %val = load <3 x float>, <3 x float> addrspace(1)* %in
  %cvt = fptrunc <3 x float> %val to <3 x half>
  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v4f32_to_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v6
; SI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v4f32_to_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v5, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_or_b32_e32 v3, v2, v3
; VI-NEXT: v_or_b32_e32 v2, v5, v4
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %cvt = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v8f32_to_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: s_add_u32 s2, s2, 16
; SI-NEXT: s_addc_u32 s3, s3, 0
; SI-NEXT: v_mov_b32_e32 v5, s3
; SI-NEXT: v_mov_b32_e32 v4, s2
; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; SI-NEXT: v_mov_b32_e32 v8, s0
; SI-NEXT: v_mov_b32_e32 v9, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_or_b32_e32 v1, v2, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v10
; SI-NEXT: v_or_b32_e32 v3, v6, v7
; SI-NEXT: v_or_b32_e32 v2, v4, v5
; SI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v8f32_to_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v4, v4
; VI-NEXT: v_or_b32_e32 v1, v2, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v10
; VI-NEXT: v_or_b32_e32 v3, v6, v7
; VI-NEXT: v_or_b32_e32 v2, v4, v5
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
  %val = load <8 x float>, <8 x float> addrspace(1)* %in
  %cvt = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v16f32_to_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s4, s2, 32
; SI-NEXT: s_addc_u32 s5, s3, 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: s_add_u32 s4, s2, 48
; SI-NEXT: s_addc_u32 s5, s3, 0
; SI-NEXT: v_mov_b32_e32 v9, s3
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: v_mov_b32_e32 v8, s2
; SI-NEXT: s_add_u32 s2, s2, 16
; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; SI-NEXT: v_mov_b32_e32 v5, s5
; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; SI-NEXT: s_addc_u32 s3, s3, 0
; SI-NEXT: v_mov_b32_e32 v13, s3
; SI-NEXT: v_mov_b32_e32 v12, s2
; SI-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; SI-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; SI-NEXT: s_add_u32 s2, s0, 16
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v16, v5
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v17, v4
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_mov_b32_e32 v5, s3
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16
; SI-NEXT: v_mov_b32_e32 v4, s2
; SI-NEXT: v_or_b32_e32 v0, v0, v18
; SI-NEXT: v_or_b32_e32 v3, v6, v2
; SI-NEXT: v_or_b32_e32 v2, v17, v7
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v9
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v13
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: v_or_b32_e32 v1, v10, v6
; SI-NEXT: v_or_b32_e32 v0, v8, v7
; SI-NEXT: v_or_b32_e32 v3, v14, v9
; SI-NEXT: v_or_b32_e32 v2, v12, v11
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v16f32_to_v16f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 32
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_add_u32 s4, s2, 48
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v13, s3
; VI-NEXT: v_mov_b32_e32 v12, s2
; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; VI-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
; VI-NEXT: v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v18, v4
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v10, v10
; VI-NEXT: v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v8, v8
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v14, v14
; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v12, v12
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_or_b32_e32 v1, v2, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v16
; VI-NEXT: v_or_b32_e32 v3, v6, v7
; VI-NEXT: v_or_b32_e32 v2, v18, v17
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_or_b32_e32 v1, v10, v11
; VI-NEXT: v_or_b32_e32 v0, v8, v9
; VI-NEXT: v_or_b32_e32 v3, v14, v15
; VI-NEXT: v_or_b32_e32 v2, v12, v13
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %val = load <16 x float>, <16 x float> addrspace(1)* %in
  %cvt = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
  ret void
}

; FIXME: Unsafe math should fold conversions away
define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
; SI-LABEL: fadd_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: s_lshr_b32 s0, s0, 16
; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: v_add_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_short v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: fadd_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_add_f16_e32 v2, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
  %add = fadd half %a, %b
  store half %add, half addrspace(1)* %out, align 4
  ret void
}

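; The FIXME above presumably refers to the SI path, where the f16 add is
; legalized as v_cvt_f32_f16 -> v_add_f32 -> v_cvt_f16_f32; with fast-math
; flags that round trip could in principle be folded. A hypothetical
; (unchecked) input illustrating the pattern, kept as a comment so it does
; not disturb the autogenerated assertions:
;
;   define amdgpu_kernel void @fadd_f16_fast(half addrspace(1)* %out, half %a, half %b) #0 {
;     %add = fadd fast half %a, %b
;     store half %add, half addrspace(1)* %out, align 4
;     ret void
;   }
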
define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
; SI-LABEL: fadd_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x2
; SI-NEXT: s_load_dword s1, s[4:5], 0x3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s2, s0, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: s_lshr_b32 s0, s1, 16
; SI-NEXT: v_cvt_f32_f16_e32 v1, s1
; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
; SI-NEXT: v_cvt_f32_f16_e32 v3, s0
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: v_add_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_add_f32_e32 v1, v2, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v2, v0, v1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: fadd_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0xc
; VI-NEXT: s_load_dword s3, s[4:5], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: s_lshr_b32 s5, s3, 16
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_add_f16_e32 v1, s3, v1
; VI-NEXT: v_or_b32_e32 v2, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %add = fadd <2 x half> %a, %b
  store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
; SI-LABEL: fadd_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: s_add_u32 s2, s2, 8
; SI-NEXT: s_addc_u32 s3, s3, 0
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; SI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v6, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v7, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v9, v3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v7, v7, v9
; SI-NEXT: v_add_f32_e32 v6, v6, v8
; SI-NEXT: v_add_f32_e32 v1, v1, v3
; SI-NEXT: v_add_f32_e32 v0, v0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v2, v7
; SI-NEXT: v_cvt_f16_f32_e32 v3, v6
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_or_b32_e32 v0, v3, v0
; SI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: fadd_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v6, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_add_f16_e32 v1, v1, v3
; VI-NEXT: v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_add_f16_e32 v0, v0, v2
; VI-NEXT: v_or_b32_e32 v1, v1, v6
; VI-NEXT: v_or_b32_e32 v0, v0, v3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
  %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
  %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
  %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
  %result = fadd <4 x half> %a, %b
  store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
  ret void
}

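; Note: in the vector fadd tests above, VI adds the packed high halves in
; place with v_add_f16_sdwa (src and dst selecting WORD_1), so only a final
; v_or_b32 per dword is needed; SI must unpack with v_lshrrev_b32, promote
; each element to f32, add, and repack.
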
define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
; SI-LABEL: fadd_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s10, s0, 16
; SI-NEXT: v_cvt_f32_f16_e32 v4, s0
; SI-NEXT: s_lshr_b32 s0, s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v8, s0
; SI-NEXT: s_lshr_b32 s0, s5, 16
; SI-NEXT: s_lshr_b32 s11, s1, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s10
; SI-NEXT: s_lshr_b32 s10, s2, 16
; SI-NEXT: v_cvt_f32_f16_e32 v9, s0
; SI-NEXT: s_lshr_b32 s0, s6, 16
; SI-NEXT: v_cvt_f32_f16_e32 v1, s11
; SI-NEXT: v_cvt_f32_f16_e32 v2, s10
; SI-NEXT: s_lshr_b32 s10, s3, 16
; SI-NEXT: v_cvt_f32_f16_e32 v10, s0
; SI-NEXT: s_lshr_b32 s0, s7, 16
; SI-NEXT: v_cvt_f32_f16_e32 v3, s10
; SI-NEXT: v_cvt_f32_f16_e32 v5, s1
; SI-NEXT: v_cvt_f32_f16_e32 v11, s0
; SI-NEXT: v_cvt_f32_f16_e32 v12, s4
; SI-NEXT: v_cvt_f32_f16_e32 v13, s5
; SI-NEXT: v_cvt_f32_f16_e32 v6, s2
; SI-NEXT: v_cvt_f32_f16_e32 v7, s3
; SI-NEXT: v_cvt_f32_f16_e32 v14, s7
; SI-NEXT: v_cvt_f32_f16_e32 v15, s6
; SI-NEXT: v_add_f32_e32 v1, v1, v9
; SI-NEXT: v_add_f32_e32 v0, v0, v8
; SI-NEXT: v_add_f32_e32 v3, v3, v11
; SI-NEXT: v_add_f32_e32 v2, v2, v10
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v5, v5, v13
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_add_f32_e32 v4, v4, v12
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v7, v7, v14
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v6, v6, v15
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v5, v1
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: v_mov_b32_e32 v4, s8
; SI-NEXT: v_or_b32_e32 v3, v7, v3
; SI-NEXT: v_or_b32_e32 v2, v6, v2
; SI-NEXT: v_mov_b32_e32 v5, s9
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: fadd_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s3, 16
; VI-NEXT: s_lshr_b32 s7, s11, 16
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_f16_e32 v1, s11, v1
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: s_lshr_b32 s6, s10, 16
; VI-NEXT: v_or_b32_e32 v3, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_add_f16_e32 v1, s10, v1
; VI-NEXT: s_lshr_b32 s2, s1, 16
; VI-NEXT: s_lshr_b32 s3, s9, 16
; VI-NEXT: v_or_b32_e32 v2, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_f16_e32 v1, s9, v1
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: s_lshr_b32 s2, s8, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_add_f16_e32 v4, s8, v4
; VI-NEXT: v_or_b32_e32 v0, v4, v0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %add = fadd <8 x half> %a, %b
  store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
  ret void
}

define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
; GCN-LABEL: test_bitcast_from_half:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_load_ushort v2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
  %val = load half, half addrspace(1)* %in
  %val_int = bitcast half %val to i16
  store i16 %val_int, i16 addrspace(1)* %out
  ret void
}

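; Bitcasts between half and i16 are no-ops at the machine level: both this
; test and the one above should compile to a bare flat_load_ushort plus
; flat_store_short, with no conversion instructions.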
define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
; GCN-LABEL: test_bitcast_to_half:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_ushort v2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
  %val = load i16, i16 addrspace(1)* %in
  %val_fp = bitcast i16 %val to half
  store half %val_fp, half addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }