1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,CI %s 3; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s 4 5; half args should be promoted to float for CI and lower. 6 7define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { 8; CI-LABEL: load_f16_arg: 9; CI: ; %bb.0: 10; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 11; CI-NEXT: s_load_dword s2, s[4:5], 0x2 12; CI-NEXT: s_waitcnt lgkmcnt(0) 13; CI-NEXT: v_mov_b32_e32 v0, s0 14; CI-NEXT: v_mov_b32_e32 v1, s1 15; CI-NEXT: v_mov_b32_e32 v2, s2 16; CI-NEXT: flat_store_short v[0:1], v2 17; CI-NEXT: s_endpgm 18; 19; VI-LABEL: load_f16_arg: 20; VI: ; %bb.0: 21; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 22; VI-NEXT: s_load_dword s2, s[4:5], 0x8 23; VI-NEXT: s_waitcnt lgkmcnt(0) 24; VI-NEXT: v_mov_b32_e32 v0, s0 25; VI-NEXT: v_mov_b32_e32 v1, s1 26; VI-NEXT: v_mov_b32_e32 v2, s2 27; VI-NEXT: flat_store_short v[0:1], v2 28; VI-NEXT: s_endpgm 29 store half %arg, half addrspace(1)* %out 30 ret void 31} 32 33define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { 34; CI-LABEL: load_v2f16_arg: 35; CI: ; %bb.0: 36; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 37; CI-NEXT: s_load_dword s2, s[4:5], 0x2 38; CI-NEXT: s_waitcnt lgkmcnt(0) 39; CI-NEXT: v_mov_b32_e32 v0, s0 40; CI-NEXT: v_mov_b32_e32 v1, s1 41; CI-NEXT: v_mov_b32_e32 v2, s2 42; CI-NEXT: flat_store_dword v[0:1], v2 43; CI-NEXT: s_endpgm 44; 45; VI-LABEL: load_v2f16_arg: 46; VI: ; %bb.0: 47; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 48; VI-NEXT: s_load_dword s2, s[4:5], 0x8 49; VI-NEXT: s_waitcnt lgkmcnt(0) 50; VI-NEXT: v_mov_b32_e32 v0, s0 51; VI-NEXT: v_mov_b32_e32 v1, s1 52; VI-NEXT: v_mov_b32_e32 v2, s2 53; VI-NEXT: flat_store_dword v[0:1], v2 54; VI-NEXT: s_endpgm 55 store <2 x half> %arg, <2 x half> addrspace(1)* %out 56 ret void 57} 58 59define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { 60; CI-LABEL: load_v3f16_arg: 61; CI: ; %bb.0: 62; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 63; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 64; CI-NEXT: s_waitcnt lgkmcnt(0) 65; CI-NEXT: s_add_u32 s4, s0, 4 66; CI-NEXT: s_addc_u32 s5, s1, 0 67; CI-NEXT: v_mov_b32_e32 v2, s4 68; CI-NEXT: v_mov_b32_e32 v4, s3 69; CI-NEXT: v_mov_b32_e32 v0, s0 70; CI-NEXT: v_mov_b32_e32 v3, s5 71; CI-NEXT: v_mov_b32_e32 v1, s1 72; CI-NEXT: v_mov_b32_e32 v5, s2 73; CI-NEXT: flat_store_short v[2:3], v4 74; CI-NEXT: flat_store_dword v[0:1], v5 75; CI-NEXT: s_endpgm 76; 77; VI-LABEL: load_v3f16_arg: 78; VI: ; %bb.0: 79; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 80; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 81; VI-NEXT: s_waitcnt lgkmcnt(0) 82; VI-NEXT: s_add_u32 s4, s0, 4 83; VI-NEXT: s_addc_u32 s5, s1, 0 84; VI-NEXT: v_mov_b32_e32 v2, s4 85; VI-NEXT: v_mov_b32_e32 v4, s3 86; VI-NEXT: v_mov_b32_e32 v0, s0 87; VI-NEXT: v_mov_b32_e32 v3, s5 88; VI-NEXT: v_mov_b32_e32 v1, s1 89; VI-NEXT: v_mov_b32_e32 v5, s2 90; VI-NEXT: flat_store_short v[2:3], v4 91; VI-NEXT: flat_store_dword v[0:1], v5 92; VI-NEXT: s_endpgm 93 store <3 x half> %arg, <3 x half> addrspace(1)* %out 94 ret void 95} 96 97 98; FIXME: Why not one load? 
99define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { 100; CI-LABEL: load_v4f16_arg: 101; CI: ; %bb.0: 102; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 103; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 104; CI-NEXT: s_waitcnt lgkmcnt(0) 105; CI-NEXT: v_mov_b32_e32 v0, s0 106; CI-NEXT: v_mov_b32_e32 v2, s2 107; CI-NEXT: v_mov_b32_e32 v1, s1 108; CI-NEXT: v_mov_b32_e32 v3, s3 109; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 110; CI-NEXT: s_endpgm 111; 112; VI-LABEL: load_v4f16_arg: 113; VI: ; %bb.0: 114; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 115; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 116; VI-NEXT: s_waitcnt lgkmcnt(0) 117; VI-NEXT: v_mov_b32_e32 v0, s0 118; VI-NEXT: v_mov_b32_e32 v2, s2 119; VI-NEXT: v_mov_b32_e32 v1, s1 120; VI-NEXT: v_mov_b32_e32 v3, s3 121; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] 122; VI-NEXT: s_endpgm 123 store <4 x half> %arg, <4 x half> addrspace(1)* %out 124 ret void 125} 126 127define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { 128; CI-LABEL: load_v8f16_arg: 129; CI: ; %bb.0: 130; CI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 131; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 132; CI-NEXT: s_waitcnt lgkmcnt(0) 133; CI-NEXT: v_mov_b32_e32 v4, s6 134; CI-NEXT: v_mov_b32_e32 v0, s0 135; CI-NEXT: v_mov_b32_e32 v5, s7 136; CI-NEXT: v_mov_b32_e32 v1, s1 137; CI-NEXT: v_mov_b32_e32 v2, s2 138; CI-NEXT: v_mov_b32_e32 v3, s3 139; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 140; CI-NEXT: s_endpgm 141; 142; VI-LABEL: load_v8f16_arg: 143; VI: ; %bb.0: 144; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 145; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 146; VI-NEXT: s_waitcnt lgkmcnt(0) 147; VI-NEXT: v_mov_b32_e32 v4, s6 148; VI-NEXT: v_mov_b32_e32 v0, s0 149; VI-NEXT: v_mov_b32_e32 v5, s7 150; VI-NEXT: v_mov_b32_e32 v1, s1 151; VI-NEXT: v_mov_b32_e32 v2, s2 152; VI-NEXT: v_mov_b32_e32 v3, s3 153; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 154; VI-NEXT: s_endpgm 155 store <8 x half> %arg, <8 x half> addrspace(1)* %out 156 ret void 157} 158 159define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { 160; CI-LABEL: extload_v2f16_arg: 161; CI: ; %bb.0: 162; CI-NEXT: s_load_dword s2, s[4:5], 0x2 163; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 164; CI-NEXT: s_waitcnt lgkmcnt(0) 165; CI-NEXT: s_lshr_b32 s3, s2, 16 166; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 167; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 168; CI-NEXT: v_mov_b32_e32 v3, s1 169; CI-NEXT: v_mov_b32_e32 v2, s0 170; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 171; CI-NEXT: s_endpgm 172; 173; VI-LABEL: extload_v2f16_arg: 174; VI: ; %bb.0: 175; VI-NEXT: s_load_dword s2, s[4:5], 0x8 176; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 177; VI-NEXT: s_waitcnt lgkmcnt(0) 178; VI-NEXT: s_lshr_b32 s3, s2, 16 179; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 180; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 181; VI-NEXT: v_mov_b32_e32 v3, s1 182; VI-NEXT: v_mov_b32_e32 v2, s0 183; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 184; VI-NEXT: s_endpgm 185 %fpext = fpext <2 x half> %in to <2 x float> 186 store <2 x float> %fpext, <2 x float> addrspace(1)* %out 187 ret void 188} 189 190define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { 191; CI-LABEL: extload_f16_to_f32_arg: 192; CI: ; %bb.0: 193; CI-NEXT: s_load_dword s2, s[4:5], 0x2 194; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 195; CI-NEXT: s_waitcnt lgkmcnt(0) 196; CI-NEXT: v_cvt_f32_f16_e32 v2, s2 197; CI-NEXT: v_mov_b32_e32 v0, s0 198; CI-NEXT: 
v_mov_b32_e32 v1, s1 199; CI-NEXT: flat_store_dword v[0:1], v2 200; CI-NEXT: s_endpgm 201; 202; VI-LABEL: extload_f16_to_f32_arg: 203; VI: ; %bb.0: 204; VI-NEXT: s_load_dword s2, s[4:5], 0x8 205; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 206; VI-NEXT: s_waitcnt lgkmcnt(0) 207; VI-NEXT: v_cvt_f32_f16_e32 v2, s2 208; VI-NEXT: v_mov_b32_e32 v0, s0 209; VI-NEXT: v_mov_b32_e32 v1, s1 210; VI-NEXT: flat_store_dword v[0:1], v2 211; VI-NEXT: s_endpgm 212 %ext = fpext half %arg to float 213 store float %ext, float addrspace(1)* %out 214 ret void 215} 216 217define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { 218; CI-LABEL: extload_v2f16_to_v2f32_arg: 219; CI: ; %bb.0: 220; CI-NEXT: s_load_dword s2, s[4:5], 0x2 221; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 222; CI-NEXT: s_waitcnt lgkmcnt(0) 223; CI-NEXT: s_lshr_b32 s3, s2, 16 224; CI-NEXT: v_cvt_f32_f16_e32 v1, s3 225; CI-NEXT: v_cvt_f32_f16_e32 v0, s2 226; CI-NEXT: v_mov_b32_e32 v3, s1 227; CI-NEXT: v_mov_b32_e32 v2, s0 228; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 229; CI-NEXT: s_endpgm 230; 231; VI-LABEL: extload_v2f16_to_v2f32_arg: 232; VI: ; %bb.0: 233; VI-NEXT: s_load_dword s2, s[4:5], 0x8 234; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 235; VI-NEXT: s_waitcnt lgkmcnt(0) 236; VI-NEXT: s_lshr_b32 s3, s2, 16 237; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 238; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 239; VI-NEXT: v_mov_b32_e32 v3, s1 240; VI-NEXT: v_mov_b32_e32 v2, s0 241; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 242; VI-NEXT: s_endpgm 243 %ext = fpext <2 x half> %arg to <2 x float> 244 store <2 x float> %ext, <2 x float> addrspace(1)* %out 245 ret void 246} 247 248define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { 249; CI-LABEL: extload_v3f16_to_v3f32_arg: 250; CI: ; %bb.0: 251; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 252; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 253; CI-NEXT: s_waitcnt lgkmcnt(0) 254; CI-NEXT: s_lshr_b32 s4, s0, 16 255; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 256; CI-NEXT: v_cvt_f32_f16_e32 v1, s4 257; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 258; CI-NEXT: v_mov_b32_e32 v4, s3 259; CI-NEXT: v_mov_b32_e32 v3, s2 260; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 261; CI-NEXT: s_endpgm 262; 263; VI-LABEL: extload_v3f16_to_v3f32_arg: 264; VI: ; %bb.0: 265; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 266; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 267; VI-NEXT: s_waitcnt lgkmcnt(0) 268; VI-NEXT: s_lshr_b32 s4, s0, 16 269; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 270; VI-NEXT: v_cvt_f32_f16_e32 v1, s4 271; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 272; VI-NEXT: v_mov_b32_e32 v4, s3 273; VI-NEXT: v_mov_b32_e32 v3, s2 274; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 275; VI-NEXT: s_endpgm 276 %ext = fpext <3 x half> %arg to <3 x float> 277 store <3 x float> %ext, <3 x float> addrspace(1)* %out 278 ret void 279} 280 281define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { 282; CI-LABEL: extload_v4f16_to_v4f32_arg: 283; CI: ; %bb.0: 284; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 285; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 286; CI-NEXT: s_waitcnt lgkmcnt(0) 287; CI-NEXT: s_lshr_b32 s4, s1, 16 288; CI-NEXT: s_lshr_b32 s5, s0, 16 289; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 290; CI-NEXT: v_cvt_f32_f16_e32 v3, s4 291; CI-NEXT: v_cvt_f32_f16_e32 v1, s5 292; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 293; CI-NEXT: v_mov_b32_e32 v5, s3 294; CI-NEXT: v_mov_b32_e32 v4, s2 295; CI-NEXT: flat_store_dwordx4 v[4:5], 
v[0:3] 296; CI-NEXT: s_endpgm 297; 298; VI-LABEL: extload_v4f16_to_v4f32_arg: 299; VI: ; %bb.0: 300; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 301; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 302; VI-NEXT: s_waitcnt lgkmcnt(0) 303; VI-NEXT: s_lshr_b32 s4, s1, 16 304; VI-NEXT: s_lshr_b32 s5, s0, 16 305; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 306; VI-NEXT: v_cvt_f32_f16_e32 v3, s4 307; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 308; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 309; VI-NEXT: v_mov_b32_e32 v5, s3 310; VI-NEXT: v_mov_b32_e32 v4, s2 311; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 312; VI-NEXT: s_endpgm 313 %ext = fpext <4 x half> %arg to <4 x float> 314 store <4 x float> %ext, <4 x float> addrspace(1)* %out 315 ret void 316} 317 318define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { 319; CI-LABEL: extload_v8f16_to_v8f32_arg: 320; CI: ; %bb.0: 321; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 322; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 323; CI-NEXT: s_waitcnt lgkmcnt(0) 324; CI-NEXT: s_lshr_b32 s6, s1, 16 325; CI-NEXT: s_lshr_b32 s7, s0, 16 326; CI-NEXT: s_lshr_b32 s8, s3, 16 327; CI-NEXT: v_cvt_f32_f16_e32 v3, s6 328; CI-NEXT: s_lshr_b32 s6, s2, 16 329; CI-NEXT: v_cvt_f32_f16_e32 v7, s8 330; CI-NEXT: v_cvt_f32_f16_e32 v5, s6 331; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 332; CI-NEXT: v_cvt_f32_f16_e32 v6, s3 333; CI-NEXT: v_cvt_f32_f16_e32 v4, s2 334; CI-NEXT: s_add_u32 s0, s4, 16 335; CI-NEXT: v_cvt_f32_f16_e32 v2, s1 336; CI-NEXT: s_addc_u32 s1, s5, 0 337; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 338; CI-NEXT: v_mov_b32_e32 v9, s1 339; CI-NEXT: v_mov_b32_e32 v8, s0 340; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 341; CI-NEXT: s_nop 0 342; CI-NEXT: v_mov_b32_e32 v4, s4 343; CI-NEXT: v_mov_b32_e32 v5, s5 344; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 345; CI-NEXT: s_endpgm 346; 347; VI-LABEL: extload_v8f16_to_v8f32_arg: 348; VI: ; %bb.0: 349; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 350; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 351; VI-NEXT: s_waitcnt lgkmcnt(0) 352; VI-NEXT: s_lshr_b32 s6, s1, 16 353; VI-NEXT: s_lshr_b32 s7, s0, 16 354; VI-NEXT: s_lshr_b32 s8, s3, 16 355; VI-NEXT: v_cvt_f32_f16_e32 v3, s6 356; VI-NEXT: s_lshr_b32 s6, s2, 16 357; VI-NEXT: v_cvt_f32_f16_e32 v7, s8 358; VI-NEXT: v_cvt_f32_f16_e32 v5, s6 359; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 360; VI-NEXT: v_cvt_f32_f16_e32 v6, s3 361; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 362; VI-NEXT: s_add_u32 s0, s4, 16 363; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 364; VI-NEXT: s_addc_u32 s1, s5, 0 365; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 366; VI-NEXT: v_mov_b32_e32 v9, s1 367; VI-NEXT: v_mov_b32_e32 v8, s0 368; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 369; VI-NEXT: s_nop 0 370; VI-NEXT: v_mov_b32_e32 v4, s4 371; VI-NEXT: v_mov_b32_e32 v5, s5 372; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 373; VI-NEXT: s_endpgm 374 %ext = fpext <8 x half> %arg to <8 x float> 375 store <8 x float> %ext, <8 x float> addrspace(1)* %out 376 ret void 377} 378 379define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { 380; CI-LABEL: extload_f16_to_f64_arg: 381; CI: ; %bb.0: 382; CI-NEXT: s_load_dword s0, s[4:5], 0x2 383; CI-NEXT: s_waitcnt lgkmcnt(0) 384; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 385; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 386; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 387; CI-NEXT: s_waitcnt lgkmcnt(0) 388; CI-NEXT: v_mov_b32_e32 v3, s1 389; CI-NEXT: v_mov_b32_e32 v2, s0 390; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 391; CI-NEXT: s_endpgm 392; 393; VI-LABEL: 
extload_f16_to_f64_arg: 394; VI: ; %bb.0: 395; VI-NEXT: s_load_dword s0, s[4:5], 0x8 396; VI-NEXT: s_waitcnt lgkmcnt(0) 397; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 398; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 399; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 400; VI-NEXT: s_waitcnt lgkmcnt(0) 401; VI-NEXT: v_mov_b32_e32 v3, s1 402; VI-NEXT: v_mov_b32_e32 v2, s0 403; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 404; VI-NEXT: s_endpgm 405 %ext = fpext half %arg to double 406 store double %ext, double addrspace(1)* %out 407 ret void 408} 409 410define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { 411; CI-LABEL: extload_v2f16_to_v2f64_arg: 412; CI: ; %bb.0: 413; CI-NEXT: s_load_dword s0, s[4:5], 0x2 414; CI-NEXT: s_waitcnt lgkmcnt(0) 415; CI-NEXT: s_lshr_b32 s1, s0, 16 416; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 417; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 418; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 419; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 420; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 421; CI-NEXT: s_waitcnt lgkmcnt(0) 422; CI-NEXT: v_mov_b32_e32 v5, s1 423; CI-NEXT: v_mov_b32_e32 v4, s0 424; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 425; CI-NEXT: s_endpgm 426; 427; VI-LABEL: extload_v2f16_to_v2f64_arg: 428; VI: ; %bb.0: 429; VI-NEXT: s_load_dword s0, s[4:5], 0x8 430; VI-NEXT: s_waitcnt lgkmcnt(0) 431; VI-NEXT: s_lshr_b32 s1, s0, 16 432; VI-NEXT: v_cvt_f32_f16_e32 v0, s1 433; VI-NEXT: v_cvt_f32_f16_e32 v1, s0 434; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 435; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 436; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 437; VI-NEXT: s_waitcnt lgkmcnt(0) 438; VI-NEXT: v_mov_b32_e32 v5, s1 439; VI-NEXT: v_mov_b32_e32 v4, s0 440; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 441; VI-NEXT: s_endpgm 442 %ext = fpext <2 x half> %arg to <2 x double> 443 store <2 x double> %ext, <2 x double> addrspace(1)* %out 444 ret void 445} 446 447define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { 448; CI-LABEL: extload_v3f16_to_v3f64_arg: 449; CI: ; %bb.0: 450; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 451; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 452; CI-NEXT: s_waitcnt lgkmcnt(0) 453; CI-NEXT: v_cvt_f32_f16_e32 v0, s1 454; CI-NEXT: s_lshr_b32 s4, s0, 16 455; CI-NEXT: v_cvt_f32_f16_e32 v1, s0 456; CI-NEXT: v_cvt_f32_f16_e32 v2, s4 457; CI-NEXT: s_add_u32 s0, s2, 16 458; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 459; CI-NEXT: s_addc_u32 s1, s3, 0 460; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 461; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 462; CI-NEXT: v_mov_b32_e32 v7, s1 463; CI-NEXT: v_mov_b32_e32 v6, s0 464; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] 465; CI-NEXT: v_mov_b32_e32 v5, s3 466; CI-NEXT: v_mov_b32_e32 v4, s2 467; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 468; CI-NEXT: s_endpgm 469; 470; VI-LABEL: extload_v3f16_to_v3f64_arg: 471; VI: ; %bb.0: 472; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 473; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 474; VI-NEXT: s_waitcnt lgkmcnt(0) 475; VI-NEXT: v_cvt_f32_f16_e32 v1, s1 476; VI-NEXT: s_lshr_b32 s4, s0, 16 477; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 478; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 479; VI-NEXT: s_add_u32 s0, s2, 16 480; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 481; VI-NEXT: s_addc_u32 s1, s3, 0 482; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 483; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 484; VI-NEXT: v_mov_b32_e32 v7, s1 485; VI-NEXT: v_mov_b32_e32 v6, s0 486; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] 487; VI-NEXT: v_mov_b32_e32 v5, s3 488; VI-NEXT: 
v_mov_b32_e32 v4, s2 489; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 490; VI-NEXT: s_endpgm 491 %ext = fpext <3 x half> %arg to <3 x double> 492 store <3 x double> %ext, <3 x double> addrspace(1)* %out 493 ret void 494} 495 496define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { 497; CI-LABEL: extload_v4f16_to_v4f64_arg: 498; CI: ; %bb.0: 499; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 500; CI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 501; CI-NEXT: s_waitcnt lgkmcnt(0) 502; CI-NEXT: s_lshr_b32 s4, s1, 16 503; CI-NEXT: v_cvt_f32_f16_e32 v4, s4 504; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 505; CI-NEXT: s_lshr_b32 s5, s0, 16 506; CI-NEXT: v_cvt_f32_f16_e32 v0, s0 507; CI-NEXT: v_cvt_f32_f16_e32 v2, s5 508; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 509; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 510; CI-NEXT: s_add_u32 s0, s2, 16 511; CI-NEXT: s_addc_u32 s1, s3, 0 512; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 513; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 514; CI-NEXT: v_mov_b32_e32 v9, s1 515; CI-NEXT: v_mov_b32_e32 v8, s0 516; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 517; CI-NEXT: s_nop 0 518; CI-NEXT: v_mov_b32_e32 v5, s3 519; CI-NEXT: v_mov_b32_e32 v4, s2 520; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 521; CI-NEXT: s_endpgm 522; 523; VI-LABEL: extload_v4f16_to_v4f64_arg: 524; VI: ; %bb.0: 525; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 526; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 527; VI-NEXT: s_waitcnt lgkmcnt(0) 528; VI-NEXT: s_lshr_b32 s5, s1, 16 529; VI-NEXT: v_cvt_f32_f16_e32 v4, s5 530; VI-NEXT: v_cvt_f32_f16_e32 v5, s1 531; VI-NEXT: s_lshr_b32 s4, s0, 16 532; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 533; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 534; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 535; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 536; VI-NEXT: s_add_u32 s0, s2, 16 537; VI-NEXT: s_addc_u32 s1, s3, 0 538; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 539; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 540; VI-NEXT: v_mov_b32_e32 v9, s1 541; VI-NEXT: v_mov_b32_e32 v8, s0 542; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 543; VI-NEXT: s_nop 0 544; VI-NEXT: v_mov_b32_e32 v5, s3 545; VI-NEXT: v_mov_b32_e32 v4, s2 546; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 547; VI-NEXT: s_endpgm 548 %ext = fpext <4 x half> %arg to <4 x double> 549 store <4 x double> %ext, <4 x double> addrspace(1)* %out 550 ret void 551} 552 553define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { 554; CI-LABEL: extload_v8f16_to_v8f64_arg: 555; CI: ; %bb.0: 556; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 557; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 558; CI-NEXT: s_waitcnt lgkmcnt(0) 559; CI-NEXT: s_lshr_b32 s6, s3, 16 560; CI-NEXT: v_cvt_f32_f16_e32 v0, s6 561; CI-NEXT: v_cvt_f32_f16_e32 v12, s3 562; CI-NEXT: s_lshr_b32 s7, s2, 16 563; CI-NEXT: s_lshr_b32 s8, s1, 16 564; CI-NEXT: s_lshr_b32 s6, s0, 16 565; CI-NEXT: v_cvt_f32_f16_e32 v1, s7 566; CI-NEXT: v_cvt_f32_f16_e32 v8, s2 567; CI-NEXT: v_cvt_f32_f16_e32 v9, s0 568; CI-NEXT: s_add_u32 s0, s4, 48 569; CI-NEXT: v_cvt_f32_f16_e32 v5, s1 570; CI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0 571; CI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 572; CI-NEXT: s_addc_u32 s1, s5, 0 573; CI-NEXT: v_cvt_f32_f16_e32 v4, s8 574; CI-NEXT: v_mov_b32_e32 v17, s1 575; CI-NEXT: v_mov_b32_e32 v16, s0 576; CI-NEXT: s_add_u32 s0, s4, 32 577; CI-NEXT: v_cvt_f32_f16_e32 v2, s6 578; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 579; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 580; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 581; CI-NEXT: s_addc_u32 s1, 
s5, 0 582; CI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] 583; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 584; CI-NEXT: v_mov_b32_e32 v13, s1 585; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 586; CI-NEXT: v_mov_b32_e32 v12, s0 587; CI-NEXT: s_add_u32 s0, s4, 16 588; CI-NEXT: s_addc_u32 s1, s5, 0 589; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 590; CI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] 591; CI-NEXT: s_nop 0 592; CI-NEXT: v_mov_b32_e32 v9, s1 593; CI-NEXT: v_mov_b32_e32 v8, s0 594; CI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 595; CI-NEXT: s_nop 0 596; CI-NEXT: v_mov_b32_e32 v4, s4 597; CI-NEXT: v_mov_b32_e32 v5, s5 598; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 599; CI-NEXT: s_endpgm 600; 601; VI-LABEL: extload_v8f16_to_v8f64_arg: 602; VI: ; %bb.0: 603; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 604; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 605; VI-NEXT: s_waitcnt lgkmcnt(0) 606; VI-NEXT: s_lshr_b32 s6, s0, 16 607; VI-NEXT: s_lshr_b32 s8, s2, 16 608; VI-NEXT: s_lshr_b32 s9, s3, 16 609; VI-NEXT: v_cvt_f32_f16_e32 v0, s6 610; VI-NEXT: v_cvt_f32_f16_e32 v4, s8 611; VI-NEXT: v_cvt_f32_f16_e32 v5, s9 612; VI-NEXT: v_cvt_f32_f16_e32 v12, s3 613; VI-NEXT: s_lshr_b32 s7, s1, 16 614; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 615; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 616; VI-NEXT: v_cvt_f32_f16_e32 v8, s2 617; VI-NEXT: s_add_u32 s0, s4, 48 618; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4 619; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5 620; VI-NEXT: v_cvt_f32_f16_e32 v4, s1 621; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 622; VI-NEXT: s_addc_u32 s1, s5, 0 623; VI-NEXT: v_cvt_f32_f16_e32 v1, s7 624; VI-NEXT: v_mov_b32_e32 v17, s1 625; VI-NEXT: v_mov_b32_e32 v16, s0 626; VI-NEXT: s_add_u32 s0, s4, 32 627; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 628; VI-NEXT: s_addc_u32 s1, s5, 0 629; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] 630; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 631; VI-NEXT: v_mov_b32_e32 v13, s1 632; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 633; VI-NEXT: v_mov_b32_e32 v12, s0 634; VI-NEXT: s_add_u32 s0, s4, 16 635; VI-NEXT: s_addc_u32 s1, s5, 0 636; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 637; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] 638; VI-NEXT: s_nop 0 639; VI-NEXT: v_mov_b32_e32 v9, s1 640; VI-NEXT: v_mov_b32_e32 v8, s0 641; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] 642; VI-NEXT: s_nop 0 643; VI-NEXT: v_mov_b32_e32 v4, s4 644; VI-NEXT: v_mov_b32_e32 v5, s5 645; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 646; VI-NEXT: s_endpgm 647 %ext = fpext <8 x half> %arg to <8 x double> 648 store <8 x double> %ext, <8 x double> addrspace(1)* %out 649 ret void 650} 651 652define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { 653; GCN-LABEL: global_load_store_f16: 654; GCN: ; %bb.0: 655; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 656; GCN-NEXT: s_waitcnt lgkmcnt(0) 657; GCN-NEXT: v_mov_b32_e32 v0, s2 658; GCN-NEXT: v_mov_b32_e32 v1, s3 659; GCN-NEXT: flat_load_ushort v2, v[0:1] 660; GCN-NEXT: v_mov_b32_e32 v0, s0 661; GCN-NEXT: v_mov_b32_e32 v1, s1 662; GCN-NEXT: s_waitcnt vmcnt(0) 663; GCN-NEXT: flat_store_short v[0:1], v2 664; GCN-NEXT: s_endpgm 665 %val = load half, half addrspace(1)* %in 666 store half %val, half addrspace(1)* %out 667 ret void 668} 669 670define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 671; GCN-LABEL: global_load_store_v2f16: 672; GCN: ; %bb.0: 673; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 674; GCN-NEXT: s_waitcnt lgkmcnt(0) 675; GCN-NEXT: v_mov_b32_e32 v0, s2 676; GCN-NEXT: 
v_mov_b32_e32 v1, s3 677; GCN-NEXT: flat_load_dword v2, v[0:1] 678; GCN-NEXT: v_mov_b32_e32 v0, s0 679; GCN-NEXT: v_mov_b32_e32 v1, s1 680; GCN-NEXT: s_waitcnt vmcnt(0) 681; GCN-NEXT: flat_store_dword v[0:1], v2 682; GCN-NEXT: s_endpgm 683 %val = load <2 x half>, <2 x half> addrspace(1)* %in 684 store <2 x half> %val, <2 x half> addrspace(1)* %out 685 ret void 686} 687 688define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { 689; GCN-LABEL: global_load_store_v4f16: 690; GCN: ; %bb.0: 691; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 692; GCN-NEXT: s_waitcnt lgkmcnt(0) 693; GCN-NEXT: v_mov_b32_e32 v0, s0 694; GCN-NEXT: v_mov_b32_e32 v1, s1 695; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 696; GCN-NEXT: v_mov_b32_e32 v2, s2 697; GCN-NEXT: v_mov_b32_e32 v3, s3 698; GCN-NEXT: s_waitcnt vmcnt(0) 699; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 700; GCN-NEXT: s_endpgm 701 %val = load <4 x half>, <4 x half> addrspace(1)* %in 702 store <4 x half> %val, <4 x half> addrspace(1)* %out 703 ret void 704} 705 706define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { 707; GCN-LABEL: global_load_store_v8f16: 708; GCN: ; %bb.0: 709; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 710; GCN-NEXT: s_waitcnt lgkmcnt(0) 711; GCN-NEXT: v_mov_b32_e32 v0, s2 712; GCN-NEXT: v_mov_b32_e32 v1, s3 713; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 714; GCN-NEXT: v_mov_b32_e32 v4, s0 715; GCN-NEXT: v_mov_b32_e32 v5, s1 716; GCN-NEXT: s_waitcnt vmcnt(0) 717; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 718; GCN-NEXT: s_endpgm 719 %val = load <8 x half>, <8 x half> addrspace(1)* %in 720 store <8 x half> %val, <8 x half> addrspace(1)* %out 721 ret void 722} 723 724define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { 725; GCN-LABEL: global_extload_f16_to_f32: 726; GCN: ; %bb.0: 727; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 728; GCN-NEXT: s_waitcnt lgkmcnt(0) 729; GCN-NEXT: v_mov_b32_e32 v0, s2 730; GCN-NEXT: v_mov_b32_e32 v1, s3 731; GCN-NEXT: flat_load_ushort v0, v[0:1] 732; GCN-NEXT: v_mov_b32_e32 v1, s1 733; GCN-NEXT: s_waitcnt vmcnt(0) 734; GCN-NEXT: v_cvt_f32_f16_e32 v2, v0 735; GCN-NEXT: v_mov_b32_e32 v0, s0 736; GCN-NEXT: flat_store_dword v[0:1], v2 737; GCN-NEXT: s_endpgm 738 %val = load half, half addrspace(1)* %in 739 %cvt = fpext half %val to float 740 store float %cvt, float addrspace(1)* %out 741 ret void 742} 743 744define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 745; CI-LABEL: global_extload_v2f16_to_v2f32: 746; CI: ; %bb.0: 747; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 748; CI-NEXT: s_waitcnt lgkmcnt(0) 749; CI-NEXT: v_mov_b32_e32 v0, s2 750; CI-NEXT: v_mov_b32_e32 v1, s3 751; CI-NEXT: flat_load_dword v1, v[0:1] 752; CI-NEXT: v_mov_b32_e32 v2, s0 753; CI-NEXT: v_mov_b32_e32 v3, s1 754; CI-NEXT: s_waitcnt vmcnt(0) 755; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 756; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 757; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 758; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 759; CI-NEXT: s_endpgm 760; 761; VI-LABEL: global_extload_v2f16_to_v2f32: 762; VI: ; %bb.0: 763; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 764; VI-NEXT: s_waitcnt lgkmcnt(0) 765; VI-NEXT: v_mov_b32_e32 v0, s2 766; VI-NEXT: v_mov_b32_e32 v1, s3 767; VI-NEXT: flat_load_dword v1, v[0:1] 768; VI-NEXT: v_mov_b32_e32 v2, s0 769; VI-NEXT: v_mov_b32_e32 v3, s1 770; VI-NEXT: s_waitcnt 
vmcnt(0) 771; VI-NEXT: v_cvt_f32_f16_e32 v0, v1 772; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 773; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 774; VI-NEXT: s_endpgm 775 %val = load <2 x half>, <2 x half> addrspace(1)* %in 776 %cvt = fpext <2 x half> %val to <2 x float> 777 store <2 x float> %cvt, <2 x float> addrspace(1)* %out 778 ret void 779} 780 781define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { 782; CI-LABEL: global_extload_v3f16_to_v3f32: 783; CI: ; %bb.0: 784; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 785; CI-NEXT: s_waitcnt lgkmcnt(0) 786; CI-NEXT: v_mov_b32_e32 v0, s2 787; CI-NEXT: v_mov_b32_e32 v1, s3 788; CI-NEXT: flat_load_dwordx2 v[1:2], v[0:1] 789; CI-NEXT: v_mov_b32_e32 v3, s0 790; CI-NEXT: v_mov_b32_e32 v4, s1 791; CI-NEXT: s_waitcnt vmcnt(0) 792; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 793; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 794; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 795; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 796; CI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 797; CI-NEXT: s_endpgm 798; 799; VI-LABEL: global_extload_v3f16_to_v3f32: 800; VI: ; %bb.0: 801; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 802; VI-NEXT: s_waitcnt lgkmcnt(0) 803; VI-NEXT: v_mov_b32_e32 v0, s2 804; VI-NEXT: v_mov_b32_e32 v1, s3 805; VI-NEXT: flat_load_dwordx2 v[1:2], v[0:1] 806; VI-NEXT: v_mov_b32_e32 v3, s0 807; VI-NEXT: v_mov_b32_e32 v4, s1 808; VI-NEXT: s_waitcnt vmcnt(0) 809; VI-NEXT: v_cvt_f32_f16_e32 v0, v1 810; VI-NEXT: v_cvt_f32_f16_e32 v2, v2 811; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 812; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2] 813; VI-NEXT: s_endpgm 814 %val = load <3 x half>, <3 x half> addrspace(1)* %in 815 %cvt = fpext <3 x half> %val to <3 x float> 816 store <3 x float> %cvt, <3 x float> addrspace(1)* %out 817 ret void 818} 819 820define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { 821; CI-LABEL: global_extload_v4f16_to_v4f32: 822; CI: ; %bb.0: 823; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 824; CI-NEXT: s_waitcnt lgkmcnt(0) 825; CI-NEXT: v_mov_b32_e32 v0, s2 826; CI-NEXT: v_mov_b32_e32 v1, s3 827; CI-NEXT: flat_load_dwordx2 v[3:4], v[0:1] 828; CI-NEXT: v_mov_b32_e32 v5, s1 829; CI-NEXT: s_waitcnt vmcnt(0) 830; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 831; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v4 832; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 833; CI-NEXT: v_cvt_f32_f16_e32 v0, v3 834; CI-NEXT: v_cvt_f32_f16_e32 v3, v1 835; CI-NEXT: v_cvt_f32_f16_e32 v1, v4 836; CI-NEXT: v_mov_b32_e32 v4, s0 837; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 838; CI-NEXT: s_endpgm 839; 840; VI-LABEL: global_extload_v4f16_to_v4f32: 841; VI: ; %bb.0: 842; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 843; VI-NEXT: s_waitcnt lgkmcnt(0) 844; VI-NEXT: v_mov_b32_e32 v0, s2 845; VI-NEXT: v_mov_b32_e32 v1, s3 846; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] 847; VI-NEXT: s_waitcnt vmcnt(0) 848; VI-NEXT: v_cvt_f32_f16_e32 v0, v4 849; VI-NEXT: v_cvt_f32_f16_e32 v2, v5 850; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 851; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 852; VI-NEXT: v_mov_b32_e32 v4, s0 853; VI-NEXT: v_mov_b32_e32 v5, s1 854; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 855; VI-NEXT: s_endpgm 856 %val = load <4 x half>, <4 x half> addrspace(1)* %in 857 %cvt = fpext <4 x half> %val to <4 x float> 858 store <4 
x float> %cvt, <4 x float> addrspace(1)* %out 859 ret void 860} 861 862define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { 863; CI-LABEL: global_extload_v8f16_to_v8f32: 864; CI: ; %bb.0: 865; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 866; CI-NEXT: s_waitcnt lgkmcnt(0) 867; CI-NEXT: v_mov_b32_e32 v0, s2 868; CI-NEXT: v_mov_b32_e32 v1, s3 869; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 870; CI-NEXT: s_add_u32 s2, s0, 16 871; CI-NEXT: s_addc_u32 s3, s1, 0 872; CI-NEXT: v_mov_b32_e32 v13, s1 873; CI-NEXT: v_mov_b32_e32 v12, s0 874; CI-NEXT: s_waitcnt vmcnt(0) 875; CI-NEXT: v_cvt_f32_f16_e32 v10, v3 876; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 877; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 878; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 879; CI-NEXT: v_cvt_f32_f16_e32 v6, v1 880; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 881; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 882; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 883; CI-NEXT: v_cvt_f32_f16_e32 v11, v3 884; CI-NEXT: v_cvt_f32_f16_e32 v9, v2 885; CI-NEXT: v_cvt_f32_f16_e32 v7, v1 886; CI-NEXT: v_cvt_f32_f16_e32 v5, v0 887; CI-NEXT: v_mov_b32_e32 v0, s2 888; CI-NEXT: v_mov_b32_e32 v1, s3 889; CI-NEXT: flat_store_dwordx4 v[0:1], v[8:11] 890; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] 891; CI-NEXT: s_endpgm 892; 893; VI-LABEL: global_extload_v8f16_to_v8f32: 894; VI: ; %bb.0: 895; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 896; VI-NEXT: s_waitcnt lgkmcnt(0) 897; VI-NEXT: v_mov_b32_e32 v0, s2 898; VI-NEXT: v_mov_b32_e32 v1, s3 899; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 900; VI-NEXT: s_add_u32 s2, s0, 16 901; VI-NEXT: s_addc_u32 s3, s1, 0 902; VI-NEXT: v_mov_b32_e32 v13, s1 903; VI-NEXT: v_mov_b32_e32 v12, s0 904; VI-NEXT: s_waitcnt vmcnt(0) 905; VI-NEXT: v_cvt_f32_f16_e32 v10, v3 906; VI-NEXT: v_cvt_f32_f16_e32 v8, v2 907; VI-NEXT: v_cvt_f32_f16_sdwa v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 908; VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 909; VI-NEXT: v_cvt_f32_f16_e32 v6, v1 910; VI-NEXT: v_cvt_f32_f16_e32 v4, v0 911; VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 912; VI-NEXT: v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 913; VI-NEXT: v_mov_b32_e32 v0, s2 914; VI-NEXT: v_mov_b32_e32 v1, s3 915; VI-NEXT: flat_store_dwordx4 v[0:1], v[8:11] 916; VI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] 917; VI-NEXT: s_endpgm 918 %val = load <8 x half>, <8 x half> addrspace(1)* %in 919 %cvt = fpext <8 x half> %val to <8 x float> 920 store <8 x float> %cvt, <8 x float> addrspace(1)* %out 921 ret void 922} 923 924define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { 925; CI-LABEL: global_extload_v16f16_to_v16f32: 926; CI: ; %bb.0: 927; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 928; CI-NEXT: s_waitcnt lgkmcnt(0) 929; CI-NEXT: s_add_u32 s4, s2, 16 930; CI-NEXT: v_mov_b32_e32 v5, s3 931; CI-NEXT: s_addc_u32 s5, s3, 0 932; CI-NEXT: v_mov_b32_e32 v0, s4 933; CI-NEXT: v_mov_b32_e32 v4, s2 934; CI-NEXT: v_mov_b32_e32 v1, s5 935; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 936; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 937; CI-NEXT: s_add_u32 s2, s0, 16 938; CI-NEXT: s_addc_u32 s3, s1, 0 939; CI-NEXT: v_mov_b32_e32 v14, s3 940; CI-NEXT: v_mov_b32_e32 v13, s2 941; CI-NEXT: s_add_u32 s2, s0, 48 942; CI-NEXT: s_addc_u32 s3, s1, 0 943; CI-NEXT: s_waitcnt vmcnt(1) 944; CI-NEXT: v_cvt_f32_f16_e32 v8, v1 945; CI-NEXT: s_waitcnt 
vmcnt(0) 946; CI-NEXT: v_cvt_f32_f16_e32 v11, v7 947; CI-NEXT: v_cvt_f32_f16_e32 v9, v6 948; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v7 949; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v6 950; CI-NEXT: v_cvt_f32_f16_e32 v12, v7 951; CI-NEXT: v_cvt_f32_f16_e32 v10, v6 952; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 953; CI-NEXT: v_lshrrev_b32_e32 v16, 16, v5 954; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v4 955; CI-NEXT: flat_store_dwordx4 v[13:14], v[9:12] 956; CI-NEXT: v_cvt_f32_f16_e32 v6, v0 957; CI-NEXT: v_cvt_f32_f16_e32 v12, v3 958; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 959; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v0 960; CI-NEXT: v_cvt_f32_f16_e32 v10, v2 961; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 962; CI-NEXT: v_cvt_f32_f16_e32 v2, v5 963; CI-NEXT: v_cvt_f32_f16_e32 v0, v4 964; CI-NEXT: v_mov_b32_e32 v5, s1 965; CI-NEXT: v_cvt_f32_f16_e32 v9, v1 966; CI-NEXT: v_cvt_f32_f16_e32 v13, v3 967; CI-NEXT: v_cvt_f32_f16_e32 v3, v16 968; CI-NEXT: v_cvt_f32_f16_e32 v1, v17 969; CI-NEXT: v_mov_b32_e32 v4, s0 970; CI-NEXT: s_add_u32 s0, s0, 32 971; CI-NEXT: v_cvt_f32_f16_e32 v11, v11 972; CI-NEXT: s_addc_u32 s1, s1, 0 973; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 974; CI-NEXT: v_mov_b32_e32 v15, s3 975; CI-NEXT: v_mov_b32_e32 v17, s1 976; CI-NEXT: v_mov_b32_e32 v14, s2 977; CI-NEXT: v_mov_b32_e32 v16, s0 978; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 979; CI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] 980; CI-NEXT: flat_store_dwordx4 v[16:17], v[6:9] 981; CI-NEXT: s_endpgm 982; 983; VI-LABEL: global_extload_v16f16_to_v16f32: 984; VI: ; %bb.0: 985; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 986; VI-NEXT: s_waitcnt lgkmcnt(0) 987; VI-NEXT: v_mov_b32_e32 v0, s2 988; VI-NEXT: v_mov_b32_e32 v1, s3 989; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 990; VI-NEXT: s_add_u32 s2, s2, 16 991; VI-NEXT: s_addc_u32 s3, s3, 0 992; VI-NEXT: v_mov_b32_e32 v5, s3 993; VI-NEXT: v_mov_b32_e32 v4, s2 994; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 995; VI-NEXT: s_add_u32 s2, s0, 16 996; VI-NEXT: s_addc_u32 s3, s1, 0 997; VI-NEXT: v_mov_b32_e32 v19, s3 998; VI-NEXT: v_mov_b32_e32 v18, s2 999; VI-NEXT: s_add_u32 s2, s0, 48 1000; VI-NEXT: v_mov_b32_e32 v17, s1 1001; VI-NEXT: s_addc_u32 s3, s1, 0 1002; VI-NEXT: v_mov_b32_e32 v16, s0 1003; VI-NEXT: s_add_u32 s0, s0, 32 1004; VI-NEXT: s_addc_u32 s1, s1, 0 1005; VI-NEXT: v_mov_b32_e32 v21, s3 1006; VI-NEXT: v_mov_b32_e32 v20, s2 1007; VI-NEXT: s_waitcnt vmcnt(1) 1008; VI-NEXT: v_cvt_f32_f16_e32 v14, v3 1009; VI-NEXT: v_cvt_f32_f16_e32 v12, v2 1010; VI-NEXT: v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1011; VI-NEXT: v_cvt_f32_f16_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1012; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 1013; VI-NEXT: v_cvt_f32_f16_e32 v8, v0 1014; VI-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1015; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1016; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15] 1017; VI-NEXT: s_waitcnt vmcnt(1) 1018; VI-NEXT: v_cvt_f32_f16_e32 v2, v5 1019; VI-NEXT: v_cvt_f32_f16_e32 v14, v7 1020; VI-NEXT: v_cvt_f32_f16_e32 v12, v6 1021; VI-NEXT: v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1022; VI-NEXT: v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1023; VI-NEXT: v_cvt_f32_f16_e32 v0, v4 1024; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1025; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1026; 
VI-NEXT: v_mov_b32_e32 v5, s1 1027; VI-NEXT: v_mov_b32_e32 v4, s0 1028; VI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] 1029; VI-NEXT: flat_store_dwordx4 v[20:21], v[12:15] 1030; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1031; VI-NEXT: s_endpgm 1032 %val = load <16 x half>, <16 x half> addrspace(1)* %in 1033 %cvt = fpext <16 x half> %val to <16 x float> 1034 store <16 x float> %cvt, <16 x float> addrspace(1)* %out 1035 ret void 1036} 1037 1038define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { 1039; GCN-LABEL: global_extload_f16_to_f64: 1040; GCN: ; %bb.0: 1041; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1042; GCN-NEXT: s_waitcnt lgkmcnt(0) 1043; GCN-NEXT: v_mov_b32_e32 v0, s2 1044; GCN-NEXT: v_mov_b32_e32 v1, s3 1045; GCN-NEXT: flat_load_ushort v0, v[0:1] 1046; GCN-NEXT: v_mov_b32_e32 v2, s0 1047; GCN-NEXT: v_mov_b32_e32 v3, s1 1048; GCN-NEXT: s_waitcnt vmcnt(0) 1049; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 1050; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 1051; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 1052; GCN-NEXT: s_endpgm 1053 %val = load half, half addrspace(1)* %in 1054 %cvt = fpext half %val to double 1055 store double %cvt, double addrspace(1)* %out 1056 ret void 1057} 1058 1059define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 1060; CI-LABEL: global_extload_v2f16_to_v2f64: 1061; CI: ; %bb.0: 1062; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1063; CI-NEXT: s_waitcnt lgkmcnt(0) 1064; CI-NEXT: v_mov_b32_e32 v0, s2 1065; CI-NEXT: v_mov_b32_e32 v1, s3 1066; CI-NEXT: flat_load_dword v0, v[0:1] 1067; CI-NEXT: v_mov_b32_e32 v4, s0 1068; CI-NEXT: v_mov_b32_e32 v5, s1 1069; CI-NEXT: s_waitcnt vmcnt(0) 1070; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 1071; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1072; CI-NEXT: v_cvt_f32_f16_e32 v2, v1 1073; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 1074; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1075; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1076; CI-NEXT: s_endpgm 1077; 1078; VI-LABEL: global_extload_v2f16_to_v2f64: 1079; VI: ; %bb.0: 1080; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1081; VI-NEXT: s_waitcnt lgkmcnt(0) 1082; VI-NEXT: v_mov_b32_e32 v0, s2 1083; VI-NEXT: v_mov_b32_e32 v1, s3 1084; VI-NEXT: flat_load_dword v0, v[0:1] 1085; VI-NEXT: v_mov_b32_e32 v4, s0 1086; VI-NEXT: v_mov_b32_e32 v5, s1 1087; VI-NEXT: s_waitcnt vmcnt(0) 1088; VI-NEXT: v_cvt_f32_f16_e32 v1, v0 1089; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1090; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 1091; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1092; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1093; VI-NEXT: s_endpgm 1094 %val = load <2 x half>, <2 x half> addrspace(1)* %in 1095 %cvt = fpext <2 x half> %val to <2 x double> 1096 store <2 x double> %cvt, <2 x double> addrspace(1)* %out 1097 ret void 1098} 1099 1100define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { 1101; CI-LABEL: global_extload_v3f16_to_v3f64: 1102; CI: ; %bb.0: 1103; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1104; CI-NEXT: s_waitcnt lgkmcnt(0) 1105; CI-NEXT: v_mov_b32_e32 v0, s2 1106; CI-NEXT: v_mov_b32_e32 v1, s3 1107; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1108; CI-NEXT: s_add_u32 s2, s0, 16 1109; CI-NEXT: s_addc_u32 s3, s1, 0 1110; CI-NEXT: v_mov_b32_e32 v7, s3 1111; CI-NEXT: v_mov_b32_e32 v6, s2 1112; CI-NEXT: s_waitcnt vmcnt(0) 1113; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 1114; CI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v0 1115; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 1116; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 1117; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 1118; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 1119; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1120; CI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] 1121; CI-NEXT: v_mov_b32_e32 v5, s1 1122; CI-NEXT: v_mov_b32_e32 v4, s0 1123; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1124; CI-NEXT: s_endpgm 1125; 1126; VI-LABEL: global_extload_v3f16_to_v3f64: 1127; VI: ; %bb.0: 1128; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1129; VI-NEXT: s_waitcnt lgkmcnt(0) 1130; VI-NEXT: v_mov_b32_e32 v0, s2 1131; VI-NEXT: v_mov_b32_e32 v1, s3 1132; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1133; VI-NEXT: s_add_u32 s2, s0, 16 1134; VI-NEXT: s_addc_u32 s3, s1, 0 1135; VI-NEXT: v_mov_b32_e32 v5, s1 1136; VI-NEXT: v_mov_b32_e32 v4, s0 1137; VI-NEXT: s_waitcnt vmcnt(0) 1138; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 1139; VI-NEXT: v_cvt_f32_f16_e32 v2, v0 1140; VI-NEXT: v_cvt_f32_f16_sdwa v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1141; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 1142; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 1143; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 1144; VI-NEXT: v_mov_b32_e32 v9, s3 1145; VI-NEXT: v_mov_b32_e32 v8, s2 1146; VI-NEXT: flat_store_dwordx2 v[8:9], v[6:7] 1147; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] 1148; VI-NEXT: s_endpgm 1149 %val = load <3 x half>, <3 x half> addrspace(1)* %in 1150 %cvt = fpext <3 x half> %val to <3 x double> 1151 store <3 x double> %cvt, <3 x double> addrspace(1)* %out 1152 ret void 1153} 1154 1155define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { 1156; CI-LABEL: global_extload_v4f16_to_v4f64: 1157; CI: ; %bb.0: 1158; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1159; CI-NEXT: s_waitcnt lgkmcnt(0) 1160; CI-NEXT: v_mov_b32_e32 v0, s2 1161; CI-NEXT: v_mov_b32_e32 v1, s3 1162; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1163; CI-NEXT: s_add_u32 s2, s0, 16 1164; CI-NEXT: s_addc_u32 s3, s1, 0 1165; CI-NEXT: v_mov_b32_e32 v9, s1 1166; CI-NEXT: v_mov_b32_e32 v8, s0 1167; CI-NEXT: s_waitcnt vmcnt(0) 1168; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 1169; CI-NEXT: v_cvt_f32_f16_e32 v3, v1 1170; CI-NEXT: v_cvt_f32_f16_e32 v1, v0 1171; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1172; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 1173; CI-NEXT: v_cvt_f32_f16_e32 v10, v0 1174; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3 1175; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 1176; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 1177; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 1178; CI-NEXT: v_mov_b32_e32 v11, s3 1179; CI-NEXT: v_mov_b32_e32 v10, s2 1180; CI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 1181; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 1182; CI-NEXT: s_endpgm 1183; 1184; VI-LABEL: global_extload_v4f16_to_v4f64: 1185; VI: ; %bb.0: 1186; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1187; VI-NEXT: s_waitcnt lgkmcnt(0) 1188; VI-NEXT: v_mov_b32_e32 v0, s2 1189; VI-NEXT: v_mov_b32_e32 v1, s3 1190; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] 1191; VI-NEXT: s_add_u32 s2, s0, 16 1192; VI-NEXT: s_addc_u32 s3, s1, 0 1193; VI-NEXT: v_mov_b32_e32 v9, s1 1194; VI-NEXT: v_mov_b32_e32 v8, s0 1195; VI-NEXT: s_waitcnt vmcnt(0) 1196; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 1197; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1198; VI-NEXT: v_cvt_f32_f16_e32 v2, v0 1199; VI-NEXT: v_cvt_f32_f16_sdwa v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1200; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3 
1201; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 1202; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 1203; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 1204; VI-NEXT: v_mov_b32_e32 v11, s3 1205; VI-NEXT: v_mov_b32_e32 v10, s2 1206; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] 1207; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] 1208; VI-NEXT: s_endpgm 1209 %val = load <4 x half>, <4 x half> addrspace(1)* %in 1210 %cvt = fpext <4 x half> %val to <4 x double> 1211 store <4 x double> %cvt, <4 x double> addrspace(1)* %out 1212 ret void 1213} 1214 1215define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { 1216; CI-LABEL: global_extload_v8f16_to_v8f64: 1217; CI: ; %bb.0: 1218; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1219; CI-NEXT: s_waitcnt lgkmcnt(0) 1220; CI-NEXT: v_mov_b32_e32 v0, s2 1221; CI-NEXT: v_mov_b32_e32 v1, s3 1222; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1223; CI-NEXT: s_add_u32 s2, s0, 48 1224; CI-NEXT: s_addc_u32 s3, s1, 0 1225; CI-NEXT: v_mov_b32_e32 v7, s3 1226; CI-NEXT: v_mov_b32_e32 v6, s2 1227; CI-NEXT: s_add_u32 s2, s0, 32 1228; CI-NEXT: v_mov_b32_e32 v13, s1 1229; CI-NEXT: s_addc_u32 s3, s1, 0 1230; CI-NEXT: v_mov_b32_e32 v12, s0 1231; CI-NEXT: s_add_u32 s0, s0, 16 1232; CI-NEXT: v_mov_b32_e32 v15, s3 1233; CI-NEXT: s_addc_u32 s1, s1, 0 1234; CI-NEXT: v_mov_b32_e32 v14, s2 1235; CI-NEXT: s_waitcnt vmcnt(0) 1236; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 1237; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 1238; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 1239; CI-NEXT: v_cvt_f32_f16_e32 v8, v2 1240; CI-NEXT: v_cvt_f32_f16_e32 v2, v4 1241; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v1 1242; CI-NEXT: v_cvt_f32_f16_e32 v10, v1 1243; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v0 1244; CI-NEXT: v_cvt_f32_f16_e32 v4, v0 1245; CI-NEXT: v_cvt_f32_f16_e32 v16, v5 1246; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 1247; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1248; CI-NEXT: v_cvt_f32_f16_e32 v17, v9 1249; CI-NEXT: v_cvt_f32_f16_e32 v18, v11 1250; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 1251; CI-NEXT: flat_store_dwordx4 v[6:7], v[0:3] 1252; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 1253; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 1254; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16 1255; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 1256; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18 1257; CI-NEXT: v_mov_b32_e32 v17, s1 1258; CI-NEXT: v_mov_b32_e32 v16, s0 1259; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] 1260; CI-NEXT: flat_store_dwordx4 v[16:17], v[0:3] 1261; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7] 1262; CI-NEXT: s_endpgm 1263; 1264; VI-LABEL: global_extload_v8f16_to_v8f64: 1265; VI: ; %bb.0: 1266; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1267; VI-NEXT: s_waitcnt lgkmcnt(0) 1268; VI-NEXT: v_mov_b32_e32 v0, s2 1269; VI-NEXT: v_mov_b32_e32 v1, s3 1270; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1271; VI-NEXT: s_add_u32 s2, s0, 48 1272; VI-NEXT: s_addc_u32 s3, s1, 0 1273; VI-NEXT: v_mov_b32_e32 v8, s3 1274; VI-NEXT: v_mov_b32_e32 v7, s2 1275; VI-NEXT: s_add_u32 s2, s0, 32 1276; VI-NEXT: v_mov_b32_e32 v13, s1 1277; VI-NEXT: s_addc_u32 s3, s1, 0 1278; VI-NEXT: v_mov_b32_e32 v12, s0 1279; VI-NEXT: s_add_u32 s0, s0, 16 1280; VI-NEXT: v_mov_b32_e32 v15, s3 1281; VI-NEXT: s_addc_u32 s1, s1, 0 1282; VI-NEXT: v_mov_b32_e32 v14, s2 1283; VI-NEXT: s_waitcnt vmcnt(0) 1284; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 1285; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1286; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 1287; VI-NEXT: v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:WORD_1 1288; VI-NEXT: v_cvt_f32_f16_e32 v10, v1 1289; VI-NEXT: v_cvt_f32_f16_e32 v11, v2 1290; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 1291; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5 1292; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1293; VI-NEXT: v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 1294; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 1295; VI-NEXT: flat_store_dwordx4 v[7:8], v[3:6] 1296; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v11 1297; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10 1298; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 1299; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v17 1300; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v16 1301; VI-NEXT: v_mov_b32_e32 v17, s1 1302; VI-NEXT: v_mov_b32_e32 v16, s0 1303; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] 1304; VI-NEXT: flat_store_dwordx4 v[16:17], v[4:7] 1305; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3] 1306; VI-NEXT: s_endpgm 1307 %val = load <8 x half>, <8 x half> addrspace(1)* %in 1308 %cvt = fpext <8 x half> %val to <8 x double> 1309 store <8 x double> %cvt, <8 x double> addrspace(1)* %out 1310 ret void 1311} 1312 1313define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { 1314; CI-LABEL: global_extload_v16f16_to_v16f64: 1315; CI: ; %bb.0: 1316; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 1317; CI-NEXT: s_waitcnt lgkmcnt(0) 1318; CI-NEXT: v_mov_b32_e32 v0, s2 1319; CI-NEXT: v_mov_b32_e32 v1, s3 1320; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] 1321; CI-NEXT: s_add_u32 s2, s2, 16 1322; CI-NEXT: s_addc_u32 s3, s3, 0 1323; CI-NEXT: v_mov_b32_e32 v5, s3 1324; CI-NEXT: v_mov_b32_e32 v4, s2 1325; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] 1326; CI-NEXT: s_add_u32 s2, s0, 48 1327; CI-NEXT: s_addc_u32 s3, s1, 0 1328; CI-NEXT: v_mov_b32_e32 v15, s3 1329; CI-NEXT: v_mov_b32_e32 v14, s2 1330; CI-NEXT: s_add_u32 s2, s0, 32 1331; CI-NEXT: s_addc_u32 s3, s1, 0 1332; CI-NEXT: v_mov_b32_e32 v17, s3 1333; CI-NEXT: v_mov_b32_e32 v16, s2 1334; CI-NEXT: s_add_u32 s2, s0, 16 1335; CI-NEXT: s_addc_u32 s3, s1, 0 1336; CI-NEXT: v_mov_b32_e32 v19, s3 1337; CI-NEXT: v_mov_b32_e32 v18, s2 1338; CI-NEXT: s_add_u32 s2, s0, 0x70 1339; CI-NEXT: s_addc_u32 s3, s1, 0 1340; CI-NEXT: v_mov_b32_e32 v13, s1 1341; CI-NEXT: v_mov_b32_e32 v12, s0 1342; CI-NEXT: s_waitcnt vmcnt(1) 1343; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 1344; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 1345; CI-NEXT: v_cvt_f32_f16_e32 v10, v8 1346; CI-NEXT: s_waitcnt vmcnt(0) 1347; CI-NEXT: v_lshrrev_b32_e32 v20, 16, v5 1348; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v3 1349; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 1350; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 1351; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 1352; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 1353; CI-NEXT: v_cvt_f32_f16_e32 v21, v5 1354; CI-NEXT: flat_store_dwordx4 v[14:15], v[8:11] 1355; CI-NEXT: v_mov_b32_e32 v15, s3 1356; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v2 1357; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v3 1358; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 1359; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 1360; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 1361; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11] 1362; CI-NEXT: v_mov_b32_e32 v14, s2 1363; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0 1364; CI-NEXT: v_cvt_f32_f16_e32 v9, v0 1365; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 1366; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 1367; CI-NEXT: v_cvt_f32_f16_e32 v8, v8 1368; CI-NEXT: v_lshrrev_b32_e32 v10, 16, v7 1369; CI-NEXT: v_cvt_f32_f16_e32 v7, v7 1370; CI-NEXT: 
flat_store_dwordx4 v[18:19], v[0:3]
; CI-NEXT: v_lshrrev_b32_e32 v11, 16, v6
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
; CI-NEXT: v_cvt_f32_f16_e32 v8, v10
; CI-NEXT: s_add_u32 s2, s0, 0x60
; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
; CI-NEXT: v_cvt_f32_f16_e32 v10, v11
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; CI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; CI-NEXT: v_mov_b32_e32 v17, s3
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
; CI-NEXT: v_cvt_f32_f16_e32 v7, v20
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v12, v5
; CI-NEXT: v_mov_b32_e32 v16, s2
; CI-NEXT: s_add_u32 s2, s0, 0x50
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
; CI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; CI-NEXT: s_add_u32 s0, s0, 64
; CI-NEXT: flat_store_dwordx4 v[14:15], v[0:3]
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_cvt_f64_f32_e32 v[0:1], v21
; CI-NEXT: v_cvt_f64_f32_e32 v[2:3], v7
; CI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; CI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12
; CI-NEXT: v_mov_b32_e32 v19, s3
; CI-NEXT: v_mov_b32_e32 v13, s1
; CI-NEXT: v_mov_b32_e32 v18, s2
; CI-NEXT: v_mov_b32_e32 v12, s0
; CI-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; CI-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; CI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_extload_v16f16_to_v16f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v14, s3
; VI-NEXT: v_mov_b32_e32 v13, s2
; VI-NEXT: s_add_u32 s2, s0, 32
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v16, s3
; VI-NEXT: v_mov_b32_e32 v15, s2
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v18, s3
; VI-NEXT: v_mov_b32_e32 v17, s2
; VI-NEXT: s_add_u32 s2, s0, 0x50
; VI-NEXT: v_mov_b32_e32 v12, s1
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v11, s0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v8, v7
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8
; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
; VI-NEXT: flat_store_dwordx4 v[13:14], v[7:10]
; VI-NEXT: s_nop 0
; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v10, v2
; VI-NEXT: v_mov_b32_e32 v14, s3
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; VI-NEXT: v_mov_b32_e32 v13, s2
; VI-NEXT: s_add_u32 s2, s0, 64
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9]
; VI-NEXT: v_mov_b32_e32 v16, s3
; VI-NEXT: v_cvt_f32_f16_e32 v6, v5
; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v8, v4
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; VI-NEXT: v_mov_b32_e32 v15, s2
; VI-NEXT: s_add_u32 s2, s0, 0x70
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7]
; VI-NEXT: v_cvt_f32_f16_sdwa v17, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v2, v1
; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
; VI-NEXT: v_cvt_f32_f16_sdwa v11, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v7, v3
; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v9
; VI-NEXT: v_cvt_f32_f16_e32 v9, v0
; VI-NEXT: v_cvt_f64_f32_e32 v[1:2], v2
; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v10
; VI-NEXT: v_cvt_f64_f32_e32 v[11:12], v11
; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
; VI-NEXT: s_add_u32 s0, s0, 0x60
; VI-NEXT: flat_store_dwordx4 v[13:14], v[1:4]
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17
; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8
; VI-NEXT: v_mov_b32_e32 v20, s3
; VI-NEXT: v_mov_b32_e32 v14, s1
; VI-NEXT: v_mov_b32_e32 v19, s2
; VI-NEXT: v_mov_b32_e32 v13, s0
; VI-NEXT: flat_store_dwordx4 v[15:16], v[9:12]
; VI-NEXT: flat_store_dwordx4 v[19:20], v[0:3]
; VI-NEXT: flat_store_dwordx4 v[13:14], v[5:8]
; VI-NEXT: s_endpgm
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x double>
  store <16 x double> %cvt, <16 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
; GCN-LABEL: global_truncstore_f32_to_f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dword v0, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
  %val = load float, float addrspace(1)* %in
  %cvt = fptrunc float %val to half
  store half %cvt, half addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
; CI-LABEL: global_truncstore_v2f32_to_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v2, v1
; CI-NEXT: v_cvt_f16_f32_e32 v3, v0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v2f32_to_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v3, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %val = load <2 x float>, <2 x float> addrspace(1)* %in
  %cvt = fptrunc <2 x float> %val to <2 x half>
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
; CI-LABEL: global_truncstore_v3f32_to_v3f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; CI-NEXT: s_add_u32 s2, s0, 4
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v3, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v4, v0
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: flat_store_short v[0:1], v2
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_or_b32_e32 v2, v4, v3
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v3f32_to_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 4
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v4, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_or_b32_e32 v3, v4, v3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
  %val = load <3 x float>, <3 x float> addrspace(1)* %in
  %cvt = fptrunc <3 x float> %val to <3 x half>
  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
; CI-LABEL: global_truncstore_v4f32_to_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; CI-NEXT: v_or_b32_e32 v1, v2, v3
; CI-NEXT: v_or_b32_e32 v0, v0, v6
; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v4f32_to_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v5, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_or_b32_e32 v3, v2, v3
; VI-NEXT: v_or_b32_e32 v2, v5, v4
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %cvt = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
; CI-LABEL: global_truncstore_v8f32_to_v8f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_add_u32 s2, s2, 16
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT: v_mov_b32_e32 v8, s0
; CI-NEXT: v_mov_b32_e32 v9, s1
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v1
; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT: v_or_b32_e32 v1, v2, v3
; CI-NEXT: v_or_b32_e32 v0, v0, v10
; CI-NEXT: v_or_b32_e32 v3, v6, v7
; CI-NEXT: v_or_b32_e32 v2, v4, v5
; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v8f32_to_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v4, v4
; VI-NEXT: v_or_b32_e32 v1, v2, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v10
; VI-NEXT: v_or_b32_e32 v3, v6, v7
; VI-NEXT: v_or_b32_e32 v2, v4, v5
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
  %val = load <8 x float>, <8 x float> addrspace(1)* %in
  %cvt = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
; CI-LABEL: global_truncstore_v16f32_to_v16f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s4, s2, 32
; CI-NEXT: s_addc_u32 s5, s3, 0
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: s_add_u32 s4, s2, 48
; CI-NEXT: s_addc_u32 s5, s3, 0
; CI-NEXT: v_mov_b32_e32 v9, s3
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_mov_b32_e32 v8, s2
; CI-NEXT: s_add_u32 s2, s2, 16
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s5
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v13, s3
; CI-NEXT: v_mov_b32_e32 v12, s2
; CI-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; CI-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; CI-NEXT: s_add_u32 s2, s0, 16
; CI-NEXT: s_addc_u32 s3, s1, 0
; CI-NEXT: s_waitcnt vmcnt(3)
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: s_waitcnt vmcnt(2)
; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_cvt_f16_f32_e32 v16, v5
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
; CI-NEXT: v_cvt_f16_f32_e32 v17, v4
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_mov_b32_e32 v5, s3
; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
; CI-NEXT: v_or_b32_e32 v1, v2, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v16
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_or_b32_e32 v0, v0, v18
; CI-NEXT: v_or_b32_e32 v3, v6, v2
; CI-NEXT: v_or_b32_e32 v2, v17, v7
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v11
; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v9
; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v15
; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v13
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: v_or_b32_e32 v1, v10, v6
; CI-NEXT: v_or_b32_e32 v0, v8, v7
; CI-NEXT: v_or_b32_e32 v3, v14, v9
; CI-NEXT: v_or_b32_e32 v2, v12, v11
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v16f32_to_v16f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 32
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_add_u32 s4, s2, 48
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v13, s3
; VI-NEXT: v_mov_b32_e32 v12, s2
; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; VI-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
; VI-NEXT: v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v18, v4
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v10, v10
; VI-NEXT: v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v8, v8
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v14, v14
; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v12, v12
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_or_b32_e32 v1, v2, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v16
; VI-NEXT: v_or_b32_e32 v3, v6, v7
; VI-NEXT: v_or_b32_e32 v2, v18, v17
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_or_b32_e32 v1, v10, v11
; VI-NEXT: v_or_b32_e32 v0, v8, v9
; VI-NEXT: v_or_b32_e32 v3, v14, v15
; VI-NEXT: v_or_b32_e32 v2, v12, v13
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %val = load <16 x float>, <16 x float> addrspace(1)* %in
  %cvt = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
  ret void
}

; FIXME: Unsafe math should fold conversions away
define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
; CI-LABEL: fadd_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[4:5], 0x2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_lshr_b32 s0, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_add_f32_e32 v0, v0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_short v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: fadd_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_add_f16_e32 v2, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
  %add = fadd half %a, %b
  store half %add, half addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
; CI-LABEL: fadd_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s2, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_lshr_b32 s0, s1, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
; CI-NEXT: v_cvt_f32_f16_e32 v2, s2
; CI-NEXT: v_cvt_f32_f16_e32 v3, s0
; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CI-NEXT: v_add_f32_e32 v0, v0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_add_f32_e32 v1, v2, v3
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v2, v0, v1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
; VI-LABEL: fadd_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s1, 16
; VI-NEXT: s_lshr_b32 s5, s0, 16
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, s0, v0
; VI-NEXT: v_or_b32_e32 v2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %add = fadd <2 x half> %a, %b
  store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
  ret void
}

define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
; CI-LABEL: fadd_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v4, s0
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v6, v0
; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v7, v1
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; CI-NEXT: v_cvt_f32_f16_e32 v8, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT: v_cvt_f32_f16_e32 v9, v3
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_add_f32_e32 v7, v7, v9
; CI-NEXT: v_add_f32_e32 v6, v6, v8
; CI-NEXT: v_add_f32_e32 v1, v1, v3
; CI-NEXT: v_add_f32_e32 v0, v0, v2
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v2, v7
; CI-NEXT: v_cvt_f16_f32_e32 v3, v6
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT: v_or_b32_e32 v1, v2, v1
; CI-NEXT: v_or_b32_e32 v0, v3, v0
; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; CI-NEXT: s_endpgm
;
; VI-LABEL: fadd_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v6, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_add_f16_e32 v1, v1, v3
; VI-NEXT: v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_add_f16_e32 v0, v0, v2
; VI-NEXT: v_or_b32_e32 v1, v1, v6
; VI-NEXT: v_or_b32_e32 v0, v0, v3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
  %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
  %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
  %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
  %result = fadd <4 x half> %a, %b
  store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
  ret void
}

define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
; CI-LABEL: fadd_v8f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; CI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s10, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v4, s0
; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: v_cvt_f32_f16_e32 v8, s0
; CI-NEXT: s_lshr_b32 s0, s5, 16
; CI-NEXT: s_lshr_b32 s11, s1, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s10
; CI-NEXT: s_lshr_b32 s10, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v9, s0
; CI-NEXT: s_lshr_b32 s0, s6, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s11
; CI-NEXT: v_cvt_f32_f16_e32 v2, s10
; CI-NEXT: s_lshr_b32 s10, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v10, s0
; CI-NEXT: s_lshr_b32 s0, s7, 16
; CI-NEXT: v_cvt_f32_f16_e32 v3, s10
; CI-NEXT: v_cvt_f32_f16_e32 v5, s1
; CI-NEXT: v_cvt_f32_f16_e32 v11, s0
; CI-NEXT: v_cvt_f32_f16_e32 v12, s4
; CI-NEXT: v_cvt_f32_f16_e32 v13, s5
; CI-NEXT: v_cvt_f32_f16_e32 v6, s2
; CI-NEXT: v_cvt_f32_f16_e32 v7, s3
; CI-NEXT: v_cvt_f32_f16_e32 v14, s7
; CI-NEXT: v_cvt_f32_f16_e32 v15, s6
; CI-NEXT: v_add_f32_e32 v1, v1, v9
; CI-NEXT: v_add_f32_e32 v0, v0, v8
; CI-NEXT: v_add_f32_e32 v3, v3, v11
; CI-NEXT: v_add_f32_e32 v2, v2, v10
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_add_f32_e32 v5, v5, v13
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_add_f32_e32 v4, v4, v12
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_add_f32_e32 v7, v7, v14
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_add_f32_e32 v6, v6, v15
; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v1, v5, v1
; CI-NEXT: v_or_b32_e32 v0, v4, v0
; CI-NEXT: v_mov_b32_e32 v4, s8
; CI-NEXT: v_or_b32_e32 v3, v7, v3
; CI-NEXT: v_or_b32_e32 v2, v6, v2
; CI-NEXT: v_mov_b32_e32 v5, s9
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
; VI-LABEL: fadd_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x20
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s3, 16
; VI-NEXT: s_lshr_b32 s7, s11, 16
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_f16_e32 v1, s11, v1
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: s_lshr_b32 s6, s10, 16
; VI-NEXT: v_or_b32_e32 v3, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_add_f16_e32 v1, s10, v1
; VI-NEXT: s_lshr_b32 s2, s1, 16
; VI-NEXT: s_lshr_b32 s3, s9, 16
; VI-NEXT: v_or_b32_e32 v2, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_f16_e32 v1, s9, v1
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: s_lshr_b32 s2, s8, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_add_f16_e32 v4, s8, v4
; VI-NEXT: v_or_b32_e32 v0, v4, v0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %add = fadd <8 x half> %a, %b
  store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
  ret void
}

define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
; GCN-LABEL: test_bitcast_from_half:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_load_ushort v2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
  %val = load half, half addrspace(1)* %in
  %val_int = bitcast half %val to i16
  store i16 %val_int, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
; GCN-LABEL: test_bitcast_to_half:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_ushort v2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
  %val = load i16, i16 addrspace(1)* %in
  %val_fp = bitcast i16 %val to half
  store half %val_fp, half addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }