1; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s 2; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s 3 4; half args should be promoted to float for SI and lower. 5 6; GCN-LABEL: {{^}}load_f16_arg: 7; GCN: s_load_dword [[ARG:s[0-9]+]] 8; SI: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]] 9; VI: v_trunc_f16_e32 [[CVT:v[0-9]+]], [[ARG]] 10; GCN: buffer_store_short [[CVT]] 11define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { 12 store half %arg, half addrspace(1)* %out 13 ret void 14} 15 16; GCN-LABEL: {{^}}load_v2f16_arg: 17; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 18; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 19; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]] 20; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[V0]], [[HI]] 21; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} 22; GCN: s_endpgm 23define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { 24 store <2 x half> %arg, <2 x half> addrspace(1)* %out 25 ret void 26} 27 28; GCN-LABEL: {{^}}load_v3f16_arg: 29; GCN: buffer_load_ushort 30; GCN: buffer_load_ushort 31; GCN: buffer_load_ushort 32; GCN-NOT: buffer_load 33; GCN-DAG: buffer_store_dword 34; GCN-DAG: buffer_store_short 35; GCN-NOT: buffer_store 36; GCN: s_endpgm 37define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { 38 store <3 x half> %arg, <3 x half> addrspace(1)* %out 39 ret void 40} 41 42; GCN-LABEL: {{^}}load_v4f16_arg: 43; GCN: buffer_load_ushort 44; GCN: buffer_load_ushort 45; GCN: buffer_load_ushort 46; GCN: buffer_load_ushort 47; GCN: buffer_store_dwordx2 48; GCN: s_endpgm 49define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { 50 store <4 x half> %arg, <4 x half> addrspace(1)* %out 51 ret void 52} 53 54; GCN-LABEL: {{^}}load_v8f16_arg: 55define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { 56 store <8 x half> %arg, <8 x half> addrspace(1)* %out 57 ret void 58} 59 60; GCN-LABEL: {{^}}extload_v2f16_arg: 61define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { 62 %fpext = fpext <2 x half> %in to <2 x float> 63 store <2 x float> %fpext, <2 x float> addrspace(1)* %out 64 ret void 65} 66 67; GCN-LABEL: {{^}}extload_f16_to_f32_arg: 68define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { 69 %ext = fpext half %arg to float 70 store float %ext, float addrspace(1)* %out 71 ret void 72} 73 74; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg: 75define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { 76 %ext = fpext <2 x half> %arg to <2 x float> 77 store <2 x float> %ext, <2 x float> addrspace(1)* %out 78 ret void 79} 80 81; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg: 82; GCN: buffer_load_ushort 83; GCN: buffer_load_ushort 84; GCN: buffer_load_ushort 85; GCN-NOT: buffer_load 86; GCN: v_cvt_f32_f16_e32 87; GCN: v_cvt_f32_f16_e32 88; GCN: v_cvt_f32_f16_e32 89; GCN-NOT: v_cvt_f32_f16 90; GCN-DAG: buffer_store_dword 91; GCN-DAG: buffer_store_dwordx2 92; GCN: s_endpgm 93define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { 94 %ext = fpext <3 x half> %arg to <3 x float> 95 store <3 x float> %ext, <3 x float> addrspace(1)* %out 96 ret void 97} 98 99; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg: 100define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { 101 %ext = fpext <4 x half> %arg to <4 x float> 102 store <4 x float> %ext, <4 x float> addrspace(1)* %out 103 ret void 104} 105 106; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg: 107; GCN: buffer_load_ushort 108; GCN: buffer_load_ushort 109; GCN: buffer_load_ushort 110; GCN: buffer_load_ushort 111; GCN: buffer_load_ushort 112; GCN: buffer_load_ushort 113; GCN: buffer_load_ushort 114; GCN: buffer_load_ushort 115 116; GCN: v_cvt_f32_f16_e32 117; GCN: v_cvt_f32_f16_e32 118; GCN: v_cvt_f32_f16_e32 119; GCN: v_cvt_f32_f16_e32 120; GCN: v_cvt_f32_f16_e32 121; GCN: v_cvt_f32_f16_e32 122; GCN: v_cvt_f32_f16_e32 123; GCN: v_cvt_f32_f16_e32 124 125; GCN: buffer_store_dwordx4 126; GCN: buffer_store_dwordx4 127define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { 128 %ext = fpext <8 x half> %arg to <8 x float> 129 store <8 x float> %ext, <8 x float> addrspace(1)* %out 130 ret void 131} 132 133; GCN-LABEL: {{^}}extload_f16_to_f64_arg: 134; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}} 135; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]] 136; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}} 137; VI: v_trunc_f16_e32 v[[VARG:[0-9]+]], [[ARG]] 138; VI: v_cvt_f32_f16_e32 v[[VARG_F32:[0-9]+]], v[[VARG]] 139; VI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[VARG_F32]] 140; GCN: buffer_store_dwordx2 [[RESULT]] 141define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { 142 %ext = fpext half %arg to double 143 store double %ext, double addrspace(1)* %out 144 ret void 145} 146 147; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg: 148; GCN-DAG: buffer_load_ushort v 149; GCN-DAG: buffer_load_ushort v 150; GCN-DAG: v_cvt_f32_f16_e32 151; GCN-DAG: v_cvt_f32_f16_e32 152; GCN-DAG: v_cvt_f64_f32_e32 153; GCN-DAG: v_cvt_f64_f32_e32 154; GCN: s_endpgm 155define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { 156 %ext = fpext <2 x half> %arg to <2 x double> 157 store <2 x double> %ext, <2 x double> addrspace(1)* %out 158 ret void 159} 160 161; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg: 162; GCN-DAG: buffer_load_ushort v 163; GCN-DAG: buffer_load_ushort v 164; GCN-DAG: buffer_load_ushort v 165; GCN-DAG: v_cvt_f32_f16_e32 166; GCN-DAG: v_cvt_f32_f16_e32 167; GCN-DAG: v_cvt_f32_f16_e32 168; GCN-DAG: v_cvt_f64_f32_e32 169; GCN-DAG: v_cvt_f64_f32_e32 170; GCN-DAG: v_cvt_f64_f32_e32 171; GCN: s_endpgm 172define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { 173 %ext = fpext <3 x half> %arg to <3 x double> 174 store <3 x double> %ext, <3 x double> addrspace(1)* %out 175 ret void 176} 177 178; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg: 179; GCN-DAG: buffer_load_ushort v 180; GCN-DAG: buffer_load_ushort v 181; GCN-DAG: buffer_load_ushort v 182; GCN-DAG: buffer_load_ushort v 183; GCN-DAG: v_cvt_f32_f16_e32 184; GCN-DAG: v_cvt_f32_f16_e32 185; GCN-DAG: v_cvt_f32_f16_e32 186; GCN-DAG: v_cvt_f32_f16_e32 187; GCN-DAG: v_cvt_f64_f32_e32 188; GCN-DAG: v_cvt_f64_f32_e32 189; GCN-DAG: v_cvt_f64_f32_e32 190; GCN-DAG: v_cvt_f64_f32_e32 191; GCN: s_endpgm 192define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { 193 %ext = fpext <4 x half> %arg to <4 x double> 194 store <4 x double> %ext, <4 x double> addrspace(1)* %out 195 ret void 196} 197 198; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg: 199; GCN-DAG: buffer_load_ushort v 200; GCN-DAG: buffer_load_ushort v 201; GCN-DAG: buffer_load_ushort v 202; GCN-DAG: buffer_load_ushort v 203 204; GCN-DAG: buffer_load_ushort v 205; GCN-DAG: buffer_load_ushort v 206; GCN-DAG: buffer_load_ushort v 207; GCN-DAG: buffer_load_ushort v 208 209; GCN-DAG: v_cvt_f32_f16_e32 210; GCN-DAG: v_cvt_f32_f16_e32 211; GCN-DAG: v_cvt_f32_f16_e32 212; GCN-DAG: v_cvt_f32_f16_e32 213 214; GCN-DAG: v_cvt_f32_f16_e32 215; GCN-DAG: v_cvt_f32_f16_e32 216; GCN-DAG: v_cvt_f32_f16_e32 217; GCN-DAG: v_cvt_f32_f16_e32 218 219; GCN-DAG: v_cvt_f64_f32_e32 220; GCN-DAG: v_cvt_f64_f32_e32 221; GCN-DAG: v_cvt_f64_f32_e32 222; GCN-DAG: v_cvt_f64_f32_e32 223 224; GCN-DAG: v_cvt_f64_f32_e32 225; GCN-DAG: v_cvt_f64_f32_e32 226; GCN-DAG: v_cvt_f64_f32_e32 227; GCN-DAG: v_cvt_f64_f32_e32 228 229; GCN: s_endpgm 230define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { 231 %ext = fpext <8 x half> %arg to <8 x double> 232 store <8 x double> %ext, <8 x double> addrspace(1)* %out 233 ret void 234} 235 236; GCN-LABEL: {{^}}global_load_store_f16: 237; GCN: buffer_load_ushort [[TMP:v[0-9]+]] 238; GCN: buffer_store_short [[TMP]] 239define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { 240 %val = load half, half addrspace(1)* %in 241 store half %val, half addrspace(1)* %out 242 ret void 243} 244 245; GCN-LABEL: {{^}}global_load_store_v2f16: 246; GCN: buffer_load_dword [[TMP:v[0-9]+]] 247; GCN: buffer_store_dword [[TMP]] 248define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 249 %val = load <2 x half>, <2 x half> addrspace(1)* %in 250 store <2 x half> %val, <2 x half> addrspace(1)* %out 251 ret void 252} 253 254; GCN-LABEL: {{^}}global_load_store_v4f16: 255; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]] 256; GCN: buffer_store_dwordx2 [[TMP]] 257define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { 258 %val = load <4 x half>, <4 x half> addrspace(1)* %in 259 store <4 x half> %val, <4 x half> addrspace(1)* %out 260 ret void 261} 262 263; GCN-LABEL: {{^}}global_load_store_v8f16: 264; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] 265; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] 266; GCN: s_endpgm 267define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { 268 %val = load <8 x half>, <8 x half> addrspace(1)* %in 269 store <8 x half> %val, <8 x half> addrspace(1)* %out 270 ret void 271} 272 273; GCN-LABEL: {{^}}global_extload_f16_to_f32: 274; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] 275; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]] 276; GCN: buffer_store_dword [[CVT]] 277define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { 278 %val = load half, half addrspace(1)* %in 279 %cvt = fpext half %val to float 280 store float %cvt, float addrspace(1)* %out 281 ret void 282} 283 284; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32: 285; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} 286; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] 287; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] 288; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] 289; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 290; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} 291; GCN: s_endpgm 292define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 293 %val = load <2 x half>, <2 x half> addrspace(1)* %in 294 %cvt = fpext <2 x half> %val to <2 x float> 295 store <2 x float> %cvt, <2 x float> addrspace(1)* %out 296 ret void 297} 298 299; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32: 300define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { 301 %val = load <3 x half>, <3 x half> addrspace(1)* %in 302 %cvt = fpext <3 x half> %val to <3 x float> 303 store <3 x float> %cvt, <3 x float> addrspace(1)* %out 304 ret void 305} 306 307; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32: 308define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { 309 %val = load <4 x half>, <4 x half> addrspace(1)* %in 310 %cvt = fpext <4 x half> %val to <4 x float> 311 store <4 x float> %cvt, <4 x float> addrspace(1)* %out 312 ret void 313} 314 315; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32: 316define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { 317 %val = load <8 x half>, <8 x half> addrspace(1)* %in 318 %cvt = fpext <8 x half> %val to <8 x float> 319 store <8 x float> %cvt, <8 x float> addrspace(1)* %out 320 ret void 321} 322 323; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32: 324; GCN: buffer_load_dwordx4 325; GCN: buffer_load_dwordx4 326 327; SI: v_cvt_f32_f16_e32 328; SI: v_cvt_f32_f16_e32 329; SI: v_cvt_f32_f16_e32 330; SI: v_cvt_f32_f16_e32 331; SI: v_cvt_f32_f16_e32 332; SI: v_cvt_f32_f16_e32 333; SI: v_cvt_f32_f16_e32 334; SI: v_cvt_f32_f16_e32 335; SI: v_cvt_f32_f16_e32 336; SI: v_cvt_f32_f16_e32 337; SI: v_cvt_f32_f16_e32 338; SI: v_cvt_f32_f16_e32 339; SI: v_cvt_f32_f16_e32 340; SI: v_cvt_f32_f16_e32 341; SI: v_cvt_f32_f16_e32 342; SI: v_cvt_f32_f16_e32 343 344; VI: v_cvt_f32_f16_e32 345; VI: v_cvt_f32_f16_sdwa 346; ... 347 348; GCN: buffer_store_dwordx4 349; GCN: buffer_store_dwordx4 350; GCN: buffer_store_dwordx4 351; GCN: buffer_store_dwordx4 352 353; GCN: s_endpgm 354define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { 355 %val = load <16 x half>, <16 x half> addrspace(1)* %in 356 %cvt = fpext <16 x half> %val to <16 x float> 357 store <16 x float> %cvt, <16 x float> addrspace(1)* %out 358 ret void 359} 360 361; GCN-LABEL: {{^}}global_extload_f16_to_f64: 362; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] 363; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]] 364; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]] 365; GCN: buffer_store_dwordx2 [[CVT1]] 366define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { 367 %val = load half, half addrspace(1)* %in 368 %cvt = fpext half %val to double 369 store double %cvt, double addrspace(1)* %out 370 ret void 371} 372 373; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64: 374; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} 375 376; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]] 377; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]] 378; SI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] 379; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]] 380; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] 381 382; VI-DAG: v_cvt_f32_f16_sdwa v[[CVT0:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 383; VI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD]] 384; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT0]] 385; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT1]] 386 387; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}} 388; GCN: s_endpgm 389define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { 390 %val = load <2 x half>, <2 x half> addrspace(1)* %in 391 %cvt = fpext <2 x half> %val to <2 x double> 392 store <2 x double> %cvt, <2 x double> addrspace(1)* %out 393 ret void 394} 395 396; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64: 397 398; XSI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] 399; XSI: v_cvt_f32_f16_e32 400; XSI: v_cvt_f32_f16_e32 401; XSI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}} 402; XSI: v_cvt_f32_f16_e32 403; XSI-NOT: v_cvt_f32_f16 404 405; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] 406; XVI: v_cvt_f32_f16_e32 407; XVI: v_cvt_f32_f16_e32 408; XVI: v_cvt_f32_f16_sdwa 409; XVI-NOT: v_cvt_f32_f16 410 411; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]] 412; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]] 413; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]] 414; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]] 415; SI-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]] 416; VI-DAG: v_cvt_f32_f16_sdwa [[Y32:v[0-9]+]], v[[IN_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 417 418; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]] 419; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]] 420; GCN-DAG: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]] 421; GCN-NOT: v_cvt_f64_f32_e32 422 423; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} 424; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 425; GCN: s_endpgm 426define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { 427 %val = load <3 x half>, <3 x half> addrspace(1)* %in 428 %cvt = fpext <3 x half> %val to <3 x double> 429 store <3 x double> %cvt, <3 x double> addrspace(1)* %out 430 ret void 431} 432 433; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64: 434define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { 435 %val = load <4 x half>, <4 x half> addrspace(1)* %in 436 %cvt = fpext <4 x half> %val to <4 x double> 437 store <4 x double> %cvt, <4 x double> addrspace(1)* %out 438 ret void 439} 440 441; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64: 442define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { 443 %val = load <8 x half>, <8 x half> addrspace(1)* %in 444 %cvt = fpext <8 x half> %val to <8 x double> 445 store <8 x double> %cvt, <8 x double> addrspace(1)* %out 446 ret void 447} 448 449; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64: 450define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { 451 %val = load <16 x half>, <16 x half> addrspace(1)* %in 452 %cvt = fpext <16 x half> %val to <16 x double> 453 store <16 x double> %cvt, <16 x double> addrspace(1)* %out 454 ret void 455} 456 457; GCN-LABEL: {{^}}global_truncstore_f32_to_f16: 458; GCN: buffer_load_dword [[LOAD:v[0-9]+]] 459; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]] 460; GCN: buffer_store_short [[CVT]] 461define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { 462 %val = load float, float addrspace(1)* %in 463 %cvt = fptrunc float %val to half 464 store half %cvt, half addrspace(1)* %out 465 ret void 466} 467 468; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16: 469; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} 470; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]] 471 472; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]] 473; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]] 474; SI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[SHL]] 475 476; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 477; VI: v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT0]], [[CVT1]] 478 479; GCN-DAG: buffer_store_dword [[PACKED]] 480; GCN: s_endpgm 481define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { 482 %val = load <2 x float>, <2 x float> addrspace(1)* %in 483 %cvt = fptrunc <2 x float> %val to <2 x half> 484 store <2 x half> %cvt, <2 x half> addrspace(1)* %out 485 ret void 486} 487 488; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16: 489; GCN: buffer_load_dwordx4 490; GCN-DAG: v_cvt_f16_f32_e32 491; SI-DAG: v_cvt_f16_f32_e32 492; VI-DAG: v_cvt_f16_f32_sdwa 493; GCN-DAG: v_cvt_f16_f32_e32 494; GCN: buffer_store_short 495; GCN: buffer_store_dword 496; GCN: s_endpgm 497define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { 498 %val = load <3 x float>, <3 x float> addrspace(1)* %in 499 %cvt = fptrunc <3 x float> %val to <3 x half> 500 store <3 x half> %cvt, <3 x half> addrspace(1)* %out 501 ret void 502} 503 504; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16: 505; GCN: buffer_load_dwordx4 506; GCN-DAG: v_cvt_f16_f32_e32 507; SI-DAG: v_cvt_f16_f32_e32 508; SI-DAG: v_cvt_f16_f32_e32 509; VI-DAG: v_cvt_f16_f32_sdwa 510; VI-DAG: v_cvt_f16_f32_sdwa 511; GCN-DAG: v_cvt_f16_f32_e32 512; GCN: buffer_store_dwordx2 513; GCN: s_endpgm 514define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { 515 %val = load <4 x float>, <4 x float> addrspace(1)* %in 516 %cvt = fptrunc <4 x float> %val to <4 x half> 517 store <4 x half> %cvt, <4 x half> addrspace(1)* %out 518 ret void 519} 520 521; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16: 522; GCN: buffer_load_dwordx4 523; GCN: buffer_load_dwordx4 524; SI: v_cvt_f16_f32_e32 525; SI: v_cvt_f16_f32_e32 526; SI: v_cvt_f16_f32_e32 527; SI: v_cvt_f16_f32_e32 528; SI: v_cvt_f16_f32_e32 529; SI: v_cvt_f16_f32_e32 530; SI: v_cvt_f16_f32_e32 531; SI: v_cvt_f16_f32_e32 532; VI-DAG: v_cvt_f16_f32_e32 533; VI-DAG: v_cvt_f16_f32_e32 534; VI-DAG: v_cvt_f16_f32_e32 535; VI-DAG: v_cvt_f16_f32_e32 536; VI-DAG: v_cvt_f16_f32_sdwa 537; VI-DAG: v_cvt_f16_f32_sdwa 538; VI-DAG: v_cvt_f16_f32_sdwa 539; VI-DAG: v_cvt_f16_f32_sdwa 540; GCN: buffer_store_dwordx4 541; GCN: s_endpgm 542define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { 543 %val = load <8 x float>, <8 x float> addrspace(1)* %in 544 %cvt = fptrunc <8 x float> %val to <8 x half> 545 store <8 x half> %cvt, <8 x half> addrspace(1)* %out 546 ret void 547} 548 549; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16: 550; GCN: buffer_load_dwordx4 551; GCN: buffer_load_dwordx4 552; GCN: buffer_load_dwordx4 553; GCN: buffer_load_dwordx4 554; GCN-DAG: v_cvt_f16_f32_e32 555; GCN-DAG: v_cvt_f16_f32_e32 556; GCN-DAG: v_cvt_f16_f32_e32 557; GCN-DAG: v_cvt_f16_f32_e32 558; GCN-DAG: v_cvt_f16_f32_e32 559; GCN-DAG: v_cvt_f16_f32_e32 560; GCN-DAG: v_cvt_f16_f32_e32 561; GCN-DAG: v_cvt_f16_f32_e32 562; GCN-DAG: v_cvt_f16_f32_e32 563; GCN-DAG: v_cvt_f16_f32_e32 564; GCN-DAG: v_cvt_f16_f32_e32 565; GCN-DAG: v_cvt_f16_f32_e32 566; GCN-DAG: v_cvt_f16_f32_e32 567; GCN-DAG: v_cvt_f16_f32_e32 568; GCN-DAG: v_cvt_f16_f32_e32 569; GCN-DAG: v_cvt_f16_f32_e32 570; GCN-DAG: buffer_store_dwordx4 571; GCN-DAG: buffer_store_dwordx4 572; GCN: s_endpgm 573define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { 574 %val = load <16 x float>, <16 x float> addrspace(1)* %in 575 %cvt = fptrunc <16 x float> %val to <16 x half> 576 store <16 x half> %cvt, <16 x half> addrspace(1)* %out 577 ret void 578} 579 580; FIXME: Unsafe math should fold conversions away 581; GCN-LABEL: {{^}}fadd_f16: 582; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, 583; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, 584; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, 585; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, 586; SI: v_add_f32 587; GCN: s_endpgm 588define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { 589 %add = fadd half %a, %b 590 store half %add, half addrspace(1)* %out, align 4 591 ret void 592} 593 594; GCN-LABEL: {{^}}fadd_v2f16: 595; SI: v_add_f32 596; SI: v_add_f32 597; GCN: s_endpgm 598define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { 599 %add = fadd <2 x half> %a, %b 600 store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8 601 ret void 602} 603 604; GCN-LABEL: {{^}}fadd_v4f16: 605; SI: v_add_f32 606; SI: v_add_f32 607; SI: v_add_f32 608; SI: v_add_f32 609; GCN: s_endpgm 610define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { 611 %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1 612 %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16 613 %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16 614 %result = fadd <4 x half> %a, %b 615 store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16 616 ret void 617} 618 619; GCN-LABEL: {{^}}fadd_v8f16: 620; SI: v_add_f32 621; SI: v_add_f32 622; SI: v_add_f32 623; SI: v_add_f32 624; SI: v_add_f32 625; SI: v_add_f32 626; SI: v_add_f32 627; SI: v_add_f32 628; GCN: s_endpgm 629define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { 630 %add = fadd <8 x half> %a, %b 631 store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32 632 ret void 633} 634 635; GCN-LABEL: {{^}}test_bitcast_from_half: 636; GCN: buffer_load_ushort [[TMP:v[0-9]+]] 637; GCN: buffer_store_short [[TMP]] 638define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { 639 %val = load half, half addrspace(1)* %in 640 %val_int = bitcast half %val to i16 641 store i16 %val_int, i16 addrspace(1)* %out 642 ret void 643} 644 645; GCN-LABEL: {{^}}test_bitcast_to_half: 646; GCN: buffer_load_ushort [[TMP:v[0-9]+]] 647; GCN: buffer_store_short [[TMP]] 648define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { 649 %val = load i16, i16 addrspace(1)* %in 650 %val_fp = bitcast i16 %val to half 651 store half %val_fp, half addrspace(1)* %out 652 ret void 653} 654 655attributes #0 = { nounwind } 656