; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

; Code generation tests for the half type and vectors of half on amdgcn.
; The file is compiled twice (see the two llc invocations above): once with
; the default subtarget, checked with the GCN and SI prefixes, and once with
; -mcpu=tonga, checked with the GCN and VI prefixes.
;
; Functions that carry only a label check have no instruction-level
; expectations; they only require that the function is emitted.

; half args should be promoted to float for SI and lower.

;===------------------------------------------------------------------------===
; half / <N x half> passed as an argument and stored unchanged.
;===------------------------------------------------------------------------===

; GCN-LABEL: {{^}}load_f16_arg:
; GCN: s_load_dword [[ARG:s[0-9]+]]
; SI: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
; VI: v_trunc_f16_e32 [[CVT:v[0-9]+]], [[ARG]]
; GCN: buffer_store_short [[CVT]]
define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
  store half %arg, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v2f16_arg:
; GCN-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
; GCN: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]]
; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: s_endpgm
define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
  store <2 x half> %arg, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v3f16_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN-NOT: buffer_load
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_short
; GCN-NOT: buffer_store
; GCN: s_endpgm
define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
  store <3 x half> %arg, <3 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v4f16_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
  store <4 x half> %arg, <4 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}load_v8f16_arg:
define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
  store <8 x half> %arg, <8 x half> addrspace(1)* %out
  ret void
}

;===------------------------------------------------------------------------===
; half / <N x half> argument extended to float with fpext, then stored.
;===------------------------------------------------------------------------===

; GCN-LABEL: {{^}}extload_v2f16_arg:
define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
  %fpext = fpext <2 x half> %in to <2 x float>
  store <2 x float> %fpext, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_f16_to_f32_arg:
define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to float
  store float %ext, float addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg:
define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x float>
  store <2 x float> %ext, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN-NOT: buffer_load
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN-NOT: v_cvt_f32_f16
; GCN-DAG: buffer_store_dword
; GCN-DAG: buffer_store_dwordx2
; GCN: s_endpgm
define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x float>
  store <3 x float> %ext, <3 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg:
define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x float>
  store <4 x float> %ext, <4 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x float>
  store <8 x float> %ext, <8 x float> addrspace(1)* %out
  ret void
}

;===------------------------------------------------------------------------===
; half / <N x half> argument extended to double with fpext, then stored.
;===------------------------------------------------------------------------===

; GCN-LABEL: {{^}}extload_f16_to_f64_arg:
; SI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
; SI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[ARG]]
; VI: s_load_dword [[ARG:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c{{$}}
; VI: v_trunc_f16_e32 v[[VARG:[0-9]+]], [[ARG]]
; VI: v_cvt_f32_f16_e32 v[[VARG_F32:[0-9]+]], v[[VARG]]
; VI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[VARG_F32]]
; GCN: buffer_store_dwordx2 [[RESULT]]
define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
  %ext = fpext half %arg to double
  store double %ext, double addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
  %ext = fpext <2 x half> %arg to <2 x double>
  store <2 x double> %ext, <2 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
  %ext = fpext <3 x half> %arg to <3 x double>
  store <3 x double> %ext, <3 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: s_endpgm
define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
  %ext = fpext <4 x half> %arg to <4 x double>
  store <4 x double> %ext, <4 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v

; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v
; GCN-DAG: buffer_load_ushort v

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32

; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32

; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32

; GCN: s_endpgm
define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
  %ext = fpext <8 x half> %arg to <8 x double>
  store <8 x double> %ext, <8 x double> addrspace(1)* %out
  ret void
}

;===------------------------------------------------------------------------===
; half / <N x half> loaded through an addrspace(1) pointer and stored back
; unchanged. Each store check reuses the register captured by the load check,
; so the value stored must be the value loaded.
;===------------------------------------------------------------------------===

; GCN-LABEL: {{^}}global_load_store_f16:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  store half %val, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_load_store_v2f16:
; GCN: buffer_load_dword [[TMP:v[0-9]+]]
; GCN: buffer_store_dword [[TMP]]
define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  store <2 x half> %val, <2 x half> addrspace(1)* %out
  ret void
}

; Note: unlike its siblings, this function takes %in before %out.
; GCN-LABEL: {{^}}global_load_store_v4f16:
; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[TMP]]
define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  store <4 x half> %val, <4 x half> addrspace(1)* %out
  ret void
}

; The store check uses the TMP capture from the load check (rather than
; defining a fresh capture), matching the narrower load/store tests above.
; GCN-LABEL: {{^}}global_load_store_v8f16:
; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[TMP]]
; GCN: s_endpgm
define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  store <8 x half> %val, <8 x half> addrspace(1)* %out
  ret void
}

;===------------------------------------------------------------------------===
; half / <N x half> loaded through an addrspace(1) pointer and extended to
; float with fpext.
;===------------------------------------------------------------------------===

; GCN-LABEL: {{^}}global_extload_f16_to_f32:
; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_dword [[CVT]]
define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

; The shift that extracts the high half is expected before the first convert
; on VI and after it on SI, hence the per-target shift checks.
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; VI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
; GCN: s_endpgm
define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x float>
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32:
define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x float>
  store <3 x float> %cvt, <3 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32:
define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32:
define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x float>
  store <8 x float> %cvt, <8 x float> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4

; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4

; GCN: s_endpgm
define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x float>
  store <16 x float> %cvt, <16 x float> addrspace(1)* %out
  ret void
}

;===------------------------------------------------------------------------===
; half / <N x half> loaded through an addrspace(1) pointer and extended to
; double with fpext. The checks expect a half-to-float convert followed by a
; float-to-double convert.
;===------------------------------------------------------------------------===

; GCN-LABEL: {{^}}global_extload_f16_to_f64:
; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]]
; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]]
; GCN: buffer_store_dwordx2 [[CVT1]]
define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to double
  store double %cvt, double addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
; GCN: s_endpgm
define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x double>
  store <2 x double> %cvt, <2 x double> addrspace(1)* %out
  ret void
}

; The XSI / XVI lines below use prefixes that neither FileCheck invocation
; enables, so they are not checked; only the GCN, SI and VI lines are active.
; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f64:

; XSI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; XSI: v_cvt_f32_f16_e32
; XSI: v_cvt_f32_f16_e32
; XSI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
; XSI: v_cvt_f32_f16_e32
; XSI-NOT: v_cvt_f32_f16

; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
; XVI: v_cvt_f32_f16_e32
; XVI: v_cvt_f32_f16_e32
; XVI: v_cvt_f32_f16_e32
; XVI-NOT: v_cvt_f32_f16

; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
; VI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
; GCN: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
; GCN: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
; SI: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
; GCN: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]

; GCN: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
; GCN: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
; GCN: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
; GCN-NOT: v_cvt_f64_f32_e32

; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
; GCN: s_endpgm
define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x double>
  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64:
define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x double>
  store <4 x double> %cvt, <4 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64:
define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x double>
  store <8 x double> %cvt, <8 x double> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64:
define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x double>
  store <16 x double> %cvt, <16 x double> addrspace(1)* %out
  ret void
}

;===------------------------------------------------------------------------===
; float / <N x float> loaded, truncated to half with fptrunc, then stored.
;===------------------------------------------------------------------------===

; GCN-LABEL: {{^}}global_truncstore_f32_to_f16:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]]
; GCN: buffer_store_short [[CVT]]
define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %val = load float, float addrspace(1)* %in
  %cvt = fptrunc float %val to half
  store half %cvt, half addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16:
; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
; GCN-DAG: buffer_store_dword [[PACKED]]
; GCN: s_endpgm
define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
  %val = load <2 x float>, <2 x float> addrspace(1)* %in
  %cvt = fptrunc <2 x float> %val to <2 x half>
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN-NOT: v_cvt_f16_f32_e32
; GCN: buffer_store_short
; GCN: buffer_store_dword
; GCN: s_endpgm
define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
  %val = load <3 x float>, <3 x float> addrspace(1)* %in
  %cvt = fptrunc <3 x float> %val to <3 x half>
  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16:
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: buffer_store_dwordx2
; GCN: s_endpgm
define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %cvt = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: v_cvt_f16_f32_e32
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
  %val = load <8 x float>, <8 x float> addrspace(1)* %in
  %cvt = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16:
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN: buffer_load_dwordx4
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: v_cvt_f16_f32_e32
; GCN-DAG: buffer_store_dwordx4
; GCN-DAG: buffer_store_dwordx4
; GCN: s_endpgm
define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
  %val = load <16 x float>, <16 x float> addrspace(1)* %in
  %cvt = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
  ret void
}

;===------------------------------------------------------------------------===
; fadd on half / <N x half>. Only the SI run has instruction checks here; it
; expects the addition to be performed with v_add_f32.
;===------------------------------------------------------------------------===

; FIXME: Unsafe math should fold conversions away
; GCN-LABEL: {{^}}fadd_f16:
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
  %add = fadd half %a, %b
  store half %add, half addrspace(1)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}fadd_v2f16:
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
  %add = fadd <2 x half> %a, %b
  store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
  ret void
}

; Both operands are loaded from consecutive <4 x half> elements of %in.
; GCN-LABEL: {{^}}fadd_v4f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
  %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
  %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
  %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
  %result = fadd <4 x half> %a, %b
  store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
  ret void
}

; GCN-LABEL: {{^}}fadd_v8f16:
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; SI: v_add_f32
; GCN: s_endpgm
define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
  %add = fadd <8 x half> %a, %b
  store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
  ret void
}

;===------------------------------------------------------------------------===
; Bitcasts between half and i16. The checks expect a plain 16-bit load and a
; store of the same register, i.e. no conversion instruction in between.
;===------------------------------------------------------------------------===

; Note: this function takes %in before %out.
; GCN-LABEL: {{^}}test_bitcast_from_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
  %val = load half, half addrspace(1)* %in
  %val_int = bitcast half %val to i16
  store i16 %val_int, i16 addrspace(1)* %out
  ret void
}

; GCN-LABEL: {{^}}test_bitcast_to_half:
; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
; GCN: buffer_store_short [[TMP]]
define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
  %val = load i16, i16 addrspace(1)* %in
  %val_fp = bitcast i16 %val to half
  store half %val_fp, half addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }