; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI %s

; half args should be promoted to float for SI and lower.
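; Note: the SI and VI check lines below fetch the same kernarg bytes at
; different-looking immediate offsets because SI/CI SMRD encodes the s_load
; offset in dwords while VI SMEM encodes it in bytes, e.g. byte offset 8
; appears as 0x2 on SI but 0x8 on VI.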

define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
; SI-LABEL: load_f16_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dword s2, s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: flat_store_short v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_f16_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dword s2, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
  store half %arg, half addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
; SI-LABEL: load_v2f16_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dword s2, s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v2f16_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dword s2, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  store <2 x half> %arg, <2 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
; SI-LABEL: load_v3f16_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s4, s0, 4
; SI-NEXT: s_addc_u32 s5, s1, 0
; SI-NEXT: v_mov_b32_e32 v2, s4
; SI-NEXT: v_mov_b32_e32 v4, s3
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v5, s2
; SI-NEXT: flat_store_short v[2:3], v4
; SI-NEXT: flat_store_dword v[0:1], v5
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v3f16_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s0, 4
; VI-NEXT: s_addc_u32 s5, s1, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: flat_store_short v[2:3], v4
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: s_endpgm
  store <3 x half> %arg, <3 x half> addrspace(1)* %out
  ret void
}


; FIXME: Why not one load?
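; (One possible answer, a guess rather than something this test asserts:
; the out pointer at kernarg offset 0 and the <4 x half> value at offset 8
; are adjacent, so a single s_load_dwordx4 could fetch both.)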
define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
; SI-LABEL: load_v4f16_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v4f16_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
  store <4 x half> %arg, <4 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
; SI-LABEL: load_v8f16_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v4, s6
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v8f16_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  store <8 x half> %arg, <8 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
; SI-LABEL: extload_v2f16_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s0, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: v_cvt_f32_f16_e32 v1, s1
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: extload_v2f16_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: v_cvt_f32_f16_e32 v1, s1
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
  %fpext = fpext <2 x half> %in to <2 x float>
  store <2 x float> %fpext, <2 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
; SI-LABEL: extload_f16_to_f32_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: extload_f16_to_f32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %ext = fpext half %arg to float
  store float %ext, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
; SI-LABEL: extload_v2f16_to_v2f32_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s0, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: v_cvt_f32_f16_e32 v1, s1
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: extload_v2f16_to_v2f32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: v_cvt_f32_f16_e32 v1, s1
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
  %ext = fpext <2 x half> %arg to <2 x float>
  store <2 x float> %ext, <2 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
; SI-LABEL: extload_v3f16_to_v3f32_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s2, s0, 16
; SI-NEXT: v_cvt_f32_f16_e32 v2, s1
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v4, s1
; SI-NEXT: v_mov_b32_e32 v3, s0
; SI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; SI-NEXT: s_endpgm
;
; VI-LABEL: extload_v3f16_to_v3f32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s0, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_cvt_f32_f16_e32 v1, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
  %ext = fpext <3 x half> %arg to <3 x float>
  store <3 x float> %ext, <3 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
; SI-LABEL: extload_v4f16_to_v4f32_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s2, s1, 16
; SI-NEXT: s_lshr_b32 s3, s0, 16
; SI-NEXT: v_cvt_f32_f16_e32 v2, s1
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: v_cvt_f32_f16_e32 v3, s2
; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: extload_v4f16_to_v4f32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s1, 16
; VI-NEXT: s_lshr_b32 s3, s0, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_cvt_f32_f16_e32 v3, s2
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %ext = fpext <4 x half> %arg to <4 x float>
  store <4 x float> %ext, <4 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
; SI-LABEL: extload_v8f16_to_v8f32_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s1, 16
; SI-NEXT: s_lshr_b32 s5, s0, 16
; SI-NEXT: s_lshr_b32 s8, s3, 16
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
; SI-NEXT: s_lshr_b32 s4, s2, 16
; SI-NEXT: v_cvt_f32_f16_e32 v7, s8
; SI-NEXT: v_cvt_f32_f16_e32 v5, s4
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: v_cvt_f32_f16_e32 v6, s3
; SI-NEXT: v_cvt_f32_f16_e32 v4, s2
; SI-NEXT: s_add_u32 s0, s6, 16
; SI-NEXT: v_cvt_f32_f16_e32 v2, s1
; SI-NEXT: s_addc_u32 s1, s7, 0
; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v9, s1
; SI-NEXT: v_mov_b32_e32 v8, s0
; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; SI-NEXT: s_nop 0
; SI-NEXT: v_mov_b32_e32 v4, s6
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: extload_v8f16_to_v8f32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s1, 16
; VI-NEXT: s_lshr_b32 s5, s0, 16
; VI-NEXT: s_lshr_b32 s8, s3, 16
; VI-NEXT: v_cvt_f32_f16_e32 v3, s4
; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v7, s8
; VI-NEXT: v_cvt_f32_f16_e32 v5, s4
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: v_cvt_f32_f16_e32 v6, s3
; VI-NEXT: v_cvt_f32_f16_e32 v4, s2
; VI-NEXT: s_add_u32 s0, s6, 16
; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
; VI-NEXT: s_addc_u32 s1, s7, 0
; VI-NEXT: v_cvt_f32_f16_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %ext = fpext <8 x half> %arg to <8 x float>
  store <8 x float> %ext, <8 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
; SI-LABEL: extload_f16_to_f64_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: extload_f16_to_f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
  %ext = fpext half %arg to double
  store double %ext, double addrspace(1)* %out
  ret void
}
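
; Note: these targets have no direct f16-to-f64 conversion instruction, so
; the fpext to double goes through f32 (v_cvt_f32_f16 followed by
; v_cvt_f64_f32), as the checks above and below show.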

define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
; SI-LABEL: extload_v2f16_to_v2f64_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s0, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: v_cvt_f32_f16_e32 v2, s1
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: extload_v2f16_to_v2f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %ext = fpext <2 x half> %arg to <2 x double>
  store <2 x double> %ext, <2 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
; SI-LABEL: extload_v3f16_to_v3f64_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: s_lshr_b32 s4, s2, 16
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
; SI-NEXT: s_add_u32 s2, s0, 16
; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; SI-NEXT: v_mov_b32_e32 v7, s3
; SI-NEXT: v_mov_b32_e32 v6, s2
; SI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: extload_v3f16_to_v3f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_cvt_f32_f16_e32 v2, s4
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %ext = fpext <3 x half> %arg to <3 x double>
  store <3 x double> %ext, <3 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
; SI-LABEL: extload_v4f16_to_v4f64_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s3, 16
; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
; SI-NEXT: v_cvt_f32_f16_e32 v5, s3
; SI-NEXT: s_lshr_b32 s5, s2, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; SI-NEXT: s_add_u32 s2, s0, 16
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; SI-NEXT: v_mov_b32_e32 v9, s3
; SI-NEXT: v_mov_b32_e32 v8, s2
; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; SI-NEXT: s_nop 0
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: extload_v4f16_to_v4f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s3, 16
; VI-NEXT: v_cvt_f32_f16_e32 v4, s5
; VI-NEXT: v_cvt_f32_f16_e32 v5, s3
; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_cvt_f32_f16_e32 v2, s4
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %ext = fpext <4 x half> %arg to <4 x double>
  store <4 x double> %ext, <4 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
; SI-LABEL: extload_v8f16_to_v8f64_arg:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s4, s3, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
; SI-NEXT: v_cvt_f32_f16_e32 v12, s3
; SI-NEXT: s_lshr_b32 s5, s2, 16
; SI-NEXT: s_lshr_b32 s8, s1, 16
; SI-NEXT: s_lshr_b32 s4, s0, 16
; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
; SI-NEXT: v_cvt_f32_f16_e32 v8, s2
; SI-NEXT: v_cvt_f32_f16_e32 v9, s0
; SI-NEXT: s_add_u32 s0, s6, 48
; SI-NEXT: v_cvt_f32_f16_e32 v5, s1
; SI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0
; SI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; SI-NEXT: s_addc_u32 s1, s7, 0
; SI-NEXT: v_cvt_f32_f16_e32 v4, s8
; SI-NEXT: v_mov_b32_e32 v17, s1
; SI-NEXT: v_mov_b32_e32 v16, s0
; SI-NEXT: s_add_u32 s0, s6, 32
; SI-NEXT: v_cvt_f32_f16_e32 v2, s4
; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; SI-NEXT: s_addc_u32 s1, s7, 0
; SI-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4
; SI-NEXT: v_mov_b32_e32 v13, s1
; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; SI-NEXT: v_mov_b32_e32 v12, s0
; SI-NEXT: s_add_u32 s0, s6, 16
; SI-NEXT: s_addc_u32 s1, s7, 0
; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; SI-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; SI-NEXT: s_nop 0
; SI-NEXT: v_mov_b32_e32 v9, s1
; SI-NEXT: v_mov_b32_e32 v8, s0
; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; SI-NEXT: s_nop 0
; SI-NEXT: v_mov_b32_e32 v4, s6
; SI-NEXT: v_mov_b32_e32 v5, s7
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: extload_v8f16_to_v8f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s0, 16
; VI-NEXT: s_lshr_b32 s8, s2, 16
; VI-NEXT: s_lshr_b32 s9, s3, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s4
; VI-NEXT: v_cvt_f32_f16_e32 v4, s8
; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
; VI-NEXT: v_cvt_f32_f16_e32 v12, s3
; VI-NEXT: s_lshr_b32 s5, s1, 16
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: v_cvt_f32_f16_e32 v8, s2
; VI-NEXT: s_add_u32 s0, s6, 48
; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4
; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5
; VI-NEXT: v_cvt_f32_f16_e32 v4, s1
; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12
; VI-NEXT: s_addc_u32 s1, s7, 0
; VI-NEXT: v_cvt_f32_f16_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v17, s1
; VI-NEXT: v_mov_b32_e32 v16, s0
; VI-NEXT: s_add_u32 s0, s6, 32
; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; VI-NEXT: s_addc_u32 s1, s7, 0
; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1
; VI-NEXT: v_mov_b32_e32 v13, s1
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; VI-NEXT: v_mov_b32_e32 v12, s0
; VI-NEXT: s_add_u32 s0, s6, 16
; VI-NEXT: s_addc_u32 s1, s7, 0
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT: s_nop 0
; VI-NEXT: v_mov_b32_e32 v4, s6
; VI-NEXT: v_mov_b32_e32 v5, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %ext = fpext <8 x half> %arg to <8 x double>
  store <8 x double> %ext, <8 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
; GCN-LABEL: global_load_store_f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_load_ushort v2, v[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
  %val = load half, half addrspace(1)* %in
  store half %val, half addrspace(1)* %out
  ret void
}
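
; Note: in the plain load/store tests here and below nothing is converted,
; so the value stays in its in-memory form and both run lines share the
; same GCN check lines.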

define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; GCN-LABEL: global_load_store_v2f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_load_dword v2, v[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  store <2 x half> %val, <2 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
; GCN-LABEL: global_load_store_v4f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  store <4 x half> %val, <4 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
; GCN-LABEL: global_load_store_v8f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: v_mov_b32_e32 v5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  store <8 x half> %val, <8 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
; GCN-LABEL: global_extload_f16_to_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_ushort v0, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to float
  store float %cvt, float addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; SI-LABEL: global_extload_v2f16_to_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dword v1, v[0:1]
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: v_mov_b32_e32 v3, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_extload_v2f16_to_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v1
; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x float>
  store <2 x float> %cvt, <2 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
; SI-LABEL: global_extload_v3f16_to_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
; SI-NEXT: v_mov_b32_e32 v3, s0
; SI-NEXT: v_mov_b32_e32 v4, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_extload_v3f16_to_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v1
; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
; VI-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x float>
  store <3 x float> %cvt, <3 x float> addrspace(1)* %out
  ret void
}
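
; Note: to convert the high 16 bits of a loaded dword, SI needs an explicit
; v_lshrrev_b32 before v_cvt_f32_f16, while VI folds the extraction into
; the conversion itself with the SDWA form (src0_sel:WORD_1).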

define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
; SI-LABEL: global_extload_v4f16_to_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dwordx2 v[3:4], v[0:1]
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v4
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v3
; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v4
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_extload_v4f16_to_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v4
; VI-NEXT: v_cvt_f32_f16_e32 v2, v5
; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x float>
  store <4 x float> %cvt, <4 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
; SI-LABEL: global_extload_v8f16_to_v8f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; SI-NEXT: s_add_u32 s2, s0, 16
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: v_mov_b32_e32 v13, s1
; SI-NEXT: v_mov_b32_e32 v12, s0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v10, v3
; SI-NEXT: v_cvt_f32_f16_e32 v8, v2
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v6, v1
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v11, v3
; SI-NEXT: v_cvt_f32_f16_e32 v9, v2
; SI-NEXT: v_cvt_f32_f16_e32 v7, v1
; SI-NEXT: v_cvt_f32_f16_e32 v5, v0
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; SI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_extload_v8f16_to_v8f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v13, s1
; VI-NEXT: v_mov_b32_e32 v12, s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v10, v3
; VI-NEXT: v_cvt_f32_f16_e32 v8, v2
; VI-NEXT: v_cvt_f32_f16_sdwa v11, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v6, v1
; VI-NEXT: v_cvt_f32_f16_e32 v4, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; VI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; VI-NEXT: s_endpgm
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x float>
  store <8 x float> %cvt, <8 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
; SI-LABEL: global_extload_v16f16_to_v16f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s4, s2, 16
; SI-NEXT: v_mov_b32_e32 v5, s3
; SI-NEXT: s_addc_u32 s5, s3, 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v4, s2
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; SI-NEXT: s_add_u32 s2, s0, 16
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: v_mov_b32_e32 v14, s3
; SI-NEXT: v_mov_b32_e32 v13, s2
; SI-NEXT: s_add_u32 s2, s0, 48
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v11, v7
; SI-NEXT: v_cvt_f32_f16_e32 v9, v6
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; SI-NEXT: v_cvt_f32_f16_e32 v12, v7
; SI-NEXT: v_cvt_f32_f16_e32 v10, v6
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: flat_store_dwordx4 v[13:14], v[9:12]
; SI-NEXT: s_nop 0
; SI-NEXT: v_cvt_f32_f16_e32 v12, v3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v5
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4
; SI-NEXT: v_cvt_f32_f16_e32 v6, v0
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v10, v2
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v5
; SI-NEXT: v_cvt_f32_f16_e32 v0, v4
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: v_cvt_f32_f16_e32 v9, v1
; SI-NEXT: v_cvt_f32_f16_e32 v13, v3
; SI-NEXT: v_cvt_f32_f16_e32 v3, v16
; SI-NEXT: v_cvt_f32_f16_e32 v1, v17
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: s_add_u32 s0, s0, 32
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: s_addc_u32 s1, s1, 0
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_mov_b32_e32 v15, s3
; SI-NEXT: v_mov_b32_e32 v17, s1
; SI-NEXT: v_mov_b32_e32 v14, s2
; SI-NEXT: v_mov_b32_e32 v16, s0
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
; SI-NEXT: flat_store_dwordx4 v[16:17], v[6:9]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_extload_v16f16_to_v16f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v19, s3
; VI-NEXT: v_mov_b32_e32 v18, s2
; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: v_mov_b32_e32 v17, s1
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v16, s0
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v21, s3
; VI-NEXT: v_mov_b32_e32 v20, s2
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v14, v3
; VI-NEXT: v_cvt_f32_f16_e32 v12, v2
; VI-NEXT: v_cvt_f32_f16_sdwa v15, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v13, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v10, v1
; VI-NEXT: v_cvt_f32_f16_e32 v8, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v11, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v2, v5
; VI-NEXT: v_cvt_f32_f16_e32 v14, v7
; VI-NEXT: v_cvt_f32_f16_e32 v12, v6
; VI-NEXT: v_cvt_f32_f16_sdwa v15, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v13, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v0, v4
; VI-NEXT: v_cvt_f32_f16_sdwa v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; VI-NEXT: flat_store_dwordx4 v[20:21], v[12:15]
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x float>
  store <16 x float> %cvt, <16 x float> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
; GCN-LABEL: global_extload_f16_to_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_ushort v0, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
  %val = load half, half addrspace(1)* %in
  %cvt = fpext half %val to double
  store double %cvt, double addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
; SI-LABEL: global_extload_v2f16_to_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dword v0, v[0:1]
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v2, v1
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_extload_v2f16_to_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %val = load <2 x half>, <2 x half> addrspace(1)* %in
  %cvt = fpext <2 x half> %val to <2 x double>
  store <2 x double> %cvt, <2 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
; SI-LABEL: global_extload_v3f16_to_v3f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; SI-NEXT: s_add_u32 s2, s0, 16
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: v_mov_b32_e32 v7, s3
; SI-NEXT: v_mov_b32_e32 v6, s2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1
; SI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_extload_v3f16_to_v3f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v3, v1
; VI-NEXT: v_cvt_f32_f16_e32 v2, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v3
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: flat_store_dwordx2 v[8:9], v[6:7]
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %val = load <3 x half>, <3 x half> addrspace(1)* %in
  %cvt = fpext <3 x half> %val to <3 x double>
  store <3 x double> %cvt, <3 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
; SI-LABEL: global_extload_v4f16_to_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; SI-NEXT: s_add_u32 s2, s0, 16
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: v_mov_b32_e32 v9, s1
; SI-NEXT: v_mov_b32_e32 v8, s0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v3, v1
; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v10, v0
; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2
; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10
; SI-NEXT: v_mov_b32_e32 v11, s3
; SI-NEXT: v_mov_b32_e32 v10, s2
; SI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; SI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_extload_v4f16_to_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v3, v1
; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v2, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v3
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10
; VI-NEXT: v_mov_b32_e32 v11, s3
; VI-NEXT: v_mov_b32_e32 v10, s2
; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
  %val = load <4 x half>, <4 x half> addrspace(1)* %in
  %cvt = fpext <4 x half> %val to <4 x double>
  store <4 x double> %cvt, <4 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
; SI-LABEL: global_extload_v8f16_to_v8f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; SI-NEXT: s_add_u32 s2, s0, 48
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: v_mov_b32_e32 v7, s3
; SI-NEXT: v_mov_b32_e32 v6, s2
; SI-NEXT: s_add_u32 s2, s0, 32
; SI-NEXT: v_mov_b32_e32 v13, s1
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: v_mov_b32_e32 v12, s0
; SI-NEXT: s_add_u32 s0, s0, 16
; SI-NEXT: v_mov_b32_e32 v15, s3
; SI-NEXT: s_addc_u32 s1, s1, 0
; SI-NEXT: v_mov_b32_e32 v14, s2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v8, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v4
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v10, v1
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: v_cvt_f32_f16_e32 v16, v5
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v3
; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
; SI-NEXT: v_cvt_f32_f16_e32 v17, v9
; SI-NEXT: v_cvt_f32_f16_e32 v18, v11
; SI-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; SI-NEXT: s_nop 0
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10
; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v16
; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17
; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4
; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v18
; SI-NEXT: v_mov_b32_e32 v17, s1
; SI-NEXT: v_mov_b32_e32 v16, s0
; SI-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; SI-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; SI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_extload_v8f16_to_v8f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v8, s3
; VI-NEXT: v_mov_b32_e32 v7, s2
; VI-NEXT: s_add_u32 s2, s0, 32
; VI-NEXT: v_mov_b32_e32 v13, s1
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v12, s0
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: v_mov_b32_e32 v15, s3
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v14, s2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v9, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v0, v3
; VI-NEXT: v_cvt_f32_f16_sdwa v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v10, v1
; VI-NEXT: v_cvt_f32_f16_e32 v11, v2
; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0
; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v5
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v17, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
; VI-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v11
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v10
; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v17
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v16
; VI-NEXT: v_mov_b32_e32 v17, s1
; VI-NEXT: v_mov_b32_e32 v16, s0
; VI-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; VI-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; VI-NEXT: s_endpgm
  %val = load <8 x half>, <8 x half> addrspace(1)* %in
  %cvt = fpext <8 x half> %val to <8 x double>
  store <8 x double> %cvt, <8 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
; SI-LABEL: global_extload_v16f16_to_v16f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: s_add_u32 s2, s2, 16
; SI-NEXT: s_addc_u32 s3, s3, 0
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_mov_b32_e32 v3, s3
; SI-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; SI-NEXT: flat_load_dwordx4 v[0:3], v[2:3]
; SI-NEXT: s_add_u32 s2, s0, 48
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: v_mov_b32_e32 v14, s3
; SI-NEXT: v_mov_b32_e32 v13, s2
; SI-NEXT: s_add_u32 s2, s0, 32
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: v_mov_b32_e32 v16, s3
; SI-NEXT: v_mov_b32_e32 v15, s2
; SI-NEXT: s_add_u32 s2, s0, 16
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: v_mov_b32_e32 v18, s3
; SI-NEXT: v_mov_b32_e32 v17, s2
; SI-NEXT: s_add_u32 s2, s0, 0x70
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: v_mov_b32_e32 v12, s1
; SI-NEXT: v_mov_b32_e32 v11, s0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v9, v8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v19, v3
; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v0
; SI-NEXT: v_cvt_f64_f32_e32 v[7:8], v7
; SI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
; SI-NEXT: flat_store_dwordx4 v[13:14], v[7:10]
; SI-NEXT: s_nop 0
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v8, v7
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3
; SI-NEXT: v_cvt_f32_f16_e32 v21, v0
; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; SI-NEXT: flat_store_dwordx4 v[15:16], v[6:9]
; SI-NEXT: s_nop 0
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; SI-NEXT: v_cvt_f32_f16_e32 v9, v4
; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5
; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: flat_store_dwordx4 v[17:18], v[4:7]
; SI-NEXT: s_nop 0
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v6, v2
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v7, v1
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9
; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
; SI-NEXT: v_cvt_f32_f16_e32 v8, v10
; SI-NEXT: v_mov_b32_e32 v14, s3
; SI-NEXT: v_mov_b32_e32 v13, s2
; SI-NEXT: s_add_u32 s2, s0, 0x60
; SI-NEXT: v_cvt_f32_f16_e32 v10, v4
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: flat_store_dwordx4 v[11:12], v[0:3]
; SI-NEXT: v_cvt_f32_f16_e32 v12, v5
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v19
; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8
; SI-NEXT: v_mov_b32_e32 v16, s3
; SI-NEXT: v_cvt_f32_f16_e32 v19, v20
; SI-NEXT: v_mov_b32_e32 v15, s2
; SI-NEXT: s_add_u32 s2, s0, 0x50
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6
; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10
; SI-NEXT: s_add_u32 s0, s0, 64
; SI-NEXT: flat_store_dwordx4 v[13:14], v[0:3]
; SI-NEXT: s_addc_u32 s1, s1, 0
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7
; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v12
; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v21
; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v19
; SI-NEXT: v_mov_b32_e32 v18, s3
; SI-NEXT: v_mov_b32_e32 v13, s1
; SI-NEXT: v_mov_b32_e32 v17, s2
; SI-NEXT: v_mov_b32_e32 v12, s0
; SI-NEXT: flat_store_dwordx4 v[15:16], v[8:11]
; SI-NEXT: flat_store_dwordx4 v[17:18], v[0:3]
; SI-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_extload_v16f16_to_v16f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[0:3], v[2:3]
; VI-NEXT: s_add_u32 s2, s0, 48
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v14, s3
; VI-NEXT: v_mov_b32_e32 v13, s2
; VI-NEXT: s_add_u32 s2, s0, 32
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v16, s3
; VI-NEXT: v_mov_b32_e32 v15, s2
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v18, s3
; VI-NEXT: v_mov_b32_e32 v17, s2
; VI-NEXT: s_add_u32 s2, s0, 0x70
; VI-NEXT: v_mov_b32_e32 v12, s1
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_mov_b32_e32 v11, s0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v8, v7
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[7:8], v8
; VI-NEXT: v_cvt_f64_f32_e32 v[9:10], v9
; VI-NEXT: flat_store_dwordx4 v[13:14], v[7:10]
; VI-NEXT: s_nop 0
; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_f16_e32 v10, v0
; VI-NEXT: v_mov_b32_e32 v14, s3
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9]
; VI-NEXT: s_nop 0
; VI-NEXT: v_cvt_f32_f16_e32 v6, v5
; VI-NEXT: v_cvt_f32_f16_sdwa v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v8, v4
; VI-NEXT: v_cvt_f32_f16_sdwa v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7
; VI-NEXT: flat_store_dwordx4 v[17:18], v[4:7]
; VI-NEXT: s_nop 0
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v8
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v9
; VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_sdwa v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v0, v3
; VI-NEXT: v_mov_b32_e32 v13, s2
; VI-NEXT: s_add_u32 s2, s0, 0x60
; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v8
; VI-NEXT: v_cvt_f32_f16_e32 v8, v2
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v7, v1
; VI-NEXT: v_cvt_f32_f16_sdwa v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0
; VI-NEXT: v_mov_b32_e32 v16, s3
; VI-NEXT: v_mov_b32_e32 v15, s2
; VI-NEXT: s_add_u32 s2, s0, 0x50
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10
; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8
; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2
; VI-NEXT: s_add_u32 s0, s0, 64
; VI-NEXT: flat_store_dwordx4 v[13:14], v[3:6]
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12
; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17
; VI-NEXT: v_mov_b32_e32 v20, s3
; VI-NEXT: v_mov_b32_e32 v13, s1
; VI-NEXT: v_mov_b32_e32 v19, s2
; VI-NEXT: v_mov_b32_e32 v12, s0
; VI-NEXT: flat_store_dwordx4 v[15:16], v[8:11]
; VI-NEXT: flat_store_dwordx4 v[19:20], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; VI-NEXT: s_endpgm
  %val = load <16 x half>, <16 x half> addrspace(1)* %in
  %cvt = fpext <16 x half> %val to <16 x double>
  store <16 x double> %cvt, <16 x double> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
; GCN-LABEL: global_truncstore_f32_to_f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_load_dword v0, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
  %val = load float, float addrspace(1)* %in
  %cvt = fptrunc float %val to half
  store half %cvt, half addrspace(1)* %out
  ret void
}
define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v2f32_to_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v1
; SI-NEXT: v_cvt_f16_f32_e32 v3, v0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v2f32_to_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v3, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %val = load <2 x float>, <2 x float> addrspace(1)* %in
  %cvt = fptrunc <2 x float> %val to <2 x half>
  store <2 x half> %cvt, <2 x half> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v3f32_to_v3f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; SI-NEXT: s_add_u32 s2, s0, 4
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v1
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v4, v0
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: flat_store_short v[0:1], v2
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_or_b32_e32 v2, v4, v3
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v3f32_to_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; VI-NEXT: s_add_u32 s2, s0, 4
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v4, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_or_b32_e32 v3, v4, v3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
  %val = load <3 x float>, <3 x float> addrspace(1)* %in
  %cvt = fptrunc <3 x float> %val to <3 x half>
  store <3 x half> %cvt, <3 x half> addrspace(1)* %out
  ret void
}

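; The four converted halves are packed into two dwords and stored with a
; single flat_store_dwordx2.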
define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v4f32_to_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v6
; SI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v4f32_to_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v5, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_or_b32_e32 v3, v2, v3
; VI-NEXT: v_or_b32_e32 v2, v5, v4
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
  %val = load <4 x float>, <4 x float> addrspace(1)* %in
  %cvt = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %cvt, <4 x half> addrspace(1)* %out
  ret void
}

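; The <8 x float> source is split into two dwordx4 loads; the converted
; halves are packed into four dwords for a single flat_store_dwordx4.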
define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v8f32_to_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s4, s2, 16
; SI-NEXT: v_mov_b32_e32 v5, s3
; SI-NEXT: s_addc_u32 s5, s3, 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v4, s2
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; SI-NEXT: v_mov_b32_e32 v8, s0
; SI-NEXT: v_mov_b32_e32 v9, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v10, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v6, v0
; SI-NEXT: v_or_b32_e32 v0, v4, v5
; SI-NEXT: v_or_b32_e32 v3, v2, v3
; SI-NEXT: v_or_b32_e32 v2, v10, v7
; SI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v8f32_to_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 16
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: v_mov_b32_e32 v9, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v4, v4
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v11, v0
; VI-NEXT: v_or_b32_e32 v1, v6, v7
; VI-NEXT: v_or_b32_e32 v0, v4, v5
; VI-NEXT: v_or_b32_e32 v3, v2, v3
; VI-NEXT: v_or_b32_e32 v2, v11, v10
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
  %val = load <8 x float>, <8 x float> addrspace(1)* %in
  %cvt = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %cvt, <8 x half> addrspace(1)* %out
  ret void
}

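; Same pattern scaled up: four dwordx4 loads, sixteen conversions, and two
; dwordx4 stores of the packed result.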
define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
; SI-LABEL: global_truncstore_v16f32_to_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s4, s2, 32
; SI-NEXT: s_addc_u32 s5, s3, 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: s_add_u32 s4, s2, 48
; SI-NEXT: v_mov_b32_e32 v13, s3
; SI-NEXT: s_addc_u32 s5, s3, 0
; SI-NEXT: v_mov_b32_e32 v12, s2
; SI-NEXT: v_mov_b32_e32 v4, s4
; SI-NEXT: s_add_u32 s2, s2, 16
; SI-NEXT: v_mov_b32_e32 v5, s5
; SI-NEXT: s_addc_u32 s3, s3, 0
; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; SI-NEXT: v_mov_b32_e32 v9, s3
; SI-NEXT: v_mov_b32_e32 v8, s2
; SI-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; SI-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; SI-NEXT: s_add_u32 s2, s0, 16
; SI-NEXT: s_addc_u32 s3, s1, 0
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v16, v5
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_cvt_f16_f32_e32 v17, v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v15, v15
; SI-NEXT: v_cvt_f16_f32_e32 v13, v13
; SI-NEXT: v_cvt_f16_f32_e32 v11, v11
; SI-NEXT: v_cvt_f16_f32_e32 v9, v9
; SI-NEXT: v_cvt_f16_f32_e32 v14, v14
; SI-NEXT: v_cvt_f16_f32_e32 v12, v12
; SI-NEXT: v_cvt_f16_f32_e32 v10, v10
; SI-NEXT: v_cvt_f16_f32_e32 v8, v8
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_mov_b32_e32 v5, s3
; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16
; SI-NEXT: v_mov_b32_e32 v4, s2
; SI-NEXT: v_or_b32_e32 v0, v0, v18
; SI-NEXT: v_or_b32_e32 v3, v6, v2
; SI-NEXT: v_or_b32_e32 v2, v17, v7
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v13
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: v_or_b32_e32 v1, v14, v6
; SI-NEXT: v_or_b32_e32 v0, v12, v7
; SI-NEXT: v_or_b32_e32 v3, v10, v11
; SI-NEXT: v_or_b32_e32 v2, v8, v9
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: global_truncstore_v16f32_to_v16f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 32
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_add_u32 s4, s2, 48
; VI-NEXT: v_mov_b32_e32 v13, s3
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v12, s2
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: s_add_u32 s2, s2, 16
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v9, s3
; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; VI-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; VI-NEXT: s_add_u32 s2, s0, 16
; VI-NEXT: s_addc_u32 s3, s1, 0
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v16, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cvt_f16_f32_sdwa v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
; VI-NEXT: v_cvt_f16_f32_sdwa v17, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v18, v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v14, v14
; VI-NEXT: v_cvt_f16_f32_sdwa v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v12, v12
; VI-NEXT: v_cvt_f16_f32_sdwa v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v10, v10
; VI-NEXT: v_cvt_f16_f32_sdwa v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v8, v8
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_or_b32_e32 v1, v2, v3
; VI-NEXT: v_or_b32_e32 v0, v0, v16
; VI-NEXT: v_or_b32_e32 v3, v6, v7
; VI-NEXT: v_or_b32_e32 v2, v18, v17
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_or_b32_e32 v1, v14, v15
; VI-NEXT: v_or_b32_e32 v0, v12, v13
; VI-NEXT: v_or_b32_e32 v3, v10, v11
; VI-NEXT: v_or_b32_e32 v2, v8, v9
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %val = load <16 x float>, <16 x float> addrspace(1)* %in
  %cvt = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %cvt, <16 x half> addrspace(1)* %out
  ret void
}

; FIXME: Unsafe math should fold conversions away
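; SI has no f16 arithmetic: the operands are extended with v_cvt_f32_f16,
; added in f32, and truncated back. VI adds directly with v_add_f16.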
define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
; SI-LABEL: fadd_f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: s_lshr_b32 s0, s0, 16
; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: v_add_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_short v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: fadd_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dword s2, s[4:5], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_add_f16_e32 v2, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
  %add = fadd half %a, %b
  store half %add, half addrspace(1)* %out, align 4
  ret void
}

define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
; SI-LABEL: fadd_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s0, s[4:5], 0x2
; SI-NEXT: s_load_dword s1, s[4:5], 0x3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s2, s0, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: s_lshr_b32 s0, s1, 16
; SI-NEXT: v_cvt_f32_f16_e32 v1, s1
; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
; SI-NEXT: v_cvt_f32_f16_e32 v3, s0
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SI-NEXT: v_add_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_add_f32_e32 v1, v2, v3
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v2, v0, v1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: flat_store_dword v[0:1], v2
; SI-NEXT: s_endpgm
;
; VI-LABEL: fadd_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; VI-NEXT: s_load_dword s2, s[4:5], 0x8
; VI-NEXT: s_load_dword s3, s[4:5], 0xc
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s2, 16
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_f16_e32 v1, s2, v1
; VI-NEXT: v_or_b32_e32 v2, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
  %add = fadd <2 x half> %a, %b
  store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
  ret void
}

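; Here the operands come from memory rather than kernel arguments; VI handles
; the high halves of each packed dword with v_add_f16_sdwa.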
define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
; SI-LABEL: fadd_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s4, s2, 8
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: s_addc_u32 s5, s3, 0
; SI-NEXT: v_mov_b32_e32 v2, s4
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_mov_b32_e32 v3, s5
; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; SI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: v_mov_b32_e32 v5, s1
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v6, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v7, v1
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v9, v3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v7, v7, v9
; SI-NEXT: v_add_f32_e32 v6, v6, v8
; SI-NEXT: v_add_f32_e32 v1, v1, v3
; SI-NEXT: v_add_f32_e32 v0, v0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v2, v7
; SI-NEXT: v_cvt_f16_f32_e32 v3, v6
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_or_b32_e32 v0, v3, v0
; SI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: fadd_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_addc_u32 s5, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v6, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_add_f16_e32 v1, v1, v3
; VI-NEXT: v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_add_f16_e32 v0, v0, v2
; VI-NEXT: v_or_b32_e32 v1, v1, v6
; VI-NEXT: v_or_b32_e32 v0, v0, v3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
  %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
  %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
  %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
  %result = fadd <4 x half> %a, %b
  store <4 x half> %result, <4 x half> addrspace(1)* %out, align 16
  ret void
}

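; VI scalarizes the <8 x half> add into four v_add_f16_e32/v_add_f16_sdwa
; pairs, one pair per packed dword.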
define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
; SI-LABEL: fadd_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s10, s0, 16
; SI-NEXT: v_cvt_f32_f16_e32 v4, s0
; SI-NEXT: s_lshr_b32 s0, s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v8, s0
; SI-NEXT: s_lshr_b32 s0, s5, 16
; SI-NEXT: s_lshr_b32 s11, s1, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s10
; SI-NEXT: s_lshr_b32 s10, s2, 16
; SI-NEXT: v_cvt_f32_f16_e32 v9, s0
; SI-NEXT: s_lshr_b32 s0, s6, 16
; SI-NEXT: v_cvt_f32_f16_e32 v1, s11
; SI-NEXT: v_cvt_f32_f16_e32 v2, s10
; SI-NEXT: s_lshr_b32 s10, s3, 16
; SI-NEXT: v_cvt_f32_f16_e32 v10, s0
; SI-NEXT: s_lshr_b32 s0, s7, 16
; SI-NEXT: v_cvt_f32_f16_e32 v3, s10
; SI-NEXT: v_cvt_f32_f16_e32 v5, s1
; SI-NEXT: v_cvt_f32_f16_e32 v11, s0
; SI-NEXT: v_cvt_f32_f16_e32 v12, s4
; SI-NEXT: v_cvt_f32_f16_e32 v13, s5
; SI-NEXT: v_cvt_f32_f16_e32 v6, s2
; SI-NEXT: v_cvt_f32_f16_e32 v7, s3
; SI-NEXT: v_cvt_f32_f16_e32 v14, s7
; SI-NEXT: v_cvt_f32_f16_e32 v15, s6
; SI-NEXT: v_add_f32_e32 v1, v1, v9
; SI-NEXT: v_add_f32_e32 v0, v0, v8
; SI-NEXT: v_add_f32_e32 v3, v3, v11
; SI-NEXT: v_add_f32_e32 v2, v2, v10
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_add_f32_e32 v5, v5, v13
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_add_f32_e32 v4, v4, v12
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_add_f32_e32 v7, v7, v14
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_add_f32_e32 v6, v6, v15
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v7, v7
; SI-NEXT: v_cvt_f16_f32_e32 v6, v6
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v5, v1
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: v_mov_b32_e32 v4, s8
; SI-NEXT: v_or_b32_e32 v3, v7, v3
; SI-NEXT: v_or_b32_e32 v2, v6, v2
; SI-NEXT: v_mov_b32_e32 v5, s9
; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: fadd_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x20
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s11, s3, 16
; VI-NEXT: s_lshr_b32 s10, s7, 16
; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_f16_e32 v1, s3, v1
; VI-NEXT: s_lshr_b32 s3, s6, 16
; VI-NEXT: s_lshr_b32 s7, s2, 16
; VI-NEXT: v_or_b32_e32 v3, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_add_f16_e32 v1, s2, v1
; VI-NEXT: s_lshr_b32 s2, s5, 16
; VI-NEXT: s_lshr_b32 s3, s1, 16
; VI-NEXT: v_or_b32_e32 v2, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_f16_e32 v1, s1, v1
; VI-NEXT: s_lshr_b32 s1, s4, 16
; VI-NEXT: s_lshr_b32 s2, s0, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_f16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_add_f16_e32 v4, s0, v4
; VI-NEXT: v_or_b32_e32 v0, v4, v0
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
  %add = fadd <8 x half> %a, %b
  store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
  ret void
}

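; Bitcasts between half and i16 should be no-ops: a 16-bit load and store
; with no conversion instructions.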
define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
; GCN-LABEL: test_bitcast_from_half:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_load_ushort v0, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_short v[2:3], v0
; GCN-NEXT: s_endpgm
  %val = load half, half addrspace(1)* %in
  %val_int = bitcast half %val to i16
  store i16 %val_int, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
; GCN-LABEL: test_bitcast_to_half:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_load_ushort v2, v[2:3]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
  %val = load i16, i16 addrspace(1)* %in
  %val_fp = bitcast i16 %val to half
  store half %val_fp, half addrspace(1)* %out
  ret void
}

attributes #0 = { nounwind }