; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-MUBUF %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-FLATSCR %s

; Checks instruction selection for 16-bit and extended 8-bit loads whose result
; becomes element 1 (the high half) of a <2 x i16> or <2 x half> value.
; The two gfx900 runs expect the d16_hi load forms; they differ only in the
; private-memory checks (buffer_* versus scratch_* with +enable-flat-scratch).
; The gfx906 and fiji runs share a prefix that expects the plain loads.

; ---- Local memory, addrspace(3) ----

; The low element is also stored, so it has a use besides the vector build.
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo:
; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: ds_read_u16 v2, v0
; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; GFX900-DAG: s_waitcnt lgkmcnt(0)
; GFX900-DAG: v_mov_b32_e32 v1, v2
; GFX900-DAG: ds_read_u16_d16_hi v1, v0 offset:16
; GFX900: ds_write_b16 [[ZERO]], v2
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  %load.hi = load i16, i16 addrspace(3)* %gep
  store i16 %load.lo, i16 addrspace(3)* null
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; The high element is also stored; the checks expect two plain ds_read_u16
; loads combined with v_lshl_or_b32.
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_hi:
; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-DAG: ds_read_u16 [[HI:v[0-9]+]], v0 offset:16
; GFX900-DAG: ds_read_u16 [[LO:v[0-9]+]], v0
; GFX900-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; GFX900-DAG: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LO]]
; GFX900-DAG: s_waitcnt lgkmcnt(1)
; GFX900-DAG: ds_write_b16 [[ZERO]], [[HI]]
; GFX900: v_lshl_or_b32 v{{[0-9]+}}, [[HI]], 16, [[AND]]
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  %load.hi = load i16, i16 addrspace(3)* %gep
  store i16 %load.hi, i16 addrspace(3)* null
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; Both elements are stored as well as being packed into the result.
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lohi:
; GFX900: ds_read_u16 v3, v0
; GFX900-NEXT: ds_read_u16 v0, v0 offset:16
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: ds_write_b16 v1, v3
; GFX900-NEXT: s_waitcnt lgkmcnt(1)
; GFX900-NEXT: ds_write_b16 v2, v0
; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: s_setpc_b64 s[30:31]
define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalias %in, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  %load.hi = load i16, i16 addrspace(3)* %gep
  store i16 %load.lo, i16 addrspace(3)* %out0
  store i16 %load.hi, i16 addrspace(3)* %out1
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; Low element left undef.
; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> undef, i16 %load, i32 1
  ret <2 x i16> %build
}

; Low element comes from the %reg argument.
; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  ret <2 x i16> %build1
}

; Show that we get reasonable regalloc without physreg constraints.
; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Low element is zero; the checks expect the destination to be zeroed first.
; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
; GCN: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v1, 0
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
  ret <2 x i16> %build
}

; FIXME: Remove m0 initialization
; Scalar zext + shl by 16 rather than a vector insert; every run expects a
; plain load followed by a shift.
; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16 v0, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0
define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %zext = zext i16 %load to i32
  %shift = shl i32 %zext, 16
  ret i32 %shift
}

; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16 v
define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load half, half addrspace(3)* %in
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_i8 v
define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u8 v
define void @load_local_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half

  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_i8 v
define void @load_local_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
entry:
  %load = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half

  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; ---- Global memory, addrspace(1) ----
; These use negative element offsets (-2047 halfwords / -4095 bytes) that the
; checks expect to be folded into the instruction's offset field.

; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
  %load = load i16, i16 addrspace(1)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
  %load = load half, half addrspace(1)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
  %load = load i8, i8 addrspace(1)* %gep
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; ---- Flat pointers (no address space qualifier) ----
; The fiji run expects a shift plus v_or_b32_sdwa to pack the halves; the
; gfx906 run expects v_lshl_or_b32.

; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
entry:
  %load = load i16, i16* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
entry:
  %load = load half, half* %in
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2f16_reglo_vreg_zexti8(i8* %in, half %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v[0:1], v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; GFX803: v_or_b32_sdwa
; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
define void @load_flat_hi_v2f16_reglo_vreg_sexti8(i8* %in, half %reg) #0 {
entry:
  %load = load i8, i8* %in
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; ---- Private memory, addrspace(5) ----
; The byval variants address relative to s32. The "nooff" variants do a
; volatile load from the constant address 4094, where the checks expect an
; absolute offset and the glc bit.

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
  %load = load i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval(half) %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
  %load = load half, half addrspace(5)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; The buffer-load check below previously used a misspelled prefix that no RUN
; line enables, so it was never evaluated; it now uses the real prefix and
; expects glc like the other checks for this volatile load.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094 glc{{$}}
; GFX900-FLATSCR: s_movk_i32 [[SOFF:[^,]+]], 0xffe
; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]] glc{{$}}
; GFX900: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094 glc{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval(i16) %in, i16 %reg) #0 {
entry:
  %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]] glc{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094 glc{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
entry:
  %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval(i8) %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval(i8) %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]] glc{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]] glc{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
; GCN: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094 glc{{$}}
; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]] glc{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 glc{{$}}
define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
entry:
  %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
  %ext = zext i8 %load to i16
  %bc.ext = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; ---- Constant memory, addrspace(4) ----
; The gfx900 checks expect the same global_load_*_d16_hi forms as the
; addrspace(1) tests.

; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort
; GFX906: global_load_ushort
define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
  %load = load i16, i16 addrspace(4)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64

; GFX803: flat_load_ushort
; GFX906: global_load_ushort
define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
  %load = load half, half addrspace(4)* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
  %load = load i8, i8 addrspace(4)* %gep
  %ext = sext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; An i8 load zero-extended to i16 and bitcast to half is inserted as element 1;
; the gfx900 checks expect a single ubyte d16_hi load at byte offset -4095.
; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_store_dword
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
  %load = load i8, i8 addrspace(4)* %gep
  %ext = zext i8 %load to i16
  %bitcast = bitcast i16 %ext to half
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
  ret void
}

; Local object gives known offset, so requires converting from offen
; to offset variant.

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4058
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4058
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 {
entry:
  %obj1 = alloca [4096 x i16], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  ; Volatile store through the incoming pointer; the checks require it to be
  ; emitted before the d16_hi load from the local object.
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
  %load = load i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Same shape as the i16 to_offset test, but the high half comes from a
; sign-extended i8 element of a [4096 x i8] alloca.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4059
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4059
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 {
entry:
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %load = load i8, i8 addrspace(5)* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; Zero-extending counterpart of the sexti8 to_offset test.
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
; GFX900-MUBUF: buffer_store_dword
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4059
; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR: scratch_store_dword
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4059
; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0)
define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg, [10 x i32] addrspace(5)* %obj0) #0 {
entry:
  %obj1 = alloca [4096 x i8], align 2, addrspace(5)
  %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
  store volatile i32 123, i32 addrspace(5)* %bc
  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
  %load = load i8, i8 addrspace(5)* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
  ret void
}

; FIXME: Remove m0 init and waitcnt between reads
; FIXME: Is there a cost to using the extload over not?
; GCN-LABEL: {{^}}load_local_v2i16_split_multi_chain:
; GCN: s_waitcnt
; GFX900-NEXT: ds_read_u16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_local_v2i16_split_multi_chain(i16 addrspace(3)* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
  ; Both halves are volatile loads, so they must stay separate and in order.
  %load0 = load volatile i16, i16 addrspace(3)* %in
  %load1 = load volatile i16, i16 addrspace(3)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
}

; Two non-volatile LDS loads, 8 elements (16 bytes) apart, building one vector.
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_samechain:
; GFX900: ds_read_u16 v1, v0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v0, v1
; GFX900-NEXT: s_setpc_b64

; NO-D16-HI: ds_read_u16
; NO-D16-HI: ds_read_u16
define <2 x i16> @load_local_lo_hi_v2i16_samechain(i16 addrspace(3)* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  %load.hi = load i16, i16 addrspace(3)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; The same loaded value is placed in both halves of the result.
; NOTE: the and/lshl_or checks previously used a prefix that no RUN line
; enables, so FileCheck silently skipped them; they now use the gfx900 prefix
; that both gfx900 RUN lines pass.
; FIXME: Remove and
; GCN-LABEL: {{^}}load_local_v2i16_broadcast:
; GCN: ds_read_u16 [[LOAD:v[0-9]+]]
; GCN-NOT: ds_read
; GFX900: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
; GFX900: v_lshl_or_b32 v0, [[LOAD]], 16, [[AND]]
define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
  %load0 = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
  ret <2 x i16> %build1
}

; A store through a pointer that may alias sits between the two loads; the
; checks require the write to stay between the lo read and the hi read.
; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_side_effect:
; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0
; GFX900: ds_write_b16
; GFX900: ds_read_u16_d16_hi [[LOAD0]], v0 offset:16

; NO-D16-HI: ds_read_u16
; NO-D16-HI: ds_write_b16
; NO-D16-HI: ds_read_u16
define <2 x i16> @load_local_lo_hi_v2i16_side_effect(i16 addrspace(3)* %in, i16 addrspace(3)* %may.alias) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
  %load.lo = load i16, i16 addrspace(3)* %in
  store i16 123, i16 addrspace(3)* %may.alias
  %load.hi = load i16, i16 addrspace(3)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
  ret <2 x i16> %build1
}

; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_global_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
  %load0 = load volatile i16, i16 addrspace(1)* %in
  %load1 = load volatile i16, i16 addrspace(1)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
}

; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_flat_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: flat_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: flat_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16* %in, i64 1
  %load0 = load volatile i16, i16* %in
  %load1 = load volatile i16, i16* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
}

; FIXME: Remove waitcnt between reads
; GCN-LABEL: {{^}}load_constant_v2i16_split:
; GCN: s_waitcnt
; GFX900-NEXT: global_load_ushort v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: global_load_short_d16_hi v2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, v2
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
  %load0 = load volatile i16, i16 addrspace(4)* %in
  %load1 = load volatile i16, i16 addrspace(4)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
}

; FIXME: Remove m0 init and waitcnt between reads
; FIXME: Is there a cost to using the extload over not?
; GCN-LABEL: {{^}}load_private_v2i16_split:
; GCN: s_waitcnt
; GFX900-MUBUF: buffer_load_ushort v0, off, s[0:3], s32 glc{{$}}
; GFX900-FLATSCR: scratch_load_ushort v0, off, s32 glc{{$}}
; GFX900-NEXT: s_waitcnt
; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:2
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval(i16) %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
  ; Both halves are volatile loads from the byval private pointer.
  %load0 = load volatile i16, i16 addrspace(5)* %in
  %load1 = load volatile i16, i16 addrspace(5)* %gep
  %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
  ret <2 x i16> %build1
}

; FIXME: This test should work without copying of v0.
;        ds_read_u16_d16_hi preserves low 16 bits of the destination
;        and ds_write_b16 only reads low 16 bits.
; The label below anchors these checks to this function's output; without it
; they were only ordered after the previous function's checks.
; GCN-LABEL: {{^}}load_local_hi_v2i16_store_local_lo:
; GCN: s_waitcnt
; GFX900: v_mov_b32_e32 [[COPY:v[0-9]+]], v0
; GFX900-NEXT: ds_read_u16_d16_hi [[COPY]], v1
; GFX900-NEXT: ds_write_b16 v1, v0
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: v_mov_b32_e32 v0, [[COPY]]
; GFX900-NEXT: s_waitcnt
; GFX900-NEXT: s_setpc_b64
define <2 x i16> @load_local_hi_v2i16_store_local_lo(i16 %reg, i16 addrspace(3)* %in) #0 {
entry:
  %load = load i16, i16 addrspace(3)* %in
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
  ; Volatile store of the low-half source to the same address that was loaded.
  store volatile i16 %reg, i16 addrspace(3)* %in
  ret <2 x i16> %build1
}

attributes #0 = { nounwind }