1; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s 2; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,SICIVI,FUNC %s 3; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,FUNC %s 4; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s 5 6; Testing for ds_read/write_b128 7; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s 8; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s 9 10; FUNC-LABEL: {{^}}local_load_i8: 11; GCN-NOT: s_wqm_b64 12; SICIVI: s_mov_b32 m0 13; GFX9-NOT: m0 14; GCN: ds_read_u8 15 16; EG: LDS_UBYTE_READ_RET 17define amdgpu_kernel void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { 18entry: 19 %ld = load i8, i8 addrspace(3)* %in 20 store i8 %ld, i8 addrspace(3)* %out 21 ret void 22} 23 24; FUNC-LABEL: {{^}}local_load_v2i8: 25; GCN-NOT: s_wqm_b64 26; SICIVI: s_mov_b32 m0 27; GFX9-NOT: m0 28; GCN: ds_read_u16 29 30; EG: LDS_USHORT_READ_RET 31define amdgpu_kernel void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { 32entry: 33 %ld = load <2 x i8>, <2 x i8> addrspace(3)* %in 34 store <2 x i8> %ld, <2 x i8> addrspace(3)* %out 35 ret void 36} 37 38; FUNC-LABEL: {{^}}local_load_v3i8: 39; GFX9-NOT: m0 40; GCN: ds_read_b32 41 42; EG: DS_READ_RET 43define amdgpu_kernel void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { 44entry: 45 %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in 46 store <3 x i8> %ld, <3 x i8> addrspace(3)* %out 47 ret void 48} 49 50; FUNC-LABEL: {{^}}local_load_v4i8: 51; GFX9-NOT: m0 52; GCN: ds_read_b32 53 54; EG: LDS_READ_RET 55define amdgpu_kernel void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { 56entry: 57 %ld = load <4 x i8>, <4 x i8> addrspace(3)* %in 58 store <4 x i8> %ld, <4 x i8> addrspace(3)* %out 59 ret void 60} 61 62; FUNC-LABEL: {{^}}local_load_v8i8: 63; GFX9-NOT: m0 64; GCN: ds_read_b64 65 66; EG: LDS_READ_RET 67; EG: LDS_READ_RET 68define amdgpu_kernel void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { 69entry: 70 %ld = load <8 x i8>, <8 x i8> addrspace(3)* %in 71 store <8 x i8> %ld, <8 x i8> addrspace(3)* %out 72 ret void 73} 74 75; FUNC-LABEL: {{^}}local_load_v16i8: 76; GFX9-NOT: m0 77; GCN: ds_read2_b64 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]], v{{[0-9]+}} offset1:1{{$}} 78; GCN: ds_write2_b64 v{{[0-9]+}}, v[[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]] offset1:1{{$}} 79 80; EG: LDS_READ_RET 81; EG: LDS_READ_RET 82; EG: LDS_READ_RET 83; EG: LDS_READ_RET 84define amdgpu_kernel void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { 85entry: 86 %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in 87 store <16 x i8> %ld, <16 x i8> addrspace(3)* %out 88 ret void 89} 90 91; FUNC-LABEL: {{^}}local_zextload_i8_to_i32: 92; GFX9-NOT: m0 93; GCN-NOT: s_wqm_b64 94; SICIVI: s_mov_b32 m0 95; GCN: ds_read_u8 96 97; EG: LDS_UBYTE_READ_RET 98define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { 99 %a = load i8, i8 addrspace(3)* %in 100 %ext = zext i8 %a to i32 101 store i32 %ext, i32 addrspace(3)* %out 102 ret void 103} 104 105; FUNC-LABEL: {{^}}local_sextload_i8_to_i32: 106; GCN-NOT: s_wqm_b64 107; GFX9-NOT: m0 108; SICIVI: s_mov_b32 m0 109; GCN: ds_read_i8 110 111; EG: LDS_UBYTE_READ_RET 112; EG: BFE_INT 113define amdgpu_kernel void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { 114 %ld = load i8, i8 addrspace(3)* %in 115 %ext = sext i8 %ld to i32 116 store i32 %ext, i32 addrspace(3)* %out 117 ret void 118} 119 120; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32: 121 122; EG: LDS_UBYTE_READ_RET 123define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { 124 %load = load <1 x i8>, <1 x i8> addrspace(3)* %in 125 %ext = zext <1 x i8> %load to <1 x i32> 126 store <1 x i32> %ext, <1 x i32> addrspace(3)* %out 127 ret void 128} 129 130; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i32: 131; GFX9-NOT: m0 132 133; EG: LDS_UBYTE_READ_RET 134; EG: BFE_INT 135define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { 136 %load = load <1 x i8>, <1 x i8> addrspace(3)* %in 137 %ext = sext <1 x i8> %load to <1 x i32> 138 store <1 x i32> %ext, <1 x i32> addrspace(3)* %out 139 ret void 140} 141 142; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i32: 143; GFX9-NOT: m0 144; GCN: ds_read_u16 145 146; EG: LDS_USHORT_READ_RET 147define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { 148 %load = load <2 x i8>, <2 x i8> addrspace(3)* %in 149 %ext = zext <2 x i8> %load to <2 x i32> 150 store <2 x i32> %ext, <2 x i32> addrspace(3)* %out 151 ret void 152} 153 154; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i32: 155; GCN-NOT: s_wqm_b64 156; GFX9-NOT: m0 157; SICIVI: s_mov_b32 m0 158; GCN: ds_read_u16 159; FIXME: Need to optimize this sequence to avoid extra shift on VI. 160; t23: i16 = srl t39, Constant:i32<8> 161; t31: i32 = any_extend t23 162; t33: i32 = sign_extend_inreg t31, ValueType:ch:i8 163 164; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 165; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 166 167; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}} 168; VI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 169; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8 170 171; EG: LDS_USHORT_READ_RET 172; EG-DAG: BFE_INT 173; EG-DAG: BFE_INT 174define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { 175 %load = load <2 x i8>, <2 x i8> addrspace(3)* %in 176 %ext = sext <2 x i8> %load to <2 x i32> 177 store <2 x i32> %ext, <2 x i32> addrspace(3)* %out 178 ret void 179} 180 181; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32: 182; GFX9-NOT: m0 183; GCN: ds_read_b32 184 185; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8 186; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, {{v[0-9]+}} 187; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8 188; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, 189 190; EG: LDS_READ_RET 191define amdgpu_kernel void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { 192entry: 193 %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in 194 %ext = zext <3 x i8> %ld to <3 x i32> 195 store <3 x i32> %ext, <3 x i32> addrspace(3)* %out 196 ret void 197} 198 199; FUNC-LABEL: {{^}}local_sextload_v3i8_to_v3i32: 200; GCN-NOT: s_wqm_b64 201; GFX9-NOT: m0 202; SICIVI: s_mov_b32 m0 203; GCN: ds_read_b32 204 205; GCN-DAG: v_bfe_i32 206; GCN-DAG: v_bfe_i32 207; GCN-DAG: v_bfe_i32 208; GCN-DAG: v_bfe_i32 209 210; SI-DAG: ds_write_b64 211; SI-DAG: ds_write_b32 212; CIVI-DAG: ds_write_b96 213; GFX9-DAG: ds_write_b96 214 215; EG: LDS_READ_RET 216; EG-DAG: BFE_INT 217; EG-DAG: BFE_INT 218; EG-DAG: BFE_INT 219define amdgpu_kernel void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { 220entry: 221 %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in 222 %ext = sext <3 x i8> %ld to <3 x i32> 223 store <3 x i32> %ext, <3 x i32> addrspace(3)* %out 224 ret void 225} 226 227; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i32: 228; GCN-NOT: s_wqm_b64 229; GFX9-NOT: m0 230; SICIVI: s_mov_b32 m0 231; GCN: ds_read_b32 232 233; EG: LDS_READ_RET 234; EG-DAG: BFE_UINT 235; EG-DAG: BFE_UINT 236; EG-DAG: BFE_UINT 237define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { 238 %load = load <4 x i8>, <4 x i8> addrspace(3)* %in 239 %ext = zext <4 x i8> %load to <4 x i32> 240 store <4 x i32> %ext, <4 x i32> addrspace(3)* %out 241 ret void 242} 243 244; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i32: 245; GCN-NOT: s_wqm_b64 246; GFX9-NOT: m0 247; SICIVI: s_mov_b32 m0 248; GCN: ds_read_b32 249 250; EG-DAG: LDS_READ_RET 251; EG-DAG: BFE_INT 252; EG-DAG: BFE_INT 253; EG-DAG: BFE_INT 254; EG-DAG: BFE_INT 255define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { 256 %load = load <4 x i8>, <4 x i8> addrspace(3)* %in 257 %ext = sext <4 x i8> %load to <4 x i32> 258 store <4 x i32> %ext, <4 x i32> addrspace(3)* %out 259 ret void 260} 261 262; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i32: 263; SICIVI: s_mov_b32 m0 264; GFX9-NOT: m0 265 266; EG-DAG: LDS_READ_RET 267; EG-DAG: LDS_READ_RET 268; EG-DAG: BFE_UINT 269; EG-DAG: BFE_UINT 270; EG-DAG: BFE_UINT 271; EG-DAG: BFE_UINT 272; EG-DAG: BFE_UINT 273; EG-DAG: BFE_UINT 274define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { 275 %load = load <8 x i8>, <8 x i8> addrspace(3)* %in 276 %ext = zext <8 x i8> %load to <8 x i32> 277 store <8 x i32> %ext, <8 x i32> addrspace(3)* %out 278 ret void 279} 280 281; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i32: 282; SICIVI: s_mov_b32 m0 283; GFX9-NOT: m0 284 285; EG-DAG: LDS_READ_RET 286; EG-DAG: LDS_READ_RET 287; EG-DAG: BFE_INT 288; EG-DAG: BFE_INT 289; EG-DAG: BFE_INT 290; EG-DAG: BFE_INT 291; EG-DAG: BFE_INT 292; EG-DAG: BFE_INT 293; EG-DAG: BFE_INT 294; EG-DAG: BFE_INT 295define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { 296 %load = load <8 x i8>, <8 x i8> addrspace(3)* %in 297 %ext = sext <8 x i8> %load to <8 x i32> 298 store <8 x i32> %ext, <8 x i32> addrspace(3)* %out 299 ret void 300} 301 302; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i32: 303; SICIVI: s_mov_b32 m0 304; GFX9-NOT: m0 305 306; EG-DAG: LDS_READ_RET 307; EG-DAG: LDS_READ_RET 308; EG-DAG: LDS_READ_RET 309; EG-DAG: LDS_READ_RET 310; EG-DAG: BFE_UINT 311; EG-DAG: BFE_UINT 312; EG-DAG: BFE_UINT 313; EG-DAG: BFE_UINT 314; EG-DAG: BFE_UINT 315; EG-DAG: BFE_UINT 316; EG-DAG: BFE_UINT 317; EG-DAG: BFE_UINT 318; EG-DAG: BFE_UINT 319; EG-DAG: BFE_UINT 320; EG-DAG: BFE_UINT 321; EG-DAG: BFE_UINT 322define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { 323 %load = load <16 x i8>, <16 x i8> addrspace(3)* %in 324 %ext = zext <16 x i8> %load to <16 x i32> 325 store <16 x i32> %ext, <16 x i32> addrspace(3)* %out 326 ret void 327} 328 329; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i32: 330; SICIVI: s_mov_b32 m0 331; GFX9-NOT: m0 332 333; EG-DAG: LDS_READ_RET 334; EG-DAG: LDS_READ_RET 335; EG-DAG: LDS_READ_RET 336; EG-DAG: LDS_READ_RET 337; EG-DAG: BFE_INT 338; EG-DAG: BFE_INT 339; EG-DAG: BFE_INT 340; EG-DAG: BFE_INT 341; EG-DAG: BFE_INT 342; EG-DAG: BFE_INT 343; EG-DAG: BFE_INT 344; EG-DAG: BFE_INT 345; EG-DAG: BFE_INT 346; EG-DAG: BFE_INT 347; EG-DAG: BFE_INT 348; EG-DAG: BFE_INT 349; EG-DAG: BFE_INT 350; EG-DAG: BFE_INT 351; EG-DAG: BFE_INT 352; EG-DAG: BFE_INT 353define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { 354 %load = load <16 x i8>, <16 x i8> addrspace(3)* %in 355 %ext = sext <16 x i8> %load to <16 x i32> 356 store <16 x i32> %ext, <16 x i32> addrspace(3)* %out 357 ret void 358} 359 360; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i32: 361; SICIVI: s_mov_b32 m0 362; GFX9-NOT: m0 363 364; EG-DAG: LDS_READ_RET 365; EG-DAG: LDS_READ_RET 366; EG-DAG: LDS_READ_RET 367; EG-DAG: LDS_READ_RET 368; EG-DAG: LDS_READ_RET 369; EG-DAG: LDS_READ_RET 370; EG-DAG: LDS_READ_RET 371; EG-DAG: LDS_READ_RET 372define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { 373 %load = load <32 x i8>, <32 x i8> addrspace(3)* %in 374 %ext = zext <32 x i8> %load to <32 x i32> 375 store <32 x i32> %ext, <32 x i32> addrspace(3)* %out 376 ret void 377} 378 379; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i32: 380; SICIVI: s_mov_b32 m0 381; GFX9-NOT: m0 382 383; EG-DAG: LDS_READ_RET 384; EG-DAG: LDS_READ_RET 385; EG-DAG: LDS_READ_RET 386; EG-DAG: LDS_READ_RET 387; EG-DAG: LDS_READ_RET 388; EG-DAG: LDS_READ_RET 389; EG-DAG: LDS_READ_RET 390; EG-DAG: LDS_READ_RET 391define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { 392 %load = load <32 x i8>, <32 x i8> addrspace(3)* %in 393 %ext = sext <32 x i8> %load to <32 x i32> 394 store <32 x i32> %ext, <32 x i32> addrspace(3)* %out 395 ret void 396} 397 398; FUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i32: 399; SICIVI: s_mov_b32 m0 400; GFX9-NOT: m0 401 402; EG-DAG: LDS_READ_RET 403; EG-DAG: LDS_READ_RET 404; EG-DAG: LDS_READ_RET 405; EG-DAG: LDS_READ_RET 406; EG-DAG: LDS_READ_RET 407; EG-DAG: LDS_READ_RET 408; EG-DAG: LDS_READ_RET 409; EG-DAG: LDS_READ_RET 410; EG-DAG: LDS_READ_RET 411; EG-DAG: LDS_READ_RET 412; EG-DAG: LDS_READ_RET 413; EG-DAG: LDS_READ_RET 414; EG-DAG: LDS_READ_RET 415; EG-DAG: LDS_READ_RET 416; EG-DAG: LDS_READ_RET 417; EG-DAG: LDS_READ_RET 418define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { 419 %load = load <64 x i8>, <64 x i8> addrspace(3)* %in 420 %ext = zext <64 x i8> %load to <64 x i32> 421 store <64 x i32> %ext, <64 x i32> addrspace(3)* %out 422 ret void 423} 424 425; FUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i32: 426; SICIVI: s_mov_b32 m0 427; GFX9-NOT: m0 428 429; EG-DAG: LDS_READ_RET 430; EG-DAG: LDS_READ_RET 431; EG-DAG: LDS_READ_RET 432; EG-DAG: LDS_READ_RET 433; EG-DAG: LDS_READ_RET 434; EG-DAG: LDS_READ_RET 435; EG-DAG: LDS_READ_RET 436; EG-DAG: LDS_READ_RET 437; EG-DAG: LDS_READ_RET 438; EG-DAG: LDS_READ_RET 439; EG-DAG: LDS_READ_RET 440; EG-DAG: LDS_READ_RET 441; EG-DAG: LDS_READ_RET 442; EG-DAG: LDS_READ_RET 443; EG-DAG: LDS_READ_RET 444; EG-DAG: LDS_READ_RET 445define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { 446 %load = load <64 x i8>, <64 x i8> addrspace(3)* %in 447 %ext = sext <64 x i8> %load to <64 x i32> 448 store <64 x i32> %ext, <64 x i32> addrspace(3)* %out 449 ret void 450} 451 452; FUNC-LABEL: {{^}}local_zextload_i8_to_i64: 453; SICIVI: s_mov_b32 m0 454; GFX9-NOT: m0 455 456; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} 457; GCN-DAG: ds_read_u8 v[[LO:[0-9]+]], 458; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]] 459 460; EG: LDS_UBYTE_READ_RET 461; EG: MOV {{.*}}, literal 462; EG: 0.0 463define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { 464 %a = load i8, i8 addrspace(3)* %in 465 %ext = zext i8 %a to i64 466 store i64 %ext, i64 addrspace(3)* %out 467 ret void 468} 469 470; FUNC-LABEL: {{^}}local_sextload_i8_to_i64: 471; SICIVI: s_mov_b32 m0 472; GFX9-NOT: m0 473 474; GCN: ds_read_i8 v[[LO:[0-9]+]], 475; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] 476 477; GCN: ds_write_b64 v{{[0-9]+}}, v[[[LO]]:[[HI]]] 478 479; EG: LDS_UBYTE_READ_RET 480; EG: ASHR 481; TODO: why not 7? 482; EG: 31 483define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { 484 %a = load i8, i8 addrspace(3)* %in 485 %ext = sext i8 %a to i64 486 store i64 %ext, i64 addrspace(3)* %out 487 ret void 488} 489 490; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i64: 491; SICIVI: s_mov_b32 m0 492; GFX9-NOT: m0 493 494; EG: LDS_UBYTE_READ_RET 495; EG: MOV {{.*}}, literal 496; TODO: merge? 497; EG: 0.0 498define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { 499 %load = load <1 x i8>, <1 x i8> addrspace(3)* %in 500 %ext = zext <1 x i8> %load to <1 x i64> 501 store <1 x i64> %ext, <1 x i64> addrspace(3)* %out 502 ret void 503} 504 505; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i64: 506; SICIVI: s_mov_b32 m0 507; GFX9-NOT: m0 508 509; EG: LDS_UBYTE_READ_RET 510; EG: ASHR 511; TODO: why not 7? 512; EG: 31 513define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { 514 %load = load <1 x i8>, <1 x i8> addrspace(3)* %in 515 %ext = sext <1 x i8> %load to <1 x i64> 516 store <1 x i64> %ext, <1 x i64> addrspace(3)* %out 517 ret void 518} 519 520; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64: 521; SICIVI: s_mov_b32 m0 522; GFX9-NOT: m0 523 524; EG: LDS_USHORT_READ_RET 525define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { 526 %load = load <2 x i8>, <2 x i8> addrspace(3)* %in 527 %ext = zext <2 x i8> %load to <2 x i64> 528 store <2 x i64> %ext, <2 x i64> addrspace(3)* %out 529 ret void 530} 531 532; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i64: 533; SICIVI: s_mov_b32 m0 534; GFX9-NOT: m0 535 536; EG: LDS_USHORT_READ_RET 537; EG: BFE_INT 538; EG: BFE_INT 539define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { 540 %load = load <2 x i8>, <2 x i8> addrspace(3)* %in 541 %ext = sext <2 x i8> %load to <2 x i64> 542 store <2 x i64> %ext, <2 x i64> addrspace(3)* %out 543 ret void 544} 545 546; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64: 547; SICIVI: s_mov_b32 m0 548; GFX9-NOT: m0 549 550; EG: LDS_READ_RET 551define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { 552 %load = load <4 x i8>, <4 x i8> addrspace(3)* %in 553 %ext = zext <4 x i8> %load to <4 x i64> 554 store <4 x i64> %ext, <4 x i64> addrspace(3)* %out 555 ret void 556} 557 558; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64: 559; SICIVI: s_mov_b32 m0 560; GFX9-NOT: m0 561 562; EG: LDS_READ_RET 563define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { 564 %load = load <4 x i8>, <4 x i8> addrspace(3)* %in 565 %ext = sext <4 x i8> %load to <4 x i64> 566 store <4 x i64> %ext, <4 x i64> addrspace(3)* %out 567 ret void 568} 569 570; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i64: 571; SICIVI: s_mov_b32 m0 572; GFX9-NOT: m0 573 574; EG: LDS_READ_RET 575; EG: LDS_READ_RET 576define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { 577 %load = load <8 x i8>, <8 x i8> addrspace(3)* %in 578 %ext = zext <8 x i8> %load to <8 x i64> 579 store <8 x i64> %ext, <8 x i64> addrspace(3)* %out 580 ret void 581} 582 583; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i64: 584; SICIVI: s_mov_b32 m0 585; GFX9-NOT: m0 586 587; EG: LDS_READ_RET 588; EG: LDS_READ_RET 589; EG-DAG: ASHR 590; EG-DAG: ASHR 591; EG-DAG: BFE_INT 592; EG-DAG: BFE_INT 593; EG-DAG: BFE_INT 594; EG-DAG: BFE_INT 595; EG-DAG: BFE_INT 596; EG-DAG: BFE_INT 597; EG-DAG: BFE_INT 598define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { 599 %load = load <8 x i8>, <8 x i8> addrspace(3)* %in 600 %ext = sext <8 x i8> %load to <8 x i64> 601 store <8 x i64> %ext, <8 x i64> addrspace(3)* %out 602 ret void 603} 604 605; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i64: 606; SICIVI: s_mov_b32 m0 607; GFX9-NOT: m0 608 609; EG: LDS_READ_RET 610; EG: LDS_READ_RET 611; EG: LDS_READ_RET 612; EG: LDS_READ_RET 613define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { 614 %load = load <16 x i8>, <16 x i8> addrspace(3)* %in 615 %ext = zext <16 x i8> %load to <16 x i64> 616 store <16 x i64> %ext, <16 x i64> addrspace(3)* %out 617 ret void 618} 619 620; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i64: 621; SICIVI: s_mov_b32 m0 622; GFX9-NOT: m0 623 624; EG: LDS_READ_RET 625; EG: LDS_READ_RET 626; EG: LDS_READ_RET 627; EG: LDS_READ_RET 628define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { 629 %load = load <16 x i8>, <16 x i8> addrspace(3)* %in 630 %ext = sext <16 x i8> %load to <16 x i64> 631 store <16 x i64> %ext, <16 x i64> addrspace(3)* %out 632 ret void 633} 634 635; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i64: 636; SICIVI: s_mov_b32 m0 637; GFX9-NOT: m0 638 639; EG: LDS_READ_RET 640; EG: LDS_READ_RET 641; EG: LDS_READ_RET 642; EG: LDS_READ_RET 643; EG: LDS_READ_RET 644; EG: LDS_READ_RET 645; EG: LDS_READ_RET 646; EG: LDS_READ_RET 647define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { 648 %load = load <32 x i8>, <32 x i8> addrspace(3)* %in 649 %ext = zext <32 x i8> %load to <32 x i64> 650 store <32 x i64> %ext, <32 x i64> addrspace(3)* %out 651 ret void 652} 653 654; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i64: 655; SICIVI: s_mov_b32 m0 656; GFX9-NOT: m0 657 658; EG: LDS_READ_RET 659; EG: LDS_READ_RET 660; EG: LDS_READ_RET 661; EG: LDS_READ_RET 662; EG: LDS_READ_RET 663; EG: LDS_READ_RET 664; EG: LDS_READ_RET 665; EG: LDS_READ_RET 666define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { 667 %load = load <32 x i8>, <32 x i8> addrspace(3)* %in 668 %ext = sext <32 x i8> %load to <32 x i64> 669 store <32 x i64> %ext, <32 x i64> addrspace(3)* %out 670 ret void 671} 672 673; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64: 674; define amdgpu_kernel void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { 675; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in 676; %ext = zext <64 x i8> %load to <64 x i64> 677; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out 678; ret void 679; } 680 681; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64: 682; define amdgpu_kernel void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { 683; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in 684; %ext = sext <64 x i8> %load to <64 x i64> 685; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out 686; ret void 687; } 688 689; FUNC-LABEL: {{^}}local_zextload_i8_to_i16: 690; SICIVI: s_mov_b32 m0 691; GFX9-NOT: m0 692; GCN: ds_read_u8 v[[VAL:[0-9]+]], 693; GCN: ds_write_b16 v[[VAL:[0-9]+]] 694 695; EG: LDS_UBYTE_READ_RET 696; EG: LDS_SHORT_WRITE 697define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { 698 %a = load i8, i8 addrspace(3)* %in 699 %ext = zext i8 %a to i16 700 store i16 %ext, i16 addrspace(3)* %out 701 ret void 702} 703 704; FUNC-LABEL: {{^}}local_sextload_i8_to_i16: 705; SICIVI: s_mov_b32 m0 706; GFX9-NOT: m0 707; GCN: ds_read_i8 v[[VAL:[0-9]+]], 708; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]] 709 710; EG: LDS_UBYTE_READ_RET 711; EG: BFE_INT 712; EG: LDS_SHORT_WRITE 713define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { 714 %a = load i8, i8 addrspace(3)* %in 715 %ext = sext i8 %a to i16 716 store i16 %ext, i16 addrspace(3)* %out 717 ret void 718} 719 720; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i16: 721; SICIVI: s_mov_b32 m0 722; GFX9-NOT: m0 723 724; EG: LDS_UBYTE_READ_RET 725; EG: LDS_SHORT_WRITE 726define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { 727 %load = load <1 x i8>, <1 x i8> addrspace(3)* %in 728 %ext = zext <1 x i8> %load to <1 x i16> 729 store <1 x i16> %ext, <1 x i16> addrspace(3)* %out 730 ret void 731} 732 733; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i16: 734; SICIVI: s_mov_b32 m0 735; GFX9-NOT: m0 736 737; EG: LDS_UBYTE_READ_RET 738; EG: BFE_INT 739; EG: LDS_SHORT_WRITE 740define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { 741 %load = load <1 x i8>, <1 x i8> addrspace(3)* %in 742 %ext = sext <1 x i8> %load to <1 x i16> 743 store <1 x i16> %ext, <1 x i16> addrspace(3)* %out 744 ret void 745} 746 747; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i16: 748; SICIVI: s_mov_b32 m0 749; GFX9-NOT: m0 750 751; EG: LDS_USHORT_READ_RET 752; EG: LDS_WRITE 753define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { 754 %load = load <2 x i8>, <2 x i8> addrspace(3)* %in 755 %ext = zext <2 x i8> %load to <2 x i16> 756 store <2 x i16> %ext, <2 x i16> addrspace(3)* %out 757 ret void 758} 759 760; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i16: 761; SICIVI: s_mov_b32 m0 762; GFX9-NOT: m0 763 764; EG: LDS_USHORT_READ_RET 765; EG: BFE_INT 766; EG: BFE_INT 767; EG: LDS_WRITE 768define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { 769 %load = load <2 x i8>, <2 x i8> addrspace(3)* %in 770 %ext = sext <2 x i8> %load to <2 x i16> 771 store <2 x i16> %ext, <2 x i16> addrspace(3)* %out 772 ret void 773} 774 775; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i16: 776; SICIVI: s_mov_b32 m0 777; GFX9-NOT: m0 778 779; EG: LDS_READ_RET 780; EG: LDS_WRITE 781; EG: LDS_WRITE 782define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { 783 %load = load <4 x i8>, <4 x i8> addrspace(3)* %in 784 %ext = zext <4 x i8> %load to <4 x i16> 785 store <4 x i16> %ext, <4 x i16> addrspace(3)* %out 786 ret void 787} 788 789; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i16: 790; SICIVI: s_mov_b32 m0 791; GFX9-NOT: m0 792 793; EG: LDS_READ_RET 794; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR 795; EG-DAG: BFE_INT 796; EG-DAG: BFE_INT 797; EG-DAG: BFE_INT 798; EG-DAG: BFE_INT 799; EG: LDS_WRITE 800; EG: LDS_WRITE 801define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { 802 %load = load <4 x i8>, <4 x i8> addrspace(3)* %in 803 %ext = sext <4 x i8> %load to <4 x i16> 804 store <4 x i16> %ext, <4 x i16> addrspace(3)* %out 805 ret void 806} 807 808; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i16: 809; SICIVI: s_mov_b32 m0 810; GFX9-NOT: m0 811 812; EG: LDS_READ_RET 813; EG: LDS_READ_RET 814; EG: LDS_WRITE 815; EG: LDS_WRITE 816; EG: LDS_WRITE 817; EG: LDS_WRITE 818define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { 819 %load = load <8 x i8>, <8 x i8> addrspace(3)* %in 820 %ext = zext <8 x i8> %load to <8 x i16> 821 store <8 x i16> %ext, <8 x i16> addrspace(3)* %out 822 ret void 823} 824 825; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i16: 826; SICIVI: s_mov_b32 m0 827; GFX9-NOT: m0 828 829; EG: LDS_READ_RET 830; EG: LDS_READ_RET 831; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR 832; EG-DAG: BFE_INT 833; EG-DAG: BFE_INT 834; EG-DAG: BFE_INT 835; EG-DAG: BFE_INT 836; EG-DAG: BFE_INT 837; EG-DAG: BFE_INT 838; EG-DAG: BFE_INT 839; EG-DAG: BFE_INT 840; EG: LDS_WRITE 841; EG: LDS_WRITE 842; EG: LDS_WRITE 843; EG: LDS_WRITE 844define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { 845 %load = load <8 x i8>, <8 x i8> addrspace(3)* %in 846 %ext = sext <8 x i8> %load to <8 x i16> 847 store <8 x i16> %ext, <8 x i16> addrspace(3)* %out 848 ret void 849} 850 851; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i16: 852; SICIVI: s_mov_b32 m0 853; GFX9-NOT: m0 854 855; EG: LDS_READ_RET 856; EG: LDS_READ_RET 857; EG: LDS_READ_RET 858; EG: LDS_READ_RET 859; EG: LDS_WRITE 860; EG: LDS_WRITE 861; EG: LDS_WRITE 862; EG: LDS_WRITE 863; EG: LDS_WRITE 864; EG: LDS_WRITE 865; EG: LDS_WRITE 866; EG: LDS_WRITE 867define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { 868 %load = load <16 x i8>, <16 x i8> addrspace(3)* %in 869 %ext = zext <16 x i8> %load to <16 x i16> 870 store <16 x i16> %ext, <16 x i16> addrspace(3)* %out 871 ret void 872} 873 874; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i16: 875; SICIVI: s_mov_b32 m0 876; GFX9-NOT: m0 877 878; EG: LDS_READ_RET 879; EG: LDS_READ_RET 880; EG: LDS_READ_RET 881; EG: LDS_READ_RET 882; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR 883; EG-DAG: BFE_INT 884; EG-DAG: BFE_INT 885; EG-DAG: BFE_INT 886; EG-DAG: BFE_INT 887; EG-DAG: BFE_INT 888; EG-DAG: BFE_INT 889; EG-DAG: BFE_INT 890; EG-DAG: BFE_INT 891; EG-DAG: BFE_INT 892; EG-DAG: BFE_INT 893; EG-DAG: BFE_INT 894; EG-DAG: BFE_INT 895; EG-DAG: BFE_INT 896; EG-DAG: BFE_INT 897; EG-DAG: BFE_INT 898; EG-DAG: BFE_INT 899; EG: LDS_WRITE 900; EG: LDS_WRITE 901; EG: LDS_WRITE 902; EG: LDS_WRITE 903; EG: LDS_WRITE 904; EG: LDS_WRITE 905; EG: LDS_WRITE 906; EG: LDS_WRITE 907define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { 908 %load = load <16 x i8>, <16 x i8> addrspace(3)* %in 909 %ext = sext <16 x i8> %load to <16 x i16> 910 store <16 x i16> %ext, <16 x i16> addrspace(3)* %out 911 ret void 912} 913 914; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i16: 915; SICIVI: s_mov_b32 m0 916; GFX9-NOT: m0 917 918; EG: LDS_READ_RET 919; EG: LDS_READ_RET 920; EG: LDS_READ_RET 921; EG: LDS_READ_RET 922; EG: LDS_READ_RET 923; EG: LDS_READ_RET 924; EG: LDS_READ_RET 925; EG: LDS_READ_RET 926; EG: LDS_WRITE 927; EG: LDS_WRITE 928; EG: LDS_WRITE 929; EG: LDS_WRITE 930; EG: LDS_WRITE 931; EG: LDS_WRITE 932; EG: LDS_WRITE 933; EG: LDS_WRITE 934; EG: LDS_WRITE 935; EG: LDS_WRITE 936; EG: LDS_WRITE 937; EG: LDS_WRITE 938; EG: LDS_WRITE 939; EG: LDS_WRITE 940; EG: LDS_WRITE 941; EG: LDS_WRITE 942define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { 943 %load = load <32 x i8>, <32 x i8> addrspace(3)* %in 944 %ext = zext <32 x i8> %load to <32 x i16> 945 store <32 x i16> %ext, <32 x i16> addrspace(3)* %out 946 ret void 947} 948 949; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i16: 950; SICIVI: s_mov_b32 m0 951; GFX9-NOT: m0 952 953; EG: LDS_READ_RET 954; EG: LDS_READ_RET 955; EG: LDS_READ_RET 956; EG: LDS_READ_RET 957; EG: LDS_READ_RET 958; EG: LDS_READ_RET 959; EG: LDS_READ_RET 960; EG: LDS_READ_RET 961; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR 962; EG-DAG: BFE_INT 963; EG-DAG: BFE_INT 964; EG-DAG: BFE_INT 965; EG-DAG: BFE_INT 966; EG-DAG: BFE_INT 967; EG-DAG: BFE_INT 968; EG-DAG: BFE_INT 969; EG-DAG: BFE_INT 970; EG-DAG: BFE_INT 971; EG-DAG: BFE_INT 972; EG-DAG: BFE_INT 973; EG-DAG: BFE_INT 974; EG-DAG: BFE_INT 975; EG-DAG: BFE_INT 976; EG-DAG: BFE_INT 977; EG-DAG: BFE_INT 978; EG-DAG: BFE_INT 979; EG-DAG: BFE_INT 980; EG-DAG: BFE_INT 981; EG-DAG: BFE_INT 982; EG-DAG: BFE_INT 983; EG-DAG: BFE_INT 984; EG-DAG: BFE_INT 985; EG-DAG: BFE_INT 986; EG-DAG: BFE_INT 987; EG-DAG: BFE_INT 988; EG-DAG: BFE_INT 989; EG-DAG: BFE_INT 990; EG: LDS_WRITE 991; EG: LDS_WRITE 992; EG: LDS_WRITE 993; EG: LDS_WRITE 994; EG: LDS_WRITE 995; EG: LDS_WRITE 996; EG: LDS_WRITE 997; EG: LDS_WRITE 998; EG: LDS_WRITE 999; EG: LDS_WRITE 1000; EG: LDS_WRITE 1001; EG: LDS_WRITE 1002; EG: LDS_WRITE 1003; EG: LDS_WRITE 1004; EG: LDS_WRITE 1005; EG: LDS_WRITE 1006define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { 1007 %load = load <32 x i8>, <32 x i8> addrspace(3)* %in 1008 %ext = sext <32 x i8> %load to <32 x i16> 1009 store <32 x i16> %ext, <32 x i16> addrspace(3)* %out 1010 ret void 1011} 1012 1013; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16: 1014; define amdgpu_kernel void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { 1015; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in 1016; %ext = zext <64 x i8> %load to <64 x i16> 1017; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out 1018; ret void 1019; } 1020 1021; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16: 1022; define amdgpu_kernel void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { 1023; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in 1024; %ext = sext <64 x i8> %load to <64 x i16> 1025; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out 1026; ret void 1027; } 1028 1029; Tests if ds_read/write_b128 gets generated for the 16 byte aligned load. 1030; FUNC-LABEL: {{^}}local_v16i8_to_128: 1031 1032; SI-NOT: ds_read_b128 1033; SI-NOT: ds_write_b128 1034 1035; CIVI: ds_read_b128 1036; CIVI: ds_write_b128 1037 1038; EG: LDS_READ_RET 1039; EG: LDS_READ_RET 1040; EG: LDS_READ_RET 1041; EG: LDS_READ_RET 1042define amdgpu_kernel void @local_v16i8_to_128(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) { 1043 %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 16 1044 store <16 x i8> %ld, <16 x i8> addrspace(3)* %out, align 16 1045 ret void 1046} 1047 1048attributes #0 = { nounwind } 1049