; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,MUBUF,ALIGNED %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,MUBUF,UNALIGNED %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,MUBUF,ALIGNED %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FLATSCR,ALIGNED %s
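
; All four runs share the GCN and SI checks. Only the bonaire run enables
; unaligned-access-mode and takes the UNALIGNED checks; the other runs take
; the ALIGNED checks. The first three runs access private memory through
; MUBUF buffer instructions, while the gfx900 run uses flat-scratch
; (FLATSCR) instructions.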

; SI-LABEL: {{^}}local_unaligned_load_store_i16:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define amdgpu_kernel void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
  %v = load i16, i16 addrspace(3)* %p, align 1
  store i16 %v, i16 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_unaligned_load_store_i16:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_ushort
; UNALIGNED: buffer_store_short
; SI: s_endpgm
define amdgpu_kernel void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
  %v = load i16, i16 addrspace(1)* %p, align 1
  store i16 %v, i16 addrspace(1)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}local_unaligned_load_store_i32:

; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI-NOT: v_or
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define amdgpu_kernel void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
  %v = load i32, i32 addrspace(3)* %p, align 1
  store i32 %v, i32 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_unaligned_load_store_i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_dword
; UNALIGNED: buffer_store_dword
define amdgpu_kernel void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(1)* %p, align 1
  store i32 %v, i32 addrspace(1)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_align2_load_store_i32:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short

; UNALIGNED: buffer_load_dword
; UNALIGNED: buffer_store_dword
define amdgpu_kernel void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(1)* %p, align 2
  store i32 %v, i32 addrspace(1)* %r, align 2
  ret void
}

; GCN-LABEL: {{^}}local_align2_load_store_i32:
; GCN: ds_read_u16
; GCN: ds_read_u16
; GCN: ds_write_b16
; GCN: ds_write_b16
define amdgpu_kernel void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
  %v = load i32, i32 addrspace(3)* %p, align 2
  store i32 %v, i32 addrspace(3)* %r, align 2
  ret void
}

; SI-LABEL: {{^}}local_unaligned_load_store_i64:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
define amdgpu_kernel void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
  %v = load i64, i64 addrspace(3)* %p, align 1
  store i64 %v, i64 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}local_unaligned_load_store_v2i32:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
define amdgpu_kernel void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
  %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
  store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_align2_load_store_i64:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort

; ALIGNED-NOT: v_or_
; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_load_ushort

; ALIGNED-NOT: v_or_
; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_load_ushort

; ALIGNED-NOT: v_or_
; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short

; UNALIGNED: buffer_load_dwordx2
; UNALIGNED: buffer_store_dwordx2
define amdgpu_kernel void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(1)* %p, align 2
  store i64 %v, i64 addrspace(1)* %r, align 2
  ret void
}

; SI-LABEL: {{^}}unaligned_load_store_i64_global:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED-NOT: v_or_
; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_dwordx2
; UNALIGNED: buffer_store_dwordx2
define amdgpu_kernel void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(1)* %p, align 1
  store i64 %v, i64 addrspace(1)* %r, align 1
  ret void
}

; GCN-LABEL: {{^}}local_unaligned_load_store_v4i32:
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8

; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8

; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8

; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8

; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8

; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8

; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8

; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: s_endpgm
define amdgpu_kernel void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_unaligned_load_store_v4i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_dwordx4
; UNALIGNED: buffer_store_dwordx4
define amdgpu_kernel void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
  ret void
}
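
; An align 4 64-bit LDS access is split into two naturally aligned dword
; accesses, which are then merged into a single ds_read2_b32/ds_write2_b32
; carrying two independent offsets.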
; GCN-LABEL: {{^}}local_load_i64_align_4:
; GCN: ds_read2_b32
define amdgpu_kernel void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %val = load i64, i64 addrspace(3)* %in, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}local_load_i64_align_4_with_offset:
; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
define amdgpu_kernel void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
  %val = load i64, i64 addrspace(3)* %ptr, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}
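
; The ds_read2/ds_write2 offset fields are 8 bits wide and scaled by the
; dword size, so they reach at most 255 dwords from the base address. In
; the split-offset tests below the low dword lands on offset 255 but the
; high dword would need offset 256, which does not fit; the base pointer is
; adjusted instead and both halves use small offsets (hence offset1:1).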
; GCN-LABEL: {{^}}local_load_i64_align_4_with_split_offset:
; Tests the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits.
; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
; GCN: s_endpgm
define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  %val = load i64, i64 addrspace(3)* %ptri64, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}local_load_i64_align_1:
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: store_dwordx2
define amdgpu_kernel void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %val = load i64, i64 addrspace(3)* %in, align 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}local_store_i64_align_4:
; GCN: ds_write2_b32
define amdgpu_kernel void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
  store i64 %val, i64 addrspace(3)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}local_store_i64_align_4_with_offset:
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
; GCN: s_endpgm
define amdgpu_kernel void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
  store i64 0, i64 addrspace(3)* %ptr, align 4
  ret void
}

; GCN-LABEL: {{^}}local_store_i64_align_4_with_split_offset:
; Tests the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits.
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; GCN: s_endpgm
define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
  %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  store i64 0, i64 addrspace(3)* %ptri64, align 4
  ret void
}
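
; Scalar (SMEM) loads require dword alignment, so on the ALIGNED runs an
; underaligned constant-address-space load is broken into byte/short buffer
; loads; the align 4 cases below can stay on the scalar unit.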
; SI-LABEL: {{^}}constant_unaligned_load_i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; UNALIGNED: s_load_dword

; SI: buffer_store_dword
define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(4)* %p, align 1
  store i32 %v, i32 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align2_load_i32:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort

; UNALIGNED: s_load_dword
; UNALIGNED: buffer_store_dword
define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(4)* %p, align 2
  store i32 %v, i32 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align2_load_i64:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort

; UNALIGNED: s_load_dwordx4
; UNALIGNED: buffer_store_dwordx2
define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(4)* %p, align 2
  store i64 %v, i64 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align4_load_i64:
; SI: s_load_dwordx2
; SI: buffer_store_dwordx2
define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(4)* %p, align 4
  store i64 %v, i64 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align4_load_v4i32:
; SI: s_load_dwordx4
; SI: buffer_store_dwordx4
define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 4
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_unaligned_load_v2i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; UNALIGNED: buffer_load_dwordx2

; SI: buffer_store_dwordx2
define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(4)* %p, <2 x i32> addrspace(1)* %r) #0 {
  %v = load <2 x i32>, <2 x i32> addrspace(4)* %p, align 1
  store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_unaligned_load_v4i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; UNALIGNED: buffer_load_dwordx4

; SI: buffer_store_dwordx4
define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align4_load_i8:
; SI: s_load_dword
; SI: buffer_store_byte
define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 {
  %v = load i8, i8 addrspace(4)* %p, align 4
  store i8 %v, i8 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align2_load_i8:
; SI: buffer_load_ubyte
; SI: buffer_store_byte
define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 {
  %v = load i8, i8 addrspace(4)* %p, align 2
  store i8 %v, i8 addrspace(1)* %r, align 2
  ret void
}

; SI-LABEL: {{^}}constant_align4_merge_load_2_i32:
; SI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
; SI: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
  %gep0 = getelementptr i32, i32 addrspace(4)* %p, i64 1
  %v0 = load i32, i32 addrspace(4)* %p, align 4
  %v1 = load i32, i32 addrspace(4)* %gep0, align 4

  %gep1 = getelementptr i32, i32 addrspace(1)* %r, i64 1
  store i32 %v0, i32 addrspace(1)* %r, align 4
  store i32 %v1, i32 addrspace(1)* %gep1, align 4
  ret void
}

; SI-LABEL: {{^}}local_load_align1_v16i8:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ScratchSize: 0{{$}}
define amdgpu_kernel void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1
  store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
  ret void
}

; SI-LABEL: {{^}}local_store_align1_v16i8:
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8

; SI: ScratchSize: 0{{$}}
define amdgpu_kernel void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
  store <16 x i8> zeroinitializer, <16 x i8> addrspace(3)* %out, align 1
  ret void
}
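
; For private (scratch) accesses, the MUBUF configurations split
; underaligned loads and stores into naturally aligned pieces, while the
; flat-scratch configuration emits full-width scratch instructions even at
; align 1.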
; SI-LABEL: {{^}}private_load_align1_f64:
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; FLATSCR: scratch_load_dwordx2
define double @private_load_align1_f64(double addrspace(5)* %in) {
  %x = load double, double addrspace(5)* %in, align 1
  ret double %x
}

; SI-LABEL: {{^}}private_store_align1_f64:
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; FLATSCR: scratch_store_dwordx2
define void @private_store_align1_f64(double addrspace(5)* %out, double %x) #0 {
  store double %x, double addrspace(5)* %out, align 1
  ret void
}

; SI-LABEL: {{^}}private_load_align4_f64:
; MUBUF: buffer_load_dword
; MUBUF: buffer_load_dword
; FLATSCR: scratch_load_dwordx2
define double @private_load_align4_f64(double addrspace(5)* %in) {
  %x = load double, double addrspace(5)* %in, align 4
  ret double %x
}

; SI-LABEL: {{^}}private_store_align4_f64:
; MUBUF: buffer_store_dword
; MUBUF: buffer_store_dword
; FLATSCR: scratch_store_dwordx2
define void @private_store_align4_f64(double addrspace(5)* %out, double %x) #0 {
  store double %x, double addrspace(5)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}private_load_align2_f64:
; MUBUF: buffer_load_ushort
; MUBUF: buffer_load_ushort
; MUBUF: buffer_load_ushort
; MUBUF: buffer_load_ushort
; FLATSCR: scratch_load_dwordx2
define double @private_load_align2_f64(double addrspace(5)* %in) {
  %x = load double, double addrspace(5)* %in, align 2
  ret double %x
}

; SI-LABEL: {{^}}private_store_align2_f64:
; MUBUF: buffer_store_short
; MUBUF: buffer_store_short
; MUBUF: buffer_store_short
; MUBUF: buffer_store_short
; FLATSCR: scratch_store_dwordx2
define void @private_store_align2_f64(double addrspace(5)* %out, double %x) #0 {
  store double %x, double addrspace(5)* %out, align 2
  ret void
}

; Should not merge this to a dword store
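; (the pair is only guaranteed 2-byte alignment, so a combined dword access
; would itself be underaligned)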
define amdgpu_kernel void @global_store_2xi16_align2(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
  %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
  %v = load i16, i16 addrspace(1)* %p, align 2
  store i16 1, i16 addrspace(1)* %r, align 2
  store i16 2, i16 addrspace(1)* %gep.r, align 2
  ret void
}

; Should not merge this to a dword load
define i32 @load_2xi16_align2(i16 addrspace(1)* %p) #0 {
  %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
  %p.0 = load i16, i16 addrspace(1)* %p, align 2
  %p.1 = load i16, i16 addrspace(1)* %gep.p, align 2
  %zext.0 = zext i16 %p.0 to i32
  %zext.1 = zext i16 %p.1 to i32
  %shl.1 = shl i32 %zext.1, 16
  %or = or i32 %zext.0, %shl.1
  ret i32 %or
}

attributes #0 = { nounwind }