1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG 3; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL 4; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED 5; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED 6 7define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) { 8; GCN-LABEL: ds1align1: 9; GCN: ; %bb.0: 10; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 11; GCN-NEXT: s_waitcnt lgkmcnt(0) 12; GCN-NEXT: v_mov_b32_e32 v0, s0 13; GCN-NEXT: ds_read_u8 v0, v0 14; GCN-NEXT: v_mov_b32_e32 v1, s1 15; GCN-NEXT: s_waitcnt lgkmcnt(0) 16; GCN-NEXT: ds_write_b8 v1, v0 17; GCN-NEXT: s_endpgm 18 %val = load i8, i8 addrspace(3)* %in, align 1 19 store i8 %val, i8 addrspace(3)* %out, align 1 20 ret void 21} 22 23define amdgpu_kernel void @ds2align1(i16 addrspace(3)* %in, i16 addrspace(3)* %out) { 24; ALIGNED-SDAG-LABEL: ds2align1: 25; ALIGNED-SDAG: ; %bb.0: 26; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 27; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 28; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 29; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 30; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:1 31; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1 32; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 33; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v1 34; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 35; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v0 offset:1 36; ALIGNED-SDAG-NEXT: s_endpgm 37; 38; ALIGNED-GISEL-LABEL: ds2align1: 39; ALIGNED-GISEL: ; %bb.0: 40; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 41; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 42; 
ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 43; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 44; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:1 45; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1 46; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 47; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 8, v1 48; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 49; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v0 50; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v1 offset:1 51; ALIGNED-GISEL-NEXT: s_endpgm 52; 53; UNALIGNED-LABEL: ds2align1: 54; UNALIGNED: ; %bb.0: 55; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 56; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 57; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 58; UNALIGNED-NEXT: ds_read_u16 v0, v0 59; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 60; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 61; UNALIGNED-NEXT: ds_write_b16 v1, v0 62; UNALIGNED-NEXT: s_endpgm 63 %val = load i16, i16 addrspace(3)* %in, align 1 64 store i16 %val, i16 addrspace(3)* %out, align 1 65 ret void 66} 67 68define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) { 69; GCN-LABEL: ds2align2: 70; GCN: ; %bb.0: 71; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 72; GCN-NEXT: s_waitcnt lgkmcnt(0) 73; GCN-NEXT: v_mov_b32_e32 v0, s0 74; GCN-NEXT: ds_read_u16 v0, v0 75; GCN-NEXT: v_mov_b32_e32 v1, s1 76; GCN-NEXT: s_waitcnt lgkmcnt(0) 77; GCN-NEXT: ds_write_b16 v1, v0 78; GCN-NEXT: s_endpgm 79 %val = load i16, i16 addrspace(3)* %in, align 2 80 store i16 %val, i16 addrspace(3)* %out, align 2 81 ret void 82} 83 84define amdgpu_kernel void @ds4align1(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 85; ALIGNED-SDAG-LABEL: ds4align1: 86; ALIGNED-SDAG: ; %bb.0: 87; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 88; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 89; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 90; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 91; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 92; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 93; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:3 94; 
ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 95; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 96; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v1 97; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 98; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v2 offset:1 99; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 100; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v3 offset:2 101; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 102; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v0 offset:3 103; ALIGNED-SDAG-NEXT: s_endpgm 104; 105; ALIGNED-GISEL-LABEL: ds4align1: 106; ALIGNED-GISEL: ; %bb.0: 107; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 108; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 109; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 110; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 111; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 112; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:3 113; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:2 114; ALIGNED-GISEL-NEXT: s_mov_b32 s0, 8 115; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 116; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 117; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 118; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 119; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3 120; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 121; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 122; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 123; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 124; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 125; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:1 126; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 127; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:2 128; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:3 129; ALIGNED-GISEL-NEXT: s_endpgm 130; 131; UNALIGNED-LABEL: ds4align1: 132; UNALIGNED: ; %bb.0: 133; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 134; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 135; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 136; UNALIGNED-NEXT: ds_read_b32 v0, v0 137; UNALIGNED-NEXT: 
v_mov_b32_e32 v1, s1 138; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 139; UNALIGNED-NEXT: ds_write_b32 v1, v0 140; UNALIGNED-NEXT: s_endpgm 141 %val = load i32, i32 addrspace(3)* %in, align 1 142 store i32 %val, i32 addrspace(3)* %out, align 1 143 ret void 144} 145 146define amdgpu_kernel void @ds4align2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 147; ALIGNED-SDAG-LABEL: ds4align2: 148; ALIGNED-SDAG: ; %bb.0: 149; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 150; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 151; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 152; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 153; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:2 154; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1 155; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 156; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v1 157; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 158; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v0 offset:2 159; ALIGNED-SDAG-NEXT: s_endpgm 160; 161; ALIGNED-GISEL-LABEL: ds4align2: 162; ALIGNED-GISEL: ; %bb.0: 163; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 164; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 165; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 166; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 167; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:2 168; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1 169; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 170; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 171; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0 172; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v2, v0 offset:2 173; ALIGNED-GISEL-NEXT: s_endpgm 174; 175; UNALIGNED-LABEL: ds4align2: 176; UNALIGNED: ; %bb.0: 177; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 178; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 179; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 180; UNALIGNED-NEXT: ds_read_b32 v0, v0 181; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 182; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 183; UNALIGNED-NEXT: ds_write_b32 v1, v0 184; UNALIGNED-NEXT: s_endpgm 185 %val = load i32, i32 addrspace(3)* %in, align 2 186 store i32 
%val, i32 addrspace(3)* %out, align 2 187 ret void 188} 189 190define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 191; GCN-LABEL: ds4align4: 192; GCN: ; %bb.0: 193; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 194; GCN-NEXT: s_waitcnt lgkmcnt(0) 195; GCN-NEXT: v_mov_b32_e32 v0, s0 196; GCN-NEXT: ds_read_b32 v0, v0 197; GCN-NEXT: v_mov_b32_e32 v1, s1 198; GCN-NEXT: s_waitcnt lgkmcnt(0) 199; GCN-NEXT: ds_write_b32 v1, v0 200; GCN-NEXT: s_endpgm 201 %val = load i32, i32 addrspace(3)* %in, align 4 202 store i32 %val, i32 addrspace(3)* %out, align 4 203 ret void 204} 205 206define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 207; ALIGNED-SDAG-LABEL: ds8align1: 208; ALIGNED-SDAG: ; %bb.0: 209; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 210; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 211; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 212; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 213; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:1 214; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:2 215; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:3 216; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:4 217; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:5 218; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6 219; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7 220; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 221; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 222; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v4 offset:2 223; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 224; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v5 offset:3 225; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v2 226; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v3 offset:1 227; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 228; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v8 offset:6 229; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 230; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v0 offset:7 231; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v6 offset:4 232; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v7 offset:5 233; ALIGNED-SDAG-NEXT: s_endpgm 234; 235; 
ALIGNED-GISEL-LABEL: ds8align1: 236; ALIGNED-GISEL: ; %bb.0: 237; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 238; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 8 239; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 240; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 241; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 242; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 243; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 244; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 245; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 246; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 247; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 248; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:7 249; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 250; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 251; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 252; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 253; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 254; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 255; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 256; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 257; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 258; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 259; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7 260; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v2 261; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v1 262; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 263; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 264; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:1 265; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 266; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v1 offset:2 267; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:3 268; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 269; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v0 offset:4 270; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:5 271; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 272; 
ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v0 offset:6 273; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:7 274; ALIGNED-GISEL-NEXT: s_endpgm 275; 276; UNALIGNED-LABEL: ds8align1: 277; UNALIGNED: ; %bb.0: 278; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 279; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 280; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 281; UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 282; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1 283; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 284; UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 285; UNALIGNED-NEXT: s_endpgm 286 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1 287 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1 288 ret void 289} 290 291define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 292; ALIGNED-SDAG-LABEL: ds8align2: 293; ALIGNED-SDAG: ; %bb.0: 294; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 295; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 296; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 297; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:2 298; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 299; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:6 300; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:4 301; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 302; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 303; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:2 304; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 305; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2 306; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 307; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:6 308; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 309; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:4 310; ALIGNED-SDAG-NEXT: s_endpgm 311; 312; ALIGNED-GISEL-LABEL: ds8align2: 313; ALIGNED-GISEL: ; %bb.0: 314; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 315; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 316; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 317; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 318; ALIGNED-GISEL-NEXT: 
ds_read_u16 v2, v0 offset:2 319; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 320; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:6 321; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 322; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 323; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 324; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 325; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 326; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1 327; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2 328; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:4 329; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:6 330; ALIGNED-GISEL-NEXT: s_endpgm 331; 332; UNALIGNED-LABEL: ds8align2: 333; UNALIGNED: ; %bb.0: 334; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 335; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 336; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 337; UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 338; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1 339; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 340; UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 341; UNALIGNED-NEXT: s_endpgm 342 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2 343 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2 344 ret void 345} 346 347define amdgpu_kernel void @ds8align4(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 348; GCN-LABEL: ds8align4: 349; GCN: ; %bb.0: 350; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 351; GCN-NEXT: s_waitcnt lgkmcnt(0) 352; GCN-NEXT: v_mov_b32_e32 v0, s0 353; GCN-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 354; GCN-NEXT: v_mov_b32_e32 v2, s1 355; GCN-NEXT: s_waitcnt lgkmcnt(0) 356; GCN-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 357; GCN-NEXT: s_endpgm 358 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 359 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 4 360 ret void 361} 362 363define amdgpu_kernel void @ds8align8(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 364; GCN-LABEL: ds8align8: 365; GCN: ; %bb.0: 366; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 367; GCN-NEXT: s_waitcnt lgkmcnt(0) 368; GCN-NEXT: v_mov_b32_e32 v0, s0 369; GCN-NEXT: ds_read_b64 v[0:1], v0 370; GCN-NEXT: v_mov_b32_e32 v2, s1 371; GCN-NEXT: s_waitcnt lgkmcnt(0) 372; GCN-NEXT: ds_write_b64 v2, v[0:1] 373; GCN-NEXT: s_endpgm 374 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 8 375 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 8 376 ret void 377} 378 379define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 380; ALIGNED-SDAG-LABEL: ds12align1: 381; ALIGNED-SDAG: ; %bb.0: 382; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 383; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 384; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 385; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 386; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 387; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 388; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 389; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 390; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 391; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 392; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 393; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 394; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 395; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 396; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11 397; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1 398; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 399; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 400; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 401; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 402; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 403; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 404; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 405; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 406; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8 407; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v10 offset:9 408; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v1 409; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v2 offset:1 
410; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 offset:2 411; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v4 offset:3 412; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v7 offset:6 413; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v8 offset:7 414; ALIGNED-SDAG-NEXT: s_endpgm 415; 416; ALIGNED-GISEL-LABEL: ds12align1: 417; ALIGNED-GISEL: ; %bb.0: 418; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 419; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 8 420; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 421; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 422; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 423; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 424; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 425; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 426; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 427; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 428; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 429; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7 430; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 431; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 432; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 433; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 434; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 435; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 436; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 437; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 438; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8 439; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9 440; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10 441; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:11 442; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 443; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v8 444; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 445; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 446; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3 447; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 448; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5 449; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 450; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 451; 
ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2 452; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v4, v3 453; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 454; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 455; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 456; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:1 457; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v3, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 458; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v1 offset:2 459; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:3 460; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 461; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:4 462; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:5 463; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 464; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v2 offset:6 465; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:7 466; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 467; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 offset:8 468; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:9 469; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 470; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:10 471; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:11 472; ALIGNED-GISEL-NEXT: s_endpgm 473; 474; UNALIGNED-LABEL: ds12align1: 475; UNALIGNED: ; %bb.0: 476; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 477; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 478; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 479; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 480; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 481; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 482; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 483; UNALIGNED-NEXT: s_endpgm 484 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 1 485 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 1 486 ret void 487} 488 489define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> 
addrspace(3)* %out) { 490; ALIGNED-SDAG-LABEL: ds12align2: 491; ALIGNED-SDAG: ; %bb.0: 492; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 493; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 494; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 495; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 496; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 offset:2 497; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:4 498; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:10 499; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:8 500; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6 501; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1 502; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 503; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:4 504; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 505; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:10 506; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 507; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:8 508; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 509; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 offset:2 510; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 511; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:6 512; ALIGNED-SDAG-NEXT: s_endpgm 513; 514; ALIGNED-GISEL-LABEL: ds12align2: 515; ALIGNED-GISEL: ; %bb.0: 516; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 517; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 518; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 519; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 520; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2 521; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 522; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6 523; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8 524; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:10 525; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s1 526; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 527; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 528; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 529; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 530; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 531; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v5 532; 
ALIGNED-GISEL-NEXT: ds_write_b16 v6, v1 533; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v1 offset:2 534; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v2 offset:4 535; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v2 offset:6 536; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v0 offset:8 537; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v0 offset:10 538; ALIGNED-GISEL-NEXT: s_endpgm 539; 540; UNALIGNED-LABEL: ds12align2: 541; UNALIGNED: ; %bb.0: 542; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 543; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 544; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 545; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 546; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 547; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 548; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 549; UNALIGNED-NEXT: s_endpgm 550 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 2 551 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 2 552 ret void 553} 554 555define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 556; ALIGNED-LABEL: ds12align4: 557; ALIGNED: ; %bb.0: 558; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 559; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 560; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 561; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 562; ALIGNED-NEXT: ds_read_b32 v2, v2 offset:8 563; ALIGNED-NEXT: v_mov_b32_e32 v3, s1 564; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) 565; ALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 566; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) 567; ALIGNED-NEXT: ds_write_b32 v3, v2 offset:8 568; ALIGNED-NEXT: s_endpgm 569; 570; UNALIGNED-LABEL: ds12align4: 571; UNALIGNED: ; %bb.0: 572; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 573; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 574; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 575; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 576; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 577; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 578; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 579; UNALIGNED-NEXT: s_endpgm 580 %val = load <3 x 
i32>, <3 x i32> addrspace(3)* %in, align 4 581 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4 582 ret void 583} 584 585; TODO: Why does the ALIGNED-GISEL code use ds_read2_b32/ds_write2_b32 where the ALIGNED-SDAG code uses ds_read_b64/ds_write_b64? 586define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 587; ALIGNED-SDAG-LABEL: ds12align8: 588; ALIGNED-SDAG: ; %bb.0: 589; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 590; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 591; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 592; ALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v2 593; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8 594; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1 595; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 596; ALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1] 597; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 598; ALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8 599; ALIGNED-SDAG-NEXT: s_endpgm 600; 601; ALIGNED-GISEL-LABEL: ds12align8: 602; ALIGNED-GISEL: ; %bb.0: 603; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 604; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 605; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 606; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 607; ALIGNED-GISEL-NEXT: ds_read_b32 v2, v2 offset:8 608; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 609; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 610; ALIGNED-GISEL-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 611; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 612; ALIGNED-GISEL-NEXT: ds_write_b32 v3, v2 offset:8 613; ALIGNED-GISEL-NEXT: s_endpgm 614; 615; UNALIGNED-LABEL: ds12align8: 616; UNALIGNED: ; %bb.0: 617; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 618; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 619; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 620; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 621; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 622; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 623; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 624; UNALIGNED-NEXT: s_endpgm 625 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8 626 store 
<3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8 627 ret void 628} 629 630define amdgpu_kernel void @ds12align16(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 631; GCN-LABEL: ds12align16: 632; GCN: ; %bb.0: 633; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 634; GCN-NEXT: s_waitcnt lgkmcnt(0) 635; GCN-NEXT: v_mov_b32_e32 v0, s0 636; GCN-NEXT: ds_read_b96 v[0:2], v0 637; GCN-NEXT: v_mov_b32_e32 v3, s1 638; GCN-NEXT: s_waitcnt lgkmcnt(0) 639; GCN-NEXT: ds_write_b96 v3, v[0:2] 640; GCN-NEXT: s_endpgm 641 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 16 642 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 16 643 ret void 644} 645 646define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { 647; ALIGNED-SDAG-LABEL: ds16align1: 648; ALIGNED-SDAG: ; %bb.0: 649; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 650; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 651; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 652; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 653; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 654; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 655; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 656; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 657; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 658; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 659; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 660; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 661; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 662; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 663; ALIGNED-SDAG-NEXT: ds_read_u8 v12, v0 offset:11 664; ALIGNED-SDAG-NEXT: ds_read_u8 v13, v0 offset:12 665; ALIGNED-SDAG-NEXT: ds_read_u8 v14, v0 offset:13 666; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14 667; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15 668; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1 669; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 670; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 671; ALIGNED-SDAG-NEXT: s_waitcnt 
lgkmcnt(13) 672; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 673; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v1 674; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v2 offset:1 675; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 676; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 677; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 678; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 679; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v5 offset:4 680; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v6 offset:5 681; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 682; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 683; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 684; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 685; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v9 offset:8 686; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v10 offset:9 687; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 688; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v15 offset:14 689; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 690; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v0 offset:15 691; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12 692; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 693; ALIGNED-SDAG-NEXT: s_endpgm 694; 695; ALIGNED-GISEL-LABEL: ds16align1: 696; ALIGNED-GISEL: ; %bb.0: 697; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 698; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 8 699; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 700; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 701; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 702; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 703; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 704; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 705; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 706; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 707; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 708; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7 709; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 710; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 711; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 712; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 713; 
ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 714; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 715; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 716; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 717; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 718; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v8 719; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v7 720; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2 721; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8 722; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9 723; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10 724; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:11 725; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:12 726; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:13 727; ALIGNED-GISEL-NEXT: ds_read_u8 v9, v0 offset:14 728; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:15 729; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 730; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3 731; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 732; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v6 733; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 734; ALIGNED-GISEL-NEXT: v_or3_b32 v3, v4, v5, v3 735; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 736; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v4, v8, 8, v7 737; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 738; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 739; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v9 740; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v5, v4 741; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 742; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1 743; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 744; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:1 745; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v4, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 746; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v1 offset:2 747; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:3 748; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 749; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v2 offset:4 750; ALIGNED-GISEL-NEXT: 
ds_write_b8 v5, v1 offset:5 751; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 752; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v2 offset:6 753; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:7 754; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v3 755; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v3 offset:8 756; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:9 757; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 758; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v3 offset:10 759; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:11 760; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 761; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 offset:12 762; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:13 763; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, 8 764; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 765; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:14 766; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:15 767; ALIGNED-GISEL-NEXT: s_endpgm 768; 769; UNALIGNED-LABEL: ds16align1: 770; UNALIGNED: ; %bb.0: 771; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 772; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 773; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 774; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 775; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1 776; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 777; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 778; UNALIGNED-NEXT: s_endpgm 779 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1 780 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1 781 ret void 782} 783

; 16-byte vector copy between addrspace(3) pointers, with 2-byte alignment on
; both the load and the store. With unaligned-access-mode disabled the expected
; output uses eight 16-bit DS reads and eight 16-bit DS writes; with it enabled
; a single ds_read2_b64 / ds_write2_b64 pair is expected.
define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds16align2:
; ALIGNED-SDAG: ; %bb.0:
; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4
; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6
; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8
; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10
; ALIGNED-SDAG-NEXT: ds_read_u16 v8, v0 offset:12
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14
; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6)
; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v3 offset:2
; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v2
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6)
; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v5 offset:6
; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v4 offset:4
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6)
; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v7 offset:10
; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v6 offset:8
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6)
; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v0 offset:14
; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v8 offset:12
; ALIGNED-SDAG-NEXT: s_endpgm
;
; ALIGNED-GISEL-LABEL: ds16align2:
; ALIGNED-GISEL: ; %bb.0:
; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6
; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8
; ALIGNED-GISEL-NEXT: ds_read_u16 v6, v0 offset:10
; ALIGNED-GISEL-NEXT: ds_read_u16 v7, v0 offset:12
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:14
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v6, 16, v5
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v7
; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1
; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2
; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v2 offset:4
; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v2 offset:6
; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v3 offset:8
; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v3 offset:10
; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:12
; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:14
; ALIGNED-GISEL-NEXT: s_endpgm
;
; UNALIGNED-LABEL: ds16align2:
; UNALIGNED: ; %bb.0:
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT: s_endpgm
  %vec = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
  store <4 x i32> %vec, <4 x i32> addrspace(3)* %out, align 2
  ret void
}

; 16-byte vector copy between addrspace(3) pointers, with 4-byte alignment on
; both the load and the store. With unaligned-access-mode disabled the expected
; output uses two ds_read2_b32 and two ds_write2_b32 instructions; with it
; enabled a single ds_read2_b64 / ds_write2_b64 pair is expected.
define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; ALIGNED-LABEL: ds16align4:
; ALIGNED: ; %bb.0:
; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-NEXT: v_mov_b32_e32 v2, s0
; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; ALIGNED-NEXT: v_mov_b32_e32 v4, s1
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT: ds_write2_b32 v4, v0, v1 offset1:1
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
; ALIGNED-NEXT: s_endpgm
;
; UNALIGNED-LABEL: ds16align4:
; UNALIGNED: ; %bb.0:
; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT: s_endpgm
  %vec = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
  store <4 x i32> %vec, <4 x i32> addrspace(3)* %out, align 4
  ret void
}

; 16-byte vector copy between addrspace(3) pointers, with 8-byte alignment on
; both the load and the store. Every llc configuration in this file shares one
; expected output here, namely one ds_read2_b64 and one ds_write2_b64.
define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds16align8:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GCN-NEXT: s_endpgm
  %vec = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8
  store <4 x i32> %vec, <4 x i32> addrspace(3)* %out, align 8
  ret void
}

; 16-byte vector copy between addrspace(3) pointers, with 16-byte alignment on
; both the load and the store. Every llc configuration in this file shares one
; expected output here, namely one ds_read_b128 and one ds_write_b128.
define amdgpu_kernel void @ds16align16(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds16align16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: ds_read_b128 v[0:3], v0
; GCN-NEXT: v_mov_b32_e32 v4, s1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b128 v4, v[0:3]
; GCN-NEXT: s_endpgm
  %vec = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
  store <4 x i32> %vec, <4 x i32> addrspace(3)* %out, align 16
  ret void
}