1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG 3; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL 4; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-SDAG 5; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-GISEL 6 7define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) { 8; GCN-LABEL: ds1align1: 9; GCN: ; %bb.0: 10; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 11; GCN-NEXT: s_waitcnt lgkmcnt(0) 12; GCN-NEXT: v_mov_b32_e32 v0, s0 13; GCN-NEXT: ds_read_u8 v0, v0 14; GCN-NEXT: v_mov_b32_e32 v1, s1 15; GCN-NEXT: s_waitcnt lgkmcnt(0) 16; GCN-NEXT: ds_write_b8 v1, v0 17; GCN-NEXT: s_endpgm 18 %val = load i8, i8 addrspace(3)* %in, align 1 19 store i8 %val, i8 addrspace(3)* %out, align 1 20 ret void 21} 22 23define amdgpu_kernel void @ds2align1(i16 addrspace(3)* %in, i16 addrspace(3)* %out) { 24; ALIGNED-SDAG-LABEL: ds2align1: 25; ALIGNED-SDAG: ; %bb.0: 26; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 27; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 28; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 29; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 30; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:1 31; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1 32; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 33; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v1 34; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 35; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v0 offset:1 36; ALIGNED-SDAG-NEXT: s_endpgm 37; 38; ALIGNED-GISEL-LABEL: ds2align1: 39; ALIGNED-GISEL: ; %bb.0: 40; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 41; ALIGNED-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) 42; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 43; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 44; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:1 45; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1 46; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 47; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 8, v1 48; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 49; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v0 50; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v1 offset:1 51; ALIGNED-GISEL-NEXT: s_endpgm 52; 53; UNALIGNED-LABEL: ds2align1: 54; UNALIGNED: ; %bb.0: 55; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 56; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 57; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 58; UNALIGNED-NEXT: ds_read_u16 v0, v0 59; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 60; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 61; UNALIGNED-NEXT: ds_write_b16 v1, v0 62; UNALIGNED-NEXT: s_endpgm 63 %val = load i16, i16 addrspace(3)* %in, align 1 64 store i16 %val, i16 addrspace(3)* %out, align 1 65 ret void 66} 67 68define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) { 69; GCN-LABEL: ds2align2: 70; GCN: ; %bb.0: 71; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 72; GCN-NEXT: s_waitcnt lgkmcnt(0) 73; GCN-NEXT: v_mov_b32_e32 v0, s0 74; GCN-NEXT: ds_read_u16 v0, v0 75; GCN-NEXT: v_mov_b32_e32 v1, s1 76; GCN-NEXT: s_waitcnt lgkmcnt(0) 77; GCN-NEXT: ds_write_b16 v1, v0 78; GCN-NEXT: s_endpgm 79 %val = load i16, i16 addrspace(3)* %in, align 2 80 store i16 %val, i16 addrspace(3)* %out, align 2 81 ret void 82} 83 84define amdgpu_kernel void @ds4align1(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 85; ALIGNED-SDAG-LABEL: ds4align1: 86; ALIGNED-SDAG: ; %bb.0: 87; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 88; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 89; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 90; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 91; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 92; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 93; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 
offset:3 94; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 95; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 96; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v1 97; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 98; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v2 offset:1 99; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 100; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v3 offset:2 101; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 102; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v0 offset:3 103; ALIGNED-SDAG-NEXT: s_endpgm 104; 105; ALIGNED-GISEL-LABEL: ds4align1: 106; ALIGNED-GISEL: ; %bb.0: 107; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 108; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 109; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 110; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 111; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 112; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:3 113; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:2 114; ALIGNED-GISEL-NEXT: s_mov_b32 s0, 8 115; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 116; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 117; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 118; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 119; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3 120; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 121; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 122; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 123; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 124; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 125; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:1 126; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 127; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:2 128; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:3 129; ALIGNED-GISEL-NEXT: s_endpgm 130; 131; UNALIGNED-LABEL: ds4align1: 132; UNALIGNED: ; %bb.0: 133; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 134; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 135; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 136; UNALIGNED-NEXT: ds_read_b32 v0, v0 137; 
UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 138; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 139; UNALIGNED-NEXT: ds_write_b32 v1, v0 140; UNALIGNED-NEXT: s_endpgm 141 %val = load i32, i32 addrspace(3)* %in, align 1 142 store i32 %val, i32 addrspace(3)* %out, align 1 143 ret void 144} 145 146define amdgpu_kernel void @ds4align2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 147; ALIGNED-SDAG-LABEL: ds4align2: 148; ALIGNED-SDAG: ; %bb.0: 149; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 150; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 151; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 152; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 153; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:2 154; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1 155; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 156; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v1 157; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 158; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v0 offset:2 159; ALIGNED-SDAG-NEXT: s_endpgm 160; 161; ALIGNED-GISEL-LABEL: ds4align2: 162; ALIGNED-GISEL: ; %bb.0: 163; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 164; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 165; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 166; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 167; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:2 168; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1 169; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 170; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 171; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0 172; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v2, v0 offset:2 173; ALIGNED-GISEL-NEXT: s_endpgm 174; 175; UNALIGNED-LABEL: ds4align2: 176; UNALIGNED: ; %bb.0: 177; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 178; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 179; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 180; UNALIGNED-NEXT: ds_read_b32 v0, v0 181; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 182; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 183; UNALIGNED-NEXT: ds_write_b32 v1, v0 184; UNALIGNED-NEXT: s_endpgm 185 %val = load i32, i32 addrspace(3)* %in, align 2 
186 store i32 %val, i32 addrspace(3)* %out, align 2 187 ret void 188} 189 190define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 191; GCN-LABEL: ds4align4: 192; GCN: ; %bb.0: 193; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 194; GCN-NEXT: s_waitcnt lgkmcnt(0) 195; GCN-NEXT: v_mov_b32_e32 v0, s0 196; GCN-NEXT: ds_read_b32 v0, v0 197; GCN-NEXT: v_mov_b32_e32 v1, s1 198; GCN-NEXT: s_waitcnt lgkmcnt(0) 199; GCN-NEXT: ds_write_b32 v1, v0 200; GCN-NEXT: s_endpgm 201 %val = load i32, i32 addrspace(3)* %in, align 4 202 store i32 %val, i32 addrspace(3)* %out, align 4 203 ret void 204} 205 206define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 207; ALIGNED-SDAG-LABEL: ds8align1: 208; ALIGNED-SDAG: ; %bb.0: 209; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 210; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 211; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 212; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 213; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:1 214; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:2 215; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:3 216; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:4 217; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:5 218; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6 219; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7 220; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 221; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 222; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v4 offset:2 223; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 224; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v5 offset:3 225; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v2 226; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v3 offset:1 227; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 228; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v8 offset:6 229; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 230; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v0 offset:7 231; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v6 offset:4 232; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v7 offset:5 233; ALIGNED-SDAG-NEXT: 
s_endpgm 234; 235; ALIGNED-GISEL-LABEL: ds8align1: 236; ALIGNED-GISEL: ; %bb.0: 237; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 238; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 8 239; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 240; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 241; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 242; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 243; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 244; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 245; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 246; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 247; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 248; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:7 249; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 250; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 251; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 252; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 253; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 254; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 255; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 256; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 257; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 258; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 259; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7 260; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v2 261; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v1 262; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 263; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 264; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:1 265; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 266; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v1 offset:2 267; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:3 268; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 269; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v0 offset:4 270; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:5 271; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 
272; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v0 offset:6 273; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:7 274; ALIGNED-GISEL-NEXT: s_endpgm 275; 276; UNALIGNED-LABEL: ds8align1: 277; UNALIGNED: ; %bb.0: 278; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 279; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 280; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 281; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 282; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1 283; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 284; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] 285; UNALIGNED-NEXT: s_endpgm 286 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1 287 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1 288 ret void 289} 290 291define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 292; ALIGNED-SDAG-LABEL: ds8align2: 293; ALIGNED-SDAG: ; %bb.0: 294; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 295; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 296; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 297; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:2 298; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 299; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:6 300; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:4 301; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 302; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 303; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:2 304; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 305; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2 306; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 307; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:6 308; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 309; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:4 310; ALIGNED-SDAG-NEXT: s_endpgm 311; 312; ALIGNED-GISEL-LABEL: ds8align2: 313; ALIGNED-GISEL: ; %bb.0: 314; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 315; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 316; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 317; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 318; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2 
319; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 320; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:6 321; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 322; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 323; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 324; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 325; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 326; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1 327; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2 328; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:4 329; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:6 330; ALIGNED-GISEL-NEXT: s_endpgm 331; 332; UNALIGNED-LABEL: ds8align2: 333; UNALIGNED: ; %bb.0: 334; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 335; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 336; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 337; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0 338; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1 339; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 340; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1] 341; UNALIGNED-NEXT: s_endpgm 342 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2 343 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2 344 ret void 345} 346 347define amdgpu_kernel void @ds8align4(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 348; GCN-LABEL: ds8align4: 349; GCN: ; %bb.0: 350; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 351; GCN-NEXT: s_waitcnt lgkmcnt(0) 352; GCN-NEXT: v_mov_b32_e32 v0, s0 353; GCN-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 354; GCN-NEXT: v_mov_b32_e32 v2, s1 355; GCN-NEXT: s_waitcnt lgkmcnt(0) 356; GCN-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 357; GCN-NEXT: s_endpgm 358 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 359 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 4 360 ret void 361} 362 363define amdgpu_kernel void @ds8align8(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 364; GCN-LABEL: ds8align8: 365; GCN: ; %bb.0: 366; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 367; GCN-NEXT: 
s_waitcnt lgkmcnt(0) 368; GCN-NEXT: v_mov_b32_e32 v0, s0 369; GCN-NEXT: ds_read_b64 v[0:1], v0 370; GCN-NEXT: v_mov_b32_e32 v2, s1 371; GCN-NEXT: s_waitcnt lgkmcnt(0) 372; GCN-NEXT: ds_write_b64 v2, v[0:1] 373; GCN-NEXT: s_endpgm 374 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 8 375 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 8 376 ret void 377} 378 379define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 380; ALIGNED-SDAG-LABEL: ds12align1: 381; ALIGNED-SDAG: ; %bb.0: 382; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 383; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 384; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 385; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 386; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 387; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 388; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 389; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 390; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 391; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 392; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 393; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 394; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 395; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 396; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11 397; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1 398; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 399; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 400; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 401; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 402; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 403; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 404; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 405; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 406; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8 407; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v10 offset:9 408; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v1 409; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v2 offset:1 410; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 
offset:2 411; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v4 offset:3 412; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v7 offset:6 413; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v8 offset:7 414; ALIGNED-SDAG-NEXT: s_endpgm 415; 416; ALIGNED-GISEL-LABEL: ds12align1: 417; ALIGNED-GISEL: ; %bb.0: 418; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 419; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 8 420; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 421; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 422; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 423; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 424; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 425; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 426; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 427; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 428; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 429; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7 430; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 431; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 432; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 433; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 434; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 435; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 436; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 437; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 438; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8 439; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9 440; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10 441; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:11 442; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 443; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v8 444; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 445; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 446; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3 447; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 448; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5 449; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 450; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 451; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2 452; 
ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v4, v3 453; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 454; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 455; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 456; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:1 457; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v3, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 458; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v1 offset:2 459; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:3 460; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 461; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:4 462; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:5 463; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 464; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v2 offset:6 465; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:7 466; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 467; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 offset:8 468; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:9 469; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 470; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:10 471; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:11 472; ALIGNED-GISEL-NEXT: s_endpgm 473; 474; UNALIGNED-LABEL: ds12align1: 475; UNALIGNED: ; %bb.0: 476; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 477; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 478; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 479; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 480; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 481; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 482; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 483; UNALIGNED-NEXT: s_endpgm 484 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 1 485 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 1 486 ret void 487} 488 489define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 490; ALIGNED-SDAG-LABEL: ds12align2: 
491; ALIGNED-SDAG: ; %bb.0: 492; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 493; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 494; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 495; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 496; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 offset:2 497; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:4 498; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:10 499; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:8 500; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6 501; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1 502; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 503; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:4 504; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 505; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:10 506; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 507; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:8 508; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 509; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 offset:2 510; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 511; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:6 512; ALIGNED-SDAG-NEXT: s_endpgm 513; 514; ALIGNED-GISEL-LABEL: ds12align2: 515; ALIGNED-GISEL: ; %bb.0: 516; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 517; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 518; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 519; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 520; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2 521; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 522; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6 523; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8 524; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:10 525; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s1 526; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 527; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 528; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 529; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 530; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 531; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v5 532; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v1 533; 
ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v1 offset:2 534; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v2 offset:4 535; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v2 offset:6 536; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v0 offset:8 537; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v0 offset:10 538; ALIGNED-GISEL-NEXT: s_endpgm 539; 540; UNALIGNED-LABEL: ds12align2: 541; UNALIGNED: ; %bb.0: 542; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 543; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 544; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 545; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 546; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 547; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 548; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 549; UNALIGNED-NEXT: s_endpgm 550 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 2 551 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 2 552 ret void 553} 554 555define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 556; ALIGNED-LABEL: ds12align4: 557; ALIGNED: ; %bb.0: 558; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 559; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 560; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 561; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 562; ALIGNED-NEXT: ds_read_b32 v2, v2 offset:8 563; ALIGNED-NEXT: v_mov_b32_e32 v3, s1 564; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) 565; ALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 566; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) 567; ALIGNED-NEXT: ds_write_b32 v3, v2 offset:8 568; ALIGNED-NEXT: s_endpgm 569; 570; UNALIGNED-SDAG-LABEL: ds12align4: 571; UNALIGNED-SDAG: ; %bb.0: 572; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 573; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 574; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 575; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 576; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8 577; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1 578; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 579; UNALIGNED-SDAG-NEXT: ds_write2_b32 
v3, v0, v1 offset1:1 580; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 581; UNALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8 582; UNALIGNED-SDAG-NEXT: s_endpgm 583; 584; UNALIGNED-GISEL-LABEL: ds12align4: 585; UNALIGNED-GISEL: ; %bb.0: 586; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 587; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 588; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 589; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0 590; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 591; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 592; UNALIGNED-GISEL-NEXT: ds_write_b96 v3, v[0:2] 593; UNALIGNED-GISEL-NEXT: s_endpgm 594 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4 595 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4 596 ret void 597} 598 599define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 600; ALIGNED-SDAG-LABEL: ds12align8: 601; ALIGNED-SDAG: ; %bb.0: 602; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 603; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 604; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 605; ALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v2 606; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8 607; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1 608; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 609; ALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1] 610; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 611; ALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8 612; ALIGNED-SDAG-NEXT: s_endpgm 613; 614; ALIGNED-GISEL-LABEL: ds12align8: 615; ALIGNED-GISEL: ; %bb.0: 616; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 617; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 618; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 619; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 620; ALIGNED-GISEL-NEXT: ds_read_b32 v2, v2 offset:8 621; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 622; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 623; ALIGNED-GISEL-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 624; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 
625; ALIGNED-GISEL-NEXT: ds_write_b32 v3, v2 offset:8 626; ALIGNED-GISEL-NEXT: s_endpgm 627; 628; UNALIGNED-SDAG-LABEL: ds12align8: 629; UNALIGNED-SDAG: ; %bb.0: 630; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 631; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 632; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 633; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8 634; UNALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v0 635; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1 636; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 637; UNALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8 638; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 639; UNALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1] 640; UNALIGNED-SDAG-NEXT: s_endpgm 641; 642; UNALIGNED-GISEL-LABEL: ds12align8: 643; UNALIGNED-GISEL: ; %bb.0: 644; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 645; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 646; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 647; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0 648; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 649; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 650; UNALIGNED-GISEL-NEXT: ds_write_b96 v3, v[0:2] 651; UNALIGNED-GISEL-NEXT: s_endpgm 652 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8 653 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8 654 ret void 655} 656 657define amdgpu_kernel void @ds12align16(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 658; GCN-LABEL: ds12align16: 659; GCN: ; %bb.0: 660; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 661; GCN-NEXT: s_waitcnt lgkmcnt(0) 662; GCN-NEXT: v_mov_b32_e32 v0, s0 663; GCN-NEXT: ds_read_b96 v[0:2], v0 664; GCN-NEXT: v_mov_b32_e32 v3, s1 665; GCN-NEXT: s_waitcnt lgkmcnt(0) 666; GCN-NEXT: ds_write_b96 v3, v[0:2] 667; GCN-NEXT: s_endpgm 668 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 16 669 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 16 670 ret void 671} 672 673define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> 
addrspace(3)* %out) { 674; ALIGNED-SDAG-LABEL: ds16align1: 675; ALIGNED-SDAG: ; %bb.0: 676; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 677; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 678; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 679; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 680; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 681; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 682; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 683; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 684; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 685; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 686; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 687; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 688; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 689; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 690; ALIGNED-SDAG-NEXT: ds_read_u8 v12, v0 offset:11 691; ALIGNED-SDAG-NEXT: ds_read_u8 v13, v0 offset:12 692; ALIGNED-SDAG-NEXT: ds_read_u8 v14, v0 offset:13 693; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14 694; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15 695; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1 696; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 697; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 698; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 699; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 700; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v1 701; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v2 offset:1 702; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 703; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 704; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 705; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 706; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v5 offset:4 707; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v6 offset:5 708; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 709; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 710; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 711; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 712; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v9 offset:8 713; ALIGNED-SDAG-NEXT: ds_write_b8 v16, 
v10 offset:9 714; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 715; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v15 offset:14 716; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(13) 717; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v0 offset:15 718; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12 719; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 720; ALIGNED-SDAG-NEXT: s_endpgm 721; 722; ALIGNED-GISEL-LABEL: ds16align1: 723; ALIGNED-GISEL: ; %bb.0: 724; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 725; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 8 726; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 727; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 728; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 729; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 730; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 731; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 732; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 733; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 734; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 735; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7 736; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 737; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 738; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 739; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 740; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 741; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 742; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 743; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 744; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 745; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v8 746; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v7 747; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2 748; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8 749; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9 750; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10 751; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:11 752; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:12 753; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:13 754; ALIGNED-GISEL-NEXT: ds_read_u8 v9, v0 
offset:14 755; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:15 756; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 757; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3 758; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 759; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v6 760; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 761; ALIGNED-GISEL-NEXT: v_or3_b32 v3, v4, v5, v3 762; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 763; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v4, v8, 8, v7 764; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 765; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 766; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v9 767; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v5, v4 768; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 769; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1 770; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 771; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:1 772; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v4, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 773; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v1 offset:2 774; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:3 775; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 776; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v2 offset:4 777; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:5 778; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 779; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v2 offset:6 780; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:7 781; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v3 782; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v3 offset:8 783; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:9 784; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 785; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v3 offset:10 786; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:11 787; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 788; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 offset:12 
789; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:13
790; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, 8
791; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
792; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:14
793; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:15
794; ALIGNED-GISEL-NEXT: s_endpgm
795;
796; UNALIGNED-LABEL: ds16align1:
797; UNALIGNED: ; %bb.0:
798; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
799; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
800; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
801; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
802; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
803; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
804; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
805; UNALIGNED-NEXT: s_endpgm
806 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
807 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1
808 ret void
809}
810
; ds16align2 - copies one <4 x i32> value from %in to %out (both are
; addrspace(3) pointers); the load and the store are each marked align 2.
; Expected instructions per check prefix (the prefix-to-flag mapping is given
; by the RUN lines at the top of the file):
;   * ALIGNED-SDAG prefix  - eight ds_read_u16 followed by eight ds_write_b16.
;   * ALIGNED-GISEL prefix - eight ds_read_u16, pairs combined with
;     v_lshl_or_b32, then alternating ds_write_b16 and ds_write_b16_d16_hi.
;   * UNALIGNED prefix     - one ds_read_b128 and one ds_write_b128.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script rather than editing them by hand.
811define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
812; ALIGNED-SDAG-LABEL: ds16align2:
813; ALIGNED-SDAG: ; %bb.0:
814; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
815; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
816; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
817; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
818; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
819; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4
820; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6
821; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8
822; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10
823; ALIGNED-SDAG-NEXT: ds_read_u16 v8, v0 offset:12
824; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14
825; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1
826; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6)
827; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v3 offset:2
828; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v2
829; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6)
830; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v5 offset:6
831; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v4 offset:4
832; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6)
833; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v7 offset:10
834; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v6 offset:8
835; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6)
836; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v0 offset:14
837; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v8 offset:12
838; ALIGNED-SDAG-NEXT: s_endpgm
839;
840; ALIGNED-GISEL-LABEL: ds16align2:
841; ALIGNED-GISEL: ; %bb.0:
842; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
843; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
844; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
845; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
846; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
847; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
848; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6
849; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8
850; ALIGNED-GISEL-NEXT: ds_read_u16 v6, v0 offset:10
851; ALIGNED-GISEL-NEXT: ds_read_u16 v7, v0 offset:12
852; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:14
853; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6)
854; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
855; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
856; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3
857; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
858; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
859; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v6, 16, v5
860; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
861; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v7
862; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1
863; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2
864; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v2 offset:4
865; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v2 offset:6
866; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v3 offset:8
867; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v3 offset:10
868; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:12
869; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:14
870; ALIGNED-GISEL-NEXT: s_endpgm
871;
872; UNALIGNED-LABEL: ds16align2:
873; UNALIGNED: ; %bb.0:
874; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
875; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
876; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
877; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
878; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
879; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
880; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
881; UNALIGNED-NEXT: s_endpgm
882 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
883 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2
884 ret void
885}
886
; ds16align4 - copies one <4 x i32> value from %in to %out (both are
; addrspace(3) pointers); the load and the store are each marked align 4.
; Expected instructions per check prefix:
;   * ALIGNED prefix (shared by both instruction selectors) - two ds_read2_b32
;     and two ds_write2_b32, dword pair 0/1 first, then pair 2/3.
;   * UNALIGNED-SDAG prefix  - the same two read2/write2 pairs, but with the
;     offset0:2 offset1:3 pair issued before the offset1:1 pair.
;   * UNALIGNED-GISEL prefix - one ds_read2_b64 and one ds_write2_b64.
; The check lines are autogenerated; regenerate them rather than hand-editing.
887define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
888; ALIGNED-LABEL: ds16align4:
889; ALIGNED: ; %bb.0:
890; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
891; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
892; ALIGNED-NEXT: v_mov_b32_e32 v2, s0
893; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
894; ALIGNED-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
895; ALIGNED-NEXT: v_mov_b32_e32 v4, s1
896; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
897; ALIGNED-NEXT: ds_write2_b32 v4, v0, v1 offset1:1
898; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
899; ALIGNED-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
900; ALIGNED-NEXT: s_endpgm
901;
902; UNALIGNED-SDAG-LABEL: ds16align4:
903; UNALIGNED-SDAG: ; %bb.0:
904; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
905; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
906; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
907; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3
908; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
909; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
910; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
911; UNALIGNED-SDAG-NEXT: ds_write2_b32 v4, v0, v1 offset0:2 offset1:3
912; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
913; UNALIGNED-SDAG-NEXT: ds_write2_b32 v4, v2, v3 offset1:1
914; UNALIGNED-SDAG-NEXT: s_endpgm
915;
916; UNALIGNED-GISEL-LABEL: ds16align4:
917; UNALIGNED-GISEL: ; %bb.0:
918; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
919; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
920; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
921; UNALIGNED-GISEL-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
922; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
923; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
924; UNALIGNED-GISEL-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
925; UNALIGNED-GISEL-NEXT: s_endpgm
926 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
927 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4
928 ret void
929}
930
; ds16align8 - copies one <4 x i32> value from %in to %out (both are
; addrspace(3) pointers); the load and the store are each marked align 8.
; All four RUN configurations share the common GCN prefix here and expect one
; ds_read2_b64 followed by one ds_write2_b64.
; The check lines are autogenerated; regenerate them rather than hand-editing.
931define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
932; GCN-LABEL: ds16align8:
933; GCN: ; %bb.0:
934; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
935; GCN-NEXT: s_waitcnt lgkmcnt(0)
936; GCN-NEXT: v_mov_b32_e32 v0, s0
937; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
938; GCN-NEXT: v_mov_b32_e32 v4, s1
939; GCN-NEXT: s_waitcnt lgkmcnt(0)
940; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
941; GCN-NEXT: s_endpgm
942 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8
943 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 8
944 ret void
945}
946
; ds16align16 - copies one <4 x i32> value from %in to %out (both are
; addrspace(3) pointers); the load and the store are each marked align 16.
; All four RUN configurations share the common GCN prefix here and expect one
; ds_read_b128 followed by one ds_write_b128.
; The check lines are autogenerated; regenerate them rather than hand-editing.
947define amdgpu_kernel void @ds16align16(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
948; GCN-LABEL: ds16align16:
949; GCN: ; %bb.0:
950; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
951; GCN-NEXT: s_waitcnt lgkmcnt(0)
952; GCN-NEXT: v_mov_b32_e32 v0, s0
953; GCN-NEXT: ds_read_b128 v[0:3], v0
954; GCN-NEXT: v_mov_b32_e32 v4, s1
955; GCN-NEXT: s_waitcnt lgkmcnt(0)
956; GCN-NEXT: ds_write_b128 v4, v[0:3]
957; GCN-NEXT: s_endpgm
958 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
959 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 16
960 ret void
961}
962