; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED

; Every kernel below loads one value through an addrspace(3) pointer and stores
; it through another, using the same explicit alignment on the load and on the
; store. The checks record which DS instructions gfx900 selects for each
; (access size, alignment) pair under four configurations: SelectionDAG or
; GlobalISel, each with unaligned-access-mode disabled or enabled.
;
; Prefix usage, as set up by the llc invocations above:
;   * the GCN prefix is used where all four configurations produce the same code,
;   * the ALIGNED prefixes cover -unaligned-access-mode, with the -SDAG and
;     -GISEL variants used where the two instruction selectors differ,
;   * the UNALIGNED prefix covers +unaligned-access-mode for both selectors.
;
; The check lines are generated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; 1-byte copy, align 1. Same code everywhere: one ds_read_u8 / ds_write_b8 pair.
define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
; GCN-LABEL: ds1align1:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_u8 v0, v0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b8 v1, v0
; GCN-NEXT:    s_endpgm
  %val = load i8, i8 addrspace(3)* %in, align 1
  store i8 %val, i8 addrspace(3)* %out, align 1
  ret void
}

; 2-byte copy, align 1. With unaligned-access-mode disabled the access is split
; into two byte reads and two byte writes; with it enabled a single
; ds_read_u16 / ds_write_b16 pair is used.
define amdgpu_kernel void @ds2align1(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
; ALIGNED-LABEL: ds2align1:
; ALIGNED:       ; %bb.0:
; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-NEXT:    ds_read_u8 v1, v0
; ALIGNED-NEXT:    ds_read_u8 v0, v0 offset:1
; ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write_b8 v2, v1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:1
; ALIGNED-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds2align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_u16 v0, v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b16 v1, v0
; UNALIGNED-NEXT:    s_endpgm
  %val = load i16, i16 addrspace(3)* %in, align 1
  store i16 %val, i16 addrspace(3)* %out, align 1
  ret void
}

; 2-byte copy, align 2 (naturally aligned). Same code everywhere: one
; ds_read_u16 / ds_write_b16 pair.
define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
; GCN-LABEL: ds2align2:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_u16 v0, v0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b16 v1, v0
; GCN-NEXT:    s_endpgm
  %val = load i16, i16 addrspace(3)* %in, align 2
  store i16 %val, i16 addrspace(3)* %out, align 2
  ret void
}

; 4-byte copy, align 1. Split into four byte reads and four byte writes when
; unaligned-access-mode is disabled; a single ds_read_b32 / ds_write_b32 pair
; when it is enabled.
define amdgpu_kernel void @ds4align1(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
; ALIGNED-LABEL: ds4align1:
; ALIGNED:       ; %bb.0:
; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-NEXT:    ds_read_u8 v1, v0
; ALIGNED-NEXT:    ds_read_u8 v2, v0 offset:1
; ALIGNED-NEXT:    ds_read_u8 v3, v0 offset:2
; ALIGNED-NEXT:    ds_read_u8 v0, v0 offset:3
; ALIGNED-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-NEXT:    ds_write_b8 v4, v1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-NEXT:    ds_write_b8 v4, v2 offset:1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-NEXT:    ds_write_b8 v4, v3 offset:2
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-NEXT:    ds_write_b8 v4, v0 offset:3
; ALIGNED-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds4align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b32 v0, v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b32 v1, v0
; UNALIGNED-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(3)* %in, align 1
  store i32 %val, i32 addrspace(3)* %out, align 1
  ret void
}

; 4-byte copy, align 2. Split into two 16-bit reads and two 16-bit writes when
; unaligned-access-mode is disabled; a single ds_read_b32 / ds_write_b32 pair
; when it is enabled.
define amdgpu_kernel void @ds4align2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
; ALIGNED-LABEL: ds4align2:
; ALIGNED:       ; %bb.0:
; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-NEXT:    ds_read_u16 v1, v0
; ALIGNED-NEXT:    ds_read_u16 v0, v0 offset:2
; ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write_b16 v2, v1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write_b16 v2, v0 offset:2
; ALIGNED-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds4align2:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b32 v0, v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b32 v1, v0
; UNALIGNED-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(3)* %in, align 2
  store i32 %val, i32 addrspace(3)* %out, align 2
  ret void
}

; 4-byte copy, align 4 (naturally aligned). Same code everywhere: one
; ds_read_b32 / ds_write_b32 pair.
define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
; GCN-LABEL: ds4align4:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b32 v0, v0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b32 v1, v0
; GCN-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(3)* %in, align 4
  store i32 %val, i32 addrspace(3)* %out, align 4
  ret void
}

; 8-byte copy, align 1. With unaligned-access-mode disabled both selectors emit
; eight byte reads and eight byte writes, but they order the writes and place
; the waits differently, hence the separate SelectionDAG and GlobalISel checks.
; With it enabled a ds_read2_b32 / ds_write2_b32 pair is used.
define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds8align1:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:1
; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:3
; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:5
; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:7
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v4 offset:2
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v5 offset:3
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v2
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v3 offset:1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v8 offset:6
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v0 offset:7
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v6 offset:4
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v7 offset:5
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds8align1:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:1
; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:3
; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:5
; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:7
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v1, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v2
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v3 offset:1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v4 offset:2
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v5 offset:3
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v6 offset:4
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v7 offset:5
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v8 offset:6
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v0 offset:7
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds8align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1
  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1
  ret void
}

; 8-byte copy, align 2. With unaligned-access-mode disabled both selectors emit
; four 16-bit reads and four 16-bit writes (write order and waits differ
; between SelectionDAG and GlobalISel). With it enabled a ds_read2_b32 /
; ds_write2_b32 pair is used.
define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds8align2:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:6
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v2 offset:2
; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v3 offset:4
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds8align2:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:6
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v2 offset:2
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v3 offset:4
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:6
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds8align2:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2
  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2
  ret void
}

; 8-byte copy, align 4. Same code everywhere: one ds_read2_b32 / ds_write2_b32
; pair.
define amdgpu_kernel void @ds8align4(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds8align4:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; GCN-NEXT:    s_endpgm
  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 4
  ret void
}

; 8-byte copy, align 8 (naturally aligned). Same code everywhere: one
; ds_read_b64 / ds_write_b64 pair.
define amdgpu_kernel void @ds8align8(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds8align8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b64 v[0:1], v0
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b64 v2, v[0:1]
; GCN-NEXT:    s_endpgm
  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 8
  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 8
  ret void
}

; 12-byte copy, align 1. With unaligned-access-mode disabled both selectors use
; twelve byte reads and twelve byte writes. The GlobalISel output additionally
; packs bytes 0-7 into two dwords (v_and_or_b32 / v_or3_b32) and unpacks them
; again with shifts before storing them, while bytes 8-11 are stored straight
; from the registers they were read into. With unaligned-access-mode enabled a
; single ds_read_b96 / ds_write_b96 pair is used.
define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds12align1:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:11
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v12, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v9 offset:8
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v10 offset:9
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v3 offset:2
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v4 offset:3
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v1
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v2 offset:1
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v5 offset:4
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v6 offset:5
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v7 offset:6
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v8 offset:7
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v11 offset:10
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v0 offset:11
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds12align1:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_mov_b32 s3, 8
; ALIGNED-GISEL-NEXT:    s_movk_i32 s2, 0xff
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v1, 0xff
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v2
; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v2 offset:1
; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v2 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v2 offset:3
; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v2 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v2 offset:5
; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v2 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u8 v9, v2 offset:7
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_sdwa v3, s3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; ALIGNED-GISEL-NEXT:    v_and_or_b32 v0, v0, s2, v3
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v3, s2, v4
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v4, s2, v5
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v4, v8, v1
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_sdwa v3, s3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v1, v9, v1
; ALIGNED-GISEL-NEXT:    v_and_or_b32 v3, v6, s2, v3
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v3, v4, v1
; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v2 offset:8
; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v2 offset:9
; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v2 offset:10
; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v2 offset:11
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v9, s1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v0
; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v6 offset:1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v7 offset:2
; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v8 offset:3
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v1 offset:4
; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v0 offset:5
; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v6 offset:6
; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v7 offset:7
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(11)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v3 offset:8
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(11)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v4 offset:9
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(11)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v5 offset:10
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(11)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v2 offset:11
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds12align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT:    s_endpgm
  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 1
  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 1
  ret void
}

; 12-byte copy, align 2. With unaligned-access-mode disabled both selectors use
; six 16-bit reads and six 16-bit writes. The GlobalISel output additionally
; packs the first four halfwords into two dwords (v_and_or_b32) and splits them
; again with a 16-bit shift, while the halfwords at offsets 8 and 10 are stored
; straight from the registers they were read into. With unaligned-access-mode
; enabled a single ds_read_b96 / ds_write_b96 pair is used.
define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds12align2:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:8
; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:10
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v6 offset:8
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v3 offset:2
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v2
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v4 offset:4
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v5 offset:6
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v0 offset:10
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds12align2:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_mov_b32 s2, 0xffff
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:8
; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:10
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v0, s2, v2
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v2, s2, v4
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; ALIGNED-GISEL-NEXT:    v_and_or_b32 v0, v1, s2, v0
; ALIGNED-GISEL-NEXT:    v_and_or_b32 v1, v3, s2, v2
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; ALIGNED-GISEL-NEXT:    ds_write_b16 v3, v0
; ALIGNED-GISEL-NEXT:    ds_write_b16 v3, v2 offset:2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
; ALIGNED-GISEL-NEXT:    ds_write_b16 v3, v1 offset:4
; ALIGNED-GISEL-NEXT:    ds_write_b16 v3, v0 offset:6
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v3, v5 offset:8
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v3, v6 offset:10
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds12align2:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT:    s_endpgm
  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 2
  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 2
  ret void
}

; 12-byte copy, align 4. With unaligned-access-mode disabled the access is
; split into a ds_read2_b32 plus a ds_read_b32 and the matching
; ds_write2_b32 plus ds_write_b32; with it enabled a single
; ds_read_b96 / ds_write_b96 pair is used.
define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
; ALIGNED-LABEL: ds12align4:
; ALIGNED:       ; %bb.0:
; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-NEXT:    v_mov_b32_e32 v2, s0
; ALIGNED-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-NEXT:    ds_read_b32 v2, v2 offset:8
; ALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write_b32 v3, v2 offset:8
; ALIGNED-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds12align4:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT:    s_endpgm
  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4
  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4
  ret void
}

; TODO: Why does the ALIGNED-SDAG code use ds_write_b64 but not ds_read_b64?
531define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 532; ALIGNED-SDAG-LABEL: ds12align8: 533; ALIGNED-SDAG: ; %bb.0: 534; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 535; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 536; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 537; ALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 538; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8 539; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1 540; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 541; ALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8 542; ALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1] 543; ALIGNED-SDAG-NEXT: s_endpgm 544; 545; ALIGNED-GISEL-LABEL: ds12align8: 546; ALIGNED-GISEL: ; %bb.0: 547; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 548; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 549; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 550; ALIGNED-GISEL-NEXT: ds_read_b64 v[0:1], v2 551; ALIGNED-GISEL-NEXT: ds_read_b32 v2, v2 offset:8 552; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 553; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 554; ALIGNED-GISEL-NEXT: ds_write_b64 v3, v[0:1] 555; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 556; ALIGNED-GISEL-NEXT: ds_write_b32 v3, v2 offset:8 557; ALIGNED-GISEL-NEXT: s_endpgm 558; 559; UNALIGNED-LABEL: ds12align8: 560; UNALIGNED: ; %bb.0: 561; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 562; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 563; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 564; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 565; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 566; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 567; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 568; UNALIGNED-NEXT: s_endpgm 569 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8 570 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8 571 ret void 572} 573 574define amdgpu_kernel void @ds12align16(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 575; GCN-LABEL: ds12align16: 576; GCN: ; %bb.0: 577; GCN-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 578; GCN-NEXT: s_waitcnt lgkmcnt(0) 579; GCN-NEXT: v_mov_b32_e32 v0, s0 580; GCN-NEXT: ds_read_b96 v[0:2], v0 581; GCN-NEXT: v_mov_b32_e32 v3, s1 582; GCN-NEXT: s_waitcnt lgkmcnt(0) 583; GCN-NEXT: ds_write_b96 v3, v[0:2] 584; GCN-NEXT: s_endpgm 585 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 16 586 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 16 587 ret void 588} 589 590define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { 591; ALIGNED-SDAG-LABEL: ds16align1: 592; ALIGNED-SDAG: ; %bb.0: 593; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 594; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 595; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 596; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 597; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 598; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 599; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 600; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 601; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 602; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 603; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 604; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 605; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 606; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 607; ALIGNED-SDAG-NEXT: ds_read_u8 v12, v0 offset:11 608; ALIGNED-SDAG-NEXT: ds_read_u8 v13, v0 offset:12 609; ALIGNED-SDAG-NEXT: ds_read_u8 v14, v0 offset:13 610; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14 611; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15 612; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1 613; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 614; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12 615; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 616; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 617; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 618; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 619; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v1 620; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v2 offset:1 621; 
ALIGNED-SDAG-NEXT: ds_write_b8 v16, v5 offset:4 622; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v6 offset:5 623; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v9 offset:8 624; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v10 offset:9 625; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 626; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 627; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 628; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 629; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) 630; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v15 offset:14 631; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v0 offset:15 632; ALIGNED-SDAG-NEXT: s_endpgm 633; 634; ALIGNED-GISEL-LABEL: ds16align1: 635; ALIGNED-GISEL: ; %bb.0: 636; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 637; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 638; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 639; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 640; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 641; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 642; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 643; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 644; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 645; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 646; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7 647; ALIGNED-GISEL-NEXT: ds_read_u8 v9, v0 offset:8 648; ALIGNED-GISEL-NEXT: ds_read_u8 v10, v0 offset:9 649; ALIGNED-GISEL-NEXT: ds_read_u8 v11, v0 offset:10 650; ALIGNED-GISEL-NEXT: ds_read_u8 v12, v0 offset:11 651; ALIGNED-GISEL-NEXT: ds_read_u8 v13, v0 offset:12 652; ALIGNED-GISEL-NEXT: ds_read_u8 v14, v0 offset:13 653; ALIGNED-GISEL-NEXT: ds_read_u8 v15, v0 offset:14 654; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:15 655; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v16, s1 656; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(14) 657; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v1 658; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v2 offset:1 659; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(14) 660; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v3 offset:2 661; ALIGNED-GISEL-NEXT: 
ds_write_b8 v16, v4 offset:3 662; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(14) 663; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v5 offset:4 664; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v6 offset:5 665; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(14) 666; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v7 offset:6 667; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v8 offset:7 668; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(14) 669; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v9 offset:8 670; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v10 offset:9 671; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(14) 672; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v11 offset:10 673; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v12 offset:11 674; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(14) 675; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v13 offset:12 676; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v14 offset:13 677; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(14) 678; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v15 offset:14 679; ALIGNED-GISEL-NEXT: ds_write_b8 v16, v0 offset:15 680; ALIGNED-GISEL-NEXT: s_endpgm 681; 682; UNALIGNED-LABEL: ds16align1: 683; UNALIGNED: ; %bb.0: 684; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 685; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 686; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 687; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 688; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1 689; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 690; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 691; UNALIGNED-NEXT: s_endpgm 692 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1 693 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1 694 ret void 695} 696 697define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { 698; ALIGNED-SDAG-LABEL: ds16align2: 699; ALIGNED-SDAG: ; %bb.0: 700; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 701; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 702; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 703; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 704; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 
offset:2 705; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 706; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 707; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8 708; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10 709; ALIGNED-SDAG-NEXT: ds_read_u16 v8, v0 offset:12 710; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14 711; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 712; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 713; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v8 offset:12 714; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v3 offset:2 715; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v2 716; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v4 offset:4 717; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v6 offset:8 718; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v5 offset:6 719; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v7 offset:10 720; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 721; ALIGNED-SDAG-NEXT: ds_write_b16 v1, v0 offset:14 722; ALIGNED-SDAG-NEXT: s_endpgm 723; 724; ALIGNED-GISEL-LABEL: ds16align2: 725; ALIGNED-GISEL: ; %bb.0: 726; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 727; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 728; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 729; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 730; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:2 731; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:4 732; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:6 733; ALIGNED-GISEL-NEXT: ds_read_u16 v6, v0 offset:8 734; ALIGNED-GISEL-NEXT: ds_read_u16 v7, v0 offset:10 735; ALIGNED-GISEL-NEXT: ds_read_u16 v8, v0 offset:12 736; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:14 737; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, s1 738; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(7) 739; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v2 740; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(7) 741; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v3 offset:2 742; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(7) 743; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v4 offset:4 744; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(7) 745; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v5 offset:6 746; 
ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(7) 747; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v6 offset:8 748; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(7) 749; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v7 offset:10 750; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(7) 751; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v8 offset:12 752; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(7) 753; ALIGNED-GISEL-NEXT: ds_write_b16 v1, v0 offset:14 754; ALIGNED-GISEL-NEXT: s_endpgm 755; 756; UNALIGNED-LABEL: ds16align2: 757; UNALIGNED: ; %bb.0: 758; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 759; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 760; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 761; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 762; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1 763; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 764; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 765; UNALIGNED-NEXT: s_endpgm 766 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2 767 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2 768 ret void 769} 770 ; ds16align4 - copies a <4 x i32> between two addrspace(3) pointers, with align 4 on both the load and the store. With unaligned-access-mode disabled the expected code is two ds_read2_b32 plus two ds_write2_b32 (the SDAG and GISEL expectations differ in write order and s_waitcnt values), and with it enabled a single ds_read2_b64/ds_write2_b64 pair. 771define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { 772; ALIGNED-SDAG-LABEL: ds16align4: 773; ALIGNED-SDAG: ; %bb.0: 774; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 775; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 776; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 777; ALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 778; ALIGNED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 779; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 780; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 781; ALIGNED-SDAG-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3 782; ALIGNED-SDAG-NEXT: ds_write2_b32 v4, v0, v1 offset1:1 783; ALIGNED-SDAG-NEXT: s_endpgm 784; 785; ALIGNED-GISEL-LABEL: ds16align4: 786; ALIGNED-GISEL: ; %bb.0: 787; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 788; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 789; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 790; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 
791; ALIGNED-GISEL-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 792; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 793; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 794; ALIGNED-GISEL-NEXT: ds_write2_b32 v4, v0, v1 offset1:1 795; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 796; ALIGNED-GISEL-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3 797; ALIGNED-GISEL-NEXT: s_endpgm 798; 799; UNALIGNED-LABEL: ds16align4: 800; UNALIGNED: ; %bb.0: 801; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 802; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 803; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 804; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 805; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1 806; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 807; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 808; UNALIGNED-NEXT: s_endpgm 809 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4 810 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4 811 ret void 812} 813 ; ds16align8 - copies a <4 x i32> between two addrspace(3) pointers, with align 8 on both the load and the store. Every llc configuration shares the common GCN expectations, namely one ds_read2_b64 and one ds_write2_b64. 814define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { 815; GCN-LABEL: ds16align8: 816; GCN: ; %bb.0: 817; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 818; GCN-NEXT: s_waitcnt lgkmcnt(0) 819; GCN-NEXT: v_mov_b32_e32 v0, s0 820; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 821; GCN-NEXT: v_mov_b32_e32 v4, s1 822; GCN-NEXT: s_waitcnt lgkmcnt(0) 823; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 824; GCN-NEXT: s_endpgm 825 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8 826 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 8 827 ret void 828} 829 ; ds16align16 - copies a <4 x i32> between two addrspace(3) pointers, with align 16 on both the load and the store. Every llc configuration shares the common GCN expectations, namely one ds_read_b128 and one ds_write_b128. 830define amdgpu_kernel void @ds16align16(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { 831; GCN-LABEL: ds16align16: 832; GCN: ; %bb.0: 833; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 834; GCN-NEXT: s_waitcnt lgkmcnt(0) 835; GCN-NEXT: v_mov_b32_e32 v0, s0 836; GCN-NEXT: ds_read_b128 v[0:3], v0 837; GCN-NEXT: v_mov_b32_e32 v4, s1 838; GCN-NEXT: s_waitcnt lgkmcnt(0) 839; GCN-NEXT: ds_write_b128 
v4, v[0:3] 840; GCN-NEXT: s_endpgm 841 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16 842 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 16 843 ret void 844} 845