; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED

; Every kernel below loads one value through an addrspace(3) pointer (%in) and
; stores it through another addrspace(3) pointer (%out) using the same
; alignment for both accesses. Kernel names follow ds<size in bytes>align<N>.
; The expected DS instruction sequences are checked for SelectionDAG and
; GlobalISel, each with unaligned-access-mode disabled and enabled.

; 1 byte at align 1 - one ds_read_u8 / ds_write_b8 pair in every configuration.
define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
; GCN-LABEL: ds1align1:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_u8 v0, v0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b8 v1, v0
; GCN-NEXT:    s_endpgm
  %data = load i8, i8 addrspace(3)* %in, align 1
  store i8 %data, i8 addrspace(3)* %out, align 1
  ret void
}

; 2 bytes at align 1 - split into byte accesses when unaligned access is
; disabled, a single 16-bit read/write pair when it is enabled.
define amdgpu_kernel void @ds2align1(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
; ALIGNED-LABEL: ds2align1:
; ALIGNED:       ; %bb.0:
; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-NEXT:    ds_read_u8 v1, v0
; ALIGNED-NEXT:    ds_read_u8 v0, v0 offset:1
; ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write_b8 v2, v1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:1
; ALIGNED-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds2align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_u16 v0, v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b16 v1, v0
; UNALIGNED-NEXT:    s_endpgm
  %data = load i16, i16 addrspace(3)* %in, align 1
  store i16 %data, i16 addrspace(3)* %out, align 1
  ret void
}

; 2 bytes at align 2 - a single 16-bit read/write pair in every configuration.
define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
; GCN-LABEL: ds2align2:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_u16 v0, v0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b16 v1, v0
; GCN-NEXT:    s_endpgm
  %data = load i16, i16 addrspace(3)* %in, align 2
  store i16 %data, i16 addrspace(3)* %out, align 2
  ret void
}

; 4 bytes at align 1 - four byte accesses each way when unaligned access is
; disabled, a single 32-bit read/write pair when it is enabled.
define amdgpu_kernel void @ds4align1(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
; ALIGNED-LABEL: ds4align1:
; ALIGNED:       ; %bb.0:
; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-NEXT:    ds_read_u8 v1, v0
; ALIGNED-NEXT:    ds_read_u8 v2, v0 offset:1
; ALIGNED-NEXT:    ds_read_u8 v3, v0 offset:2
; ALIGNED-NEXT:    ds_read_u8 v0, v0 offset:3
; ALIGNED-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-NEXT:    ds_write_b8 v4, v1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-NEXT:    ds_write_b8 v4, v2 offset:1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-NEXT:    ds_write_b8 v4, v3 offset:2
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-NEXT:    ds_write_b8 v4, v0 offset:3
; ALIGNED-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds4align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b32 v0, v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b32 v1, v0
; UNALIGNED-NEXT:    s_endpgm
  %data = load i32, i32 addrspace(3)* %in, align 1
  store i32 %data, i32 addrspace(3)* %out, align 1
  ret void
}

; 4 bytes at align 2 - two 16-bit accesses each way when unaligned access is
; disabled, a single 32-bit read/write pair when it is enabled.
define amdgpu_kernel void @ds4align2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
; ALIGNED-LABEL: ds4align2:
; ALIGNED:       ; %bb.0:
; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-NEXT:    ds_read_u16 v1, v0
; ALIGNED-NEXT:    ds_read_u16 v0, v0 offset:2
; ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write_b16 v2, v1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write_b16 v2, v0 offset:2
; ALIGNED-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds4align2:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b32 v0, v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b32 v1, v0
; UNALIGNED-NEXT:    s_endpgm
  %data = load i32, i32 addrspace(3)* %in, align 2
  store i32 %data, i32 addrspace(3)* %out, align 2
  ret void
}

; 4 bytes at align 4 - a single 32-bit read/write pair in every configuration.
define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
; GCN-LABEL: ds4align4:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b32 v0, v0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b32 v1, v0
; GCN-NEXT:    s_endpgm
  %data = load i32, i32 addrspace(3)* %in, align 4
  store i32 %data, i32 addrspace(3)* %out, align 4
  ret void
}

; 8 bytes at align 1 - eight byte accesses each way when unaligned access is
; disabled (the two selectors differ in store order and s_waitcnt placement),
; one ds_read2_b32 / ds_write2_b32 pair when it is enabled.
define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds8align1:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:1
; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:3
; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:5
; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:7
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v4 offset:2
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v5 offset:3
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v2
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v3 offset:1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v8 offset:6
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v0 offset:7
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v6 offset:4
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v7 offset:5
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds8align1:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:1
; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:3
; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:5
; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:7
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v1, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v2
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v3 offset:1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v4 offset:2
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v5 offset:3
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v6 offset:4
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v7 offset:5
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v8 offset:6
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v0 offset:7
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds8align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %data = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1
  store <2 x i32> %data, <2 x i32> addrspace(3)* %out, align 1
  ret void
}

; 8 bytes at align 2 - four 16-bit accesses each way when unaligned access is
; disabled, one ds_read2_b32 / ds_write2_b32 pair when it is enabled.
define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds8align2:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:6
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v2 offset:2
; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v3 offset:4
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds8align2:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:6
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v2 offset:2
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v3 offset:4
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:6
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds8align2:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %data = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2
  store <2 x i32> %data, <2 x i32> addrspace(3)* %out, align 2
  ret void
}

; 8 bytes at align 4 - one ds_read2_b32 / ds_write2_b32 pair in every
; configuration.
define amdgpu_kernel void @ds8align4(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds8align4:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; GCN-NEXT:    s_endpgm
  %data = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
  store <2 x i32> %data, <2 x i32> addrspace(3)* %out, align 4
  ret void
}

; 8 bytes at align 8 - a single 64-bit read/write pair in every configuration.
define amdgpu_kernel void @ds8align8(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds8align8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b64 v[0:1], v0
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b64 v2, v[0:1]
; GCN-NEXT:    s_endpgm
  %data = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 8
  store <2 x i32> %data, <2 x i32> addrspace(3)* %out, align 8
  ret void
}

; 12 bytes at align 1 - twelve byte accesses each way when unaligned access is
; disabled. The GlobalISel output additionally packs the loaded bytes into
; dwords (shift/and/or) and unpacks them again before the byte stores. With
; unaligned access enabled a single 96-bit read/write pair is expected.
define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds12align1:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:11
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v12, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v9 offset:8
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v10 offset:9
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v3 offset:2
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v4 offset:3
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v1
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v2 offset:1
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v5 offset:4
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v6 offset:5
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v7 offset:6
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v8 offset:7
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v11 offset:10
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v0 offset:11
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds12align1:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_mov_b32 s3, 8
; ALIGNED-GISEL-NEXT:    s_movk_i32 s2, 0xff
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v3
; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v3 offset:1
; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v3 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v3 offset:3
; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v3 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v3 offset:5
; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v3 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u8 v9, v3 offset:7
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; ALIGNED-GISEL-NEXT:    v_and_or_b32 v0, v0, s2, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v1, s2, v4
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v4, s2, v5
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v4
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, s3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v4, v8, v2
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v5, v9, v2
; ALIGNED-GISEL-NEXT:    v_and_or_b32 v1, v6, s2, v1
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v1, v4, v5
; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v3 offset:8
; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v3 offset:9
; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v3 offset:10
; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v3 offset:11
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v7, 8
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; ALIGNED-GISEL-NEXT:    v_and_or_b32 v4, v4, v2, v5
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v5, v6, v2
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v2, v3, v2
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v6, s1
; ALIGNED-GISEL-NEXT:    v_or3_b32 v2, v4, v5, v2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v0
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v3 offset:1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v4 offset:2
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v5 offset:3
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1 offset:4
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v0 offset:5
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v3 offset:6
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v4 offset:7
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v2 offset:8
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v0 offset:9
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1 offset:10
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v3 offset:11
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds12align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT:    s_endpgm
  %data = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 1
  store <3 x i32> %data, <3 x i32> addrspace(3)* %out, align 1
  ret void
}

; 12 bytes at align 2 - six 16-bit accesses each way when unaligned access is
; disabled (GlobalISel again repacks the halves into dwords in between), a
; single 96-bit read/write pair when it is enabled.
define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds12align2:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:8
; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:10
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v6 offset:8
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v3 offset:2
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v2
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v4 offset:4
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v5 offset:6
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v0 offset:10
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds12align2:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_mov_b32 s2, 0xffff
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:8
; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:10
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v0, s2, v2
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v2, s2, v4
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; ALIGNED-GISEL-NEXT:    v_and_or_b32 v0, v1, s2, v0
; ALIGNED-GISEL-NEXT:    v_and_or_b32 v1, v3, s2, v2
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v2, s2, v6
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v3 offset:2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
; ALIGNED-GISEL-NEXT:    v_and_or_b32 v2, v5, s2, v2
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1 offset:4
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:6
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v2 offset:8
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:10
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds12align2:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT:    s_endpgm
  %data = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 2
  store <3 x i32> %data, <3 x i32> addrspace(3)* %out, align 2
  ret void
}

; 12 bytes at align 4 - a ds_read2_b32 plus a ds_read_b32 (and the matching
; writes) when unaligned access is disabled, a single 96-bit read/write pair
; when it is enabled.
define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
; ALIGNED-LABEL: ds12align4:
; ALIGNED:       ; %bb.0:
; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-NEXT:    v_mov_b32_e32 v2, s0
; ALIGNED-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-NEXT:    ds_read_b32 v2, v2 offset:8
; ALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write_b32 v3, v2 offset:8
; ALIGNED-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds12align4:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT:    s_endpgm
  %data = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4
  store <3 x i32> %data, <3 x i32> addrspace(3)* %out, align 4
  ret void
}

; 12 bytes at align 8 - with unaligned access disabled, GlobalISel uses a
; 64-bit plus a 32-bit access in both directions, while SelectionDAG reads with
; ds_read2_b32 + ds_read_b32 but writes with ds_write_b32 + ds_write_b64.
; TODO: Why does the ALIGNED-SDAG code use ds_write_b64 but not ds_read_b64?
define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds12align8:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-SDAG-NEXT:    ds_read_b32 v2, v2 offset:8
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v3, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    ds_write_b32 v3, v2 offset:8
; ALIGNED-SDAG-NEXT:    ds_write_b64 v3, v[0:1]
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds12align8:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
; ALIGNED-GISEL-NEXT:    ds_read_b64 v[0:1], v2
; ALIGNED-GISEL-NEXT:    ds_read_b32 v2, v2 offset:8
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    ds_write_b64 v3, v[0:1]
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    ds_write_b32 v3, v2 offset:8
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds12align8:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT:    s_endpgm
  %data = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8
  store <3 x i32> %data, <3 x i32> addrspace(3)* %out, align 8
  ret void
}

; 12 bytes at align 16 - a single 96-bit read/write pair in every
; configuration.
define amdgpu_kernel void @ds12align16(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds12align16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b96 v[0:2], v0
; GCN-NEXT:    v_mov_b32_e32 v3, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b96 v3, v[0:2]
; GCN-NEXT:    s_endpgm
  %data = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 16
  store <3 x i32> %data, <3 x i32> addrspace(3)* %out, align 16
  ret void
}

; 16 bytes at align 1 - sixteen byte accesses each way when unaligned access
; is disabled (the two selectors differ in store order and s_waitcnt
; placement), one ds_read2_b64 / ds_write2_b64 pair when it is enabled.
define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds16align1:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
; ALIGNED-SDAG-NEXT:    ds_read_u8 v12, v0 offset:11
; ALIGNED-SDAG-NEXT:    ds_read_u8 v13, v0 offset:12
; ALIGNED-SDAG-NEXT:    ds_read_u8 v14, v0 offset:13
; ALIGNED-SDAG-NEXT:    ds_read_u8 v15, v0 offset:14
; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:15
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v16, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v13 offset:12
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v14 offset:13
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v3 offset:2
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v4 offset:3
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v1
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v2 offset:1
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v5 offset:4
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v6 offset:5
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v9 offset:8
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v10 offset:9
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v7 offset:6
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v8 offset:7
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v11 offset:10
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v12 offset:11
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v15 offset:14
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v0 offset:15
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds16align1:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:7
; ALIGNED-GISEL-NEXT:    ds_read_u8 v9, v0 offset:8
; ALIGNED-GISEL-NEXT:    ds_read_u8 v10, v0 offset:9
; ALIGNED-GISEL-NEXT:    ds_read_u8 v11, v0 offset:10
; ALIGNED-GISEL-NEXT:    ds_read_u8 v12, v0 offset:11
; ALIGNED-GISEL-NEXT:    ds_read_u8 v13, v0 offset:12
; ALIGNED-GISEL-NEXT:    ds_read_u8 v14, v0 offset:13
; ALIGNED-GISEL-NEXT:    ds_read_u8 v15, v0 offset:14
; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:15
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v16, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v2 offset:1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v3 offset:2
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v4 offset:3
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v5 offset:4
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v6 offset:5
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v7 offset:6
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v8 offset:7
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v9 offset:8
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v10 offset:9
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v11 offset:10
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v12 offset:11
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v13 offset:12
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v14 offset:13
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v15 offset:14
; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v0 offset:15
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds16align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %data = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
  store <4 x i32> %data, <4 x i32> addrspace(3)* %out, align 1
  ret void
}

; 16 bytes at align 2 - eight 16-bit accesses each way when unaligned access
; is disabled, one ds_read2_b64 / ds_write2_b64 pair when it is enabled.
define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds16align2:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:8
; ALIGNED-SDAG-NEXT:    ds_read_u16 v7, v0 offset:10
; ALIGNED-SDAG-NEXT:    ds_read_u16 v8, v0 offset:12
; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:14
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v8 offset:12
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v3 offset:2
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v2
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v4 offset:4
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v6 offset:8
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v5 offset:6
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v7 offset:10
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v0 offset:14
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds16align2:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:8
; ALIGNED-GISEL-NEXT:    ds_read_u16 v7, v0 offset:10
; ALIGNED-GISEL-NEXT:    ds_read_u16 v8, v0 offset:12
; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:14
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v1, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v2
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v3 offset:2
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v4 offset:4
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v5 offset:6
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v6 offset:8
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v7 offset:10
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v8 offset:12
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v0 offset:14
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds16align2:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %data = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
  store <4 x i32> %data, <4 x i32> addrspace(3)* %out, align 2
  ret void
}

; 16 bytes at align 4 - two ds_read2_b32 / ds_write2_b32 pairs when unaligned
; access is disabled (the selectors differ in store order and s_waitcnt
; placement), one ds_read2_b64 / ds_write2_b64 pair when it is enabled.
define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds16align4:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v0, v1 offset1:1
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds16align4:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v0, v1 offset1:1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds16align4:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %data = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
  store <4 x i32> %data, <4 x i32> addrspace(3)* %out, align 4
  ret void
}

; 16 bytes at align 8 - one ds_read2_b64 / ds_write2_b64 pair in every
; configuration.
define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds16align8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GCN-NEXT:    v_mov_b32_e32 v4, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GCN-NEXT:    s_endpgm
  %data = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8
  store <4 x i32> %data, <4 x i32> addrspace(3)* %out, align 8
  ret void
}

define amdgpu_kernel void @ds16align16(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds16align16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b128 v[0:3], v0
; GCN-NEXT:    v_mov_b32_e32 v4, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b128
v4, v[0:3] 854; GCN-NEXT: s_endpgm 855 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16 856 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 16 857 ret void 858} 859