1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG 3; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL 4; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED 5; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED 6 7define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) { 8; GCN-LABEL: ds1align1: 9; GCN: ; %bb.0: 10; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 11; GCN-NEXT: s_waitcnt lgkmcnt(0) 12; GCN-NEXT: v_mov_b32_e32 v0, s0 13; GCN-NEXT: ds_read_u8 v0, v0 14; GCN-NEXT: v_mov_b32_e32 v1, s1 15; GCN-NEXT: s_waitcnt lgkmcnt(0) 16; GCN-NEXT: ds_write_b8 v1, v0 17; GCN-NEXT: s_endpgm 18 %val = load i8, i8 addrspace(3)* %in, align 1 19 store i8 %val, i8 addrspace(3)* %out, align 1 20 ret void 21} 22 23define amdgpu_kernel void @ds2align1(i16 addrspace(3)* %in, i16 addrspace(3)* %out) { 24; ALIGNED-SDAG-LABEL: ds2align1: 25; ALIGNED-SDAG: ; %bb.0: 26; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 27; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 28; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 29; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 30; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:1 31; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1 32; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 33; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v1 34; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 35; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v0 offset:1 36; ALIGNED-SDAG-NEXT: s_endpgm 37; 38; ALIGNED-GISEL-LABEL: ds2align1: 39; ALIGNED-GISEL: ; %bb.0: 40; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 41; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 42; 
ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 43; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 44; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:1 45; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1 46; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 47; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 8, v1 48; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 49; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v0 50; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v1 offset:1 51; ALIGNED-GISEL-NEXT: s_endpgm 52; 53; UNALIGNED-LABEL: ds2align1: 54; UNALIGNED: ; %bb.0: 55; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 56; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 57; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 58; UNALIGNED-NEXT: ds_read_u16 v0, v0 59; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 60; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 61; UNALIGNED-NEXT: ds_write_b16 v1, v0 62; UNALIGNED-NEXT: s_endpgm 63 %val = load i16, i16 addrspace(3)* %in, align 1 64 store i16 %val, i16 addrspace(3)* %out, align 1 65 ret void 66} 67 68define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) { 69; GCN-LABEL: ds2align2: 70; GCN: ; %bb.0: 71; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 72; GCN-NEXT: s_waitcnt lgkmcnt(0) 73; GCN-NEXT: v_mov_b32_e32 v0, s0 74; GCN-NEXT: ds_read_u16 v0, v0 75; GCN-NEXT: v_mov_b32_e32 v1, s1 76; GCN-NEXT: s_waitcnt lgkmcnt(0) 77; GCN-NEXT: ds_write_b16 v1, v0 78; GCN-NEXT: s_endpgm 79 %val = load i16, i16 addrspace(3)* %in, align 2 80 store i16 %val, i16 addrspace(3)* %out, align 2 81 ret void 82} 83 84define amdgpu_kernel void @ds4align1(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 85; ALIGNED-SDAG-LABEL: ds4align1: 86; ALIGNED-SDAG: ; %bb.0: 87; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 88; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 89; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 90; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 91; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 92; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 93; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:3 94; 
ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 95; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 96; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v1 97; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 98; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v2 offset:1 99; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 100; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v3 offset:2 101; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 102; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v0 offset:3 103; ALIGNED-SDAG-NEXT: s_endpgm 104; 105; ALIGNED-GISEL-LABEL: ds4align1: 106; ALIGNED-GISEL: ; %bb.0: 107; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 108; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 109; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 110; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 111; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 112; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:3 113; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:2 114; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 115; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 116; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 117; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 118; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3 119; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 120; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 121; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 122; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 123; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v0 124; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 125; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:1 126; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v0, 8, v1 127; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:2 128; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 offset:3 129; ALIGNED-GISEL-NEXT: s_endpgm 130; 131; UNALIGNED-LABEL: ds4align1: 132; UNALIGNED: ; %bb.0: 133; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 134; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 135; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 136; UNALIGNED-NEXT: ds_read_b32 v0, v0 137; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 138; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 139; 
UNALIGNED-NEXT: ds_write_b32 v1, v0 140; UNALIGNED-NEXT: s_endpgm 141 %val = load i32, i32 addrspace(3)* %in, align 1 142 store i32 %val, i32 addrspace(3)* %out, align 1 143 ret void 144} 145 146define amdgpu_kernel void @ds4align2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 147; ALIGNED-SDAG-LABEL: ds4align2: 148; ALIGNED-SDAG: ; %bb.0: 149; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 150; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 151; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 152; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 153; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:2 154; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1 155; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 156; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v1 157; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 158; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v0 offset:2 159; ALIGNED-SDAG-NEXT: s_endpgm 160; 161; ALIGNED-GISEL-LABEL: ds4align2: 162; ALIGNED-GISEL: ; %bb.0: 163; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 164; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 165; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 166; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 167; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:2 168; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1 169; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 170; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 171; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 172; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0 173; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v1 offset:2 174; ALIGNED-GISEL-NEXT: s_endpgm 175; 176; UNALIGNED-LABEL: ds4align2: 177; UNALIGNED: ; %bb.0: 178; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 179; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 180; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 181; UNALIGNED-NEXT: ds_read_b32 v0, v0 182; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 183; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 184; UNALIGNED-NEXT: ds_write_b32 v1, v0 185; UNALIGNED-NEXT: s_endpgm 186 %val = load i32, i32 addrspace(3)* %in, align 2 187 store i32 %val, i32 addrspace(3)* 
%out, align 2 188 ret void 189} 190 191define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 192; GCN-LABEL: ds4align4: 193; GCN: ; %bb.0: 194; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 195; GCN-NEXT: s_waitcnt lgkmcnt(0) 196; GCN-NEXT: v_mov_b32_e32 v0, s0 197; GCN-NEXT: ds_read_b32 v0, v0 198; GCN-NEXT: v_mov_b32_e32 v1, s1 199; GCN-NEXT: s_waitcnt lgkmcnt(0) 200; GCN-NEXT: ds_write_b32 v1, v0 201; GCN-NEXT: s_endpgm 202 %val = load i32, i32 addrspace(3)* %in, align 4 203 store i32 %val, i32 addrspace(3)* %out, align 4 204 ret void 205} 206 207define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 208; ALIGNED-SDAG-LABEL: ds8align1: 209; ALIGNED-SDAG: ; %bb.0: 210; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 211; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 212; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 213; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 214; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:1 215; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:2 216; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:3 217; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:4 218; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:5 219; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6 220; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7 221; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 222; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 223; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v4 offset:2 224; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 225; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v5 offset:3 226; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v2 227; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v3 offset:1 228; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 229; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v8 offset:6 230; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 231; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v0 offset:7 232; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v6 offset:4 233; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v7 offset:5 234; ALIGNED-SDAG-NEXT: s_endpgm 235; 236; ALIGNED-GISEL-LABEL: 
ds8align1: 237; ALIGNED-GISEL: ; %bb.0: 238; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 239; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 240; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 241; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 242; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 243; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 244; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 245; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 246; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 247; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 248; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:7 249; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 250; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 251; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 252; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 253; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 254; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 255; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 256; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 257; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 258; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 259; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7 260; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v2 261; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v1 262; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 263; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 264; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 265; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:1 266; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 267; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:2 268; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:3 269; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 270; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v0 271; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 offset:4 272; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:5 273; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v0, 8, v1 274; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:6 275; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 offset:7 276; 
ALIGNED-GISEL-NEXT: s_endpgm 277; 278; UNALIGNED-LABEL: ds8align1: 279; UNALIGNED: ; %bb.0: 280; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 281; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 282; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 283; UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 284; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1 285; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 286; UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 287; UNALIGNED-NEXT: s_endpgm 288 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1 289 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1 290 ret void 291} 292 293define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 294; ALIGNED-SDAG-LABEL: ds8align2: 295; ALIGNED-SDAG: ; %bb.0: 296; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 297; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 298; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 299; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:2 300; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 301; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:6 302; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:4 303; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 304; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 305; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:2 306; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 307; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2 308; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 309; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:6 310; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 311; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:4 312; ALIGNED-SDAG-NEXT: s_endpgm 313; 314; ALIGNED-GISEL-LABEL: ds8align2: 315; ALIGNED-GISEL: ; %bb.0: 316; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 317; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 318; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 319; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 320; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2 321; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 322; ALIGNED-GISEL-NEXT: ds_read_u16 v0, 
v0 offset:6 323; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 324; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 325; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 326; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v1 327; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 328; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 329; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1 330; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 331; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:4 332; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v2 offset:2 333; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1 offset:6 334; ALIGNED-GISEL-NEXT: s_endpgm 335; 336; UNALIGNED-LABEL: ds8align2: 337; UNALIGNED: ; %bb.0: 338; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 339; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 340; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 341; UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 342; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1 343; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 344; UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 345; UNALIGNED-NEXT: s_endpgm 346 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2 347 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2 348 ret void 349} 350 351define amdgpu_kernel void @ds8align4(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 352; GCN-LABEL: ds8align4: 353; GCN: ; %bb.0: 354; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 355; GCN-NEXT: s_waitcnt lgkmcnt(0) 356; GCN-NEXT: v_mov_b32_e32 v0, s0 357; GCN-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 358; GCN-NEXT: v_mov_b32_e32 v2, s1 359; GCN-NEXT: s_waitcnt lgkmcnt(0) 360; GCN-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 361; GCN-NEXT: s_endpgm 362 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 363 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 4 364 ret void 365} 366 367define amdgpu_kernel void @ds8align8(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 368; GCN-LABEL: ds8align8: 369; GCN: ; %bb.0: 370; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
371; GCN-NEXT: s_waitcnt lgkmcnt(0) 372; GCN-NEXT: v_mov_b32_e32 v0, s0 373; GCN-NEXT: ds_read_b64 v[0:1], v0 374; GCN-NEXT: v_mov_b32_e32 v2, s1 375; GCN-NEXT: s_waitcnt lgkmcnt(0) 376; GCN-NEXT: ds_write_b64 v2, v[0:1] 377; GCN-NEXT: s_endpgm 378 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 8 379 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 8 380 ret void 381} 382 383define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 384; ALIGNED-SDAG-LABEL: ds12align1: 385; ALIGNED-SDAG: ; %bb.0: 386; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 387; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 388; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 389; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 390; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 391; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 392; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 393; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 394; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 395; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 396; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 397; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 398; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 399; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 400; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11 401; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1 402; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 403; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8 404; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 405; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v10 offset:9 406; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 offset:2 407; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v4 offset:3 408; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v1 409; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v2 offset:1 410; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 411; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 412; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v7 offset:6 413; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v8 offset:7 414; 
ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) 415; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 416; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) 417; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 418; ALIGNED-SDAG-NEXT: s_endpgm 419; 420; ALIGNED-GISEL-LABEL: ds12align1: 421; ALIGNED-GISEL: ; %bb.0: 422; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 423; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 424; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 425; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 426; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 427; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 428; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 429; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 430; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 431; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 432; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7 433; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 434; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 435; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 436; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 437; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 438; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 439; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 440; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 441; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8 442; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9 443; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10 444; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:11 445; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 446; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v8 447; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 448; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 449; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3 450; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 451; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5 452; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 453; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 454; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v4, v3 455; 
ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 456; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1 457; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2 458; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 459; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 460; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:1 461; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v3 462; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v3 offset:2 463; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:3 464; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 465; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v2 466; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v2 offset:4 467; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v3 offset:5 468; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v1 469; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:6 470; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v2 offset:7 471; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 472; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v0 473; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 offset:8 474; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v2 offset:9 475; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v0, 8, v1 476; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:10 477; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 offset:11 478; ALIGNED-GISEL-NEXT: s_endpgm 479; 480; UNALIGNED-LABEL: ds12align1: 481; UNALIGNED: ; %bb.0: 482; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 483; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 484; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 485; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 486; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 487; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 488; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 489; UNALIGNED-NEXT: s_endpgm 490 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 1 491 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 1 492 ret void 493} 494 495define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 496; ALIGNED-SDAG-LABEL: ds12align2: 497; ALIGNED-SDAG: ; %bb.0: 498; ALIGNED-SDAG-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 499; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 500; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 501; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8 502; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 503; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 504; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 505; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 506; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1 507; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10 508; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 509; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 offset:8 510; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) 511; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:2 512; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 513; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 514; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:4 515; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 516; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:6 517; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 518; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:10 519; ALIGNED-SDAG-NEXT: s_endpgm 520; 521; ALIGNED-GISEL-LABEL: ds12align2: 522; ALIGNED-GISEL: ; %bb.0: 523; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 524; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 525; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 526; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 527; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2 528; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 529; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6 530; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8 531; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:10 532; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s1 533; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 534; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 535; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 536; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 537; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 538; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 539; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v5 540; ALIGNED-GISEL-NEXT: 
ds_write_b16 v6, v1 541; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v3 offset:2 542; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 543; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v2 offset:4 544; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v1 offset:6 545; ALIGNED-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 546; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v0 offset:8 547; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v1 offset:10 548; ALIGNED-GISEL-NEXT: s_endpgm 549; 550; UNALIGNED-LABEL: ds12align2: 551; UNALIGNED: ; %bb.0: 552; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 553; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 554; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 555; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 556; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 557; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 558; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 559; UNALIGNED-NEXT: s_endpgm 560 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 2 561 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 2 562 ret void 563} 564 565define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 566; ALIGNED-LABEL: ds12align4: 567; ALIGNED: ; %bb.0: 568; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 569; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 570; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 571; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 572; ALIGNED-NEXT: ds_read_b32 v2, v2 offset:8 573; ALIGNED-NEXT: v_mov_b32_e32 v3, s1 574; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) 575; ALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 576; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) 577; ALIGNED-NEXT: ds_write_b32 v3, v2 offset:8 578; ALIGNED-NEXT: s_endpgm 579; 580; UNALIGNED-LABEL: ds12align4: 581; UNALIGNED: ; %bb.0: 582; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 583; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 584; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 585; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 586; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 587; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 588; UNALIGNED-NEXT: ds_write_b96 
v3, v[0:2] 589; UNALIGNED-NEXT: s_endpgm 590 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4 591 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4 592 ret void 593} 594 595; TODO: Why does the ALIGNED-SDAG code use ds_write_b64 but not ds_read_b64? 596define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 597; ALIGNED-SDAG-LABEL: ds12align8: 598; ALIGNED-SDAG: ; %bb.0: 599; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 600; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 601; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 602; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8 603; ALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 604; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1 605; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 606; ALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8 607; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 608; ALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1] 609; ALIGNED-SDAG-NEXT: s_endpgm 610; 611; ALIGNED-GISEL-LABEL: ds12align8: 612; ALIGNED-GISEL: ; %bb.0: 613; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 614; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 615; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 616; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 617; ALIGNED-GISEL-NEXT: ds_read_b32 v2, v2 offset:8 618; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 619; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 620; ALIGNED-GISEL-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 621; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 622; ALIGNED-GISEL-NEXT: ds_write_b32 v3, v2 offset:8 623; ALIGNED-GISEL-NEXT: s_endpgm 624; 625; UNALIGNED-LABEL: ds12align8: 626; UNALIGNED: ; %bb.0: 627; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 628; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 629; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 630; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 631; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 632; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 633; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 634; UNALIGNED-NEXT: s_endpgm 
635 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8 636 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8 637 ret void 638} 639 640define amdgpu_kernel void @ds12align16(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 641; GCN-LABEL: ds12align16: 642; GCN: ; %bb.0: 643; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 644; GCN-NEXT: s_waitcnt lgkmcnt(0) 645; GCN-NEXT: v_mov_b32_e32 v0, s0 646; GCN-NEXT: ds_read_b96 v[0:2], v0 647; GCN-NEXT: v_mov_b32_e32 v3, s1 648; GCN-NEXT: s_waitcnt lgkmcnt(0) 649; GCN-NEXT: ds_write_b96 v3, v[0:2] 650; GCN-NEXT: s_endpgm 651 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 16 652 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 16 653 ret void 654} 655 656define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { 657; ALIGNED-SDAG-LABEL: ds16align1: 658; ALIGNED-SDAG: ; %bb.0: 659; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 660; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 661; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 662; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 663; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 664; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 665; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 666; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 667; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 668; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 669; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 670; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 671; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 672; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 673; ALIGNED-SDAG-NEXT: ds_read_u8 v12, v0 offset:11 674; ALIGNED-SDAG-NEXT: ds_read_u8 v13, v0 offset:12 675; ALIGNED-SDAG-NEXT: ds_read_u8 v14, v0 offset:13 676; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14 677; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15 678; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1 679; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 680; ALIGNED-SDAG-NEXT: 
ds_write_b8 v16, v13 offset:12 681; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 682; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 683; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 684; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 685; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v1 686; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v2 offset:1 687; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v5 offset:4 688; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v6 offset:5 689; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v9 offset:8 690; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v10 offset:9 691; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 692; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 693; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 694; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 695; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) 696; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v15 offset:14 697; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v0 offset:15 698; ALIGNED-SDAG-NEXT: s_endpgm 699; 700; ALIGNED-GISEL-LABEL: ds16align1: 701; ALIGNED-GISEL: ; %bb.0: 702; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 703; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 704; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 705; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 706; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 707; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 708; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 709; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 710; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 711; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 712; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7 713; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 714; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 715; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 716; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 717; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 718; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 719; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 720; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 721; 
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v8
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
; ALIGNED-GISEL-NEXT:    v_or3_b32 v2, v3, v4, v2
; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:8
; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:9
; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:10
; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:11
; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:12
; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:13
; ALIGNED-GISEL-NEXT:    ds_read_u8 v9, v0 offset:14
; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:15
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v4, 8, v3
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; ALIGNED-GISEL-NEXT:    v_or3_b32 v3, v4, v5, v3
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v5, v4
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v6, s1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v5, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v5 offset:1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v4
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v4 offset:2
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1 offset:3
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v4, 8, v2
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v2 offset:4
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v4 offset:5
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1 offset:6
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v2 offset:7
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v3
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v3 offset:8
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v2 offset:9
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1 offset:10
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v2 offset:11
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v0
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v0 offset:12
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v2 offset:13
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v0, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1 offset:14
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v0 offset:15
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds16align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1
  ret void
}

; 16-byte (<4 x i32>) addrspace(3) load and store with 2-byte alignment.
; With -unaligned-access-mode both selectors are expected to split the access
; into eight 16-bit DS reads and eight 16-bit DS writes (the GlobalISel output
; additionally repacks the halves with v_lshl_or_b32 / v_lshrrev_b32). With
; +unaligned-access-mode a single ds_read2_b64 / ds_write2_b64 pair is expected.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds16align2:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:12
; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:8
; ALIGNED-SDAG-NEXT:    ds_read_u16 v7, v0 offset:10
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v8, s1
; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:14
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v1 offset:12
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v3 offset:2
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v2
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v4 offset:4
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v6 offset:8
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v5 offset:6
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v7 offset:10
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v0 offset:14
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds16align2:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:8
; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:10
; ALIGNED-GISEL-NEXT:    ds_read_u16 v7, v0 offset:12
; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:14
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v6, 16, v5
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v5, s1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v1
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v4 offset:2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v7
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v2 offset:4
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v1 offset:6
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v3 offset:8
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v1 offset:10
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v0 offset:12
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v1 offset:14
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds16align2:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2
  ret void
}

; 16-byte (<4 x i32>) addrspace(3) load and store with 4-byte alignment.
; With -unaligned-access-mode two ds_read2_b32 / ds_write2_b32 pairs are
; expected; SelectionDAG and GlobalISel differ only in which half they access
; first. With +unaligned-access-mode a single ds_read2_b64 / ds_write2_b64 pair
; is expected.
define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds16align4:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v2 offset0:2 offset1:3
; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[2:3], v2 offset1:1
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v0, v1 offset0:2 offset1:3
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v2, v3 offset1:1
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds16align4:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v0, v1 offset1:1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds16align4:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4
  ret void
}

; 16-byte (<4 x i32>) addrspace(3) load and store with 8-byte alignment.
; All four RUN configurations share the GCN checks, which expect one
; ds_read2_b64 / ds_write2_b64 pair.
define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds16align8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GCN-NEXT:    v_mov_b32_e32 v4, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GCN-NEXT:    s_endpgm
  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8
  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 8
  ret void
}

; 16-byte (<4 x i32>) addrspace(3) load and store with 16-byte alignment.
; All four RUN configurations share the GCN checks, which expect a single
; ds_read_b128 / ds_write_b128 pair.
define amdgpu_kernel void @ds16align16(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds16align16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b128 v[0:3], v0
; GCN-NEXT:    v_mov_b32_e32 v4, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b128 v4, v[0:3]
; GCN-NEXT:    s_endpgm
  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 16
  ret void
}