1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG 3; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL 4; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED 5; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED 6 7define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) { 8; GCN-LABEL: ds1align1: 9; GCN: ; %bb.0: 10; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 11; GCN-NEXT: s_waitcnt lgkmcnt(0) 12; GCN-NEXT: v_mov_b32_e32 v0, s0 13; GCN-NEXT: ds_read_u8 v0, v0 14; GCN-NEXT: v_mov_b32_e32 v1, s1 15; GCN-NEXT: s_waitcnt lgkmcnt(0) 16; GCN-NEXT: ds_write_b8 v1, v0 17; GCN-NEXT: s_endpgm 18 %val = load i8, i8 addrspace(3)* %in, align 1 19 store i8 %val, i8 addrspace(3)* %out, align 1 20 ret void 21} 22 23define amdgpu_kernel void @ds2align1(i16 addrspace(3)* %in, i16 addrspace(3)* %out) { 24; ALIGNED-SDAG-LABEL: ds2align1: 25; ALIGNED-SDAG: ; %bb.0: 26; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 27; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 28; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 29; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 30; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:1 31; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1 32; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 33; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v1 34; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 35; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v0 offset:1 36; ALIGNED-SDAG-NEXT: s_endpgm 37; 38; ALIGNED-GISEL-LABEL: ds2align1: 39; ALIGNED-GISEL: ; %bb.0: 40; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 41; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 42; 
ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 43; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 44; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:1 45; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1 46; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 47; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 8, v1 48; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 49; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v0 50; ALIGNED-GISEL-NEXT: ds_write_b8 v2, v1 offset:1 51; ALIGNED-GISEL-NEXT: s_endpgm 52; 53; UNALIGNED-LABEL: ds2align1: 54; UNALIGNED: ; %bb.0: 55; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 56; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 57; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 58; UNALIGNED-NEXT: ds_read_u16 v0, v0 59; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 60; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 61; UNALIGNED-NEXT: ds_write_b16 v1, v0 62; UNALIGNED-NEXT: s_endpgm 63 %val = load i16, i16 addrspace(3)* %in, align 1 64 store i16 %val, i16 addrspace(3)* %out, align 1 65 ret void 66} 67 68define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) { 69; GCN-LABEL: ds2align2: 70; GCN: ; %bb.0: 71; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 72; GCN-NEXT: s_waitcnt lgkmcnt(0) 73; GCN-NEXT: v_mov_b32_e32 v0, s0 74; GCN-NEXT: ds_read_u16 v0, v0 75; GCN-NEXT: v_mov_b32_e32 v1, s1 76; GCN-NEXT: s_waitcnt lgkmcnt(0) 77; GCN-NEXT: ds_write_b16 v1, v0 78; GCN-NEXT: s_endpgm 79 %val = load i16, i16 addrspace(3)* %in, align 2 80 store i16 %val, i16 addrspace(3)* %out, align 2 81 ret void 82} 83 84define amdgpu_kernel void @ds4align1(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 85; ALIGNED-SDAG-LABEL: ds4align1: 86; ALIGNED-SDAG: ; %bb.0: 87; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 88; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 89; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 90; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 91; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 92; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 93; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:3 94; 
ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 95; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 96; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v1 97; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 98; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v2 offset:1 99; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 100; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v3 offset:2 101; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 102; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v0 offset:3 103; ALIGNED-SDAG-NEXT: s_endpgm 104; 105; ALIGNED-GISEL-LABEL: ds4align1: 106; ALIGNED-GISEL: ; %bb.0: 107; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 108; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 109; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 110; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 111; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 112; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:3 113; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:2 114; ALIGNED-GISEL-NEXT: s_mov_b32 s0, 8 115; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 116; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 117; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 118; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 119; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3 120; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 121; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 122; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v2, v0, v1 123; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 124; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 125; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:1 126; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 127; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:2 128; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:3 129; ALIGNED-GISEL-NEXT: s_endpgm 130; 131; UNALIGNED-LABEL: ds4align1: 132; UNALIGNED: ; %bb.0: 133; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 134; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 135; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 136; UNALIGNED-NEXT: ds_read_b32 v0, v0 137; UNALIGNED-NEXT: 
v_mov_b32_e32 v1, s1 138; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 139; UNALIGNED-NEXT: ds_write_b32 v1, v0 140; UNALIGNED-NEXT: s_endpgm 141 %val = load i32, i32 addrspace(3)* %in, align 1 142 store i32 %val, i32 addrspace(3)* %out, align 1 143 ret void 144} 145 146define amdgpu_kernel void @ds4align2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 147; ALIGNED-SDAG-LABEL: ds4align2: 148; ALIGNED-SDAG: ; %bb.0: 149; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 150; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 151; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 152; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 153; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:2 154; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1 155; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 156; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v1 157; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 158; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v0 offset:2 159; ALIGNED-SDAG-NEXT: s_endpgm 160; 161; ALIGNED-GISEL-LABEL: ds4align2: 162; ALIGNED-GISEL: ; %bb.0: 163; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 164; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 165; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 166; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 167; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:2 168; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1 169; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 170; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 171; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0 172; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v2, v0 offset:2 173; ALIGNED-GISEL-NEXT: s_endpgm 174; 175; UNALIGNED-LABEL: ds4align2: 176; UNALIGNED: ; %bb.0: 177; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 178; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 179; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 180; UNALIGNED-NEXT: ds_read_b32 v0, v0 181; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 182; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 183; UNALIGNED-NEXT: ds_write_b32 v1, v0 184; UNALIGNED-NEXT: s_endpgm 185 %val = load i32, i32 addrspace(3)* %in, align 2 186 store i32 
%val, i32 addrspace(3)* %out, align 2 187 ret void 188} 189 190define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { 191; GCN-LABEL: ds4align4: 192; GCN: ; %bb.0: 193; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 194; GCN-NEXT: s_waitcnt lgkmcnt(0) 195; GCN-NEXT: v_mov_b32_e32 v0, s0 196; GCN-NEXT: ds_read_b32 v0, v0 197; GCN-NEXT: v_mov_b32_e32 v1, s1 198; GCN-NEXT: s_waitcnt lgkmcnt(0) 199; GCN-NEXT: ds_write_b32 v1, v0 200; GCN-NEXT: s_endpgm 201 %val = load i32, i32 addrspace(3)* %in, align 4 202 store i32 %val, i32 addrspace(3)* %out, align 4 203 ret void 204} 205 206define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 207; ALIGNED-SDAG-LABEL: ds8align1: 208; ALIGNED-SDAG: ; %bb.0: 209; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 210; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 211; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 212; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 213; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:1 214; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:2 215; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:3 216; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:4 217; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:5 218; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6 219; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7 220; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v1, s1 221; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 222; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v4 offset:2 223; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 224; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v5 offset:3 225; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v2 226; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v3 offset:1 227; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 228; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v8 offset:6 229; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 230; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v0 offset:7 231; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v6 offset:4 232; ALIGNED-SDAG-NEXT: ds_write_b8 v1, v7 offset:5 233; ALIGNED-SDAG-NEXT: s_endpgm 234; 235; 
ALIGNED-GISEL-LABEL: ds8align1: 236; ALIGNED-GISEL: ; %bb.0: 237; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 238; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 8 239; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 240; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 241; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 242; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 243; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 244; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 245; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 246; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 247; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 248; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:7 249; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 250; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 251; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 252; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 253; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 254; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 255; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 256; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 257; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 258; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 259; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7 260; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v2 261; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v1 262; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 263; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 264; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:1 265; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 266; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v1 offset:2 267; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:3 268; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 269; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v0 offset:4 270; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:5 271; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 272; 
ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v3, v0 offset:6 273; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1 offset:7 274; ALIGNED-GISEL-NEXT: s_endpgm 275; 276; UNALIGNED-LABEL: ds8align1: 277; UNALIGNED: ; %bb.0: 278; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 279; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 280; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 281; UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 282; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1 283; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 284; UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 285; UNALIGNED-NEXT: s_endpgm 286 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1 287 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1 288 ret void 289} 290 291define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 292; ALIGNED-SDAG-LABEL: ds8align2: 293; ALIGNED-SDAG: ; %bb.0: 294; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 295; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 296; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 297; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:2 298; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 299; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:6 300; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:4 301; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 302; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 303; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:2 304; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 305; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v2 306; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 307; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v3 offset:6 308; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 309; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v0 offset:4 310; ALIGNED-SDAG-NEXT: s_endpgm 311; 312; ALIGNED-GISEL-LABEL: ds8align2: 313; ALIGNED-GISEL: ; %bb.0: 314; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 315; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 316; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 317; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 318; ALIGNED-GISEL-NEXT: 
ds_read_u16 v2, v0 offset:2 319; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 320; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:6 321; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 322; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 323; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 324; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 325; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 326; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1 327; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2 328; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:4 329; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:6 330; ALIGNED-GISEL-NEXT: s_endpgm 331; 332; UNALIGNED-LABEL: ds8align2: 333; UNALIGNED: ; %bb.0: 334; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 335; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 336; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 337; UNALIGNED-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 338; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1 339; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 340; UNALIGNED-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 341; UNALIGNED-NEXT: s_endpgm 342 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2 343 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2 344 ret void 345} 346 347define amdgpu_kernel void @ds8align4(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 348; GCN-LABEL: ds8align4: 349; GCN: ; %bb.0: 350; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 351; GCN-NEXT: s_waitcnt lgkmcnt(0) 352; GCN-NEXT: v_mov_b32_e32 v0, s0 353; GCN-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 354; GCN-NEXT: v_mov_b32_e32 v2, s1 355; GCN-NEXT: s_waitcnt lgkmcnt(0) 356; GCN-NEXT: ds_write2_b32 v2, v0, v1 offset1:1 357; GCN-NEXT: s_endpgm 358 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 359 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 4 360 ret void 361} 362 363define amdgpu_kernel void @ds8align8(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) { 364; GCN-LABEL: ds8align8: 365; GCN: ; %bb.0: 366; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 367; GCN-NEXT: s_waitcnt lgkmcnt(0) 368; GCN-NEXT: v_mov_b32_e32 v0, s0 369; GCN-NEXT: ds_read_b64 v[0:1], v0 370; GCN-NEXT: v_mov_b32_e32 v2, s1 371; GCN-NEXT: s_waitcnt lgkmcnt(0) 372; GCN-NEXT: ds_write_b64 v2, v[0:1] 373; GCN-NEXT: s_endpgm 374 %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 8 375 store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 8 376 ret void 377} 378 379define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 380; ALIGNED-SDAG-LABEL: ds12align1: 381; ALIGNED-SDAG: ; %bb.0: 382; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 383; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 384; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 385; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 386; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 387; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 388; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 389; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 390; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 391; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 392; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 393; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 394; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 395; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 396; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11 397; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1 398; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 399; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8 400; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 401; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v10 offset:9 402; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v3 offset:2 403; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v4 offset:3 404; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v1 405; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v2 offset:1 406; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v5 offset:4 407; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v6 offset:5 408; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v7 offset:6 409; ALIGNED-SDAG-NEXT: ds_write_b8 
v12, v8 offset:7 410; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) 411; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v11 offset:10 412; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(11) 413; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v0 offset:11 414; ALIGNED-SDAG-NEXT: s_endpgm 415; 416; ALIGNED-GISEL-LABEL: ds12align1: 417; ALIGNED-GISEL: ; %bb.0: 418; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 419; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 8 420; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 421; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 422; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 423; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 424; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 425; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 426; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 427; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 428; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 429; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7 430; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 431; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 432; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 433; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 434; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 435; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 436; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 437; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v6, 8, v5 438; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8 439; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9 440; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10 441; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:11 442; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 443; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v6, 24, v8 444; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v7, 16, v7 445; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 446; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3 447; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 448; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v5 449; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 450; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 451; 
ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2 452; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v4, v3 453; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v1 454; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 455; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 456; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:1 457; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v3, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 458; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v1 offset:2 459; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:3 460; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 461; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v2 offset:4 462; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:5 463; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 464; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v2 offset:6 465; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:7 466; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 467; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v0 offset:8 468; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:9 469; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 470; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v4, v0 offset:10 471; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1 offset:11 472; ALIGNED-GISEL-NEXT: s_endpgm 473; 474; UNALIGNED-LABEL: ds12align1: 475; UNALIGNED: ; %bb.0: 476; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 477; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 478; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 479; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 480; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 481; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 482; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 483; UNALIGNED-NEXT: s_endpgm 484 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 1 485 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 1 486 ret void 487} 488 489define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> 
addrspace(3)* %out) { 490; ALIGNED-SDAG-LABEL: ds12align2: 491; ALIGNED-SDAG: ; %bb.0: 492; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 493; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 494; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 495; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8 496; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 497; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2 498; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 499; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 500; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1 501; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10 502; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 503; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 offset:8 504; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(4) 505; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v3 offset:2 506; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v2 507; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 508; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v4 offset:4 509; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 510; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v5 offset:6 511; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5) 512; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v0 offset:10 513; ALIGNED-SDAG-NEXT: s_endpgm 514; 515; ALIGNED-GISEL-LABEL: ds12align2: 516; ALIGNED-GISEL: ; %bb.0: 517; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 518; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 519; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 520; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 521; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2 522; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 523; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6 524; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8 525; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:10 526; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s1 527; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 528; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 529; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 530; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 531; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 532; 
ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v5 533; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v1 534; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v1 offset:2 535; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v2 offset:4 536; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v2 offset:6 537; ALIGNED-GISEL-NEXT: ds_write_b16 v6, v0 offset:8 538; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v6, v0 offset:10 539; ALIGNED-GISEL-NEXT: s_endpgm 540; 541; UNALIGNED-LABEL: ds12align2: 542; UNALIGNED: ; %bb.0: 543; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 544; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 545; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 546; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 547; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 548; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 549; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 550; UNALIGNED-NEXT: s_endpgm 551 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 2 552 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 2 553 ret void 554} 555 556define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 557; ALIGNED-LABEL: ds12align4: 558; ALIGNED: ; %bb.0: 559; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 560; ALIGNED-NEXT: s_waitcnt lgkmcnt(0) 561; ALIGNED-NEXT: v_mov_b32_e32 v2, s0 562; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 563; ALIGNED-NEXT: ds_read_b32 v2, v2 offset:8 564; ALIGNED-NEXT: v_mov_b32_e32 v3, s1 565; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) 566; ALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 567; ALIGNED-NEXT: s_waitcnt lgkmcnt(1) 568; ALIGNED-NEXT: ds_write_b32 v3, v2 offset:8 569; ALIGNED-NEXT: s_endpgm 570; 571; UNALIGNED-LABEL: ds12align4: 572; UNALIGNED: ; %bb.0: 573; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 574; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 575; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 576; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 577; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 578; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 579; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 
580; UNALIGNED-NEXT: s_endpgm 581 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4 582 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4 583 ret void 584} 585 586; TODO: Why does the ALIGNED-SDAG code use ds_write_b64 but not ds_read_b64? 587define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 588; ALIGNED-SDAG-LABEL: ds12align8: 589; ALIGNED-SDAG: ; %bb.0: 590; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 591; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 592; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 593; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8 594; ALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v0 offset1:1 595; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1 596; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 597; ALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8 598; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 599; ALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1] 600; ALIGNED-SDAG-NEXT: s_endpgm 601; 602; ALIGNED-GISEL-LABEL: ds12align8: 603; ALIGNED-GISEL: ; %bb.0: 604; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 605; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 606; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 607; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 608; ALIGNED-GISEL-NEXT: ds_read_b32 v2, v2 offset:8 609; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1 610; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 611; ALIGNED-GISEL-NEXT: ds_write2_b32 v3, v0, v1 offset1:1 612; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 613; ALIGNED-GISEL-NEXT: ds_write_b32 v3, v2 offset:8 614; ALIGNED-GISEL-NEXT: s_endpgm 615; 616; UNALIGNED-LABEL: ds12align8: 617; UNALIGNED: ; %bb.0: 618; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 619; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 620; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 621; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0 622; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1 623; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 624; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2] 625; UNALIGNED-NEXT: s_endpgm 626 %val = 
load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8 627 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8 628 ret void 629} 630 631define amdgpu_kernel void @ds12align16(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) { 632; GCN-LABEL: ds12align16: 633; GCN: ; %bb.0: 634; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 635; GCN-NEXT: s_waitcnt lgkmcnt(0) 636; GCN-NEXT: v_mov_b32_e32 v0, s0 637; GCN-NEXT: ds_read_b96 v[0:2], v0 638; GCN-NEXT: v_mov_b32_e32 v3, s1 639; GCN-NEXT: s_waitcnt lgkmcnt(0) 640; GCN-NEXT: ds_write_b96 v3, v[0:2] 641; GCN-NEXT: s_endpgm 642 %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 16 643 store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 16 644 ret void 645} 646 647define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { 648; ALIGNED-SDAG-LABEL: ds16align1: 649; ALIGNED-SDAG: ; %bb.0: 650; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 651; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 652; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 653; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0 654; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1 655; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2 656; ALIGNED-SDAG-NEXT: ds_read_u8 v4, v0 offset:3 657; ALIGNED-SDAG-NEXT: ds_read_u8 v5, v0 offset:4 658; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5 659; ALIGNED-SDAG-NEXT: ds_read_u8 v7, v0 offset:6 660; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:7 661; ALIGNED-SDAG-NEXT: ds_read_u8 v9, v0 offset:8 662; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9 663; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10 664; ALIGNED-SDAG-NEXT: ds_read_u8 v12, v0 offset:11 665; ALIGNED-SDAG-NEXT: ds_read_u8 v13, v0 offset:12 666; ALIGNED-SDAG-NEXT: ds_read_u8 v14, v0 offset:13 667; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14 668; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15 669; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1 670; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 671; ALIGNED-SDAG-NEXT: 
ds_write_b8 v16, v13 offset:12 672; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3) 673; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v14 offset:13 674; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v3 offset:2 675; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v4 offset:3 676; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v1 677; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v2 offset:1 678; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v5 offset:4 679; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v6 offset:5 680; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v9 offset:8 681; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v10 offset:9 682; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v7 offset:6 683; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v8 offset:7 684; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v11 offset:10 685; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v12 offset:11 686; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(14) 687; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v15 offset:14 688; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v0 offset:15 689; ALIGNED-SDAG-NEXT: s_endpgm 690; 691; ALIGNED-GISEL-LABEL: ds16align1: 692; ALIGNED-GISEL: ; %bb.0: 693; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 694; ALIGNED-GISEL-NEXT: s_mov_b32 s2, 8 695; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 696; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 697; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0 698; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1 699; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2 700; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:3 701; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:4 702; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:5 703; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:6 704; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:7 705; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 706; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 707; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 708; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4 709; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 710; ALIGNED-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1 711; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 712; ALIGNED-GISEL-NEXT: 
v_lshl_or_b32 v2, v6, 8, v5 713; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 714; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v8 715; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v7 716; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2 717; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:8 718; ALIGNED-GISEL-NEXT: ds_read_u8 v4, v0 offset:9 719; ALIGNED-GISEL-NEXT: ds_read_u8 v5, v0 offset:10 720; ALIGNED-GISEL-NEXT: ds_read_u8 v6, v0 offset:11 721; ALIGNED-GISEL-NEXT: ds_read_u8 v7, v0 offset:12 722; ALIGNED-GISEL-NEXT: ds_read_u8 v8, v0 offset:13 723; ALIGNED-GISEL-NEXT: ds_read_u8 v9, v0 offset:14 724; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:15 725; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 726; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v4, 8, v3 727; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 728; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v6 729; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v5 730; ALIGNED-GISEL-NEXT: v_or3_b32 v3, v4, v5, v3 731; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 732; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v4, v8, 8, v7 733; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 734; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 735; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v9 736; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v5, v4 737; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v4, 8, v1 738; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1 739; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 740; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:1 741; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v4, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 742; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v1 offset:2 743; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:3 744; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v2 745; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v2 offset:4 746; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:5 747; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 748; ALIGNED-GISEL-NEXT: 
ds_write_b8_d16_hi v5, v2 offset:6 749; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:7 750; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v3 751; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v3 offset:8 752; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:9 753; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 754; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v3 offset:10 755; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:11 756; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0 757; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v0 offset:12 758; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:13 759; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v1, 8 760; ALIGNED-GISEL-NEXT: v_lshrrev_b16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 761; ALIGNED-GISEL-NEXT: ds_write_b8_d16_hi v5, v0 offset:14 762; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1 offset:15 763; ALIGNED-GISEL-NEXT: s_endpgm 764; 765; UNALIGNED-LABEL: ds16align1: 766; UNALIGNED: ; %bb.0: 767; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 768; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 769; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 770; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 771; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1 772; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 773; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 774; UNALIGNED-NEXT: s_endpgm 775 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1 776 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1 777 ret void 778} 779; Copy a <4 x i32> through addrspace(3) with both the load and the store marked align 2. With -unaligned-access-mode the checks expect eight 16-bit DS reads and eight 16-bit DS writes, with +unaligned-access-mode a single ds_read2_b64 / ds_write2_b64 pair. 780define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { 781; ALIGNED-SDAG-LABEL: ds16align2: 782; ALIGNED-SDAG: ; %bb.0: 783; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 784; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 785; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0 786; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12 787; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0 788; ALIGNED-SDAG-NEXT: ds_read_u16 
v3, v0 offset:2 789; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4 790; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6 791; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8 792; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10 793; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s1 794; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14 795; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 796; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v1 offset:12 797; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) 798; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v3 offset:2 799; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v2 800; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 801; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v4 offset:4 802; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(6) 803; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v6 offset:8 804; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v5 offset:6 805; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 806; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v7 offset:10 807; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7) 808; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v0 offset:14 809; ALIGNED-SDAG-NEXT: s_endpgm 810; 811; ALIGNED-GISEL-LABEL: ds16align2: 812; ALIGNED-GISEL: ; %bb.0: 813; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 814; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 815; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0 816; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0 817; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2 818; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4 819; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6 820; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8 821; ALIGNED-GISEL-NEXT: ds_read_u16 v6, v0 offset:10 822; ALIGNED-GISEL-NEXT: ds_read_u16 v7, v0 offset:12 823; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:14 824; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(6) 825; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1 826; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4) 827; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3 828; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 829; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2) 830; 
ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v6, 16, v5 831; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 832; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v7 833; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v1 834; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v1 offset:2 835; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v2 offset:4 836; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v2 offset:6 837; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v3 offset:8 838; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v3 offset:10 839; ALIGNED-GISEL-NEXT: ds_write_b16 v4, v0 offset:12 840; ALIGNED-GISEL-NEXT: ds_write_b16_d16_hi v4, v0 offset:14 841; ALIGNED-GISEL-NEXT: s_endpgm 842; 843; UNALIGNED-LABEL: ds16align2: 844; UNALIGNED: ; %bb.0: 845; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 846; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 847; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 848; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 849; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1 850; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 851; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 852; UNALIGNED-NEXT: s_endpgm 853 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2 854 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2 855 ret void 856} 857; Copy a <4 x i32> through addrspace(3) with both the load and the store marked align 4. With -unaligned-access-mode the checks expect two ds_read2_b32 and two ds_write2_b32 (the -global-isel=0 output accesses the offset0:2 offset1:3 pair first, the -global-isel=1 output the offset1:1 pair first), with +unaligned-access-mode a single ds_read2_b64 / ds_write2_b64 pair. 858define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { 859; ALIGNED-SDAG-LABEL: ds16align4: 860; ALIGNED-SDAG: ; %bb.0: 861; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 862; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0) 863; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0 864; ALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3 865; ALIGNED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset1:1 866; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1 867; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 868; ALIGNED-SDAG-NEXT: ds_write2_b32 v4, v0, v1 offset0:2 offset1:3 869; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1) 870; ALIGNED-SDAG-NEXT: ds_write2_b32 v4, v2, v3 offset1:1 871; ALIGNED-SDAG-NEXT: s_endpgm 872; 873; ALIGNED-GISEL-LABEL: 
ds16align4: 874; ALIGNED-GISEL: ; %bb.0: 875; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 876; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0) 877; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0 878; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1 879; ALIGNED-GISEL-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3 880; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1 881; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 882; ALIGNED-GISEL-NEXT: ds_write2_b32 v4, v0, v1 offset1:1 883; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1) 884; ALIGNED-GISEL-NEXT: ds_write2_b32 v4, v2, v3 offset0:2 offset1:3 885; ALIGNED-GISEL-NEXT: s_endpgm 886; 887; UNALIGNED-LABEL: ds16align4: 888; UNALIGNED: ; %bb.0: 889; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 890; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 891; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 892; UNALIGNED-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 893; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1 894; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) 895; UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 896; UNALIGNED-NEXT: s_endpgm 897 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4 898 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4 899 ret void 900} 901; Copy a <4 x i32> through addrspace(3) with both the load and the store marked align 8. All four llc invocations share one set of checks, which expects a single ds_read2_b64 / ds_write2_b64 pair. 902define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { 903; GCN-LABEL: ds16align8: 904; GCN: ; %bb.0: 905; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 906; GCN-NEXT: s_waitcnt lgkmcnt(0) 907; GCN-NEXT: v_mov_b32_e32 v0, s0 908; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1 909; GCN-NEXT: v_mov_b32_e32 v4, s1 910; GCN-NEXT: s_waitcnt lgkmcnt(0) 911; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 912; GCN-NEXT: s_endpgm 913 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8 914 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 8 915 ret void 916} 917; Copy a <4 x i32> through addrspace(3) with both the load and the store marked align 16. All four llc invocations share one set of checks, which expects a single ds_read_b128 / ds_write_b128 pair. 918define amdgpu_kernel void @ds16align16(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) { 919; GCN-LABEL: ds16align16: 920; GCN: ; %bb.0: 921; 
GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 922; GCN-NEXT: s_waitcnt lgkmcnt(0) 923; GCN-NEXT: v_mov_b32_e32 v0, s0 924; GCN-NEXT: ds_read_b128 v[0:3], v0 925; GCN-NEXT: v_mov_b32_e32 v4, s1 926; GCN-NEXT: s_waitcnt lgkmcnt(0) 927; GCN-NEXT: ds_write_b128 v4, v[0:3] 928; GCN-NEXT: s_endpgm 929 %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16 930 store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 16 931 ret void 932} 933