; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s

; Unaligned DS access is available from GFX9 onwards.
; LDS alignment enforcement is controlled by a configuration register:
; SH_MEM_CONFIG.alignment_mode

; 16-byte vector load from LDS (addrspace 3) with align 1.
; The GFX9 checks expect one ds_read2_b64; the GFX7 and GFX10 checks expect
; sixteen ds_read_u8 loads that are recombined with shifts and ORs.
define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
; GFX7-NEXT:    ds_read_u8 v2, v0
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX7-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX7-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX7-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:11
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:12
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:13
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:14
; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v10
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v8
; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
; GFX7-NEXT:    v_mov_b32_e32 v0, v4
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_u8 v1, v0
; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX10-NEXT:    ds_read_u8 v12, v0 offset:11
; GFX10-NEXT:    ds_read_u8 v13, v0 offset:12
; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
; GFX10-NEXT:    ds_read_u8 v15, v0 offset:15
; GFX10-NEXT:    ds_read_u8 v0, v0 offset:14
; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(13)
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
; GFX10-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
; GFX10-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v12
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_lshl_or_b32 v10, v14, 8, v13
; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
; GFX10-NEXT:    v_or3_b32 v0, v2, v3, v1
; GFX10-NEXT:    v_or3_b32 v1, v5, v6, v4
; GFX10-NEXT:    v_or3_b32 v2, v8, v9, v7
; GFX10-NEXT:    v_or3_b32 v3, v11, v12, v10
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
  ret <4 x i32> %load
}

; 12-byte vector load from LDS (addrspace 3) with align 1.
; The GFX9 checks expect one ds_read_b96; the GFX7 and GFX10 checks expect
; twelve ds_read_u8 loads that are recombined with shifts and ORs.
define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v3i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b96 v[0:2], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
; GFX7-NEXT:    ds_read_u8 v2, v0
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
; GFX7-NEXT:    ds_read_u8 v4, v0 offset:4
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:5
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:6
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:7
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:8
; GFX7-NEXT:    ds_read_u8 v9, v0 offset:9
; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX7-NEXT:    v_or_b32_e32 v3, v2, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v7
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v8
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
; GFX7-NEXT:    v_mov_b32_e32 v0, v3
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v3i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_u8 v1, v0
; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX10-NEXT:    ds_read_u8 v11, v0 offset:11
; GFX10-NEXT:    ds_read_u8 v0, v0 offset:10
; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
; GFX10-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v11
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
; GFX10-NEXT:    v_or3_b32 v0, v2, v3, v1
; GFX10-NEXT:    v_or3_b32 v1, v5, v6, v4
; GFX10-NEXT:    v_or3_b32 v2, v8, v9, v7
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
  ret <3 x i32> %load
}

; 16-byte vector store to LDS (addrspace 3) with align 1.
; The GFX9 checks expect one ds_write2_b64; the GFX7 and GFX10 checks expect
; sixteen ds_write_b8 stores fed by shift / bitfield-extract instructions.
define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: store_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX7-NEXT:    v_bfe_u32 v6, v1, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v1
; GFX7-NEXT:    ds_write_b8 v0, v6 offset:1
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
; GFX7-NEXT:    ds_write_b8 v0, v5 offset:2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX7-NEXT:    ds_write_b8 v0, v5 offset:5
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:7
; GFX7-NEXT:    v_bfe_u32 v2, v3, 8, 8
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; GFX7-NEXT:    ds_write_b8 v0, v3 offset:8
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:9
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:11
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
; GFX7-NEXT:    v_bfe_u32 v2, v4, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v4 offset:12
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:13
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v4
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:14
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:15
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_lds_v4i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX10-NEXT:    v_lshrrev_b16 v6, 8, v1
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX10-NEXT:    v_lshrrev_b16 v8, 8, v2
; GFX10-NEXT:    v_lshrrev_b16 v7, 8, v5
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX10-NEXT:    ds_write_b8 v0, v6 offset:1
; GFX10-NEXT:    ds_write_b8 v0, v5 offset:2
; GFX10-NEXT:    ds_write_b8 v0, v7 offset:3
; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v1
; GFX10-NEXT:    ds_write_b8 v0, v8 offset:5
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; GFX10-NEXT:    v_lshrrev_b16 v5, 8, v3
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:7
; GFX10-NEXT:    ds_write_b8 v0, v3 offset:8
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
; GFX10-NEXT:    v_lshrrev_b16 v3, 8, v1
; GFX10-NEXT:    ds_write_b8 v0, v5 offset:9
; GFX10-NEXT:    v_lshrrev_b16 v5, 8, v4
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v2
; GFX10-NEXT:    ds_write_b8 v0, v3 offset:11
; GFX10-NEXT:    ds_write_b8 v0, v4 offset:12
; GFX10-NEXT:    ds_write_b8 v0, v5 offset:13
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:14
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
  ret void
}

; 12-byte vector store to LDS (addrspace 3) with align 1.
; The GFX9 checks expect one ds_write_b96; the GFX7 and GFX10 checks expect
; twelve ds_write_b8 stores fed by shift / bitfield-extract instructions.
define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_write_b96 v0, v[1:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: store_lds_v3i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX7-NEXT:    v_bfe_u32 v5, v1, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v1
; GFX7-NEXT:    ds_write_b8 v0, v5 offset:1
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
; GFX7-NEXT:    ds_write_b8 v0, v4 offset:2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX7-NEXT:    ds_write_b8 v0, v4 offset:5
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:7
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; GFX7-NEXT:    v_bfe_u32 v2, v3, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v3 offset:8
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:9
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:11
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_lds_v3i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX10-NEXT:    v_lshrrev_b16 v5, 8, v1
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX10-NEXT:    v_lshrrev_b16 v6, 8, v2
; GFX10-NEXT:    v_lshrrev_b16 v7, 8, v4
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
; GFX10-NEXT:    ds_write_b8 v0, v5 offset:1
; GFX10-NEXT:    ds_write_b8 v0, v4 offset:2
; GFX10-NEXT:    ds_write_b8 v0, v7 offset:3
; GFX10-NEXT:    v_lshrrev_b16 v4, 8, v1
; GFX10-NEXT:    v_lshrrev_b16 v5, 8, v3
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v2
; GFX10-NEXT:    ds_write_b8 v0, v6 offset:5
; GFX10-NEXT:    ds_write_b8 v0, v4 offset:7
; GFX10-NEXT:    ds_write_b8 v0, v3 offset:8
; GFX10-NEXT:    ds_write_b8 v0, v5 offset:9
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:10
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
  ret void
}

; 32-byte load from the constant address space (addrspace 4) with align 1 through
; an inreg pointer, copied to a global (addrspace 1) buffer. All three targets
; are expected to use two dwordx4 vector-memory loads (global_load / buffer_load)
; rather than a scalar load, followed by two dwordx4 stores.
define amdgpu_ps void @test_s_load_constant_v8i32_align1(<8 x i32> addrspace(4)* inreg %ptr, <8 x i32> addrspace(1)* inreg %out) {
; GFX9-LABEL: test_s_load_constant_v8i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v8, 0
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: test_s_load_constant_v8i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s4, s2
; GFX7-NEXT:    s_mov_b32 s5, s3
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: test_s_load_constant_v8i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v8, 0
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
; GFX10-NEXT:    s_endpgm
  %load = load <8 x i32>, <8 x i32> addrspace(4)* %ptr, align 1
  store <8 x i32> %load, <8 x i32> addrspace(1)* %out
  ret void
}