; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s

; Unaligned DS access is available from GFX9 onwards.
; LDS alignment enforcement is controlled by a configuration register:
; SH_MEM_CONFIG.alignment_mode

; Byte-aligned (align 1) load of <4 x i32> through an addrspace(3) pointer.
; Expected selection per the checks below: one 128-bit DS read on gfx900 and
; gfx1100, two ds_read2_b32 on gfx1010, and sixteen ds_read_u8 that are
; shifted and OR-ed back together on hawaii.
define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b128 v[0:3], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
; GFX7-NEXT:    ds_read_u8 v2, v0
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX7-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX7-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX7-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:11
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:12
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:13
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:14
; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v10
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v8
; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
; GFX7-NEXT:    v_mov_b32_e32 v0, v4
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32_align1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    ds_load_b128 v[0:3], v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
  ret <4 x i32> %load
}

; Byte-aligned (align 1) load of <3 x i32> through an addrspace(3) pointer.
; Expected selection per the checks below: one 96-bit DS read on gfx900 and
; gfx1100, ds_read2_b32 plus ds_read_b32 on gfx1010, and twelve ds_read_u8
; that are shifted and OR-ed back together on hawaii.
define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v3i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b96 v[0:2], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
; GFX7-NEXT:    ds_read_u8 v2, v0
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
; GFX7-NEXT:    ds_read_u8 v4, v0 offset:4
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:5
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:6
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:7
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:8
; GFX7-NEXT:    ds_read_u8 v9, v0 offset:9
; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX7-NEXT:    v_or_b32_e32 v3, v2, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v7
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v8
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
; GFX7-NEXT:    v_mov_b32_e32 v0, v3
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v3i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX10-NEXT:    ds_read_b32 v2, v2 offset:8
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v3i32_align1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    ds_load_b96 v[0:2], v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
  ret <3 x i32> %load
}

; Byte-aligned (align 1) store of <4 x i32> through an addrspace(3) pointer.
; Expected selection per the checks below: one 128-bit DS write on gfx900 and
; gfx1100, two ds_write2_b32 on gfx1010, and sixteen ds_write_b8 of the
; individually extracted bytes on hawaii.
define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_write_b128 v0, v[1:4]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: store_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX7-NEXT:    v_bfe_u32 v6, v1, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v1
; GFX7-NEXT:    ds_write_b8 v0, v6 offset:1
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
; GFX7-NEXT:    ds_write_b8 v0, v5 offset:2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX7-NEXT:    ds_write_b8 v0, v5 offset:5
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:7
; GFX7-NEXT:    v_bfe_u32 v2, v3, 8, 8
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; GFX7-NEXT:    ds_write_b8 v0, v3 offset:8
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:9
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:11
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
; GFX7-NEXT:    v_bfe_u32 v2, v4, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v4 offset:12
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:13
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v4
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:14
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:15
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_lds_v4i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX10-NEXT:    ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_lds_v4i32_align1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    ds_store_b128 v0, v[1:4]
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
  ret void
}

; Byte-aligned (align 1) store of <3 x i32> through an addrspace(3) pointer.
; Expected selection per the checks below: one 96-bit DS write on gfx900 and
; gfx1100, ds_write2_b32 plus ds_write_b32 on gfx1010, and twelve ds_write_b8
; of the individually extracted bytes on hawaii.
define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_write_b96 v0, v[1:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: store_lds_v3i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX7-NEXT:    v_bfe_u32 v5, v1, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v1
; GFX7-NEXT:    ds_write_b8 v0, v5 offset:1
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
; GFX7-NEXT:    ds_write_b8 v0, v4 offset:2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX7-NEXT:    ds_write_b8 v0, v4 offset:5
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:7
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; GFX7-NEXT:    v_bfe_u32 v2, v3, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v3 offset:8
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:9
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:11
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_lds_v3i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
; GFX10-NEXT:    ds_write_b32 v0, v3 offset:8
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_lds_v3i32_align1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    ds_store_b96 v0, v[1:3]
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
  ret void
}

; Byte-aligned (align 1) load of <8 x i32> through a uniform (inreg)
; addrspace(4) pointer, stored to an addrspace(1) pointer. The checks below
; expect the load to be selected as two 128-bit vector memory loads
; (global_load / buffer_load) on every tested target, with no scalar load.
define amdgpu_ps void @test_s_load_constant_v8i32_align1(<8 x i32> addrspace(4)* inreg %ptr, <8 x i32> addrspace(1)* inreg %out) {
; GFX9-LABEL: test_s_load_constant_v8i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v8, 0
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: test_s_load_constant_v8i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s4, s2
; GFX7-NEXT:    s_mov_b32 s5, s3
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: test_s_load_constant_v8i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v8, 0
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: test_s_load_constant_v8i32_align1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    v_mov_b32_e32 v8, 0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[0:1]
; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[0:1] offset:16
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[2:3]
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[2:3] offset:16
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
  %load = load <8 x i32>, <8 x i32> addrspace(4)* %ptr, align 1
  store <8 x i32> %load, <8 x i32> addrspace(1)* %out
  ret void
}