; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=-unaligned-scratch-access < %s | FileCheck --check-prefix=GFX7-ALIGNED %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX7-UNALIGNED %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX9-FLASTSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX10-FLASTSCR %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-scratch-access < %s | FileCheck --check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+unaligned-scratch-access -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX11-FLASTSCR %s

; Every function below accesses two adjacent i16 elements through an
; addrspace(5) pointer. The functions differ only in the alignment on the
; accesses (2, 1, or 4 for the first element), which decides whether the pair
; is expected to become a single dword access.
;
; Only the prefixes named on the RUN lines above are evaluated by FileCheck;
; a check line using any other prefix is silently ignored.

; Should not merge this to a dword load
; Loads %p[0] and %p[1] with align 2 and returns (zext %p[1] << 16) | zext %p[0].
define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
; GFX7-ALIGNED-LABEL: private_load_2xi16_align2:
; GFX7-ALIGNED:       ; %bb.0:
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
; GFX7-ALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_load_2xi16_align2:
; GFX7-UNALIGNED:       ; %bb.0:
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
; GFX7-UNALIGNED-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    buffer_load_ushort v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(1)
; GFX7-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_load_ushort v1, v0, s[0:3], 0 offen
; GFX9-NEXT:    buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_load_2xi16_align2:
; GFX9-FLASTSCR:       ; %bb.0:
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT:    scratch_load_ushort v1, v0, off
; GFX9-FLASTSCR-NEXT:    scratch_load_ushort v2, v0, off offset:2
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_load_2xi16_align2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_ushort v1, v0, s[0:3], 0 offen
; GFX10-NEXT:    buffer_load_ushort v2, v0, s[0:3], 0 offen offset:2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-FLASTSCR-LABEL: private_load_2xi16_align2:
; GFX10-FLASTSCR:       ; %bb.0:
; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-FLASTSCR-NEXT:    s_clause 0x1
; GFX10-FLASTSCR-NEXT:    scratch_load_ushort v1, v0, off
; GFX10-FLASTSCR-NEXT:    scratch_load_ushort v2, v0, off offset:2
; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX10-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_load_2xi16_align2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_load_u16 v1, v0, off
; GFX11-NEXT:    scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_load_2xi16_align2:
; GFX11-FLASTSCR:       ; %bb.0:
; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-FLASTSCR-NEXT:    s_clause 0x1
; GFX11-FLASTSCR-NEXT:    scratch_load_u16 v1, v0, off
; GFX11-FLASTSCR-NEXT:    scratch_load_u16 v0, v0, off offset:2
; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FLASTSCR-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
  %p.0 = load i16, i16 addrspace(5)* %p, align 2
  %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2
  %zext.0 = zext i16 %p.0 to i32
  %zext.1 = zext i16 %p.1 to i32
  %shl.1 = shl i32 %zext.1, 16
  %or = or i32 %zext.0, %shl.1
  ret i32 %or
}

; Should not merge this to a dword store
; Stores the constants 1 and 2 to %r[0] and %r[1] with align 2; %p is unused.
define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
; GFX7-ALIGNED-LABEL: private_store_2xi16_align2:
; GFX7-ALIGNED:       ; %bb.0:
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 1
; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, 2
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v1
; GFX7-ALIGNED-NEXT:    buffer_store_short v3, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_store_2xi16_align2:
; GFX7-UNALIGNED:       ; %bb.0:
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v3, 1
; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 2
; GFX7-UNALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v1
; GFX7-UNALIGNED-NEXT:    buffer_store_short v3, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    buffer_store_short v0, v2, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_store_2xi16_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
; GFX9-NEXT:    v_mov_b32_e32 v0, 2
; GFX9-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen offset:2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_store_2xi16_align2:
; GFX9-FLASTSCR:       ; %bb.0:
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-FLASTSCR-NEXT:    scratch_store_short v1, v0, off
; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 2
; GFX9-FLASTSCR-NEXT:    scratch_store_short v1, v0, off offset:2
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_store_2xi16_align2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-NEXT:    buffer_store_short v0, v1, s[0:3], 0 offen
; GFX10-NEXT:    buffer_store_short v2, v1, s[0:3], 0 offen offset:2
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-FLASTSCR-LABEL: private_store_2xi16_align2:
; GFX10-FLASTSCR:       ; %bb.0:
; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-FLASTSCR-NEXT:    v_mov_b32_e32 v2, 2
; GFX10-FLASTSCR-NEXT:    scratch_store_short v1, v0, off
; GFX10-FLASTSCR-NEXT:    scratch_store_short v1, v2, off offset:2
; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_store_2xi16_align2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 1
; GFX11-NEXT:    v_mov_b32_e32 v2, 2
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_store_b16 v1, v0, off
; GFX11-NEXT:    scratch_store_b16 v1, v2, off offset:2
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_store_2xi16_align2:
; GFX11-FLASTSCR:       ; %bb.0:
; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 1
; GFX11-FLASTSCR-NEXT:    v_mov_b32_e32 v2, 2
; GFX11-FLASTSCR-NEXT:    s_clause 0x1
; GFX11-FLASTSCR-NEXT:    scratch_store_b16 v1, v0, off
; GFX11-FLASTSCR-NEXT:    scratch_store_b16 v1, v2, off offset:2
; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
  store i16 1, i16 addrspace(5)* %r, align 2
  store i16 2, i16 addrspace(5)* %gep.r, align 2
  ret void
}

; Should produce align 1 dword when legal
; Same load pattern as the align 2 case, but both loads are align 1.
define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 {
; GFX7-ALIGNED-LABEL: private_load_2xi16_align1:
; GFX7-ALIGNED:       ; %bb.0:
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 2, v0
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v3, vcc, 3, v0
; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v3, v3, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(3)
; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(2)
; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(1)
; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v1, v3, v1
; GFX7-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_load_2xi16_align1:
; GFX7-UNALIGNED:       ; %bb.0:
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_mov_b32 s4, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfi_b32 v1, v1, 0, v0
; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_load_2xi16_align1:
; GFX9-FLASTSCR:       ; %bb.0:
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-FLASTSCR-NEXT:    s_mov_b32 s0, 0xffff
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT:    v_bfi_b32 v1, v1, 0, v0
; GFX9-FLASTSCR-NEXT:    v_and_or_b32 v0, v0, s0, v1
; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_load_2xi16_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, 0, v0
; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-FLASTSCR-LABEL: private_load_2xi16_align1:
; GFX10-FLASTSCR:       ; %bb.0:
; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX10-FLASTSCR-NEXT:    v_bfi_b32 v1, 0xffff, 0, v0
; GFX10-FLASTSCR-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_load_2xi16_align1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, 0, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_load_2xi16_align1:
; GFX11-FLASTSCR:       ; %bb.0:
; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FLASTSCR-NEXT:    v_bfi_b32 v1, 0xffff, 0, v0
; GFX11-FLASTSCR-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLASTSCR-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
  %p.0 = load i16, i16 addrspace(5)* %p, align 1
  %p.1 = load i16, i16 addrspace(5)* %gep.p, align 1
  %zext.0 = zext i16 %p.0 to i32
  %zext.1 = zext i16 %p.1 to i32
  %shl.1 = shl i32 %zext.1, 16
  %or = or i32 %zext.0, %shl.1
  ret i32 %or
}

; Should produce align 1 dword when legal
; Same store pattern as the align 2 case, but both stores are align 1.
define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
; GFX7-ALIGNED-LABEL: private_store_2xi16_align1:
; GFX7-ALIGNED:       ; %bb.0:
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 1
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v2, vcc, 2, v1
; GFX7-ALIGNED-NEXT:    buffer_store_byte v3, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v4, 0
; GFX7-ALIGNED-NEXT:    v_add_i32_e32 v1, vcc, 3, v1
; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, 2
; GFX7-ALIGNED-NEXT:    buffer_store_byte v4, v3, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    buffer_store_byte v4, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    buffer_store_byte v0, v2, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_store_2xi16_align1:
; GFX7-UNALIGNED:       ; %bb.0:
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX7-UNALIGNED-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_store_2xi16_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_store_2xi16_align1:
; GFX9-FLASTSCR:       ; %bb.0:
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX9-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_store_2xi16_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-FLASTSCR-LABEL: private_store_2xi16_align1:
; GFX10-FLASTSCR:       ; %bb.0:
; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX10-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_store_2xi16_align1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX11-NEXT:    scratch_store_b32 v1, v0, off
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_store_2xi16_align1:
; GFX11-FLASTSCR:       ; %bb.0:
; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX11-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
  store i16 1, i16 addrspace(5)* %r, align 1
  store i16 2, i16 addrspace(5)* %gep.r, align 1
  ret void
}

; Should merge this to a dword load
; The first load is align 4 and the second (at %p + 1 element) is align 2.
define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 {
; GFX7-ALIGNED-LABEL: private_load_2xi16_align4:
; GFX7-ALIGNED:       ; %bb.0:
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_load_2xi16_align4:
; GFX7-UNALIGNED:       ; %bb.0:
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_load_2xi16_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT:    s_mov_b32 s4, 0xffff
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_bfi_b32 v1, v1, 0, v0
; GFX9-NEXT:    v_and_or_b32 v0, v0, s4, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_load_2xi16_align4:
; GFX9-FLASTSCR:       ; %bb.0:
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v1, 0xffff
; GFX9-FLASTSCR-NEXT:    s_mov_b32 s0, 0xffff
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT:    v_bfi_b32 v1, v1, 0, v0
; GFX9-FLASTSCR-NEXT:    v_and_or_b32 v0, v0, s0, v1
; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_load_2xi16_align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_bfi_b32 v1, 0xffff, 0, v0
; GFX10-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-FLASTSCR-LABEL: private_load_2xi16_align4:
; GFX10-FLASTSCR:       ; %bb.0:
; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-FLASTSCR-NEXT:    scratch_load_dword v0, v0, off
; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX10-FLASTSCR-NEXT:    v_bfi_b32 v1, 0xffff, 0, v0
; GFX10-FLASTSCR-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_load_2xi16_align4:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v0, off
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_bfi_b32 v1, 0xffff, 0, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_load_2xi16_align4:
; GFX11-FLASTSCR:       ; %bb.0:
; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-FLASTSCR-NEXT:    scratch_load_b32 v0, v0, off
; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX11-FLASTSCR-NEXT:    v_bfi_b32 v1, 0xffff, 0, v0
; GFX11-FLASTSCR-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLASTSCR-NEXT:    v_and_or_b32 v0, 0xffff, v0, v1
; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
  %p.0 = load i16, i16 addrspace(5)* %p, align 4
  %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2
  %zext.0 = zext i16 %p.0 to i32
  %zext.1 = zext i16 %p.1 to i32
  %shl.1 = shl i32 %zext.1, 16
  %or = or i32 %zext.0, %shl.1
  ret i32 %or
}

; Should merge this to a dword store
; The first store is align 4 and the second (at %r + 1 element) is align 2;
; %p is unused.
define void @private_store_2xi16_align4(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
; GFX7-ALIGNED-LABEL: private_store_2xi16_align4:
; GFX7-ALIGNED:       ; %bb.0:
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX7-ALIGNED-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-ALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-ALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: private_store_2xi16_align4:
; GFX7-UNALIGNED:       ; %bb.0:
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX7-UNALIGNED-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX7-UNALIGNED-NEXT:    s_waitcnt vmcnt(0)
; GFX7-UNALIGNED-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: private_store_2xi16_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-FLASTSCR-LABEL: private_store_2xi16_align4:
; GFX9-FLASTSCR:       ; %bb.0:
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX9-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
; GFX9-FLASTSCR-NEXT:    s_waitcnt vmcnt(0)
; GFX9-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_store_2xi16_align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX10-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-FLASTSCR-LABEL: private_store_2xi16_align4:
; GFX10-FLASTSCR:       ; %bb.0:
; GFX10-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX10-FLASTSCR-NEXT:    scratch_store_dword v1, v0, off
; GFX10-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_store_2xi16_align4:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX11-NEXT:    scratch_store_b32 v1, v0, off
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-FLASTSCR-LABEL: private_store_2xi16_align4:
; GFX11-FLASTSCR:       ; %bb.0:
; GFX11-FLASTSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-FLASTSCR-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX11-FLASTSCR-NEXT:    scratch_store_b32 v1, v0, off
; GFX11-FLASTSCR-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-FLASTSCR-NEXT:    s_setpc_b64 s[30:31]
  %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
  store i16 1, i16 addrspace(5)* %r, align 4
  store i16 2, i16 addrspace(5)* %gep.r, align 2
  ret void
}