; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s

; Each function below loads a <4 x i32> through an addrspace(3) pointer and
; returns it. The functions differ only in the alignment attached to the load;
; the check lines record the code llc emits for each -mcpu value used above.
; The check lines are generated: rerun utils/update_llc_test_checks.py rather
; than editing them by hand.

; Load with no explicit alignment on the load instruction.
define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b128 v[0:3], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_b128 v[0:3], v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    ds_read_b64 v[2:3], v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_b128 v[0:3], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    ds_load_b128 v[0:3], v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr
  ret <4 x i32> %load
}

; Load with `align 1`. The expected output reads the 16 bytes one at a time
; (ds_read_u8 / ds_load_u8) and reassembles them with shifts and ORs.
define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_u8 v1, v0
; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX9-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX9-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX9-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX9-NEXT:    ds_read_u8 v12, v0 offset:11
; GFX9-NEXT:    ds_read_u8 v13, v0 offset:12
; GFX9-NEXT:    ds_read_u8 v14, v0 offset:13
; GFX9-NEXT:    ds_read_u8 v15, v0 offset:14
; GFX9-NEXT:    ds_read_u8 v16, v0 offset:15
; GFX9-NEXT:    s_waitcnt lgkmcnt(14)
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(12)
; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_lshl_or_b32 v3, v14, 8, v13
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v4, v16, 8, v15
; GFX9-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_u8 v1, v0 offset:6
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:4
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX7-NEXT:    ds_read_u8 v4, v0 offset:1
; GFX7-NEXT:    ds_read_u8 v5, v0
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:3
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v4, v3, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:15
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:14
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:13
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:12
; GFX7-NEXT:    ds_read_u8 v9, v0 offset:11
; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
; GFX7-NEXT:    ds_read_u8 v11, v0 offset:9
; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v11
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v10
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v6
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, v4
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align1:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 5, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 7, v0
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 6, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 9, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 11, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_u8 v1, v1
; GFX6-NEXT:    ds_read_u8 v2, v2
; GFX6-NEXT:    ds_read_u8 v3, v3
; GFX6-NEXT:    ds_read_u8 v4, v4
; GFX6-NEXT:    ds_read_u8 v5, v5
; GFX6-NEXT:    ds_read_u8 v6, v6
; GFX6-NEXT:    ds_read_u8 v7, v7
; GFX6-NEXT:    ds_read_u8 v8, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 10, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 13, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 15, v0
; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 14, v0
; GFX6-NEXT:    v_add_i32_e32 v10, vcc, 3, v0
; GFX6-NEXT:    v_add_i32_e32 v11, vcc, 2, v0
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
; GFX6-NEXT:    ds_read_u8 v4, v4
; GFX6-NEXT:    ds_read_u8 v5, v5
; GFX6-NEXT:    ds_read_u8 v6, v6
; GFX6-NEXT:    ds_read_u8 v7, v7
; GFX6-NEXT:    ds_read_u8 v9, v9
; GFX6-NEXT:    ds_read_u8 v10, v10
; GFX6-NEXT:    ds_read_u8 v11, v11
; GFX6-NEXT:    ds_read_u8 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v7
; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
; GFX6-NEXT:    v_or_b32_e32 v4, v4, v9
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_or_b32_e32 v4, v4, v11
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v8
; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_u8 v1, v0
; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX10-NEXT:    ds_read_u8 v12, v0 offset:11
; GFX10-NEXT:    ds_read_u8 v13, v0 offset:12
; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
; GFX10-NEXT:    ds_read_u8 v15, v0 offset:14
; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
; GFX10-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
; GFX10-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
; GFX10-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
; GFX10-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32_align1:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    ds_load_u8 v1, v0
; GFX11-NEXT:    ds_load_u8 v2, v0 offset:1
; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
; GFX11-NEXT:    ds_load_u8 v4, v0 offset:3
; GFX11-NEXT:    ds_load_u8 v5, v0 offset:4
; GFX11-NEXT:    ds_load_u8 v6, v0 offset:5
; GFX11-NEXT:    ds_load_u8 v7, v0 offset:6
; GFX11-NEXT:    ds_load_u8 v8, v0 offset:7
; GFX11-NEXT:    ds_load_u8 v9, v0 offset:8
; GFX11-NEXT:    ds_load_u8 v10, v0 offset:9
; GFX11-NEXT:    ds_load_u8 v11, v0 offset:10
; GFX11-NEXT:    ds_load_u8 v12, v0 offset:11
; GFX11-NEXT:    ds_load_u8 v13, v0 offset:12
; GFX11-NEXT:    ds_load_u8 v14, v0 offset:13
; GFX11-NEXT:    ds_load_u8 v15, v0 offset:14
; GFX11-NEXT:    ds_load_u8 v0, v0 offset:15
; GFX11-NEXT:    s_waitcnt lgkmcnt(14)
; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; GFX11-NEXT:    s_waitcnt lgkmcnt(12)
; GFX11-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
; GFX11-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
; GFX11-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
; GFX11-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
; GFX11-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
; GFX11-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
  ret <4 x i32> %load
}

; Load with `align 2`. The expected output reads eight 16-bit pieces
; (ds_read_u16 / ds_load_u16) and combines each pair into one 32-bit lane.
define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_u16 v1, v0
; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
; GFX9-NEXT:    ds_read_u16 v7, v0 offset:12
; GFX9-NEXT:    ds_read_u16 v8, v0 offset:14
; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_u16 v3, v0 offset:12
; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
; GFX7-NEXT:    ds_read_u16 v1, v0 offset:4
; GFX7-NEXT:    ds_read_u16 v4, v0 offset:2
; GFX7-NEXT:    ds_read_u16 v5, v0
; GFX7-NEXT:    ds_read_u16 v6, v0 offset:6
; GFX7-NEXT:    ds_read_u16 v7, v0 offset:10
; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align2:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 6, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 10, v0
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 14, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_u16 v1, v1
; GFX6-NEXT:    ds_read_u16 v2, v2
; GFX6-NEXT:    ds_read_u16 v3, v3
; GFX6-NEXT:    ds_read_u16 v4, v4
; GFX6-NEXT:    ds_read_u16 v5, v5
; GFX6-NEXT:    ds_read_u16 v6, v6
; GFX6-NEXT:    ds_read_u16 v7, v7
; GFX6-NEXT:    ds_read_u16 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_u16 v1, v0
; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
; GFX10-NEXT:    ds_read_u16 v4, v0 offset:6
; GFX10-NEXT:    ds_read_u16 v5, v0 offset:8
; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
; GFX10-NEXT:    ds_read_u16 v7, v0 offset:12
; GFX10-NEXT:    ds_read_u16 v8, v0 offset:14
; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32_align2:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    ds_load_u16 v1, v0
; GFX11-NEXT:    ds_load_u16 v2, v0 offset:2
; GFX11-NEXT:    ds_load_u16 v3, v0 offset:4
; GFX11-NEXT:    ds_load_u16 v4, v0 offset:6
; GFX11-NEXT:    ds_load_u16 v5, v0 offset:8
; GFX11-NEXT:    ds_load_u16 v6, v0 offset:10
; GFX11-NEXT:    ds_load_u16 v7, v0 offset:12
; GFX11-NEXT:    ds_load_u16 v8, v0 offset:14
; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2
  ret <4 x i32> %load
}

; Load with `align 4`. The expected output uses 32-bit reads: two paired
; reads (ds_read2_b32 / ds_load_2addr_b32) on most targets, four ds_read_b32
; on tahiti.
define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX7-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align4:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_b32 v2, v2
; GFX6-NEXT:    ds_read_b32 v3, v3
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    ds_read_b32 v1, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32_align4:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v2, v0
; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX11-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
  ret <4 x i32> %load
}

; Load with `align 8`. The expected output uses 64-bit reads: one paired
; read (ds_read2_b64 / ds_load_2addr_b64) on most targets, two ds_read_b64
; on tahiti.
define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    ds_read_b64 v[2:3], v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32_align8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    ds_load_2addr_b64 v[0:3], v0 offset1:1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
  ret <4 x i32> %load
}

; Load with `align 16`. The expected output matches the load with no explicit
; alignment: a single 128-bit read except on tahiti, which uses two
; ds_read_b64.
define <4 x i32> @load_lds_v4i32_align16(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b128 v[0:3], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_b128 v[0:3], v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    ds_read_b64 v[2:3], v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_b128 v[0:3], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: load_lds_v4i32_align16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    ds_load_b128 v[0:3], v0
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16
  ret <4 x i32> %load
}