; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s

; Each function below loads a <4 x i32> (16 bytes) through an addrspace(3)
; pointer and returns it. The functions differ only in the alignment stated on
; the load (none, 1, 2, 4, 8, 16). The expected assembly is recorded separately
; for the four -mcpu values used above (gfx900, hawaii, tahiti, gfx1010).
;
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

; No explicit alignment on the load. The expected code is the same as for the
; align-16 function at the end of the file: a single ds_read_b128 everywhere
; except tahiti, which is expected to use two ds_read_b64.
define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b128 v[0:3], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_b128 v[0:3], v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    ds_read_b64 v[2:3], v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_b128 v[0:3], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr
  ret <4 x i32> %load
}

; align 1: every target is expected to issue sixteen ds_read_u8 loads and
; rebuild the four dwords with shift/or instructions.
define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_u8 v1, v0
; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX9-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX9-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX9-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX9-NEXT:    ds_read_u8 v12, v0 offset:11
; GFX9-NEXT:    ds_read_u8 v13, v0 offset:12
; GFX9-NEXT:    ds_read_u8 v14, v0 offset:13
; GFX9-NEXT:    ds_read_u8 v15, v0 offset:14
; GFX9-NEXT:    ds_read_u8 v16, v0 offset:15
; GFX9-NEXT:    s_waitcnt lgkmcnt(14)
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(12)
; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_lshl_or_b32 v3, v14, 8, v13
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v4, v16, 8, v15
; GFX9-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_u8 v1, v0 offset:6
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:4
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX7-NEXT:    ds_read_u8 v4, v0 offset:1
; GFX7-NEXT:    ds_read_u8 v5, v0
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:3
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v4, v3, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:15
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:14
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:13
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:12
; GFX7-NEXT:    ds_read_u8 v9, v0 offset:11
; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
; GFX7-NEXT:    ds_read_u8 v11, v0 offset:9
; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v11
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v10
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v6
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v0
; GFX7-NEXT:    v_mov_b32_e32 v0, v4
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align1:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 5, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 7, v0
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 6, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 9, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 11, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_u8 v1, v1
; GFX6-NEXT:    ds_read_u8 v2, v2
; GFX6-NEXT:    ds_read_u8 v3, v3
; GFX6-NEXT:    ds_read_u8 v4, v4
; GFX6-NEXT:    ds_read_u8 v5, v5
; GFX6-NEXT:    ds_read_u8 v6, v6
; GFX6-NEXT:    ds_read_u8 v7, v7
; GFX6-NEXT:    ds_read_u8 v8, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 10, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 13, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 15, v0
; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 14, v0
; GFX6-NEXT:    v_add_i32_e32 v10, vcc, 3, v0
; GFX6-NEXT:    v_add_i32_e32 v11, vcc, 2, v0
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
; GFX6-NEXT:    ds_read_u8 v4, v4
; GFX6-NEXT:    ds_read_u8 v5, v5
; GFX6-NEXT:    ds_read_u8 v6, v6
; GFX6-NEXT:    ds_read_u8 v7, v7
; GFX6-NEXT:    ds_read_u8 v9, v9
; GFX6-NEXT:    ds_read_u8 v10, v10
; GFX6-NEXT:    ds_read_u8 v11, v11
; GFX6-NEXT:    ds_read_u8 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v7
; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
; GFX6-NEXT:    v_or_b32_e32 v4, v4, v9
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_or_b32_e32 v4, v4, v11
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v8
; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_u8 v1, v0
; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX10-NEXT:    ds_read_u8 v12, v0 offset:11
; GFX10-NEXT:    ds_read_u8 v13, v0 offset:12
; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
; GFX10-NEXT:    ds_read_u8 v15, v0 offset:14
; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
; GFX10-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
; GFX10-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
; GFX10-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
; GFX10-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
  ret <4 x i32> %load
}

; align 2: every target is expected to issue eight ds_read_u16 loads and
; combine each pair of halves into one dword.
define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align2:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_u16 v1, v0
; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
; GFX9-NEXT:    ds_read_u16 v7, v0 offset:12
; GFX9-NEXT:    ds_read_u16 v8, v0 offset:14
; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align2:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_u16 v3, v0 offset:12
; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
; GFX7-NEXT:    ds_read_u16 v1, v0 offset:4
; GFX7-NEXT:    ds_read_u16 v4, v0 offset:2
; GFX7-NEXT:    ds_read_u16 v5, v0
; GFX7-NEXT:    ds_read_u16 v6, v0 offset:6
; GFX7-NEXT:    ds_read_u16 v7, v0 offset:10
; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align2:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 6, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 10, v0
; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 14, v0
; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_u16 v1, v1
; GFX6-NEXT:    ds_read_u16 v2, v2
; GFX6-NEXT:    ds_read_u16 v3, v3
; GFX6-NEXT:    ds_read_u16 v4, v4
; GFX6-NEXT:    ds_read_u16 v5, v5
; GFX6-NEXT:    ds_read_u16 v6, v6
; GFX6-NEXT:    ds_read_u16 v7, v7
; GFX6-NEXT:    ds_read_u16 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_u16 v1, v0
; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
; GFX10-NEXT:    ds_read_u16 v4, v0 offset:6
; GFX10-NEXT:    ds_read_u16 v5, v0 offset:8
; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
; GFX10-NEXT:    ds_read_u16 v7, v0 offset:12
; GFX10-NEXT:    ds_read_u16 v8, v0 offset:14
; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2
  ret <4 x i32> %load
}

; align 4: gfx900, hawaii and gfx1010 are expected to use two ds_read2_b32;
; tahiti is expected to use four separate ds_read_b32.
define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align4:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align4:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX7-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align4:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    ds_read_b32 v2, v2
; GFX6-NEXT:    ds_read_b32 v3, v3
; GFX6-NEXT:    ds_read_b32 v0, v0
; GFX6-NEXT:    ds_read_b32 v1, v1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align4:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v2, v0
; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
  ret <4 x i32> %load
}

; align 8: gfx900, hawaii and gfx1010 are expected to use one ds_read2_b64;
; tahiti is expected to use two ds_read_b64.
define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align8:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    ds_read_b64 v[2:3], v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
  ret <4 x i32> %load
}

; align 16: gfx900, hawaii and gfx1010 are expected to use a single
; ds_read_b128; tahiti is expected to use two ds_read_b64.
define <4 x i32> @load_lds_v4i32_align16(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b128 v[0:3], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align16:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    ds_read_b128 v[0:3], v0
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX6-LABEL: load_lds_v4i32_align16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_mov_b32_e32 v2, v0
; GFX6-NEXT:    s_mov_b32 m0, -1
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
; GFX6-NEXT:    ds_read_b64 v[0:1], v0
; GFX6-NEXT:    ds_read_b64 v[2:3], v2
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_b128 v[0:3], v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16
  ret <4 x i32> %load
}