; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX9 %s
; RUN: FileCheck --enable-var-scope --check-prefix=DBG %s < %t
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX10 %s
; RUN: FileCheck --enable-var-scope --check-prefix=DBG %s < %t
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX11 %s
; RUN: FileCheck --enable-var-scope --check-prefixes=DBG,DBG11 %s < %t
; REQUIRES: asserts

; FIXME: Verifier error with xnack enabled.

; Each test is checked twice: the GFX9/GFX10/GFX11 prefixes verify the final
; ISA emitted by llc on stdout, while the DBG/DBG11 prefixes verify the
; machine scheduler's debug trace captured on stderr (redirected to %t by the
; RUN lines) - i.e. which memory operations the scheduler chose to cluster.
; The "Num BaseOps ... Width" lines come from the target's memory-operand
; query; the "Cluster ld/st SU(a) - SU(b)" lines record each clustered pair.

; Four loads from adjacent dwords feeding four stores to adjacent dwords:
; the loads are expected to cluster into one chain and the stores into
; another (the store chain is only formed on gfx11, hence DBG11).

; DBG-LABEL: cluster_load_cluster_store:

; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 8
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 8
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4

; DBG: Cluster ld/st SU(1) - SU(2)

; DBG: Cluster ld/st SU([[L1:[0-9]+]]) - SU([[L2:[0-9]+]])
; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]])
; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]])

; DBG11: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
; DBG11: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
; DBG11: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])

; DBG-NOT: Cluster ld/st

define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) {
; GFX9-LABEL: cluster_load_cluster_store:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    flat_load_dword v2, v[0:1]
; GFX9-NEXT:    flat_load_dword v3, v[0:1] offset:8
; GFX9-NEXT:    flat_load_dword v4, v[0:1] offset:16
; GFX9-NEXT:    flat_load_dword v5, v[0:1] offset:24
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_store_dword v[0:1], v2
; GFX9-NEXT:    flat_store_dword v[0:1], v3 offset:8
; GFX9-NEXT:    flat_store_dword v[0:1], v4 offset:16
; GFX9-NEXT:    flat_store_dword v[0:1], v5 offset:24
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: cluster_load_cluster_store:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s0, s2, 8
; GFX10-NEXT:    s_addc_u32 s1, s3, 0
; GFX10-NEXT:    s_add_u32 s6, s2, 16
; GFX10-NEXT:    v_mov_b32_e32 v3, s1
; GFX10-NEXT:    s_addc_u32 s7, s3, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    v_mov_b32_e32 v2, s0
; GFX10-NEXT:    s_add_u32 s0, s2, 24
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    s_addc_u32 s1, s3, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, s6
; GFX10-NEXT:    v_mov_b32_e32 v5, s7
; GFX10-NEXT:    v_mov_b32_e32 v7, s1
; GFX10-NEXT:    v_mov_b32_e32 v6, s0
; GFX10-NEXT:    s_clause 0x3
; GFX10-NEXT:    flat_load_dword v8, v[0:1]
; GFX10-NEXT:    flat_load_dword v9, v[2:3]
; GFX10-NEXT:    flat_load_dword v10, v[4:5]
; GFX10-NEXT:    flat_load_dword v11, v[6:7]
; GFX10-NEXT:    s_add_u32 s0, s4, 8
; GFX10-NEXT:    s_addc_u32 s1, s5, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-NEXT:    v_mov_b32_e32 v3, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s0
; GFX10-NEXT:    s_add_u32 s0, s4, 16
; GFX10-NEXT:    s_addc_u32 s1, s5, 0
; GFX10-NEXT:    s_add_u32 s2, s4, 24
; GFX10-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-NEXT:    s_addc_u32 s3, s5, 0
; GFX10-NEXT:    v_mov_b32_e32 v5, s1
; GFX10-NEXT:    v_mov_b32_e32 v4, s0
; GFX10-NEXT:    v_mov_b32_e32 v7, s3
; GFX10-NEXT:    v_mov_b32_e32 v6, s2
; GFX10-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; GFX10-NEXT:    flat_store_dword v[0:1], v8
; GFX10-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
; GFX10-NEXT:    flat_store_dword v[2:3], v9
; GFX10-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
; GFX10-NEXT:    flat_store_dword v[4:5], v10
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX10-NEXT:    flat_store_dword v[6:7], v11
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: cluster_load_cluster_store:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x2c
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    flat_load_b32 v2, v[0:1]
; GFX11-NEXT:    flat_load_b32 v3, v[0:1] offset:8
; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:16
; GFX11-NEXT:    flat_load_b32 v5, v[0:1] offset:24
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; GFX11-NEXT:    flat_store_b32 v[0:1], v2
; GFX11-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
; GFX11-NEXT:    flat_store_b32 v[0:1], v3 offset:8
; GFX11-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
; GFX11-NEXT:    flat_store_b32 v[0:1], v4 offset:16
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX11-NEXT:    flat_store_b32 v[0:1], v5 offset:24
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
bb:
  %la0 = getelementptr inbounds i32, i32* %lb, i32 0
  %ld0 = load i32, i32* %la0
  %la1 = getelementptr inbounds i32, i32* %lb, i32 2
  %ld1 = load i32, i32* %la1
  %la2 = getelementptr inbounds i32, i32* %lb, i32 4
  %ld2 = load i32, i32* %la2
  %la3 = getelementptr inbounds i32, i32* %lb, i32 6
  %ld3 = load i32, i32* %la3

  %sa0 = getelementptr inbounds i32, i32* %sb, i32 0
  store i32 %ld0, i32* %sa0
  %sa1 = getelementptr inbounds i32, i32* %sb, i32 2
  store i32 %ld1, i32* %sa1
  %sa2 = getelementptr inbounds i32, i32* %sb, i32 4
  store i32 %ld2, i32* %sa2
  %sa3 = getelementptr inbounds i32, i32* %sb, i32 6
  store i32 %ld3, i32* %sa3

  ret void
}

; Same shape as the previous test, but one loaded value passes through a
; VALU add before being stored; the expected clustering (and hence the DBG
; checks below) is unchanged.

; DBG-LABEL: cluster_load_valu_cluster_store:

; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 8
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 8
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 4

; DBG: Cluster ld/st SU(1) - SU(2)

; DBG: Cluster ld/st SU([[L1:[0-9]+]]) - SU([[L2:[0-9]+]])
; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]])
; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]])

; DBG11: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
; DBG11: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
; DBG11: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])

; DBG-NOT: Cluster ld/st

define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32* noalias %sb) {
; GFX9-LABEL: cluster_load_valu_cluster_store:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_mov_b32_e32 v1, s3
; GFX9-NEXT:    flat_load_dword v2, v[0:1]
; GFX9-NEXT:    flat_load_dword v3, v[0:1] offset:8
; GFX9-NEXT:    flat_load_dword v4, v[0:1] offset:16
; GFX9-NEXT:    flat_load_dword v5, v[0:1] offset:24
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_store_dword v[0:1], v2
; GFX9-NEXT:    v_add_u32_e32 v2, 1, v3
; GFX9-NEXT:    flat_store_dword v[0:1], v4 offset:16
; GFX9-NEXT:    flat_store_dword v[0:1], v2 offset:8
; GFX9-NEXT:    flat_store_dword v[0:1], v5 offset:24
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: cluster_load_valu_cluster_store:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_add_u32 s0, s2, 8
; GFX10-NEXT:    s_addc_u32 s1, s3, 0
; GFX10-NEXT:    s_add_u32 s6, s2, 16
; GFX10-NEXT:    v_mov_b32_e32 v3, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s0
; GFX10-NEXT:    s_addc_u32 s7, s3, 0
; GFX10-NEXT:    s_add_u32 s0, s2, 24
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    s_addc_u32 s1, s3, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, s3
; GFX10-NEXT:    v_mov_b32_e32 v4, s6
; GFX10-NEXT:    v_mov_b32_e32 v5, s7
; GFX10-NEXT:    flat_load_dword v6, v[2:3]
; GFX10-NEXT:    v_mov_b32_e32 v3, s1
; GFX10-NEXT:    v_mov_b32_e32 v2, s0
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    flat_load_dword v8, v[0:1]
; GFX10-NEXT:    flat_load_dword v9, v[4:5]
; GFX10-NEXT:    flat_load_dword v10, v[2:3]
; GFX10-NEXT:    s_add_u32 s0, s4, 8
; GFX10-NEXT:    s_addc_u32 s1, s5, 0
; GFX10-NEXT:    s_add_u32 s2, s4, 16
; GFX10-NEXT:    v_mov_b32_e32 v3, s1
; GFX10-NEXT:    s_addc_u32 s3, s5, 0
; GFX10-NEXT:    v_mov_b32_e32 v0, s4
; GFX10-NEXT:    v_mov_b32_e32 v2, s0
; GFX10-NEXT:    s_add_u32 s0, s4, 24
; GFX10-NEXT:    v_mov_b32_e32 v1, s5
; GFX10-NEXT:    v_mov_b32_e32 v5, s3
; GFX10-NEXT:    s_addc_u32 s1, s5, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, s2
; GFX10-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v6
; GFX10-NEXT:    v_mov_b32_e32 v7, s1
; GFX10-NEXT:    v_mov_b32_e32 v6, s0
; GFX10-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; GFX10-NEXT:    flat_store_dword v[0:1], v8
; GFX10-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
; GFX10-NEXT:    flat_store_dword v[4:5], v9
; GFX10-NEXT:    flat_store_dword v[2:3], v11
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX10-NEXT:    flat_store_dword v[6:7], v10
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: cluster_load_valu_cluster_store:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x2c
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    flat_load_b32 v2, v[0:1] offset:8
; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
; GFX11-NEXT:    flat_load_b32 v4, v[0:1] offset:16
; GFX11-NEXT:    flat_load_b32 v5, v[0:1] offset:24
; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
; GFX11-NEXT:    v_add_nc_u32_e32 v2, 1, v2
; GFX11-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    flat_store_b32 v[0:1], v3
; GFX11-NEXT:    flat_store_b32 v[0:1], v2 offset:8
; GFX11-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
; GFX11-NEXT:    flat_store_b32 v[0:1], v4 offset:16
; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX11-NEXT:    flat_store_b32 v[0:1], v5 offset:24
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
bb:
  %la0 = getelementptr inbounds i32, i32* %lb, i32 0
  %ld0 = load i32, i32* %la0
  %la1 = getelementptr inbounds i32, i32* %lb, i32 2
  %ld1 = load i32, i32* %la1
  %la2 = getelementptr inbounds i32, i32* %lb, i32 4
  %ld2 = load i32, i32* %la2
  %la3 = getelementptr inbounds i32, i32* %lb, i32 6
  %ld3 = load i32, i32* %la3

  %sa0 = getelementptr inbounds i32, i32* %sb, i32 0
  store i32 %ld0, i32* %sa0
  %sa1 = getelementptr inbounds i32, i32* %sb, i32 2
  %add = add i32 %ld1, 1
  store i32 %add, i32* %sa1
  %sa2 = getelementptr inbounds i32, i32* %sb, i32 4
  store i32 %ld2, i32* %sa2
  %sa3 = getelementptr inbounds i32, i32* %sb, i32 6
  store i32 %ld3, i32* %sa3

  ret void
}

; Cluster loads from the same texture with different coordinates
; DBG-LABEL: cluster_image_load:
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG: {{^}}Cluster ld/st [[SU1:SU\([0-9]+\)]] - [[SU2:SU\([0-9]+\)]]
; DBG: {{^}}[[SU1]]: {{.*}} IMAGE_LOAD
; DBG: {{^}}[[SU2]]: {{.*}} IMAGE_LOAD
define amdgpu_ps void @cluster_image_load(<8 x i32> inreg %src, <8 x i32> inreg %dst, i32 %x, i32 %y) {
; GFX9-LABEL: cluster_image_load:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
; GFX9-NEXT:    v_add_u32_e32 v3, 1, v1
; GFX9-NEXT:    v_add_u32_e32 v6, 2, v0
; GFX9-NEXT:    v_add_u32_e32 v7, 2, v1
; GFX9-NEXT:    image_load v[2:5], v[2:3], s[0:7] dmask:0xf unorm
; GFX9-NEXT:    image_load v[6:9], v[6:7], s[0:7] dmask:0xf unorm
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f32_e32 v5, v5, v9
; GFX9-NEXT:    v_add_f32_e32 v4, v4, v8
; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
; GFX9-NEXT:    v_add_f32_e32 v2, v2, v6
; GFX9-NEXT:    image_store v[2:5], v[0:1], s[8:15] dmask:0xf unorm
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: cluster_image_load:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v12, 2, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v13, 2, v1
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    image_load v[2:5], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT:    image_load v[6:9], v[12:13], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_f32_e32 v5, v5, v9
; GFX10-NEXT:    v_add_f32_e32 v4, v4, v8
; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
; GFX10-NEXT:    image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: cluster_image_load:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    v_add_nc_u32_e32 v2, 1, v0
; GFX11-NEXT:    v_add_nc_u32_e32 v3, 1, v1
; GFX11-NEXT:    v_add_nc_u32_e32 v6, 2, v0
; GFX11-NEXT:    v_add_nc_u32_e32 v7, 2, v1
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    image_load v[2:5], v[2:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT:    image_load v[6:9], v[6:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v5, v5, v9
; GFX11-NEXT:    v_dual_add_f32 v4, v4, v8 :: v_dual_add_f32 v3, v3, v7
; GFX11-NEXT:    image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %x1 = add i32 %x, 1
  %y1 = add i32 %y, 1
  %val1 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %x1, i32 %y1, <8 x i32> %src, i32 0, i32 0)
  %x2 = add i32 %x, 2
  %y2 = add i32 %y, 2
  %val2 = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %x2, i32 %y2, <8 x i32> %src, i32 0, i32 0)
  %val = fadd fast <4 x float> %val1, %val2
  call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %val, i32 15, i32 %x, i32 %y, <8 x i32> %dst, i32 0, i32 0)
  ret void
}

; Don't cluster loads from different textures
; DBG-LABEL: no_cluster_image_load:
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG-NOT: {{^}}Cluster ld/st
define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> inreg %src2, <8 x i32> inreg %dst, i32 %x, i32 %y) {
; GFX9-LABEL: no_cluster_image_load:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    image_load_mip v[3:6], v[0:2], s[0:7] dmask:0xf unorm
; GFX9-NEXT:    image_load_mip v[7:10], v[0:2], s[8:15] dmask:0xf unorm
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f32_e32 v6, v6, v10
; GFX9-NEXT:    v_add_f32_e32 v5, v5, v9
; GFX9-NEXT:    v_add_f32_e32 v4, v4, v8
; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
; GFX9-NEXT:    image_store v[3:6], v[0:1], s[16:23] dmask:0xf unorm
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: no_cluster_image_load:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_mov_b32_e32 v10, 0
; GFX10-NEXT:    image_load_mip v[2:5], [v0, v1, v10], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT:    image_load_mip v[6:9], [v0, v1, v10], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_f32_e32 v5, v5, v9
; GFX10-NEXT:    v_add_f32_e32 v4, v4, v8
; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
; GFX10-NEXT:    v_add_f32_e32 v2, v2, v6
; GFX10-NEXT:    image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: no_cluster_image_load:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    v_mov_b32_e32 v6, 0
; GFX11-NEXT:    image_load_mip v[2:5], [v0, v1, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT:    image_load_mip v[6:9], [v0, v1, v6], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
; GFX11-NEXT:    v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6
; GFX11-NEXT:    image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %val1 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src1, i32 0, i32 0)
  %val2 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src2, i32 0, i32 0)
  %val = fadd fast <4 x float> %val1, %val2
  call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %val, i32 15, i32 %x, i32 %y, <8 x i32> %dst, i32 0, i32 0)
  ret void
}

; Cluster loads from the same texture and sampler with different coordinates
; DBG-LABEL: cluster_image_sample:
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG: Num BaseOps: {{[1-9]+}}, Offset: {{[0-9]+}}, OffsetIsScalable: {{[01]}}, Width: 16
; DBG: {{^}}Cluster ld/st [[SU1:SU\([0-9]+\)]] - [[SU2:SU\([0-9]+\)]]
; DBG: {{^}}[[SU1]]: {{.*}} IMAGE_SAMPLE
; DBG: {{^}}[[SU2]]: {{.*}} IMAGE_SAMPLE
define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inreg %smp, <8 x i32> inreg %dst, i32 %x, i32 %y) {
; GFX9-LABEL: cluster_image_sample:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    v_cvt_f32_i32_e32 v8, v0
; GFX9-NEXT:    v_cvt_f32_i32_e32 v9, v1
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    v_mov_b32_e32 v5, v4
; GFX9-NEXT:    v_add_f32_e32 v2, 1.0, v8
; GFX9-NEXT:    v_add_f32_e32 v3, 1.0, v9
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    v_mov_b32_e32 v7, v4
; GFX9-NEXT:    v_add_f32_e32 v8, 2.0, v8
; GFX9-NEXT:    v_add_f32_e32 v9, 2.0, v9
; GFX9-NEXT:    v_mov_b32_e32 v10, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v11, v10
; GFX9-NEXT:    v_mov_b32_e32 v12, v10
; GFX9-NEXT:    v_mov_b32_e32 v13, v10
; GFX9-NEXT:    image_sample_d v[2:5], v[2:7], s[0:7], s[8:11] dmask:0xf
; GFX9-NEXT:    image_sample_d v[6:9], v[8:13], s[0:7], s[8:11] dmask:0xf
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_f32_e32 v5, v5, v9
; GFX9-NEXT:    v_add_f32_e32 v4, v4, v8
; GFX9-NEXT:    v_add_f32_e32 v3, v3, v7
; GFX9-NEXT:    v_add_f32_e32 v2, v2, v6
; GFX9-NEXT:    image_store v[2:5], v[0:1], s[12:19] dmask:0xf unorm
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: cluster_image_sample:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    v_cvt_f32_i32_e32 v8, v0
; GFX10-NEXT:    v_cvt_f32_i32_e32 v9, v1
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    v_mov_b32_e32 v10, 1.0
; GFX10-NEXT:    v_add_f32_e32 v2, 1.0, v8
; GFX10-NEXT:    v_add_f32_e32 v3, 1.0, v9
; GFX10-NEXT:    v_mov_b32_e32 v5, v4
; GFX10-NEXT:    v_mov_b32_e32 v6, v4
; GFX10-NEXT:    v_mov_b32_e32 v7, v4
; GFX10-NEXT:    v_add_f32_e32 v8, 2.0, v8
; GFX10-NEXT:    v_add_f32_e32 v9, 2.0, v9
; GFX10-NEXT:    v_mov_b32_e32 v11, v10
; GFX10-NEXT:    v_mov_b32_e32 v12, v10
; GFX10-NEXT:    v_mov_b32_e32 v13, v10
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    image_sample_d v[14:17], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT:    image_sample_d v[18:21], v[8:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_add_f32_e32 v5, v17, v21
; GFX10-NEXT:    v_add_f32_e32 v4, v16, v20
; GFX10-NEXT:    v_add_f32_e32 v3, v15, v19
; GFX10-NEXT:    v_add_f32_e32 v2, v14, v18
; GFX10-NEXT:    image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT:    s_endpgm
;
; GFX11-LABEL: cluster_image_sample:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    v_cvt_f32_i32_e32 v9, v1
; GFX11-NEXT:    v_cvt_f32_i32_e32 v8, v0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_add_f32 v3, 1.0, v9
; GFX11-NEXT:    v_dual_mov_b32 v10, 1.0 :: v_dual_mov_b32 v7, v4
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_dual_add_f32 v2, 1.0, v8 :: v_dual_mov_b32 v5, v4
; GFX11-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_add_f32 v9, 2.0, v9
; GFX11-NEXT:    v_dual_add_f32 v8, 2.0, v8 :: v_dual_mov_b32 v11, v10
; GFX11-NEXT:    v_mov_b32_e32 v12, v10
; GFX11-NEXT:    v_mov_b32_e32 v13, v10
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    image_sample_d v[2:5], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT:    image_sample_d v[6:9], v[8:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_dual_add_f32 v4, v4, v8 :: v_dual_add_f32 v5, v5, v9
; GFX11-NEXT:    v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6
; GFX11-NEXT:    image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT:    s_endpgm
entry:
  %s = sitofp i32 %x to float
  %t = sitofp i32 %y to float
  %s1 = fadd float %s, 1.0
  %t1 = fadd float %t, 1.0
  %val1 = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32(i32 15, float %s1, float %t1, float 0.0, float 0.0, float 0.0, float 0.0, <8 x i32> %src, <4 x i32> %smp, i1 false, i32 0, i32 0)
  %s2 = fadd float %s, 2.0
  %t2 = fadd float %t, 2.0
  %val2 = call <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32(i32 15, float %s2, float %t2, float 1.0, float 1.0, float 1.0, float 1.0, <8 x i32> %src, <4 x i32> %smp, i1 false, i32 0, i32 0)
  %val = fadd fast <4 x float> %val1, %val2
  call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %val, i32 15, i32 %x, i32 %y, <8 x i32> %dst, i32 0, i32 0)
  ret void
}

declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)
declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 immarg, i32, i32, i32, <8 x i32>, i32 immarg, i32 immarg)
declare <4 x float> @llvm.amdgcn.image.sample.d.2d.v4f32.f32(i32, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
declare void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float>, i32 immarg, i32, i32, <8 x i32>, i32 immarg, i32 immarg)