; RUN: opt -S -load-store-vectorizer --mcpu=hawaii -mattr=-unaligned-access-mode,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
; RUN: opt -S -load-store-vectorizer --mcpu=hawaii -mattr=+unaligned-access-mode,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
; RUN: opt -S -passes='function(load-store-vectorizer)' --mcpu=hawaii -mattr=-unaligned-access-mode,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
; RUN: opt -S -passes='function(load-store-vectorizer)' --mcpu=hawaii -mattr=+unaligned-access-mode,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s

target triple = "amdgcn--"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
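; The ALIGNED check prefix applies to the runs with unaligned access disabled
; (-unaligned-access-mode); the UNALIGNED prefix applies to the runs with
; +unaligned-access-mode and +unaligned-scratch-access enabled.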
; ALL-LABEL: @load_unknown_offset_align1_i8(
; ALL: alloca [128 x i8], align 1
; UNALIGNED: load <2 x i8>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i8], align 1, addrspace(5)
  %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset
  %val0 = load i8, i8 addrspace(5)* %ptr0, align 1
  %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1
  %val1 = load i8, i8 addrspace(5)* %ptr1, align 1
  %add = add i8 %val0, %val1
  store i8 %add, i8 addrspace(1)* %out
  ret void
}

; ALL-LABEL: @load_unknown_offset_align1_i16(
; ALL: alloca [128 x i16], align 1, addrspace(5){{$}}
; UNALIGNED: load <2 x i16>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}}

; ALIGNED: load i16, i16 addrspace(5)* %ptr0, align 1{{$}}
; ALIGNED: load i16, i16 addrspace(5)* %ptr1, align 1{{$}}
define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i16], align 1, addrspace(5)
  %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset
  %val0 = load i16, i16 addrspace(5)* %ptr0, align 1
  %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1
  %val1 = load i16, i16 addrspace(5)* %ptr1, align 1
  %add = add i16 %val0, %val1
  store i16 %add, i16 addrspace(1)* %out
  ret void
}

; FIXME: Although the offset is unknown here, we know it is a multiple
; of the element size, so it should still be align 4.

; ALL-LABEL: @load_unknown_offset_align1_i32(
; ALL: alloca [128 x i32], align 1
; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}

; ALIGNED: load i32, i32 addrspace(5)* %ptr0, align 1
; ALIGNED: load i32, i32 addrspace(5)* %ptr1, align 1
define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i32], align 1, addrspace(5)
  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
  %val0 = load i32, i32 addrspace(5)* %ptr0, align 1
  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
  %val1 = load i32, i32 addrspace(5)* %ptr1, align 1
  %add = add i32 %val0, %val1
  store i32 %add, i32 addrspace(1)* %out
  ret void
}

; Make sure alloca alignment isn't decreased
; ALL-LABEL: @load_alloca16_unknown_offset_align1_i32(
; ALL: alloca [128 x i32], align 16

; ALL: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 4{{$}}
define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i32], align 16, addrspace(5)
  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
  %val0 = load i32, i32 addrspace(5)* %ptr0, align 1
  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
  %val1 = load i32, i32 addrspace(5)* %ptr1, align 1
  %add = add i32 %val0, %val1
  store i32 %add, i32 addrspace(1)* %out
  ret void
}

; ALL-LABEL: @store_unknown_offset_align1_i8(
; ALL: alloca [128 x i8], align 1
; UNALIGNED: store <2 x i8> <i8 9, i8 10>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}}

; ALIGNED: store i8 9, i8 addrspace(5)* %ptr0, align 1{{$}}
; ALIGNED: store i8 10, i8 addrspace(5)* %ptr1, align 1{{$}}
define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i8], align 1, addrspace(5)
  %ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset
  store i8 9, i8 addrspace(5)* %ptr0, align 1
  %ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1
  store i8 10, i8 addrspace(5)* %ptr1, align 1
  ret void
}

; ALL-LABEL: @store_unknown_offset_align1_i16(
; ALL: alloca [128 x i16], align 1
; UNALIGNED: store <2 x i16> <i16 9, i16 10>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}}

; ALIGNED: store i16 9, i16 addrspace(5)* %ptr0, align 1{{$}}
; ALIGNED: store i16 10, i16 addrspace(5)* %ptr1, align 1{{$}}
define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i16], align 1, addrspace(5)
  %ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset
  store i16 9, i16 addrspace(5)* %ptr0, align 1
  %ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1
  store i16 10, i16 addrspace(5)* %ptr1, align 1
  ret void
}

; FIXME: Although the offset is unknown here, we know it is a multiple
; of the element size, so it should still be align 4.
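; (Each i32 element sits at a byte offset that is a multiple of 4 from the
; base of the alloca, regardless of the value of %offset.)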
; ALL-LABEL: @store_unknown_offset_align1_i32(
; ALL: alloca [128 x i32], align 1

; UNALIGNED: store <2 x i32> <i32 9, i32 10>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}

; ALIGNED: store i32 9, i32 addrspace(5)* %ptr0, align 1
; ALIGNED: store i32 10, i32 addrspace(5)* %ptr1, align 1
define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
  %alloca = alloca [128 x i32], align 1, addrspace(5)
  %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
  store i32 9, i32 addrspace(5)* %ptr0, align 1
  %ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
  store i32 10, i32 addrspace(5)* %ptr1, align 1
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32(
; ALL: %alloca = alloca [8 x i32], align 4, addrspace(5)
; ALL: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 4
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32() {
  %alloca = alloca [8 x i32], align 1, addrspace(5)
  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3

  store i32 9, i32 addrspace(5)* %out, align 1
  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
  store i32 19, i32 addrspace(5)* %out.gep.3, align 1
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
; ALL: %alloca = alloca [8 x i8], align 4, addrspace(5)
; ALL: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 4
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8() {
  %alloca = alloca [8 x i8], align 1, addrspace(5)
  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3

  store i8 9, i8 addrspace(5)* %out, align 1
  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
  store i8 19, i8 addrspace(5)* %out.gep.3, align 1
  ret void
}

; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i32(
; ALL: %alloca = alloca [8 x i32], align 4, addrspace(5)
; ALL: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 4
define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() {
  %alloca = alloca [8 x i32], align 1, addrspace(5)
  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3

  %load0 = load i32, i32 addrspace(5)* %out, align 1
  %load1 = load i32, i32 addrspace(5)* %out.gep.1, align 1
  %load2 = load i32, i32 addrspace(5)* %out.gep.2, align 1
  %load3 = load i32, i32 addrspace(5)* %out.gep.3, align 1
  ret void
}
; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i8(
; ALL: %alloca = alloca [8 x i8], align 4, addrspace(5)
; ALL: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 4
define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() {
  %alloca = alloca [8 x i8], align 1, addrspace(5)
  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3

  %load0 = load i8, i8 addrspace(5)* %out, align 1
  %load1 = load i8, i8 addrspace(5)* %out.gep.1, align 1
  %load2 = load i8, i8 addrspace(5)* %out.gep.2, align 1
  %load3 = load i8, i8 addrspace(5)* %out.gep.3, align 1
  ret void
}

; Make sure we don't think the alignment will increase if the base address isn't an alloca
; ALL-LABEL: @private_store_2xi16_align2_not_alloca(
; ALL: store i16
; ALL: store i16
define void @private_store_2xi16_align2_not_alloca(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
  %gep.r = getelementptr i16, i16 addrspace(5)* %r, i32 1
  store i16 1, i16 addrspace(5)* %r, align 2
  store i16 2, i16 addrspace(5)* %gep.r, align 2
  ret void
}

; ALL-LABEL: @private_store_2xi16_align1_not_alloca(
; ALIGNED: store i16
; ALIGNED: store i16
; UNALIGNED: store <2 x i16>
define void @private_store_2xi16_align1_not_alloca(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
  %gep.r = getelementptr i16, i16 addrspace(5)* %r, i32 1
  store i16 1, i16 addrspace(5)* %r, align 1
  store i16 2, i16 addrspace(5)* %gep.r, align 1
  ret void
}

; ALL-LABEL: @private_load_2xi16_align2_not_alloca(
; ALL: load i16
; ALL: load i16
define i32 @private_load_2xi16_align2_not_alloca(i16 addrspace(5)* %p) #0 {
  %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
  %p.0 = load i16, i16 addrspace(5)* %p, align 2
  %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2
  %zext.0 = zext i16 %p.0 to i32
  %zext.1 = zext i16 %p.1 to i32
  %shl.1 = shl i32 %zext.1, 16
  %or = or i32 %zext.0, %shl.1
  ret i32 %or
}

; ALL-LABEL: @private_load_2xi16_align1_not_alloca(
; ALIGNED: load i16
; ALIGNED: load i16
; UNALIGNED: load <2 x i16>
define i32 @private_load_2xi16_align1_not_alloca(i16 addrspace(5)* %p) #0 {
  %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
  %p.0 = load i16, i16 addrspace(5)* %p, align 1
  %p.1 = load i16, i16 addrspace(5)* %gep.p, align 1
  %zext.0 = zext i16 %p.0 to i32
  %zext.1 = zext i16 %p.1 to i32
  %shl.1 = shl i32 %zext.1, 16
  %or = or i32 %zext.0, %shl.1
  ret i32 %or
}

attributes #0 = { nounwind }