; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-ALIGNED,ALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-UNALIGNED,UNALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s

target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32
; ELT4-ALIGNED: store i32
; ELT4-ALIGNED: store i32
; ELT4-ALIGNED: store i32
; ELT4-ALIGNED: store i32

; ELT8: store <2 x i32>
; ELT8: store <2 x i32>

; ELT16-UNALIGNED: store <4 x i32>
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3

  store i32 9, i32 addrspace(5)* %out
  store i32 1, i32 addrspace(5)* %out.gep.1
  store i32 23, i32 addrspace(5)* %out.gep.2
  store i32 19, i32 addrspace(5)* %out.gep.3
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align1(
; ALIGNED: store i32 9, i32 addrspace(5)* %out, align 1
; ALIGNED: store i32 1, i32 addrspace(5)* %out.gep.1, align 1
; ALIGNED: store i32 23, i32 addrspace(5)* %out.gep.2, align 1
; ALIGNED: store i32 19, i32 addrspace(5)* %out.gep.3, align 1

; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1

; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32> addrspace(5)* %1, align 1
; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32> addrspace(5)* %2, align 1

; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3

  store i32 9, i32 addrspace(5)* %out, align 1
  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
  store i32 19, i32 addrspace(5)* %out.gep.3, align 1
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2(
; ALL: store i32
; ALL: store i32
; ALL: store i32
; ALL: store i32
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32 addrspace(5)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3

  store i32 9, i32 addrspace(5)* %out, align 2
  store i32 1, i32 addrspace(5)* %out.gep.1, align 2
  store i32 23, i32 addrspace(5)* %out.gep.2, align 2
  store i32 19, i32 addrspace(5)* %out.gep.3, align 2
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
; ALL: store <4 x i8>
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8 addrspace(5)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1
  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2
  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3

  store i8 9, i8 addrspace(5)* %out, align 4
  store i8 1, i8 addrspace(5)* %out.gep.1
  store i8 23, i8 addrspace(5)* %out.gep.2
  store i8 19, i8 addrspace(5)* %out.gep.3
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8_align1(
; ALIGNED: store i8
; ALIGNED: store i8
; ALIGNED: store i8
; ALIGNED: store i8

; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8 addrspace(5)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i32 1
  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i32 2
  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i32 3

  store i8 9, i8 addrspace(5)* %out, align 1
  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
  store i8 19, i8 addrspace(5)* %out.gep.3, align 1
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16(
; ALL: store <2 x i16>
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16 addrspace(5)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1

  store i16 9, i16 addrspace(5)* %out, align 4
  store i16 12, i16 addrspace(5)* %out.gep.1
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2(
; ALL: store i16
; ALL: store i16
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16 addrspace(5)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1

  store i16 9, i16 addrspace(5)* %out, align 2
  store i16 12, i16 addrspace(5)* %out.gep.1, align 2
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align1(
; ALIGNED: store i16
; ALIGNED: store i16

; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 1
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16 addrspace(5)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1

  store i16 9, i16 addrspace(5)* %out, align 1
  store i16 12, i16 addrspace(5)* %out.gep.1, align 1
  ret void
}

; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8(
; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16> addrspace(5)* %1, align 8
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16 addrspace(5)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(5)* %out, i32 1

  store i16 9, i16 addrspace(5)* %out, align 8
  store i16 12, i16 addrspace(5)* %out.gep.1, align 2
  ret void
}

; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32
; ELT4: store i32
; ELT4: store i32
; ELT4: store i32

; ELT8: store <2 x i32>
; ELT8: store i32

; ELT16: store <3 x i32>
define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32 addrspace(5)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2

  store i32 9, i32 addrspace(5)* %out
  store i32 1, i32 addrspace(5)* %out.gep.1
  store i32 23, i32 addrspace(5)* %out.gep.2
  ret void
}

; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32_align1(
; ALIGNED: store i32
; ALIGNED: store i32
; ALIGNED: store i32

; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32
; ELT4-UNALIGNED: store i32

; ELT8-UNALIGNED: store <2 x i32>
; ELT8-UNALIGNED: store i32

; ELT16-UNALIGNED: store <3 x i32>
define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32 addrspace(5)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2

  store i32 9, i32 addrspace(5)* %out, align 1
  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
  ret void
}

; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i8_align1(
; ALIGNED: store i8
; ALIGNED: store i8
; ALIGNED: store i8

; UNALIGNED: store <3 x i8>
define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8 addrspace(5)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2

  store i8 9, i8 addrspace(5)* %out, align 1
  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
  ret void
}

attributes #0 = { nounwind }