1 //===-- Algorithms to compose sized memory operations ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Higher order primitives that build upon the SizedOpT facility.
10 // They constitute the basic blocks for composing memory functions.
11 // This file defines the following operations:
12 // - Skip
13 // - Tail
14 // - HeadTail
15 // - Loop
16 // - Align
17 //
18 // See each class for documentation.
19 //===----------------------------------------------------------------------===//
20 
21 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ALGORITHM_H
22 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ALGORITHM_H
23 
24 #include "src/string/memory_utils/address.h" // Address
25 #include "src/string/memory_utils/utils.h"   // offset_to_next_aligned
26 
27 #include <stddef.h> // ptrdiff_t
28 
29 namespace __llvm_libc {
30 
31 // We are not yet allowed to use asserts in low level memory operations as
32 // assert itself could depend on them.
33 // We define this empty macro so we can enable them as soon as possible and keep
34 // track of invariants.
35 #define LIBC_ASSERT(COND)
36 
37 // An operation that allows to skip the specified amount of bytes.
38 template <ptrdiff_t Bytes> struct Skip {
39   template <typename NextT> struct Then {
40     template <typename DstAddrT>
setSkip::Then41     static inline void set(DstAddrT dst, ubyte value) {
42       static_assert(NextT::IS_FIXED_SIZE);
43       NextT::set(offsetAddr<Bytes>(dst), value);
44     }
45 
46     template <typename SrcAddrT1, typename SrcAddrT2>
isDifferentSkip::Then47     static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2) {
48       static_assert(NextT::IS_FIXED_SIZE);
49       return NextT::isDifferent(offsetAddr<Bytes>(src1),
50                                 offsetAddr<Bytes>(src2));
51     }
52 
53     template <typename SrcAddrT1, typename SrcAddrT2>
threeWayCmpSkip::Then54     static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2) {
55       static_assert(NextT::IS_FIXED_SIZE);
56       return NextT::threeWayCmp(offsetAddr<Bytes>(src1),
57                                 offsetAddr<Bytes>(src2));
58     }
59 
60     template <typename SrcAddrT1, typename SrcAddrT2>
threeWayCmpSkip::Then61     static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2,
62                                       size_t runtime_size) {
63       static_assert(NextT::IS_RUNTIME_SIZE);
64       return NextT::threeWayCmp(offsetAddr<Bytes>(src1),
65                                 offsetAddr<Bytes>(src2), runtime_size - Bytes);
66     }
67   };
68 };
69 
70 // Compute the address of a tail operation.
71 // Because of the runtime size, we loose the alignment information.
72 template <size_t Size, typename AddrT>
tailAddr(AddrT addr,size_t runtime_size)73 static auto tailAddr(AddrT addr, size_t runtime_size) {
74   static_assert(IsAddressType<AddrT>::Value);
75   return offsetAddrAssumeAligned<1>(addr, runtime_size - Size);
76 }
77 
78 // Perform the operation on the last 'Size' bytes of the buffer.
79 //
80 // e.g. with
81 // [1234567812345678123]
82 // [__XXXXXXXXXXXXXX___]
83 // [________XXXXXXXX___]
84 //
85 // Precondition: `runtime_size >= Size`.
86 template <typename SizedOpT> struct Tail {
87   static_assert(SizedOpT::IS_FIXED_SIZE);
88   static constexpr bool IS_RUNTIME_SIZE = true;
89   static constexpr size_t SIZE = SizedOpT::SIZE;
90 
91   template <typename DstAddrT, typename SrcAddrT>
copyTail92   static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
93     SizedOpT::copy(tailAddr<SIZE>(dst, runtime_size),
94                    tailAddr<SIZE>(src, runtime_size));
95   }
96 
97   template <typename DstAddrT, typename SrcAddrT>
moveTail98   static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
99     SizedOpT::move(tailAddr<SIZE>(dst, runtime_size),
100                    tailAddr<SIZE>(src, runtime_size));
101   }
102 
103   template <typename DstAddrT>
setTail104   static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) {
105     SizedOpT::set(tailAddr<SIZE>(dst, runtime_size), value);
106   }
107 
108   template <typename SrcAddrT1, typename SrcAddrT2>
isDifferentTail109   static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2,
110                                      size_t runtime_size) {
111     return SizedOpT::isDifferent(tailAddr<SIZE>(src1, runtime_size),
112                                  tailAddr<SIZE>(src2, runtime_size));
113   }
114 
115   template <typename SrcAddrT1, typename SrcAddrT2>
threeWayCmpTail116   static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2,
117                                     size_t runtime_size) {
118     return SizedOpT::threeWayCmp(tailAddr<SIZE>(src1, runtime_size),
119                                  tailAddr<SIZE>(src2, runtime_size));
120   }
121 };
122 
123 // Perform the operation on the first and the last `SizedOpT::Size` bytes of the
124 // buffer. This is useful for overlapping operations.
125 //
126 // e.g. with
127 // [1234567812345678123]
128 // [__XXXXXXXXXXXXXX___]
129 // [__XXXXXXXX_________]
130 // [________XXXXXXXX___]
131 //
132 // Precondition: `runtime_size >= Size && runtime_size <= 2 x Size`.
133 template <typename SizedOpT> struct HeadTail {
134   static_assert(SizedOpT::IS_FIXED_SIZE);
135   static constexpr bool IS_RUNTIME_SIZE = true;
136 
137   template <typename DstAddrT, typename SrcAddrT>
copyHeadTail138   static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
139     LIBC_ASSERT(runtime_size >= SizedOpT::SIZE);
140     SizedOpT::copy(dst, src);
141     Tail<SizedOpT>::copy(dst, src, runtime_size);
142   }
143 
144   template <typename DstAddrT, typename SrcAddrT>
moveHeadTail145   static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
146     LIBC_ASSERT(runtime_size >= SizedOpT::SIZE);
147     static constexpr size_t BLOCK_SIZE = SizedOpT::SIZE;
148     // The load and store operations can be performed in any order as long as
149     // they are not interleaved. More investigations are needed to determine the
150     // best order.
151     auto head = SizedOpT::load(src);
152     auto tail = SizedOpT::load(tailAddr<BLOCK_SIZE>(src, runtime_size));
153     SizedOpT::store(tailAddr<BLOCK_SIZE>(dst, runtime_size), tail);
154     SizedOpT::store(dst, head);
155   }
156 
157   template <typename DstAddrT>
setHeadTail158   static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) {
159     LIBC_ASSERT(runtime_size >= SizedOpT::SIZE);
160     SizedOpT::set(dst, value);
161     Tail<SizedOpT>::set(dst, value, runtime_size);
162   }
163 
164   template <typename SrcAddrT1, typename SrcAddrT2>
isDifferentHeadTail165   static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2,
166                                      size_t runtime_size) {
167     LIBC_ASSERT(runtime_size >= SizedOpT::SIZE);
168     // Two strategies can be applied here:
169     // 1. Compute head and tail and compose them with a bitwise or operation.
170     // 2. Stop early if head is different.
171     // We chose the later because HeadTail operations are typically performed
172     // with sizes ranging from 4 to 256 bytes. The cost of the loads is then
173     // significantly larger than the cost of the branch.
174     if (const uint64_t res = SizedOpT::isDifferent(src1, src2))
175       return res;
176     return Tail<SizedOpT>::isDifferent(src1, src2, runtime_size);
177   }
178 
179   template <typename SrcAddrT1, typename SrcAddrT2>
threeWayCmpHeadTail180   static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2,
181                                     size_t runtime_size) {
182     LIBC_ASSERT(runtime_size >= SizedOpT::SIZE &&
183                 runtime_size <= 2 * SizedOpT::SIZE);
184     if (const int32_t res = SizedOpT::threeWayCmp(src1, src2))
185       return res;
186     return Tail<SizedOpT>::threeWayCmp(src1, src2, runtime_size);
187   }
188 };
189 
190 // Simple loop ending with a Tail operation.
191 //
192 // e.g. with
193 // [12345678123456781234567812345678]
194 // [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
195 // [__XXXXXXXX_______________________]
196 // [__________XXXXXXXX_______________]
197 // [__________________XXXXXXXX_______]
198 // [______________________XXXXXXXX___]
199 //
200 // Precondition:
201 // - runtime_size >= Size
202 template <typename SizedOpT> struct Loop {
203   static_assert(SizedOpT::IS_FIXED_SIZE);
204   static constexpr bool IS_RUNTIME_SIZE = true;
205   static constexpr size_t BLOCK_SIZE = SizedOpT::SIZE;
206 
207   template <typename DstAddrT, typename SrcAddrT>
copyLoop208   static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
209     size_t offset = 0;
210     do {
211       SizedOpT::copy(offsetAddrMultiplesOf<BLOCK_SIZE>(dst, offset),
212                      offsetAddrMultiplesOf<BLOCK_SIZE>(src, offset));
213       offset += BLOCK_SIZE;
214     } while (offset < runtime_size - BLOCK_SIZE);
215     Tail<SizedOpT>::copy(dst, src, runtime_size);
216   }
217 
218   // Move forward suitable when dst < src. We load the tail bytes before
219   // handling the loop.
220   //
221   // e.g. Moving two bytes
222   // [   |       |       |       |       |]
223   // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
224   // [_________________________LLLLLLLL___]
225   // [___LLLLLLLL_________________________]
226   // [_SSSSSSSS___________________________]
227   // [___________LLLLLLLL_________________]
228   // [_________SSSSSSSS___________________]
229   // [___________________LLLLLLLL_________]
230   // [_________________SSSSSSSS___________]
231   // [_______________________SSSSSSSS_____]
232   template <typename DstAddrT, typename SrcAddrT>
moveLoop233   static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
234     const auto tail_value =
235         SizedOpT::load(tailAddr<BLOCK_SIZE>(src, runtime_size));
236     size_t offset = 0;
237     do {
238       SizedOpT::move(offsetAddrMultiplesOf<BLOCK_SIZE>(dst, offset),
239                      offsetAddrMultiplesOf<BLOCK_SIZE>(src, offset));
240       offset += BLOCK_SIZE;
241     } while (offset < runtime_size - BLOCK_SIZE);
242     SizedOpT::store(tailAddr<BLOCK_SIZE>(dst, runtime_size), tail_value);
243   }
244 
245   // Move backward suitable when dst > src. We load the head bytes before
246   // handling the loop.
247   //
248   // e.g. Moving two bytes
249   // [   |       |       |       |       |]
250   // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
251   // [___LLLLLLLL_________________________]
252   // [_________________________LLLLLLLL___]
253   // [___________________________SSSSSSSS_]
254   // [_________________LLLLLLLL___________]
255   // [___________________SSSSSSSS_________]
256   // [_________LLLLLLLL___________________]
257   // [___________SSSSSSSS_________________]
258   // [_____SSSSSSSS_______________________]
259   template <typename DstAddrT, typename SrcAddrT>
move_backwardLoop260   static inline void move_backward(DstAddrT dst, SrcAddrT src,
261                                    size_t runtime_size) {
262     const auto head_value = SizedOpT::load(src);
263     ptrdiff_t offset = runtime_size - BLOCK_SIZE;
264     do {
265       SizedOpT::move(offsetAddrMultiplesOf<BLOCK_SIZE>(dst, offset),
266                      offsetAddrMultiplesOf<BLOCK_SIZE>(src, offset));
267       offset -= BLOCK_SIZE;
268     } while (offset >= 0);
269     SizedOpT::store(dst, head_value);
270   }
271 
272   template <typename DstAddrT>
setLoop273   static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) {
274     size_t offset = 0;
275     do {
276       SizedOpT::set(offsetAddrMultiplesOf<BLOCK_SIZE>(dst, offset), value);
277       offset += BLOCK_SIZE;
278     } while (offset < runtime_size - BLOCK_SIZE);
279     Tail<SizedOpT>::set(dst, value, runtime_size);
280   }
281 
282   template <typename SrcAddrT1, typename SrcAddrT2>
isDifferentLoop283   static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2,
284                                      size_t runtime_size) {
285     size_t offset = 0;
286     do {
287       if (uint64_t res = SizedOpT::isDifferent(
288               offsetAddrMultiplesOf<BLOCK_SIZE>(src1, offset),
289               offsetAddrMultiplesOf<BLOCK_SIZE>(src2, offset)))
290         return res;
291       offset += BLOCK_SIZE;
292     } while (offset < runtime_size - BLOCK_SIZE);
293     return Tail<SizedOpT>::isDifferent(src1, src2, runtime_size);
294   }
295 
296   template <typename SrcAddrT1, typename SrcAddrT2>
threeWayCmpLoop297   static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2,
298                                     size_t runtime_size) {
299     size_t offset = 0;
300     do {
301       if (int32_t res = SizedOpT::threeWayCmp(
302               offsetAddrMultiplesOf<BLOCK_SIZE>(src1, offset),
303               offsetAddrMultiplesOf<BLOCK_SIZE>(src2, offset)))
304         return res;
305       offset += BLOCK_SIZE;
306     } while (offset < runtime_size - BLOCK_SIZE);
307     return Tail<SizedOpT>::threeWayCmp(src1, src2, runtime_size);
308   }
309 };
310 
311 // Aligns using a statically-sized operation, then calls the subsequent NextT
312 // operation.
313 //
314 // e.g. A 16-byte Destination Aligned 32-byte Loop Copy can be written as:
315 // Align<_16, Arg::Dst>::Then<Loop<_32>>::copy(dst, src, runtime_size);
316 enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 };
317 template <typename SizedOpT, Arg AlignOn = Arg::_1> struct Align {
318   static_assert(SizedOpT::IS_FIXED_SIZE);
319 
320   template <typename NextT> struct Then {
321     static_assert(NextT::IS_RUNTIME_SIZE);
322 
323     template <typename DstAddrT, typename SrcAddrT>
copyAlign::Then324     static inline void copy(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
325       SizedOpT::copy(dst, src);
326       auto aligned = align(dst, src, runtime_size);
327       NextT::copy(aligned.arg1, aligned.arg2, aligned.size);
328     }
329 
330     // Move forward suitable when dst < src. The alignment is performed with
331     // an HeadTail operation of size ∈ [Alignment, 2 x Alignment].
332     //
333     // e.g. Moving two bytes and making sure src is then aligned.
334     // [  |       |       |       |      ]
335     // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
336     // [____LLLLLLLL_____________________]
337     // [___________LLLLLLLL______________]
338     // [_SSSSSSSS________________________]
339     // [________SSSSSSSS_________________]
340     //
341     // e.g. Moving two bytes and making sure dst is then aligned.
342     // [  |       |       |       |      ]
343     // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
344     // [____LLLLLLLL_____________________]
345     // [______LLLLLLLL___________________]
346     // [_SSSSSSSS________________________]
347     // [___SSSSSSSS______________________]
348     template <typename DstAddrT, typename SrcAddrT>
moveAlign::Then349     static inline void move(DstAddrT dst, SrcAddrT src, size_t runtime_size) {
350       auto aligned_after_begin = align(dst, src, runtime_size);
351       // We move pointers forward by Size so we can perform HeadTail.
352       auto aligned = aligned_after_begin.stepForward();
353       HeadTail<SizedOpT>::move(dst, src, runtime_size - aligned.size);
354       NextT::move(aligned.arg1, aligned.arg2, aligned.size);
355     }
356 
357     // Move backward suitable when dst > src. The alignment is performed with
358     // an HeadTail operation of size ∈ [Alignment, 2 x Alignment].
359     //
360     // e.g. Moving two bytes backward and making sure src is then aligned.
361     // [  |       |       |       |      ]
362     // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
363     // [ _________________LLLLLLLL_______]
364     // [ ___________________LLLLLLLL_____]
365     // [____________________SSSSSSSS_____]
366     // [______________________SSSSSSSS___]
367     //
368     // e.g. Moving two bytes and making sure dst is then aligned.
369     // [  |       |       |       |      ]
370     // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
371     // [ _______________LLLLLLLL_________]
372     // [ ___________________LLLLLLLL_____]
373     // [__________________SSSSSSSS_______]
374     // [______________________SSSSSSSS___]
375     template <typename DstAddrT, typename SrcAddrT>
move_backwardAlign::Then376     static inline void move_backward(DstAddrT dst, SrcAddrT src,
377                                      size_t runtime_size) {
378       const auto dst_end = offsetAddrAssumeAligned<1>(dst, runtime_size);
379       const auto src_end = offsetAddrAssumeAligned<1>(src, runtime_size);
380       auto aligned_after_end = align(dst_end, src_end, 0);
381       // We move pointers back by 2 x Size so we can perform HeadTail.
382       auto aligned = aligned_after_end.stepBack().stepBack();
383       HeadTail<SizedOpT>::move(aligned.arg1, aligned.arg2, aligned.size);
384       NextT::move_backward(dst, src, runtime_size - aligned.size);
385     }
386 
387     template <typename DstAddrT>
setAlign::Then388     static inline void set(DstAddrT dst, ubyte value, size_t runtime_size) {
389       SizedOpT::set(dst, value);
390       DstAddrT _(nullptr);
391       auto aligned = align(dst, _, runtime_size);
392       NextT::set(aligned.arg1, value, aligned.size);
393     }
394 
395     template <typename SrcAddrT1, typename SrcAddrT2>
isDifferentAlign::Then396     static inline uint64_t isDifferent(SrcAddrT1 src1, SrcAddrT2 src2,
397                                        size_t runtime_size) {
398       if (const uint64_t res = SizedOpT::isDifferent(src1, src2))
399         return res;
400       auto aligned = align(src1, src2, runtime_size);
401       return NextT::isDifferent(aligned.arg1, aligned.arg2, aligned.size);
402     }
403 
404     template <typename SrcAddrT1, typename SrcAddrT2>
threeWayCmpAlign::Then405     static inline int32_t threeWayCmp(SrcAddrT1 src1, SrcAddrT2 src2,
406                                       size_t runtime_size) {
407       if (const int32_t res = SizedOpT::threeWayCmp(src1, src2))
408         return res;
409       auto aligned = align(src1, src2, runtime_size);
410       return NextT::threeWayCmp(aligned.arg1, aligned.arg2, aligned.size);
411     }
412   };
413 
414 private:
415   static constexpr size_t ALIGN_OP_SIZE = SizedOpT::SIZE;
416   static_assert(ALIGN_OP_SIZE > 1);
417 
418   template <typename Arg1AddrT, typename Arg2AddrT> struct Aligned {
419     Arg1AddrT arg1;
420     Arg2AddrT arg2;
421     size_t size;
422 
stepForwardAlign::Aligned423     Aligned stepForward() const {
424       return Aligned{offsetAddrMultiplesOf<ALIGN_OP_SIZE>(arg1, ALIGN_OP_SIZE),
425                      offsetAddrMultiplesOf<ALIGN_OP_SIZE>(arg2, ALIGN_OP_SIZE),
426                      size - ALIGN_OP_SIZE};
427     }
428 
stepBackAlign::Aligned429     Aligned stepBack() const {
430       return Aligned{offsetAddrMultiplesOf<ALIGN_OP_SIZE>(arg1, -ALIGN_OP_SIZE),
431                      offsetAddrMultiplesOf<ALIGN_OP_SIZE>(arg2, -ALIGN_OP_SIZE),
432                      size + ALIGN_OP_SIZE};
433     }
434   };
435 
436   template <typename Arg1AddrT, typename Arg2AddrT>
makeAlignedAlign437   static auto makeAligned(Arg1AddrT arg1, Arg2AddrT arg2, size_t size) {
438     return Aligned<Arg1AddrT, Arg2AddrT>{arg1, arg2, size};
439   }
440 
441   template <typename Arg1AddrT, typename Arg2AddrT>
alignAlign442   static auto align(Arg1AddrT arg1, Arg2AddrT arg2, size_t runtime_size) {
443     static_assert(IsAddressType<Arg1AddrT>::Value);
444     static_assert(IsAddressType<Arg2AddrT>::Value);
445     if constexpr (AlignOn == Arg::_1) {
446       auto offset = offset_to_next_aligned<ALIGN_OP_SIZE>(arg1.ptr_);
447       return makeAligned(offsetAddrAssumeAligned<ALIGN_OP_SIZE>(arg1, offset),
448                          offsetAddrAssumeAligned<1>(arg2, offset),
449                          runtime_size - offset);
450     } else if constexpr (AlignOn == Arg::_2) {
451       auto offset = offset_to_next_aligned<ALIGN_OP_SIZE>(arg2.ptr_);
452       return makeAligned(offsetAddrAssumeAligned<1>(arg1, offset),
453                          offsetAddrAssumeAligned<ALIGN_OP_SIZE>(arg2, offset),
454                          runtime_size - offset);
455     } else {
456       DeferredStaticAssert("AlignOn must be either Arg::_1 or Arg::_2");
457     }
458   }
459 };
460 
461 } // namespace __llvm_libc
462 
463 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ALGORITHM_H
464