1 //===-- HexagonISelDAGToDAGHVX.cpp ----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "Hexagon.h"
10 #include "HexagonISelDAGToDAG.h"
11 #include "HexagonISelLowering.h"
12 #include "HexagonTargetMachine.h"
13 #include "llvm/ADT/SetVector.h"
14 #include "llvm/CodeGen/MachineInstrBuilder.h"
15 #include "llvm/CodeGen/SelectionDAGISel.h"
16 #include "llvm/IR/Intrinsics.h"
17 #include "llvm/IR/IntrinsicsHexagon.h"
18 #include "llvm/Support/CommandLine.h"
19 #include "llvm/Support/Debug.h"
20
21 #include <deque>
22 #include <map>
23 #include <set>
24 #include <utility>
25 #include <vector>
26
27 #define DEBUG_TYPE "hexagon-isel"
28
29 using namespace llvm;
30
31 namespace {
32
33 // --------------------------------------------------------------------
34 // Implementation of permutation networks.
35
36 // Implementation of the node routing through butterfly networks:
37 // - Forward delta.
38 // - Reverse delta.
39 // - Benes.
40 //
41 //
42 // Forward delta network consists of log(N) steps, where N is the number
43 // of inputs. In each step, an input can stay in place, or it can get
44 // routed to another position[1]. The step after that consists of two
45 // networks, each half in size in terms of the number of nodes. In those
46 // terms, in the given step, an input can go to either the upper or the
47 // lower network in the next step.
48 //
49 // [1] Hexagon's vdelta/vrdelta allow an element to be routed to both
50 // positions as long as there is no conflict.
51
52 // Here's a delta network for 8 inputs, only the switching routes are
53 // shown:
54 //
55 // Steps:
56 // |- 1 ---------------|- 2 -----|- 3 -|
57 //
58 // Inp[0] *** *** *** *** Out[0]
59 // \ / \ / \ /
60 // \ / \ / X
61 // \ / \ / / \
62 // Inp[1] *** \ / *** X *** *** Out[1]
63 // \ \ / / \ / \ /
64 // \ \ / / X X
65 // \ \ / / / \ / \
66 // Inp[2] *** \ \ / / *** X *** *** Out[2]
67 // \ \ X / / / \ \ /
68 // \ \ / \ / / / \ X
69 // \ X X / / \ / \
70 // Inp[3] *** \ / \ / \ / *** *** *** Out[3]
71 // \ X X X /
72 // \ / \ / \ / \ /
73 // X X X X
74 // / \ / \ / \ / \
75 // / X X X \
76 // Inp[4] *** / \ / \ / \ *** *** *** Out[4]
77 // / X X \ \ / \ /
78 // / / \ / \ \ \ / X
79 // / / X \ \ \ / / \
80 // Inp[5] *** / / \ \ *** X *** *** Out[5]
81 // / / \ \ \ / \ /
82 // / / \ \ X X
83 // / / \ \ / \ / \
84 // Inp[6] *** / \ *** X *** *** Out[6]
85 // / \ / \ \ /
86 // / \ / \ X
87 // / \ / \ / \
88 // Inp[7] *** *** *** *** Out[7]
89 //
90 //
91 // Reverse delta network is same as delta network, with the steps in
92 // the opposite order.
93 //
94 //
95 // Benes network is a forward delta network immediately followed by
96 // a reverse delta network.
97
98 enum class ColorKind { None, Red, Black };
99
100 // Graph coloring utility used to partition nodes into two groups:
101 // they will correspond to nodes routed to the upper and lower networks.
102 struct Coloring {
103 using Node = int;
104 using MapType = std::map<Node, ColorKind>;
105 static constexpr Node Ignore = Node(-1);
106
Coloring__anonbbf86c770111::Coloring107 Coloring(ArrayRef<Node> Ord) : Order(Ord) {
108 build();
109 if (!color())
110 Colors.clear();
111 }
112
colors__anonbbf86c770111::Coloring113 const MapType &colors() const {
114 return Colors;
115 }
116
other__anonbbf86c770111::Coloring117 ColorKind other(ColorKind Color) {
118 if (Color == ColorKind::None)
119 return ColorKind::Red;
120 return Color == ColorKind::Red ? ColorKind::Black : ColorKind::Red;
121 }
122
123 LLVM_DUMP_METHOD void dump() const;
124
125 private:
126 ArrayRef<Node> Order;
127 MapType Colors;
128 std::set<Node> Needed;
129
130 using NodeSet = std::set<Node>;
131 std::map<Node,NodeSet> Edges;
132
conj__anonbbf86c770111::Coloring133 Node conj(Node Pos) {
134 Node Num = Order.size();
135 return (Pos < Num/2) ? Pos + Num/2 : Pos - Num/2;
136 }
137
getColor__anonbbf86c770111::Coloring138 ColorKind getColor(Node N) {
139 auto F = Colors.find(N);
140 return F != Colors.end() ? F->second : ColorKind::None;
141 }
142
143 std::pair<bool, ColorKind> getUniqueColor(const NodeSet &Nodes);
144
145 void build();
146 bool color();
147 };
148 } // namespace
149
getUniqueColor(const NodeSet & Nodes)150 std::pair<bool, ColorKind> Coloring::getUniqueColor(const NodeSet &Nodes) {
151 auto Color = ColorKind::None;
152 for (Node N : Nodes) {
153 ColorKind ColorN = getColor(N);
154 if (ColorN == ColorKind::None)
155 continue;
156 if (Color == ColorKind::None)
157 Color = ColorN;
158 else if (Color != ColorKind::None && Color != ColorN)
159 return { false, ColorKind::None };
160 }
161 return { true, Color };
162 }
163
build()164 void Coloring::build() {
165 // Add Order[P] and Order[conj(P)] to Edges.
166 for (unsigned P = 0; P != Order.size(); ++P) {
167 Node I = Order[P];
168 if (I != Ignore) {
169 Needed.insert(I);
170 Node PC = Order[conj(P)];
171 if (PC != Ignore && PC != I)
172 Edges[I].insert(PC);
173 }
174 }
175 // Add I and conj(I) to Edges.
176 for (unsigned I = 0; I != Order.size(); ++I) {
177 if (!Needed.count(I))
178 continue;
179 Node C = conj(I);
180 // This will create an entry in the edge table, even if I is not
181 // connected to any other node. This is necessary, because it still
182 // needs to be colored.
183 NodeSet &Is = Edges[I];
184 if (Needed.count(C))
185 Is.insert(C);
186 }
187 }
188
color()189 bool Coloring::color() {
190 SetVector<Node> FirstQ;
191 auto Enqueue = [this,&FirstQ] (Node N) {
192 SetVector<Node> Q;
193 Q.insert(N);
194 for (unsigned I = 0; I != Q.size(); ++I) {
195 NodeSet &Ns = Edges[Q[I]];
196 Q.insert(Ns.begin(), Ns.end());
197 }
198 FirstQ.insert(Q.begin(), Q.end());
199 };
200 for (Node N : Needed)
201 Enqueue(N);
202
203 for (Node N : FirstQ) {
204 if (Colors.count(N))
205 continue;
206 NodeSet &Ns = Edges[N];
207 auto P = getUniqueColor(Ns);
208 if (!P.first)
209 return false;
210 Colors[N] = other(P.second);
211 }
212
213 // First, color nodes that don't have any dups.
214 for (auto E : Edges) {
215 Node N = E.first;
216 if (!Needed.count(conj(N)) || Colors.count(N))
217 continue;
218 auto P = getUniqueColor(E.second);
219 if (!P.first)
220 return false;
221 Colors[N] = other(P.second);
222 }
223
224 // Now, nodes that are still uncolored. Since the graph can be modified
225 // in this step, create a work queue.
226 std::vector<Node> WorkQ;
227 for (auto E : Edges) {
228 Node N = E.first;
229 if (!Colors.count(N))
230 WorkQ.push_back(N);
231 }
232
233 for (Node N : WorkQ) {
234 NodeSet &Ns = Edges[N];
235 auto P = getUniqueColor(Ns);
236 if (P.first) {
237 Colors[N] = other(P.second);
238 continue;
239 }
240
241 // Coloring failed. Split this node.
242 Node C = conj(N);
243 ColorKind ColorN = other(ColorKind::None);
244 ColorKind ColorC = other(ColorN);
245 NodeSet &Cs = Edges[C];
246 NodeSet CopyNs = Ns;
247 for (Node M : CopyNs) {
248 ColorKind ColorM = getColor(M);
249 if (ColorM == ColorC) {
250 // Connect M with C, disconnect M from N.
251 Cs.insert(M);
252 Edges[M].insert(C);
253 Ns.erase(M);
254 Edges[M].erase(N);
255 }
256 }
257 Colors[N] = ColorN;
258 Colors[C] = ColorC;
259 }
260
261 // Explicitly assign "None" to all uncolored nodes.
262 for (unsigned I = 0; I != Order.size(); ++I)
263 if (Colors.count(I) == 0)
264 Colors[I] = ColorKind::None;
265
266 return true;
267 }
268
269 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dump() const270 void Coloring::dump() const {
271 dbgs() << "{ Order: {";
272 for (Node P : Order) {
273 if (P != Ignore)
274 dbgs() << ' ' << P;
275 else
276 dbgs() << " -";
277 }
278 dbgs() << " }\n";
279 dbgs() << " Needed: {";
280 for (Node N : Needed)
281 dbgs() << ' ' << N;
282 dbgs() << " }\n";
283
284 dbgs() << " Edges: {\n";
285 for (auto E : Edges) {
286 dbgs() << " " << E.first << " -> {";
287 for (auto N : E.second)
288 dbgs() << ' ' << N;
289 dbgs() << " }\n";
290 }
291 dbgs() << " }\n";
292
293 auto ColorKindToName = [](ColorKind C) {
294 switch (C) {
295 case ColorKind::None:
296 return "None";
297 case ColorKind::Red:
298 return "Red";
299 case ColorKind::Black:
300 return "Black";
301 }
302 llvm_unreachable("all ColorKinds should be handled by the switch above");
303 };
304
305 dbgs() << " Colors: {\n";
306 for (auto C : Colors)
307 dbgs() << " " << C.first << " -> " << ColorKindToName(C.second) << "\n";
308 dbgs() << " }\n}\n";
309 }
310 #endif
311
312 namespace {
313 // Base class of for reordering networks. They don't strictly need to be
314 // permutations, as outputs with repeated occurrences of an input element
315 // are allowed.
316 struct PermNetwork {
317 using Controls = std::vector<uint8_t>;
318 using ElemType = int;
319 static constexpr ElemType Ignore = ElemType(-1);
320
321 enum : uint8_t {
322 None,
323 Pass,
324 Switch
325 };
326 enum : uint8_t {
327 Forward,
328 Reverse
329 };
330
PermNetwork__anonbbf86c770411::PermNetwork331 PermNetwork(ArrayRef<ElemType> Ord, unsigned Mult = 1) {
332 Order.assign(Ord.data(), Ord.data()+Ord.size());
333 Log = 0;
334
335 unsigned S = Order.size();
336 while (S >>= 1)
337 ++Log;
338
339 Table.resize(Order.size());
340 for (RowType &Row : Table)
341 Row.resize(Mult*Log, None);
342 }
343
getControls__anonbbf86c770411::PermNetwork344 void getControls(Controls &V, unsigned StartAt, uint8_t Dir) const {
345 unsigned Size = Order.size();
346 V.resize(Size);
347 for (unsigned I = 0; I != Size; ++I) {
348 unsigned W = 0;
349 for (unsigned L = 0; L != Log; ++L) {
350 unsigned C = ctl(I, StartAt+L) == Switch;
351 if (Dir == Forward)
352 W |= C << (Log-1-L);
353 else
354 W |= C << L;
355 }
356 assert(isUInt<8>(W));
357 V[I] = uint8_t(W);
358 }
359 }
360
ctl__anonbbf86c770411::PermNetwork361 uint8_t ctl(ElemType Pos, unsigned Step) const {
362 return Table[Pos][Step];
363 }
size__anonbbf86c770411::PermNetwork364 unsigned size() const {
365 return Order.size();
366 }
steps__anonbbf86c770411::PermNetwork367 unsigned steps() const {
368 return Log;
369 }
370
371 protected:
372 unsigned Log;
373 std::vector<ElemType> Order;
374 using RowType = std::vector<uint8_t>;
375 std::vector<RowType> Table;
376 };
377
378 struct ForwardDeltaNetwork : public PermNetwork {
ForwardDeltaNetwork__anonbbf86c770411::ForwardDeltaNetwork379 ForwardDeltaNetwork(ArrayRef<ElemType> Ord) : PermNetwork(Ord) {}
380
run__anonbbf86c770411::ForwardDeltaNetwork381 bool run(Controls &V) {
382 if (!route(Order.data(), Table.data(), size(), 0))
383 return false;
384 getControls(V, 0, Forward);
385 return true;
386 }
387
388 private:
389 bool route(ElemType *P, RowType *T, unsigned Size, unsigned Step);
390 };
391
392 struct ReverseDeltaNetwork : public PermNetwork {
ReverseDeltaNetwork__anonbbf86c770411::ReverseDeltaNetwork393 ReverseDeltaNetwork(ArrayRef<ElemType> Ord) : PermNetwork(Ord) {}
394
run__anonbbf86c770411::ReverseDeltaNetwork395 bool run(Controls &V) {
396 if (!route(Order.data(), Table.data(), size(), 0))
397 return false;
398 getControls(V, 0, Reverse);
399 return true;
400 }
401
402 private:
403 bool route(ElemType *P, RowType *T, unsigned Size, unsigned Step);
404 };
405
406 struct BenesNetwork : public PermNetwork {
BenesNetwork__anonbbf86c770411::BenesNetwork407 BenesNetwork(ArrayRef<ElemType> Ord) : PermNetwork(Ord, 2) {}
408
run__anonbbf86c770411::BenesNetwork409 bool run(Controls &F, Controls &R) {
410 if (!route(Order.data(), Table.data(), size(), 0))
411 return false;
412
413 getControls(F, 0, Forward);
414 getControls(R, Log, Reverse);
415 return true;
416 }
417
418 private:
419 bool route(ElemType *P, RowType *T, unsigned Size, unsigned Step);
420 };
421 } // namespace
422
route(ElemType * P,RowType * T,unsigned Size,unsigned Step)423 bool ForwardDeltaNetwork::route(ElemType *P, RowType *T, unsigned Size,
424 unsigned Step) {
425 bool UseUp = false, UseDown = false;
426 ElemType Num = Size;
427
428 // Cannot use coloring here, because coloring is used to determine
429 // the "big" switch, i.e. the one that changes halves, and in a forward
430 // network, a color can be simultaneously routed to both halves in the
431 // step we're working on.
432 for (ElemType J = 0; J != Num; ++J) {
433 ElemType I = P[J];
434 // I is the position in the input,
435 // J is the position in the output.
436 if (I == Ignore)
437 continue;
438 uint8_t S;
439 if (I < Num/2)
440 S = (J < Num/2) ? Pass : Switch;
441 else
442 S = (J < Num/2) ? Switch : Pass;
443
444 // U is the element in the table that needs to be updated.
445 ElemType U = (S == Pass) ? I : (I < Num/2 ? I+Num/2 : I-Num/2);
446 if (U < Num/2)
447 UseUp = true;
448 else
449 UseDown = true;
450 if (T[U][Step] != S && T[U][Step] != None)
451 return false;
452 T[U][Step] = S;
453 }
454
455 for (ElemType J = 0; J != Num; ++J)
456 if (P[J] != Ignore && P[J] >= Num/2)
457 P[J] -= Num/2;
458
459 if (Step+1 < Log) {
460 if (UseUp && !route(P, T, Size/2, Step+1))
461 return false;
462 if (UseDown && !route(P+Size/2, T+Size/2, Size/2, Step+1))
463 return false;
464 }
465 return true;
466 }
467
route(ElemType * P,RowType * T,unsigned Size,unsigned Step)468 bool ReverseDeltaNetwork::route(ElemType *P, RowType *T, unsigned Size,
469 unsigned Step) {
470 unsigned Pets = Log-1 - Step;
471 bool UseUp = false, UseDown = false;
472 ElemType Num = Size;
473
474 // In this step half-switching occurs, so coloring can be used.
475 Coloring G({P,Size});
476 const Coloring::MapType &M = G.colors();
477 if (M.empty())
478 return false;
479
480 ColorKind ColorUp = ColorKind::None;
481 for (ElemType J = 0; J != Num; ++J) {
482 ElemType I = P[J];
483 // I is the position in the input,
484 // J is the position in the output.
485 if (I == Ignore)
486 continue;
487 ColorKind C = M.at(I);
488 if (C == ColorKind::None)
489 continue;
490 // During "Step", inputs cannot switch halves, so if the "up" color
491 // is still unknown, make sure that it is selected in such a way that
492 // "I" will stay in the same half.
493 bool InpUp = I < Num/2;
494 if (ColorUp == ColorKind::None)
495 ColorUp = InpUp ? C : G.other(C);
496 if ((C == ColorUp) != InpUp) {
497 // If I should go to a different half than where is it now, give up.
498 return false;
499 }
500
501 uint8_t S;
502 if (InpUp) {
503 S = (J < Num/2) ? Pass : Switch;
504 UseUp = true;
505 } else {
506 S = (J < Num/2) ? Switch : Pass;
507 UseDown = true;
508 }
509 T[J][Pets] = S;
510 }
511
512 // Reorder the working permutation according to the computed switch table
513 // for the last step (i.e. Pets).
514 for (ElemType J = 0, E = Size / 2; J != E; ++J) {
515 ElemType PJ = P[J]; // Current values of P[J]
516 ElemType PC = P[J+Size/2]; // and P[conj(J)]
517 ElemType QJ = PJ; // New values of P[J]
518 ElemType QC = PC; // and P[conj(J)]
519 if (T[J][Pets] == Switch)
520 QC = PJ;
521 if (T[J+Size/2][Pets] == Switch)
522 QJ = PC;
523 P[J] = QJ;
524 P[J+Size/2] = QC;
525 }
526
527 for (ElemType J = 0; J != Num; ++J)
528 if (P[J] != Ignore && P[J] >= Num/2)
529 P[J] -= Num/2;
530
531 if (Step+1 < Log) {
532 if (UseUp && !route(P, T, Size/2, Step+1))
533 return false;
534 if (UseDown && !route(P+Size/2, T+Size/2, Size/2, Step+1))
535 return false;
536 }
537 return true;
538 }
539
route(ElemType * P,RowType * T,unsigned Size,unsigned Step)540 bool BenesNetwork::route(ElemType *P, RowType *T, unsigned Size,
541 unsigned Step) {
542 Coloring G({P,Size});
543 const Coloring::MapType &M = G.colors();
544 if (M.empty())
545 return false;
546 ElemType Num = Size;
547
548 unsigned Pets = 2*Log-1 - Step;
549 bool UseUp = false, UseDown = false;
550
551 // Both assignments, i.e. Red->Up and Red->Down are valid, but they will
552 // result in different controls. Let's pick the one where the first
553 // control will be "Pass".
554 ColorKind ColorUp = ColorKind::None;
555 for (ElemType J = 0; J != Num; ++J) {
556 ElemType I = P[J];
557 if (I == Ignore)
558 continue;
559 ColorKind C = M.at(I);
560 if (C == ColorKind::None)
561 continue;
562 if (ColorUp == ColorKind::None) {
563 ColorUp = (I < Num / 2) ? ColorKind::Red : ColorKind::Black;
564 }
565 unsigned CI = (I < Num/2) ? I+Num/2 : I-Num/2;
566 if (C == ColorUp) {
567 if (I < Num/2)
568 T[I][Step] = Pass;
569 else
570 T[CI][Step] = Switch;
571 T[J][Pets] = (J < Num/2) ? Pass : Switch;
572 UseUp = true;
573 } else { // Down
574 if (I < Num/2)
575 T[CI][Step] = Switch;
576 else
577 T[I][Step] = Pass;
578 T[J][Pets] = (J < Num/2) ? Switch : Pass;
579 UseDown = true;
580 }
581 }
582
583 // Reorder the working permutation according to the computed switch table
584 // for the last step (i.e. Pets).
585 for (ElemType J = 0; J != Num/2; ++J) {
586 ElemType PJ = P[J]; // Current values of P[J]
587 ElemType PC = P[J+Num/2]; // and P[conj(J)]
588 ElemType QJ = PJ; // New values of P[J]
589 ElemType QC = PC; // and P[conj(J)]
590 if (T[J][Pets] == Switch)
591 QC = PJ;
592 if (T[J+Num/2][Pets] == Switch)
593 QJ = PC;
594 P[J] = QJ;
595 P[J+Num/2] = QC;
596 }
597
598 for (ElemType J = 0; J != Num; ++J)
599 if (P[J] != Ignore && P[J] >= Num/2)
600 P[J] -= Num/2;
601
602 if (Step+1 < Log) {
603 if (UseUp && !route(P, T, Size/2, Step+1))
604 return false;
605 if (UseDown && !route(P+Size/2, T+Size/2, Size/2, Step+1))
606 return false;
607 }
608 return true;
609 }
610
611 // --------------------------------------------------------------------
612 // Support for building selection results (output instructions that are
613 // parts of the final selection).
614
615 namespace {
616 struct OpRef {
OpRef__anonbbf86c770711::OpRef617 OpRef(SDValue V) : OpV(V) {}
isValue__anonbbf86c770711::OpRef618 bool isValue() const { return OpV.getNode() != nullptr; }
isValid__anonbbf86c770711::OpRef619 bool isValid() const { return isValue() || !(OpN & Invalid); }
res__anonbbf86c770711::OpRef620 static OpRef res(int N) { return OpRef(Whole | (N & Index)); }
fail__anonbbf86c770711::OpRef621 static OpRef fail() { return OpRef(Invalid); }
622
lo__anonbbf86c770711::OpRef623 static OpRef lo(const OpRef &R) {
624 assert(!R.isValue());
625 return OpRef(R.OpN & (Undef | Index | LoHalf));
626 }
hi__anonbbf86c770711::OpRef627 static OpRef hi(const OpRef &R) {
628 assert(!R.isValue());
629 return OpRef(R.OpN & (Undef | Index | HiHalf));
630 }
undef__anonbbf86c770711::OpRef631 static OpRef undef(MVT Ty) { return OpRef(Undef | Ty.SimpleTy); }
632
633 // Direct value.
634 SDValue OpV = SDValue();
635
636 // Reference to the operand of the input node:
637 // If the 31st bit is 1, it's undef, otherwise, bits 28..0 are the
638 // operand index:
639 // If bit 30 is set, it's the high half of the operand.
640 // If bit 29 is set, it's the low half of the operand.
641 unsigned OpN = 0;
642
643 enum : unsigned {
644 Invalid = 0x10000000,
645 LoHalf = 0x20000000,
646 HiHalf = 0x40000000,
647 Whole = LoHalf | HiHalf,
648 Undef = 0x80000000,
649 Index = 0x0FFFFFFF, // Mask of the index value.
650 IndexBits = 28,
651 };
652
653 LLVM_DUMP_METHOD
654 void print(raw_ostream &OS, const SelectionDAG &G) const;
655
656 private:
OpRef__anonbbf86c770711::OpRef657 OpRef(unsigned N) : OpN(N) {}
658 };
659
660 struct NodeTemplate {
661 NodeTemplate() = default;
662 unsigned Opc = 0;
663 MVT Ty = MVT::Other;
664 std::vector<OpRef> Ops;
665
666 LLVM_DUMP_METHOD void print(raw_ostream &OS, const SelectionDAG &G) const;
667 };
668
669 struct ResultStack {
ResultStack__anonbbf86c770711::ResultStack670 ResultStack(SDNode *Inp)
671 : InpNode(Inp), InpTy(Inp->getValueType(0).getSimpleVT()) {}
672 SDNode *InpNode;
673 MVT InpTy;
push__anonbbf86c770711::ResultStack674 unsigned push(const NodeTemplate &Res) {
675 List.push_back(Res);
676 return List.size()-1;
677 }
push__anonbbf86c770711::ResultStack678 unsigned push(unsigned Opc, MVT Ty, std::vector<OpRef> &&Ops) {
679 NodeTemplate Res;
680 Res.Opc = Opc;
681 Res.Ty = Ty;
682 Res.Ops = Ops;
683 return push(Res);
684 }
empty__anonbbf86c770711::ResultStack685 bool empty() const { return List.empty(); }
size__anonbbf86c770711::ResultStack686 unsigned size() const { return List.size(); }
top__anonbbf86c770711::ResultStack687 unsigned top() const { return size()-1; }
operator []__anonbbf86c770711::ResultStack688 const NodeTemplate &operator[](unsigned I) const { return List[I]; }
reset__anonbbf86c770711::ResultStack689 unsigned reset(unsigned NewTop) {
690 List.resize(NewTop+1);
691 return NewTop;
692 }
693
694 using BaseType = std::vector<NodeTemplate>;
begin__anonbbf86c770711::ResultStack695 BaseType::iterator begin() { return List.begin(); }
end__anonbbf86c770711::ResultStack696 BaseType::iterator end() { return List.end(); }
begin__anonbbf86c770711::ResultStack697 BaseType::const_iterator begin() const { return List.begin(); }
end__anonbbf86c770711::ResultStack698 BaseType::const_iterator end() const { return List.end(); }
699
700 BaseType List;
701
702 LLVM_DUMP_METHOD
703 void print(raw_ostream &OS, const SelectionDAG &G) const;
704 };
705 } // namespace
706
707 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & OS,const SelectionDAG & G) const708 void OpRef::print(raw_ostream &OS, const SelectionDAG &G) const {
709 if (isValue()) {
710 OpV.getNode()->print(OS, &G);
711 return;
712 }
713 if (OpN & Invalid) {
714 OS << "invalid";
715 return;
716 }
717 if (OpN & Undef) {
718 OS << "undef";
719 return;
720 }
721 if ((OpN & Whole) != Whole) {
722 assert((OpN & Whole) == LoHalf || (OpN & Whole) == HiHalf);
723 if (OpN & LoHalf)
724 OS << "lo ";
725 else
726 OS << "hi ";
727 }
728 OS << '#' << SignExtend32(OpN & Index, IndexBits);
729 }
730
print(raw_ostream & OS,const SelectionDAG & G) const731 void NodeTemplate::print(raw_ostream &OS, const SelectionDAG &G) const {
732 const TargetInstrInfo &TII = *G.getSubtarget().getInstrInfo();
733 OS << format("%8s", EVT(Ty).getEVTString().c_str()) << " "
734 << TII.getName(Opc);
735 bool Comma = false;
736 for (const auto &R : Ops) {
737 if (Comma)
738 OS << ',';
739 Comma = true;
740 OS << ' ';
741 R.print(OS, G);
742 }
743 }
744
print(raw_ostream & OS,const SelectionDAG & G) const745 void ResultStack::print(raw_ostream &OS, const SelectionDAG &G) const {
746 OS << "Input node:\n";
747 #ifndef NDEBUG
748 InpNode->dumpr(&G);
749 #endif
750 OS << "Result templates:\n";
751 for (unsigned I = 0, E = List.size(); I != E; ++I) {
752 OS << '[' << I << "] ";
753 List[I].print(OS, G);
754 OS << '\n';
755 }
756 }
757 #endif
758
759 namespace {
760 struct ShuffleMask {
ShuffleMask__anonbbf86c770911::ShuffleMask761 ShuffleMask(ArrayRef<int> M) : Mask(M) {
762 for (int M : Mask) {
763 if (M == -1)
764 continue;
765 MinSrc = (MinSrc == -1) ? M : std::min(MinSrc, M);
766 MaxSrc = (MaxSrc == -1) ? M : std::max(MaxSrc, M);
767 }
768 }
769
770 ArrayRef<int> Mask;
771 int MinSrc = -1, MaxSrc = -1;
772
lo__anonbbf86c770911::ShuffleMask773 ShuffleMask lo() const {
774 size_t H = Mask.size()/2;
775 return ShuffleMask(Mask.take_front(H));
776 }
hi__anonbbf86c770911::ShuffleMask777 ShuffleMask hi() const {
778 size_t H = Mask.size()/2;
779 return ShuffleMask(Mask.take_back(H));
780 }
781
print__anonbbf86c770911::ShuffleMask782 void print(raw_ostream &OS) const {
783 OS << "MinSrc:" << MinSrc << ", MaxSrc:" << MaxSrc << " {";
784 for (int M : Mask)
785 OS << ' ' << M;
786 OS << " }";
787 }
788 };
789
790 LLVM_ATTRIBUTE_UNUSED
operator <<(raw_ostream & OS,const ShuffleMask & SM)791 raw_ostream &operator<<(raw_ostream &OS, const ShuffleMask &SM) {
792 SM.print(OS);
793 return OS;
794 }
795 } // namespace
796
797 // --------------------------------------------------------------------
798 // The HvxSelector class.
799
getHexagonLowering(SelectionDAG & G)800 static const HexagonTargetLowering &getHexagonLowering(SelectionDAG &G) {
801 return static_cast<const HexagonTargetLowering&>(G.getTargetLoweringInfo());
802 }
getHexagonSubtarget(SelectionDAG & G)803 static const HexagonSubtarget &getHexagonSubtarget(SelectionDAG &G) {
804 return G.getSubtarget<HexagonSubtarget>();
805 }
806
807 namespace llvm {
808 struct HvxSelector {
809 const HexagonTargetLowering &Lower;
810 HexagonDAGToDAGISel &ISel;
811 SelectionDAG &DAG;
812 const HexagonSubtarget &HST;
813 const unsigned HwLen;
814
HvxSelectorllvm::HvxSelector815 HvxSelector(HexagonDAGToDAGISel &HS, SelectionDAG &G)
816 : Lower(getHexagonLowering(G)), ISel(HS), DAG(G),
817 HST(getHexagonSubtarget(G)), HwLen(HST.getVectorLength()) {}
818
getSingleVTllvm::HvxSelector819 MVT getSingleVT(MVT ElemTy) const {
820 assert(ElemTy != MVT::i1 && "Use getBoolVT for predicates");
821 unsigned NumElems = HwLen / (ElemTy.getSizeInBits()/8);
822 return MVT::getVectorVT(ElemTy, NumElems);
823 }
824
getPairVTllvm::HvxSelector825 MVT getPairVT(MVT ElemTy) const {
826 assert(ElemTy != MVT::i1); // Suspicious: there are no predicate pairs.
827 unsigned NumElems = (2*HwLen) / (ElemTy.getSizeInBits()/8);
828 return MVT::getVectorVT(ElemTy, NumElems);
829 }
830
getBoolVTllvm::HvxSelector831 MVT getBoolVT() const {
832 // Return HwLen x i1.
833 return MVT::getVectorVT(MVT::i1, HwLen);
834 }
835
836 void selectShuffle(SDNode *N);
837 void selectRor(SDNode *N);
838 void selectVAlign(SDNode *N);
839
840 private:
841 void select(SDNode *ISelN);
842 void materialize(const ResultStack &Results);
843
844 SDValue getConst32(int Val, const SDLoc &dl);
845 SDValue getVectorConstant(ArrayRef<uint8_t> Data, const SDLoc &dl);
846
847 enum : unsigned {
848 None,
849 PackMux,
850 };
851 OpRef concats(OpRef Va, OpRef Vb, ResultStack &Results);
852 OpRef packs(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
853 MutableArrayRef<int> NewMask, unsigned Options = None);
854 OpRef packp(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results,
855 MutableArrayRef<int> NewMask);
856 OpRef vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
857 ResultStack &Results);
858 OpRef vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
859 ResultStack &Results);
860
861 OpRef shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results);
862 OpRef shuffs2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
863 OpRef shuffp1(ShuffleMask SM, OpRef Va, ResultStack &Results);
864 OpRef shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
865
866 OpRef butterfly(ShuffleMask SM, OpRef Va, ResultStack &Results);
867 OpRef contracting(ShuffleMask SM, OpRef Va, OpRef Vb, ResultStack &Results);
868 OpRef expanding(ShuffleMask SM, OpRef Va, ResultStack &Results);
869 OpRef perfect(ShuffleMask SM, OpRef Va, ResultStack &Results);
870
871 bool selectVectorConstants(SDNode *N);
872 bool scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl, MVT ResTy,
873 SDValue Va, SDValue Vb, SDNode *N);
874
875 };
876 }
877
splitMask(ArrayRef<int> Mask,MutableArrayRef<int> MaskL,MutableArrayRef<int> MaskR)878 static void splitMask(ArrayRef<int> Mask, MutableArrayRef<int> MaskL,
879 MutableArrayRef<int> MaskR) {
880 unsigned VecLen = Mask.size();
881 assert(MaskL.size() == VecLen && MaskR.size() == VecLen);
882 for (unsigned I = 0; I != VecLen; ++I) {
883 int M = Mask[I];
884 if (M < 0) {
885 MaskL[I] = MaskR[I] = -1;
886 } else if (unsigned(M) < VecLen) {
887 MaskL[I] = M;
888 MaskR[I] = -1;
889 } else {
890 MaskL[I] = -1;
891 MaskR[I] = M-VecLen;
892 }
893 }
894 }
895
findStrip(ArrayRef<int> A,int Inc,unsigned MaxLen)896 static std::pair<int,unsigned> findStrip(ArrayRef<int> A, int Inc,
897 unsigned MaxLen) {
898 assert(A.size() > 0 && A.size() >= MaxLen);
899 int F = A[0];
900 int E = F;
901 for (unsigned I = 1; I != MaxLen; ++I) {
902 if (A[I] - E != Inc)
903 return { F, I };
904 E = A[I];
905 }
906 return { F, MaxLen };
907 }
908
isUndef(ArrayRef<int> Mask)909 static bool isUndef(ArrayRef<int> Mask) {
910 for (int Idx : Mask)
911 if (Idx != -1)
912 return false;
913 return true;
914 }
915
isIdentity(ArrayRef<int> Mask)916 static bool isIdentity(ArrayRef<int> Mask) {
917 for (int I = 0, E = Mask.size(); I != E; ++I) {
918 int M = Mask[I];
919 if (M >= 0 && M != I)
920 return false;
921 }
922 return true;
923 }
924
getInputSegmentList(ShuffleMask SM,unsigned SegLen)925 static SmallVector<unsigned, 4> getInputSegmentList(ShuffleMask SM,
926 unsigned SegLen) {
927 assert(isPowerOf2_32(SegLen));
928 SmallVector<unsigned, 4> SegList;
929 if (SM.MaxSrc == -1)
930 return SegList;
931
932 unsigned Shift = Log2_32(SegLen);
933 BitVector Segs(alignTo(SM.MaxSrc + 1, SegLen) >> Shift);
934
935 for (int M : SM.Mask) {
936 if (M >= 0)
937 Segs.set(M >> Shift);
938 }
939
940 for (unsigned B : Segs.set_bits())
941 SegList.push_back(B);
942 return SegList;
943 }
944
getOutputSegmentMap(ShuffleMask SM,unsigned SegLen)945 static SmallVector<unsigned, 4> getOutputSegmentMap(ShuffleMask SM,
946 unsigned SegLen) {
947 // Calculate the layout of the output segments in terms of the input
948 // segments.
949 // For example [1,3,1,0] means that the output consists of 4 output
950 // segments, where the first output segment has only elements of the
951 // input segment at index 1. The next output segment only has elements
952 // of the input segment 3, etc.
953 // If an output segment only has undef elements, the value will be ~0u.
954 // If an output segment has elements from more than one input segment,
955 // the corresponding value will be ~1u.
956 unsigned MaskLen = SM.Mask.size();
957 assert(MaskLen % SegLen == 0);
958 SmallVector<unsigned, 4> Map(MaskLen / SegLen);
959
960 for (int S = 0, E = Map.size(); S != E; ++S) {
961 unsigned Idx = ~0u;
962 for (int I = 0; I != static_cast<int>(SegLen); ++I) {
963 int M = SM.Mask[S*SegLen + I];
964 if (M < 0)
965 continue;
966 unsigned G = M / SegLen; // Input segment of this element.
967 if (Idx == ~0u) {
968 Idx = G;
969 } else if (Idx != G) {
970 Idx = ~1u;
971 break;
972 }
973 }
974 Map[S] = Idx;
975 }
976
977 return Map;
978 }
979
packSegmentMask(ArrayRef<int> Mask,ArrayRef<unsigned> OutSegMap,unsigned SegLen,MutableArrayRef<int> PackedMask)980 static void packSegmentMask(ArrayRef<int> Mask, ArrayRef<unsigned> OutSegMap,
981 unsigned SegLen, MutableArrayRef<int> PackedMask) {
982 SmallVector<unsigned, 4> InvMap;
983 for (int I = OutSegMap.size() - 1; I >= 0; --I) {
984 unsigned S = OutSegMap[I];
985 assert(S != ~0u && "Unexpected undef");
986 assert(S != ~1u && "Unexpected multi");
987 if (InvMap.size() <= S)
988 InvMap.resize(S+1);
989 InvMap[S] = I;
990 }
991
992 unsigned Shift = Log2_32(SegLen);
993 for (int I = 0, E = Mask.size(); I != E; ++I) {
994 int M = Mask[I];
995 if (M >= 0) {
996 int OutIdx = InvMap[M >> Shift];
997 M = (M & (SegLen-1)) + SegLen*OutIdx;
998 }
999 PackedMask[I] = M;
1000 }
1001 }
1002
isPermutation(ArrayRef<int> Mask)1003 static bool isPermutation(ArrayRef<int> Mask) {
1004 // Check by adding all numbers only works if there is no overflow.
1005 assert(Mask.size() < 0x00007FFF && "Overflow failure");
1006 int Sum = 0;
1007 for (int Idx : Mask) {
1008 if (Idx == -1)
1009 return false;
1010 Sum += Idx;
1011 }
1012 int N = Mask.size();
1013 return 2*Sum == N*(N-1);
1014 }
1015
selectVectorConstants(SDNode * N)1016 bool HvxSelector::selectVectorConstants(SDNode *N) {
1017 // Constant vectors are generated as loads from constant pools or as
1018 // splats of a constant value. Since they are generated during the
1019 // selection process, the main selection algorithm is not aware of them.
1020 // Select them directly here.
1021 SmallVector<SDNode*,4> Nodes;
1022 SetVector<SDNode*> WorkQ;
1023
1024 // The DAG can change (due to CSE) during selection, so cache all the
1025 // unselected nodes first to avoid traversing a mutating DAG.
1026 WorkQ.insert(N);
1027 for (unsigned i = 0; i != WorkQ.size(); ++i) {
1028 SDNode *W = WorkQ[i];
1029 if (!W->isMachineOpcode() && W->getOpcode() == HexagonISD::ISEL)
1030 Nodes.push_back(W);
1031 for (unsigned j = 0, f = W->getNumOperands(); j != f; ++j)
1032 WorkQ.insert(W->getOperand(j).getNode());
1033 }
1034
1035 for (SDNode *L : Nodes)
1036 select(L);
1037
1038 return !Nodes.empty();
1039 }
1040
materialize(const ResultStack & Results)1041 void HvxSelector::materialize(const ResultStack &Results) {
1042 DEBUG_WITH_TYPE("isel", {
1043 dbgs() << "Materializing\n";
1044 Results.print(dbgs(), DAG);
1045 });
1046 if (Results.empty())
1047 return;
1048 const SDLoc &dl(Results.InpNode);
1049 std::vector<SDValue> Output;
1050
1051 for (unsigned I = 0, E = Results.size(); I != E; ++I) {
1052 const NodeTemplate &Node = Results[I];
1053 std::vector<SDValue> Ops;
1054 for (const OpRef &R : Node.Ops) {
1055 assert(R.isValid());
1056 if (R.isValue()) {
1057 Ops.push_back(R.OpV);
1058 continue;
1059 }
1060 if (R.OpN & OpRef::Undef) {
1061 MVT::SimpleValueType SVT = MVT::SimpleValueType(R.OpN & OpRef::Index);
1062 Ops.push_back(ISel.selectUndef(dl, MVT(SVT)));
1063 continue;
1064 }
1065 // R is an index of a result.
1066 unsigned Part = R.OpN & OpRef::Whole;
1067 int Idx = SignExtend32(R.OpN & OpRef::Index, OpRef::IndexBits);
1068 if (Idx < 0)
1069 Idx += I;
1070 assert(Idx >= 0 && unsigned(Idx) < Output.size());
1071 SDValue Op = Output[Idx];
1072 MVT OpTy = Op.getValueType().getSimpleVT();
1073 if (Part != OpRef::Whole) {
1074 assert(Part == OpRef::LoHalf || Part == OpRef::HiHalf);
1075 MVT HalfTy = MVT::getVectorVT(OpTy.getVectorElementType(),
1076 OpTy.getVectorNumElements()/2);
1077 unsigned Sub = (Part == OpRef::LoHalf) ? Hexagon::vsub_lo
1078 : Hexagon::vsub_hi;
1079 Op = DAG.getTargetExtractSubreg(Sub, dl, HalfTy, Op);
1080 }
1081 Ops.push_back(Op);
1082 } // for (Node : Results)
1083
1084 assert(Node.Ty != MVT::Other);
1085 SDNode *ResN = (Node.Opc == TargetOpcode::COPY)
1086 ? Ops.front().getNode()
1087 : DAG.getMachineNode(Node.Opc, dl, Node.Ty, Ops);
1088 Output.push_back(SDValue(ResN, 0));
1089 }
1090
1091 SDNode *OutN = Output.back().getNode();
1092 SDNode *InpN = Results.InpNode;
1093 DEBUG_WITH_TYPE("isel", {
1094 dbgs() << "Generated node:\n";
1095 OutN->dumpr(&DAG);
1096 });
1097
1098 ISel.ReplaceNode(InpN, OutN);
1099 selectVectorConstants(OutN);
1100 DAG.RemoveDeadNodes();
1101 }
1102
concats(OpRef Lo,OpRef Hi,ResultStack & Results)1103 OpRef HvxSelector::concats(OpRef Lo, OpRef Hi, ResultStack &Results) {
1104 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
1105 const SDLoc &dl(Results.InpNode);
1106 Results.push(TargetOpcode::REG_SEQUENCE, getPairVT(MVT::i8), {
1107 getConst32(Hexagon::HvxWRRegClassID, dl),
1108 Lo, getConst32(Hexagon::vsub_lo, dl),
1109 Hi, getConst32(Hexagon::vsub_hi, dl),
1110 });
1111 return OpRef::res(Results.top());
1112 }
1113
1114 // Va, Vb are single vectors. If SM only uses two vector halves from Va/Vb,
1115 // pack these halves into a single vector, and remap SM into NewMask to use
1116 // the new vector instead.
packs(ShuffleMask SM,OpRef Va,OpRef Vb,ResultStack & Results,MutableArrayRef<int> NewMask,unsigned Options)1117 OpRef HvxSelector::packs(ShuffleMask SM, OpRef Va, OpRef Vb,
1118 ResultStack &Results, MutableArrayRef<int> NewMask,
1119 unsigned Options) {
1120 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
1121 if (!Va.isValid() || !Vb.isValid())
1122 return OpRef::fail();
1123
1124 MVT Ty = getSingleVT(MVT::i8);
1125 MVT PairTy = getPairVT(MVT::i8);
1126 OpRef Inp[2] = {Va, Vb};
1127 unsigned VecLen = SM.Mask.size();
1128
1129 auto valign = [this](OpRef Lo, OpRef Hi, unsigned Amt, MVT Ty,
1130 ResultStack &Results) {
1131 if (Amt == 0)
1132 return Lo;
1133 const SDLoc &dl(Results.InpNode);
1134 if (isUInt<3>(Amt) || isUInt<3>(HwLen - Amt)) {
1135 bool IsRight = isUInt<3>(Amt); // Right align.
1136 SDValue S = getConst32(IsRight ? Amt : HwLen - Amt, dl);
1137 unsigned Opc = IsRight ? Hexagon::V6_valignbi : Hexagon::V6_vlalignbi;
1138 Results.push(Opc, Ty, {Hi, Lo, S});
1139 return OpRef::res(Results.top());
1140 }
1141 Results.push(Hexagon::A2_tfrsi, MVT::i32, {getConst32(Amt, dl)});
1142 OpRef A = OpRef::res(Results.top());
1143 Results.push(Hexagon::V6_valignb, Ty, {Hi, Lo, A});
1144 return OpRef::res(Results.top());
1145 };
1146
1147 // Segment is a vector half.
1148 unsigned SegLen = HwLen / 2;
1149
1150 // Check if we can shuffle vector halves around to get the used elements
1151 // into a single vector.
1152 SmallVector<int,128> MaskH(SM.Mask.begin(), SM.Mask.end());
1153 SmallVector<unsigned, 4> SegList = getInputSegmentList(SM.Mask, SegLen);
1154 unsigned SegCount = SegList.size();
1155 SmallVector<unsigned, 4> SegMap = getOutputSegmentMap(SM.Mask, SegLen);
1156
1157 if (SegList.empty())
1158 return OpRef::undef(Ty);
1159
1160 // NOTE:
1161 // In the following part of the function, where the segments are rearranged,
1162 // the shuffle mask SM can be of any length that is a multiple of a vector
1163 // (i.e. a multiple of 2*SegLen), and non-zero.
1164 // The output segment map is computed, and it may have any even number of
1165 // entries, but the rearrangement of input segments will be done based only
1166 // on the first two (non-undef) entries in the segment map.
1167 // For example, if the output map is 3, 1, 1, 3 (it can have at most two
1168 // distinct entries!), the segments 1 and 3 of Va/Vb will be packaged into
1169 // a single vector V = 3:1. The output mask will then be updated to use
1170 // seg(0,V), seg(1,V), seg(1,V), seg(0,V).
1171 //
1172 // Picking the segments based on the output map is an optimization. For
1173 // correctness it is only necessary that Seg0 and Seg1 are the two input
1174 // segments that are used in the output.
1175
1176 unsigned Seg0 = ~0u, Seg1 = ~0u;
1177 for (int I = 0, E = SegMap.size(); I != E; ++I) {
1178 unsigned X = SegMap[I];
1179 if (X == ~0u)
1180 continue;
1181 if (Seg0 == ~0u)
1182 Seg0 = X;
1183 else if (Seg1 != ~0u)
1184 break;
1185 if (X == ~1u || X != Seg0)
1186 Seg1 = X;
1187 }
1188
1189 if (SegCount == 1) {
1190 unsigned SrcOp = SegList[0] / 2;
1191 for (int I = 0; I != static_cast<int>(VecLen); ++I) {
1192 int M = SM.Mask[I];
1193 if (M >= 0) {
1194 M -= SrcOp * HwLen;
1195 assert(M >= 0);
1196 }
1197 NewMask[I] = M;
1198 }
1199 return Inp[SrcOp];
1200 }
1201
1202 if (SegCount == 2) {
1203 // Seg0 should not be undef here: this would imply a SegList
1204 // with <= 1 elements, which was checked earlier.
1205 assert(Seg0 != ~0u);
1206
1207 // If Seg0 or Seg1 are "multi-defined", pick them from the input
1208 // segment list instead.
1209 if (Seg0 == ~1u || Seg1 == ~1u) {
1210 if (Seg0 == Seg1) {
1211 Seg0 = SegList[0];
1212 Seg1 = SegList[1];
1213 } else if (Seg0 == ~1u) {
1214 Seg0 = SegList[0] != Seg1 ? SegList[0] : SegList[1];
1215 } else {
1216 assert(Seg1 == ~1u);
1217 Seg1 = SegList[0] != Seg0 ? SegList[0] : SegList[1];
1218 }
1219 }
1220 assert(Seg0 != ~1u && Seg1 != ~1u);
1221
1222 assert(Seg0 != Seg1 && "Expecting different segments");
1223 const SDLoc &dl(Results.InpNode);
1224 Results.push(Hexagon::A2_tfrsi, MVT::i32, {getConst32(SegLen, dl)});
1225 OpRef HL = OpRef::res(Results.top());
1226
1227 // Va = AB, Vb = CD
1228
1229 if (Seg0 / 2 == Seg1 / 2) {
1230 // Same input vector.
1231 Va = Inp[Seg0 / 2];
1232 if (Seg0 > Seg1) {
1233 // Swap halves.
1234 Results.push(Hexagon::V6_vror, Ty, {Inp[Seg0 / 2], HL});
1235 Va = OpRef::res(Results.top());
1236 }
1237 packSegmentMask(SM.Mask, {Seg0, Seg1}, SegLen, MaskH);
1238 } else if (Seg0 % 2 == Seg1 % 2) {
1239 // Picking AC, BD, CA, or DB.
1240 // vshuff(CD,AB,HL) -> BD:AC
1241 // vshuff(AB,CD,HL) -> DB:CA
1242 auto Vs = (Seg0 == 0 || Seg0 == 1) ? std::make_pair(Vb, Va) // AC or BD
1243 : std::make_pair(Va, Vb); // CA or DB
1244 Results.push(Hexagon::V6_vshuffvdd, PairTy, {Vs.first, Vs.second, HL});
1245 OpRef P = OpRef::res(Results.top());
1246 Va = (Seg0 == 0 || Seg0 == 2) ? OpRef::lo(P) : OpRef::hi(P);
1247 packSegmentMask(SM.Mask, {Seg0, Seg1}, SegLen, MaskH);
1248 } else {
1249 // Picking AD, BC, CB, or DA.
1250 if ((Seg0 == 0 && Seg1 == 3) || (Seg0 == 2 && Seg1 == 1)) {
1251 // AD or BC: this can be done using vmux.
1252 // Q = V6_pred_scalar2 SegLen
1253 // V = V6_vmux Q, (Va, Vb) or (Vb, Va)
1254 Results.push(Hexagon::V6_pred_scalar2, getBoolVT(), {HL});
1255 OpRef Qt = OpRef::res(Results.top());
1256 auto Vs = (Seg0 == 0) ? std::make_pair(Va, Vb) // AD
1257 : std::make_pair(Vb, Va); // CB
1258 Results.push(Hexagon::V6_vmux, Ty, {Qt, Vs.first, Vs.second});
1259 Va = OpRef::res(Results.top());
1260 packSegmentMask(SM.Mask, {Seg0, Seg1}, SegLen, MaskH);
1261 } else {
1262 // BC or DA: this could be done via valign by SegLen.
1263 // Do nothing here, because valign (if possible) will be generated
1264 // later on (make sure the Seg0 values are as expected).
1265 assert(Seg0 == 1 || Seg0 == 3);
1266 }
1267 }
1268 }
1269
1270 // Check if the arguments can be packed by valign(Va,Vb) or valign(Vb,Va).
1271
1272 ShuffleMask SMH(MaskH);
1273 assert(SMH.Mask.size() == VecLen);
1274 SmallVector<int,128> MaskA(SMH.Mask.begin(), SMH.Mask.end());
1275
1276 if (SMH.MaxSrc - SMH.MinSrc >= static_cast<int>(HwLen)) {
1277 // valign(Lo=Va,Hi=Vb) won't work. Try swapping Va/Vb.
1278 SmallVector<int,128> Swapped(SMH.Mask.begin(), SMH.Mask.end());
1279 ShuffleVectorSDNode::commuteMask(Swapped);
1280 ShuffleMask SW(Swapped);
1281 if (SW.MaxSrc - SW.MinSrc < static_cast<int>(HwLen)) {
1282 MaskA.assign(SW.Mask.begin(), SW.Mask.end());
1283 std::swap(Va, Vb);
1284 }
1285 }
1286 ShuffleMask SMA(MaskA);
1287 assert(SMA.Mask.size() == VecLen);
1288
1289 if (SMA.MaxSrc - SMA.MinSrc < static_cast<int>(HwLen)) {
1290 int ShiftR = SMA.MinSrc;
1291 if (ShiftR >= static_cast<int>(HwLen)) {
1292 Va = Vb;
1293 Vb = OpRef::undef(Ty);
1294 ShiftR -= HwLen;
1295 }
1296 OpRef RetVal = valign(Va, Vb, ShiftR, Ty, Results);
1297
1298 for (int I = 0; I != static_cast<int>(VecLen); ++I) {
1299 int M = SMA.Mask[I];
1300 if (M != -1)
1301 M -= SMA.MinSrc;
1302 NewMask[I] = M;
1303 }
1304 return RetVal;
1305 }
1306
1307 // By here, packing by segment (half-vector) shuffling, and vector alignment
1308 // failed. Try vmux.
1309 // Note: since this is using the original mask, Va and Vb must not have been
1310 // modified.
1311
1312 if (Options & PackMux) {
1313 // If elements picked from Va and Vb have all different (source) indexes
1314 // (relative to the start of the argument), do a mux, and update the mask.
1315 BitVector Picked(HwLen);
1316 SmallVector<uint8_t,128> MuxBytes(HwLen);
1317 bool CanMux = true;
1318 for (int I = 0; I != static_cast<int>(VecLen); ++I) {
1319 int M = SM.Mask[I];
1320 if (M == -1)
1321 continue;
1322 if (M >= static_cast<int>(HwLen))
1323 M -= HwLen;
1324 else
1325 MuxBytes[M] = 0xFF;
1326 if (Picked[M]) {
1327 CanMux = false;
1328 break;
1329 }
1330 NewMask[I] = M;
1331 }
1332 if (CanMux)
1333 return vmuxs(MuxBytes, Va, Vb, Results);
1334 }
1335 return OpRef::fail();
1336 }
1337
1338 // Va, Vb are vector pairs. If SM only uses two single vectors from Va/Vb,
1339 // pack these vectors into a pair, and remap SM into NewMask to use the
1340 // new pair instead.
packp(ShuffleMask SM,OpRef Va,OpRef Vb,ResultStack & Results,MutableArrayRef<int> NewMask)1341 OpRef HvxSelector::packp(ShuffleMask SM, OpRef Va, OpRef Vb,
1342 ResultStack &Results, MutableArrayRef<int> NewMask) {
1343 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
1344 SmallVector<unsigned, 4> SegList = getInputSegmentList(SM.Mask, HwLen);
1345 if (SegList.empty())
1346 return OpRef::undef(getPairVT(MVT::i8));
1347
1348 // If more than two halves are used, bail.
1349 // TODO: be more aggressive here?
1350 unsigned SegCount = SegList.size();
1351 if (SegCount > 2)
1352 return OpRef::fail();
1353
1354 MVT HalfTy = getSingleVT(MVT::i8);
1355
1356 OpRef Inp[2] = { Va, Vb };
1357 OpRef Out[2] = { OpRef::undef(HalfTy), OpRef::undef(HalfTy) };
1358
1359 // Really make sure we have at most 2 vectors used in the mask.
1360 assert(SegCount <= 2);
1361
1362 for (int I = 0, E = SegList.size(); I != E; ++I) {
1363 unsigned S = SegList[I];
1364 OpRef Op = Inp[S / 2];
1365 Out[I] = (S & 1) ? OpRef::hi(Op) : OpRef::lo(Op);
1366 }
1367
1368 // NOTE: Using SegList as the packing map here (not SegMap). This works,
1369 // because we're not concerned here about the order of the segments (i.e.
1370 // single vectors) in the output pair. Changing the order of vectors is
1371 // free (as opposed to changing the order of vector halves as in packs),
1372 // and so there is no extra cost added in case the order needs to be
1373 // changed later.
1374 packSegmentMask(SM.Mask, SegList, HwLen, NewMask);
1375 return concats(Out[0], Out[1], Results);
1376 }
1377
vmuxs(ArrayRef<uint8_t> Bytes,OpRef Va,OpRef Vb,ResultStack & Results)1378 OpRef HvxSelector::vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
1379 ResultStack &Results) {
1380 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
1381 MVT ByteTy = getSingleVT(MVT::i8);
1382 MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen);
1383 const SDLoc &dl(Results.InpNode);
1384 SDValue B = getVectorConstant(Bytes, dl);
1385 Results.push(Hexagon::V6_vd0, ByteTy, {});
1386 Results.push(Hexagon::V6_veqb, BoolTy, {OpRef(B), OpRef::res(-1)});
1387 Results.push(Hexagon::V6_vmux, ByteTy, {OpRef::res(-1), Vb, Va});
1388 return OpRef::res(Results.top());
1389 }
1390
vmuxp(ArrayRef<uint8_t> Bytes,OpRef Va,OpRef Vb,ResultStack & Results)1391 OpRef HvxSelector::vmuxp(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
1392 ResultStack &Results) {
1393 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
1394 size_t S = Bytes.size() / 2;
1395 OpRef L = vmuxs(Bytes.take_front(S), OpRef::lo(Va), OpRef::lo(Vb), Results);
1396 OpRef H = vmuxs(Bytes.drop_front(S), OpRef::hi(Va), OpRef::hi(Vb), Results);
1397 return concats(L, H, Results);
1398 }
1399
shuffs1(ShuffleMask SM,OpRef Va,ResultStack & Results)1400 OpRef HvxSelector::shuffs1(ShuffleMask SM, OpRef Va, ResultStack &Results) {
1401 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
1402 unsigned VecLen = SM.Mask.size();
1403 assert(HwLen == VecLen);
1404 (void)VecLen;
1405 assert(all_of(SM.Mask, [this](int M) { return M == -1 || M < int(HwLen); }));
1406
1407 if (isIdentity(SM.Mask))
1408 return Va;
1409 if (isUndef(SM.Mask))
1410 return OpRef::undef(getSingleVT(MVT::i8));
1411
1412 unsigned HalfLen = HwLen / 2;
1413 assert(isPowerOf2_32(HalfLen));
1414
1415 // Handle special case where the output is the same half of the input
1416 // repeated twice, i.e. if Va = AB, then handle the output of AA or BB.
1417 std::pair<int, unsigned> Strip1 = findStrip(SM.Mask, 1, HalfLen);
1418 if ((Strip1.first & ~HalfLen) == 0 && Strip1.second == HalfLen) {
1419 std::pair<int, unsigned> Strip2 =
1420 findStrip(SM.Mask.drop_front(HalfLen), 1, HalfLen);
1421 if (Strip1 == Strip2) {
1422 const SDLoc &dl(Results.InpNode);
1423 Results.push(Hexagon::A2_tfrsi, MVT::i32, {getConst32(HalfLen, dl)});
1424 Results.push(Hexagon::V6_vshuffvdd, getPairVT(MVT::i8),
1425 {Va, Va, OpRef::res(Results.top())});
1426 OpRef S = OpRef::res(Results.top());
1427 return (Strip1.first == 0) ? OpRef::lo(S) : OpRef::hi(S);
1428 }
1429 }
1430
1431 OpRef P = perfect(SM, Va, Results);
1432 if (P.isValid())
1433 return P;
1434 return butterfly(SM, Va, Results);
1435 }
1436
shuffs2(ShuffleMask SM,OpRef Va,OpRef Vb,ResultStack & Results)1437 OpRef HvxSelector::shuffs2(ShuffleMask SM, OpRef Va, OpRef Vb,
1438 ResultStack &Results) {
1439 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
1440 if (isUndef(SM.Mask))
1441 return OpRef::undef(getSingleVT(MVT::i8));
1442
1443 OpRef C = contracting(SM, Va, Vb, Results);
1444 if (C.isValid())
1445 return C;
1446
1447 int VecLen = SM.Mask.size();
1448 SmallVector<int,128> PackedMask(VecLen);
1449 OpRef P = packs(SM, Va, Vb, Results, PackedMask);
1450 if (P.isValid())
1451 return shuffs1(ShuffleMask(PackedMask), P, Results);
1452
1453 // TODO: Before we split the mask, try perfect shuffle on concatenated
1454 // operands. This won't work now, because the perfect code does not
1455 // tolerate undefs in the mask.
1456
1457 SmallVector<int,128> MaskL(VecLen), MaskR(VecLen);
1458 splitMask(SM.Mask, MaskL, MaskR);
1459
1460 OpRef L = shuffs1(ShuffleMask(MaskL), Va, Results);
1461 OpRef R = shuffs1(ShuffleMask(MaskR), Vb, Results);
1462 if (!L.isValid() || !R.isValid())
1463 return OpRef::fail();
1464
1465 SmallVector<uint8_t,128> Bytes(VecLen);
1466 for (int I = 0; I != VecLen; ++I) {
1467 if (MaskL[I] != -1)
1468 Bytes[I] = 0xFF;
1469 }
1470 return vmuxs(Bytes, L, R, Results);
1471 }
1472
shuffp1(ShuffleMask SM,OpRef Va,ResultStack & Results)1473 OpRef HvxSelector::shuffp1(ShuffleMask SM, OpRef Va, ResultStack &Results) {
1474 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
1475 int VecLen = SM.Mask.size();
1476
1477 if (isIdentity(SM.Mask))
1478 return Va;
1479 if (isUndef(SM.Mask))
1480 return OpRef::undef(getPairVT(MVT::i8));
1481
1482 SmallVector<int,128> PackedMask(VecLen);
1483 OpRef P = packs(SM, OpRef::lo(Va), OpRef::hi(Va), Results, PackedMask);
1484 if (P.isValid()) {
1485 ShuffleMask PM(PackedMask);
1486 OpRef E = expanding(PM, P, Results);
1487 if (E.isValid())
1488 return E;
1489
1490 OpRef L = shuffs1(PM.lo(), P, Results);
1491 OpRef H = shuffs1(PM.hi(), P, Results);
1492 if (L.isValid() && H.isValid())
1493 return concats(L, H, Results);
1494 }
1495
1496 OpRef R = perfect(SM, Va, Results);
1497 if (R.isValid())
1498 return R;
1499 // TODO commute the mask and try the opposite order of the halves.
1500
1501 OpRef L = shuffs2(SM.lo(), OpRef::lo(Va), OpRef::hi(Va), Results);
1502 OpRef H = shuffs2(SM.hi(), OpRef::lo(Va), OpRef::hi(Va), Results);
1503 if (L.isValid() && H.isValid())
1504 return concats(L, H, Results);
1505
1506 return OpRef::fail();
1507 }
1508
shuffp2(ShuffleMask SM,OpRef Va,OpRef Vb,ResultStack & Results)1509 OpRef HvxSelector::shuffp2(ShuffleMask SM, OpRef Va, OpRef Vb,
1510 ResultStack &Results) {
1511 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
1512 if (isUndef(SM.Mask))
1513 return OpRef::undef(getPairVT(MVT::i8));
1514
1515 int VecLen = SM.Mask.size();
1516 SmallVector<int,256> PackedMask(VecLen);
1517 OpRef P = packp(SM, Va, Vb, Results, PackedMask);
1518 if (P.isValid())
1519 return shuffp1(ShuffleMask(PackedMask), P, Results);
1520
1521 SmallVector<int,256> MaskL(VecLen), MaskR(VecLen);
1522 splitMask(SM.Mask, MaskL, MaskR);
1523
1524 OpRef L = shuffp1(ShuffleMask(MaskL), Va, Results);
1525 OpRef R = shuffp1(ShuffleMask(MaskR), Vb, Results);
1526 if (!L.isValid() || !R.isValid())
1527 return OpRef::fail();
1528
1529 // Mux the results.
1530 SmallVector<uint8_t,256> Bytes(VecLen);
1531 for (int I = 0; I != VecLen; ++I) {
1532 if (MaskL[I] != -1)
1533 Bytes[I] = 0xFF;
1534 }
1535 return vmuxp(Bytes, L, R, Results);
1536 }
1537
1538 namespace {
1539 struct Deleter : public SelectionDAG::DAGNodeDeletedListener {
1540 template <typename T>
Deleter__anonbbf86c770d11::Deleter1541 Deleter(SelectionDAG &D, T &C)
1542 : SelectionDAG::DAGNodeDeletedListener(D, [&C] (SDNode *N, SDNode *E) {
1543 C.erase(N);
1544 }) {}
1545 };
1546
1547 template <typename T>
1548 struct NullifyingVector : public T {
1549 DenseMap<SDNode*, SDNode**> Refs;
NullifyingVector__anonbbf86c770d11::NullifyingVector1550 NullifyingVector(T &&V) : T(V) {
1551 for (unsigned i = 0, e = T::size(); i != e; ++i) {
1552 SDNode *&N = T::operator[](i);
1553 Refs[N] = &N;
1554 }
1555 }
erase__anonbbf86c770d11::NullifyingVector1556 void erase(SDNode *N) {
1557 auto F = Refs.find(N);
1558 if (F != Refs.end())
1559 *F->second = nullptr;
1560 }
1561 };
1562 }
1563
select(SDNode * ISelN)1564 void HvxSelector::select(SDNode *ISelN) {
1565 // What's important here is to select the right set of nodes. The main
1566 // selection algorithm loops over nodes in a topological order, i.e. users
1567 // are visited before their operands.
1568 //
1569 // It is an error to have an unselected node with a selected operand, and
1570 // there is an assertion in the main selector code to enforce that.
1571 //
1572 // Such a situation could occur if we selected a node, which is both a
1573 // subnode of ISelN, and a subnode of an unrelated (and yet unselected)
1574 // node in the DAG.
1575 assert(ISelN->getOpcode() == HexagonISD::ISEL);
1576 SDNode *N0 = ISelN->getOperand(0).getNode();
1577 if (N0->isMachineOpcode()) {
1578 ISel.ReplaceNode(ISelN, N0);
1579 return;
1580 }
1581
1582 // There could have been nodes created (i.e. inserted into the DAG)
1583 // that are now dead. Remove them, in case they use any of the nodes
1584 // to select (and make them look shared).
1585 DAG.RemoveDeadNodes();
1586
1587 SetVector<SDNode*> SubNodes, TmpQ;
1588 std::map<SDNode*,unsigned> NumOps;
1589
1590 // Don't want to select N0 if it's shared with another node, except if
1591 // it's shared with other ISELs.
1592 auto IsISelN = [](SDNode *T) { return T->getOpcode() == HexagonISD::ISEL; };
1593 if (llvm::all_of(N0->uses(), IsISelN))
1594 SubNodes.insert(N0);
1595
1596 auto InSubNodes = [&SubNodes](SDNode *T) { return SubNodes.count(T); };
1597 for (unsigned I = 0; I != SubNodes.size(); ++I) {
1598 SDNode *S = SubNodes[I];
1599 unsigned OpN = 0;
1600 // Only add subnodes that are only reachable from N0.
1601 for (SDValue Op : S->ops()) {
1602 SDNode *O = Op.getNode();
1603 if (llvm::all_of(O->uses(), InSubNodes)) {
1604 SubNodes.insert(O);
1605 ++OpN;
1606 }
1607 }
1608 NumOps.insert({S, OpN});
1609 if (OpN == 0)
1610 TmpQ.insert(S);
1611 }
1612
1613 for (unsigned I = 0; I != TmpQ.size(); ++I) {
1614 SDNode *S = TmpQ[I];
1615 for (SDNode *U : S->uses()) {
1616 if (U == ISelN)
1617 continue;
1618 auto F = NumOps.find(U);
1619 assert(F != NumOps.end());
1620 if (F->second > 0 && !--F->second)
1621 TmpQ.insert(F->first);
1622 }
1623 }
1624
1625 // Remove the marker.
1626 ISel.ReplaceNode(ISelN, N0);
1627
1628 assert(SubNodes.size() == TmpQ.size());
1629 NullifyingVector<decltype(TmpQ)::vector_type> Queue(TmpQ.takeVector());
1630
1631 Deleter DUQ(DAG, Queue);
1632 for (SDNode *S : reverse(Queue)) {
1633 if (S == nullptr)
1634 continue;
1635 DEBUG_WITH_TYPE("isel", {dbgs() << "HVX selecting: "; S->dump(&DAG);});
1636 ISel.Select(S);
1637 }
1638 }
1639
scalarizeShuffle(ArrayRef<int> Mask,const SDLoc & dl,MVT ResTy,SDValue Va,SDValue Vb,SDNode * N)1640 bool HvxSelector::scalarizeShuffle(ArrayRef<int> Mask, const SDLoc &dl,
1641 MVT ResTy, SDValue Va, SDValue Vb,
1642 SDNode *N) {
1643 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
1644 MVT ElemTy = ResTy.getVectorElementType();
1645 assert(ElemTy == MVT::i8);
1646 unsigned VecLen = Mask.size();
1647 bool HavePairs = (2*HwLen == VecLen);
1648 MVT SingleTy = getSingleVT(MVT::i8);
1649
1650 // The prior attempts to handle this shuffle may have left a bunch of
1651 // dead nodes in the DAG (such as constants). These nodes will be added
1652 // at the end of DAG's node list, which at that point had already been
1653 // sorted topologically. In the main selection loop, the node list is
1654 // traversed backwards from the root node, which means that any new
1655 // nodes (from the end of the list) will not be visited.
1656 // Scalarization will replace the shuffle node with the scalarized
1657 // expression, and if that expression reused any if the leftoever (dead)
1658 // nodes, these nodes would not be selected (since the "local" selection
1659 // only visits nodes that are not in AllNodes).
1660 // To avoid this issue, remove all dead nodes from the DAG now.
1661 // DAG.RemoveDeadNodes();
1662
1663 SmallVector<SDValue,128> Ops;
1664 LLVMContext &Ctx = *DAG.getContext();
1665 MVT LegalTy = Lower.getTypeToTransformTo(Ctx, ElemTy).getSimpleVT();
1666 for (int I : Mask) {
1667 if (I < 0) {
1668 Ops.push_back(ISel.selectUndef(dl, LegalTy));
1669 continue;
1670 }
1671 SDValue Vec;
1672 unsigned M = I;
1673 if (M < VecLen) {
1674 Vec = Va;
1675 } else {
1676 Vec = Vb;
1677 M -= VecLen;
1678 }
1679 if (HavePairs) {
1680 if (M < HwLen) {
1681 Vec = DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, SingleTy, Vec);
1682 } else {
1683 Vec = DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, SingleTy, Vec);
1684 M -= HwLen;
1685 }
1686 }
1687 SDValue Idx = DAG.getConstant(M, dl, MVT::i32);
1688 SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LegalTy, {Vec, Idx});
1689 SDValue L = Lower.LowerOperation(Ex, DAG);
1690 assert(L.getNode());
1691 Ops.push_back(L);
1692 }
1693
1694 SDValue LV;
1695 if (2*HwLen == VecLen) {
1696 SDValue B0 = DAG.getBuildVector(SingleTy, dl, {Ops.data(), HwLen});
1697 SDValue L0 = Lower.LowerOperation(B0, DAG);
1698 SDValue B1 = DAG.getBuildVector(SingleTy, dl, {Ops.data()+HwLen, HwLen});
1699 SDValue L1 = Lower.LowerOperation(B1, DAG);
1700 // XXX CONCAT_VECTORS is legal for HVX vectors. Legalizing (lowering)
1701 // functions may expect to be called only for illegal operations, so
1702 // make sure that they are not called for legal ones. Develop a better
1703 // mechanism for dealing with this.
1704 LV = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResTy, {L0, L1});
1705 } else {
1706 SDValue BV = DAG.getBuildVector(ResTy, dl, Ops);
1707 LV = Lower.LowerOperation(BV, DAG);
1708 }
1709
1710 assert(!N->use_empty());
1711 SDValue IS = DAG.getNode(HexagonISD::ISEL, dl, ResTy, LV);
1712 ISel.ReplaceNode(N, IS.getNode());
1713 select(IS.getNode());
1714 DAG.RemoveDeadNodes();
1715 return true;
1716 }
1717
contracting(ShuffleMask SM,OpRef Va,OpRef Vb,ResultStack & Results)1718 OpRef HvxSelector::contracting(ShuffleMask SM, OpRef Va, OpRef Vb,
1719 ResultStack &Results) {
1720 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
1721 if (!Va.isValid() || !Vb.isValid())
1722 return OpRef::fail();
1723
1724 // Contracting shuffles, i.e. instructions that always discard some bytes
1725 // from the operand vectors.
1726 //
1727 // V6_vshuff{e,o}b
1728 // V6_vdealb4w
1729 // V6_vpack{e,o}{b,h}
1730
1731 int VecLen = SM.Mask.size();
1732 std::pair<int,unsigned> Strip = findStrip(SM.Mask, 1, VecLen);
1733 MVT ResTy = getSingleVT(MVT::i8);
1734
1735 // The following shuffles only work for bytes and halfwords. This requires
1736 // the strip length to be 1 or 2.
1737 if (Strip.second != 1 && Strip.second != 2)
1738 return OpRef::fail();
1739
1740 // The patterns for the shuffles, in terms of the starting offsets of the
1741 // consecutive strips (L = length of the strip, N = VecLen):
1742 //
1743 // vpacke: 0, 2L, 4L ... N+0, N+2L, N+4L ... L = 1 or 2
1744 // vpacko: L, 3L, 5L ... N+L, N+3L, N+5L ... L = 1 or 2
1745 //
1746 // vshuffe: 0, N+0, 2L, N+2L, 4L ... L = 1 or 2
1747 // vshuffo: L, N+L, 3L, N+3L, 5L ... L = 1 or 2
1748 //
1749 // vdealb4w: 0, 4, 8 ... 2, 6, 10 ... N+0, N+4, N+8 ... N+2, N+6, N+10 ...
1750
1751 // The value of the element in the mask following the strip will decide
1752 // what kind of a shuffle this can be.
1753 int NextInMask = SM.Mask[Strip.second];
1754
1755 // Check if NextInMask could be 2L, 3L or 4, i.e. if it could be a mask
1756 // for vpack or vdealb4w. VecLen > 4, so NextInMask for vdealb4w would
1757 // satisfy this.
1758 if (NextInMask < VecLen) {
1759 // vpack{e,o} or vdealb4w
1760 if (Strip.first == 0 && Strip.second == 1 && NextInMask == 4) {
1761 int N = VecLen;
1762 // Check if this is vdealb4w (L=1).
1763 for (int I = 0; I != N/4; ++I)
1764 if (SM.Mask[I] != 4*I)
1765 return OpRef::fail();
1766 for (int I = 0; I != N/4; ++I)
1767 if (SM.Mask[I+N/4] != 2 + 4*I)
1768 return OpRef::fail();
1769 for (int I = 0; I != N/4; ++I)
1770 if (SM.Mask[I+N/2] != N + 4*I)
1771 return OpRef::fail();
1772 for (int I = 0; I != N/4; ++I)
1773 if (SM.Mask[I+3*N/4] != N+2 + 4*I)
1774 return OpRef::fail();
1775 // Matched mask for vdealb4w.
1776 Results.push(Hexagon::V6_vdealb4w, ResTy, {Vb, Va});
1777 return OpRef::res(Results.top());
1778 }
1779
1780 // Check if this is vpack{e,o}.
1781 int N = VecLen;
1782 int L = Strip.second;
1783 // Check if the first strip starts at 0 or at L.
1784 if (Strip.first != 0 && Strip.first != L)
1785 return OpRef::fail();
1786 // Examine the rest of the mask.
1787 for (int I = L; I < N; I += L) {
1788 auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
1789 // Check whether the mask element at the beginning of each strip
1790 // increases by 2L each time.
1791 if (S.first - Strip.first != 2*I)
1792 return OpRef::fail();
1793 // Check whether each strip is of the same length.
1794 if (S.second != unsigned(L))
1795 return OpRef::fail();
1796 }
1797
1798 // Strip.first == 0 => vpacke
1799 // Strip.first == L => vpacko
1800 assert(Strip.first == 0 || Strip.first == L);
1801 using namespace Hexagon;
1802 NodeTemplate Res;
1803 Res.Opc = Strip.second == 1 // Number of bytes.
1804 ? (Strip.first == 0 ? V6_vpackeb : V6_vpackob)
1805 : (Strip.first == 0 ? V6_vpackeh : V6_vpackoh);
1806 Res.Ty = ResTy;
1807 Res.Ops = { Vb, Va };
1808 Results.push(Res);
1809 return OpRef::res(Results.top());
1810 }
1811
1812 // Check if this is vshuff{e,o}.
1813 int N = VecLen;
1814 int L = Strip.second;
1815 std::pair<int,unsigned> PrevS = Strip;
1816 bool Flip = false;
1817 for (int I = L; I < N; I += L) {
1818 auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
1819 if (S.second != PrevS.second)
1820 return OpRef::fail();
1821 int Diff = Flip ? PrevS.first - S.first + 2*L
1822 : S.first - PrevS.first;
1823 if (Diff != N)
1824 return OpRef::fail();
1825 Flip ^= true;
1826 PrevS = S;
1827 }
1828 // Strip.first == 0 => vshuffe
1829 // Strip.first == L => vshuffo
1830 assert(Strip.first == 0 || Strip.first == L);
1831 using namespace Hexagon;
1832 NodeTemplate Res;
1833 Res.Opc = Strip.second == 1 // Number of bytes.
1834 ? (Strip.first == 0 ? V6_vshuffeb : V6_vshuffob)
1835 : (Strip.first == 0 ? V6_vshufeh : V6_vshufoh);
1836 Res.Ty = ResTy;
1837 Res.Ops = { Vb, Va };
1838 Results.push(Res);
1839 return OpRef::res(Results.top());
1840 }
1841
expanding(ShuffleMask SM,OpRef Va,ResultStack & Results)1842 OpRef HvxSelector::expanding(ShuffleMask SM, OpRef Va, ResultStack &Results) {
1843 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
1844 // Expanding shuffles (using all elements and inserting into larger vector):
1845 //
1846 // V6_vunpacku{b,h} [*]
1847 //
1848 // [*] Only if the upper elements (filled with 0s) are "don't care" in Mask.
1849 //
1850 // Note: V6_vunpacko{b,h} are or-ing the high byte/half in the result, so
1851 // they are not shuffles.
1852 //
1853 // The argument is a single vector.
1854
1855 int VecLen = SM.Mask.size();
1856 assert(2*HwLen == unsigned(VecLen) && "Expecting vector-pair type");
1857
1858 std::pair<int,unsigned> Strip = findStrip(SM.Mask, 1, VecLen);
1859
1860 // The patterns for the unpacks, in terms of the starting offsets of the
1861 // consecutive strips (L = length of the strip, N = VecLen):
1862 //
1863 // vunpacku: 0, -1, L, -1, 2L, -1 ...
1864
1865 if (Strip.first != 0)
1866 return OpRef::fail();
1867
1868 // The vunpackus only handle byte and half-word.
1869 if (Strip.second != 1 && Strip.second != 2)
1870 return OpRef::fail();
1871
1872 int N = VecLen;
1873 int L = Strip.second;
1874
1875 // First, check the non-ignored strips.
1876 for (int I = 2*L; I < N; I += 2*L) {
1877 auto S = findStrip(SM.Mask.drop_front(I), 1, N-I);
1878 if (S.second != unsigned(L))
1879 return OpRef::fail();
1880 if (2*S.first != I)
1881 return OpRef::fail();
1882 }
1883 // Check the -1s.
1884 for (int I = L; I < N; I += 2*L) {
1885 auto S = findStrip(SM.Mask.drop_front(I), 0, N-I);
1886 if (S.first != -1 || S.second != unsigned(L))
1887 return OpRef::fail();
1888 }
1889
1890 unsigned Opc = Strip.second == 1 ? Hexagon::V6_vunpackub
1891 : Hexagon::V6_vunpackuh;
1892 Results.push(Opc, getPairVT(MVT::i8), {Va});
1893 return OpRef::res(Results.top());
1894 }
1895
perfect(ShuffleMask SM,OpRef Va,ResultStack & Results)1896 OpRef HvxSelector::perfect(ShuffleMask SM, OpRef Va, ResultStack &Results) {
1897 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
1898 // V6_vdeal{b,h}
1899 // V6_vshuff{b,h}
1900
1901 // V6_vshufoe{b,h} those are quivalent to vshuffvdd(..,{1,2})
1902 // V6_vshuffvdd (V6_vshuff)
1903 // V6_dealvdd (V6_vdeal)
1904
1905 int VecLen = SM.Mask.size();
1906 assert(isPowerOf2_32(VecLen) && Log2_32(VecLen) <= 8);
1907 unsigned LogLen = Log2_32(VecLen);
1908 unsigned HwLog = Log2_32(HwLen);
1909 // The result length must be the same as the length of a single vector,
1910 // or a vector pair.
1911 assert(LogLen == HwLog || LogLen == HwLog+1);
1912 bool HavePairs = LogLen == HwLog+1;
1913
1914 if (!isPermutation(SM.Mask))
1915 return OpRef::fail();
1916
1917 SmallVector<unsigned,8> Perm(LogLen);
1918
1919 // Check if this could be a perfect shuffle, or a combination of perfect
1920 // shuffles.
1921 //
1922 // Consider this permutation (using hex digits to make the ASCII diagrams
1923 // easier to read):
1924 // { 0, 8, 1, 9, 2, A, 3, B, 4, C, 5, D, 6, E, 7, F }.
1925 // This is a "deal" operation: divide the input into two halves, and
1926 // create the output by picking elements by alternating between these two
1927 // halves:
1928 // 0 1 2 3 4 5 6 7 --> 0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F [*]
1929 // 8 9 A B C D E F
1930 //
1931 // Aside from a few special explicit cases (V6_vdealb, etc.), HVX provides
1932 // a somwehat different mechanism that could be used to perform shuffle/
1933 // deal operations: a 2x2 transpose.
1934 // Consider the halves of inputs again, they can be interpreted as a 2x8
1935 // matrix. A 2x8 matrix can be looked at four 2x2 matrices concatenated
1936 // together. Now, when considering 2 elements at a time, it will be a 2x4
1937 // matrix (with elements 01, 23, 45, etc.), or two 2x2 matrices:
1938 // 01 23 45 67
1939 // 89 AB CD EF
1940 // With groups of 4, this will become a single 2x2 matrix, and so on.
1941 //
1942 // The 2x2 transpose instruction works by transposing each of the 2x2
1943 // matrices (or "sub-matrices"), given a specific group size. For example,
1944 // if the group size is 1 (i.e. each element is its own group), there
1945 // will be four transposes of the four 2x2 matrices that form the 2x8.
1946 // For example, with the inputs as above, the result will be:
1947 // 0 8 2 A 4 C 6 E
1948 // 1 9 3 B 5 D 7 F
1949 // Now, this result can be tranposed again, but with the group size of 2:
1950 // 08 19 4C 5D
1951 // 2A 3B 6E 7F
1952 // If we then transpose that result, but with the group size of 4, we get:
1953 // 0819 2A3B
1954 // 4C5D 6E7F
1955 // If we concatenate these two rows, it will be
1956 // 0 8 1 9 2 A 3 B 4 C 5 D 6 E 7 F
1957 // which is the same as the "deal" [*] above.
1958 //
1959 // In general, a "deal" of individual elements is a series of 2x2 transposes,
1960 // with changing group size. HVX has two instructions:
1961 // Vdd = V6_vdealvdd Vu, Vv, Rt
1962 // Vdd = V6_shufvdd Vu, Vv, Rt
1963 // that perform exactly that. The register Rt controls which transposes are
1964 // going to happen: a bit at position n (counting from 0) indicates that a
1965 // transpose with a group size of 2^n will take place. If multiple bits are
1966 // set, multiple transposes will happen: vdealvdd will perform them starting
1967 // with the largest group size, vshuffvdd will do them in the reverse order.
1968 //
1969 // The main observation is that each 2x2 transpose corresponds to swapping
1970 // columns of bits in the binary representation of the values.
1971 //
1972 // The numbers {3,2,1,0} and the log2 of the number of contiguous 1 bits
1973 // in a given column. The * denote the columns that will be swapped.
1974 // The transpose with the group size 2^n corresponds to swapping columns
1975 // 3 (the highest log) and log2(n):
1976 //
1977 // 3 2 1 0 0 2 1 3 0 2 3 1
1978 // * * * * * *
1979 // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1980 // 1 0 0 0 1 8 1 0 0 0 8 1 0 0 0 8 1 0 0 0
1981 // 2 0 0 1 0 2 0 0 1 0 1 0 0 0 1 1 0 0 0 1
1982 // 3 0 0 1 1 A 1 0 1 0 9 1 0 0 1 9 1 0 0 1
1983 // 4 0 1 0 0 4 0 1 0 0 4 0 1 0 0 2 0 0 1 0
1984 // 5 0 1 0 1 C 1 1 0 0 C 1 1 0 0 A 1 0 1 0
1985 // 6 0 1 1 0 6 0 1 1 0 5 0 1 0 1 3 0 0 1 1
1986 // 7 0 1 1 1 E 1 1 1 0 D 1 1 0 1 B 1 0 1 1
1987 // 8 1 0 0 0 1 0 0 0 1 2 0 0 1 0 4 0 1 0 0
1988 // 9 1 0 0 1 9 1 0 0 1 A 1 0 1 0 C 1 1 0 0
1989 // A 1 0 1 0 3 0 0 1 1 3 0 0 1 1 5 0 1 0 1
1990 // B 1 0 1 1 B 1 0 1 1 B 1 0 1 1 D 1 1 0 1
1991 // C 1 1 0 0 5 0 1 0 1 6 0 1 1 0 6 0 1 1 0
1992 // D 1 1 0 1 D 1 1 0 1 E 1 1 1 0 E 1 1 1 0
1993 // E 1 1 1 0 7 0 1 1 1 7 0 1 1 1 7 0 1 1 1
1994 // F 1 1 1 1 F 1 1 1 1 F 1 1 1 1 F 1 1 1 1
1995
1996 // There is one special case that is not a perfect shuffle, but
1997 // can be turned into one easily: when the shuffle operates on
1998 // a vector pair, but the two vectors in the pair are swapped.
1999 // The code below that identifies perfect shuffles will reject
2000 // it, unless the order is reversed.
2001 SmallVector<int,128> MaskStorage(SM.Mask.begin(), SM.Mask.end());
2002 bool InvertedPair = false;
2003 if (HavePairs && SM.Mask[0] >= int(HwLen)) {
2004 for (int i = 0, e = SM.Mask.size(); i != e; ++i) {
2005 int M = SM.Mask[i];
2006 MaskStorage[i] = M >= int(HwLen) ? M-HwLen : M+HwLen;
2007 }
2008 InvertedPair = true;
2009 }
2010 ArrayRef<int> LocalMask(MaskStorage);
2011
2012 auto XorPow2 = [] (ArrayRef<int> Mask, unsigned Num) {
2013 unsigned X = Mask[0] ^ Mask[Num/2];
2014 // Check that the first half has the X's bits clear.
2015 if ((Mask[0] & X) != 0)
2016 return 0u;
2017 for (unsigned I = 1; I != Num/2; ++I) {
2018 if (unsigned(Mask[I] ^ Mask[I+Num/2]) != X)
2019 return 0u;
2020 if ((Mask[I] & X) != 0)
2021 return 0u;
2022 }
2023 return X;
2024 };
2025
2026 // Create a vector of log2's for each column: Perm[i] corresponds to
2027 // the i-th bit (lsb is 0).
2028 assert(VecLen > 2);
2029 for (unsigned I = VecLen; I >= 2; I >>= 1) {
2030 // Examine the initial segment of Mask of size I.
2031 unsigned X = XorPow2(LocalMask, I);
2032 if (!isPowerOf2_32(X))
2033 return OpRef::fail();
2034 // Check the other segments of Mask.
2035 for (int J = I; J < VecLen; J += I) {
2036 if (XorPow2(LocalMask.slice(J, I), I) != X)
2037 return OpRef::fail();
2038 }
2039 Perm[Log2_32(X)] = Log2_32(I)-1;
2040 }
2041
2042 // Once we have Perm, represent it as cycles. Denote the maximum log2
2043 // (equal to log2(VecLen)-1) as M. The cycle containing M can then be
2044 // written as (M a1 a2 a3 ... an). That cycle can be broken up into
2045 // simple swaps as (M a1)(M a2)(M a3)...(M an), with the composition
2046 // order being from left to right. Any (contiguous) segment where the
2047 // values ai, ai+1...aj are either all increasing or all decreasing,
2048 // can be implemented via a single vshuffvdd/vdealvdd respectively.
2049 //
2050 // If there is a cycle (a1 a2 ... an) that does not involve M, it can
2051 // be written as (M an)(a1 a2 ... an)(M a1). The first two cycles can
2052 // then be folded to get (M a1 a2 ... an)(M a1), and the above procedure
2053 // can be used to generate a sequence of vshuffvdd/vdealvdd.
2054 //
2055 // Example:
2056 // Assume M = 4 and consider a permutation (0 1)(2 3). It can be written
2057 // as (4 0 1)(4 0) composed with (4 2 3)(4 2), or simply
2058 // (4 0 1)(4 0)(4 2 3)(4 2).
2059 // It can then be expanded into swaps as
2060 // (4 0)(4 1)(4 0)(4 2)(4 3)(4 2),
2061 // and broken up into "increasing" segments as
2062 // [(4 0)(4 1)] [(4 0)(4 2)(4 3)] [(4 2)].
2063 // This is equivalent to
2064 // (4 0 1)(4 0 2 3)(4 2),
2065 // which can be implemented as 3 vshufvdd instructions.
2066
2067 using CycleType = SmallVector<unsigned,8>;
2068 std::set<CycleType> Cycles;
2069 std::set<unsigned> All;
2070
2071 for (unsigned I : Perm)
2072 All.insert(I);
2073
2074 // If the cycle contains LogLen-1, move it to the front of the cycle.
2075 // Otherwise, return the cycle unchanged.
2076 auto canonicalize = [LogLen](const CycleType &C) -> CycleType {
2077 unsigned LogPos, N = C.size();
2078 for (LogPos = 0; LogPos != N; ++LogPos)
2079 if (C[LogPos] == LogLen-1)
2080 break;
2081 if (LogPos == N)
2082 return C;
2083
2084 CycleType NewC(C.begin()+LogPos, C.end());
2085 NewC.append(C.begin(), C.begin()+LogPos);
2086 return NewC;
2087 };
2088
2089 auto pfs = [](const std::set<CycleType> &Cs, unsigned Len) {
2090 // Ordering: shuff: 5 0 1 2 3 4, deal: 5 4 3 2 1 0 (for Log=6),
2091 // for bytes zero is included, for halfwords is not.
2092 if (Cs.size() != 1)
2093 return 0u;
2094 const CycleType &C = *Cs.begin();
2095 if (C[0] != Len-1)
2096 return 0u;
2097 int D = Len - C.size();
2098 if (D != 0 && D != 1)
2099 return 0u;
2100
2101 bool IsDeal = true, IsShuff = true;
2102 for (unsigned I = 1; I != Len-D; ++I) {
2103 if (C[I] != Len-1-I)
2104 IsDeal = false;
2105 if (C[I] != I-(1-D)) // I-1, I
2106 IsShuff = false;
2107 }
2108 // At most one, IsDeal or IsShuff, can be non-zero.
2109 assert(!(IsDeal || IsShuff) || IsDeal != IsShuff);
2110 static unsigned Deals[] = { Hexagon::V6_vdealb, Hexagon::V6_vdealh };
2111 static unsigned Shufs[] = { Hexagon::V6_vshuffb, Hexagon::V6_vshuffh };
2112 return IsDeal ? Deals[D] : (IsShuff ? Shufs[D] : 0);
2113 };
2114
2115 while (!All.empty()) {
2116 unsigned A = *All.begin();
2117 All.erase(A);
2118 CycleType C;
2119 C.push_back(A);
2120 for (unsigned B = Perm[A]; B != A; B = Perm[B]) {
2121 C.push_back(B);
2122 All.erase(B);
2123 }
2124 if (C.size() <= 1)
2125 continue;
2126 Cycles.insert(canonicalize(C));
2127 }
2128
2129 MVT SingleTy = getSingleVT(MVT::i8);
2130 MVT PairTy = getPairVT(MVT::i8);
2131
2132 // Recognize patterns for V6_vdeal{b,h} and V6_vshuff{b,h}.
2133 if (unsigned(VecLen) == HwLen) {
2134 if (unsigned SingleOpc = pfs(Cycles, LogLen)) {
2135 Results.push(SingleOpc, SingleTy, {Va});
2136 return OpRef::res(Results.top());
2137 }
2138 }
2139
2140 // From the cycles, construct the sequence of values that will
2141 // then form the control values for vdealvdd/vshuffvdd, i.e.
2142 // (M a1 a2)(M a3 a4 a5)... -> a1 a2 a3 a4 a5
2143 // This essentially strips the M value from the cycles where
2144 // it's present, and performs the insertion of M (then stripping)
2145 // for cycles without M (as described in an earlier comment).
2146 SmallVector<unsigned,8> SwapElems;
2147 // When the input is extended (i.e. single vector becomes a pair),
2148 // this is done by using an "undef" vector as the second input.
2149 // However, then we get
2150 // input 1: GOODBITS
2151 // input 2: ........
2152 // but we need
2153 // input 1: ....BITS
2154 // input 2: ....GOOD
2155 // Then at the end, this needs to be undone. To accomplish this,
2156 // artificially add "LogLen-1" at both ends of the sequence.
2157 if (!HavePairs)
2158 SwapElems.push_back(LogLen-1);
2159 for (const CycleType &C : Cycles) {
2160 // Do the transformation: (a1..an) -> (M a1..an)(M a1).
2161 unsigned First = (C[0] == LogLen-1) ? 1 : 0;
2162 SwapElems.append(C.begin()+First, C.end());
2163 if (First == 0)
2164 SwapElems.push_back(C[0]);
2165 }
2166 if (!HavePairs)
2167 SwapElems.push_back(LogLen-1);
2168
2169 const SDLoc &dl(Results.InpNode);
2170 OpRef Arg = HavePairs ? Va
2171 : concats(Va, OpRef::undef(SingleTy), Results);
2172 if (InvertedPair)
2173 Arg = concats(OpRef::hi(Arg), OpRef::lo(Arg), Results);
2174
2175 for (unsigned I = 0, E = SwapElems.size(); I != E; ) {
2176 bool IsInc = I == E-1 || SwapElems[I] < SwapElems[I+1];
2177 unsigned S = (1u << SwapElems[I]);
2178 if (I < E-1) {
2179 while (++I < E-1 && IsInc == (SwapElems[I] < SwapElems[I+1]))
2180 S |= 1u << SwapElems[I];
2181 // The above loop will not add a bit for the final SwapElems[I+1],
2182 // so add it here.
2183 S |= 1u << SwapElems[I];
2184 }
2185 ++I;
2186
2187 NodeTemplate Res;
2188 Results.push(Hexagon::A2_tfrsi, MVT::i32, {getConst32(S, dl)});
2189 Res.Opc = IsInc ? Hexagon::V6_vshuffvdd : Hexagon::V6_vdealvdd;
2190 Res.Ty = PairTy;
2191 Res.Ops = { OpRef::hi(Arg), OpRef::lo(Arg), OpRef::res(-1) };
2192 Results.push(Res);
2193 Arg = OpRef::res(Results.top());
2194 }
2195
2196 return HavePairs ? Arg : OpRef::lo(Arg);
2197 }
2198
butterfly(ShuffleMask SM,OpRef Va,ResultStack & Results)2199 OpRef HvxSelector::butterfly(ShuffleMask SM, OpRef Va, ResultStack &Results) {
2200 DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
2201 // Butterfly shuffles.
2202 //
2203 // V6_vdelta
2204 // V6_vrdelta
2205 // V6_vror
2206
2207 // The assumption here is that all elements picked by Mask are in the
2208 // first operand to the vector_shuffle. This assumption is enforced
2209 // by the caller.
2210
2211 MVT ResTy = getSingleVT(MVT::i8);
2212 PermNetwork::Controls FC, RC;
2213 const SDLoc &dl(Results.InpNode);
2214 int VecLen = SM.Mask.size();
2215
2216 for (int M : SM.Mask) {
2217 if (M != -1 && M >= VecLen)
2218 return OpRef::fail();
2219 }
2220
2221 // Try the deltas/benes for both single vectors and vector pairs.
2222 ForwardDeltaNetwork FN(SM.Mask);
2223 if (FN.run(FC)) {
2224 SDValue Ctl = getVectorConstant(FC, dl);
2225 Results.push(Hexagon::V6_vdelta, ResTy, {Va, OpRef(Ctl)});
2226 return OpRef::res(Results.top());
2227 }
2228
2229 // Try reverse delta.
2230 ReverseDeltaNetwork RN(SM.Mask);
2231 if (RN.run(RC)) {
2232 SDValue Ctl = getVectorConstant(RC, dl);
2233 Results.push(Hexagon::V6_vrdelta, ResTy, {Va, OpRef(Ctl)});
2234 return OpRef::res(Results.top());
2235 }
2236
2237 // Do Benes.
2238 BenesNetwork BN(SM.Mask);
2239 if (BN.run(FC, RC)) {
2240 SDValue CtlF = getVectorConstant(FC, dl);
2241 SDValue CtlR = getVectorConstant(RC, dl);
2242 Results.push(Hexagon::V6_vdelta, ResTy, {Va, OpRef(CtlF)});
2243 Results.push(Hexagon::V6_vrdelta, ResTy,
2244 {OpRef::res(-1), OpRef(CtlR)});
2245 return OpRef::res(Results.top());
2246 }
2247
2248 return OpRef::fail();
2249 }
2250
getConst32(int Val,const SDLoc & dl)2251 SDValue HvxSelector::getConst32(int Val, const SDLoc &dl) {
2252 return DAG.getTargetConstant(Val, dl, MVT::i32);
2253 }
2254
getVectorConstant(ArrayRef<uint8_t> Data,const SDLoc & dl)2255 SDValue HvxSelector::getVectorConstant(ArrayRef<uint8_t> Data,
2256 const SDLoc &dl) {
2257 SmallVector<SDValue, 128> Elems;
2258 for (uint8_t C : Data)
2259 Elems.push_back(DAG.getConstant(C, dl, MVT::i8));
2260 MVT VecTy = MVT::getVectorVT(MVT::i8, Data.size());
2261 SDValue BV = DAG.getBuildVector(VecTy, dl, Elems);
2262 SDValue LV = Lower.LowerOperation(BV, DAG);
2263 DAG.RemoveDeadNode(BV.getNode());
2264 return DAG.getNode(HexagonISD::ISEL, dl, VecTy, LV);
2265 }
2266
selectShuffle(SDNode * N)2267 void HvxSelector::selectShuffle(SDNode *N) {
2268 DEBUG_WITH_TYPE("isel", {
2269 dbgs() << "Starting " << __func__ << " on node:\n";
2270 N->dump(&DAG);
2271 });
2272 MVT ResTy = N->getValueType(0).getSimpleVT();
2273 // Assume that vector shuffles operate on vectors of bytes.
2274 assert(ResTy.isVector() && ResTy.getVectorElementType() == MVT::i8);
2275
2276 auto *SN = cast<ShuffleVectorSDNode>(N);
2277 std::vector<int> Mask(SN->getMask().begin(), SN->getMask().end());
2278 // This shouldn't really be necessary. Is it?
2279 for (int &Idx : Mask)
2280 if (Idx != -1 && Idx < 0)
2281 Idx = -1;
2282
2283 unsigned VecLen = Mask.size();
2284 bool HavePairs = (2*HwLen == VecLen);
2285 assert(ResTy.getSizeInBits() / 8 == VecLen);
2286
2287 // Vd = vector_shuffle Va, Vb, Mask
2288 //
2289
2290 bool UseLeft = false, UseRight = false;
2291 for (unsigned I = 0; I != VecLen; ++I) {
2292 if (Mask[I] == -1)
2293 continue;
2294 unsigned Idx = Mask[I];
2295 assert(Idx < 2*VecLen);
2296 if (Idx < VecLen)
2297 UseLeft = true;
2298 else
2299 UseRight = true;
2300 }
2301
2302 DEBUG_WITH_TYPE("isel", {
2303 dbgs() << "VecLen=" << VecLen << " HwLen=" << HwLen << " UseLeft="
2304 << UseLeft << " UseRight=" << UseRight << " HavePairs="
2305 << HavePairs << '\n';
2306 });
2307 // If the mask is all -1's, generate "undef".
2308 if (!UseLeft && !UseRight) {
2309 ISel.ReplaceNode(N, ISel.selectUndef(SDLoc(SN), ResTy).getNode());
2310 return;
2311 }
2312
2313 SDValue Vec0 = N->getOperand(0);
2314 SDValue Vec1 = N->getOperand(1);
2315 ResultStack Results(SN);
2316 Results.push(TargetOpcode::COPY, ResTy, {Vec0});
2317 Results.push(TargetOpcode::COPY, ResTy, {Vec1});
2318 OpRef Va = OpRef::res(Results.top()-1);
2319 OpRef Vb = OpRef::res(Results.top());
2320
2321 OpRef Res = !HavePairs ? shuffs2(ShuffleMask(Mask), Va, Vb, Results)
2322 : shuffp2(ShuffleMask(Mask), Va, Vb, Results);
2323
2324 bool Done = Res.isValid();
2325 if (Done) {
2326 // Make sure that Res is on the stack before materializing.
2327 Results.push(TargetOpcode::COPY, ResTy, {Res});
2328 materialize(Results);
2329 } else {
2330 Done = scalarizeShuffle(Mask, SDLoc(N), ResTy, Vec0, Vec1, N);
2331 }
2332
2333 if (!Done) {
2334 #ifndef NDEBUG
2335 dbgs() << "Unhandled shuffle:\n";
2336 SN->dumpr(&DAG);
2337 #endif
2338 llvm_unreachable("Failed to select vector shuffle");
2339 }
2340 }
2341
selectRor(SDNode * N)2342 void HvxSelector::selectRor(SDNode *N) {
2343 // If this is a rotation by less than 8, use V6_valignbi.
2344 MVT Ty = N->getValueType(0).getSimpleVT();
2345 const SDLoc &dl(N);
2346 SDValue VecV = N->getOperand(0);
2347 SDValue RotV = N->getOperand(1);
2348 SDNode *NewN = nullptr;
2349
2350 if (auto *CN = dyn_cast<ConstantSDNode>(RotV.getNode())) {
2351 unsigned S = CN->getZExtValue() % HST.getVectorLength();
2352 if (S == 0) {
2353 NewN = VecV.getNode();
2354 } else if (isUInt<3>(S)) {
2355 NewN = DAG.getMachineNode(Hexagon::V6_valignbi, dl, Ty,
2356 {VecV, VecV, getConst32(S, dl)});
2357 }
2358 }
2359
2360 if (!NewN)
2361 NewN = DAG.getMachineNode(Hexagon::V6_vror, dl, Ty, {VecV, RotV});
2362
2363 ISel.ReplaceNode(N, NewN);
2364 }
2365
selectVAlign(SDNode * N)2366 void HvxSelector::selectVAlign(SDNode *N) {
2367 SDValue Vv = N->getOperand(0);
2368 SDValue Vu = N->getOperand(1);
2369 SDValue Rt = N->getOperand(2);
2370 SDNode *NewN = DAG.getMachineNode(Hexagon::V6_valignb, SDLoc(N),
2371 N->getValueType(0), {Vv, Vu, Rt});
2372 ISel.ReplaceNode(N, NewN);
2373 DAG.RemoveDeadNode(N);
2374 }
2375
SelectHvxShuffle(SDNode * N)2376 void HexagonDAGToDAGISel::SelectHvxShuffle(SDNode *N) {
2377 HvxSelector(*this, *CurDAG).selectShuffle(N);
2378 }
2379
SelectHvxRor(SDNode * N)2380 void HexagonDAGToDAGISel::SelectHvxRor(SDNode *N) {
2381 HvxSelector(*this, *CurDAG).selectRor(N);
2382 }
2383
SelectHvxVAlign(SDNode * N)2384 void HexagonDAGToDAGISel::SelectHvxVAlign(SDNode *N) {
2385 HvxSelector(*this, *CurDAG).selectVAlign(N);
2386 }
2387
SelectV65GatherPred(SDNode * N)2388 void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) {
2389 const SDLoc &dl(N);
2390 SDValue Chain = N->getOperand(0);
2391 SDValue Address = N->getOperand(2);
2392 SDValue Predicate = N->getOperand(3);
2393 SDValue Base = N->getOperand(4);
2394 SDValue Modifier = N->getOperand(5);
2395 SDValue Offset = N->getOperand(6);
2396 SDValue ImmOperand = CurDAG->getTargetConstant(0, dl, MVT::i32);
2397
2398 unsigned Opcode;
2399 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2400 switch (IntNo) {
2401 default:
2402 llvm_unreachable("Unexpected HVX gather intrinsic.");
2403 case Intrinsic::hexagon_V6_vgathermhq:
2404 case Intrinsic::hexagon_V6_vgathermhq_128B:
2405 Opcode = Hexagon::V6_vgathermhq_pseudo;
2406 break;
2407 case Intrinsic::hexagon_V6_vgathermwq:
2408 case Intrinsic::hexagon_V6_vgathermwq_128B:
2409 Opcode = Hexagon::V6_vgathermwq_pseudo;
2410 break;
2411 case Intrinsic::hexagon_V6_vgathermhwq:
2412 case Intrinsic::hexagon_V6_vgathermhwq_128B:
2413 Opcode = Hexagon::V6_vgathermhwq_pseudo;
2414 break;
2415 }
2416
2417 SDVTList VTs = CurDAG->getVTList(MVT::Other);
2418 SDValue Ops[] = { Address, ImmOperand,
2419 Predicate, Base, Modifier, Offset, Chain };
2420 SDNode *Result = CurDAG->getMachineNode(Opcode, dl, VTs, Ops);
2421
2422 MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
2423 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Result), {MemOp});
2424
2425 ReplaceNode(N, Result);
2426 }
2427
SelectV65Gather(SDNode * N)2428 void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
2429 const SDLoc &dl(N);
2430 SDValue Chain = N->getOperand(0);
2431 SDValue Address = N->getOperand(2);
2432 SDValue Base = N->getOperand(3);
2433 SDValue Modifier = N->getOperand(4);
2434 SDValue Offset = N->getOperand(5);
2435 SDValue ImmOperand = CurDAG->getTargetConstant(0, dl, MVT::i32);
2436
2437 unsigned Opcode;
2438 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2439 switch (IntNo) {
2440 default:
2441 llvm_unreachable("Unexpected HVX gather intrinsic.");
2442 case Intrinsic::hexagon_V6_vgathermh:
2443 case Intrinsic::hexagon_V6_vgathermh_128B:
2444 Opcode = Hexagon::V6_vgathermh_pseudo;
2445 break;
2446 case Intrinsic::hexagon_V6_vgathermw:
2447 case Intrinsic::hexagon_V6_vgathermw_128B:
2448 Opcode = Hexagon::V6_vgathermw_pseudo;
2449 break;
2450 case Intrinsic::hexagon_V6_vgathermhw:
2451 case Intrinsic::hexagon_V6_vgathermhw_128B:
2452 Opcode = Hexagon::V6_vgathermhw_pseudo;
2453 break;
2454 }
2455
2456 SDVTList VTs = CurDAG->getVTList(MVT::Other);
2457 SDValue Ops[] = { Address, ImmOperand, Base, Modifier, Offset, Chain };
2458 SDNode *Result = CurDAG->getMachineNode(Opcode, dl, VTs, Ops);
2459
2460 MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
2461 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Result), {MemOp});
2462
2463 ReplaceNode(N, Result);
2464 }
2465
SelectHVXDualOutput(SDNode * N)2466 void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
2467 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2468 SDNode *Result;
2469 switch (IID) {
2470 case Intrinsic::hexagon_V6_vaddcarry: {
2471 std::array<SDValue, 3> Ops = {
2472 {N->getOperand(1), N->getOperand(2), N->getOperand(3)}};
2473 SDVTList VTs = CurDAG->getVTList(MVT::v16i32, MVT::v64i1);
2474 Result = CurDAG->getMachineNode(Hexagon::V6_vaddcarry, SDLoc(N), VTs, Ops);
2475 break;
2476 }
2477 case Intrinsic::hexagon_V6_vaddcarry_128B: {
2478 std::array<SDValue, 3> Ops = {
2479 {N->getOperand(1), N->getOperand(2), N->getOperand(3)}};
2480 SDVTList VTs = CurDAG->getVTList(MVT::v32i32, MVT::v128i1);
2481 Result = CurDAG->getMachineNode(Hexagon::V6_vaddcarry, SDLoc(N), VTs, Ops);
2482 break;
2483 }
2484 case Intrinsic::hexagon_V6_vsubcarry: {
2485 std::array<SDValue, 3> Ops = {
2486 {N->getOperand(1), N->getOperand(2), N->getOperand(3)}};
2487 SDVTList VTs = CurDAG->getVTList(MVT::v16i32, MVT::v64i1);
2488 Result = CurDAG->getMachineNode(Hexagon::V6_vsubcarry, SDLoc(N), VTs, Ops);
2489 break;
2490 }
2491 case Intrinsic::hexagon_V6_vsubcarry_128B: {
2492 std::array<SDValue, 3> Ops = {
2493 {N->getOperand(1), N->getOperand(2), N->getOperand(3)}};
2494 SDVTList VTs = CurDAG->getVTList(MVT::v32i32, MVT::v128i1);
2495 Result = CurDAG->getMachineNode(Hexagon::V6_vsubcarry, SDLoc(N), VTs, Ops);
2496 break;
2497 }
2498 default:
2499 llvm_unreachable("Unexpected HVX dual output intrinsic.");
2500 }
2501 ReplaceUses(N, Result);
2502 ReplaceUses(SDValue(N, 0), SDValue(Result, 0));
2503 ReplaceUses(SDValue(N, 1), SDValue(Result, 1));
2504 CurDAG->RemoveDeadNode(N);
2505 }
2506