1 //===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a glob pattern matcher.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Support/GlobPattern.h"
14 #include "llvm/ADT/ArrayRef.h"
15 #include "llvm/ADT/BitVector.h"
16 #include "llvm/ADT/Optional.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/Support/Errc.h"
19 
20 using namespace llvm;
21 
22 static bool hasWildcard(StringRef S) {
23   return S.find_first_of("?*[\\") != StringRef::npos;
24 }
25 
26 // Expands character ranges and returns a bitmap.
27 // For example, "a-cf-hz" is expanded to "abcfghz".
28 static Expected<BitVector> expand(StringRef S, StringRef Original) {
29   BitVector BV(256, false);
30 
31   // Expand X-Y.
32   for (;;) {
33     if (S.size() < 3)
34       break;
35 
36     uint8_t Start = S[0];
37     uint8_t End = S[2];
38 
39     // If it doesn't start with something like X-Y,
40     // consume the first character and proceed.
41     if (S[1] != '-') {
42       BV[Start] = true;
43       S = S.substr(1);
44       continue;
45     }
46 
47     // It must be in the form of X-Y.
48     // Validate it and then interpret the range.
49     if (Start > End)
50       return make_error<StringError>("invalid glob pattern: " + Original,
51                                      errc::invalid_argument);
52 
53     for (int C = Start; C <= End; ++C)
54       BV[(uint8_t)C] = true;
55     S = S.substr(3);
56   }
57 
58   for (char C : S)
59     BV[(uint8_t)C] = true;
60   return BV;
61 }
62 
63 // This is a scanner for the glob pattern.
64 // A glob pattern token is one of "*", "?", "\", "[<chars>]", "[^<chars>]"
65 // (which is a negative form of "[<chars>]"), "[!<chars>]" (which is
66 // equivalent to "[^<chars>]"), or a non-meta character.
67 // This function returns the first token in S.
68 static Expected<BitVector> scan(StringRef &S, StringRef Original) {
69   switch (S[0]) {
70   case '*':
71     S = S.substr(1);
72     // '*' is represented by an empty bitvector.
73     // All other bitvectors are 256-bit long.
74     return BitVector();
75   case '?':
76     S = S.substr(1);
77     return BitVector(256, true);
78   case '[': {
79     // ']' is allowed as the first character of a character class. '[]' is
80     // invalid. So, just skip the first character.
81     size_t End = S.find(']', 2);
82     if (End == StringRef::npos)
83       return make_error<StringError>("invalid glob pattern: " + Original,
84                                      errc::invalid_argument);
85 
86     StringRef Chars = S.substr(1, End - 1);
87     S = S.substr(End + 1);
88     if (Chars.startswith("^") || Chars.startswith("!")) {
89       Expected<BitVector> BV = expand(Chars.substr(1), Original);
90       if (!BV)
91         return BV.takeError();
92       return BV->flip();
93     }
94     return expand(Chars, Original);
95   }
96   case '\\':
97     // Eat this character and fall through below to treat it like a non-meta
98     // character.
99     S = S.substr(1);
100     LLVM_FALLTHROUGH;
101   default:
102     BitVector BV(256, false);
103     BV[(uint8_t)S[0]] = true;
104     S = S.substr(1);
105     return BV;
106   }
107 }
108 
109 Expected<GlobPattern> GlobPattern::create(StringRef S) {
110   GlobPattern Pat;
111 
112   // S doesn't contain any metacharacter,
113   // so the regular string comparison should work.
114   if (!hasWildcard(S)) {
115     Pat.Exact = S;
116     return Pat;
117   }
118 
119   // S is something like "foo*", and the "* is not escaped. We can use
120   // startswith().
121   if (S.endswith("*") && !S.endswith("\\*") && !hasWildcard(S.drop_back())) {
122     Pat.Prefix = S.drop_back();
123     return Pat;
124   }
125 
126   // S is something like "*foo". We can use endswith().
127   if (S.startswith("*") && !hasWildcard(S.drop_front())) {
128     Pat.Suffix = S.drop_front();
129     return Pat;
130   }
131 
132   // Otherwise, we need to do real glob pattern matching.
133   // Parse the pattern now.
134   StringRef Original = S;
135   while (!S.empty()) {
136     Expected<BitVector> BV = scan(S, Original);
137     if (!BV)
138       return BV.takeError();
139     Pat.Tokens.push_back(*BV);
140   }
141   return Pat;
142 }
143 
144 bool GlobPattern::match(StringRef S) const {
145   if (Exact)
146     return S == *Exact;
147   if (Prefix)
148     return S.startswith(*Prefix);
149   if (Suffix)
150     return S.endswith(*Suffix);
151   return matchOne(Tokens, S);
152 }
153 
154 // Runs glob pattern Pats against string S.
155 bool GlobPattern::matchOne(ArrayRef<BitVector> Pats, StringRef S) const {
156   for (;;) {
157     if (Pats.empty())
158       return S.empty();
159 
160     // If Pats[0] is '*', try to match Pats[1..] against all possible
161     // tail strings of S to see at least one pattern succeeds.
162     if (Pats[0].size() == 0) {
163       Pats = Pats.slice(1);
164       if (Pats.empty())
165         // Fast path. If a pattern is '*', it matches anything.
166         return true;
167       for (size_t I = 0, E = S.size(); I < E; ++I)
168         if (matchOne(Pats, S.substr(I)))
169           return true;
170       return false;
171     }
172 
173     // If Pats[0] is not '*', it must consume one character.
174     if (S.empty() || !Pats[0][(uint8_t)S[0]])
175       return false;
176     Pats = Pats.slice(1);
177     S = S.substr(1);
178   }
179 }
180