//===-- GlobPattern.cpp - Glob pattern matcher implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a glob pattern matcher.
//
//===----------------------------------------------------------------------===//
12f7a7ab59SRui Ueyama 
13f7a7ab59SRui Ueyama #include "llvm/Support/GlobPattern.h"
14f7a7ab59SRui Ueyama #include "llvm/ADT/ArrayRef.h"
15f7a7ab59SRui Ueyama #include "llvm/ADT/Optional.h"
16f7a7ab59SRui Ueyama #include "llvm/ADT/StringRef.h"
17f7a7ab59SRui Ueyama #include "llvm/Support/Errc.h"
18f7a7ab59SRui Ueyama 
19f7a7ab59SRui Ueyama using namespace llvm;
20f7a7ab59SRui Ueyama 
hasWildcard(StringRef S)21f7a7ab59SRui Ueyama static bool hasWildcard(StringRef S) {
22*48993d5aSJordan Rupprecht   return S.find_first_of("?*[\\") != StringRef::npos;
23f7a7ab59SRui Ueyama }
24f7a7ab59SRui Ueyama 
25f7a7ab59SRui Ueyama // Expands character ranges and returns a bitmap.
26f7a7ab59SRui Ueyama // For example, "a-cf-hz" is expanded to "abcfghz".
expand(StringRef S,StringRef Original)27f7a7ab59SRui Ueyama static Expected<BitVector> expand(StringRef S, StringRef Original) {
28f7a7ab59SRui Ueyama   BitVector BV(256, false);
29f7a7ab59SRui Ueyama 
30f7a7ab59SRui Ueyama   // Expand X-Y.
31f7a7ab59SRui Ueyama   for (;;) {
32f7a7ab59SRui Ueyama     if (S.size() < 3)
33f7a7ab59SRui Ueyama       break;
34f7a7ab59SRui Ueyama 
35e36d7a6dSGeorge Rimar     uint8_t Start = S[0];
36e36d7a6dSGeorge Rimar     uint8_t End = S[2];
37e36d7a6dSGeorge Rimar 
38f7a7ab59SRui Ueyama     // If it doesn't start with something like X-Y,
39f7a7ab59SRui Ueyama     // consume the first character and proceed.
40f7a7ab59SRui Ueyama     if (S[1] != '-') {
41e36d7a6dSGeorge Rimar       BV[Start] = true;
42f7a7ab59SRui Ueyama       S = S.substr(1);
43f7a7ab59SRui Ueyama       continue;
44f7a7ab59SRui Ueyama     }
45f7a7ab59SRui Ueyama 
46f7a7ab59SRui Ueyama     // It must be in the form of X-Y.
47f7a7ab59SRui Ueyama     // Validate it and then interpret the range.
48e36d7a6dSGeorge Rimar     if (Start > End)
49f7a7ab59SRui Ueyama       return make_error<StringError>("invalid glob pattern: " + Original,
50f7a7ab59SRui Ueyama                                      errc::invalid_argument);
51f7a7ab59SRui Ueyama 
52e36d7a6dSGeorge Rimar     for (int C = Start; C <= End; ++C)
53e36d7a6dSGeorge Rimar       BV[(uint8_t)C] = true;
54f7a7ab59SRui Ueyama     S = S.substr(3);
55f7a7ab59SRui Ueyama   }
56f7a7ab59SRui Ueyama 
57f7a7ab59SRui Ueyama   for (char C : S)
58e36d7a6dSGeorge Rimar     BV[(uint8_t)C] = true;
59f7a7ab59SRui Ueyama   return BV;
60f7a7ab59SRui Ueyama }
61f7a7ab59SRui Ueyama 
62f7a7ab59SRui Ueyama // This is a scanner for the glob pattern.
63*48993d5aSJordan Rupprecht // A glob pattern token is one of "*", "?", "\", "[<chars>]", "[^<chars>]"
64*48993d5aSJordan Rupprecht // (which is a negative form of "[<chars>]"), "[!<chars>]" (which is
65*48993d5aSJordan Rupprecht // equivalent to "[^<chars>]"), or a non-meta character.
66f7a7ab59SRui Ueyama // This function returns the first token in S.
scan(StringRef & S,StringRef Original)67f7a7ab59SRui Ueyama static Expected<BitVector> scan(StringRef &S, StringRef Original) {
68f7a7ab59SRui Ueyama   switch (S[0]) {
69f7a7ab59SRui Ueyama   case '*':
70f7a7ab59SRui Ueyama     S = S.substr(1);
71f7a7ab59SRui Ueyama     // '*' is represented by an empty bitvector.
72f7a7ab59SRui Ueyama     // All other bitvectors are 256-bit long.
73f7a7ab59SRui Ueyama     return BitVector();
74f7a7ab59SRui Ueyama   case '?':
75f7a7ab59SRui Ueyama     S = S.substr(1);
76f7a7ab59SRui Ueyama     return BitVector(256, true);
77f7a7ab59SRui Ueyama   case '[': {
78*48993d5aSJordan Rupprecht     // ']' is allowed as the first character of a character class. '[]' is
79*48993d5aSJordan Rupprecht     // invalid. So, just skip the first character.
80*48993d5aSJordan Rupprecht     size_t End = S.find(']', 2);
81f7a7ab59SRui Ueyama     if (End == StringRef::npos)
82f7a7ab59SRui Ueyama       return make_error<StringError>("invalid glob pattern: " + Original,
83f7a7ab59SRui Ueyama                                      errc::invalid_argument);
84f7a7ab59SRui Ueyama 
85f7a7ab59SRui Ueyama     StringRef Chars = S.substr(1, End - 1);
86f7a7ab59SRui Ueyama     S = S.substr(End + 1);
87*48993d5aSJordan Rupprecht     if (Chars.startswith("^") || Chars.startswith("!")) {
88f7a7ab59SRui Ueyama       Expected<BitVector> BV = expand(Chars.substr(1), Original);
89f7a7ab59SRui Ueyama       if (!BV)
90f7a7ab59SRui Ueyama         return BV.takeError();
91f7a7ab59SRui Ueyama       return BV->flip();
92f7a7ab59SRui Ueyama     }
93f7a7ab59SRui Ueyama     return expand(Chars, Original);
94f7a7ab59SRui Ueyama   }
95*48993d5aSJordan Rupprecht   case '\\':
96*48993d5aSJordan Rupprecht     // Eat this character and fall through below to treat it like a non-meta
97*48993d5aSJordan Rupprecht     // character.
98*48993d5aSJordan Rupprecht     S = S.substr(1);
99*48993d5aSJordan Rupprecht     LLVM_FALLTHROUGH;
100f7a7ab59SRui Ueyama   default:
101f7a7ab59SRui Ueyama     BitVector BV(256, false);
102e36d7a6dSGeorge Rimar     BV[(uint8_t)S[0]] = true;
103f7a7ab59SRui Ueyama     S = S.substr(1);
104f7a7ab59SRui Ueyama     return BV;
105f7a7ab59SRui Ueyama   }
106f7a7ab59SRui Ueyama }
107f7a7ab59SRui Ueyama 
create(StringRef S)108f7a7ab59SRui Ueyama Expected<GlobPattern> GlobPattern::create(StringRef S) {
109f7a7ab59SRui Ueyama   GlobPattern Pat;
110f7a7ab59SRui Ueyama 
111f7a7ab59SRui Ueyama   // S doesn't contain any metacharacter,
112f7a7ab59SRui Ueyama   // so the regular string comparison should work.
113f7a7ab59SRui Ueyama   if (!hasWildcard(S)) {
114f7a7ab59SRui Ueyama     Pat.Exact = S;
115f7a7ab59SRui Ueyama     return Pat;
116f7a7ab59SRui Ueyama   }
117f7a7ab59SRui Ueyama 
118*48993d5aSJordan Rupprecht   // S is something like "foo*", and the "* is not escaped. We can use
119*48993d5aSJordan Rupprecht   // startswith().
120*48993d5aSJordan Rupprecht   if (S.endswith("*") && !S.endswith("\\*") && !hasWildcard(S.drop_back())) {
121f7a7ab59SRui Ueyama     Pat.Prefix = S.drop_back();
122f7a7ab59SRui Ueyama     return Pat;
123f7a7ab59SRui Ueyama   }
124f7a7ab59SRui Ueyama 
125f7a7ab59SRui Ueyama   // S is something like "*foo". We can use endswith().
126f7a7ab59SRui Ueyama   if (S.startswith("*") && !hasWildcard(S.drop_front())) {
127f7a7ab59SRui Ueyama     Pat.Suffix = S.drop_front();
128f7a7ab59SRui Ueyama     return Pat;
129f7a7ab59SRui Ueyama   }
130f7a7ab59SRui Ueyama 
131f7a7ab59SRui Ueyama   // Otherwise, we need to do real glob pattern matching.
132f7a7ab59SRui Ueyama   // Parse the pattern now.
133f7a7ab59SRui Ueyama   StringRef Original = S;
134f7a7ab59SRui Ueyama   while (!S.empty()) {
135f7a7ab59SRui Ueyama     Expected<BitVector> BV = scan(S, Original);
136f7a7ab59SRui Ueyama     if (!BV)
137f7a7ab59SRui Ueyama       return BV.takeError();
138f7a7ab59SRui Ueyama     Pat.Tokens.push_back(*BV);
139f7a7ab59SRui Ueyama   }
140f7a7ab59SRui Ueyama   return Pat;
141f7a7ab59SRui Ueyama }
142f7a7ab59SRui Ueyama 
match(StringRef S) const143f7a7ab59SRui Ueyama bool GlobPattern::match(StringRef S) const {
144f7a7ab59SRui Ueyama   if (Exact)
145f7a7ab59SRui Ueyama     return S == *Exact;
146f7a7ab59SRui Ueyama   if (Prefix)
147f7a7ab59SRui Ueyama     return S.startswith(*Prefix);
148f7a7ab59SRui Ueyama   if (Suffix)
149f7a7ab59SRui Ueyama     return S.endswith(*Suffix);
150f7a7ab59SRui Ueyama   return matchOne(Tokens, S);
151f7a7ab59SRui Ueyama }
152f7a7ab59SRui Ueyama 
153f7a7ab59SRui Ueyama // Runs glob pattern Pats against string S.
matchOne(ArrayRef<BitVector> Pats,StringRef S) const154f7a7ab59SRui Ueyama bool GlobPattern::matchOne(ArrayRef<BitVector> Pats, StringRef S) const {
155f7a7ab59SRui Ueyama   for (;;) {
156f7a7ab59SRui Ueyama     if (Pats.empty())
157f7a7ab59SRui Ueyama       return S.empty();
158f7a7ab59SRui Ueyama 
159f7a7ab59SRui Ueyama     // If Pats[0] is '*', try to match Pats[1..] against all possible
160f7a7ab59SRui Ueyama     // tail strings of S to see at least one pattern succeeds.
161f7a7ab59SRui Ueyama     if (Pats[0].size() == 0) {
162f7a7ab59SRui Ueyama       Pats = Pats.slice(1);
163f7a7ab59SRui Ueyama       if (Pats.empty())
164f7a7ab59SRui Ueyama         // Fast path. If a pattern is '*', it matches anything.
165f7a7ab59SRui Ueyama         return true;
166f7a7ab59SRui Ueyama       for (size_t I = 0, E = S.size(); I < E; ++I)
167f7a7ab59SRui Ueyama         if (matchOne(Pats, S.substr(I)))
168f7a7ab59SRui Ueyama           return true;
169f7a7ab59SRui Ueyama       return false;
170f7a7ab59SRui Ueyama     }
171f7a7ab59SRui Ueyama 
172f7a7ab59SRui Ueyama     // If Pats[0] is not '*', it must consume one character.
173e36d7a6dSGeorge Rimar     if (S.empty() || !Pats[0][(uint8_t)S[0]])
174f7a7ab59SRui Ueyama       return false;
175f7a7ab59SRui Ueyama     Pats = Pats.slice(1);
176f7a7ab59SRui Ueyama     S = S.substr(1);
177f7a7ab59SRui Ueyama   }
178f7a7ab59SRui Ueyama }
179