Tokenizer.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 1996-2023 The Squid Software Foundation and contributors
3  *
4  * Squid software is distributed under GPLv2+ license and includes
5  * contributions from numerous individuals and organizations.
6  * Please see the COPYING and CONTRIBUTORS files for details.
7  */
8 
9 /* DEBUG: section 24 SBuf */
10 
11 #include "squid.h"
12 #include "debug/Stream.h"
13 #include "parser/forward.h"
14 #include "parser/Tokenizer.h"
15 #include "sbuf/Stream.h"
16 
17 #include <cctype>
18 #include <cerrno>
19 
21 SBuf
23 {
24  // careful: n may be npos!
25  debugs(24, 5, "consuming " << n << " bytes");
26  const SBuf result = buf_.consume(n);
27  parsed_ += result.length();
28  return result;
29 }
30 
34 {
35  return consume(n).length();
36 }
37 
39 SBuf
41 {
42  debugs(24, 5, "consuming " << n << " bytes");
43 
44  // If n is npos, we consume everything from buf_ (and nothing from result).
45  const SBuf::size_type parsed = (n == SBuf::npos) ? buf_.length() : n;
46 
47  SBuf result = buf_;
48  buf_ = result.consume(buf_.length() - parsed);
49  parsed_ += parsed;
50  return result;
51 }
52 
56 {
57  return consumeTrailing(n).length();
58 }
59 
60 bool
61 Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
62 {
63  const Tokenizer saved(*this);
64  skipAll(delimiters);
65  const SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // not found = npos => consume to end
66  if (tokenLen == SBuf::npos) {
67  debugs(24, 8, "no token found for delimiters " << delimiters.name);
68  *this = saved;
69  return false;
70  }
71  returnedToken = consume(tokenLen); // cannot be empty
72  skipAll(delimiters);
73  debugs(24, DBG_DATA, "token found for delimiters " << delimiters.name << ": '" <<
74  returnedToken << '\'');
75  return true;
76 }
77 
78 bool
79 Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
80 {
81  SBuf::size_type prefixLen = buf_.substr(0,limit).findFirstNotOf(tokenChars);
82  if (prefixLen == 0) {
83  debugs(24, 8, "no prefix for set " << tokenChars.name);
84  return false;
85  }
86  if (prefixLen == SBuf::npos && (atEnd() || limit == 0)) {
87  debugs(24, 8, "no char in set " << tokenChars.name << " while looking for prefix");
88  return false;
89  }
90  if (prefixLen == SBuf::npos && limit > 0) {
91  debugs(24, 8, "whole haystack matched");
92  prefixLen = limit;
93  }
94  debugs(24, 8, "found with length " << prefixLen);
95  returnedToken = consume(prefixLen); // cannot be empty after the npos check
96  return true;
97 }
98 
99 SBuf
100 Parser::Tokenizer::prefix(const char *description, const CharacterSet &tokenChars, const SBuf::size_type limit)
101 {
102  if (atEnd())
103  throw InsufficientInput();
104 
105  SBuf result;
106 
107  if (!prefix(result, tokenChars, limit))
108  throw TexcHere(ToSBuf("cannot parse ", description));
109 
110  if (atEnd())
111  throw InsufficientInput();
112 
113  return result;
114 }
115 
116 bool
117 Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
118 {
119  SBuf span = buf_;
120 
121  if (limit < buf_.length())
122  span.consume(buf_.length() - limit); // ignore the N prefix characters
123 
124  auto i = span.rbegin();
125  SBuf::size_type found = 0;
126  while (i != span.rend() && tokenChars[*i]) {
127  ++i;
128  ++found;
129  }
130  if (!found)
131  return false;
132  returnedToken = consumeTrailing(found);
133  return true;
134 }
135 
138 {
139  const SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
140  if (prefixLen == 0) {
141  debugs(24, 8, "no match when trying to skipAll " << tokenChars.name);
142  return 0;
143  }
144  debugs(24, 8, "skipping all in " << tokenChars.name << " len " << prefixLen);
145  return success(prefixLen);
146 }
147 
148 void
149 Parser::Tokenizer::skipRequired(const char *description, const SBuf &tokenToSkip)
150 {
151  if (skip(tokenToSkip) || tokenToSkip.isEmpty())
152  return;
153 
154  if (tokenToSkip.startsWith(buf_))
155  throw InsufficientInput();
156 
157  throw TextException(ToSBuf("cannot skip ", description), Here());
158 }
159 
160 bool
162 {
163  if (!buf_.isEmpty() && chars[buf_[0]]) {
164  debugs(24, 8, "skipping one-of " << chars.name);
165  return success(1);
166  }
167  debugs(24, 8, "no match while skipping one-of " << chars.name);
168  return false;
169 }
170 
171 bool
173 {
174  if (buf_.length() < tokenToSkip.length())
175  return false;
176 
177  SBuf::size_type offset = 0;
178  if (tokenToSkip.length() < buf_.length())
179  offset = buf_.length() - tokenToSkip.length();
180 
181  if (buf_.substr(offset, SBuf::npos).cmp(tokenToSkip) == 0) {
182  debugs(24, 8, "skipping " << tokenToSkip.length());
183  return successTrailing(tokenToSkip.length());
184  }
185  return false;
186 }
187 
188 bool
189 Parser::Tokenizer::skip(const SBuf &tokenToSkip)
190 {
191  if (buf_.startsWith(tokenToSkip)) {
192  debugs(24, 8, "skipping " << tokenToSkip.length());
193  return success(tokenToSkip.length());
194  }
195  debugs(24, 8, "no match, not skipping '" << tokenToSkip << '\'');
196  return false;
197 }
198 
199 bool
200 Parser::Tokenizer::skip(const char tokenChar)
201 {
202  if (!buf_.isEmpty() && buf_[0] == tokenChar) {
203  debugs(24, 8, "skipping char '" << tokenChar << '\'');
204  return success(1);
205  }
206  debugs(24, 8, "no match, not skipping char '" << tokenChar << '\'');
207  return false;
208 }
209 
210 bool
212 {
213  if (!buf_.isEmpty() && skippable[buf_[buf_.length()-1]]) {
214  debugs(24, 8, "skipping one-of " << skippable.name);
215  return successTrailing(1);
216  }
217  debugs(24, 8, "no match while skipping one-of " << skippable.name);
218  return false;
219 }
220 
223 {
224  const SBuf::size_type prefixEnd = buf_.findLastNotOf(skippable);
225  const SBuf::size_type prefixLen = prefixEnd == SBuf::npos ?
226  0 : (prefixEnd + 1);
227  const SBuf::size_type suffixLen = buf_.length() - prefixLen;
228  if (suffixLen == 0) {
229  debugs(24, 8, "no match when trying to skip " << skippable.name);
230  return 0;
231  }
232  debugs(24, 8, "skipping in " << skippable.name << " len " << suffixLen);
233  return successTrailing(suffixLen);
234 }
235 
236 /* reworked from compat/strtoll.c */
237 bool
238 Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf::size_type limit)
239 {
240  if (atEnd() || limit == 0)
241  return false;
242 
243  const SBuf range(buf_.substr(0,limit));
244 
245  // XXX: account for buf_.size()
246  bool neg = false;
247  const char *s = range.rawContent();
248  const char *end = range.rawContent() + range.length();
249 
250  if (allowSign) {
251  if (*s == '-') {
252  neg = true;
253  ++s;
254  } else if (*s == '+') {
255  ++s;
256  }
257  if (s >= end) return false;
258  }
259  if (( base == 0 || base == 16) && *s == '0' && (s+1 < end ) &&
260  tolower(*(s+1)) == 'x') {
261  s += 2;
262  base = 16;
263  }
264  if (base == 0) {
265  if ( *s == '0') {
266  base = 8;
267  } else {
268  base = 10;
269  }
270  }
271  if (s >= end) return false;
272 
273  uint64_t cutoff;
274 
275  cutoff = neg ? -static_cast<uint64_t>(INT64_MIN) : INT64_MAX;
276  const int cutlim = cutoff % static_cast<int64_t>(base);
277  cutoff /= static_cast<uint64_t>(base);
278 
279  int any = 0, c;
280  int64_t acc = 0;
281  do {
282  c = *s;
283  if (xisdigit(c)) {
284  c -= '0';
285  } else if (xisalpha(c)) {
286  c -= xisupper(c) ? 'A' - 10 : 'a' - 10;
287  } else {
288  break;
289  }
290  if (c >= base)
291  break;
292  if (any < 0 || static_cast<uint64_t>(acc) > cutoff || (static_cast<uint64_t>(acc) == cutoff && c > cutlim))
293  any = -1;
294  else {
295  any = 1;
296  acc *= base;
297  acc += c;
298  }
299  } while (++s < end);
300 
301  if (any == 0) // nothing was parsed
302  return false;
303  if (any < 0) {
304  acc = neg ? INT64_MIN : INT64_MAX;
305  errno = ERANGE;
306  return false;
307  } else if (neg)
308  acc = -acc;
309 
310  result = acc;
311  return success(s - range.rawContent());
312 }
313 
314 int64_t
315 Parser::Tokenizer::udec64(const char *description, const SBuf::size_type limit)
316 {
317  if (atEnd())
318  throw InsufficientInput();
319 
320  int64_t result = 0;
321 
322  // Since we only support unsigned decimals, a parsing failure with a
323  // non-empty input always implies invalid/malformed input (or a buggy
324  // limit=0 caller). TODO: Support signed and non-decimal integers by
325  // refactoring int64() to detect insufficient input.
326  if (!int64(result, 10, false, limit))
327  throw TexcHere(ToSBuf("cannot parse ", description));
328 
329  if (atEnd())
330  throw InsufficientInput(); // more digits may be coming
331 
332  return result;
333 }
334 
SBuf buf_
yet unparsed input
Definition: Tokenizer.h:176
bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:79
#define Here()
source code location of the caller
Definition: Here.h:15
SBuf::size_type skipAll(const CharacterSet &discardables)
Definition: Tokenizer.cc:137
const char * name
optional set label for debugging (default: "anonymous")
Definition: CharacterSet.h:72
const_reverse_iterator rbegin() const
Definition: SBuf.h:595
bool skipOneTrailing(const CharacterSet &discardables)
Definition: Tokenizer.cc:211
SBuf::size_type success(const SBuf::size_type n)
convenience method: consume()s up to n bytes and returns their count
Definition: Tokenizer.cc:33
bool isEmpty() const
Definition: SBuf.h:435
bool token(SBuf &returnedToken, const CharacterSet &delimiters)
Definition: Tokenizer.cc:61
#define INT64_MIN
Definition: types.h:79
Definition: SBuf.h:93
bool skip(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:189
bool startsWith(const SBuf &S, const SBufCaseSensitive isCaseSensitive=caseSensitive) const
Definition: SBuf.cc:442
SBuf::size_type skipAllTrailing(const CharacterSet &discardables)
Definition: Tokenizer.cc:222
#define TexcHere(msg)
legacy convenience macro; it is not difficult to type Here() now
Definition: TextException.h:63
#define DBG_DATA
Definition: Stream.h:40
#define xisupper(x)
Definition: xis.h:26
const_reverse_iterator rend() const
Definition: SBuf.h:599
#define xisalpha(x)
Definition: xis.h:21
const char * rawContent() const
Definition: SBuf.cc:509
SBuf consume(size_type n=npos)
Definition: SBuf.cc:481
SBuf consumeTrailing(const SBuf::size_type n)
convenience method: consumes up to n last bytes and returns them
Definition: Tokenizer.cc:40
MemBlob::size_type size_type
Definition: SBuf.h:96
SBuf::size_type parsed_
bytes successfully parsed, including skipped
Definition: Tokenizer.h:177
#define xisdigit(x)
Definition: xis.h:18
size_type length() const
Returns the number of bytes stored in SBuf.
Definition: SBuf.h:419
static const size_type npos
Definition: SBuf.h:100
SBuf consume(const SBuf::size_type n)
convenience method: consumes up to n bytes, counts, and returns them
Definition: Tokenizer.cc:22
#define INT64_MAX
Definition: types.h:89
bool int64(int64_t &result, int base=0, bool allowSign=true, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:238
an std::runtime_error with thrower location info
Definition: TextException.h:20
void skipRequired(const char *description, const SBuf &tokenToSkip)
Definition: Tokenizer.cc:149
SBuf::size_type successTrailing(const SBuf::size_type n)
convenience method: consumes up to n last bytes and returns their count
Definition: Tokenizer.cc:55
SBuf ToSBuf(Args &&... args)
slowly stream-prints all arguments into a freshly allocated SBuf
Definition: Stream.h:63
bool skipOne(const CharacterSet &discardables)
Definition: Tokenizer.cc:161
bool skipSuffix(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:172
optimized set of C chars, with quick membership test and merge support
Definition: CharacterSet.h:17
int64_t udec64(const char *description, SBuf::size_type limit=SBuf::npos)
int64() wrapper but limited to unsigned decimal integers (for now)
Definition: Tokenizer.cc:315
bool suffix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:117
#define debugs(SECTION, LEVEL, CONTENT)
Definition: Stream.h:192
thrown by modern "incremental" parsers when they need more data
Definition: forward.h:18

 

Introduction

Documentation

Support

Miscellaneous