Uri.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 1996-2025 The Squid Software Foundation and contributors
3  *
4  * Squid software is distributed under GPLv2+ license and includes
5  * contributions from numerous individuals and organizations.
6  * Please see the COPYING and CONTRIBUTORS files for details.
7  */
8 
9 /* DEBUG: section 23 URL Parsing */
10 
11 #include "squid.h"
12 #include "anyp/Host.h"
13 #include "anyp/Uri.h"
14 #include "base/Raw.h"
15 #include "globals.h"
16 #include "HttpRequest.h"
17 #include "parser/Tokenizer.h"
18 #include "rfc1738.h"
19 #include "SquidConfig.h"
20 #include "SquidMath.h"
21 
/*
 * Character whitelists for the host part of a URL.
 *
 * AnyP::Uri::parse() measures the host with strspn() against one of these
 * sets and rejects the URL ("Illegal character in hostname") when any host
 * character is outside the set.
 *
 * valid_hostname_chars_u is the set chosen when Config.onoff.allow_underscore
 * is set; it differs from valid_hostname_chars only by also containing '_'.
 * NOTE(review): '[', ':' and ']' are presumably listed for IPv6 literals --
 * confirm against the host parsing code.
 */
22 static const char valid_hostname_chars_u[] =
23  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
24  "abcdefghijklmnopqrstuvwxyz"
25  "0123456789-._"
26  "[:]"
27  ;
/* Same as valid_hostname_chars_u, but without the underscore. */
28 static const char valid_hostname_chars[] =
29  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
30  "abcdefghijklmnopqrstuvwxyz"
31  "0123456789-."
32  "[:]"
33  ;
34 
36 static const CharacterSet &
38 {
39  /*
40  * RFC 3986 section 3.2.1
41  *
42  * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
43  * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
44  * pct-encoded = "%" HEXDIG HEXDIG
45  * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
46  */
47  static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
50  return userInfoValid;
51 }
52 
54 static const CharacterSet &
56 {
57  /*
58  * RFC 3986 section 3.3
59  *
60  * path = path-abempty ; begins with "/" or is empty
61  * ...
62  * path-abempty = *( "/" segment )
63  * segment = *pchar
64  * pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
65  */
66  static const auto pathValid = CharacterSet("path", "/:@-._~%!$&'()*+,;=") +
69  return pathValid;
70 }
71 
75 SBuf
76 AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
77 {
78  if (buf.isEmpty())
79  return buf;
80 
81  Parser::Tokenizer tk(buf);
82  SBuf goodSection;
83  // optimization for the arguably common "no encoding necessary" case
84  if (tk.prefix(goodSection, ignore) && tk.atEnd())
85  return buf;
86 
87  SBuf output;
88  output.reserveSpace(buf.length() * 3); // worst case: encode all chars
89  output.append(goodSection); // may be empty
90 
91  while (!tk.atEnd()) {
92  // TODO: Add Tokenizer::parseOne(void).
93  const auto ch = tk.remaining()[0];
94  output.appendf("%%%02X", static_cast<unsigned int>(static_cast<unsigned char>(ch))); // TODO: Optimize using a table
95  (void)tk.skip(ch);
96 
97  if (tk.prefix(goodSection, ignore))
98  output.append(goodSection);
99  }
100 
101  return output;
102 }
103 
104 SBuf
106 {
107  SBuf output;
108  Parser::Tokenizer tok(buf);
109  while (!tok.atEnd()) {
110  SBuf token;
111  static const auto unencodedChars = CharacterSet("percent", "%").complement("unencoded");
112  if (tok.prefix(token, unencodedChars))
113  output.append(token);
114 
115  // we are either at '%' or at end of input
116  if (tok.skip('%')) {
117  int64_t hex1 = 0, hex2 = 0;
118  if (tok.int64(hex1, 16, false, 1) && tok.int64(hex2, 16, false, 1))
119  output.append(static_cast<char>((hex1 << 4) | hex2));
120  else
121  throw TextException("invalid pct-encoded triplet", Here());
122  }
123  }
124  return output;
125 }
126 
127 const SBuf &
129 {
130  static SBuf star("*");
131  return star;
132 }
133 
134 const SBuf &
136 {
137  static SBuf slash("/");
138  return slash;
139 }
140 
141 void
142 AnyP::Uri::host(const char *src)
143 {
144  hostAddr_.fromHost(src);
145  if (hostAddr_.isAnyAddr()) {
146  xstrncpy(host_, src, sizeof(host_));
147  hostIsNumeric_ = false;
148  } else {
149  hostAddr_.toHostStr(host_, sizeof(host_));
150  debugs(23, 3, "given IP: " << hostAddr_);
151  hostIsNumeric_ = 1;
152  }
153  touch();
154 }
155 
156 // TODO: Replace with ToSBuf(parsedHost()) or similar.
157 SBuf
159 {
160  if (hostIsNumeric()) {
161  static char ip[MAX_IPSTRLEN];
162  const auto hostStrLen = hostIP().toHostStr(ip, sizeof(ip));
163  return SBuf(ip, hostStrLen);
164  } else
165  return SBuf(host());
166 }
167 
168 std::optional<AnyP::Host>
170 {
171  if (hostIsNumeric())
172  return Host::ParseIp(hostIP());
173 
174  // XXX: Interpret host subcomponent as reg-name representing a DNS name. It
175  // may actually be, for example, a URN namespace ID (NID; see RFC 8141), but
176  // current Squid APIs do not support adequate representation of those cases.
177  const SBuf regName(host());
178 
179  if (regName.find('%') != SBuf::npos) {
180  debugs(23, 3, "rejecting percent-encoded reg-name: " << regName);
181  return std::nullopt; // TODO: Decode() instead
182  }
183 
184  return Host::ParseSimpleDomainName(regName);
185 }
186 
187 const SBuf &
189 {
190  // RFC 3986 section 3.3 says path can be empty (path-abempty).
191  // RFC 7230 sections 2.7.3, 5.3.1, 5.7.2 - says path cannot be empty, default to "/"
192  // at least when sending and using. We must still accept path-abempty as input.
193  if (path_.isEmpty() && (scheme_ == AnyP::PROTO_HTTP || scheme_ == AnyP::PROTO_HTTPS))
194  return SlashPath();
195 
196  return path_;
197 }
198 
199 void
201 {
202  debugs(23, 5, "urlInitialize: Initializing...");
203  /* this ensures that the number of protocol strings is the same as
204  * the enum slots allocated because the last enum is always 'MAX'.
205  */
206  assert(strcmp(AnyP::ProtocolType_str[AnyP::PROTO_MAX], "MAX") == 0);
207  /*
208  * These test that our matchDomainName() function works the
209  * way we expect it to.
210  */
211  assert(0 == matchDomainName("foo.com", "foo.com"));
212  assert(0 == matchDomainName(".foo.com", "foo.com"));
213  assert(0 == matchDomainName("foo.com", ".foo.com"));
214  assert(0 == matchDomainName(".foo.com", ".foo.com"));
215  assert(0 == matchDomainName("x.foo.com", ".foo.com"));
216  assert(0 == matchDomainName("y.x.foo.com", ".foo.com"));
217  assert(0 != matchDomainName("x.foo.com", "foo.com"));
218  assert(0 != matchDomainName("foo.com", "x.foo.com"));
219  assert(0 != matchDomainName("bar.com", "foo.com"));
220  assert(0 != matchDomainName(".bar.com", "foo.com"));
221  assert(0 != matchDomainName(".bar.com", ".foo.com"));
222  assert(0 != matchDomainName("bar.com", ".foo.com"));
223  assert(0 < matchDomainName("zzz.com", "foo.com"));
224  assert(0 > matchDomainName("aaa.com", "foo.com"));
225  assert(0 == matchDomainName("FOO.com", "foo.COM"));
226  assert(0 < matchDomainName("bfoo.com", "afoo.com"));
227  assert(0 > matchDomainName("afoo.com", "bfoo.com"));
228  assert(0 < matchDomainName("x-foo.com", ".foo.com"));
229 
230  assert(0 == matchDomainName(".foo.com", ".foo.com", mdnRejectSubsubDomains));
231  assert(0 == matchDomainName("x.foo.com", ".foo.com", mdnRejectSubsubDomains));
232  assert(0 != matchDomainName("y.x.foo.com", ".foo.com", mdnRejectSubsubDomains));
233  assert(0 != matchDomainName(".x.foo.com", ".foo.com", mdnRejectSubsubDomains));
234 
235  assert(0 == matchDomainName("*.foo.com", "x.foo.com", mdnHonorWildcards));
236  assert(0 == matchDomainName("*.foo.com", ".x.foo.com", mdnHonorWildcards));
237  assert(0 == matchDomainName("*.foo.com", ".foo.com", mdnHonorWildcards));
238  assert(0 != matchDomainName("*.foo.com", "foo.com", mdnHonorWildcards));
239 
240  assert(0 != matchDomainName("foo.com", ""));
241  assert(0 != matchDomainName("foo.com", "", mdnHonorWildcards));
242  assert(0 != matchDomainName("foo.com", "", mdnRejectSubsubDomains));
243 
244  /* more cases? */
245 }
246 
254 static AnyP::UriScheme
256 {
257  /*
258  * RFC 3986 section 3.1 paragraph 2:
259  *
260  * Scheme names consist of a sequence of characters beginning with a
261  * letter and followed by any combination of letters, digits, plus
262  * ("+"), period ("."), or hyphen ("-").
263  */
264  static const auto schemeChars = CharacterSet("scheme", "+.-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
265 
266  SBuf str;
267  if (tok.prefix(str, schemeChars, 16) && tok.skip(':') && CharacterSet::ALPHA[str.at(0)]) {
268  const auto protocol = AnyP::UriScheme::FindProtocolType(str);
269  if (protocol == AnyP::PROTO_UNKNOWN)
270  return AnyP::UriScheme(protocol, str.c_str());
271  return AnyP::UriScheme(protocol, nullptr);
272  }
273 
274  throw TextException("invalid URI scheme", Here());
275 }
276 
284 bool
285 urlAppendDomain(char *host)
286 {
287  /* For IPv4 addresses check for a dot */
288  /* For IPv6 addresses also check for a colon */
289  if (Config.appendDomain && !strchr(host, '.') && !strchr(host, ':')) {
290  const uint64_t dlen = strlen(host);
291  const uint64_t want = dlen + Config.appendDomainLen;
292  if (want > SQUIDHOSTNAMELEN - 1) {
293  debugs(23, 2, "URL domain too large (" << dlen << " bytes)");
294  return false;
295  }
296  strncat(host, Config.appendDomain, SQUIDHOSTNAMELEN - dlen - 1);
297  }
298  return true;
299 }
300 
301 /*
302  * Parse a URI/URL.
303  *
304  * It is assumed that the URL is complete -
305  * i.e., the end of the string is the end of the URL. Don't pass a partial
306  * URL here as this routine doesn't have any way of knowing whether
307  * it is partial or not (i.e., it handles the case of no trailing slash as
308  * being "end of host with implied path of /").
309  *
310  * method is used to switch parsers. If method is Http::METHOD_CONNECT,
311  * then rather than a URL a hostname:port is looked for.
312  */
313 bool
314 AnyP::Uri::parse(const HttpRequestMethod& method, const SBuf &rawUrl)
315 {
316  try {
317 
318  LOCAL_ARRAY(char, login, MAX_URL);
319  LOCAL_ARRAY(char, foundHost, MAX_URL);
320  LOCAL_ARRAY(char, urlpath, MAX_URL);
321  char *t = nullptr;
322  char *q = nullptr;
323  int foundPort;
324  int l;
325  int i;
326  const char *src;
327  char *dst;
328  foundHost[0] = urlpath[0] = login[0] = '\0';
329 
330  if ((l = rawUrl.length()) + Config.appendDomainLen > (MAX_URL - 1)) {
331  debugs(23, DBG_IMPORTANT, MYNAME << "URL too large (" << l << " bytes)");
332  return false;
333  }
334 
335  if ((method == Http::METHOD_OPTIONS || method == Http::METHOD_TRACE) &&
336  Asterisk().cmp(rawUrl) == 0) {
337  // XXX: these methods might also occur in HTTPS traffic. Handle this better.
338  setScheme(AnyP::PROTO_HTTP, nullptr);
339  port(getScheme().defaultPort());
340  path(Asterisk());
341  return true;
342  }
343 
344  Parser::Tokenizer tok(rawUrl);
345  AnyP::UriScheme scheme;
346 
347  if (method == Http::METHOD_CONNECT) {
348  // For CONNECTs, RFC 9110 Section 9.3.6 requires "only the host and
349  // port number of the tunnel destination, separated by a colon".
350 
351  const auto rawHost = parseHost(tok);
352  Assure(rawHost.length() < sizeof(foundHost));
353  SBufToCstring(foundHost, rawHost);
354 
355  if (!tok.skip(':'))
356  throw TextException("missing required :port in CONNECT target", Here());
357  foundPort = parsePort(tok);
358 
359  if (!tok.remaining().isEmpty())
360  throw TextException("garbage after host:port in CONNECT target", Here());
361  } else {
362 
363  scheme = uriParseScheme(tok);
364 
365  if (scheme == AnyP::PROTO_NONE)
366  return false; // invalid scheme
367 
368  if (scheme == AnyP::PROTO_URN) {
369  parseUrn(tok); // throws on any error
370  return true;
371  }
372 
373  // URLs then have "//"
374  static const SBuf doubleSlash("//");
375  if (!tok.skip(doubleSlash))
376  return false;
377 
378  auto B = tok.remaining();
379  const char *url = B.c_str();
380 
381  /* Parse the URL: */
382  src = url;
383  i = 0;
384 
385  /* Then everything until first /; that's host (and port; which we'll look for here later) */
386  // bug 1881: If we don't get a "/" then we imply it was there
387  // bug 3074: We could just be given a "?" or "#". These also imply "/"
388  // bug 3233: whitespace is also a hostname delimiter.
389  for (dst = foundHost; i < l && *src != '/' && *src != '?' && *src != '#' && *src != '\0' && !xisspace(*src); ++i, ++src, ++dst) {
390  *dst = *src;
391  }
392 
393  /*
394  * We can't check for "i >= l" here because we could be at the end of the line
395  * and have a perfectly valid URL w/ no trailing '/'. In this case we assume we've
396  * been -given- a valid URL and the path is just '/'.
397  */
398  if (i > l)
399  return false;
400  *dst = '\0';
401 
402  // We are looking at path-abempty.
403  if (*src != '/') {
404  // path-empty, including the end of the `src` c-string cases
405  urlpath[0] = '/';
406  dst = &urlpath[1];
407  } else {
408  dst = urlpath;
409  }
410  /* Then everything from / (inclusive) until \r\n or \0 - that's urlpath */
411  for (; i < l && *src != '\r' && *src != '\n' && *src != '\0'; ++i, ++src, ++dst) {
412  *dst = *src;
413  }
414 
415  /* We -could- be at the end of the buffer here */
416  if (i > l)
417  return false;
418  *dst = '\0';
419 
420  // If the parsed scheme has no (known) default port, and there is no
421  // explicit port, then we will reject the zero port during foundPort
422  // validation, often resulting in a misleading 400/ERR_INVALID_URL.
423  // TODO: Remove this hack when switching to Tokenizer-based parsing.
424  foundPort = scheme.defaultPort().value_or(0); // may be reset later
425 
426  /* Is there any login information? (we should eventually parse it above) */
427  t = strrchr(foundHost, '@');
428  if (t != nullptr) {
429  strncpy((char *) login, (char *) foundHost, sizeof(login)-1);
430  login[sizeof(login)-1] = '\0';
431  t = strrchr(login, '@');
432  *t = 0;
433  strncpy((char *) foundHost, t + 1, sizeof(foundHost)-1);
434  foundHost[sizeof(foundHost)-1] = '\0';
435  // Bug 4498: URL-unescape the login info after extraction
436  rfc1738_unescape(login);
437  }
438 
439  /* Is there any host information? (we should eventually parse it above) */
440  if (*foundHost == '[') {
441  /* strip any IPA brackets. valid under IPv6. */
442  dst = foundHost;
443  /* only for IPv6 sadly, pre-IPv6/URL code can't handle the clean result properly anyway. */
444  src = foundHost;
445  ++src;
446  l = strlen(foundHost);
447  i = 1;
448  for (; i < l && *src != ']' && *src != '\0'; ++i, ++src, ++dst) {
449  *dst = *src;
450  }
451 
452  /* we moved in-place, so truncate the actual hostname found */
453  *dst = '\0';
454  ++dst;
455 
456  /* skip ahead to either start of port, or original EOS */
457  while (*dst != '\0' && *dst != ':')
458  ++dst;
459  t = dst;
460  } else {
461  t = strrchr(foundHost, ':');
462 
463  if (t != strchr(foundHost,':') ) {
464  /* RFC 2732 states IPv6 "SHOULD" be bracketed. allowing for times when its not. */
465  /* RFC 3986 'update' simply modifies this to an "is" with no emphasis at all! */
466  /* therefore we MUST accept the case where they are not bracketed at all. */
467  t = nullptr;
468  }
469  }
470 
471  // Bug 3183 sanity check: If scheme is present, host must be too.
472  if (scheme != AnyP::PROTO_NONE && foundHost[0] == '\0') {
473  debugs(23, DBG_IMPORTANT, "SECURITY ALERT: Missing hostname in URL '" << url << "'. see access.log for details.");
474  return false;
475  }
476 
477  if (t && *t == ':') {
478  *t = '\0';
479  ++t;
480  foundPort = atoi(t);
481  }
482  }
483 
484  for (t = foundHost; *t; ++t)
485  *t = xtolower(*t);
486 
487  if (stringHasWhitespace(foundHost)) {
489  t = q = foundHost;
490  while (*t) {
491  if (!xisspace(*t)) {
492  *q = *t;
493  ++q;
494  }
495  ++t;
496  }
497  *q = '\0';
498  }
499  }
500 
501  debugs(23, 3, "Split URL '" << rawUrl << "' into proto='" << scheme.image() << "', host='" << foundHost << "', port='" << foundPort << "', path='" << urlpath << "'");
502 
504  strspn(foundHost, Config.onoff.allow_underscore ? valid_hostname_chars_u : valid_hostname_chars) != strlen(foundHost)) {
505  debugs(23, DBG_IMPORTANT, MYNAME << "Illegal character in hostname '" << foundHost << "'");
506  return false;
507  }
508 
509  if (!urlAppendDomain(foundHost))
510  return false;
511 
512  /* remove trailing dots from hostnames */
513  while ((l = strlen(foundHost)) > 0 && foundHost[--l] == '.')
514  foundHost[l] = '\0';
515 
516  /* reject duplicate or leading dots */
517  if (strstr(foundHost, "..") || *foundHost == '.') {
518  debugs(23, DBG_IMPORTANT, MYNAME << "Illegal hostname '" << foundHost << "'");
519  return false;
520  }
521 
522  if (foundPort < 1 || foundPort > 65535) {
523  debugs(23, 3, "Invalid port '" << foundPort << "'");
524  return false;
525  }
526 
527  if (stringHasWhitespace(urlpath)) {
528  debugs(23, 2, "URI has whitespace: {" << rawUrl << "}");
529 
530  switch (Config.uri_whitespace) {
531 
532  case URI_WHITESPACE_DENY:
533  return false;
534 
536  break;
537 
539  t = rfc1738_escape_unescaped(urlpath);
540  xstrncpy(urlpath, t, MAX_URL);
541  break;
542 
543  case URI_WHITESPACE_CHOP:
544  *(urlpath + strcspn(urlpath, w_space)) = '\0';
545  break;
546 
548  default:
549  t = q = urlpath;
550  while (*t) {
551  if (!xisspace(*t)) {
552  *q = *t;
553  ++q;
554  }
555  ++t;
556  }
557  *q = '\0';
558  }
559  }
560 
561  setScheme(scheme);
562  path(urlpath);
563  host(foundHost);
564  userInfo(SBuf(login));
565  port(foundPort);
566  return true;
567 
568  } catch (...) {
569  debugs(23, 2, "error: " << CurrentException << " " << Raw("rawUrl", rawUrl.rawContent(), rawUrl.length()));
570  return false;
571  }
572 }
573 
588 void
590 {
591  static const auto nidChars = CharacterSet("NID","-") + CharacterSet::ALPHA + CharacterSet::DIGIT;
592  static const auto alphanum = (CharacterSet::ALPHA + CharacterSet::DIGIT).rename("alphanum");
593  SBuf nid;
594  if (!tok.prefix(nid, nidChars, 32))
595  throw TextException("NID not found", Here());
596 
597  if (!tok.skip(':'))
598  throw TextException("NID too long or missing ':' delimiter", Here());
599 
600  if (nid.length() < 2)
601  throw TextException("NID too short", Here());
602 
603  if (!alphanum[*nid.begin()])
604  throw TextException("NID prefix is not alphanumeric", Here());
605 
606  if (!alphanum[*nid.rbegin()])
607  throw TextException("NID suffix is not alphanumeric", Here());
608 
609  setScheme(AnyP::PROTO_URN, nullptr);
610  host(nid.c_str());
611  // TODO validate path characters
612  path(tok.remaining());
613  debugs(23, 3, "Split URI into proto=urn, nid=" << nid << ", " << Raw("path",path().rawContent(),path().length()));
614 }
615 
619 SBuf
621 {
622  // host = IP-literal / IPv4address / reg-name
623 
624  // XXX: CharacterSets below reject uri-host values containing whitespace
625  // (e.g., "10.0.0. 1"). That is not a bug, but the uri_whitespace directive
626  // can be interpreted as if it applies to uri-host and this code. TODO: Fix
627  // uri_whitespace and the code using it to exclude uri-host (and URI scheme,
628  // port, etc.) from that directive scope.
629 
630  // IP-literal = "[" ( IPv6address / IPvFuture ) "]"
631  if (tok.skip('[')) {
632  // Add "." because IPv6address in RFC 3986 includes ls32, which includes
633  // IPv4address: ls32 = ( h16 ":" h16 ) / IPv4address
634  // This set rejects IPvFuture that needs a "v" character.
635  static const CharacterSet IPv6chars = (
636  CharacterSet::HEXDIG + CharacterSet("colon", ":") + CharacterSet("period", ".")).rename("IPv6");
637  SBuf ipv6ish;
638  if (!tok.prefix(ipv6ish, IPv6chars))
639  throw TextException("malformed or unsupported bracketed IP address in uri-host", Here());
640 
641  if (!tok.skip(']'))
642  throw TextException("IPv6 address is missing a closing bracket in uri-host", Here());
643 
644  // This rejects bracketed IPv4address and domain names because they lack ":".
645  if (ipv6ish.find(':') == SBuf::npos)
646  throw TextException("bracketed IPv6 address is missing a colon in uri-host", Here());
647 
648  // This rejects bracketed non-IP addresses that our caller would have
649  // otherwise mistaken for a domain name (e.g., '[127.0.0:1]').
650  Ip::Address ipv6check;
651  if (!ipv6check.fromHost(ipv6ish.c_str()))
652  throw TextException("malformed bracketed IPv6 address in uri-host", Here());
653 
654  return ipv6ish;
655  }
656 
657  // no brackets implies we are looking at IPv4address or reg-name
658 
659  static const CharacterSet IPv4chars = CharacterSet("period", ".") + CharacterSet::DIGIT;
660  SBuf ipv4ish; // IPv4address-ish
661  if (tok.prefix(ipv4ish, IPv4chars)) {
662  // This rejects non-IP addresses that our caller would have
663  // otherwise mistaken for a domain name (e.g., '127.0.0' or '1234.5').
664  Ip::Address ipCheck;
665  if (!ipCheck.fromHost(ipv4ish.c_str()))
666  throw TextException("malformed IP address in uri-host", Here());
667 
668  return ipv4ish;
669  }
670 
671  // XXX: This code does not detect/reject some bad host values (e.g. "!#$%&").
672  // TODO: Add more checks here, after migrating the
673  // non-CONNECT uri-host parsing code to use us.
674 
675  SBuf otherHost; // IPv4address-ish or reg-name-ish;
676  // ":" is not in TCHAR so we will stop before any port specification
677  if (tok.prefix(otherHost, CharacterSet::TCHAR))
678  return otherHost;
679 
680  throw TextException("malformed IPv4 address or host name in uri-host", Here());
681 }
682 
689 int
691 {
692  if (tok.skip('0'))
693  throw TextException("zero or zero-prefixed port", Here());
694 
695  int64_t rawPort = 0;
696  if (!tok.int64(rawPort, 10, false)) // port = *DIGIT
697  throw TextException("malformed or missing port", Here());
698 
699  Assure(rawPort > 0);
700  constexpr KnownPort portMax = 65535; // TODO: Make this a class-scope constant and REuse it.
701  constexpr auto portStorageMax = std::numeric_limits<Port::value_type>::max();
702  static_assert(!Less(portStorageMax, portMax), "Port type can represent the maximum valid port number");
703  if (Less(portMax, rawPort))
704  throw TextException("huge port", Here());
705 
706  // TODO: Return KnownPort after migrating the non-CONNECT uri-host parsing
707  // code to use us (so that foundPort "int" disappears or starts using Port).
708  return NaturalCast<int>(rawPort);
709 }
710 
711 void
713 {
714  absolute_.clear();
715  authorityHttp_.clear();
716  authorityWithPort_.clear();
717  absolutePath_.clear();
718 }
719 
720 SBuf &
721 AnyP::Uri::authority(bool requirePort) const
722 {
723  if (authorityHttp_.isEmpty()) {
724 
725  // both formats contain Host/IP
726  authorityWithPort_.append(host());
727  authorityHttp_ = authorityWithPort_;
728 
729  if (port().has_value()) {
730  authorityWithPort_.appendf(":%hu", *port());
731  // authorityHttp_ only has :port for known non-default ports
732  if (port() != getScheme().defaultPort())
733  authorityHttp_ = authorityWithPort_;
734  }
735  // else XXX: We made authorityWithPort_ that does not have a port.
736  // TODO: Audit callers and refuse to give out broken authorityWithPort_.
737  }
738 
739  return requirePort ? authorityWithPort_ : authorityHttp_;
740 }
741 
742 SBuf &
744 {
745  if (absolute_.isEmpty()) {
746  // TODO: most URL will be much shorter, avoid allocating this much
747  absolute_.reserveCapacity(MAX_URL);
748 
749  absolute_.append(getScheme().image());
750  absolute_.append(":",1);
751  if (getScheme() != AnyP::PROTO_URN) {
752  absolute_.append("//", 2);
753  const bool allowUserInfo = getScheme() == AnyP::PROTO_FTP ||
754  getScheme() == AnyP::PROTO_UNKNOWN;
755 
756  if (allowUserInfo && !userInfo().isEmpty()) {
757  static const CharacterSet uiChars = CharacterSet(UserInfoChars())
758  .remove('%')
759  .rename("userinfo-reserved");
760  absolute_.append(Encode(userInfo(), uiChars));
761  absolute_.append("@", 1);
762  }
763  absolute_.append(authority());
764  } else {
765  absolute_.append(host());
766  absolute_.append(":", 1);
767  }
768  absolute_.append(absolutePath());
769  }
770 
771  return absolute_;
772 }
773 
774 SBuf &
776 {
777  if (absolutePath_.isEmpty()) {
778  // TODO: Encode each URI subcomponent in path_ as needed.
779  absolutePath_ = Encode(path(), PathChars());
780  }
781 
782  return absolutePath_;
783 }
784 
785 /* XXX: Performance: This is an *almost* duplicate of HttpRequest::effectiveRequestUri(). But elides the query-string.
786  * After copying it on in the first place! Would be less code to merge the two with a flag parameter.
787  * and never copy the query-string part in the first place
788  */
789 char *
791 {
792  LOCAL_ARRAY(char, buf, MAX_URL);
793 
794  snprintf(buf, sizeof(buf), SQUIDSBUFPH, SQUIDSBUFPRINT(url));
795  buf[sizeof(buf)-1] = '\0';
796 
797  // URN, CONNECT method, and non-stripped URIs can go straight out
798  if (Config.onoff.strip_query_terms && !(method == Http::METHOD_CONNECT || scheme == AnyP::PROTO_URN)) {
799  // strip anything AFTER a question-mark
800  // leaving the '?' in place
801  if (auto t = strchr(buf, '?')) {
802  *(++t) = '\0';
803  }
804  }
805 
806  if (stringHasCntl(buf))
808 
809  return buf;
810 }
811 
818 const char *
820 {
821  LOCAL_ARRAY(char, buf, MAX_URL);
822 
823  // method CONNECT and port HTTPS
824  if (request->method == Http::METHOD_CONNECT && request->url.port() == 443) {
825  snprintf(buf, MAX_URL, "https://%s/*", request->url.host());
826  return buf;
827  }
828 
829  // else do the normal complete canonical thing.
830  return request->canonicalCleanUrl();
831 }
832 
/// Tells whether the given c-string looks like a relative reference rather
/// than a URL that starts with a scheme.
/// \param url NUL-terminated text to classify; may be nullptr
/// \retval false url is nullptr, or a colon appears in url before the first
///         '/', '?', or '#' (i.e. inside the first path segment)
/// \retval true in all other cases, including an empty string and any string
///         that begins with '/'
bool
urlIsRelative(const char *url)
{
    if (!url)
        return false; // no URL

    /*
     * RFC 3986 section 3.3 lists the path forms:
     *
     * path = path-abempty  ; begins with "/" or is empty
     *      / path-absolute ; begins with "/" but not "//"
     *      / path-noscheme ; begins with a non-colon segment
     *      / path-rootless ; begins with a segment
     *      / path-empty    ; zero characters
     *
     * and section 4.2 forbids a colon in the first segment of a
     * relative-path reference (it would be mistaken for a scheme name).
     */

    // Find the first character that either ends the first segment ('/', '?',
    // '#', or the terminating NUL) or is a colon. An empty string stops at
    // its NUL and a leading '/' stops at offset zero, so path-empty,
    // network-path, and path-absolute inputs all count as relative.
    const auto firstSpecial = strcspn(url, ":/?#");

    // relative unless the colon was found first
    return url[firstSpecial] != ':';
}
877 
878 void
879 AnyP::Uri::addRelativePath(const char *relUrl)
880 {
881  // URN cannot be merged
882  if (getScheme() == AnyP::PROTO_URN)
883  return;
884 
885  // TODO: Handle . and .. segment normalization
886 
887  const auto lastSlashPos = path_.rfind('/');
888  // TODO: To optimize and simplify, add and use SBuf::replace().
889  const auto relUrlLength = strlen(relUrl);
890  if (lastSlashPos == SBuf::npos) {
891  // start replacing the whole path
892  path_.reserveCapacity(1 + relUrlLength);
893  path_.assign("/", 1);
894  } else {
895  // start replacing just the last segment
896  path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
897  path_.chop(0, lastSlashPos+1);
898  }
899  path_.append(relUrl, relUrlLength);
900 }
901 
902 int
903 matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
904 {
905  int dl;
906  int hl;
907 
908  const bool hostIncludesSubdomains = (*h == '.');
909  while ('.' == *h)
910  ++h;
911 
912  hl = strlen(h);
913 
914  if (hl == 0)
915  return -1;
916 
917  dl = strlen(d);
918  if (dl == 0)
919  return 1;
920 
921  /*
922  * Start at the ends of the two strings and work towards the
923  * beginning.
924  */
925  while (xtolower(h[--hl]) == xtolower(d[--dl])) {
926  if (hl == 0 && dl == 0) {
927  /*
928  * We made it all the way to the beginning of both
929  * strings without finding any difference.
930  */
931  return 0;
932  }
933 
934  if (0 == hl) {
935  /*
936  * The host string is shorter than the domain string.
937  * There is only one case when this can be a match.
938  * If the domain is just one character longer, and if
939  * that character is a leading '.' then we call it a
940  * match.
941  */
942 
943  if (1 == dl && '.' == d[0])
944  return 0;
945  else
946  return -1;
947  }
948 
949  if (0 == dl) {
950  /*
951  * The domain string is shorter than the host string.
952  * This is a match only if the first domain character
953  * is a leading '.'.
954  */
955 
956  if ('.' == d[0]) {
957  if (flags & mdnRejectSubsubDomains) {
958  // Check for sub-sub domain and reject
959  while(--hl >= 0 && h[hl] != '.');
960  if (hl < 0) {
961  // No sub-sub domain found, but reject if there is a
962  // leading dot in given host string (which is removed
963  // before the check is started).
964  return hostIncludesSubdomains ? 1 : 0;
965  } else
966  return 1; // sub-sub domain, reject
967  } else
968  return 0;
969  } else
970  return 1;
971  }
972  }
973 
974  /*
975  * We found different characters in the same position (from the end).
976  */
977 
978  // If the h has a form of "*.foo.com" and d has a form of "x.foo.com"
979  // then the h[hl] points to '*', h[hl+1] to '.' and d[dl] to 'x'
980  // The following checks are safe, the "h[hl + 1]" in the worst case is '\0'.
981  if ((flags & mdnHonorWildcards) && h[hl] == '*' && h[hl + 1] == '.')
982  return 0;
983 
984  /*
985  * If one of those character is '.' then its special. In order
986  * for splay tree sorting to work properly, "x-foo.com" must
987  * be greater than ".foo.com" even though '-' is less than '.'.
988  */
989  if ('.' == d[dl])
990  return 1;
991 
992  if ('.' == h[hl])
993  return -1;
994 
995  return (xtolower(h[hl]) - xtolower(d[dl]));
996 }
997 
998 /*
999  * return true if we can serve requests for this method.
1000  */
1001 bool
1003 {
1004  /* protocol "independent" methods
1005  *
1006  * actually these methods are specific to HTTP:
1007  * they are methods we receive on our HTTP port,
1008  * and if we had a FTP listener would not be relevant
1009  * there.
1010  *
1011  * So, we should delegate them to HTTP. The problem is that we
1012  * do not have a default protocol from the client side of HTTP.
1013  */
1014 
1015  if (r->method == Http::METHOD_CONNECT)
1016  return true;
1017 
1018  // we support OPTIONS and TRACE directed at us (with a 501 reply, for now)
1019  // we also support forwarding OPTIONS and TRACE, except for the *-URI ones
1021  return (r->header.getInt64(Http::HdrType::MAX_FORWARDS) == 0 || r->url.path() != AnyP::Uri::Asterisk());
1022 
1023  if (r->method == Http::METHOD_PURGE)
1024  return true;
1025 
1026  /* does method match the protocol? */
1027  switch (r->url.getScheme()) {
1028 
1029  case AnyP::PROTO_URN:
1030  case AnyP::PROTO_HTTP:
1031  return true;
1032 
1033  case AnyP::PROTO_FTP:
1034  if (r->method == Http::METHOD_PUT ||
1035  r->method == Http::METHOD_GET ||
1036  r->method == Http::METHOD_HEAD )
1037  return true;
1038  return false;
1039 
1040  case AnyP::PROTO_WAIS:
1041  case AnyP::PROTO_WHOIS:
1042  if (r->method == Http::METHOD_GET ||
1043  r->method == Http::METHOD_HEAD)
1044  return true;
1045  return false;
1046 
1047  case AnyP::PROTO_HTTPS:
1048 #if USE_OPENSSL || HAVE_LIBGNUTLS
1049  return true;
1050 #else
1051  /*
1052  * Squid can't originate an SSL connection, so it should
1053  * never receive an "https:" URL. It should always be
1054  * CONNECT instead.
1055  */
1056  return false;
1057 #endif
1058 
1059  default:
1060  return false;
1061  }
1062 
1063  /* notreached */
1064  return false;
1065 }
1066 
1068  scheme_(aScheme),
1069  hostIsNumeric_(false)
1070 {
1071  *host_=0;
1072 }
1073 
1074 // TODO: fix code duplication with AnyP::Uri::parse()
1075 char *
1076 AnyP::Uri::cleanup(const char *uri)
1077 {
1078  char *cleanedUri = nullptr;
1079  switch (Config.uri_whitespace) {
1080  case URI_WHITESPACE_ALLOW: {
1081  const auto flags = RFC1738_ESCAPE_NOSPACE | RFC1738_ESCAPE_UNESCAPED;
1082  cleanedUri = xstrndup(rfc1738_do_escape(uri, flags), MAX_URL);
1083  break;
1084  }
1085 
1086  case URI_WHITESPACE_ENCODE:
1088  break;
1089 
1090  case URI_WHITESPACE_CHOP: {
1091  const auto pos = strcspn(uri, w_space);
1092  char *choppedUri = nullptr;
1093  if (pos < strlen(uri))
1094  choppedUri = xstrndup(uri, pos + 1);
1095  cleanedUri = xstrndup(rfc1738_do_escape(choppedUri ? choppedUri : uri,
1097  cleanedUri[pos] = '\0';
1098  xfree(choppedUri);
1099  break;
1100  }
1101 
1102  case URI_WHITESPACE_DENY:
1103  case URI_WHITESPACE_STRIP:
1104  default: {
1105  // TODO: avoid duplication with urlParse()
1106  const char *t;
1107  char *tmp_uri = static_cast<char*>(xmalloc(strlen(uri) + 1));
1108  char *q = tmp_uri;
1109  t = uri;
1110  while (*t) {
1111  if (!xisspace(*t)) {
1112  *q = *t;
1113  ++q;
1114  }
1115  ++t;
1116  }
1117  *q = '\0';
1118  cleanedUri = xstrndup(rfc1738_escape_unescaped(tmp_uri), MAX_URL);
1119  xfree(tmp_uri);
1120  break;
1121  }
1122  }
1123 
1124  assert(cleanedUri);
1125  return cleanedUri;
1126 }
1127 
static char * cleanup(const char *uri)
Definition: Uri.cc:1076
#define URI_WHITESPACE_ENCODE
Definition: defines.h:126
bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit=SBuf::npos)
Definition: Tokenizer.cc:79
size_type find(char c, size_type startPos=0) const
Definition: SBuf.cc:584
#define Here()
source code location of the caller
Definition: Here.h:15
static const char valid_hostname_chars_u[]
Definition: Uri.cc:22
@ METHOD_HEAD
Definition: MethodType.h:28
AnyP::Uri url
the request URI
Definition: HttpRequest.h:115
#define xmalloc
char * canonicalCleanUrl() const
Definition: HttpRequest.cc:810
int stringHasCntl(const char *)
Definition: String.cc:301
#define URI_WHITESPACE_STRIP
Definition: defines.h:124
const_reverse_iterator rbegin() const
Definition: SBuf.h:595
#define URI_WHITESPACE_CHOP
Definition: defines.h:127
static AnyP::ProtocolType FindProtocolType(const SBuf &)
Definition: UriScheme.cc:52
char host_[SQUIDHOSTNAMELEN]
string representation of the URI authority name or IP
Definition: Uri.h:180
#define LOCAL_ARRAY(type, name, size)
Definition: squid.h:62
HttpHeader header
Definition: Message.h:74
int check_hostnames
Definition: SquidConfig.h:316
@ mdnHonorWildcards
Definition: Uri.h:232
bool urlCheckRequest(const HttpRequest *r)
Definition: Uri.cc:1002
bool isEmpty() const
Definition: SBuf.h:435
void reserveSpace(size_type minSpace)
Definition: SBuf.h:444
static std::optional< Host > ParseSimpleDomainName(const SBuf &)
Definition: Host.cc:49
@ PROTO_NONE
Definition: ProtocolType.h:24
SBuf hostOrIp() const
Definition: Uri.cc:158
bool parse(const HttpRequestMethod &, const SBuf &url)
Definition: Uri.cc:314
const char * ProtocolType_str[]
Definition: SBuf.h:93
bool atEnd() const
whether the end of the buffer has been reached
Definition: Tokenizer.h:41
void SBufToCstring(char *d, const SBuf &s)
Definition: SBuf.h:756
CharacterSet complement(const char *complementLabel=nullptr) const
Definition: CharacterSet.cc:74
bool fromHost(const char *hostWithoutPort)
Definition: Address.cc:910
const A & max(A const &lhs, A const &rhs)
static const CharacterSet & PathChars()
Characters which are valid within a URI path section.
Definition: Uri.cc:55
bool skip(const SBuf &tokenToSkip)
Definition: Tokenizer.cc:189
char * xstrncpy(char *dst, const char *src, size_t n)
Definition: xstring.cc:37
int matchDomainName(const char *h, const char *d, MatchDomainNameFlags flags)
Definition: Uri.cc:903
#define xtolower(x)
Definition: xis.h:17
#define w_space
static int port
Definition: ldap_backend.cc:70
@ PROTO_UNKNOWN
Definition: ProtocolType.h:41
static const CharacterSet ALPHA
Definition: CharacterSet.h:76
@ METHOD_OPTIONS
Definition: MethodType.h:31
#define MAX_IPSTRLEN
Length of buffer that needs to be allocated to hold a null-terminated IP-string.
Definition: forward.h:25
@ PROTO_URN
Definition: ProtocolType.h:35
Definition: Raw.h:20
#define MAX_URL
Definition: defines.h:76
const SBuf & remaining() const
the remaining unprocessed section of buffer
Definition: Tokenizer.h:44
static const CharacterSet & UserInfoChars()
Characters which are valid within a URI userinfo section.
Definition: Uri.cc:37
#define URI_WHITESPACE_DENY
Definition: defines.h:128
int strip_query_terms
Definition: SquidConfig.h:300
const char * rawContent() const
Definition: SBuf.cc:509
@ PROTO_MAX
Definition: ProtocolType.h:42
void rfc1738_unescape(char *url)
Definition: rfc1738.c:146
#define SQUIDSBUFPRINT(s)
Definition: SBuf.h:32
int parsePort(Parser::Tokenizer &) const
Definition: Uri.cc:690
uint16_t KnownPort
validated/supported port number; these values are never zero
Definition: UriScheme.h:23
char at(size_type pos) const
Definition: SBuf.h:253
static const CharacterSet TCHAR
Definition: CharacterSet.h:105
static const CharacterSet HEXDIG
Definition: CharacterSet.h:88
MatchDomainNameFlags
Definition: Uri.h:230
@ METHOD_CONNECT
Definition: MethodType.h:29
const_iterator begin() const
Definition: SBuf.h:587
int64_t getInt64(Http::HdrType id) const
Definition: HttpHeader.cc:1266
std::optional< Host > parsedHost() const
Definition: Uri.cc:169
static std::optional< Host > ParseIp(const Ip::Address &)
converts an already parsed IP address to a Host object
Definition: Host.cc:15
void parseUrn(Parser::Tokenizer &)
Definition: Uri.cc:589
#define assert(EX)
Definition: assert.h:17
SBuf image() const
Definition: UriScheme.h:57
@ METHOD_PUT
Definition: MethodType.h:27
std::ostream & CurrentException(std::ostream &os)
prints active (i.e., thrown but not yet handled) exception
const AnyP::UriScheme & getScheme() const
Definition: Uri.h:58
@ METHOD_TRACE
Definition: MethodType.h:30
Port defaultPort() const
Definition: UriScheme.cc:71
#define Assure(condition)
Definition: Assure.h:35
static const CharacterSet DIGIT
Definition: CharacterSet.h:84
void port(const Port p)
reset authority port subcomponent
Definition: Uri.h:90
const char * c_str()
Definition: SBuf.cc:516
SBuf & authority(bool requirePort=false) const
Definition: Uri.cc:721
size_type length() const
Returns the number of bytes stored in SBuf.
Definition: SBuf.h:419
char * xstrndup(const char *s, size_t n)
Definition: xstring.cc:56
SBuf & append(const SBuf &S)
Definition: SBuf.cc:185
#define xfree
void addRelativePath(const char *relUrl)
Definition: Uri.cc:879
CharacterSet & rename(const char *label)
change name; handy in const declarations that use operators
Definition: CharacterSet.h:61
int allow_underscore
Definition: SquidConfig.h:317
static const size_type npos
Definition: SBuf.h:100
static const char valid_hostname_chars[]
Definition: Uri.cc:28
@ METHOD_PURGE
Definition: MethodType.h:92
const char * urlCanonicalFakeHttps(const HttpRequest *request)
Definition: Uri.cc:819
CharacterSet & remove(const unsigned char c)
remove a given character from the character set
Definition: CharacterSet.cc:54
bool urlAppendDomain(char *host)
apply append_domain config to the given hostname
Definition: Uri.cc:285
#define rfc1738_escape_unescaped(x)
Definition: rfc1738.h:59
static const SBuf & SlashPath()
the static '/' default URL-path
Definition: Uri.cc:135
void urlInitialize(void)
Definition: Uri.cc:200
SBuf & absolute() const
Definition: Uri.cc:743
@ PROTO_WHOIS
Definition: ProtocolType.h:36
@ PROTO_HTTPS
Definition: ProtocolType.h:27
HttpRequestMethod method
Definition: HttpRequest.h:114
void path(const char *p)
Definition: Uri.h:96
@ PROTO_FTP
Definition: ProtocolType.h:26
@ PROTO_HTTP
Definition: ProtocolType.h:25
void reserveCapacity(size_type minCapacity)
Definition: SBuf.cc:105
Definition: parse.c:160
static SBuf Decode(const SBuf &)
%-decode the given buffer
Definition: Uri.cc:105
@ mdnRejectSubsubDomains
Definition: Uri.h:233
an std::runtime_error with thrower location info
Definition: TextException.h:20
@ PROTO_WAIS
Definition: ProtocolType.h:30
Uri()
Definition: Uri.h:36
#define DBG_IMPORTANT
Definition: Stream.h:38
#define RFC1738_ESCAPE_UNESCAPED
Definition: rfc1738.h:25
#define MYNAME
Definition: Stream.h:219
bool urlIsRelative(const char *url)
Definition: Uri.cc:846
char * rfc1738_do_escape(const char *url, int flags)
Definition: rfc1738.c:56
char * urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &method, const AnyP::UriScheme &scheme)
Definition: Uri.cc:790
SBuf & absolutePath() const
RFC 3986 section 4.2 relative reference called 'absolute-path'.
Definition: Uri.cc:775
optimized set of C chars, with quick membership test and merge support
Definition: CharacterSet.h:17
#define xisspace(x)
Definition: xis.h:15
#define RFC1738_ESCAPE_NOSPACE
Definition: rfc1738.h:22
char * appendDomain
Definition: SquidConfig.h:222
size_t appendDomainLen
Definition: SquidConfig.h:223
const SBuf & path() const
Definition: Uri.cc:188
SBuf & appendf(const char *fmt,...) PRINTF_FORMAT_ARG2
Definition: SBuf.cc:229
@ METHOD_GET
Definition: MethodType.h:25
struct SquidConfig::@90 onoff
constexpr bool Less(const A a, const B b)
whether integer a is less than integer b, with correct overflow handling
Definition: SquidMath.h:48
int stringHasWhitespace(const char *)
Definition: String.cc:294
const char * host(void) const
Definition: Uri.h:76
static const SBuf & Asterisk()
the static '*' pseudo-URI
Definition: Uri.cc:128
void host(const char *src)
Definition: Uri.cc:142
int uri_whitespace
Definition: SquidConfig.h:457
#define debugs(SECTION, LEVEL, CONTENT)
Definition: Stream.h:192
static SBuf Encode(const SBuf &, const CharacterSet &expected)
Definition: Uri.cc:76
SBuf parseHost(Parser::Tokenizer &) const
Definition: Uri.cc:620
#define SQUIDHOSTNAMELEN
Definition: rfc2181.h:30
void touch()
clear the cached URI display forms
Definition: Uri.cc:712
#define SQUIDSBUFPH
Definition: SBuf.h:31
class SquidConfig Config
Definition: SquidConfig.cc:12
static AnyP::UriScheme uriParseScheme(Parser::Tokenizer &tok)
Definition: Uri.cc:255
#define URI_WHITESPACE_ALLOW
Definition: defines.h:125

 

Introduction

Documentation

Support

Miscellaneous