toUtf.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 1996-2023 The Squid Software Foundation and contributors
3  *
4  * Squid software is distributed under GPLv2+ license and includes
5  * contributions from numerous individuals and organizations.
6  * Please see the COPYING and CONTRIBUTORS files for details.
7  */
8 
9 #include "squid.h"
10 #include "auth/toUtf.h"
11 #include "sbuf/SBuf.h"
12 
13 #include <limits>
14 
15 SBuf
16 Latin1ToUtf8(const char *in)
17 {
18  SBuf result;
19 
20  if (!in)
21  return result;
22 
23  for (; *in; in++) {
24  const auto ch = static_cast<unsigned char>(*in);
25 
26  if (ch < 0x80) {
27  result.append(ch);
28  } else {
29  result.append(static_cast<char>((ch >> 6) | 0xc0));
30  result.append(static_cast<char>((ch & 0x3f) | 0x80));
31  }
32  }
33  return result;
34 }
35 
36 SBuf
37 Cp1251ToUtf8(const char *in)
38 {
39  static const unsigned char firstByteMark[] = { 0x00, 0x00, 0xC0, 0xE0 };
40  static const unsigned unicodevalues[] = {
41  0x0402, 0x0403, 0x201A, 0x0453, 0x201E, 0x2026, 0x2020, 0x2021,
42  0x20AC, 0x2030, 0x0409, 0x2039, 0x040A, 0x040C, 0x040B, 0x040F,
43  0x0452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
44  0xFFFD, 0x2122, 0x0459, 0x203A, 0x045A, 0x045C, 0x045B, 0x045F,
45  0x00A0, 0x040E, 0x045E, 0x0408, 0x00A4, 0x0490, 0x00A6, 0x00A7,
46  0x0401, 0x00A9, 0x0404, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x0407,
47  0x00B0, 0x00B1, 0x0406, 0x0456, 0x0491, 0x00B5, 0x00B6, 0x00B7,
48  0x0451, 0x2116, 0x0454, 0x00BB, 0x0458, 0x0405, 0x0455, 0x0457
49  };
50  SBuf result;
51 
52  if (!in)
53  return result;
54 
55  for (; *in; in++) {
56  const auto ch = static_cast<unsigned char>(*in);
57  unsigned u = 0;
58  size_t bytesToWrite = 0;
59  char sequence[4] = {0, 0, 0, 0};
60 
61  static_assert(std::numeric_limits<unsigned char>::max() == 0xFFu,
62  "we require char to be exactly 8 bits");
63  if (ch < 0x80)
64  u = ch;
65  else if (ch >= 0xC0) // 0x0410..0x044F
66  u = 0x0350 + ch;
67  else
68  u = unicodevalues[ch - 0x80];
69 
70  if (u < 0x80)
71  bytesToWrite = 1;
72  else if (u < 0x800)
73  bytesToWrite = 2;
74  else
75  bytesToWrite = 3;
76 
77  switch (bytesToWrite) {
78  case 3:
79  sequence[2] = static_cast<char>(u & 0x3f) | 0x80;
80  u >>= 6;
81  [[fallthrough]];
82  case 2:
83  sequence[1] = static_cast<char>(u & 0x3f) | 0x80;
84  u >>= 6;
85  [[fallthrough]];
86  case 1:
87  sequence[0] = static_cast<char>(u) | firstByteMark[bytesToWrite];
88  }
89  result.append(sequence, bytesToWrite);
90  }
91  return result;
92 }
93 
/// Returns the number of bytes that a UTF-8 sequence starting with lead byte
/// b0 claims to occupy (1..4), or 0 when b0 cannot start a sequence.
/// Only the bit pattern of b0 is examined here; lead bytes such as 0xC0 or
/// 0xF5 still get a non-zero length from this function.
static inline size_t
utf8CodePointLength(const char b0)
{
    // work on the unsigned value so that comparisons do not depend on
    // whether plain char is signed
    const auto lead = static_cast<unsigned char>(b0);

    if (lead < 0x80)
        return 1; // 0xxxxxxx: single-byte (ASCII)
    if (lead < 0xC0)
        return 0; // 10xxxxxx: continuation byte, invalid as a first byte
    if (lead < 0xE0)
        return 2; // 110xxxxx
    if (lead < 0xF0)
        return 3; // 1110xxxx
    if (lead < 0xF8)
        return 4; // 11110xxx
    return 0; // 11111xxx: invalid code point
}
115 
/// Checks whether the `length` bytes starting at `source` form one
/// well-formed UTF-8 sequence.
///
/// Rejects lengths outside 1..4, non-continuation trailing bytes, overlong
/// encodings (lead 0xC0/0xC1, 0xE0 followed by < 0xA0, 0xF0 followed by
/// < 0x90), UTF-16 surrogates (0xED followed by > 0x9F), and values above
/// U+10FFFF (0xF4 followed by > 0x8F, or a lead byte > 0xF4).
///
/// This function does not verify that `length` agrees with the lead byte;
/// the caller is expected to derive `length` from source[0].
///
/// \param source pointer to the first (lead) byte; must address at least
///               `length` readable bytes
/// \param length the number of bytes in the candidate sequence
/// \returns true if the sequence is well-formed, false otherwise
static bool
isValidUtf8CodePoint(const unsigned char* source, const size_t length)
{
    unsigned char a;
    // bytes are examined from the last one back towards the lead byte
    const unsigned char* srcptr = source + length;
    switch (length) {
    default:
        return false; // zero or more than four bytes

    // Every case below falls through to the next one unless it rejects.
    case 4:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
        [[fallthrough]];
    case 3:
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
        [[fallthrough]];
    case 2:
        // The second byte must always be a continuation byte (0x80..0xBF).
        // Checking the lower bound here, rather than only in the inner
        // default branch, also covers lead bytes 0xED and 0xF4, whose
        // branches below only narrow the upper bound.
        if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;

        // some lead bytes further restrict the range of the second byte
        switch (*source) {
        // no fall-through in this inner switch
        case 0xE0:
            if (a < 0xA0) return false; // overlong three-byte form
            break;
        case 0xED:
            if (a > 0x9F) return false; // UTF-16 surrogate range
            break;
        case 0xF0:
            if (a < 0x90) return false; // overlong four-byte form
            break;
        case 0xF4:
            if (a > 0x8F) return false; // beyond U+10FFFF
            break;
        default:
            break; // the generic continuation check above suffices
        }
        [[fallthrough]];

    case 1:
        // 0x80..0xBF are continuation bytes and 0xC0/0xC1 could only start
        // overlong two-byte forms; none of them is a valid lead byte
        if (*source >= 0x80 && *source < 0xC2) return false;
    }
    // lead bytes above 0xF4 would encode values beyond U+10FFFF
    if (*source > 0xF4)
        return false;
    return true;
}
167 
171 bool
172 isValidUtf8String(const char *source, const char *sourceEnd) {
173  while (source < sourceEnd) {
174  const auto length = utf8CodePointLength(*source);
175  if (source + length > sourceEnd || !isValidUtf8CodePoint(reinterpret_cast<const unsigned char*>(source), length))
176  return false;
177  source += length;
178  }
179  return true; // including zero-length input
180 }
181 
Definition: SBuf.h:93
const A & max(A const &lhs, A const &rhs)
SBuf Latin1ToUtf8(const char *in)
converts ISO-LATIN-1 to UTF-8
Definition: toUtf.cc:16
bool isValidUtf8String(const char *source, const char *sourceEnd)
returns whether the given input is a valid (or empty) sequence of UTF-8 code points
Definition: toUtf.cc:172
SBuf & append(const SBuf &S)
Definition: SBuf.cc:185
static bool isValidUtf8CodePoint(const unsigned char *source, const size_t length)
Definition: toUtf.cc:123
static size_t utf8CodePointLength(const char b0)
Definition: toUtf.cc:101
SBuf Cp1251ToUtf8(const char *in)
converts CP1251 to UTF-8
Definition: toUtf.cc:37

 

Introduction

Documentation

Support

Miscellaneous