CXXR (C++ R)
valid_utf8.h
1 /* Private version of _pcre_valid_utf */
2 
3 /*************************************************
4 * Perl-Compatible Regular Expressions *
5 *************************************************/
6 
7 /* PCRE is a library of functions to support regular expressions whose syntax
8 and semantics are as close as possible to those of the Perl 5 language.
9 
10  Written by Philip Hazel
11  Copyright (c) 1997-2012 University of Cambridge
12 
13 -----------------------------------------------------------------------------
14 Redistribution and use in source and binary forms, with or without
15 modification, are permitted provided that the following conditions are met:
16 
17  * Redistributions of source code must retain the above copyright notice,
18  this list of conditions and the following disclaimer.
19 
20  * Redistributions in binary form must reproduce the above copyright
21  notice, this list of conditions and the following disclaimer in the
22  documentation and/or other materials provided with the distribution.
23 
24  * Neither the name of the University of Cambridge nor the names of its
25  contributors may be used to endorse or promote products derived from
26  this software without specific prior written permission.
27 
28 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 POSSIBILITY OF SUCH DAMAGE.
39 -----------------------------------------------------------------------------
40 */
41 
42 /* This module contains an internal function for validating UTF-8 character
43 strings. */
44 
45 
46 /*************************************************
47 * Validate a UTF-8 string *
48 *************************************************/
49 
50 /* This function is called (optionally) at the start of compile or match, to
51 check that a supposed UTF-8 string is actually valid. The early check means
52 that subsequent code can assume it is dealing with a valid string. The check
53 can be turned off for maximum performance, but the consequences of supplying an
54 invalid string are then undefined.
55 
56 Originally, this function checked according to RFC 2279, allowing for values in
57 the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in
58 the canonical format. Once somebody had pointed out RFC 3629 to me (it
59 obsoletes 2279), additional restrictions were applied. The values are now
60 limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the
61 subrange 0xd800 to 0xdfff is excluded. However, the format of 5-byte and 6-byte
62 characters is still checked.
63 
64 */
65 
66 static int
67 valid_utf8(const char *string, int length)
68 {
69  const char *p;
70 
71  for (p = string; length-- > 0; p++) {
72  int ab, c, d;
73  c = (unsigned char)*p;
74  if (c < 128) continue; /* ASCII character */
75  if (c < 0xc0) return 1; /* Isolated 10xx xxxx byte */
76  if (c >= 0xfe) return 1; /* Invalid 0xfe or 0xff bytes */
77 
78  ab = utf8_table4[c & 0x3f]; /* Number of additional bytes */
79  if (length < ab) return 1;
80  length -= ab; /* Length remaining */
81 
82  /* Check top bits in the second byte */
83 
84  if (((d = *(++p)) & 0xc0) != 0x80) return 1;
85 
86  /* For each length, check that the remaining bytes start with
87  the 0x80 bit set and not the 0x40 bit. Then check for an
88  overlong sequence, and for the excluded range 0xd800 to
89  0xdfff. */
90 
91  switch (ab)
92  {
93  /* 2-byte character. No further bytes to check for
94  0x80. Check first byte for for xx00 000x (overlong
95  sequence). */
96  case 1:
97  if ((c & 0x3e) == 0) return 1;
98  break;
99 
100  /* 3-byte character. Check third byte for 0x80. Then check
101  first 2 bytes for 1110 0000, xx0x xxxx (overlong
102  sequence) or 1110 1101, 1010 xxxx (0xd800 - 0xdfff) */
103  case 2:
104  if ((*(++p) & 0xc0) != 0x80) return 1; /* Third byte */
105  if (c == 0xe0 && (d & 0x20) == 0) return 1;
106  if (c == 0xed && d >= 0xa0) return 1;
107  break;
108 
109  /* 4-byte character. Check 3rd and 4th bytes for
110  0x80. Then check first 2 bytes for for 1111 0000, xx00
111  xxxx (overlong sequence), then check for a character
112  greater than 0x0010ffff (f4 8f bf bf) */
113  case 3:
114  if ((*(++p) & 0xc0) != 0x80) return 1; /* Third byte */
115  if ((*(++p) & 0xc0) != 0x80) return 1; /* Fourth byte */
116  if (c == 0xf0 && (d & 0x30) == 0) return 1;
117  if (c > 0xf4 || (c == 0xf4 && d > 0x8f)) return 1;
118  break;
119 
120  /* 5-byte and 6-byte characters are not allowed by RFC
121  3629, and will be rejected by the length test
122  below. However, we do the appropriate tests here so
123  that overlong sequences get diagnosed, and also in case
124  there is ever an option for handling these larger code
125  points. */
126 
127  /* 5-byte character. Check 3rd, 4th, and 5th bytes for
128  0x80. Then check for 1111 1000, xx00 0xxx */
129  case 4:
130  if ((*(++p) & 0xc0) != 0x80) return 1; /* Third byte */
131  if ((*(++p) & 0xc0) != 0x80) return 1; /* Fourth byte */
132  if ((*(++p) & 0xc0) != 0x80) return 1; /* Fifth byte */
133  if (c == 0xf8 && (d & 0x38) == 0) return 1;
134  break;
135 
136  /* 6-byte character. Check 3rd-6th bytes for 0x80. Then
137  check for 1111 1100, xx00 00xx. */
138  case 5:
139  if ((*(++p) & 0xc0) != 0x80) return 1; /* Third byte */
140  if ((*(++p) & 0xc0) != 0x80) return 1; /* Fourth byte */
141  if ((*(++p) & 0xc0) != 0x80) return 1; /* Fifth byte */
142  if ((*(++p) & 0xc0) != 0x80) return 1; /* Sixth byte */
143  if (c == 0xfc && (d & 0x3c) == 0) return 1;
144  break;
145  }
146 
147  /* Character is valid under RFC 2279, but 4-byte and 5-byte
148  characters are excluded by RFC 3629. The pointer p is
149  currently at the last byte of the character. */
150  if (ab > 3) return 1;
151  }
152 
153  return 0;
154 }