1 /*    utf8.c
2  *
3  *    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4  *    by Larry Wall and others
5  *
6  *    You may distribute under the terms of either the GNU General Public
7  *    License or the Artistic License, as specified in the README file.
8  *
9  */
10 
11 /*
12  * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13  * heard of that we don't want to see any closer; and that's the one place
14  * we're trying to get to!  And that's just where we can't get, nohow.'
15  *
16  * 'Well do I understand your speech,' he answered in the same language;
17  * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
18  * as is the custom in the West, if you wish to be answered?'
19  *
20  * ...the travellers perceived that the floor was paved with stones of many
21  * hues; branching runes and strange devices intertwined beneath their feet.
22  */
23 
24 #include "EXTERN.h"
25 #define PERL_IN_UTF8_C
26 #include "perl.h"
27 
28 static const char unees[] =
29     "Malformed UTF-8 character (unexpected end of string)";
30 
31 /*
32 =head1 Unicode Support
33 
34 This file contains various utility functions for manipulating UTF8-encoded
35 strings. For the uninitiated, this is a method of representing arbitrary
36 Unicode characters as a variable number of bytes, in such a way that
37 characters in the ASCII range are unmodified, and a zero byte never appears
38 within non-zero characters.
39 
40 =for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
41 
42 Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
43 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
44 bytes available. The return value is the pointer to the byte after the
45 end of the new character. In other words,
46 
47     d = uvuni_to_utf8_flags(d, uv, flags);
48 
49 or, in most cases,
50 
51     d = uvuni_to_utf8(d, uv);
52 
53 (which is equivalent to)
54 
55     d = uvuni_to_utf8_flags(d, uv, 0);
56 
57 is the recommended Unicode-aware way of saying
58 
59     *(d++) = uv;
60 
61 =cut
62 */
63 
64 U8 *
Perl_uvuni_to_utf8_flags(pTHX_ U8 * d,UV uv,UV flags)65 Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
66 {
67     if (ckWARN(WARN_UTF8)) {
68 	 if (UNICODE_IS_SURROGATE(uv) &&
69 	     !(flags & UNICODE_ALLOW_SURROGATE))
70 	      Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
71 	 else if (
72 		  ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 		    !(flags & UNICODE_ALLOW_FDD0))
74 		   ||
75 		   ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
76 		    !(flags & UNICODE_ALLOW_FFFF))) &&
77 		  /* UNICODE_ALLOW_SUPER includes
78 		   * FFFEs and FFFFs beyond 0x10FFFF. */
79 		  ((uv <= PERL_UNICODE_MAX) ||
80 		   !(flags & UNICODE_ALLOW_SUPER))
81 		  )
82 	      Perl_warner(aTHX_ packWARN(WARN_UTF8),
83 			 "Unicode character 0x%04"UVxf" is illegal", uv);
84     }
85     if (UNI_IS_INVARIANT(uv)) {
86 	*d++ = (U8)UTF_TO_NATIVE(uv);
87 	return d;
88     }
89 #if defined(EBCDIC)
90     else {
91 	STRLEN len  = UNISKIP(uv);
92 	U8 *p = d+len-1;
93 	while (p > d) {
94 	    *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
95 	    uv >>= UTF_ACCUMULATION_SHIFT;
96 	}
97 	*p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
98 	return d+len;
99     }
100 #else /* Non loop style */
101     if (uv < 0x800) {
102 	*d++ = (U8)(( uv >>  6)         | 0xc0);
103 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
104 	return d;
105     }
106     if (uv < 0x10000) {
107 	*d++ = (U8)(( uv >> 12)         | 0xe0);
108 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
109 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
110 	return d;
111     }
112     if (uv < 0x200000) {
113 	*d++ = (U8)(( uv >> 18)         | 0xf0);
114 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
116 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
117 	return d;
118     }
119     if (uv < 0x4000000) {
120 	*d++ = (U8)(( uv >> 24)         | 0xf8);
121 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
124 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
125 	return d;
126     }
127     if (uv < 0x80000000) {
128 	*d++ = (U8)(( uv >> 30)         | 0xfc);
129 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
133 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
134 	return d;
135     }
136 #ifdef HAS_QUAD
137     if (uv < UTF8_QUAD_MAX)
138 #endif
139     {
140 	*d++ =                            0xfe;	/* Can't match U+FEFF! */
141 	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
146 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
147 	return d;
148     }
149 #ifdef HAS_QUAD
150     {
151 	*d++ =                            0xff;		/* Can't match U+FFFE! */
152 	*d++ =                            0x80;		/* 6 Reserved bits */
153 	*d++ = (U8)(((uv >> 60) & 0x0f) | 0x80);	/* 2 Reserved bits */
154 	*d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 	*d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 	*d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 	*d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 	*d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 	*d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 	*d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 	*d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 	*d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
163 	*d++ = (U8)(( uv        & 0x3f) | 0x80);
164 	return d;
165     }
166 #endif
167 #endif /* Loop style */
168 }
169 
170 U8 *
Perl_uvuni_to_utf8(pTHX_ U8 * d,UV uv)171 Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
172 {
173     return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
174 }
175 
176 /*
177 
178 Tests if some arbitrary number of bytes begins in a valid UTF-8
179 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
180 UTF-8 character.  The actual number of bytes in the UTF-8 character
181 will be returned if it is valid, otherwise 0.
182 
183 This is the "slow" version as opposed to the "fast" version which is
184 the "unrolled" IS_UTF8_CHAR().  E.g. for t/uni/class.t the speed
185 difference is a factor of 2 to 3.  For lengths (UTF8SKIP(s)) of four
186 or less you should use the IS_UTF8_CHAR(), for lengths of five or more
187 you should use the _slow().  In practice this means that the _slow()
188 will be used very rarely, since the maximum Unicode code point (as of
189 Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes.  Only
190 the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
191 five bytes or more.
192 
193 =cut */
194 STATIC STRLEN
S_is_utf8_char_slow(pTHX_ const U8 * s,const STRLEN len)195 S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
196 {
197     U8 u = *s;
198     STRLEN slen;
199     UV uv, ouv;
200 
201     if (UTF8_IS_INVARIANT(u))
202 	return 1;
203 
204     if (!UTF8_IS_START(u))
205 	return 0;
206 
207     if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
208 	return 0;
209 
210     slen = len - 1;
211     s++;
212 #ifdef EBCDIC
213     u = NATIVE_TO_UTF(u);
214 #endif
215     u &= UTF_START_MASK(len);
216     uv  = u;
217     ouv = uv;
218     while (slen--) {
219 	if (!UTF8_IS_CONTINUATION(*s))
220 	    return 0;
221 	uv = UTF8_ACCUMULATE(uv, *s);
222 	if (uv < ouv)
223 	    return 0;
224 	ouv = uv;
225 	s++;
226     }
227 
228     if ((STRLEN)UNISKIP(uv) < len)
229 	return 0;
230 
231     return len;
232 }
233 
234 /*
235 =for apidoc A|STRLEN|is_utf8_char|U8 *s
236 
237 Tests if some arbitrary number of bytes begins in a valid UTF-8
238 character.  Note that an INVARIANT (i.e. ASCII) character is a valid
239 UTF-8 character.  The actual number of bytes in the UTF-8 character
240 will be returned if it is valid, otherwise 0.
241 
242 =cut */
243 STRLEN
Perl_is_utf8_char(pTHX_ U8 * s)244 Perl_is_utf8_char(pTHX_ U8 *s)
245 {
246     const STRLEN len = UTF8SKIP(s);
247 #ifdef IS_UTF8_CHAR
248     if (IS_UTF8_CHAR_FAST(len))
249         return IS_UTF8_CHAR(s, len) ? len : 0;
250 #endif /* #ifdef IS_UTF8_CHAR */
251     return is_utf8_char_slow(s, len);
252 }
253 
254 /*
255 =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
256 
257 Returns true if first C<len> bytes of the given string form a valid
258 UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
259 not mean 'a string that contains code points above 0x7F encoded in UTF-8'
260 because a valid ASCII string is a valid UTF-8 string.
261 
262 See also is_utf8_string_loclen() and is_utf8_string_loc().
263 
264 =cut
265 */
266 
267 bool
Perl_is_utf8_string(pTHX_ U8 * s,STRLEN len)268 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
269 {
270     const U8* x = s;
271     const U8* send;
272 
273     if (!len)
274 	len = strlen((const char *)s);
275     send = s + len;
276 
277     while (x < send) {
278 	STRLEN c;
279 	 /* Inline the easy bits of is_utf8_char() here for speed... */
280 	 if (UTF8_IS_INVARIANT(*x))
281 	      c = 1;
282 	 else if (!UTF8_IS_START(*x))
283 	     goto out;
284 	 else {
285 	      /* ... and call is_utf8_char() only if really needed. */
286 #ifdef IS_UTF8_CHAR
287 	     c = UTF8SKIP(x);
288 	     if (IS_UTF8_CHAR_FAST(c)) {
289 	         if (!IS_UTF8_CHAR(x, c))
290 		     goto out;
291 	     } else if (!is_utf8_char_slow(x, c))
292 	         goto out;
293 #else
294 	     c = is_utf8_char(x);
295 #endif /* #ifdef IS_UTF8_CHAR */
296 	      if (!c)
297 		  goto out;
298 	 }
299         x += c;
300     }
301 
302  out:
303     if (x != send)
304 	return FALSE;
305 
306     return TRUE;
307 }
308 
309 /*
310 =for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
311 
312 Like is_utf8_string() but stores the location of the failure (in the
313 case of "utf8ness failure") or the location s+len (in the case of
314 "utf8ness success") in the C<ep>, and the number of UTF-8
315 encoded characters in the C<el>.
316 
317 See also is_utf8_string_loc() and is_utf8_string().
318 
319 =cut
320 */
321 
322 bool
Perl_is_utf8_string_loclen(pTHX_ const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)323 Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
324 {
325     const U8* x = s;
326     const U8* send;
327     STRLEN c;
328 
329     if (!len)
330         len = strlen((const char *)s);
331     send = s + len;
332     if (el)
333         *el = 0;
334 
335     while (x < send) {
336 	 /* Inline the easy bits of is_utf8_char() here for speed... */
337 	 if (UTF8_IS_INVARIANT(*x))
338 	     c = 1;
339 	 else if (!UTF8_IS_START(*x))
340 	     goto out;
341 	 else {
342 	     /* ... and call is_utf8_char() only if really needed. */
343 #ifdef IS_UTF8_CHAR
344 	     c = UTF8SKIP(x);
345 	     if (IS_UTF8_CHAR_FAST(c)) {
346 	         if (!IS_UTF8_CHAR(x, c))
347 		     c = 0;
348 	     } else
349 	         c = is_utf8_char_slow(x, c);
350 #else
351 	     c = is_utf8_char(x);
352 #endif /* #ifdef IS_UTF8_CHAR */
353 	     if (!c)
354 	         goto out;
355 	 }
356          x += c;
357 	 if (el)
358 	     (*el)++;
359     }
360 
361  out:
362     if (ep)
363         *ep = x;
364     if (x != send)
365 	return FALSE;
366 
367     return TRUE;
368 }
369 
370 /*
371 =for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
372 
373 Like is_utf8_string() but stores the location of the failure (in the
374 case of "utf8ness failure") or the location s+len (in the case of
375 "utf8ness success") in the C<ep>.
376 
377 See also is_utf8_string_loclen() and is_utf8_string().
378 
379 =cut
380 */
381 
382 bool
Perl_is_utf8_string_loc(pTHX_ U8 * s,STRLEN len,U8 ** ep)383 Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **ep)
384 {
385     return is_utf8_string_loclen(s, len, (const U8 **)ep, 0);
386 }
387 
388 /*
389 =for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
390 
391 Bottom level UTF-8 decode routine.
392 Returns the unicode code point value of the first character in the string C<s>
393 which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
394 C<retlen> will be set to the length, in bytes, of that character.
395 
396 If C<s> does not point to a well-formed UTF-8 character, the behaviour
397 is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
398 it is assumed that the caller will raise a warning, and this function
399 will silently just set C<retlen> to C<-1> and return zero.  If the
400 C<flags> does not contain UTF8_CHECK_ONLY, warnings about
401 malformations will be given, C<retlen> will be set to the expected
402 length of the UTF-8 character in bytes, and zero will be returned.
403 
404 The C<flags> can also contain various flags to allow deviations from
405 the strict UTF-8 encoding (see F<utf8.h>).
406 
407 Most code should use utf8_to_uvchr() rather than call this directly.
408 
409 =cut
410 */
411 
412 UV
Perl_utf8n_to_uvuni(pTHX_ U8 * s,STRLEN curlen,STRLEN * retlen,U32 flags)413 Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
414 {
415     const U8 *s0 = s;
416     UV uv = *s, ouv = 0;
417     STRLEN len = 1;
418     const bool dowarn = ckWARN_d(WARN_UTF8);
419     const UV startbyte = *s;
420     STRLEN expectlen = 0;
421     U32 warning = 0;
422 
423 /* This list is a superset of the UTF8_ALLOW_XXX. */
424 
425 #define UTF8_WARN_EMPTY				 1
426 #define UTF8_WARN_CONTINUATION			 2
427 #define UTF8_WARN_NON_CONTINUATION	 	 3
428 #define UTF8_WARN_FE_FF				 4
429 #define UTF8_WARN_SHORT				 5
430 #define UTF8_WARN_OVERFLOW			 6
431 #define UTF8_WARN_SURROGATE			 7
432 #define UTF8_WARN_LONG				 8
433 #define UTF8_WARN_FFFF				 9 /* Also FFFE. */
434 
435     if (curlen == 0 &&
436 	!(flags & UTF8_ALLOW_EMPTY)) {
437 	warning = UTF8_WARN_EMPTY;
438 	goto malformed;
439     }
440 
441     if (UTF8_IS_INVARIANT(uv)) {
442 	if (retlen)
443 	    *retlen = 1;
444 	return (UV) (NATIVE_TO_UTF(*s));
445     }
446 
447     if (UTF8_IS_CONTINUATION(uv) &&
448 	!(flags & UTF8_ALLOW_CONTINUATION)) {
449 	warning = UTF8_WARN_CONTINUATION;
450 	goto malformed;
451     }
452 
453     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
454 	!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
455 	warning = UTF8_WARN_NON_CONTINUATION;
456 	goto malformed;
457     }
458 
459 #ifdef EBCDIC
460     uv = NATIVE_TO_UTF(uv);
461 #else
462     if ((uv == 0xfe || uv == 0xff) &&
463 	!(flags & UTF8_ALLOW_FE_FF)) {
464 	warning = UTF8_WARN_FE_FF;
465 	goto malformed;
466     }
467 #endif
468 
469     if      (!(uv & 0x20))	{ len =  2; uv &= 0x1f; }
470     else if (!(uv & 0x10))	{ len =  3; uv &= 0x0f; }
471     else if (!(uv & 0x08))	{ len =  4; uv &= 0x07; }
472     else if (!(uv & 0x04))	{ len =  5; uv &= 0x03; }
473 #ifdef EBCDIC
474     else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
475     else			{ len =  7; uv &= 0x01; }
476 #else
477     else if (!(uv & 0x02))	{ len =  6; uv &= 0x01; }
478     else if (!(uv & 0x01))	{ len =  7; uv = 0; }
479     else			{ len = 13; uv = 0; } /* whoa! */
480 #endif
481 
482     if (retlen)
483 	*retlen = len;
484 
485     expectlen = len;
486 
487     if ((curlen < expectlen) &&
488 	!(flags & UTF8_ALLOW_SHORT)) {
489 	warning = UTF8_WARN_SHORT;
490 	goto malformed;
491     }
492 
493     len--;
494     s++;
495     ouv = uv;
496 
497     while (len--) {
498 	if (!UTF8_IS_CONTINUATION(*s) &&
499 	    !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
500 	    s--;
501 	    warning = UTF8_WARN_NON_CONTINUATION;
502 	    goto malformed;
503 	}
504 	else
505 	    uv = UTF8_ACCUMULATE(uv, *s);
506 	if (!(uv > ouv)) {
507 	    /* These cannot be allowed. */
508 	    if (uv == ouv) {
509 		if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
510 		    warning = UTF8_WARN_LONG;
511 		    goto malformed;
512 		}
513 	    }
514 	    else { /* uv < ouv */
515 		/* This cannot be allowed. */
516 		warning = UTF8_WARN_OVERFLOW;
517 		goto malformed;
518 	    }
519 	}
520 	s++;
521 	ouv = uv;
522     }
523 
524     if (UNICODE_IS_SURROGATE(uv) &&
525 	!(flags & UTF8_ALLOW_SURROGATE)) {
526 	warning = UTF8_WARN_SURROGATE;
527 	goto malformed;
528     } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
529 	       !(flags & UTF8_ALLOW_LONG)) {
530 	warning = UTF8_WARN_LONG;
531 	goto malformed;
532     } else if (UNICODE_IS_ILLEGAL(uv) &&
533 	       !(flags & UTF8_ALLOW_FFFF)) {
534 	warning = UTF8_WARN_FFFF;
535 	goto malformed;
536     }
537 
538     return uv;
539 
540 malformed:
541 
542     if (flags & UTF8_CHECK_ONLY) {
543 	if (retlen)
544 	    *retlen = -1;
545 	return 0;
546     }
547 
548     if (dowarn) {
549 	SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
550 
551 	switch (warning) {
552 	case 0: /* Intentionally empty. */ break;
553 	case UTF8_WARN_EMPTY:
554 	    Perl_sv_catpv(aTHX_ sv, "(empty string)");
555 	    break;
556 	case UTF8_WARN_CONTINUATION:
557 	    Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
558 	    break;
559 	case UTF8_WARN_NON_CONTINUATION:
560 	    if (s == s0)
561 	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
562                            (UV)s[1], startbyte);
563 	    else {
564 		const int len = (int)(s-s0);
565 	        Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
566                            (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
567 	    }
568 
569 	    break;
570 	case UTF8_WARN_FE_FF:
571 	    Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
572 	    break;
573 	case UTF8_WARN_SHORT:
574 	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
575                            (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
576 	    expectlen = curlen;		/* distance for caller to skip */
577 	    break;
578 	case UTF8_WARN_OVERFLOW:
579 	    Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
580                            ouv, *s, startbyte);
581 	    break;
582 	case UTF8_WARN_SURROGATE:
583 	    Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
584 	    break;
585 	case UTF8_WARN_LONG:
586 	    Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
587 			   (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
588 	    break;
589 	case UTF8_WARN_FFFF:
590 	    Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
591 	    break;
592 	default:
593 	    Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
594 	    break;
595 	}
596 
597 	if (warning) {
598 	    const char * const s = SvPVX_const(sv);
599 
600 	    if (PL_op)
601 		Perl_warner(aTHX_ packWARN(WARN_UTF8),
602 			    "%s in %s", s,  OP_DESC(PL_op));
603 	    else
604 		Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
605 	}
606     }
607 
608     if (retlen)
609 	*retlen = expectlen ? expectlen : len;
610 
611     return 0;
612 }
613 
614 /*
615 =for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
616 
617 Returns the native character value of the first character in the string C<s>
618 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
619 length, in bytes, of that character.
620 
621 If C<s> does not point to a well-formed UTF-8 character, zero is
622 returned and retlen is set, if possible, to -1.
623 
624 =cut
625 */
626 
627 UV
Perl_utf8_to_uvchr(pTHX_ U8 * s,STRLEN * retlen)628 Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
629 {
630     return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
631 			       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
632 }
633 
634 /*
635 =for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
636 
637 Returns the Unicode code point of the first character in the string C<s>
638 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
639 length, in bytes, of that character.
640 
641 This function should only be used when returned UV is considered
642 an index into the Unicode semantic tables (e.g. swashes).
643 
644 If C<s> does not point to a well-formed UTF-8 character, zero is
645 returned and retlen is set, if possible, to -1.
646 
647 =cut
648 */
649 
650 UV
Perl_utf8_to_uvuni(pTHX_ U8 * s,STRLEN * retlen)651 Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
652 {
653     /* Call the low level routine asking for checks */
654     return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
655 			       ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
656 }
657 
658 /*
659 =for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
660 
661 Return the length of the UTF-8 char encoded string C<s> in characters.
662 Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
663 up past C<e>, croaks.
664 
665 =cut
666 */
667 
668 STRLEN
Perl_utf8_length(pTHX_ U8 * s,U8 * e)669 Perl_utf8_length(pTHX_ U8 *s, U8 *e)
670 {
671     STRLEN len = 0;
672 
673     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
674      * the bitops (especially ~) can create illegal UTF-8.
675      * In other words: in Perl UTF-8 is not just for Unicode. */
676 
677     if (e < s)
678 	goto warn_and_return;
679     while (s < e) {
680 	const U8 t = UTF8SKIP(s);
681 	if (e - s < t) {
682 	    warn_and_return:
683 	    if (ckWARN_d(WARN_UTF8)) {
684 	        if (PL_op)
685 		    Perl_warner(aTHX_ packWARN(WARN_UTF8),
686 			    "%s in %s", unees, OP_DESC(PL_op));
687 		else
688 		    Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
689 	    }
690 	    return len;
691 	}
692 	s += t;
693 	len++;
694     }
695 
696     return len;
697 }
698 
699 /*
700 =for apidoc A|IV|utf8_distance|U8 *a|U8 *b
701 
702 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
703 and C<b>.
704 
705 WARNING: use only if you *know* that the pointers point inside the
706 same UTF-8 buffer.
707 
708 =cut
709 */
710 
711 IV
Perl_utf8_distance(pTHX_ U8 * a,U8 * b)712 Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
713 {
714     IV off = 0;
715 
716     /* Note: cannot use UTF8_IS_...() too eagerly here since  e.g.
717      * the bitops (especially ~) can create illegal UTF-8.
718      * In other words: in Perl UTF-8 is not just for Unicode. */
719 
720     if (a < b) {
721 	while (a < b) {
722 	    const U8 c = UTF8SKIP(a);
723 	    if (b - a < c)
724 		goto warn_and_return;
725 	    a += c;
726 	    off--;
727 	}
728     }
729     else {
730 	while (b < a) {
731 	    const U8 c = UTF8SKIP(b);
732 
733 	    if (a - b < c) {
734 		warn_and_return:
735 	        if (ckWARN_d(WARN_UTF8)) {
736 		    if (PL_op)
737 		        Perl_warner(aTHX_ packWARN(WARN_UTF8),
738 				    "%s in %s", unees, OP_DESC(PL_op));
739 		    else
740 		        Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
741 		}
742 		return off;
743 	    }
744 	    b += c;
745 	    off++;
746 	}
747     }
748 
749     return off;
750 }
751 
752 /*
753 =for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
754 
755 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
756 forward or backward.
757 
758 WARNING: do not use the following unless you *know* C<off> is within
759 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
760 on the first byte of character or just after the last byte of a character.
761 
762 =cut
763 */
764 
765 U8 *
Perl_utf8_hop(pTHX_ U8 * s,I32 off)766 Perl_utf8_hop(pTHX_ U8 *s, I32 off)
767 {
768     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
769      * the bitops (especially ~) can create illegal UTF-8.
770      * In other words: in Perl UTF-8 is not just for Unicode. */
771 
772     if (off >= 0) {
773 	while (off--)
774 	    s += UTF8SKIP(s);
775     }
776     else {
777 	while (off++) {
778 	    s--;
779 	    while (UTF8_IS_CONTINUATION(*s))
780 		s--;
781 	}
782     }
783     return (U8 *)s;
784 }
785 
786 /*
787 =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
788 
789 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
790 Unlike C<bytes_to_utf8>, this over-writes the original string, and
791 updates len to contain the new length.
792 Returns zero on failure, setting C<len> to -1.
793 
794 =cut
795 */
796 
797 U8 *
Perl_utf8_to_bytes(pTHX_ U8 * s,STRLEN * len)798 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
799 {
800     U8 *send;
801     U8 *d;
802     U8 *save = s;
803 
804     /* ensure valid UTF-8 and chars < 256 before updating string */
805     for (send = s + *len; s < send; ) {
806         U8 c = *s++;
807 
808         if (!UTF8_IS_INVARIANT(c) &&
809             (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
810 	     || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
811             *len = -1;
812             return 0;
813         }
814     }
815 
816     d = s = save;
817     while (s < send) {
818         STRLEN ulen;
819         *d++ = (U8)utf8_to_uvchr(s, &ulen);
820         s += ulen;
821     }
822     *d = '\0';
823     *len = d - save;
824     return save;
825 }
826 
827 /*
828 =for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
829 
830 Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
831 Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
832 the newly-created string, and updates C<len> to contain the new
833 length.  Returns the original string if no conversion occurs, C<len>
834 is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
835 0 if C<s> is converted or contains all 7bit characters.
836 
837 =cut
838 */
839 
840 U8 *
Perl_bytes_from_utf8(pTHX_ U8 * s,STRLEN * len,bool * is_utf8)841 Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
842 {
843     U8 *d;
844     const U8 *start = s;
845     const U8 *send;
846     I32 count = 0;
847     const U8 *s2;
848 
849     if (!*is_utf8)
850         return (U8 *)start;
851 
852     /* ensure valid UTF-8 and chars < 256 before converting string */
853     for (send = s + *len; s < send;) {
854         U8 c = *s++;
855 	if (!UTF8_IS_INVARIANT(c)) {
856 	    if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
857                 (c = *s++) && UTF8_IS_CONTINUATION(c))
858 		count++;
859 	    else
860                 return (U8 *)start;
861 	}
862     }
863 
864     *is_utf8 = 0;
865 
866     Newxz(d, (*len) - count + 1, U8);
867     s2 = start; start = d;
868     while (s2 < send) {
869 	U8 c = *s2++;
870 	if (!UTF8_IS_INVARIANT(c)) {
871 	    /* Then it is two-byte encoded */
872 	    c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s2++);
873 	    c = ASCII_TO_NATIVE(c);
874 	}
875 	*d++ = c;
876     }
877     *d = '\0';
878     *len = d - start;
879     return (U8 *)start;
880 }
881 
882 /*
883 =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
884 
885 Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
886 Returns a pointer to the newly-created string, and sets C<len> to
887 reflect the new length.
888 
889 If you want to convert to UTF-8 from other encodings than ASCII,
890 see sv_recode_to_utf8().
891 
892 =cut
893 */
894 
895 U8*
Perl_bytes_to_utf8(pTHX_ U8 * s,STRLEN * len)896 Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
897 {
898     const U8 * const send = s + (*len);
899     U8 *d;
900     U8 *dst;
901 
902     Newxz(d, (*len) * 2 + 1, U8);
903     dst = d;
904 
905     while (s < send) {
906         const UV uv = NATIVE_TO_ASCII(*s++);
907         if (UNI_IS_INVARIANT(uv))
908             *d++ = (U8)UTF_TO_NATIVE(uv);
909         else {
910             *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
911             *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
912         }
913     }
914     *d = '\0';
915     *len = d-dst;
916     return dst;
917 }
918 
919 /*
920  * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
921  *
922  * Destination must be pre-extended to 3/2 source.  Do not use in-place.
923  * We optimize for native, for obvious reasons. */
924 
925 U8*
Perl_utf16_to_utf8(pTHX_ U8 * p,U8 * d,I32 bytelen,I32 * newlen)926 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
927 {
928     U8* pend;
929     U8* dstart = d;
930 
931     if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
932 	 d[0] = 0;
933 	 *newlen = 1;
934 	 return d;
935     }
936 
937     if (bytelen & 1)
938 	Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
939 
940     pend = p + bytelen;
941 
942     while (p < pend) {
943 	UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
944 	p += 2;
945 	if (uv < 0x80) {
946 	    *d++ = (U8)uv;
947 	    continue;
948 	}
949 	if (uv < 0x800) {
950 	    *d++ = (U8)(( uv >>  6)         | 0xc0);
951 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
952 	    continue;
953 	}
954 	if (uv >= 0xd800 && uv < 0xdbff) {	/* surrogates */
955 	    UV low = (p[0] << 8) + p[1];
956 	    p += 2;
957 	    if (low < 0xdc00 || low >= 0xdfff)
958 		Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
959 	    uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
960 	}
961 	if (uv < 0x10000) {
962 	    *d++ = (U8)(( uv >> 12)         | 0xe0);
963 	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
964 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
965 	    continue;
966 	}
967 	else {
968 	    *d++ = (U8)(( uv >> 18)         | 0xf0);
969 	    *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
970 	    *d++ = (U8)(((uv >>  6) & 0x3f) | 0x80);
971 	    *d++ = (U8)(( uv        & 0x3f) | 0x80);
972 	    continue;
973 	}
974     }
975     *newlen = d - dstart;
976     return d;
977 }
978 
979 /* Note: this one is slightly destructive of the source. */
980 
981 U8*
Perl_utf16_to_utf8_reversed(pTHX_ U8 * p,U8 * d,I32 bytelen,I32 * newlen)982 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
983 {
984     U8* s = (U8*)p;
985     U8* send = s + bytelen;
986     while (s < send) {
987 	U8 tmp = s[0];
988 	s[0] = s[1];
989 	s[1] = tmp;
990 	s += 2;
991     }
992     return utf16_to_utf8(p, d, bytelen, newlen);
993 }
994 
995 /* for now these are all defined (inefficiently) in terms of the utf8 versions */
996 
997 bool
Perl_is_uni_alnum(pTHX_ UV c)998 Perl_is_uni_alnum(pTHX_ UV c)
999 {
1000     U8 tmpbuf[UTF8_MAXBYTES+1];
1001     uvchr_to_utf8(tmpbuf, c);
1002     return is_utf8_alnum(tmpbuf);
1003 }
1004 
1005 bool
Perl_is_uni_alnumc(pTHX_ UV c)1006 Perl_is_uni_alnumc(pTHX_ UV c)
1007 {
1008     U8 tmpbuf[UTF8_MAXBYTES+1];
1009     uvchr_to_utf8(tmpbuf, c);
1010     return is_utf8_alnumc(tmpbuf);
1011 }
1012 
1013 bool
Perl_is_uni_idfirst(pTHX_ UV c)1014 Perl_is_uni_idfirst(pTHX_ UV c)
1015 {
1016     U8 tmpbuf[UTF8_MAXBYTES+1];
1017     uvchr_to_utf8(tmpbuf, c);
1018     return is_utf8_idfirst(tmpbuf);
1019 }
1020 
1021 bool
Perl_is_uni_alpha(pTHX_ UV c)1022 Perl_is_uni_alpha(pTHX_ UV c)
1023 {
1024     U8 tmpbuf[UTF8_MAXBYTES+1];
1025     uvchr_to_utf8(tmpbuf, c);
1026     return is_utf8_alpha(tmpbuf);
1027 }
1028 
1029 bool
Perl_is_uni_ascii(pTHX_ UV c)1030 Perl_is_uni_ascii(pTHX_ UV c)
1031 {
1032     U8 tmpbuf[UTF8_MAXBYTES+1];
1033     uvchr_to_utf8(tmpbuf, c);
1034     return is_utf8_ascii(tmpbuf);
1035 }
1036 
1037 bool
Perl_is_uni_space(pTHX_ UV c)1038 Perl_is_uni_space(pTHX_ UV c)
1039 {
1040     U8 tmpbuf[UTF8_MAXBYTES+1];
1041     uvchr_to_utf8(tmpbuf, c);
1042     return is_utf8_space(tmpbuf);
1043 }
1044 
1045 bool
Perl_is_uni_digit(pTHX_ UV c)1046 Perl_is_uni_digit(pTHX_ UV c)
1047 {
1048     U8 tmpbuf[UTF8_MAXBYTES+1];
1049     uvchr_to_utf8(tmpbuf, c);
1050     return is_utf8_digit(tmpbuf);
1051 }
1052 
1053 bool
Perl_is_uni_upper(pTHX_ UV c)1054 Perl_is_uni_upper(pTHX_ UV c)
1055 {
1056     U8 tmpbuf[UTF8_MAXBYTES+1];
1057     uvchr_to_utf8(tmpbuf, c);
1058     return is_utf8_upper(tmpbuf);
1059 }
1060 
1061 bool
Perl_is_uni_lower(pTHX_ UV c)1062 Perl_is_uni_lower(pTHX_ UV c)
1063 {
1064     U8 tmpbuf[UTF8_MAXBYTES+1];
1065     uvchr_to_utf8(tmpbuf, c);
1066     return is_utf8_lower(tmpbuf);
1067 }
1068 
1069 bool
Perl_is_uni_cntrl(pTHX_ UV c)1070 Perl_is_uni_cntrl(pTHX_ UV c)
1071 {
1072     U8 tmpbuf[UTF8_MAXBYTES+1];
1073     uvchr_to_utf8(tmpbuf, c);
1074     return is_utf8_cntrl(tmpbuf);
1075 }
1076 
1077 bool
Perl_is_uni_graph(pTHX_ UV c)1078 Perl_is_uni_graph(pTHX_ UV c)
1079 {
1080     U8 tmpbuf[UTF8_MAXBYTES+1];
1081     uvchr_to_utf8(tmpbuf, c);
1082     return is_utf8_graph(tmpbuf);
1083 }
1084 
1085 bool
Perl_is_uni_print(pTHX_ UV c)1086 Perl_is_uni_print(pTHX_ UV c)
1087 {
1088     U8 tmpbuf[UTF8_MAXBYTES+1];
1089     uvchr_to_utf8(tmpbuf, c);
1090     return is_utf8_print(tmpbuf);
1091 }
1092 
1093 bool
Perl_is_uni_punct(pTHX_ UV c)1094 Perl_is_uni_punct(pTHX_ UV c)
1095 {
1096     U8 tmpbuf[UTF8_MAXBYTES+1];
1097     uvchr_to_utf8(tmpbuf, c);
1098     return is_utf8_punct(tmpbuf);
1099 }
1100 
1101 bool
Perl_is_uni_xdigit(pTHX_ UV c)1102 Perl_is_uni_xdigit(pTHX_ UV c)
1103 {
1104     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1105     uvchr_to_utf8(tmpbuf, c);
1106     return is_utf8_xdigit(tmpbuf);
1107 }
1108 
1109 UV
Perl_to_uni_upper(pTHX_ UV c,U8 * p,STRLEN * lenp)1110 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1111 {
1112     uvchr_to_utf8(p, c);
1113     return to_utf8_upper(p, p, lenp);
1114 }
1115 
1116 UV
Perl_to_uni_title(pTHX_ UV c,U8 * p,STRLEN * lenp)1117 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1118 {
1119     uvchr_to_utf8(p, c);
1120     return to_utf8_title(p, p, lenp);
1121 }
1122 
1123 UV
Perl_to_uni_lower(pTHX_ UV c,U8 * p,STRLEN * lenp)1124 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1125 {
1126     uvchr_to_utf8(p, c);
1127     return to_utf8_lower(p, p, lenp);
1128 }
1129 
1130 UV
Perl_to_uni_fold(pTHX_ UV c,U8 * p,STRLEN * lenp)1131 Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1132 {
1133     uvchr_to_utf8(p, c);
1134     return to_utf8_fold(p, p, lenp);
1135 }
1136 
1137 /* for now these all assume no locale info available for Unicode > 255 */
1138 
1139 bool
Perl_is_uni_alnum_lc(pTHX_ UV c)1140 Perl_is_uni_alnum_lc(pTHX_ UV c)
1141 {
1142     return is_uni_alnum(c);	/* XXX no locale support yet */
1143 }
1144 
1145 bool
Perl_is_uni_alnumc_lc(pTHX_ UV c)1146 Perl_is_uni_alnumc_lc(pTHX_ UV c)
1147 {
1148     return is_uni_alnumc(c);	/* XXX no locale support yet */
1149 }
1150 
1151 bool
Perl_is_uni_idfirst_lc(pTHX_ UV c)1152 Perl_is_uni_idfirst_lc(pTHX_ UV c)
1153 {
1154     return is_uni_idfirst(c);	/* XXX no locale support yet */
1155 }
1156 
1157 bool
Perl_is_uni_alpha_lc(pTHX_ UV c)1158 Perl_is_uni_alpha_lc(pTHX_ UV c)
1159 {
1160     return is_uni_alpha(c);	/* XXX no locale support yet */
1161 }
1162 
1163 bool
Perl_is_uni_ascii_lc(pTHX_ UV c)1164 Perl_is_uni_ascii_lc(pTHX_ UV c)
1165 {
1166     return is_uni_ascii(c);	/* XXX no locale support yet */
1167 }
1168 
1169 bool
Perl_is_uni_space_lc(pTHX_ UV c)1170 Perl_is_uni_space_lc(pTHX_ UV c)
1171 {
1172     return is_uni_space(c);	/* XXX no locale support yet */
1173 }
1174 
1175 bool
Perl_is_uni_digit_lc(pTHX_ UV c)1176 Perl_is_uni_digit_lc(pTHX_ UV c)
1177 {
1178     return is_uni_digit(c);	/* XXX no locale support yet */
1179 }
1180 
1181 bool
Perl_is_uni_upper_lc(pTHX_ UV c)1182 Perl_is_uni_upper_lc(pTHX_ UV c)
1183 {
1184     return is_uni_upper(c);	/* XXX no locale support yet */
1185 }
1186 
1187 bool
Perl_is_uni_lower_lc(pTHX_ UV c)1188 Perl_is_uni_lower_lc(pTHX_ UV c)
1189 {
1190     return is_uni_lower(c);	/* XXX no locale support yet */
1191 }
1192 
1193 bool
Perl_is_uni_cntrl_lc(pTHX_ UV c)1194 Perl_is_uni_cntrl_lc(pTHX_ UV c)
1195 {
1196     return is_uni_cntrl(c);	/* XXX no locale support yet */
1197 }
1198 
1199 bool
Perl_is_uni_graph_lc(pTHX_ UV c)1200 Perl_is_uni_graph_lc(pTHX_ UV c)
1201 {
1202     return is_uni_graph(c);	/* XXX no locale support yet */
1203 }
1204 
1205 bool
Perl_is_uni_print_lc(pTHX_ UV c)1206 Perl_is_uni_print_lc(pTHX_ UV c)
1207 {
1208     return is_uni_print(c);	/* XXX no locale support yet */
1209 }
1210 
1211 bool
Perl_is_uni_punct_lc(pTHX_ UV c)1212 Perl_is_uni_punct_lc(pTHX_ UV c)
1213 {
1214     return is_uni_punct(c);	/* XXX no locale support yet */
1215 }
1216 
1217 bool
Perl_is_uni_xdigit_lc(pTHX_ UV c)1218 Perl_is_uni_xdigit_lc(pTHX_ UV c)
1219 {
1220     return is_uni_xdigit(c);	/* XXX no locale support yet */
1221 }
1222 
1223 U32
Perl_to_uni_upper_lc(pTHX_ U32 c)1224 Perl_to_uni_upper_lc(pTHX_ U32 c)
1225 {
1226     /* XXX returns only the first character -- do not use XXX */
1227     /* XXX no locale support yet */
1228     STRLEN len;
1229     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1230     return (U32)to_uni_upper(c, tmpbuf, &len);
1231 }
1232 
1233 U32
Perl_to_uni_title_lc(pTHX_ U32 c)1234 Perl_to_uni_title_lc(pTHX_ U32 c)
1235 {
1236     /* XXX returns only the first character XXX -- do not use XXX */
1237     /* XXX no locale support yet */
1238     STRLEN len;
1239     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1240     return (U32)to_uni_title(c, tmpbuf, &len);
1241 }
1242 
1243 U32
Perl_to_uni_lower_lc(pTHX_ U32 c)1244 Perl_to_uni_lower_lc(pTHX_ U32 c)
1245 {
1246     /* XXX returns only the first character -- do not use XXX */
1247     /* XXX no locale support yet */
1248     STRLEN len;
1249     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1250     return (U32)to_uni_lower(c, tmpbuf, &len);
1251 }
1252 
1253 bool
Perl_is_utf8_alnum(pTHX_ U8 * p)1254 Perl_is_utf8_alnum(pTHX_ U8 *p)
1255 {
1256     if (!is_utf8_char(p))
1257 	return FALSE;
1258     if (!PL_utf8_alnum)
1259 	/* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1260 	 * descendant of isalnum(3), in other words, it doesn't
1261 	 * contain the '_'. --jhi */
1262 	PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1263     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1264 /*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1265 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1266     if (!PL_utf8_alnum)
1267 	PL_utf8_alnum = swash_init("utf8", "",
1268 	    sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1269     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1270 #endif
1271 }
1272 
1273 bool
Perl_is_utf8_alnumc(pTHX_ U8 * p)1274 Perl_is_utf8_alnumc(pTHX_ U8 *p)
1275 {
1276     if (!is_utf8_char(p))
1277 	return FALSE;
1278     if (!PL_utf8_alnumc)
1279 	PL_utf8_alnumc = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1280     return swash_fetch(PL_utf8_alnumc, p, TRUE) != 0;
1281 /*    return is_utf8_alpha(p) || is_utf8_digit(p); */
1282 #ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
1283     if (!PL_utf8_alnum)
1284 	PL_utf8_alnum = swash_init("utf8", "",
1285 	    sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1286     return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
1287 #endif
1288 }
1289 
1290 bool
Perl_is_utf8_idfirst(pTHX_ U8 * p)1291 Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
1292 {
1293     if (*p == '_')
1294 	return TRUE;
1295     if (!is_utf8_char(p))
1296 	return FALSE;
1297     if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1298 	PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
1299     return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
1300 }
1301 
1302 bool
Perl_is_utf8_idcont(pTHX_ U8 * p)1303 Perl_is_utf8_idcont(pTHX_ U8 *p)
1304 {
1305     if (*p == '_')
1306 	return TRUE;
1307     if (!is_utf8_char(p))
1308 	return FALSE;
1309     if (!PL_utf8_idcont)
1310 	PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
1311     return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
1312 }
1313 
1314 bool
Perl_is_utf8_alpha(pTHX_ U8 * p)1315 Perl_is_utf8_alpha(pTHX_ U8 *p)
1316 {
1317     if (!is_utf8_char(p))
1318 	return FALSE;
1319     if (!PL_utf8_alpha)
1320 	PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1321     return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
1322 }
1323 
1324 bool
Perl_is_utf8_ascii(pTHX_ U8 * p)1325 Perl_is_utf8_ascii(pTHX_ U8 *p)
1326 {
1327     if (!is_utf8_char(p))
1328 	return FALSE;
1329     if (!PL_utf8_ascii)
1330 	PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1331     return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
1332 }
1333 
1334 bool
Perl_is_utf8_space(pTHX_ U8 * p)1335 Perl_is_utf8_space(pTHX_ U8 *p)
1336 {
1337     if (!is_utf8_char(p))
1338 	return FALSE;
1339     if (!PL_utf8_space)
1340 	PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1341     return swash_fetch(PL_utf8_space, p, TRUE) != 0;
1342 }
1343 
1344 bool
Perl_is_utf8_digit(pTHX_ U8 * p)1345 Perl_is_utf8_digit(pTHX_ U8 *p)
1346 {
1347     if (!is_utf8_char(p))
1348 	return FALSE;
1349     if (!PL_utf8_digit)
1350 	PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1351     return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
1352 }
1353 
1354 bool
Perl_is_utf8_upper(pTHX_ U8 * p)1355 Perl_is_utf8_upper(pTHX_ U8 *p)
1356 {
1357     if (!is_utf8_char(p))
1358 	return FALSE;
1359     if (!PL_utf8_upper)
1360 	PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
1361     return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
1362 }
1363 
1364 bool
Perl_is_utf8_lower(pTHX_ U8 * p)1365 Perl_is_utf8_lower(pTHX_ U8 *p)
1366 {
1367     if (!is_utf8_char(p))
1368 	return FALSE;
1369     if (!PL_utf8_lower)
1370 	PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
1371     return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
1372 }
1373 
1374 bool
Perl_is_utf8_cntrl(pTHX_ U8 * p)1375 Perl_is_utf8_cntrl(pTHX_ U8 *p)
1376 {
1377     if (!is_utf8_char(p))
1378 	return FALSE;
1379     if (!PL_utf8_cntrl)
1380 	PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1381     return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
1382 }
1383 
1384 bool
Perl_is_utf8_graph(pTHX_ U8 * p)1385 Perl_is_utf8_graph(pTHX_ U8 *p)
1386 {
1387     if (!is_utf8_char(p))
1388 	return FALSE;
1389     if (!PL_utf8_graph)
1390 	PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1391     return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
1392 }
1393 
1394 bool
Perl_is_utf8_print(pTHX_ U8 * p)1395 Perl_is_utf8_print(pTHX_ U8 *p)
1396 {
1397     if (!is_utf8_char(p))
1398 	return FALSE;
1399     if (!PL_utf8_print)
1400 	PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1401     return swash_fetch(PL_utf8_print, p, TRUE) != 0;
1402 }
1403 
1404 bool
Perl_is_utf8_punct(pTHX_ U8 * p)1405 Perl_is_utf8_punct(pTHX_ U8 *p)
1406 {
1407     if (!is_utf8_char(p))
1408 	return FALSE;
1409     if (!PL_utf8_punct)
1410 	PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1411     return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
1412 }
1413 
1414 bool
Perl_is_utf8_xdigit(pTHX_ U8 * p)1415 Perl_is_utf8_xdigit(pTHX_ U8 *p)
1416 {
1417     if (!is_utf8_char(p))
1418 	return FALSE;
1419     if (!PL_utf8_xdigit)
1420 	PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1421     return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
1422 }
1423 
1424 bool
Perl_is_utf8_mark(pTHX_ U8 * p)1425 Perl_is_utf8_mark(pTHX_ U8 *p)
1426 {
1427     if (!is_utf8_char(p))
1428 	return FALSE;
1429     if (!PL_utf8_mark)
1430 	PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1431     return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
1432 }
1433 
1434 /*
1435 =for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1436 
1437 The "p" contains the pointer to the UTF-8 string encoding
1438 the character that is being converted.
1439 
1440 The "ustrp" is a pointer to the character buffer to put the
1441 conversion result to.  The "lenp" is a pointer to the length
1442 of the result.
1443 
1444 The "swashp" is a pointer to the swash to use.
1445 
1446 Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1447 and loaded by SWASHGET, using lib/utf8_heavy.pl.  The special (usually,
1448 but not always, a multicharacter mapping), is tried first.
1449 
1450 The "special" is a string like "utf8::ToSpecLower", which means the
1451 hash %utf8::ToSpecLower.  The access to the hash is through
1452 Perl_to_utf8_case().
1453 
1454 The "normal" is a string like "ToLower" which means the swash
1455 %utf8::ToLower.
1456 
1457 =cut */
1458 
1459 UV
Perl_to_utf8_case(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp,SV ** swashp,char * normal,char * special)1460 Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1461 {
1462     U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1463     STRLEN len = 0;
1464 
1465     const UV uv0 = utf8_to_uvchr(p, NULL);
1466     /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1467      * are necessary in EBCDIC, they are redundant no-ops
1468      * in ASCII-ish platforms, and hopefully optimized away. */
1469     const UV uv1 = NATIVE_TO_UNI(uv0);
1470     uvuni_to_utf8(tmpbuf, uv1);
1471 
1472     if (!*swashp) /* load on-demand */
1473          *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1474 
1475     /* The 0xDF is the only special casing Unicode code point below 0x100. */
1476     if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1477          /* It might be "special" (sometimes, but not always,
1478 	  * a multicharacter mapping) */
1479 	 HV *hv;
1480 	 SV **svp;
1481 
1482 	 if ((hv  = get_hv(special, FALSE)) &&
1483 	     (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1484 	     (*svp)) {
1485 	     const char *s;
1486 
1487 	      s = SvPV_const(*svp, len);
1488 	      if (len == 1)
1489 		   len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1490 	      else {
1491 #ifdef EBCDIC
1492 		   /* If we have EBCDIC we need to remap the characters
1493 		    * since any characters in the low 256 are Unicode
1494 		    * code points, not EBCDIC. */
1495 		   U8 *t = (U8*)s, *tend = t + len, *d;
1496 
1497 		   d = tmpbuf;
1498 		   if (SvUTF8(*svp)) {
1499 			STRLEN tlen = 0;
1500 
1501 			while (t < tend) {
1502 			     UV c = utf8_to_uvchr(t, &tlen);
1503 			     if (tlen > 0) {
1504 				  d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1505 				  t += tlen;
1506 			     }
1507 			     else
1508 				  break;
1509 			}
1510 		   }
1511 		   else {
1512 			while (t < tend) {
1513 			     d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1514 			     t++;
1515 			}
1516 		   }
1517 		   len = d - tmpbuf;
1518 		   Copy(tmpbuf, ustrp, len, U8);
1519 #else
1520 		   Copy(s, ustrp, len, U8);
1521 #endif
1522 	      }
1523 	 }
1524     }
1525 
1526     if (!len && *swashp) {
1527 	 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1528 
1529 	 if (uv2) {
1530 	      /* It was "normal" (a single character mapping). */
1531 	      UV uv3 = UNI_TO_NATIVE(uv2);
1532 
1533 	      len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1534 	 }
1535     }
1536 
1537     if (!len) /* Neither: just copy. */
1538 	 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1539 
1540     if (lenp)
1541 	 *lenp = len;
1542 
1543     return len ? utf8_to_uvchr(ustrp, 0) : 0;
1544 }
1545 
1546 /*
1547 =for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1548 
1549 Convert the UTF-8 encoded character at p to its uppercase version and
1550 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1551 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1552 the uppercase version may be longer than the original character.
1553 
1554 The first character of the uppercased version is returned
1555 (but note, as explained above, that there may be more.)
1556 
1557 =cut */
1558 
1559 UV
Perl_to_utf8_upper(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp)1560 Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1561 {
1562     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1563                              &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1564 }
1565 
1566 /*
1567 =for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1568 
1569 Convert the UTF-8 encoded character at p to its titlecase version and
1570 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1571 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1572 titlecase version may be longer than the original character.
1573 
1574 The first character of the titlecased version is returned
1575 (but note, as explained above, that there may be more.)
1576 
1577 =cut */
1578 
1579 UV
Perl_to_utf8_title(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp)1580 Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1581 {
1582     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1583                              &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1584 }
1585 
1586 /*
1587 =for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1588 
1589 Convert the UTF-8 encoded character at p to its lowercase version and
1590 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1591 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1592 lowercase version may be longer than the original character.
1593 
1594 The first character of the lowercased version is returned
1595 (but note, as explained above, that there may be more.)
1596 
1597 =cut */
1598 
1599 UV
Perl_to_utf8_lower(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp)1600 Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1601 {
1602     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1603                              &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1604 }
1605 
1606 /*
1607 =for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1608 
1609 Convert the UTF-8 encoded character at p to its foldcase version and
1610 store that in UTF-8 in ustrp and its length in bytes in lenp.  Note
1611 that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1612 foldcase version may be longer than the original character (up to
1613 three characters).
1614 
1615 The first character of the foldcased version is returned
1616 (but note, as explained above, that there may be more.)
1617 
1618 =cut */
1619 
1620 UV
Perl_to_utf8_fold(pTHX_ U8 * p,U8 * ustrp,STRLEN * lenp)1621 Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1622 {
1623     return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1624                              &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1625 }
1626 
1627 /* a "swash" is a swatch hash */
1628 
1629 SV*
Perl_swash_init(pTHX_ char * pkg,char * name,SV * listsv,I32 minbits,I32 none)1630 Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1631 {
1632     SV* retval;
1633     SV* const tokenbufsv = sv_newmortal();
1634     dSP;
1635     const size_t pkg_len = strlen(pkg);
1636     const size_t name_len = strlen(name);
1637     HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1638     SV* errsv_save;
1639 
1640     PUSHSTACKi(PERLSI_MAGIC);
1641     ENTER;
1642     SAVEI32(PL_hints);
1643     PL_hints = 0;
1644     save_re_context();
1645     if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) {	/* demand load utf8 */
1646 	ENTER;
1647 	errsv_save = newSVsv(ERRSV);
1648 	Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1649 			 Nullsv);
1650 	if (!SvTRUE(ERRSV))
1651 	    sv_setsv(ERRSV, errsv_save);
1652 	SvREFCNT_dec(errsv_save);
1653 	LEAVE;
1654     }
1655     SPAGAIN;
1656     PUSHMARK(SP);
1657     EXTEND(SP,5);
1658     PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1659     PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1660     PUSHs(listsv);
1661     PUSHs(sv_2mortal(newSViv(minbits)));
1662     PUSHs(sv_2mortal(newSViv(none)));
1663     PUTBACK;
1664     if (IN_PERL_COMPILETIME) {
1665 	/* XXX ought to be handled by lex_start */
1666 	SAVEI32(PL_in_my);
1667 	PL_in_my = 0;
1668 	sv_setpv(tokenbufsv, PL_tokenbuf);
1669     }
1670     errsv_save = newSVsv(ERRSV);
1671     if (call_method("SWASHNEW", G_SCALAR))
1672 	retval = newSVsv(*PL_stack_sp--);
1673     else
1674 	retval = &PL_sv_undef;
1675     if (!SvTRUE(ERRSV))
1676 	sv_setsv(ERRSV, errsv_save);
1677     SvREFCNT_dec(errsv_save);
1678     LEAVE;
1679     POPSTACK;
1680     if (IN_PERL_COMPILETIME) {
1681 	STRLEN len;
1682 	const char* const pv = SvPV_const(tokenbufsv, len);
1683 
1684 	Copy(pv, PL_tokenbuf, len+1, char);
1685 	PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1686     }
1687     if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1688         if (SvPOK(retval))
1689 	    Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1690 		       retval);
1691 	Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1692     }
1693     return retval;
1694 }
1695 
1696 
1697 /* This API is wrong for special case conversions since we may need to
1698  * return several Unicode characters for a single Unicode character
1699  * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1700  * the lower-level routine, and it is similarly broken for returning
1701  * multiple values.  --jhi */
1702 UV
Perl_swash_fetch(pTHX_ SV * sv,U8 * ptr,bool do_utf8)1703 Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1704 {
1705     HV* const hv = (HV*)SvRV(sv);
1706     U32 klen;
1707     U32 off;
1708     STRLEN slen;
1709     STRLEN needents;
1710     const U8 *tmps = NULL;
1711     U32 bit;
1712     SV *retval;
1713     U8 tmputf8[2];
1714     UV c = NATIVE_TO_ASCII(*ptr);
1715 
1716     if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1717         tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1718         tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1719         ptr = tmputf8;
1720     }
1721     /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1722      * then the "swatch" is a vec() for al the chars which start
1723      * with 0xAA..0xYY
1724      * So the key in the hash (klen) is length of encoded char -1
1725      */
1726     klen = UTF8SKIP(ptr) - 1;
1727     off  = ptr[klen];
1728 
1729     if (klen == 0)
1730      {
1731       /* If char in invariant then swatch is for all the invariant chars
1732        * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1733        */
1734       needents = UTF_CONTINUATION_MARK;
1735       off      = NATIVE_TO_UTF(ptr[klen]);
1736      }
1737     else
1738      {
1739       /* If char is encoded then swatch is for the prefix */
1740       needents = (1 << UTF_ACCUMULATION_SHIFT);
1741       off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1742      }
1743 
1744     /*
1745      * This single-entry cache saves about 1/3 of the utf8 overhead in test
1746      * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
1747      * it's nothing to sniff at.)  Pity we usually come through at least
1748      * two function calls to get here...
1749      *
1750      * NB: this code assumes that swatches are never modified, once generated!
1751      */
1752 
1753     if (hv   == PL_last_swash_hv &&
1754 	klen == PL_last_swash_klen &&
1755 	(!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1756     {
1757 	tmps = PL_last_swash_tmps;
1758 	slen = PL_last_swash_slen;
1759     }
1760     else {
1761 	/* Try our second-level swatch cache, kept in a hash. */
1762 	SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1763 
1764 	/* If not cached, generate it via utf8::SWASHGET */
1765 	if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1766 	    dSP;
1767 	    /* We use utf8n_to_uvuni() as we want an index into
1768 	       Unicode tables, not a native character number.
1769 	     */
1770 	    const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1771 					   ckWARN(WARN_UTF8) ?
1772 					   0 : UTF8_ALLOW_ANY);
1773 	    SV *errsv_save;
1774 	    ENTER;
1775 	    SAVETMPS;
1776 	    save_re_context();
1777 	    PUSHSTACKi(PERLSI_MAGIC);
1778 	    PUSHMARK(SP);
1779 	    EXTEND(SP,3);
1780 	    PUSHs((SV*)sv);
1781 	    /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1782 	    PUSHs(sv_2mortal(newSViv((klen) ?
1783 				     (code_point & ~(needents - 1)) : 0)));
1784 	    PUSHs(sv_2mortal(newSViv(needents)));
1785 	    PUTBACK;
1786 	    errsv_save = newSVsv(ERRSV);
1787 	    if (call_method("SWASHGET", G_SCALAR))
1788 		retval = newSVsv(*PL_stack_sp--);
1789 	    else
1790 		retval = &PL_sv_undef;
1791 	    if (!SvTRUE(ERRSV))
1792 		sv_setsv(ERRSV, errsv_save);
1793 	    SvREFCNT_dec(errsv_save);
1794 	    POPSTACK;
1795 	    FREETMPS;
1796 	    LEAVE;
1797 	    if (IN_PERL_COMPILETIME)
1798 		PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1799 
1800 	    svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
1801 
1802 	    if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1803 		Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1804 	}
1805 
1806 	PL_last_swash_hv = hv;
1807 	PL_last_swash_klen = klen;
1808 	/* FIXME change interpvar.h?  */
1809 	PL_last_swash_tmps = (U8 *) tmps;
1810 	PL_last_swash_slen = slen;
1811 	if (klen)
1812 	    Copy(ptr, PL_last_swash_key, klen, U8);
1813     }
1814 
1815     switch ((int)((slen << 3) / needents)) {
1816     case 1:
1817 	bit = 1 << (off & 7);
1818 	off >>= 3;
1819 	return (tmps[off] & bit) != 0;
1820     case 8:
1821 	return tmps[off];
1822     case 16:
1823 	off <<= 1;
1824 	return (tmps[off] << 8) + tmps[off + 1] ;
1825     case 32:
1826 	off <<= 2;
1827 	return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1828     }
1829     Perl_croak(aTHX_ "panic: swash_fetch");
1830     return 0;
1831 }
1832 
1833 
1834 /*
1835 =for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1836 
1837 Adds the UTF-8 representation of the Native codepoint C<uv> to the end
1838 of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
1839 bytes available. The return value is the pointer to the byte after the
1840 end of the new character. In other words,
1841 
1842     d = uvchr_to_utf8(d, uv);
1843 
1844 is the recommended wide native character-aware way of saying
1845 
1846     *(d++) = uv;
1847 
1848 =cut
1849 */
1850 
1851 /* On ASCII machines this is normally a macro but we want a
1852    real function in case XS code wants it
1853 */
1854 #undef Perl_uvchr_to_utf8
1855 U8 *
Perl_uvchr_to_utf8(pTHX_ U8 * d,UV uv)1856 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1857 {
1858     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1859 }
1860 
1861 U8 *
Perl_uvchr_to_utf8_flags(pTHX_ U8 * d,UV uv,UV flags)1862 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1863 {
1864     return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1865 }
1866 
1867 /*
1868 =for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1869 
1870 Returns the native character value of the first character in the string C<s>
1871 which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
1872 length, in bytes, of that character.
1873 
1874 Allows length and flags to be passed to low level routine.
1875 
1876 =cut
1877 */
1878 /* On ASCII machines this is normally a macro but we want
1879    a real function in case XS code wants it
1880 */
1881 #undef Perl_utf8n_to_uvchr
1882 UV
Perl_utf8n_to_uvchr(pTHX_ U8 * s,STRLEN curlen,STRLEN * retlen,U32 flags)1883 Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1884 {
1885     const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1886     return UNI_TO_NATIVE(uv);
1887 }
1888 
1889 /*
1890 =for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1891 
1892 Build to the scalar dsv a displayable version of the string spv,
1893 length len, the displayable version being at most pvlim bytes long
1894 (if longer, the rest is truncated and "..." will be appended).
1895 
1896 The flags argument can have UNI_DISPLAY_ISPRINT set to display
1897 isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1898 to display the \\[nrfta\\] as the backslashed versions (like '\n')
1899 (UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1900 UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1901 UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1902 
1903 The pointer to the PV of the dsv is returned.
1904 
1905 =cut */
1906 char *
Perl_pv_uni_display(pTHX_ SV * dsv,U8 * spv,STRLEN len,STRLEN pvlim,UV flags)1907 Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1908 {
1909     int truncated = 0;
1910     const char *s, *e;
1911 
1912     sv_setpvn(dsv, "", 0);
1913     for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1914 	 UV u;
1915 	  /* This serves double duty as a flag and a character to print after
1916 	     a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1917 	  */
1918 	 char ok = 0;
1919 
1920 	 if (pvlim && SvCUR(dsv) >= pvlim) {
1921 	      truncated++;
1922 	      break;
1923 	 }
1924 	 u = utf8_to_uvchr((U8*)s, 0);
1925 	 if (u < 256) {
1926 	     const unsigned char c = (unsigned char)u & 0xFF;
1927 	     if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1928 	         switch (c) {
1929 		 case '\n':
1930 		     ok = 'n'; break;
1931 		 case '\r':
1932 		     ok = 'r'; break;
1933 		 case '\t':
1934 		     ok = 't'; break;
1935 		 case '\f':
1936 		     ok = 'f'; break;
1937 		 case '\a':
1938 		     ok = 'a'; break;
1939 		 case '\\':
1940 		     ok = '\\'; break;
1941 		 default: break;
1942 		 }
1943 		 if (ok) {
1944 		     Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1945 		 }
1946 	     }
1947 	     /* isPRINT() is the locale-blind version. */
1948 	     if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1949 	         Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1950 		 ok = 1;
1951 	     }
1952 	 }
1953 	 if (!ok)
1954 	     Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1955     }
1956     if (truncated)
1957 	 sv_catpvn(dsv, "...", 3);
1958 
1959     return SvPVX(dsv);
1960 }
1961 
1962 /*
1963 =for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1964 
1965 Build to the scalar dsv a displayable version of the scalar sv,
1966 the displayable version being at most pvlim bytes long
1967 (if longer, the rest is truncated and "..." will be appended).
1968 
1969 The flags argument is as in pv_uni_display().
1970 
1971 The pointer to the PV of the dsv is returned.
1972 
1973 =cut */
1974 char *
Perl_sv_uni_display(pTHX_ SV * dsv,SV * ssv,STRLEN pvlim,UV flags)1975 Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1976 {
1977      return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX_const(ssv),
1978 				SvCUR(ssv), pvlim, flags);
1979 }
1980 
1981 /*
1982 =for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1983 
1984 Return true if the strings s1 and s2 differ case-insensitively, false
1985 if not (if they are equal case-insensitively).  If u1 is true, the
1986 string s1 is assumed to be in UTF-8-encoded Unicode.  If u2 is true,
1987 the string s2 is assumed to be in UTF-8-encoded Unicode.  If u1 or u2
1988 are false, the respective string is assumed to be in native 8-bit
1989 encoding.
1990 
1991 If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1992 in there (they will point at the beginning of the I<next> character).
1993 If the pointers behind pe1 or pe2 are non-NULL, they are the end
1994 pointers beyond which scanning will not continue under any
1995 circumstances.  If the byte lengths l1 and l2 are non-zero, s1+l1 and
1996 s2+l2 will be used as goal end pointers that will also stop the scan,
1997 and which qualify towards defining a successful match: all the scans
1998 that define an explicit length must reach their goal pointers for
1999 a match to succeed).
2000 
2001 For case-insensitiveness, the "casefolding" of Unicode is used
2002 instead of upper/lowercasing both the characters, see
2003 http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2004 
2005 =cut */
2006 I32
Perl_ibcmp_utf8(pTHX_ const char * s1,char ** pe1,register UV l1,bool u1,const char * s2,char ** pe2,register UV l2,bool u2)2007 Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2008 {
2009      register const U8 *p1  = (const U8*)s1;
2010      register const U8 *p2  = (const U8*)s2;
2011      register const U8 *f1 = 0, *f2 = 0;
2012      register U8 *e1 = 0, *q1 = 0;
2013      register U8 *e2 = 0, *q2 = 0;
2014      STRLEN n1 = 0, n2 = 0;
2015      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2016      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2017      U8 natbuf[1+1];
2018      STRLEN foldlen1, foldlen2;
2019      bool match;
2020 
2021      if (pe1)
2022 	  e1 = *(U8**)pe1;
2023      if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2024 	  f1 = (const U8*)s1 + l1;
2025      if (pe2)
2026 	  e2 = *(U8**)pe2;
2027      if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2028 	  f2 = (const U8*)s2 + l2;
2029 
2030      if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2031 	  return 1; /* mismatch; possible infinite loop or false positive */
2032 
2033      if (!u1 || !u2)
2034 	  natbuf[1] = 0; /* Need to terminate the buffer. */
2035 
2036      while ((e1 == 0 || p1 < e1) &&
2037 	    (f1 == 0 || p1 < f1) &&
2038 	    (e2 == 0 || p2 < e2) &&
2039 	    (f2 == 0 || p2 < f2)) {
2040 	  if (n1 == 0) {
2041 	       if (u1)
2042 		    to_utf8_fold((U8 *)p1, foldbuf1, &foldlen1);
2043 	       else {
2044 		    uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2045 		    to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2046 	       }
2047 	       q1 = foldbuf1;
2048 	       n1 = foldlen1;
2049 	  }
2050 	  if (n2 == 0) {
2051 	       if (u2)
2052 		    to_utf8_fold((U8 *)p2, foldbuf2, &foldlen2);
2053 	       else {
2054 		    uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2055 		    to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2056 	       }
2057 	       q2 = foldbuf2;
2058 	       n2 = foldlen2;
2059 	  }
2060 	  while (n1 && n2) {
2061 	       if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2062 		   (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2063 		    memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2064 		   return 1; /* mismatch */
2065 	       n1 -= UTF8SKIP(q1);
2066 	       q1 += UTF8SKIP(q1);
2067 	       n2 -= UTF8SKIP(q2);
2068 	       q2 += UTF8SKIP(q2);
2069 	  }
2070 	  if (n1 == 0)
2071 	       p1 += u1 ? UTF8SKIP(p1) : 1;
2072 	  if (n2 == 0)
2073 	       p2 += u2 ? UTF8SKIP(p2) : 1;
2074 
2075      }
2076 
2077      /* A match is defined by all the scans that specified
2078       * an explicit length reaching their final goals. */
2079      match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2080 
2081      if (match) {
2082 	  if (pe1)
2083 	       *pe1 = (char*)p1;
2084 	  if (pe2)
2085 	       *pe2 = (char*)p2;
2086      }
2087 
2088      return match ? 0 : 1; /* 0 match, 1 mismatch */
2089 }
2090 
2091 /*
2092  * Local variables:
2093  * c-indentation-style: bsd
2094  * c-basic-offset: 4
2095  * indent-tabs-mode: t
2096  * End:
2097  *
2098  * ex: set ts=8 sts=4 sw=4 noet:
2099  */
2100