Skip to content

Commit 9acae56

Browse files
committed
Inline basic UTF-8 functions.
Shows a measurable speedup when processing UTF-8 data, such as with the new builtin collation provider.

Discussion: https://postgr.es/m/163f4e2190cdf67f67016044e503c5004547e5a9.camel@j-davis.com
Reviewed-by: Peter Eisentraut
1 parent 2b52086 commit 9acae56

File tree

2 files changed

+61
-61
lines changed

2 files changed

+61
-61
lines changed

src/common/wchar.c

Lines changed: 0 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -476,39 +476,6 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
476 476
}
477 477

478 478

479-
/*
480-
* Map a Unicode code point to UTF-8, writing 1-4 bytes (no terminating NUL) and returning utf8string. utf8string must have at least
481-
* unicode_utf8len(c) bytes available. c is not range-checked: anything above 0xFFFF takes the 4-byte form, keeping only its low 21 bits.
482-
*/
483-
unsigned char *
484-
unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
485-
{
486-
if (c <= 0x7F)
487-
{
488-
utf8string[0] = c;
489-
}
490-
else if (c <= 0x7FF)
491-
{
492-
utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
493-
utf8string[1] = 0x80 | (c & 0x3F);
494-
}
495-
else if (c <= 0xFFFF)
496-
{
497-
utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
498-
utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
499-
utf8string[2] = 0x80 | (c & 0x3F);
500-
}
501-
else
502-
{
503-
utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
504-
utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
505-
utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
506-
utf8string[3] = 0x80 | (c & 0x3F);
507-
}
508-
509-
return utf8string;
510-
}
511-
512 479
/*
513 480
* Trivial conversion from pg_wchar to UTF-8.
514 481
* caller should allocate enough space for "to"
@@ -670,34 +637,6 @@ ucs_wcwidth(pg_wchar ucs)
670 637
return 1;
671 638
}
672 639

673-
/*
674-
* Decode the UTF-8 sequence starting at c into a Unicode code point.
675-
* This is a one-character version of pg_utf2wchar_with_len.
676-
* The high bits of the lead byte select the 1- to 4-byte form; trailing bytes are masked with 0x3f, not validated.
677-
* No error checks here, c must point to a long-enough string; an unrecognized lead byte returns 0xffffffff.
678-
*/
679-
pg_wchar
680-
utf8_to_unicode(const unsigned char *c)
681-
{
682-
if ((*c & 0x80) == 0)
683-
return (pg_wchar) c[0];
684-
else if ((*c & 0xe0) == 0xc0)
685-
return (pg_wchar) (((c[0] & 0x1f) << 6) |
686-
(c[1] & 0x3f));
687-
else if ((*c & 0xf0) == 0xe0)
688-
return (pg_wchar) (((c[0] & 0x0f) << 12) |
689-
((c[1] & 0x3f) << 6) |
690-
(c[2] & 0x3f));
691-
else if ((*c & 0xf8) == 0xf0)
692-
return (pg_wchar) (((c[0] & 0x07) << 18) |
693-
((c[1] & 0x3f) << 12) |
694-
((c[2] & 0x3f) << 6) |
695-
(c[3] & 0x3f));
696-
else
697-
/* not a recognized lead byte: return a deliberately invalid code */
698-
return 0xffffffff;
699-
}
700-
701 640
static int
702 641
pg_utf_dsplen(const unsigned char *s)
703 642
{

src/include/mb/pg_wchar.h

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,67 @@ surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
555 555
return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
556 556
}
557 557

558+
/*
559+
* Decode the UTF-8 sequence starting at c into a Unicode code point.
560+
* This is a one-character version of pg_utf2wchar_with_len.
561+
* The high bits of the lead byte select the 1- to 4-byte form; trailing bytes are masked with 0x3f, not validated.
562+
* No error checks here, c must point to a long-enough string; an unrecognized lead byte returns 0xffffffff.
563+
*/
564+
static inline pg_wchar
565+
utf8_to_unicode(const unsigned char *c)
566+
{
567+
if ((*c & 0x80) == 0)
568+
return (pg_wchar) c[0];
569+
else if ((*c & 0xe0) == 0xc0)
570+
return (pg_wchar) (((c[0] & 0x1f) << 6) |
571+
(c[1] & 0x3f));
572+
else if ((*c & 0xf0) == 0xe0)
573+
return (pg_wchar) (((c[0] & 0x0f) << 12) |
574+
((c[1] & 0x3f) << 6) |
575+
(c[2] & 0x3f));
576+
else if ((*c & 0xf8) == 0xf0)
577+
return (pg_wchar) (((c[0] & 0x07) << 18) |
578+
((c[1] & 0x3f) << 12) |
579+
((c[2] & 0x3f) << 6) |
580+
(c[3] & 0x3f));
581+
else
582+
/* not a recognized lead byte: return a deliberately invalid code */
583+
return 0xffffffff;
584+
}
585+
586+
/*
587+
* Map a Unicode code point to UTF-8, writing 1-4 bytes (no terminating NUL) and returning utf8string. utf8string must have at least
588+
* unicode_utf8len(c) bytes available. c is not range-checked: anything above 0xFFFF takes the 4-byte form, keeping only its low 21 bits.
589+
*/
590+
static inline unsigned char *
591+
unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
592+
{
593+
if (c <= 0x7F)
594+
{
595+
utf8string[0] = c;
596+
}
597+
else if (c <= 0x7FF)
598+
{
599+
utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
600+
utf8string[1] = 0x80 | (c & 0x3F);
601+
}
602+
else if (c <= 0xFFFF)
603+
{
604+
utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
605+
utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
606+
utf8string[2] = 0x80 | (c & 0x3F);
607+
}
608+
else
609+
{
610+
utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
611+
utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
612+
utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
613+
utf8string[3] = 0x80 | (c & 0x3F);
614+
}
615+
616+
return utf8string;
617+
}
618+
558 619
/*
559 620
* Number of bytes needed to represent the given char in UTF8.
560 621
*/

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy