Skip to content

Commit 46e5441

Browse files
committed
Add unicode_strtitle() for Unicode Default Case Conversion.
This brings the titlecasing implementation for the builtin provider out of formatting.c and into unicode_case.c, along with unicode_strlower() and unicode_strupper(). Accepts an arbitrary word boundary callback. Simple for now, but can be extended to support the Unicode Default Case Conversion algorithm with full case mapping.

Discussion: https://postgr.es/m/3bc653b5d562ae9e2838b11cb696816c328a489a.camel@j-davis.com
Reviewed-by: Peter Eisentraut
1 parent a96a8b1 commit 46e5441

File tree

3 files changed

+140
-48
lines changed

3 files changed

+140
-48
lines changed

src/backend/utils/adt/formatting.c

Lines changed: 67 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1922,6 +1922,47 @@ str_toupper(const char *buff, size_t nbytes, Oid collid)
19221922
return result;
19231923
}
19241924

1925+
struct WordBoundaryState
1926+
{
1927+
const char *str;
1928+
size_t len;
1929+
size_t offset;
1930+
bool init;
1931+
bool prev_alnum;
1932+
};
1933+
1934+
/*
1935+
* Simple word boundary iterator that draws boundaries each time the result of
1936+
* pg_u_isalnum() changes.
1937+
*/
1938+
static size_t
1939+
initcap_wbnext(void *state)
1940+
{
1941+
struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
1942+
1943+
while (wbstate->offset < wbstate->len &&
1944+
wbstate->str[wbstate->offset] != '\0')
1945+
{
1946+
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
1947+
wbstate->offset);
1948+
bool curr_alnum = pg_u_isalnum(u, true);
1949+
1950+
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
1951+
{
1952+
size_t prev_offset = wbstate->offset;
1953+
1954+
wbstate->init = true;
1955+
wbstate->offset += unicode_utf8len(u);
1956+
wbstate->prev_alnum = curr_alnum;
1957+
return prev_offset;
1958+
}
1959+
1960+
wbstate->offset += unicode_utf8len(u);
1961+
}
1962+
1963+
return wbstate->len;
1964+
}
1965+
19251966
/*
19261967
* collation-aware, wide-character-aware initcap function
19271968
*
@@ -1980,56 +2021,42 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
19802021
#endif
19812022
if (mylocale && mylocale->provider == COLLPROVIDER_BUILTIN)
19822023
{
1983-
const unsigned char *src = (unsigned char *) buff;
2024+
const char *src = buff;
19842025
size_t srclen = nbytes;
1985-
unsigned char *dst;
19862026
size_t dstsize;
1987-
int srcoff = 0;
1988-
int dstoff = 0;
2027+
char *dst;
2028+
size_t needed;
2029+
struct WordBoundaryState wbstate = {
2030+
.str = src,
2031+
.len = srclen,
2032+
.offset = 0,
2033+
.init = false,
2034+
.prev_alnum = false,
2035+
};
19892036

19902037
Assert(GetDatabaseEncoding() == PG_UTF8);
19912038

1992-
/* overflow paranoia */
1993-
if ((srclen + 1) > (INT_MAX / MAX_MULTIBYTE_CHAR_LEN))
1994-
ereport(ERROR,
1995-
(errcode(ERRCODE_OUT_OF_MEMORY),
1996-
errmsg("out of memory")));
1997-
1998-
/* result is at most srclen codepoints plus terminating NUL */
1999-
dstsize = srclen * MAX_MULTIBYTE_CHAR_LEN + 1;
2000-
dst = (unsigned char *) palloc(dstsize);
2039+
/* first try buffer of equal size plus terminating NUL */
2040+
dstsize = srclen + 1;
2041+
dst = palloc(dstsize);
20012042

2002-
while (srcoff < nbytes)
2043+
needed = unicode_strtitle(dst, dstsize, src, srclen,
2044+
initcap_wbnext, &wbstate);
2045+
if (needed + 1 > dstsize)
20032046
{
2004-
pg_wchar u1 = utf8_to_unicode(src + srcoff);
2005-
pg_wchar u2;
2006-
int u1len = unicode_utf8len(u1);
2007-
int u2len;
2008-
2009-
if (wasalnum)
2010-
u2 = unicode_lowercase_simple(u1);
2011-
else
2012-
u2 = unicode_uppercase_simple(u1);
2047+
/* reset iterator */
2048+
wbstate.offset = 0;
2049+
wbstate.init = false;
20132050

2014-
u2len = unicode_utf8len(u2);
2015-
2016-
Assert(dstoff + u2len + 1 <= dstsize);
2017-
2018-
wasalnum = pg_u_isalnum(u2, true);
2019-
2020-
unicode_to_utf8(u2, dst + dstoff);
2021-
srcoff += u1len;
2022-
dstoff += u2len;
2051+
/* grow buffer if needed and retry */
2052+
dstsize = needed + 1;
2053+
dst = repalloc(dst, dstsize);
2054+
needed = unicode_strtitle(dst, dstsize, src, srclen,
2055+
initcap_wbnext, &wbstate);
2056+
Assert(needed + 1 == dstsize);
20232057
}
20242058

2025-
Assert(dstoff + 1 <= dstsize);
2026-
*(dst + dstoff) = '\0';
2027-
dstoff++;
2028-
2029-
/* allocate result buffer of the right size and free workspace */
2030-
result = palloc(dstoff);
2031-
memcpy(result, dst, dstoff);
2032-
pfree(dst);
2059+
result = dst;
20332060
}
20342061
else
20352062
{

src/common/unicode_case.c

Lines changed: 68 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,9 @@
2121
#include "mb/pg_wchar.h"
2222

2323
static const pg_case_map *find_case_map(pg_wchar ucs);
24-
static size_t convert_case(char *dst, size_t dstsize, const char *src,
25-
ssize_t srclen, CaseKind casekind);
24+
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
25+
CaseKind str_casekind, WordBoundaryNext wbnext,
26+
void *wbstate);
2627

2728
pg_wchar
2829
unicode_lowercase_simple(pg_wchar code)
@@ -67,7 +68,40 @@ unicode_uppercase_simple(pg_wchar code)
6768
size_t
6869
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
6970
{
70-
return convert_case(dst, dstsize, src, srclen, CaseLower);
71+
return convert_case(dst, dstsize, src, srclen, CaseLower, NULL, NULL);
72+
}
73+
74+
/*
75+
* unicode_strtitle()
76+
*
77+
* Convert src to titlecase, and return the result length (not including
78+
* terminating NUL).
79+
*
80+
* String src must be encoded in UTF-8. If srclen < 0, src must be
81+
* NUL-terminated.
82+
*
83+
* Result string is stored in dst, truncating if larger than dstsize. If
84+
* dstsize is greater than the result length, dst will be NUL-terminated;
85+
* otherwise not.
86+
*
87+
* If dstsize is zero, dst may be NULL. This is useful for calculating the
88+
* required buffer size before allocating.
89+
*
90+
* Titlecasing requires knowledge about word boundaries, which is provided by
91+
* the callback wbnext. A word boundary is the offset of the start of a word
92+
* or the offset of the character immediately following a word.
93+
*
94+
* The caller is expected to initialize and free the callback state
95+
* wbstate. The callback should first return offset 0 for the first boundary;
96+
* then the offset of each subsequent word boundary; then the total length of
97+
* the string to indicate the final boundary.
98+
*/
99+
size_t
100+
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
101+
WordBoundaryNext wbnext, void *wbstate)
102+
{
103+
return convert_case(dst, dstsize, src, srclen, CaseTitle, wbnext,
104+
wbstate);
71105
}
72106

73107
/*
@@ -89,30 +123,56 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen)
89123
size_t
90124
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen)
91125
{
92-
return convert_case(dst, dstsize, src, srclen, CaseUpper);
126+
return convert_case(dst, dstsize, src, srclen, CaseUpper, NULL, NULL);
93127
}
94128

95129
/*
96-
* Implement Unicode Default Case Conversion algorithm.
130+
* If str_casekind is CaseLower or CaseUpper, map each character in the string
131+
* for which a mapping is available.
97132
*
98-
* Map each character in the string for which a mapping is available.
133+
* If str_casekind is CaseTitle, maps characters found on a word boundary to
134+
* uppercase and other characters to lowercase.
99135
*/
100136
static size_t
101137
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
102-
CaseKind casekind)
138+
CaseKind str_casekind, WordBoundaryNext wbnext, void *wbstate)
103139
{
140+
/* character CaseKind varies while titlecasing */
141+
CaseKind chr_casekind = str_casekind;
104142
size_t srcoff = 0;
105143
size_t result_len = 0;
144+
size_t boundary = 0;
145+
146+
Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
147+
(str_casekind != CaseTitle && !wbnext && !wbstate));
148+
149+
if (str_casekind == CaseTitle)
150+
{
151+
boundary = wbnext(wbstate);
152+
Assert(boundary == 0); /* start of text is always a boundary */
153+
}
106154

107155
while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
108156
{
109157
pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
110158
int u1len = unicode_utf8len(u1);
111159
const pg_case_map *casemap = find_case_map(u1);
112160

161+
if (str_casekind == CaseTitle)
162+
{
163+
if (srcoff == boundary)
164+
{
165+
chr_casekind = CaseUpper;
166+
boundary = wbnext(wbstate);
167+
}
168+
else
169+
chr_casekind = CaseLower;
170+
}
171+
172+
/* perform mapping, update result_len, and write to dst */
113173
if (casemap)
114174
{
115-
pg_wchar u2 = casemap->simplemap[casekind];
175+
pg_wchar u2 = casemap->simplemap[chr_casekind];
116176
pg_wchar u2len = unicode_utf8len(u2);
117177

118178
if (result_len + u2len <= dstsize)

src/include/common/unicode_case.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,16 @@
1616

1717
#include "mb/pg_wchar.h"
1818

19+
typedef size_t (*WordBoundaryNext) (void *wbstate);
20+
1921
pg_wchar unicode_lowercase_simple(pg_wchar ucs);
2022
pg_wchar unicode_titlecase_simple(pg_wchar ucs);
2123
pg_wchar unicode_uppercase_simple(pg_wchar ucs);
2224
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
2325
ssize_t srclen);
26+
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
27+
ssize_t srclen, WordBoundaryNext wbnext,
28+
void *wbstate);
2429
size_t unicode_strupper(char *dst, size_t dstsize, const char *src,
2530
ssize_t srclen);
2631

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy