Skip to content

Commit d3b2e5e

Browse files
committed
Refactor convert_case() to prepare for optimizations.
Upcoming optimizations will add complexity to convert_case(). This patch reorganizes the code slightly so that the complexity can be contained within the logic that converts the case of a single character, rather than being mixed in with the logic that iterates through the string.

Reviewed-by: Alexander Borisov <lex.borisov@gmail.com>
Discussion: https://postgr.es/m/44005c3d-88f4-4a26-981f-fd82dfa8e313@gmail.com
1 parent 3abe9dc commit d3b2e5e

File tree

1 file changed

+101
-52
lines changed

1 file changed

+101
-52
lines changed

src/common/unicode_case.c

Lines changed: 101 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,20 @@
2020
#include "common/unicode_category.h"
2121
#include "mb/pg_wchar.h"
2222

23+
/* Result of casemap(): tells the caller which output, if any, was set. */
enum CaseMapResult
24+
{
25+
CASEMAP_SELF,		/* no mapping found; caller copies the source bytes */
26+
CASEMAP_SIMPLE,		/* single-character result was stored in 'simple' */
27+
CASEMAP_SPECIAL,	/* 'special' points to up to MAX_CASE_EXPANSION characters */
28+
};
29+
2330
static const pg_case_map *find_case_map(pg_wchar ucs);
2431
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
2532
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
2633
void *wbstate);
27-
static bool check_special_conditions(int conditions, const char *str,
28-
size_t len, size_t offset);
34+
/* Map a single character; parameter names match the definition below. */
static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
35+
const char *src, size_t srclen, size_t srcoff,
36+
pg_wchar *simple, const pg_wchar **special);
2937

3038
pg_wchar
3139
unicode_lowercase_simple(pg_wchar code)
@@ -214,8 +222,9 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
214222
{
215223
pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
216224
int u1len = unicode_utf8len(u1);
217-
const pg_case_map *casemap = find_case_map(u1);
218-
const pg_special_case *special = NULL;
225+
pg_wchar simple = 0;
226+
const pg_wchar *special = NULL;
227+
enum CaseMapResult casemap_result;
219228

220229
if (str_casekind == CaseTitle)
221230
{
@@ -228,56 +237,47 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
228237
chr_casekind = CaseLower;
229238
}
230239

231-
/*
232-
* Find special case that matches the conditions, if any.
233-
*
234-
* Note: only a single special mapping per codepoint is currently
235-
* supported, though Unicode allows for multiple special mappings for
236-
* a single codepoint.
237-
*/
238-
if (full && casemap && casemap->special_case)
239-
{
240-
int16 conditions = casemap->special_case->conditions;
241-
242-
Assert(casemap->special_case->codepoint == u1);
243-
if (check_special_conditions(conditions, src, srclen, srcoff))
244-
special = casemap->special_case;
245-
}
240+
casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
241+
&simple, &special);
246242

247-
/* perform mapping, update result_len, and write to dst */
248-
if (special)
243+
switch (casemap_result)
249244
{
250-
for (int i = 0; i < MAX_CASE_EXPANSION; i++)
251-
{
252-
pg_wchar u2 = special->map[chr_casekind][i];
253-
size_t u2len = unicode_utf8len(u2);
254-
255-
if (u2 == '\0')
256-
break;
257-
258-
if (result_len + u2len <= dstsize)
259-
unicode_to_utf8(u2, (unsigned char *) dst + result_len);
260-
261-
result_len += u2len;
262-
}
263-
}
264-
else if (casemap)
265-
{
266-
pg_wchar u2 = casemap->simplemap[chr_casekind];
267-
pg_wchar u2len = unicode_utf8len(u2);
268-
269-
if (result_len + u2len <= dstsize)
270-
unicode_to_utf8(u2, (unsigned char *) dst + result_len);
271-
272-
result_len += u2len;
273-
}
274-
else
275-
{
276-
/* no mapping; copy bytes from src */
277-
if (result_len + u1len <= dstsize)
278-
memcpy(dst + result_len, src + srcoff, u1len);
279-
280-
result_len += u1len;
245+
case CASEMAP_SELF:
246+
/* no mapping; copy bytes from src */
247+
Assert(simple == 0);
248+
Assert(special == NULL);
249+
if (result_len + u1len <= dstsize)
250+
memcpy(dst + result_len, src + srcoff, u1len);
251+
252+
result_len += u1len;
253+
break;
254+
case CASEMAP_SIMPLE:
255+
{
256+
/* replace with single character */
257+
pg_wchar u2 = simple;
258+
pg_wchar u2len = unicode_utf8len(u2);
259+
260+
Assert(special == NULL);
261+
if (result_len + u2len <= dstsize)
262+
unicode_to_utf8(u2, (unsigned char *) dst + result_len);
263+
264+
result_len += u2len;
265+
}
266+
break;
267+
case CASEMAP_SPECIAL:
268+
/* replace with up to MAX_CASE_EXPANSION characters */
269+
Assert(simple == 0);
270+
for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
271+
{
272+
pg_wchar u2 = special[i];
273+
size_t u2len = unicode_utf8len(u2);
274+
275+
if (result_len + u2len <= dstsize)
276+
unicode_to_utf8(u2, (unsigned char *) dst + result_len);
277+
278+
result_len += u2len;
279+
}
280+
break;
281281
}
282282

283283
srcoff += u1len;
@@ -351,6 +351,10 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
351351
return true;
352352
}
353353

354+
/*
355+
* Unicode allows for special casing to be applied only under certain
356+
* circumstances. The only currently-supported condition is Final_Sigma.
357+
*/
354358
static bool
355359
check_special_conditions(int conditions, const char *str, size_t len,
356360
size_t offset)
@@ -365,6 +369,51 @@ check_special_conditions(int conditions, const char *str, size_t len,
365369
return false;
366370
}
367371

372+
/*
373+
* Map the given character to the requested case.
374+
*
375+
* If full is true, and a special case mapping is found and the conditions are
376+
* met, 'special' is set to the mapping result (which is an array of up to
377+
* MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
378+
*
379+
* Otherwise, search for a simple mapping, and if found, set 'simple' to the
380+
* result and return CASEMAP_SIMPLE.
381+
*
382+
* If no mapping is found, return CASEMAP_SELF, and the caller should copy the
383+
* character without modification.
384+
*/
385+
static enum CaseMapResult
386+
casemap(pg_wchar u1, CaseKind casekind, bool full,
387+
const char *src, size_t srclen, size_t srcoff,
388+
pg_wchar *simple, const pg_wchar **special)
389+
{
390+
const pg_case_map *map;
391+
392+
if (u1 < 0x80)	/* below 0x80: index case_map directly, skipping find_case_map() */
393+
{
394+
*simple = case_map[u1].simplemap[casekind];
395+
396+
return CASEMAP_SIMPLE;	/* this branch never reports CASEMAP_SELF or CASEMAP_SPECIAL */
397+
}
398+
399+
map = find_case_map(u1);
400+
401+
if (map == NULL)	/* no entry for u1: neither *simple nor *special is written */
402+
return CASEMAP_SELF;
403+
404+
/* a special mapping is used only when 'full' is set and its conditions hold */
if (full && map->special_case != NULL &&
405+
check_special_conditions(map->special_case->conditions,
406+
src, srclen, srcoff))
407+
{
408+
*special = map->special_case->map[casekind];	/* caller reads up to MAX_CASE_EXPANSION entries */
409+
return CASEMAP_SPECIAL;
410+
}
411+
412+
/* otherwise fall back to the entry's simple (single-character) mapping */
*simple = map->simplemap[casekind];
413+
414+
return CASEMAP_SIMPLE;
415+
}
416+
368417
/* find entry in simple case map, if any */
369418
static const pg_case_map *
370419
find_case_map(pg_wchar ucs)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy