Skip to content

Commit 78d523b

Browse files
committed
Improve make_greater_string() with encoding-specific incrementers.
This infrastructure doesn't in any way guarantee that the character we produce will sort after the one we incremented; but it does at least make it much more likely that we'll end up with something that is a valid character, which improves our chances. Kyotaro Horiguchi, with various adjustments by me.
1 parent 51eba98 commit 78d523b

File tree

3 files changed

+297
-28
lines changed

3 files changed

+297
-28
lines changed

src/backend/utils/adt/selfuncs.c

Lines changed: 37 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5665,6 +5665,19 @@ pattern_selectivity(Const *patt, Pattern_Type ptype)
56655665
}
56665666

56675667

5668+
/*
5669+
* For bytea, the increment function need only increment the current byte
5670+
* (there are no multibyte characters to worry about).
5671+
*/
5672+
static bool
5673+
byte_increment(unsigned char *ptr, int len)
5674+
{
5675+
if (*ptr >= 255)
5676+
return false;
5677+
(*ptr)++;
5678+
return true;
5679+
}
5680+
56685681
/*
56695682
* Try to generate a string greater than the given string or any
56705683
* string it is a prefix of. If successful, return a palloc'd string
@@ -5704,6 +5717,7 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
57045717
int len;
57055718
Datum cmpstr;
57065719
text *cmptxt = NULL;
5720+
mbcharacter_incrementer charinc;
57075721

57085722
/*
57095723
* Get a modifiable copy of the prefix string in C-string format, and set
@@ -5765,29 +5779,33 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
57655779
}
57665780
}
57675781

5782+
if (datatype == BYTEAOID)
5783+
charinc = &byte_increment;
5784+
else
5785+
charinc = pg_database_encoding_character_incrementer();
5786+
57685787
while (len > 0)
57695788
{
5770-
unsigned char *lastchar = (unsigned char *) (workstr + len - 1);
5771-
unsigned char savelastchar = *lastchar;
5789+
int charlen;
5790+
unsigned char *lastchar;
5791+
Const *workstr_const;
5792+
5793+
if (datatype == BYTEAOID)
5794+
charlen = 1;
5795+
else
5796+
charlen = len - pg_mbcliplen(workstr, len, len - 1);
5797+
lastchar = (unsigned char *) (workstr + len - charlen);
57725798

57735799
/*
5774-
* Try to generate a larger string by incrementing the last byte.
5800+
* Try to generate a larger string by incrementing the last character
5801+
* (for BYTEA, we treat each byte as a character).
57755802
*/
5776-
while (*lastchar < (unsigned char) 255)
5803+
if (charinc(lastchar, charlen))
57775804
{
5778-
Const *workstr_const;
5779-
5780-
(*lastchar)++;
5781-
5782-
if (datatype != BYTEAOID)
5783-
{
5784-
/* do not generate invalid encoding sequences */
5785-
if (!pg_verifymbstr(workstr, len, true))
5786-
continue;
5787-
workstr_const = string_to_const(workstr, datatype);
5788-
}
5789-
else
5805+
if (datatype == BYTEAOID)
57905806
workstr_const = string_to_bytea_const(workstr, len);
5807+
else
5808+
workstr_const = string_to_const(workstr, datatype);
57915809

57925810
if (DatumGetBool(FunctionCall2Coll(ltproc,
57935811
collation,
@@ -5806,20 +5824,11 @@ make_greater_string(const Const *str_const, FmgrInfo *ltproc, Oid collation)
58065824
pfree(workstr_const);
58075825
}
58085826

5809-
/* restore last byte so we don't confuse pg_mbcliplen */
5810-
*lastchar = savelastchar;
5811-
58125827
/*
5813-
* Truncate off the last character, which might be more than 1 byte,
5814-
* depending on the character encoding.
5828+
* Truncate off the last character or byte.
58155829
*/
5816-
if (datatype != BYTEAOID && pg_database_encoding_max_length() > 1)
5817-
len = pg_mbcliplen(workstr, len, len - 1);
5818-
else
5819-
len -= 1;
5820-
5821-
if (datatype != BYTEAOID)
5822-
workstr[len] = '\0';
5830+
len -= charlen;
5831+
workstr[len] = '\0';
58235832
}
58245833

58255834
/* Failed... */

src/backend/utils/mb/wchar.c

Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1334,6 +1334,244 @@ pg_utf8_islegal(const unsigned char *source, int length)
13341334
return true;
13351335
}
13361336

1337+
#ifndef FRONTEND
1338+
1339+
/*
1340+
* Generic character increment function.
1341+
*
1342+
* Not knowing anything about the properties of the encoding in use, we just
1343+
* keep incrementing the last byte until pg_verifymbstr() likes the result,
1344+
* or we run out of values to try.
1345+
*
1346+
* Like all character-increment functions, we must restore the original input
1347+
* string on failure.
1348+
*/
1349+
static bool
1350+
pg_generic_charinc(unsigned char *charptr, int len)
1351+
{
1352+
unsigned char *lastchar = (unsigned char *) (charptr + len - 1);
1353+
unsigned char savelastchar = *lastchar;
1354+
const char *const_charptr = (const char *)charptr;
1355+
1356+
while (*lastchar < (unsigned char) 255)
1357+
{
1358+
(*lastchar)++;
1359+
if (!pg_verifymbstr(const_charptr, len, true))
1360+
continue;
1361+
return true;
1362+
}
1363+
1364+
*lastchar = savelastchar;
1365+
return false;
1366+
}
1367+
1368+
/*
1369+
* UTF-8 character increment function.
1370+
*
1371+
* For a one-byte character less than 0x7F, we just increment the byte.
1372+
*
1373+
* For a multibyte character, every byte but the first must fall between 0x80
1374+
* and 0xBF; and the first byte must be between 0xC0 and 0xF4. We increment
1375+
* the last byte that's not already at its maximum value, and set any following
1376+
* bytes back to 0x80. If we can't find a byte that's less than the maximum
1377+
* allowable value, we simply fail. We also have some special-case logic to
1378+
* skip regions used for surrogate pair handling, as those should not occur in
1379+
* valid UTF-8.
1380+
*
1381+
* Like all character-increment functions, we must restore the original input
1382+
* string on failure.
1383+
*/
1384+
static bool
1385+
pg_utf8_increment(unsigned char *charptr, int length)
1386+
{
1387+
unsigned char a;
1388+
unsigned char bak[4];
1389+
unsigned char limit;
1390+
1391+
switch (length)
1392+
{
1393+
default:
1394+
/* reject lengths 5 and 6 for now */
1395+
return false;
1396+
case 4:
1397+
bak[3] = charptr[3];
1398+
a = charptr[3];
1399+
if (a < 0xBF)
1400+
{
1401+
charptr[3]++;
1402+
break;
1403+
}
1404+
charptr[3] = 0x80;
1405+
/* FALL THRU */
1406+
case 3:
1407+
bak[2] = charptr[2];
1408+
a = charptr[2];
1409+
if (a < 0xBF)
1410+
{
1411+
charptr[2]++;
1412+
break;
1413+
}
1414+
charptr[2] = 0x80;
1415+
/* FALL THRU */
1416+
case 2:
1417+
bak[1] = charptr[1];
1418+
a = charptr[1];
1419+
switch (*charptr)
1420+
{
1421+
case 0xED:
1422+
limit = 0x9F;
1423+
break;
1424+
case 0xF4:
1425+
limit = 0x8F;
1426+
break;
1427+
default:
1428+
limit = 0xBF;
1429+
break;
1430+
}
1431+
if (a < limit)
1432+
{
1433+
charptr[1]++;
1434+
break;
1435+
}
1436+
charptr[1] = 0x80;
1437+
/* FALL THRU */
1438+
case 1:
1439+
bak[0] = *charptr;
1440+
a = *charptr;
1441+
if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF4)
1442+
{
1443+
/* Restore original string. */
1444+
memcpy(charptr, bak, length);
1445+
return false;
1446+
}
1447+
charptr[0]++;
1448+
break;
1449+
}
1450+
1451+
return true;
1452+
}
1453+
1454+
/*
1455+
* EUC-JP character increment function.
1456+
*
1457+
* If the sequence starts with SS2(0x8e), it must be a two-byte sequence
1458+
* representing JIS X 0201 characters with the second byte ranging between
1459+
* 0xa1 and 0xdf. We just increment the last byte if it's less than 0xdf,
1460+
* and otherwise rewrite the whole sequence to 0xa1 0xa1.
1461+
*
1462+
* If the sequence starts with SS3(0x8f), it must be a three-byte sequence
1463+
* in which the last two bytes range between 0xa1 and 0xfe. The last byte
1464+
* is incremented, carrying overflow to the second-to-last byte.
1465+
*
1466+
* If the sequence starts with a value other than the above and its MSB
1467+
* is set, it must be a two-byte sequence representing JIS X 0208 characters
1468+
* with both bytes ranging between 0xa1 and 0xfe. The last byte is incremented,
1469+
* carrying overflow to the second-to-last byte.
1470+
*
1471+
* Otherwise the sequence consists of a single byte representing ASCII
1472+
* characters. It is incremented up to 0x7f.
1473+
*
1474+
* Only three EUC-JP byte sequences shown below - which have no character
1475+
* allocated - make this function fail in spite of their validity: 0x7f,
1476+
* 0xfe 0xfe, 0x8f 0xfe 0xfe.
1477+
*/
1478+
static bool
1479+
pg_eucjp_increment(unsigned char *charptr, int length)
1480+
{
1481+
unsigned char bak[3];
1482+
unsigned char c1, c2;
1483+
signed int i;
1484+
1485+
c1 = *charptr;
1486+
1487+
switch (c1)
1488+
{
1489+
case SS2: /* JIS X 0201 */
1490+
if (length != 2)
1491+
return false;
1492+
1493+
c2 = charptr[1];
1494+
1495+
if (c2 > 0xde)
1496+
charptr[0] = charptr[1] = 0xa1;
1497+
else if (c2 < 0xa1)
1498+
charptr[1] = 0xa1;
1499+
else
1500+
charptr[1]++;
1501+
1502+
break;
1503+
1504+
case SS3: /* JIS X 0212 */
1505+
if (length != 3)
1506+
return false;
1507+
1508+
for (i = 2; i > 0; i--)
1509+
{
1510+
bak[i] = charptr[i];
1511+
c2 = charptr[i];
1512+
if (c2 < 0xa1)
1513+
{
1514+
charptr[i] = 0xa1;
1515+
return true;
1516+
}
1517+
else if (c2 < 0xfe)
1518+
{
1519+
charptr[i]++;
1520+
break;
1521+
}
1522+
charptr[i] = 0xa1;
1523+
}
1524+
1525+
if (i == 0) /* Out of 3-byte code region */
1526+
{
1527+
charptr[1] = bak[1];
1528+
charptr[2] = bak[2];
1529+
return false;
1530+
}
1531+
break;
1532+
1533+
default:
1534+
if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
1535+
{
1536+
if (length != 2)
1537+
return false;
1538+
1539+
for (i = 1 ; i >= 0 ; i--) /* i must be signed */
1540+
{
1541+
bak[i] = charptr[i];
1542+
c2 = charptr[i];
1543+
if (c2 < 0xa1)
1544+
{
1545+
charptr[i] = 0xa1;
1546+
return true;
1547+
}
1548+
else if (c2 < 0xfe)
1549+
{
1550+
charptr[i]++;
1551+
break;
1552+
}
1553+
charptr[i] = 0xa1;
1554+
}
1555+
1556+
if (i < 0) /* Out of 2 byte code region */
1557+
{
1558+
charptr[0] = bak[0];
1559+
charptr[1] = bak[1];
1560+
return false;
1561+
}
1562+
}
1563+
else
1564+
{ /* ASCII, single byte */
1565+
if (c1 > 0x7e)
1566+
return false;
1567+
(*charptr)++;
1568+
}
1569+
}
1570+
1571+
return true;
1572+
}
1573+
#endif
1574+
13371575
/*
13381576
*-------------------------------------------------------------------
13391577
* encoding info table
@@ -1458,6 +1696,25 @@ pg_database_encoding_max_length(void)
14581696
return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
14591697
}
14601698

1699+
/*
1700+
* Return the character incrementer for the current database's encoding.
1701+
*/
1702+
mbcharacter_incrementer
1703+
pg_database_encoding_character_incrementer(void)
1704+
{
1705+
switch (GetDatabaseEncoding())
1706+
{
1707+
case PG_UTF8:
1708+
return pg_utf8_increment;
1709+
1710+
case PG_EUC_JP:
1711+
return pg_eucjp_increment;
1712+
1713+
default:
1714+
return pg_generic_charinc;
1715+
}
1716+
}
1717+
14611718
/*
14621719
* Verify mbstr to make sure that it is validly encoded in the current
14631720
* database encoding. Otherwise same as pg_verify_mbstr().

src/include/mb/pg_wchar.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,8 @@ typedef int (*mblen_converter) (const unsigned char *mbstr);
284284

285285
typedef int (*mbdisplaylen_converter) (const unsigned char *mbstr);
286286

287+
typedef bool (*mbcharacter_incrementer) (unsigned char *mbstr, int len);
288+
287289
typedef int (*mbverifier) (const unsigned char *mbstr, int len);
288290

289291
typedef struct
@@ -389,6 +391,7 @@ extern int pg_encoding_mbcliplen(int encoding, const char *mbstr,
389391
extern int pg_mbcharcliplen(const char *mbstr, int len, int imit);
390392
extern int pg_encoding_max_length(int encoding);
391393
extern int pg_database_encoding_max_length(void);
394+
extern mbcharacter_incrementer pg_database_encoding_character_incrementer(void);
392395

393396
extern int PrepareClientEncoding(int encoding);
394397
extern int SetClientEncoding(int encoding);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy