Skip to content

Commit c64d0cd

Browse files
committed
Use perfect hashing, instead of binary search, for keyword lookup.
We've been speculating for a long time that hash-based keyword lookup ought to be faster than binary search, but up to now we hadn't found a suitable tool for generating the hash function. Joerg Sonnenberger provided the inspiration, and sample code, to show us that rolling our own generator wasn't a ridiculous idea. Hence, do that. The method used here requires a lookup table of approximately 4 bytes per keyword, but that's less than what we saved in the predecessor commit afb0d07, so it's not a big problem. The time savings is indeed significant: preliminary testing suggests that the total time for raw parsing (flex + bison phases) drops by ~20%. Patch by me, but it owes its existence to Joerg Sonnenberger; thanks also to John Naylor for review. Discussion: https://postgr.es/m/20190103163340.GA15803@britannica.bec.de
1 parent 5d59a6c commit c64d0cd

File tree

14 files changed

+516
-107
lines changed

14 files changed

+516
-107
lines changed

src/common/Makefile

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,11 @@ OBJS_FRONTEND = $(OBJS_COMMON) fe_memutils.o file_utils.o restricted_token.o
6363
OBJS_SHLIB = $(OBJS_FRONTEND:%.o=%_shlib.o)
6464
OBJS_SRV = $(OBJS_COMMON:%.o=%_srv.o)
6565

66+
# where to find gen_keywordlist.pl and subsidiary files
67+
TOOLSDIR = $(top_srcdir)/src/tools
68+
GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
69+
GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
70+
6671
all: libpgcommon.a libpgcommon_shlib.a libpgcommon_srv.a
6772

6873
distprep: kwlist_d.h
@@ -118,8 +123,8 @@ libpgcommon_srv.a: $(OBJS_SRV)
118123
$(CC) $(CFLAGS) $(subst -DFRONTEND,, $(CPPFLAGS)) -c $< -o $@
119124

120125
# generate SQL keyword lookup table to be included into keywords*.o.
121-
kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(top_srcdir)/src/tools/gen_keywordlist.pl
122-
$(PERL) $(top_srcdir)/src/tools/gen_keywordlist.pl --extern $<
126+
kwlist_d.h: $(top_srcdir)/src/include/parser/kwlist.h $(GEN_KEYWORDLIST_DEPS)
127+
$(GEN_KEYWORDLIST) --extern $<
123128

124129
# Dependencies of keywords*.o need to be managed explicitly to make sure
125130
# that you don't get broken parsing code, even in a non-enable-depend build.

src/common/kwlookup.c

Lines changed: 32 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -35,60 +35,51 @@
3535
* receive a different case-normalization mapping.
3636
*/
3737
int
38-
ScanKeywordLookup(const char *text,
38+
ScanKeywordLookup(const char *str,
3939
const ScanKeywordList *keywords)
4040
{
41-
int len,
42-
i;
43-
char word[NAMEDATALEN];
44-
const char *kw_string;
45-
const uint16 *kw_offsets;
46-
const uint16 *low;
47-
const uint16 *high;
48-
49-
len = strlen(text);
41+
size_t len;
42+
int h;
43+
const char *kw;
5044

45+
/*
46+
* Reject immediately if too long to be any keyword. This saves useless
47+
* hashing and downcasing work on long strings.
48+
*/
49+
len = strlen(str);
5150
if (len > keywords->max_kw_len)
52-
return -1; /* too long to be any keyword */
53-
54-
/* We assume all keywords are shorter than NAMEDATALEN. */
55-
Assert(len < NAMEDATALEN);
51+
return -1;
5652

5753
/*
58-
* Apply an ASCII-only downcasing. We must not use tolower() since it may
59-
* produce the wrong translation in some locales (eg, Turkish).
54+
* Compute the hash function. We assume it was generated to produce
55+
* case-insensitive results. Since it's a perfect hash, we need only
56+
* match to the specific keyword it identifies.
6057
*/
61-
for (i = 0; i < len; i++)
62-
{
63-
char ch = text[i];
58+
h = keywords->hash(str, len);
6459

65-
if (ch >= 'A' && ch <= 'Z')
66-
ch += 'a' - 'A';
67-
word[i] = ch;
68-
}
69-
word[len] = '\0';
60+
/* An out-of-range result implies no match */
61+
if (h < 0 || h >= keywords->num_keywords)
62+
return -1;
7063

7164
/*
72-
* Now do a binary search using plain strcmp() comparison.
65+
* Compare character-by-character to see if we have a match, applying an
66+
* ASCII-only downcasing to the input characters. We must not use
67+
* tolower() since it may produce the wrong translation in some locales
68+
* (eg, Turkish).
7369
*/
74-
kw_string = keywords->kw_string;
75-
kw_offsets = keywords->kw_offsets;
76-
low = kw_offsets;
77-
high = kw_offsets + (keywords->num_keywords - 1);
78-
while (low <= high)
70+
kw = GetScanKeyword(h, keywords);
71+
while (*str != '\0')
7972
{
80-
const uint16 *middle;
81-
int difference;
73+
char ch = *str++;
8274

83-
middle = low + (high - low) / 2;
84-
difference = strcmp(kw_string + *middle, word);
85-
if (difference == 0)
86-
return middle - kw_offsets;
87-
else if (difference < 0)
88-
low = middle + 1;
89-
else
90-
high = middle - 1;
75+
if (ch >= 'A' && ch <= 'Z')
76+
ch += 'a' - 'A';
77+
if (ch != *kw++)
78+
return -1;
9179
}
80+
if (*kw != '\0')
81+
return -1;
9282

93-
return -1;
83+
/* Success! */
84+
return h;
9485
}

src/include/common/kwlookup.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@
1414
#ifndef KWLOOKUP_H
1515
#define KWLOOKUP_H
1616

17+
/* Hash function used by ScanKeywordLookup */
18+
typedef int (*ScanKeywordHashFunc) (const void *key, size_t keylen);
19+
1720
/*
1821
* This struct contains the data needed by ScanKeywordLookup to perform a
1922
* search within a set of keywords. The contents are typically generated by
@@ -23,6 +26,7 @@ typedef struct ScanKeywordList
2326
{
2427
const char *kw_string; /* all keywords in order, separated by \0 */
2528
const uint16 *kw_offsets; /* offsets to the start of each keyword */
29+
ScanKeywordHashFunc hash; /* perfect hash function for keywords */
2630
int num_keywords; /* number of keywords */
2731
int max_kw_len; /* length of longest keyword */
2832
} ScanKeywordList;

src/include/parser/kwlist.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,7 @@
2121
/*
2222
* List of keyword (name, token-value, category) entries.
2323
*
24-
* !!WARNING!!: This list must be sorted by ASCII name, because binary
25-
* search is used to locate entries.
24+
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
2625
*/
2726

2827
/* name, value, category */

src/interfaces/ecpg/preproc/Makefile

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@ OBJS= preproc.o pgc.o type.o ecpg.o output.o parser.o \
2828
keywords.o c_keywords.o ecpg_keywords.o typename.o descriptor.o variable.o \
2929
$(WIN32RES)
3030

31-
GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl
31+
# where to find gen_keywordlist.pl and subsidiary files
32+
TOOLSDIR = $(top_srcdir)/src/tools
33+
GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
34+
GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
3235

3336
# Suppress parallel build to avoid a bug in GNU make 3.82
3437
# (see comments in ../Makefile)
@@ -56,11 +59,11 @@ preproc.y: ../../../backend/parser/gram.y parse.pl ecpg.addons ecpg.header ecpg.
5659
$(PERL) $(srcdir)/check_rules.pl $(srcdir) $<
5760

5861
# generate keyword headers
59-
c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST)
60-
$(PERL) $(GEN_KEYWORDLIST) --varname ScanCKeywords $<
62+
c_kwlist_d.h: c_kwlist.h $(GEN_KEYWORDLIST_DEPS)
63+
$(GEN_KEYWORDLIST) --varname ScanCKeywords --no-case-fold $<
6164

62-
ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST)
63-
$(PERL) $(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
65+
ecpg_kwlist_d.h: ecpg_kwlist.h $(GEN_KEYWORDLIST_DEPS)
66+
$(GEN_KEYWORDLIST) --varname ScanECPGKeywords $<
6467

6568
# Force these dependencies to be known even without dependency info built:
6669
ecpg_keywords.o c_keywords.o keywords.o preproc.o pgc.o parser.o: preproc.h

src/interfaces/ecpg/preproc/c_keywords.c

Lines changed: 24 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
*/
1010
#include "postgres_fe.h"
1111

12-
#include <ctype.h>
13-
1412
#include "preproc_extern.h"
1513
#include "preproc.h"
1614

@@ -32,39 +30,38 @@ static const uint16 ScanCKeywordTokens[] = {
3230
*
3331
* Returns the token value of the keyword, or -1 if no match.
3432
*
35-
* Do a binary search using plain strcmp() comparison. This is much like
33+
* Do a hash search using plain strcmp() comparison. This is much like
3634
* ScanKeywordLookup(), except we want case-sensitive matching.
3735
*/
3836
int
39-
ScanCKeywordLookup(const char *text)
37+
ScanCKeywordLookup(const char *str)
4038
{
41-
const char *kw_string;
42-
const uint16 *kw_offsets;
43-
const uint16 *low;
44-
const uint16 *high;
39+
size_t len;
40+
int h;
41+
const char *kw;
42+
43+
/*
44+
* Reject immediately if too long to be any keyword. This saves useless
45+
* hashing work on long strings.
46+
*/
47+
len = strlen(str);
48+
if (len > ScanCKeywords.max_kw_len)
49+
return -1;
4550

46-
if (strlen(text) > ScanCKeywords.max_kw_len)
47-
return -1; /* too long to be any keyword */
51+
/*
52+
* Compute the hash function. Since it's a perfect hash, we need only
53+
* match to the specific keyword it identifies.
54+
*/
55+
h = ScanCKeywords_hash_func(str, len);
4856

49-
kw_string = ScanCKeywords.kw_string;
50-
kw_offsets = ScanCKeywords.kw_offsets;
51-
low = kw_offsets;
52-
high = kw_offsets + (ScanCKeywords.num_keywords - 1);
57+
/* An out-of-range result implies no match */
58+
if (h < 0 || h >= ScanCKeywords.num_keywords)
59+
return -1;
5360

54-
while (low <= high)
55-
{
56-
const uint16 *middle;
57-
int difference;
61+
kw = GetScanKeyword(h, &ScanCKeywords);
5862

59-
middle = low + (high - low) / 2;
60-
difference = strcmp(kw_string + *middle, text);
61-
if (difference == 0)
62-
return ScanCKeywordTokens[middle - kw_offsets];
63-
else if (difference < 0)
64-
low = middle + 1;
65-
else
66-
high = middle - 1;
67-
}
63+
if (strcmp(kw, str) == 0)
64+
return ScanCKeywordTokens[h];
6865

6966
return -1;
7067
}

src/interfaces/ecpg/preproc/c_kwlist.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@
2020
/*
2121
* List of (keyword-name, keyword-token-value) pairs.
2222
*
23-
* !!WARNING!!: This list must be sorted by ASCII name, because binary
24-
* search is used to locate entries.
23+
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
2524
*/
2625

2726
/* name, value */

src/interfaces/ecpg/preproc/ecpg_kwlist.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,7 @@
2020
/*
2121
* List of (keyword-name, keyword-token-value) pairs.
2222
*
23-
* !!WARNING!!: This list must be sorted by ASCII name, because binary
24-
* search is used to locate entries.
23+
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
2524
*/
2625

2726
/* name, value */

src/pl/plpgsql/src/Makefile

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,10 @@ REGRESS_OPTS = --dbname=$(PL_TESTDB)
2929
REGRESS = plpgsql_call plpgsql_control plpgsql_domain plpgsql_record \
3030
plpgsql_cache plpgsql_transaction plpgsql_trigger plpgsql_varprops
3131

32-
GEN_KEYWORDLIST = $(top_srcdir)/src/tools/gen_keywordlist.pl
32+
# where to find gen_keywordlist.pl and subsidiary files
33+
TOOLSDIR = $(top_srcdir)/src/tools
34+
GEN_KEYWORDLIST = $(PERL) -I $(TOOLSDIR) $(TOOLSDIR)/gen_keywordlist.pl
35+
GEN_KEYWORDLIST_DEPS = $(TOOLSDIR)/gen_keywordlist.pl $(TOOLSDIR)/PerfectHash.pm
3336

3437
all: all-lib
3538

@@ -76,11 +79,11 @@ plerrcodes.h: $(top_srcdir)/src/backend/utils/errcodes.txt generate-plerrcodes.p
7679
$(PERL) $(srcdir)/generate-plerrcodes.pl $< > $@
7780

7881
# generate keyword headers for the scanner
79-
pl_reserved_kwlist_d.h: pl_reserved_kwlist.h $(GEN_KEYWORDLIST)
80-
$(PERL) $(GEN_KEYWORDLIST) --varname ReservedPLKeywords $<
82+
pl_reserved_kwlist_d.h: pl_reserved_kwlist.h $(GEN_KEYWORDLIST_DEPS)
83+
$(GEN_KEYWORDLIST) --varname ReservedPLKeywords $<
8184

82-
pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(GEN_KEYWORDLIST)
83-
$(PERL) $(GEN_KEYWORDLIST) --varname UnreservedPLKeywords $<
85+
pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h $(GEN_KEYWORDLIST_DEPS)
86+
$(GEN_KEYWORDLIST) --varname UnreservedPLKeywords $<
8487

8588

8689
check: submake

src/pl/plpgsql/src/pl_reserved_kwlist.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,9 @@
2020
/*
2121
* List of (keyword-name, keyword-token-value) pairs.
2222
*
23-
* Be careful not to put the same word in both lists.
23+
* Be careful not to put the same word into pl_unreserved_kwlist.h.
2424
*
25-
* !!WARNING!!: This list must be sorted by ASCII name, because binary
26-
* search is used to locate entries.
25+
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
2726
*/
2827

2928
/* name, value */

src/pl/plpgsql/src/pl_unreserved_kwlist.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,10 @@
2020
/*
2121
* List of (keyword-name, keyword-token-value) pairs.
2222
*
23-
* Be careful not to put the same word in both lists. Also be sure that
24-
* pl_gram.y's unreserved_keyword production agrees with this list.
23+
* Be careful not to put the same word into pl_reserved_kwlist.h. Also be
24+
* sure that pl_gram.y's unreserved_keyword production agrees with this list.
2525
*
26-
* !!WARNING!!: This list must be sorted by ASCII name, because binary
27-
* search is used to locate entries.
26+
* Note: gen_keywordlist.pl requires the entries to appear in ASCII order.
2827
*/
2928

3029
/* name, value */

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy