Skip to content

Commit 1c1791e

Browse files
committed
Add json(b)_to_tsvector function
Jsonb has a complex nature so there isn't best-for-everything way to convert it to tsvector for full text search. Current to_tsvector(json(b)) suggests to convert only string values, but it's possible to index keys, numerics and even booleans value. To solve that json(b)_to_tsvector has a second required argument contained a list of desired types of json fields. Second argument is a jsonb scalar or array right now with possibility to add new options in a future. Bump catalog version Author: Dmitry Dolgov with some editorization by me Reviewed by: Teodor Sigaev Discussion: https://www.postgresql.org/message-id/CA+q6zcXJQbS1b4kJ_HeAOoOc=unfnOrUEL=KGgE32QKDww7d8g@mail.gmail.com
1 parent 529ab7b commit 1c1791e

File tree

10 files changed

+629
-49
lines changed

10 files changed

+629
-49
lines changed

doc/src/sgml/func.sgml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9727,6 +9727,26 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple
97279727
<entry><literal>to_tsvector('english', '{"a": "The Fat Rats"}'::json)</literal></entry>
97289728
<entry><literal>'fat':2 'rat':3</literal></entry>
97299729
</row>
9730+
<row>
9731+
<entry>
9732+
<literal><function>json(b)_to_tsvector(<optional> <replaceable class="parameter">config</replaceable> <type>regconfig</type>,
9733+
</optional> <replaceable class="parameter">document</replaceable> <type>json(b)</type>,
9734+
<replaceable class="parameter">filter</replaceable> <type>json(b)</type>)</function></literal>
9735+
</entry>
9736+
<entry><type>tsvector</type></entry>
9737+
<entry>
9738+
reduce each value in the document, specified by <replaceable class="parameter">filter</replaceable> to a <type>tsvector</type>,
9739+
and then concatenate those in document order to produce a single <type>tsvector</type>.
9740+
<replaceable class="parameter">filter</replaceable> is a jsonb array, that enumerates what kind of elements need to be included
9741+
into the resulting <type>tsvector</type>. Possible values for <replaceable class="parameter">filter</replaceable> are
9742+
<literal>"string"</literal> (to include all string values), <literal>"numeric"</literal> (to include all numeric values in the string format),
9743+
<literal>"boolean"</literal> (to include all boolean values in the string format "true"/"false"),
9744+
<literal>"key"</literal> (to include all keys) or <literal>"all"</literal> (to include all above). These values
9745+
can be combined together to include, e.g. all string and numeric values.
9746+
</entry>
9747+
<entry><literal>json_to_tsvector('english', '{"a": "The Fat Rats", "b": 123}'::json, '["string", "numeric"]')</literal></entry>
9748+
<entry><literal>'123':5 'fat':2 'rat':3</literal></entry>
9749+
</row>
97309750
<row>
97319751
<entry>
97329752
<indexterm>

src/backend/tsearch/to_tsany.c

Lines changed: 108 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -267,12 +267,12 @@ to_tsvector(PG_FUNCTION_ARGS)
267267
PointerGetDatum(in)));
268268
}
269269

270-
Datum
271-
jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
270+
/*
271+
* Worker function for jsonb(_string)_to_tsvector(_byid)
272+
*/
273+
static TSVector
274+
jsonb_to_tsvector_worker(Oid cfgId, Jsonb *jb, uint32 flags)
272275
{
273-
Oid cfgId = PG_GETARG_OID(0);
274-
Jsonb *jb = PG_GETARG_JSONB_P(1);
275-
TSVector result;
276276
TSVectorBuildState state;
277277
ParsedText prs;
278278

@@ -281,33 +281,77 @@ jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
281281
state.prs = &prs;
282282
state.cfgId = cfgId;
283283

284-
iterate_jsonb_string_values(jb, &state, add_to_tsvector);
284+
iterate_jsonb_values(jb, flags, &state, add_to_tsvector);
285285

286-
PG_FREE_IF_COPY(jb, 1);
286+
return make_tsvector(&prs);
287+
}
288+
289+
Datum
290+
jsonb_string_to_tsvector_byid(PG_FUNCTION_ARGS)
291+
{
292+
Oid cfgId = PG_GETARG_OID(0);
293+
Jsonb *jb = PG_GETARG_JSONB_P(1);
294+
TSVector result;
287295

288-
result = make_tsvector(&prs);
296+
result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
297+
PG_FREE_IF_COPY(jb, 1);
289298

290299
PG_RETURN_TSVECTOR(result);
291300
}
292301

293302
Datum
294-
jsonb_to_tsvector(PG_FUNCTION_ARGS)
303+
jsonb_string_to_tsvector(PG_FUNCTION_ARGS)
295304
{
296305
Jsonb *jb = PG_GETARG_JSONB_P(0);
297306
Oid cfgId;
307+
TSVector result;
298308

299309
cfgId = getTSCurrentConfig(true);
300-
PG_RETURN_DATUM(DirectFunctionCall2(jsonb_to_tsvector_byid,
301-
ObjectIdGetDatum(cfgId),
302-
JsonbPGetDatum(jb)));
310+
result = jsonb_to_tsvector_worker(cfgId, jb, jtiString);
311+
PG_FREE_IF_COPY(jb, 0);
312+
313+
PG_RETURN_TSVECTOR(result);
303314
}
304315

305316
Datum
306-
json_to_tsvector_byid(PG_FUNCTION_ARGS)
317+
jsonb_to_tsvector_byid(PG_FUNCTION_ARGS)
307318
{
308319
Oid cfgId = PG_GETARG_OID(0);
309-
text *json = PG_GETARG_TEXT_P(1);
320+
Jsonb *jb = PG_GETARG_JSONB_P(1);
321+
Jsonb *jbFlags = PG_GETARG_JSONB_P(2);
322+
TSVector result;
323+
uint32 flags = parse_jsonb_index_flags(jbFlags);
324+
325+
result = jsonb_to_tsvector_worker(cfgId, jb, flags);
326+
PG_FREE_IF_COPY(jb, 1);
327+
PG_FREE_IF_COPY(jbFlags, 2);
328+
329+
PG_RETURN_TSVECTOR(result);
330+
}
331+
332+
Datum
333+
jsonb_to_tsvector(PG_FUNCTION_ARGS)
334+
{
335+
Jsonb *jb = PG_GETARG_JSONB_P(0);
336+
Jsonb *jbFlags = PG_GETARG_JSONB_P(1);
337+
Oid cfgId;
310338
TSVector result;
339+
uint32 flags = parse_jsonb_index_flags(jbFlags);
340+
341+
cfgId = getTSCurrentConfig(true);
342+
result = jsonb_to_tsvector_worker(cfgId, jb, flags);
343+
PG_FREE_IF_COPY(jb, 0);
344+
PG_FREE_IF_COPY(jbFlags, 1);
345+
346+
PG_RETURN_TSVECTOR(result);
347+
}
348+
349+
/*
350+
* Worker function for json(_string)_to_tsvector(_byid)
351+
*/
352+
static TSVector
353+
json_to_tsvector_worker(Oid cfgId, text *json, uint32 flags)
354+
{
311355
TSVectorBuildState state;
312356
ParsedText prs;
313357

@@ -316,11 +360,50 @@ json_to_tsvector_byid(PG_FUNCTION_ARGS)
316360
state.prs = &prs;
317361
state.cfgId = cfgId;
318362

319-
iterate_json_string_values(json, &state, add_to_tsvector);
363+
iterate_json_values(json, flags, &state, add_to_tsvector);
364+
365+
return make_tsvector(&prs);
366+
}
367+
368+
Datum
369+
json_string_to_tsvector_byid(PG_FUNCTION_ARGS)
370+
{
371+
Oid cfgId = PG_GETARG_OID(0);
372+
text *json = PG_GETARG_TEXT_P(1);
373+
TSVector result;
320374

375+
result = json_to_tsvector_worker(cfgId, json, jtiString);
321376
PG_FREE_IF_COPY(json, 1);
322377

323-
result = make_tsvector(&prs);
378+
PG_RETURN_TSVECTOR(result);
379+
}
380+
381+
Datum
382+
json_string_to_tsvector(PG_FUNCTION_ARGS)
383+
{
384+
text *json = PG_GETARG_TEXT_P(0);
385+
Oid cfgId;
386+
TSVector result;
387+
388+
cfgId = getTSCurrentConfig(true);
389+
result = json_to_tsvector_worker(cfgId, json, jtiString);
390+
PG_FREE_IF_COPY(json, 0);
391+
392+
PG_RETURN_TSVECTOR(result);
393+
}
394+
395+
Datum
396+
json_to_tsvector_byid(PG_FUNCTION_ARGS)
397+
{
398+
Oid cfgId = PG_GETARG_OID(0);
399+
text *json = PG_GETARG_TEXT_P(1);
400+
Jsonb *jbFlags = PG_GETARG_JSONB_P(2);
401+
TSVector result;
402+
uint32 flags = parse_jsonb_index_flags(jbFlags);
403+
404+
result = json_to_tsvector_worker(cfgId, json, flags);
405+
PG_FREE_IF_COPY(json, 1);
406+
PG_FREE_IF_COPY(jbFlags, 2);
324407

325408
PG_RETURN_TSVECTOR(result);
326409
}
@@ -329,12 +412,17 @@ Datum
329412
json_to_tsvector(PG_FUNCTION_ARGS)
330413
{
331414
text *json = PG_GETARG_TEXT_P(0);
415+
Jsonb *jbFlags = PG_GETARG_JSONB_P(1);
332416
Oid cfgId;
417+
TSVector result;
418+
uint32 flags = parse_jsonb_index_flags(jbFlags);
333419

334420
cfgId = getTSCurrentConfig(true);
335-
PG_RETURN_DATUM(DirectFunctionCall2(json_to_tsvector_byid,
336-
ObjectIdGetDatum(cfgId),
337-
PointerGetDatum(json)));
421+
result = json_to_tsvector_worker(cfgId, json, flags);
422+
PG_FREE_IF_COPY(json, 0);
423+
PG_FREE_IF_COPY(jbFlags, 1);
424+
425+
PG_RETURN_TSVECTOR(result);
338426
}
339427

340428
/*
@@ -353,7 +441,7 @@ add_to_tsvector(void *_state, char *elem_value, int elem_len)
353441
* First time through: initialize words array to a reasonable size.
354442
* (parsetext() will realloc it bigger as needed.)
355443
*/
356-
prs->lenwords = Max(elem_len / 6, 64);
444+
prs->lenwords = 16;
357445
prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords);
358446
prs->curwords = 0;
359447
prs->pos = 0;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy