Skip to content

Commit e8c81e1

Browse files
committed
Improve word parser.
- improve file and path recognition - fix misspelling - improve tag recognition
1 parent 8cb4e4f commit e8c81e1

File tree

2 files changed

+65
-22
lines changed

2 files changed

+65
-22
lines changed

contrib/tsearch2/wordparser/parser.c

Lines changed: 59 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,7 @@ static TParserStateActionItem actionTPS_Base[] = {
327327
{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
328328
{p_iseqC, '&', A_PUSH, TPS_InHTMLEntityFirst, 0, NULL},
329329
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
330+
{p_iseqC, '.', A_PUSH, TPS_InPathFirst, 0, NULL},
330331
{NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
331332
};
332333

@@ -336,15 +337,16 @@ static TParserStateActionItem actionTPS_InUWord[] = {
336337
{p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
337338
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
338339
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
340+
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
339341
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
340342
{NULL, 0, A_BINGO, TPS_Base, UWORD, NULL}
341343
};
342344

343345
static TParserStateActionItem actionTPS_InLatWord[] = {
344346
{p_isEOF, 0, A_BINGO, TPS_Base, LATWORD, NULL},
345347
{p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
346-
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
347-
{p_iseqC, '.', A_PUSH, TPS_InFileFirst, 0, NULL},
348+
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
349+
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
348350
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
349351
{p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL},
350352
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
@@ -366,7 +368,7 @@ static TParserStateActionItem actionTPS_InCyrWord[] = {
366368
static TParserStateActionItem actionTPS_InUnsignedInt[] = {
367369
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
368370
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
369-
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
371+
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
370372
{p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
371373
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
372374
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
@@ -500,10 +502,19 @@ static TParserStateActionItem actionTPS_InTagFirst[] = {
500502
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
501503
{p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
502504
{p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
505+
{p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
503506
{p_islatin, 0, A_PUSH, TPS_InTag, 0, NULL},
504507
{NULL, 0, A_POP, TPS_Null, 0, NULL}
505508
};
506509

510+
static TParserStateActionItem actionTPS_InXMLBegin[] = {
511+
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
512+
/* <?xml ... */
513+
{p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
514+
{p_iseqC, 'X', A_NEXT, TPS_InTag, 0, NULL},
515+
{NULL, 0, A_POP, TPS_Null, 0, NULL}
516+
};
517+
507518
static TParserStateActionItem actionTPS_InTagCloseFirst[] = {
508519
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
509520
{p_islatin, 0, A_NEXT, TPS_InTag, 0, NULL},
@@ -520,6 +531,11 @@ static TParserStateActionItem actionTPS_InTag[] = {
520531
{p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
521532
{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
522533
{p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
534+
{p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
535+
{p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
536+
{p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
537+
{p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
538+
{p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
523539
{p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
524540
{p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
525541
{NULL, 0, A_POP, TPS_Null, 0, NULL}
@@ -551,6 +567,9 @@ static TParserStateActionItem actionTPS_InTagEnd[] = {
551567
static TParserStateActionItem actionTPS_InCommentFirst[] = {
552568
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
553569
{p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
570+
/* <!DOCTYPE ...>*/
571+
{p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
572+
{p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
554573
{NULL, 0, A_POP, TPS_Null, 0, NULL}
555574
};
556575

@@ -583,30 +602,30 @@ static TParserStateActionItem actionTPS_InCommentEnd[] = {
583602
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL}
584603
};
585604

586-
static TParserStateActionItem actionTPS_InHostFirstDomen[] = {
605+
static TParserStateActionItem actionTPS_InHostFirstDomain[] = {
587606
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
588-
{p_islatin, 0, A_NEXT, TPS_InHostDomenSecond, 0, NULL},
607+
{p_islatin, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
589608
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
590609
{NULL, 0, A_POP, TPS_Null, 0, NULL}
591610
};
592611

593-
static TParserStateActionItem actionTPS_InHostDomenSecond[] = {
612+
static TParserStateActionItem actionTPS_InHostDomainSecond[] = {
594613
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
595-
{p_islatin, 0, A_NEXT, TPS_InHostDomen, 0, NULL},
614+
{p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
596615
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
597616
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
598-
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
617+
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
599618
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
600619
{NULL, 0, A_POP, TPS_Null, 0, NULL}
601620
};
602621

603-
static TParserStateActionItem actionTPS_InHostDomen[] = {
622+
static TParserStateActionItem actionTPS_InHostDomain[] = {
604623
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
605-
{p_islatin, 0, A_NEXT, TPS_InHostDomen, 0, NULL},
624+
{p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
606625
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
607626
{p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
608627
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
609-
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
628+
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
610629
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
611630
{p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
612631
{p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURIStart, HOST, NULL},
@@ -640,7 +659,7 @@ static TParserStateActionItem actionTPS_InHost[] = {
640659
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
641660
{p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL},
642661
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
643-
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
662+
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
644663
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
645664
{NULL, 0, A_POP, TPS_Null, 0, NULL}
646665
};
@@ -652,14 +671,32 @@ static TParserStateActionItem actionTPS_InEmail[] = {
652671

653672
static TParserStateActionItem actionTPS_InFileFirst[] = {
654673
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
655-
{p_islatin, 0, A_CLEAR, TPS_InFile, 0, NULL},
656-
{p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
657-
{p_iseqC, '.', A_CLEAR, TPS_InFile, 0, NULL},
658-
{p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
674+
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
675+
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
676+
{p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
677+
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
659678
{p_iseqC, '?', A_PUSH, TPS_InURIFirst, 0, NULL},
660679
{NULL, 0, A_POP, TPS_Null, 0, NULL}
661680
};
662681

682+
static TParserStateActionItem actionTPS_InPathFirst[] = {
683+
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
684+
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
685+
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
686+
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
687+
{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
688+
{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
689+
{NULL, 0, A_POP, TPS_Null, 0, NULL}
690+
};
691+
692+
static TParserStateActionItem actionTPS_InPathSecond[] = {
693+
{p_isEOF, 0, A_BINGO|A_CLEAR, TPS_Base, FILEPATH, NULL},
694+
{p_iseqC, '/', A_NEXT|A_PUSH, TPS_InFileFirst, 0, NULL},
695+
{p_iseqC, '/', A_BINGO|A_CLEAR, TPS_Base, FILEPATH, NULL},
696+
{p_isspace, 0, A_BINGO|A_CLEAR, TPS_Base, FILEPATH, NULL},
697+
{NULL, 0, A_POP, TPS_Null, 0, NULL}
698+
};
699+
663700
static TParserStateActionItem actionTPS_InFile[] = {
664701
{p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
665702
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
@@ -894,6 +931,7 @@ static const TParserStateAction Actions[] = {
894931
{TPS_InHTMLEntityNum, actionTPS_InHTMLEntityNum},
895932
{TPS_InHTMLEntityEnd, actionTPS_InHTMLEntityEnd},
896933
{TPS_InTagFirst, actionTPS_InTagFirst},
934+
{TPS_InXMLBegin, actionTPS_InXMLBegin},
897935
{TPS_InTagCloseFirst, actionTPS_InTagCloseFirst},
898936
{TPS_InTag, actionTPS_InTag},
899937
{TPS_InTagEscapeK, actionTPS_InTagEscapeK},
@@ -906,15 +944,17 @@ static const TParserStateAction Actions[] = {
906944
{TPS_InCloseCommentFirst, actionTPS_InCloseCommentFirst},
907945
{TPS_InCloseCommentLast, actionTPS_InCloseCommentLast},
908946
{TPS_InCommentEnd, actionTPS_InCommentEnd},
909-
{TPS_InHostFirstDomen, actionTPS_InHostFirstDomen},
910-
{TPS_InHostDomenSecond, actionTPS_InHostDomenSecond},
911-
{TPS_InHostDomen, actionTPS_InHostDomen},
947+
{TPS_InHostFirstDomain, actionTPS_InHostFirstDomain},
948+
{TPS_InHostDomainSecond, actionTPS_InHostDomainSecond},
949+
{TPS_InHostDomain, actionTPS_InHostDomain},
912950
{TPS_InPortFirst, actionTPS_InPortFirst},
913951
{TPS_InPort, actionTPS_InPort},
914952
{TPS_InHostFirstAN, actionTPS_InHostFirstAN},
915953
{TPS_InHost, actionTPS_InHost},
916954
{TPS_InEmail, actionTPS_InEmail},
917955
{TPS_InFileFirst, actionTPS_InFileFirst},
956+
{TPS_InPathFirst, actionTPS_InPathFirst},
957+
{TPS_InPathSecond, actionTPS_InPathSecond},
918958
{TPS_InFile, actionTPS_InFile},
919959
{TPS_InFileNext, actionTPS_InFileNext},
920960
{TPS_InURIFirst, actionTPS_InURIFirst},

contrib/tsearch2/wordparser/parser.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ typedef enum
3030
TPS_InHTMLEntityNum,
3131
TPS_InHTMLEntityEnd,
3232
TPS_InTagFirst,
33+
TPS_InXMLBegin,
3334
TPS_InTagCloseFirst,
3435
TPS_InTag,
3536
TPS_InTagEscapeK,
@@ -42,15 +43,17 @@ typedef enum
4243
TPS_InCloseCommentFirst,
4344
TPS_InCloseCommentLast,
4445
TPS_InCommentEnd,
45-
TPS_InHostFirstDomen,
46-
TPS_InHostDomenSecond,
47-
TPS_InHostDomen,
46+
TPS_InHostFirstDomain,
47+
TPS_InHostDomainSecond,
48+
TPS_InHostDomain,
4849
TPS_InPortFirst,
4950
TPS_InPort,
5051
TPS_InHostFirstAN,
5152
TPS_InHost,
5253
TPS_InEmail,
5354
TPS_InFileFirst,
55+
TPS_InPathFirst,
56+
TPS_InPathSecond,
5457
TPS_InFile,
5558
TPS_InFileNext,
5659
TPS_InURIFirst,

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy