
Commit 284502d

Author: Oleksandr Volha
Commit message: spl keywords improvements, refactoring
Parent: 162af80

File tree: 11 files changed (+188, -148 lines)


translator/app/translator/core/tokenizer.py

Lines changed: 58 additions & 43 deletions
@@ -43,35 +43,34 @@ def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
 
 
 class QueryTokenizer(BaseTokenizer):
-    field_pattern = r"(?P<field_name>[a-zA-Z\._\-]+)"
-    operator_pattern = r"\s?(?P<operator>and|or|not|AND|OR|NOT)\s?"
-    field_value_pattern = r"""^___field___\s*___match_operator___\s*___value___"""
-    match_operator_pattern = r"""(?:___field___\s?(?P<match_operator>ilike|contains|endswith|startswith|in|>=|<=|==|>|<|=~|!=|=|:|\:))\s?"""
+    single_value_operators_map: dict[str, str] = {}  # used to generate re pattern, so the keys order is important
+    multi_value_operators_map: dict[str, str] = {}  # used to generate re pattern, so the keys order is important
+    operators_map: dict[str, str] = {}  # used to generate re pattern, so the keys order is important
+
+    logical_operator_pattern = r"\s?(?P<logical_operator>and|or|not|AND|OR|NOT)\s?"
+    field_value_pattern = r"""^___field___\s*___operator___\s*___value___"""
     base_value_pattern = r"(?:___value_pattern___)"
-    _value_pattern = r"""(?:\"|\')*(?P<value>[:a-zA-Z\*0-9=+%#\-_\/\\'\,.&^@!\(\s]*)(?:\*|\'|\"|\s|\$)*"""
-    value_pattern = base_value_pattern.replace('___value_pattern___', _value_pattern)
-    multi_value_pattern = r"""\((?P<value>[:a-zA-Z\"\*0-9=+%#\-_\/\\'\,.&^@!\(\s]*)\)"""
-    keyword_pattern = None  # do not modify, use subclasses to define this attribute
 
-    multi_value_operators = tuple()
+    # do not modify, use subclasses to define these attributes
+    field_pattern: str = None
+    _value_pattern: str = None
+    value_pattern: str = None
+    multi_value_pattern: str = None
+    keyword_pattern: str = None
+
     multi_value_delimiter = ","
     wildcard_symbol = None
 
-    operators_map = {
-        "=": OperatorType.EQ,
-        "in": OperatorType.EQ,
-        "<": OperatorType.LT,
-        "<=": OperatorType.LTE,
-        ">": OperatorType.GT,
-        ">=": OperatorType.GTE,
-        "!=": OperatorType.NEQ,
-        "contains": OperatorType.CONTAINS,
-        "startswith": OperatorType.STARTSWITH,
-        "endswith": OperatorType.ENDSWITH
-    }
-
     def __init_subclass__(cls, **kwargs):
+        cls._validate_re_patterns()
         cls.value_pattern = cls.base_value_pattern.replace('___value_pattern___', cls._value_pattern)
+        cls.operators_map = {**cls.single_value_operators_map, **cls.multi_value_operators_map}
+        cls.operator_pattern = fr"""(?:___field___\s*(?P<operator>(?:{'|'.join(cls.operators_map)})))\s*"""
+
+    @classmethod
+    def _validate_re_patterns(cls):
+        if not all([cls.field_pattern, cls._value_pattern]):
+            raise ValueError(f"{cls.__name__} re patterns must be set")
 
     def map_operator(self, operator: str) -> str:
         try:
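Note on the new class machinery: __init_subclass__ now derives both operators_map and the operator_pattern regex from the subclass maps, which is why the comments above warn that key order matters. In a joined regex alternation, "<=" must be listed before "<" or it can never win. A minimal, self-contained sketch of the same idea (toy names, not the project's classes):

import re

class BaseTok:
    single_value_operators_map: dict[str, str] = {}
    multi_value_operators_map: dict[str, str] = {}

    def __init_subclass__(cls, **kwargs):
        # merge the per-dialect maps and join the keys into one alternation;
        # dict insertion order decides which operator is tried first
        cls.operators_map = {**cls.single_value_operators_map, **cls.multi_value_operators_map}
        cls.operator_pattern = fr"(?P<operator>(?:{'|'.join(cls.operators_map)}))"

class ToyTok(BaseTok):
    single_value_operators_map = {"<=": "lte", "<": "lt"}
    multi_value_operators_map = {"in": "eq"}

print(re.search(ToyTok.operator_pattern, "severity <= 5").group("operator"))  # '<='
print(re.search(r"(?:<|<=)", "severity <= 5").group())  # '<', the wrong order loses '<='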
@@ -89,16 +88,16 @@ def search_field(self, query):
     def escape_field_name(self, field_name):
         return field_name.replace(".", r"\.")
 
-    def search_match_operator(self, query, field_name) -> str:
+    def search_operator(self, query, field_name) -> str:
         field_name = self.escape_field_name(field_name)
-        match_operator_pattern = self.match_operator_pattern.replace("___field___", field_name)
-        match_operator_regex = re.compile(match_operator_pattern, re.IGNORECASE)
-        match_operator_search = re.search(match_operator_regex, query)
-        if match_operator_search is None:
+        operator_pattern = self.operator_pattern.replace("___field___", field_name)
+        compiled_operator_regex = re.compile(operator_pattern, re.IGNORECASE)
+        if (operator_search := re.search(compiled_operator_regex, query)) is None:
             raise TokenizerGeneralException(error=f"Operator couldn't be found in query part: {query}")
-        match_operator = match_operator_search.group("match_operator")
-        match_operator = match_operator.strip(" ")
-        return match_operator
+
+        operator = operator_search.group("operator")
+        operator = operator.strip(" ")
+        return operator
 
     def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.EQ) -> Tuple[str, Any]:
         return operator, get_match_group(match, group_name='value')
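For reference, the renamed search_operator builds its regex from the generated operator_pattern above. A self-contained sketch of the same flow, with re.escape standing in for escape_field_name (which only escapes dots):

import re

OPERATOR_PATTERN = r"(?:___field___\s*(?P<operator>(?:<=|>=|!=|=|<|>)))\s*"

def search_operator(query: str, field_name: str) -> str:
    # swap the placeholder for the concrete field name, then search
    pattern = OPERATOR_PATTERN.replace("___field___", re.escape(field_name))
    if (operator_search := re.search(pattern, query, re.IGNORECASE)) is None:
        raise ValueError(f"Operator couldn't be found in query part: {query}")
    return operator_search.group("operator").strip()

print(search_operator("event_id >= 4624", "event_id"))  # '>='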
@@ -118,7 +117,7 @@ def search_value(self, query: str, operator: str, field_name: str) -> Tuple[str,
         field_value_pattern = self.get_field_value_pattern(operator, field_name)
         value_pattern = self.value_pattern
         is_multi = False
-        if operator.lower() in self.multi_value_operators:
+        if operator.lower() in self.multi_value_operators_map:
             value_pattern = self.multi_value_pattern
             is_multi = True
 
@@ -142,7 +141,7 @@ def search_keyword(self, query: str) -> Tuple[Keyword, str]:
 
     def get_field_value_pattern(self, operator, field_name):
         field_value_pattern = self.field_value_pattern.replace("___field___", self.escape_field_name(field_name))
-        return field_value_pattern.replace("___match_operator___", operator)
+        return field_value_pattern.replace("___operator___", operator)
 
     @staticmethod
     def _clean_value(value: str, wildcard_symbol: str) -> str:
@@ -183,28 +182,45 @@ def create_field(field_name: str, operator: Identifier, value: Union[str, List])
 
     def search_field_value(self, query):
         field_name = self.search_field(query)
-        operator = self.search_match_operator(query, field_name)
+        operator = self.search_operator(query, field_name)
         query, operator, value = self.search_value(query=query, operator=operator, field_name=field_name)
         value, operator_token = self.process_value_wildcard_symbols(value=value,
                                                                     operator=operator,
                                                                     wildcard_symbol=self.wildcard_symbol)
         field = self.create_field(field_name=field_name, operator=operator_token, value=value)
         return field, query
 
-    def __get_identifier(self, query: str) -> Tuple[Union[Field, Keyword, Identifier], str]:
+    def _match_field_value(self, query: str, white_space_pattern: str = r"\s+") -> bool:
+        single_value_operator_group = fr"(?:{'|'.join(self.single_value_operators_map)})"
+        single_value_pattern = fr"""{self.field_pattern}\s*{single_value_operator_group}\s*{self.value_pattern}\s*"""
+        if re.match(single_value_pattern, query, re.IGNORECASE):
+            return True
+
+        if self.multi_value_operators_map:
+            multi_value_operator_group = fr"(?:{'|'.join(self.multi_value_operators_map)})"
+            pattern = f"{self.field_pattern}{white_space_pattern}{multi_value_operator_group}{white_space_pattern}"
+            multi_value_pattern = fr"{pattern}{self.multi_value_pattern}"
+            if re.match(multi_value_pattern, query, re.IGNORECASE):
+                return True
+
+        return False
+
+    def _get_identifier(self, query: str) -> Tuple[Union[Field, Keyword, Identifier], str]:
         query = query.strip("\n").strip(" ").strip("\n")
         if query.startswith(GroupType.L_PAREN):
             return Identifier(token_type=GroupType.L_PAREN), query[1:]
         elif query.startswith(GroupType.R_PAREN):
             return Identifier(token_type=GroupType.R_PAREN), query[1:]
-        elif operator_search := re.match(self.operator_pattern, query):
-            operator = operator_search.group("operator")
-            pos = operator_search.end()
-            return Identifier(token_type=operator.lower()), query[pos:]
+        elif logical_operator_search := re.match(self.logical_operator_pattern, query):
+            logical_operator = logical_operator_search.group("logical_operator")
+            pos = logical_operator_search.end()
+            return Identifier(token_type=logical_operator.lower()), query[pos:]
+        elif self._match_field_value(query):
+            return self.search_field_value(query)
         elif self.keyword_pattern and re.match(self.keyword_pattern, query):
             return self.search_keyword(query)
-        else:
-            return self.search_field_value(query)
+
+        raise TokenizerGeneralException("Unsupported query entry")
 
     @staticmethod
     def _validate_parentheses(tokens):
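The dispatch order in _get_identifier is now: parentheses, logical operators, the _match_field_value pre-check, keywords, and finally an explicit error instead of a silent fall-through to search_field_value. A runnable approximation of that classification chain, with deliberately simplified patterns:

import re

FIELD = r"[a-zA-Z._-]+"
OPERATORS = r"(?:<=|>=|!=|=|<|>)"
FIELD_VALUE = re.compile(fr"{FIELD}\s*{OPERATORS}\s*\S+")
LOGICAL = re.compile(r"(?:and|or|not)\s", re.IGNORECASE)

def classify(chunk: str) -> str:
    chunk = chunk.strip()
    if chunk.startswith("("):
        return "L_PAREN"
    if chunk.startswith(")"):
        return "R_PAREN"
    if LOGICAL.match(chunk):
        return "logical_operator"
    if FIELD_VALUE.match(chunk):
        return "field_value"
    # the keyword branch is omitted here; unmatched input now raises
    raise ValueError(f"Unsupported query entry: {chunk}")

print(classify("event_id=4624 AND severity>3"))  # 'field_value'
print(classify("AND severity>3"))                # 'logical_operator'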
@@ -224,7 +240,7 @@ def _validate_parentheses(tokens):
     def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
         tokenized = []
         while query:
-            identifier, query = self.__get_identifier(query=query)
+            identifier, query = self._get_identifier(query=query)
             tokenized.append(identifier)
         self._validate_parentheses(tokenized)
         return tokenized
@@ -234,8 +250,7 @@ def filter_tokens(tokens: List[TOKEN_TYPE],
                   token_type: Union[Type[Field], Type[Keyword], Type[Identifier]]) -> List[TOKEN_TYPE]:
         return [token for token in tokens if isinstance(token, token_type)]
 
-    def filter_function_tokens(self,
-                               tokens: List[Union[Field, Keyword, Identifier, Function, SortArg]]) -> List[TOKEN_TYPE]:
+    def filter_function_tokens(self, tokens: List[Union[Field, Keyword, Identifier, Function]]) -> List[TOKEN_TYPE]:
         result = []
         for token in tokens:
             if isinstance(token, Field):

translator/app/translator/platforms/athena/tokenizer.py

Lines changed: 15 additions & 10 deletions
@@ -26,23 +26,28 @@
 
 
 class AthenaTokenizer(QueryTokenizer):
+    single_value_operators_map = {
+        "=": OperatorType.EQ,
+        "<=": OperatorType.LTE,
+        "<": OperatorType.LT,
+        ">=": OperatorType.GTE,
+        ">": OperatorType.GT,
+        "!=": OperatorType.NEQ,
+        "<>": OperatorType.NEQ,
+        "like": OperatorType.EQ
+    }
+    multi_value_operators_map = {
+        "in": OperatorType.EQ
+    }
+
     field_pattern = r'(?P<field_name>"[a-zA-Z\._\-\s]+"|[a-zA-Z\._\-]+)'
-    match_operator_pattern = r"""(?:___field___\s?(?P<match_operator>like|in|<=|>=|==|>|<|<>|!=|=))\s?"""
     num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
     bool_value_pattern = r"(?P<bool_value>true|false)\s*"
     single_quotes_value_pattern = r"""'(?P<s_q_value>(?:[:a-zA-Z\*0-9=+%#\-\/\\,_".$&^@!\(\)\{\}\s]|'')*)'"""
     _value_pattern = fr"{num_value_pattern}|{bool_value_pattern}|{single_quotes_value_pattern}"
     multi_value_pattern = r"""\((?P<value>\d+(?:,\s*\d+)*|'(?:[:a-zA-Z\*0-9=+%#\-\/\\,_".$&^@!\(\)\{\}\s]|'')*'(?:,\s*'(?:[:a-zA-Z\*0-9=+%#\-\/\\,_".$&^@!\(\)\{\}\s]|'')*')*)\)"""
 
-    multi_value_operators = ("in",)
     wildcard_symbol = "%"
-    operators_map = {
-        "like": OperatorType.EQ
-    }
-
-    def __init__(self):
-        super().__init__()
-        self.operators_map.update(super().operators_map)
 
     @staticmethod
     def should_process_value_wildcard_symbols(operator: str) -> bool:
@@ -62,7 +67,7 @@ def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.E
 
     def search_field_value(self, query):
         field_name = self.search_field(query)
-        operator = self.search_match_operator(query, field_name)
+        operator = self.search_operator(query, field_name)
         should_process_value_wildcard_symbols = self.should_process_value_wildcard_symbols(operator)
         query, operator, value = self.search_value(query=query, operator=operator, field_name=field_name)

translator/app/translator/platforms/base/lucene/const.py

Lines changed: 0 additions & 11 deletions
This file was deleted.

translator/app/translator/platforms/base/lucene/tokenizer.py

Lines changed: 30 additions & 24 deletions
@@ -26,34 +26,36 @@
 from app.translator.core.tokenizer import QueryTokenizer
 from app.translator.core.custom_types.tokens import OperatorType
 from app.translator.tools.utils import get_match_group
-from app.translator.platforms.base.lucene.const import COMPARISON_OPERATORS_MAP
 
 
 class LuceneTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
+    single_value_operators_map = {
+        ":>": OperatorType.GT,
+        ":<": OperatorType.LT,
+        ":": OperatorType.EQ
+    }
+    multi_value_operators_map = {
+        ":": OperatorType.EQ
+    }
+
     field_pattern = r"(?P<field_name>[a-zA-Z\.\-_]+)"
     match_operator_pattern = r"(?:___field___\s*(?P<match_operator>:\[\*\sTO|:\[|:<|:>|:))\s*"
-    num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
+    _num_value_pattern = r"\d+(?:\.\d+)*"
+    num_value_pattern = fr"(?P<num_value>{_num_value_pattern})\s*"
     double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,\'\.$&^@!\(\)\{\}\s]|\\\"|\\)*)"\s*'
     no_quotes_value_pattern = r"(?P<n_q_value>(?:[a-zA-Z\*0-9=%#_/,\'\.$@]|\\\"|\\\\)+)\s*"
     re_value_pattern = r"/(?P<re_value>[:a-zA-Z\*0-9=+%#\\\-_\,\"\'\.$&^@!\(\)\{\}\[\]\s?]+)/\s*"
-    _value_pattern = fr"{num_value_pattern}|{re_value_pattern}|{no_quotes_value_pattern}|{double_quotes_value_pattern}"
+    gte_value_pattern = fr"\[\s*(?P<gte_value>{_num_value_pattern})\s+TO\s+\*\s*\]"
+    lte_value_pattern = fr"\[\s*\*\s+TO\s+(?P<lte_value>{_num_value_pattern})\s*\]"
+    range_value_pattern = fr"{gte_value_pattern}|{lte_value_pattern}"
+    _value_pattern = fr"{num_value_pattern}|{re_value_pattern}|{no_quotes_value_pattern}|{double_quotes_value_pattern}|{range_value_pattern}"
     keyword_pattern = r"(?P<n_q_value>(?:[a-zA-Z\*0-9=%#_/,\'\.$@]|\\\"|\\\(|\\\)|\\\[|\\\]|\\\{|\\\}|\\\:|\\)+)(?:\s+|\)|$)"
 
     multi_value_pattern = r"""\((?P<value>[:a-zA-Z\"\*0-9=+%#\-_\/\\'\,.&^@!\(\[\]\s]+)\)"""
     multi_value_check_pattern = r"___field___\s*___operator___\s*\("
 
     wildcard_symbol = "*"
 
-    operators_map = {
-        ":": OperatorType.EQ,
-        ":>": OperatorType.GT,
-        ":<": OperatorType.LT
-    }
-
-    def __init__(self):
-        super().__init__()
-        self.operators_map.update(super().operators_map)
-
     @staticmethod
     def create_field(field_name: str, operator: Identifier, value: Union[str, List]) -> Field:
         field_name = field_name.replace(".text", "")
@@ -79,11 +81,15 @@ def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.E
         elif (d_q_value := get_match_group(match, group_name='d_q_value')) is not None:
             return operator, d_q_value
 
+        elif (gte_value := get_match_group(match, group_name='gte_value')) is not None:
+            return OperatorType.GTE, gte_value
+
+        elif (lte_value := get_match_group(match, group_name='lte_value')) is not None:
+            return OperatorType.LTE, lte_value
+
         return super().get_operator_and_value(match, operator)
 
     def search_value(self, query: str, operator: str, field_name: str) -> Tuple[str, str, Union[str, List[str]]]:
-        if operator in COMPARISON_OPERATORS_MAP.keys():
-            return self.search_value_gte_lte(query, operator, field_name)
         check_pattern = self.multi_value_check_pattern
         check_regex = check_pattern.replace('___field___', field_name).replace('___operator___', operator)
         if re.match(check_regex, query):
@@ -105,22 +111,22 @@ def search_value(self, query: str, operator: str, field_name: str) -> Tuple[str,
         pos = field_value_search.end()
         return query[pos:], operator, value
 
-    def search_value_gte_lte(self, query: str, operator: str, field_name: str) -> Tuple[str, str, Union[str, List[str]]]:
-        query_list = query.split("]")
-        to_replace = [v for val in COMPARISON_OPERATORS_MAP.values() for v in val["replace"]]
-        to_replace.append(field_name)
-        regex = re.compile('|'.join(to_replace))
-        value = re.sub(regex, '', query_list.pop(0))
-        return "".join(query_list), COMPARISON_OPERATORS_MAP.get(operator, {}).get("default_op"), value.strip()
-
     def search_keyword(self, query: str) -> Tuple[Keyword, str]:
         keyword_search = re.search(self.keyword_pattern, query)
         _, value = self.get_operator_and_value(keyword_search)
         value = value.strip(self.wildcard_symbol)
         keyword = Keyword(value=value)
-        pos = keyword_search.end() - 1
+        pos = keyword_search.end() - 1  # FIXME: do not count the last group of pattern e.g. )
         return keyword, query[pos:]
 
+    def _match_field_value(self, query: str, white_space_pattern: str = r"\s*") -> bool:
+        range_value_pattern = f"(?:{self.gte_value_pattern}|{self.lte_value_pattern})"
+        range_pattern = fr"{self.field_pattern}{white_space_pattern}:\s*{range_value_pattern}"
+        if re.match(range_pattern, query, re.IGNORECASE):
+            return True
+
+        return super()._match_field_value(query, white_space_pattern=white_space_pattern)
+
     def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
         tokens = super().tokenize(query=query)
         return self.add_and_token_if_missed(tokens=tokens)
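The new gte_value/lte_value named groups replace the deleted search_value_gte_lte helper and the COMPARISON_OPERATORS_MAP import: Lucene's one-sided ranges are now recognized directly by the value pattern. A quick standalone check of the same pattern shapes as in the diff:

import re

NUM = r"\d+(?:\.\d+)*"
GTE = fr"\[\s*(?P<gte_value>{NUM})\s+TO\s+\*\s*\]"
LTE = fr"\[\s*\*\s+TO\s+(?P<lte_value>{NUM})\s*\]"
RANGE = re.compile(fr"{GTE}|{LTE}")

print(RANGE.search("bytes:[500 TO *]").group("gte_value"))   # '500'  -> OperatorType.GTE
print(RANGE.search("bytes:[* TO 1024]").group("lte_value"))  # '1024' -> OperatorType.LTE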

translator/app/translator/platforms/base/spl/tokenizer.py

Lines changed: 15 additions & 6 deletions
@@ -28,16 +28,25 @@
 
 
 class SplTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
+    single_value_operators_map = {
+        "=": OperatorType.EQ,
+        "<=": OperatorType.LTE,
+        "<": OperatorType.LT,
+        ">=": OperatorType.GTE,
+        ">": OperatorType.GT,
+        "!=": OperatorType.NEQ
+    }
+    multi_value_operators_map = {"in": OperatorType.EQ}
+
     field_pattern = r"(?P<field_name>[a-zA-Z\.\-_\{\}]+)"
-    num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
-    double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,;\'\.$&^@!\(\)\{\}\s]|\\\"|\\)*)"\s*'
+    num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)(?=$|\s|\))"
+    double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,;\'\.$&^@!\]\[\(\)\{\}\s]|\\\"|\\)*)"\s*'
     single_quotes_value_pattern = r"'(?P<s_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,;\"\.$&^@!\(\)\{\}\s]|\\\'|\\)*)'\s*"
-    no_quotes_value = r"(?P<no_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,\.\\$&^@!])+)\s*"
-    _value_pattern = fr"{num_value_pattern}|{no_quotes_value}|{double_quotes_value_pattern}|{single_quotes_value_pattern}"
+    no_quotes_value_pattern = r"(?P<no_q_value>(?:[:a-zA-Z\*0-9+%#\-_/,\.$&^@!]|\\\s|\\=|\\!=|\\<|\\<=|\\>|\\>=|\\\\)+)(?=$|\s|\))"
+    _value_pattern = fr"{num_value_pattern}|{no_quotes_value_pattern}|{double_quotes_value_pattern}|{single_quotes_value_pattern}"
     multi_value_pattern = r"""\((?P<value>[:a-zA-Z\"\*0-9=+%#\-_\/\\'\,;.$&^@!\{\}\(\s]+)\)"""
-    keyword_pattern = double_quotes_value_pattern
+    keyword_pattern = fr"{double_quotes_value_pattern}|{no_quotes_value_pattern}"
 
-    multi_value_operators = ("in",)
     wildcard_symbol = "*"
 
     def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.EQ) -> Tuple[str, Any]:
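These are the SPL keyword improvements from the commit message: keyword_pattern gains the no-quotes alternative so unquoted tokens can be keywords, and the numeric and no-quotes value patterns end in a lookahead (?=$|\s|\)) instead of consuming trailing whitespace, while the new escape alternatives keep backslash-escaped spaces and comparison signs inside the value. A trimmed illustration of that no-quotes behavior:

import re

# a shortened variant of the diff's no_quotes_value_pattern
NO_QUOTES = r"(?P<no_q_value>(?:[:a-zA-Z\*0-9+%#\-_/,\.$&^@!]|\\\s|\\=|\\\\)+)(?=$|\s|\))"

m = re.match(NO_QUOTES, r"foo\ bar baz")
print(m.group("no_q_value"))  # 'foo\ bar': the escaped space stays in the value, the bare one ends it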

translator/app/translator/platforms/chronicle/tokenizer.py

Lines changed: 9 additions & 1 deletion
@@ -26,8 +26,16 @@
 
 
 class ChronicleQueryTokenizer(QueryTokenizer):
-    field_pattern = r"(?P<field_name>[a-zA-Z0-9\._]+)"
+    single_value_operators_map = {
+        "=": OperatorType.EQ,
+        "<=": OperatorType.LTE,
+        "<": OperatorType.LT,
+        ">=": OperatorType.GTE,
+        ">": OperatorType.GT,
+        "!=": OperatorType.NEQ
+    }
 
+    field_pattern = r"(?P<field_name>[a-zA-Z0-9\._]+)"
     num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
     bool_value_pattern = r"(?P<bool_value>true|false)\s*"
     double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,\'\.$&^@!\(\)\{\}\s]|\\\"|\\\\)*)"\s*(?:nocase)?'
