@@ -43,35 +43,34 @@ def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
 
 
 class QueryTokenizer(BaseTokenizer):
-    field_pattern = r"(?P<field_name>[a-zA-Z\._\-]+)"
-    operator_pattern = r"\s?(?P<operator>and|or|not|AND|OR|NOT)\s?"
-    field_value_pattern = r"""^___field___\s*___match_operator___\s*___value___"""
-    match_operator_pattern = r"""(?:___field___\s?(?P<match_operator>ilike|contains|endswith|startswith|in|>=|<=|==|>|<|=~|!=|=|:|\:))\s?"""
+    single_value_operators_map: dict[str, str] = {}  # used to generate the re pattern, so the order of keys is important
+    multi_value_operators_map: dict[str, str] = {}  # used to generate the re pattern, so the order of keys is important
+    operators_map: dict[str, str] = {}  # used to generate the re pattern, so the order of keys is important
+
+    logical_operator_pattern = r"\s?(?P<logical_operator>and|or|not|AND|OR|NOT)\s?"
+    field_value_pattern = r"""^___field___\s*___operator___\s*___value___"""
     base_value_pattern = r"(?:___value_pattern___)"
-    _value_pattern = r"""(?:\"|\')*(?P<value>[:a-zA-Z\*0-9=+%#\-_\/\\'\,.&^@!\(\s]*)(?:\*|\'|\"|\s|\$)*"""
-    value_pattern = base_value_pattern.replace('___value_pattern___', _value_pattern)
-    multi_value_pattern = r"""\((?P<value>[:a-zA-Z\"\*0-9=+%#\-_\/\\'\,.&^@!\(\s]*)\)"""
-    keyword_pattern = None  # do not modify, use subclasses to define this attribute
 
-    multi_value_operators = tuple()
+    # do not modify, use subclasses to define these attributes
+    field_pattern: str = None
+    _value_pattern: str = None
+    value_pattern: str = None
+    multi_value_pattern: str = None
+    keyword_pattern: str = None
+
     multi_value_delimiter = ","
     wildcard_symbol = None
 
-    operators_map = {
-        "=": OperatorType.EQ,
-        "in": OperatorType.EQ,
-        "<": OperatorType.LT,
-        "<=": OperatorType.LTE,
-        ">": OperatorType.GT,
-        ">=": OperatorType.GTE,
-        "!=": OperatorType.NEQ,
-        "contains": OperatorType.CONTAINS,
-        "startswith": OperatorType.STARTSWITH,
-        "endswith": OperatorType.ENDSWITH
-    }
-
     def __init_subclass__(cls, **kwargs):
+        cls._validate_re_patterns()
         cls.value_pattern = cls.base_value_pattern.replace('___value_pattern___', cls._value_pattern)
+        cls.operators_map = {**cls.single_value_operators_map, **cls.multi_value_operators_map}
+        cls.operator_pattern = fr"""(?:___field___\s*(?P<operator>(?:{'|'.join(cls.operators_map)})))\s*"""
+
+    @classmethod
+    def _validate_re_patterns(cls):
+        if not all([cls.field_pattern, cls._value_pattern]):
+            raise ValueError(f"{cls.__name__} re patterns must be set")
 
     def map_operator(self, operator: str) -> str:
         try:
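
Note on this hunk: the hard-coded `operators_map` and `match_operator_pattern` are gone; `__init_subclass__` now derives `operators_map` and `operator_pattern` from the two operator maps a subclass declares, after validating the required patterns. A minimal sketch of a concrete subclass (the operator set and patterns below are illustrative, not taken from this commit):

```python
class DemoTokenizer(QueryTokenizer):
    # Key order matters: ">=" must precede ">" so that the generated
    # alternation ">=|>|=|in" tries the longer operator first.
    single_value_operators_map = {
        ">=": OperatorType.GTE,
        ">": OperatorType.GT,
        "=": OperatorType.EQ,
    }
    multi_value_operators_map = {"in": OperatorType.EQ}

    field_pattern = r"(?P<field_name>[a-zA-Z\._\-]+)"
    _value_pattern = r"""(?P<value>\S+)"""
    multi_value_pattern = r"""\((?P<value>[^)]*)\)"""

# __init_subclass__ ran at class-creation time, so:
# DemoTokenizer.operators_map    == {">=": ..., ">": ..., "=": ..., "in": ...}
# DemoTokenizer.operator_pattern == r"(?:___field___\s*(?P<operator>(?:>=|>|=|in)))\s*"
# DemoTokenizer.value_pattern    == r"(?:(?P<value>\S+))"
```

Leaving `field_pattern` or `_value_pattern` unset would raise `ValueError("DemoTokenizer re patterns must be set")` the moment the class is defined, rather than failing later at query time.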
@@ -89,16 +88,16 @@ def search_field(self, query):
     def escape_field_name(self, field_name):
         return field_name.replace(".", r"\.")
 
-    def search_match_operator(self, query, field_name) -> str:
+    def search_operator(self, query, field_name) -> str:
         field_name = self.escape_field_name(field_name)
-        match_operator_pattern = self.match_operator_pattern.replace("___field___", field_name)
-        match_operator_regex = re.compile(match_operator_pattern, re.IGNORECASE)
-        match_operator_search = re.search(match_operator_regex, query)
-        if match_operator_search is None:
+        operator_pattern = self.operator_pattern.replace("___field___", field_name)
+        compiled_operator_regex = re.compile(operator_pattern, re.IGNORECASE)
+        if (operator_search := re.search(compiled_operator_regex, query)) is None:
             raise TokenizerGeneralException(error=f"Operator couldn't be found in query part: {query}")
-        match_operator = match_operator_search.group("match_operator")
-        match_operator = match_operator.strip(" ")
-        return match_operator
+
+        operator = operator_search.group("operator")
+        operator = operator.strip(" ")
+        return operator
 
     def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.EQ) -> Tuple[str, Any]:
         return operator, get_match_group(match, group_name='value')
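
Apart from the rename and the walrus-operator condensation, the behavior of this method is unchanged. Continuing the hypothetical `DemoTokenizer` sketch from above:

```python
tokenizer = DemoTokenizer()
# After the ___field___ placeholder is substituted, the compiled pattern is
# r"(?:status\s*(?P<operator>(?:>=|>|=|in)))\s*", so:
assert tokenizer.search_operator("status >= 400", field_name="status") == ">="
```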
@@ -118,7 +117,7 @@ def search_value(self, query: str, operator: str, field_name: str) -> Tuple[str,
         field_value_pattern = self.get_field_value_pattern(operator, field_name)
         value_pattern = self.value_pattern
         is_multi = False
-        if operator.lower() in self.multi_value_operators:
+        if operator.lower() in self.multi_value_operators_map:
             value_pattern = self.multi_value_pattern
             is_multi = True
 
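
The only change here is the lookup target: `in` on a dict tests its keys, so the membership check behaves exactly as it did against the old `multi_value_operators` tuple. Illustrative values:

```python
multi_value_operators = ("in",)                       # old attribute
multi_value_operators_map = {"in": OperatorType.EQ}   # new attribute
assert ("in" in multi_value_operators) == ("in" in multi_value_operators_map)
```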
@@ -142,7 +141,7 @@ def search_keyword(self, query: str) -> Tuple[Keyword, str]:
 
     def get_field_value_pattern(self, operator, field_name):
         field_value_pattern = self.field_value_pattern.replace("___field___", self.escape_field_name(field_name))
-        return field_value_pattern.replace("___match_operator___", operator)
+        return field_value_pattern.replace("___operator___", operator)
 
     @staticmethod
     def _clean_value(value: str, wildcard_symbol: str) -> str:
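
This follows the rename of the `___match_operator___` placeholder to `___operator___` in `field_value_pattern` (first hunk). Replaying the substitution chain by hand, for a hypothetical field "event.type" and operator "in":

```python
# Mirrors escape_field_name() followed by get_field_value_pattern():
pattern = r"""^___field___\s*___operator___\s*___value___"""
pattern = pattern.replace("___field___", "event.type".replace(".", r"\."))
pattern = pattern.replace("___operator___", "in")
assert pattern == r"""^event\.type\s*in\s*___value___"""
```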
@@ -183,28 +182,45 @@ def create_field(field_name: str, operator: Identifier, value: Union[str, List])
 
     def search_field_value(self, query):
         field_name = self.search_field(query)
-        operator = self.search_match_operator(query, field_name)
+        operator = self.search_operator(query, field_name)
         query, operator, value = self.search_value(query=query, operator=operator, field_name=field_name)
         value, operator_token = self.process_value_wildcard_symbols(value=value,
                                                                     operator=operator,
                                                                     wildcard_symbol=self.wildcard_symbol)
         field = self.create_field(field_name=field_name, operator=operator_token, value=value)
         return field, query
 
-    def __get_identifier(self, query: str) -> Tuple[Union[Field, Keyword, Identifier], str]:
+    def _match_field_value(self, query: str, white_space_pattern: str = r"\s+") -> bool:
+        single_value_operator_group = fr"(?:{'|'.join(self.single_value_operators_map)})"
+        single_value_pattern = fr"""{self.field_pattern}\s*{single_value_operator_group}\s*{self.value_pattern}\s*"""
+        if re.match(single_value_pattern, query, re.IGNORECASE):
+            return True
+
+        if self.multi_value_operators_map:
+            multi_value_operator_group = fr"(?:{'|'.join(self.multi_value_operators_map)})"
+            pattern = f"{self.field_pattern}{white_space_pattern}{multi_value_operator_group}{white_space_pattern}"
+            multi_value_pattern = fr"{pattern}{self.multi_value_pattern}"
+            if re.match(multi_value_pattern, query, re.IGNORECASE):
+                return True
+
+        return False
+
+    def _get_identifier(self, query: str) -> Tuple[Union[Field, Keyword, Identifier], str]:
         query = query.strip("\n").strip(" ").strip("\n")
         if query.startswith(GroupType.L_PAREN):
             return Identifier(token_type=GroupType.L_PAREN), query[1:]
         elif query.startswith(GroupType.R_PAREN):
             return Identifier(token_type=GroupType.R_PAREN), query[1:]
-        elif operator_search := re.match(self.operator_pattern, query):
-            operator = operator_search.group("operator")
-            pos = operator_search.end()
-            return Identifier(token_type=operator.lower()), query[pos:]
+        elif logical_operator_search := re.match(self.logical_operator_pattern, query):
+            logical_operator = logical_operator_search.group("logical_operator")
+            pos = logical_operator_search.end()
+            return Identifier(token_type=logical_operator.lower()), query[pos:]
+        elif self._match_field_value(query):
+            return self.search_field_value(query)
         elif self.keyword_pattern and re.match(self.keyword_pattern, query):
             return self.search_keyword(query)
-        else:
-            return self.search_field_value(query)
+
+        raise TokenizerGeneralException("Unsupported query entry")
 
     @staticmethod
     def _validate_parentheses(tokens):
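
This hunk is the behavioral core of the commit: `_get_identifier` (demangled from `__get_identifier`) no longer falls through to `search_field_value` as a last resort. Field-value pairs are now recognized up front by `_match_field_value`, keywords keep their slot, and anything unrecognized raises instead of producing a garbage token. A sketch of the new dispatch, again assuming the hypothetical `DemoTokenizer` (token reprs simplified):

```python
tokenizer = DemoTokenizer()

# Recognized shapes tokenize as before:
tokenizer.tokenize("status >= 400 and host = web-01")
# -> [Field(status >= 400), Identifier("and"), Field(host = web-01)]

# Unrecognized input now fails fast instead of being force-fed
# into search_field_value:
tokenizer.tokenize("???")  # raises TokenizerGeneralException("Unsupported query entry")
```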
@@ -224,7 +240,7 @@ def _validate_parentheses(tokens):
     def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
         tokenized = []
         while query:
-            identifier, query = self.__get_identifier(query=query)
+            identifier, query = self._get_identifier(query=query)
             tokenized.append(identifier)
         self._validate_parentheses(tokenized)
         return tokenized
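
The single-underscore rename is not cosmetic: double-underscore attributes are name-mangled per class, which blocks the straightforward subclass access and overriding a tokenizer hierarchy needs. A standalone illustration of the mechanism:

```python
class Base:
    def __hidden(self):              # stored on the class as _Base__hidden
        return "base"

class Child(Base):
    def call_it(self):
        return self._Base__hidden()  # reachable only via the mangled name

# Child().call_it() == "base"; writing self.__hidden() inside Child would
# compile to self._Child__hidden and raise AttributeError.
```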