From f72ca55d65c8080d34d78248c0f28a37bda5337c Mon Sep 17 00:00:00 2001
From: vh
Date: Mon, 4 Dec 2023 12:05:18 +0200
Subject: [PATCH] Interpret a space as and

---
 .../app/converter/core/mixins/logic.py        | 27 +++++++++++++++++++
 .../platforms/base/lucene/tokenizer.py        |  7 ++++-
 .../converter/platforms/base/spl/tokenizer.py | 11 ++++++--
 .../converter/platforms/logscale/tokenizer.py |  9 ++++---
 4 files changed, 47 insertions(+), 7 deletions(-)
 create mode 100644 siem-converter/app/converter/core/mixins/logic.py

diff --git a/siem-converter/app/converter/core/mixins/logic.py b/siem-converter/app/converter/core/mixins/logic.py
new file mode 100644
index 00000000..84b26a8e
--- /dev/null
+++ b/siem-converter/app/converter/core/mixins/logic.py
@@ -0,0 +1,27 @@
+from typing import List, Union
+
+from app.converter.core.models.field import Field, Keyword
+from app.converter.core.models.identifier import Identifier
+from app.converter.core.custom_types.tokens import LogicalOperatorType, GroupType
+
+
+class ANDLogicOperatorMixin:
+
+    @staticmethod
+    def get_missed_and_token_indices(tokens: List[Union[Field, Keyword, Identifier]]) -> List[int]:
+        missed_and_indices = []
+        for index in range(len(tokens) - 1):
+            token = tokens[index]
+            next_token = tokens[index + 1]
+            if (isinstance(token, (Field, Keyword))
+                    and not (isinstance(next_token, Identifier) and (
+                        next_token.token_type in LogicalOperatorType
+                        or next_token.token_type == GroupType.R_PAREN))):
+                missed_and_indices.append(index + 1)
+        return list(reversed(missed_and_indices))
+
+    def add_and_token_if_missed(self, tokens: List[Union[Field, Keyword, Identifier]]) -> List[Union[Field, Keyword, Identifier]]:
+        indices = self.get_missed_and_token_indices(tokens=tokens)
+        for index in indices:
+            tokens.insert(index, Identifier(token_type=LogicalOperatorType.AND))
+        return tokens
diff --git a/siem-converter/app/converter/platforms/base/lucene/tokenizer.py b/siem-converter/app/converter/platforms/base/lucene/tokenizer.py
index 0ac47881..d48acfb5 100644
--- a/siem-converter/app/converter/platforms/base/lucene/tokenizer.py
+++ b/siem-converter/app/converter/platforms/base/lucene/tokenizer.py
@@ -20,6 +20,7 @@
 from typing import Tuple, Union, List, Any
 
 from app.converter.core.exceptions.parser import TokenizerGeneralException
+from app.converter.core.mixins.logic import ANDLogicOperatorMixin
 from app.converter.core.models.field import Keyword, Field
 from app.converter.core.models.identifier import Identifier
 from app.converter.core.tokenizer import QueryTokenizer
@@ -27,7 +28,7 @@
 from app.converter.tools.utils import get_match_group
 
 
-class LuceneTokenizer(QueryTokenizer):
+class LuceneTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
     field_pattern = r"(?P<field_name>[a-zA-Z\.\-_]+)"
     match_operator_pattern = r"(?:___field___\s*(?P<match_operator>:))\s*"
 
@@ -107,3 +108,7 @@ def search_keyword(self, query: str) -> Tuple[Keyword, str]:
         keyword = Keyword(value=value)
         pos = keyword_search.end() - 1
         return keyword, query[pos:]
+
+    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
+        tokens = super().tokenize(query=query)
+        return self.add_and_token_if_missed(tokens=tokens)
diff --git a/siem-converter/app/converter/platforms/base/spl/tokenizer.py b/siem-converter/app/converter/platforms/base/spl/tokenizer.py
index e0207cd7..f4f2f127 100644
--- a/siem-converter/app/converter/platforms/base/spl/tokenizer.py
+++ b/siem-converter/app/converter/platforms/base/spl/tokenizer.py
@@ -17,14 +17,17 @@
 """
 import re
-from typing import Tuple, Any
+from typing import Tuple, Any, List, Union
 
+from app.converter.core.mixins.logic import ANDLogicOperatorMixin
+from app.converter.core.models.field import Field, Keyword
+from app.converter.core.models.identifier import Identifier
 from app.converter.core.tokenizer import QueryTokenizer
 from app.converter.core.custom_types.tokens import OperatorType
 from app.converter.tools.utils import get_match_group
 
 
-class SplTokenizer(QueryTokenizer):
+class SplTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
     field_pattern = r"(?P<field_name>[a-zA-Z\.\-_\{\}]+)"
     num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
     double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,;\'\.$&^@!\(\)\{\}\s]|\\\"|\\)*)"\s*'
@@ -51,3 +54,7 @@ def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.E
             return operator, s_q_value
 
         return super().get_operator_and_value(match)
+
+    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
+        tokens = super().tokenize(query=query)
+        return self.add_and_token_if_missed(tokens=tokens)
diff --git a/siem-converter/app/converter/platforms/logscale/tokenizer.py b/siem-converter/app/converter/platforms/logscale/tokenizer.py
index dd665c7b..cba94b07 100644
--- a/siem-converter/app/converter/platforms/logscale/tokenizer.py
+++ b/siem-converter/app/converter/platforms/logscale/tokenizer.py
@@ -17,8 +17,9 @@
 """
 import re
-from typing import Tuple, Any
+from typing import Tuple, Any, List, Union
 
+from app.converter.core.mixins.logic import ANDLogicOperatorMixin
 from app.converter.core.models.field import Keyword, Field
 from app.converter.core.models.identifier import Identifier
 from app.converter.core.custom_types.tokens import GroupType, LogicalOperatorType, OperatorType
@@ -26,7 +27,7 @@
 from app.converter.tools.utils import get_match_group
 
 
-class LogScaleTokenizer(QueryTokenizer):
+class LogScaleTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
     match_operator_pattern = r"""(?:___field___\s?(?P<match_operator>=|!=))\s?"""
     num_value_pattern = r"(?P<num_value>\d+(?:\.\d+)*)\s*"
     double_quotes_value_pattern = r'"(?P<d_q_value>(?:[:a-zA-Z\*0-9=+%#\-_/,\'\.$&^@!\(\)\{\}\s]|\\\"|\\)*)"\s*'
@@ -65,7 +66,7 @@ def __get_identifier(self, query: str) -> (list, str):
         else:
             return self.search_field_value(query)
 
-    def tokenize(self, query: str) -> list:
+    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
         tokenized = []
         while query:
             identifier, query = self.__get_identifier(query=query)
@@ -78,4 +79,4 @@
                 tokenized.append(Identifier(token_type=LogicalOperatorType.AND))
             tokenized.append(identifier)
         self._validate_parentheses(tokenized)
-        return tokenized
+        return self.add_and_token_if_missed(tokens=tokenized)
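
Note for reviewers: below is a minimal, self-contained sketch of what the new ANDLogicOperatorMixin does to a token stream, not the project's actual code. The Field and Identifier classes and the string token-type constants here are hypothetical stand-ins for the models in app.converter.core.models and for LogicalOperatorType/GroupType. The logic mirrors the mixin: whenever a field or keyword token is immediately followed by anything other than a logical operator or a closing parenthesis, an explicit AND identifier is inserted, which is how a bare space between conditions gets interpreted as AND.

from typing import List, Union


class Field:
    """Stand-in for app.converter.core.models.field.Field."""
    def __init__(self, name: str):
        self.name = name


class Identifier:
    """Stand-in for app.converter.core.models.identifier.Identifier."""
    def __init__(self, token_type: str):
        self.token_type = token_type


AND, OR, NOT, R_PAREN = "and", "or", "not", ")"
LOGICAL_OPERATORS = {AND, OR, NOT}


def add_and_token_if_missed(tokens: List[Union[Field, Identifier]]) -> List[Union[Field, Identifier]]:
    # Collect the positions where a field token is followed by anything
    # other than a logical operator or a closing parenthesis.
    missed = []
    for index in range(len(tokens) - 1):
        next_token = tokens[index + 1]
        next_is_operator_or_rparen = isinstance(next_token, Identifier) and (
            next_token.token_type in LOGICAL_OPERATORS or next_token.token_type == R_PAREN
        )
        if isinstance(tokens[index], Field) and not next_is_operator_or_rparen:
            missed.append(index + 1)
    # Insert right-to-left so earlier indices stay valid as the list grows.
    for index in reversed(missed):
        tokens.insert(index, Identifier(token_type=AND))
    return tokens


# A query like 'event_id=1 image="cmd.exe"' tokenizes to two adjacent
# field tokens; the bare space between them becomes an explicit AND.
tokens = add_and_token_if_missed([Field("event_id"), Field("image")])
print([t.name if isinstance(t, Field) else t.token_type for t in tokens])
# -> ['event_id', 'and', 'image']

Inserting from the right is also why get_missed_and_token_indices returns the collected indices reversed: each insertion shifts every later position by one, so walking the positions right-to-left keeps the remaining indices valid.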
