From d9427f5daeea17b746a00382d07023ddc02ca922 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Tue, 7 Jan 2025 10:52:59 -0300 Subject: [PATCH 01/12] add custom mask functionalities --- .../utilities/data_masking/base.py | 129 ++++++++++++++++-- .../utilities/data_masking/provider/base.py | 73 +++++++++- 2 files changed, 186 insertions(+), 16 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 9b80e50bd58..4cebcef37cb 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -1,5 +1,6 @@ from __future__ import annotations +import ast import functools import logging import warnings @@ -94,8 +95,41 @@ def erase(self, data: tuple, fields: list[str]) -> tuple[str]: ... @overload def erase(self, data: dict, fields: list[str]) -> dict: ... - def erase(self, data: Sequence | Mapping, fields: list[str] | None = None) -> str | list[str] | tuple[str] | dict: - return self._apply_action(data=data, fields=fields, action=self.provider.erase) + @overload + def erase( + self, + data: dict, + fields: list[str], + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, + ) -> dict: ... + + def erase( + self, + data: Sequence | Mapping, + fields: list[str] | None = None, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, + masking_rules: dict | None = None, + ) -> str | list[str] | tuple[str] | dict: + if not data: + return data + if masking_rules: + return self._apply_masking_rules(data, masking_rules) + else: + return self._apply_action( + data=data, + fields=fields, + action=self.provider.erase, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, + ) def _apply_action( self, @@ -103,6 +137,10 @@ def _apply_action( fields: list[str] | None, action: Callable, provider_options: dict | None = None, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, **encryption_context: str, ): """ @@ -136,11 +174,23 @@ def _apply_action( fields=fields, action=action, provider_options=provider_options, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, **encryption_context, ) else: logger.debug(f"Running action {action.__name__} with the entire data") - return action(data=data, provider_options=provider_options, **encryption_context) + return action( + data=data, + provider_options=provider_options, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, + **encryption_context, + ) def _apply_action_to_fields( self, @@ -148,6 +198,10 @@ def _apply_action_to_fields( fields: list, action: Callable, provider_options: dict | None = None, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, **encryption_context: str, ) -> dict | str: """ @@ -194,6 +248,8 @@ def _apply_action_to_fields( new_dict = {'a': {'b': {'c': '*****'}}, 'x': {'y': '*****'}} ``` """ + if not fields: + raise ValueError("Fields parameter cannot be empty") data_parsed: dict = self._normalize_data_to_parse(fields, data) @@ -204,6 +260,10 @@ def _apply_action_to_fields( self._call_action, action=action, provider_options=provider_options, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, **encryption_context, # type: ignore[arg-type] ) @@ -225,12 +285,6 @@ def _apply_action_to_fields( # For in-place updates, json_parse accepts a callback function # that receives 3 args: field_value, fields, field_name # We create a partial callback to pre-populate known provider options (action, provider opts, enc ctx) - update_callback = functools.partial( - self._call_action, - action=action, - provider_options=provider_options, - **encryption_context, # type: ignore[arg-type] - ) json_parse.update( data_parsed, @@ -239,6 +293,49 @@ def _apply_action_to_fields( return data_parsed + def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: + """ + Apply masking rules to data, supporting different rules for each field. + """ + result = data.copy() + + for path, rule in masking_rules.items(): + try: + # Handle nested paths (e.g., 'address.street') + parts = path.split(".") + current = result + + for part in parts[:-1]: + if isinstance(current[part], str) and current[part].startswith("{"): + try: + current[part] = ast.literal_eval(current[part]) + except (ValueError, SyntaxError): + continue + current = current[part] + + final_field = parts[-1] + + # Apply masking rule to the target field + if final_field in current: + current[final_field] = self.provider.erase(str(current[final_field]), **rule) + + except (KeyError, TypeError, AttributeError): + # Log warning if field not found or invalid path + warnings.warn(f"Could not apply masking rule for path: {path}", stacklevel=2) + continue + + return result + + def _mask_nested_field(self, data: dict, field_path: str, mask_function): + keys = field_path.split(".") + current = data + for key in keys[:-1]: + current = current.get(key, {}) + if not isinstance(current, dict): + return # Caminho inválido + if keys[-1] in current: + current[keys[-1]] = mask_function(current[keys[-1]]) + @staticmethod def _call_action( field_value: Any, @@ -246,6 +343,10 @@ def _call_action( field_name: str, action: Callable, provider_options: dict[str, Any] | None = None, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, **encryption_context, ) -> None: """ @@ -263,7 +364,15 @@ def _call_action( Returns: - fields[field_name]: Returns the processed field value """ - fields[field_name] = action(field_value, provider_options=provider_options, **encryption_context) + fields[field_name] = action( + field_value, + provider_options=provider_options, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, + **encryption_context, + ) return fields[field_name] def _normalize_data_to_parse(self, fields: list, data: str | dict) -> dict: diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 28bc8384f8d..6a5d806f056 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -2,10 +2,14 @@ import functools import json +import re from typing import Any, Callable, Iterable from aws_lambda_powertools.utilities.data_masking.constants import DATA_MASKING_STRING +PRESERVE_CHARS = set("-_. ") +_regex_cache = {} + class BaseProvider: """ @@ -63,7 +67,16 @@ def decrypt(self, data, provider_options: dict | None = None, **encryption_conte """ raise NotImplementedError("Subclasses must implement decrypt()") - def erase(self, data, **kwargs) -> Iterable[str]: + def erase( + self, + data, + custom_mask: bool | None = None, + mask_pattern: str | None = None, + regex_pattern: str | None = None, + mask_format: str | None = None, + masking_rules: dict | None = None, + **kwargs, + ) -> Iterable[str]: """ This method irreversibly erases data. @@ -72,10 +85,58 @@ def erase(self, data, **kwargs) -> Iterable[str]: If the data to be erased is of an iterable type like `list`, `tuple`, or `set`, this method will return a new object of the same type as the - input data but with each element replaced by the string "*****". + input data but with each element replaced by the string "*****" or following one of the custom masks. """ - if isinstance(data, (str, dict, bytes)): + result = DATA_MASKING_STRING + + if data: + if isinstance(data, str): + if custom_mask: + if mask_pattern: + result = self._pattern_mask(data, mask_pattern) + elif regex_pattern and mask_format: + result = self._regex_mask(data, regex_pattern, mask_format) + else: + result = self._custom_erase(data, **kwargs) + elif isinstance(data, dict): + if masking_rules: + result = self._apply_masking_rules(data, masking_rules) + elif isinstance(data, (list, tuple, set)): + result = type(data)( + self.erase( + item, + custom_mask=custom_mask, + mask_pattern=mask_pattern, + regex_pattern=regex_pattern, + mask_format=mask_format, + masking_rules=masking_rules, + **kwargs, + ) + for item in data + ) + + return result + + def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: + return { + key: self.erase(str(value), **masking_rules[key]) if key in masking_rules else str(value) + for key, value in data.items() + } + + def _pattern_mask(self, data: str, pattern: str) -> str: + return pattern[: len(data)] if len(pattern) >= len(data) else pattern + + def _regex_mask(self, data: str, regex_pattern: str, mask_format: str) -> str: + try: + if regex_pattern not in _regex_cache: + _regex_cache[regex_pattern] = re.compile(regex_pattern) + return _regex_cache[regex_pattern].sub(mask_format, data) + except re.error: return DATA_MASKING_STRING - elif isinstance(data, (list, tuple, set)): - return type(data)([DATA_MASKING_STRING] * len(data)) - return DATA_MASKING_STRING + + def _custom_erase(self, data: str, **kwargs) -> str: + if not data: + return "" + + # Use join with list comprehension instead of building list incrementally + return "".join("*" if char not in PRESERVE_CHARS else char for char in data) From 796bd898de8e95c8342994449e157ccad15d1ce7 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Tue, 7 Jan 2025 14:05:39 -0300 Subject: [PATCH 02/12] change flags name to more intuitive --- .../utilities/data_masking/base.py | 30 +++++++++---------- .../utilities/data_masking/provider/base.py | 17 +++++------ 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 4cebcef37cb..f08e10371f7 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -100,8 +100,8 @@ def erase( self, data: dict, fields: list[str], - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, ) -> dict: ... @@ -110,8 +110,8 @@ def erase( self, data: Sequence | Mapping, fields: list[str] | None = None, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, masking_rules: dict | None = None, @@ -125,8 +125,8 @@ def erase( data=data, fields=fields, action=self.provider.erase, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, ) @@ -137,8 +137,8 @@ def _apply_action( fields: list[str] | None, action: Callable, provider_options: dict | None = None, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, **encryption_context: str, @@ -174,8 +174,8 @@ def _apply_action( fields=fields, action=action, provider_options=provider_options, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, **encryption_context, @@ -185,8 +185,8 @@ def _apply_action( return action( data=data, provider_options=provider_options, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, **encryption_context, @@ -198,8 +198,8 @@ def _apply_action_to_fields( fields: list, action: Callable, provider_options: dict | None = None, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, **encryption_context: str, @@ -260,8 +260,8 @@ def _apply_action_to_fields( self._call_action, action=action, provider_options=provider_options, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, **encryption_context, # type: ignore[arg-type] @@ -343,8 +343,8 @@ def _call_action( field_name: str, action: Callable, provider_options: dict[str, Any] | None = None, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, **encryption_context, @@ -367,8 +367,8 @@ def _call_action( fields[field_name] = action( field_value, provider_options=provider_options, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, **encryption_context, diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 6a5d806f056..4337a0e6502 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -70,8 +70,8 @@ def decrypt(self, data, provider_options: dict | None = None, **encryption_conte def erase( self, data, - custom_mask: bool | None = None, - mask_pattern: str | None = None, + dynamic_mask: bool | None = None, + custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, masking_rules: dict | None = None, @@ -91,13 +91,12 @@ def erase( if data: if isinstance(data, str): + if dynamic_mask: + result = self._custom_erase(data, **kwargs) if custom_mask: - if mask_pattern: - result = self._pattern_mask(data, mask_pattern) - elif regex_pattern and mask_format: - result = self._regex_mask(data, regex_pattern, mask_format) - else: - result = self._custom_erase(data, **kwargs) + result = self._pattern_mask(data, custom_mask) + if regex_pattern and mask_format: + result = self._regex_mask(data, regex_pattern, mask_format) elif isinstance(data, dict): if masking_rules: result = self._apply_masking_rules(data, masking_rules) @@ -105,8 +104,8 @@ def erase( result = type(data)( self.erase( item, + dynamic_mask=dynamic_mask, custom_mask=custom_mask, - mask_pattern=mask_pattern, regex_pattern=regex_pattern, mask_format=mask_format, masking_rules=masking_rules, From d9319179fa9bacde7f10e0e301a980b4c4ebf0c1 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Tue, 7 Jan 2025 18:50:30 -0300 Subject: [PATCH 03/12] fix type check error --- .../utilities/data_masking/provider/base.py | 81 ++++++++++++------- 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 4337a0e6502..382264c220e 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -3,8 +3,9 @@ import functools import json import re -from typing import Any, Callable, Iterable +from typing import Any, Callable +# , Iterable from aws_lambda_powertools.utilities.data_masking.constants import DATA_MASKING_STRING PRESERVE_CHARS = set("-_. ") @@ -69,14 +70,14 @@ def decrypt(self, data, provider_options: dict | None = None, **encryption_conte def erase( self, - data, + data: Any, dynamic_mask: bool | None = None, custom_mask: str | None = None, regex_pattern: str | None = None, mask_format: str | None = None, masking_rules: dict | None = None, **kwargs, - ) -> Iterable[str]: + ) -> str | dict | list | tuple | set: """ This method irreversibly erases data. @@ -85,47 +86,68 @@ def erase( If the data to be erased is of an iterable type like `list`, `tuple`, or `set`, this method will return a new object of the same type as the - input data but with each element replaced by the string "*****" or following one of the custom masks. + input data but with each element masked according to the specified rules. """ - result = DATA_MASKING_STRING - - if data: - if isinstance(data, str): - if dynamic_mask: - result = self._custom_erase(data, **kwargs) - if custom_mask: - result = self._pattern_mask(data, custom_mask) - if regex_pattern and mask_format: - result = self._regex_mask(data, regex_pattern, mask_format) - elif isinstance(data, dict): - if masking_rules: - result = self._apply_masking_rules(data, masking_rules) - elif isinstance(data, (list, tuple, set)): - result = type(data)( - self.erase( - item, - dynamic_mask=dynamic_mask, - custom_mask=custom_mask, - regex_pattern=regex_pattern, - mask_format=mask_format, - masking_rules=masking_rules, - **kwargs, - ) - for item in data + result = None + + # Handle empty or None data + if not data: + result = DATA_MASKING_STRING if isinstance(data, (str, bytes)) else data + + # Handle string data + elif isinstance(data, str): + if regex_pattern and mask_format: + result = self._regex_mask(data, regex_pattern, mask_format) + elif custom_mask: + result = self._pattern_mask(data, custom_mask) + elif dynamic_mask: + result = self._custom_erase(data, **kwargs) + else: + result = DATA_MASKING_STRING + + # Handle dictionary data + elif isinstance(data, dict): + result = ( + self._apply_masking_rules(data, masking_rules) + if masking_rules + else {k: DATA_MASKING_STRING for k in data} + ) + + # Handle iterable data (list, tuple, set) + elif isinstance(data, (list, tuple, set)): + masked_data = ( + self.erase( + item, + dynamic_mask=dynamic_mask, + custom_mask=custom_mask, + regex_pattern=regex_pattern, + mask_format=mask_format, + masking_rules=masking_rules, + **kwargs, ) + for item in data + ) + result = type(data)(masked_data) + + # Default case + else: + result = DATA_MASKING_STRING return result def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: + """Apply masking rules to dictionary data.""" return { key: self.erase(str(value), **masking_rules[key]) if key in masking_rules else str(value) for key, value in data.items() } def _pattern_mask(self, data: str, pattern: str) -> str: + """Apply pattern masking to string data.""" return pattern[: len(data)] if len(pattern) >= len(data) else pattern def _regex_mask(self, data: str, regex_pattern: str, mask_format: str) -> str: + """Apply regex masking to string data.""" try: if regex_pattern not in _regex_cache: _regex_cache[regex_pattern] = re.compile(regex_pattern) @@ -137,5 +159,4 @@ def _custom_erase(self, data: str, **kwargs) -> str: if not data: return "" - # Use join with list comprehension instead of building list incrementally return "".join("*" if char not in PRESERVE_CHARS else char for char in data) From 4c0070c30050749d4b570f0fc10916624ee52081 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 8 Jan 2025 13:55:54 -0300 Subject: [PATCH 04/12] add draft documentation --- docs/utilities/data_masking.md | 21 +++++++++- .../data_masking/src/custom_data_masking.py | 38 +++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 examples/data_masking/src/custom_data_masking.py diff --git a/docs/utilities/data_masking.md b/docs/utilities/data_masking.md index 162292e79a0..b1485dac6df 100644 --- a/docs/utilities/data_masking.md +++ b/docs/utilities/data_masking.md @@ -43,7 +43,7 @@ stateDiagram-v2 ## Terminology -**Erasing** replaces sensitive information **irreversibly** with a non-sensitive placeholder _(`*****`)_. This operation replaces data in-memory, making it a one-way action. +**Erasing** replaces sensitive information **irreversibly** with a non-sensitive placeholder _(`*****`)_, or with a customized mask. This operation replaces data in-memory, making it a one-way action. **Encrypting** transforms plaintext into ciphertext using an encryption algorithm and a cryptographic key. It allows you to encrypt any sensitive data, so only allowed personnel to decrypt it. Learn more about encryption [here](https://aws.amazon.com/blogs/security/importance-of-encryption-and-how-aws-can-help/){target="_blank"}. @@ -117,6 +117,25 @@ Erasing will remove the original data and replace it with a `*****`. This means --8<-- "examples/data_masking/src/getting_started_erase_data_output.json" ``` +The `erase` method also supports additional flags for more advanced and flexible masking: + +| Flag | Behavior | +| ---------------- | ----------------------------------------------------------| +| `dynamic_mask`(bool) | When set to `True`, this flag enables custom masking behavior. It activates the use of advanced masking techniques such as pattern-based or regex-based masking.| +| `custom_mask`(str) | Specifies a simple pattern for masking data. This pattern is applied directly to the input string, replacing all the original characters. For example, with a `custom_mask` of "XX-XX" applied to "12345", the result would be "XX-XX".| +| `regex_pattern`(str) | Defines a regular expression pattern used to identify parts of the input string that should be masked. This allows for more complex and flexible masking rules. It's used in conjunction with `mask_format`.| +| `mask_format`(str) | Specifies the format to use when replacing parts of the string matched by `regex_pattern`. It can include placeholders (like \1, \2) to refer to captured groups in the regex pattern, allowing some parts of the original string to be preserved.| +| `masking_rules`(dict) | Allows you to apply different masking rules (flags) for each data field.| + +=== "custom_data_masking.py" + ```python hl_lines="13 17 21 25 36" + --8<-- "examples/data_masking/src/custom_data_masking.py" + ``` +=== "generic_data_input.json" + ```json hl_lines="6 7 9 12" + --8<-- "examples/data_masking/src/generic_data_input.json" + ``` + ### Encrypting data ???+ note "About static typing and encryption" diff --git a/examples/data_masking/src/custom_data_masking.py b/examples/data_masking/src/custom_data_masking.py new file mode 100644 index 00000000000..a99b9045cac --- /dev/null +++ b/examples/data_masking/src/custom_data_masking.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from aws_lambda_powertools.utilities.data_masking import DataMasking +from aws_lambda_powertools.utilities.typing import LambdaContext + +data_masker = DataMasking() + + +def lambda_handler(event: dict, context: LambdaContext) -> dict: + data: dict = event.get("body", {}) + + # Default erase (*****) + default_erased = data_masker.erase(data, fields=["address.zip"]) + # 'street': '*****' + + # dynamic_mask + dynamic_mask = data_masker.erase(data, fields=["address.zip"], dynamic_mask=True) + #'street': '*** **** **' + + # custom_mask + custom_mask = data_masker.erase(data, fields=["address.zip"], custom_mask="XX") + #'zip': 'XX' + + # regex_pattern and mask_format + regex_pattern = data_masker.erase(data, fields=["email"], regex_pattern=r"(.)(.*)(@.*)", mask_format=r"\1****\3") + #'email': 'j****@example.com' + + # Masking rules for each field + masking_rules = { + "email": {"regex_pattern": "(.)(.*)(@.*)", "mask_format": r"\1****\3"}, + "age": {"dynamic_mask": True}, + "address.zip": {"dynamic_mask": True, "custom_mask": "xxx"}, + "address.street": {"dynamic_mask": False}, + } + + masking_rules_erase = data_masker.erase(data, masking_rules=masking_rules) + + return default_erased, dynamic_mask, custom_mask, regex_pattern, masking_rules_erase From ae81dce2d47967145b9f2a223356996a986ea0fb Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 8 Jan 2025 16:43:16 -0300 Subject: [PATCH 05/12] change doc examples --- .../utilities/data_masking/base.py | 3 ++ docs/utilities/data_masking.md | 43 +++++++++++++------ .../data_masking/src/custom_data_masking.py | 20 +-------- 3 files changed, 34 insertions(+), 32 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index f08e10371f7..7695b41bd6b 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -95,6 +95,9 @@ def erase(self, data: tuple, fields: list[str]) -> tuple[str]: ... @overload def erase(self, data: dict, fields: list[str]) -> dict: ... + @overload + def erase(self, data: dict[Any, Any], *, masking_rules: dict[str, object]) -> dict[Any, Any]: ... + @overload def erase( self, diff --git a/docs/utilities/data_masking.md b/docs/utilities/data_masking.md index b1485dac6df..c90abfc236e 100644 --- a/docs/utilities/data_masking.md +++ b/docs/utilities/data_masking.md @@ -119,22 +119,37 @@ Erasing will remove the original data and replace it with a `*****`. This means The `erase` method also supports additional flags for more advanced and flexible masking: -| Flag | Behavior | -| ---------------- | ----------------------------------------------------------| -| `dynamic_mask`(bool) | When set to `True`, this flag enables custom masking behavior. It activates the use of advanced masking techniques such as pattern-based or regex-based masking.| -| `custom_mask`(str) | Specifies a simple pattern for masking data. This pattern is applied directly to the input string, replacing all the original characters. For example, with a `custom_mask` of "XX-XX" applied to "12345", the result would be "XX-XX".| -| `regex_pattern`(str) | Defines a regular expression pattern used to identify parts of the input string that should be masked. This allows for more complex and flexible masking rules. It's used in conjunction with `mask_format`.| -| `mask_format`(str) | Specifies the format to use when replacing parts of the string matched by `regex_pattern`. It can include placeholders (like \1, \2) to refer to captured groups in the regex pattern, allowing some parts of the original string to be preserved.| -| `masking_rules`(dict) | Allows you to apply different masking rules (flags) for each data field.| - -=== "custom_data_masking.py" - ```python hl_lines="13 17 21 25 36" +=== "dynamic_mask" + + (bool) When set to `True`, this flag enables custom masking behavior. It activates the use of advanced masking techniques such as pattern-based or regex-based masking. + + > Expression: `data_masker.erase(data, fields=["address.zip"], dynamic_mask=True)` + + > Field result: `'street': '*** **** **'` + +=== "custom_mask" + + (str) Specifies a simple pattern for masking data. This pattern is applied directly to the input string, replacing all the original characters. For example, with a `custom_mask` of "XX-XX" applied to "12345", the result would be "XX-XX". + + > Expression: `data_masker.erase(data, fields=["address.zip"], custom_mask="XX")` + + > Field result: `'zip': 'XX'` + +=== "regex_pattern & mask_format" + + (str) `regex_pattern` defines a regular expression pattern used to identify parts of the input string that should be masked. This allows for more complex and flexible masking rules. It's used in conjunction with `mask_format`. + `mask_format` specifies the format to use when replacing parts of the string matched by `regex_pattern`. It can include placeholders (like \1, \2) to refer to captured groups in the regex pattern, allowing some parts of the original string to be preserved. + + > Expression: `data_masker.erase(data, fields=["email"], regex_pattern=r"(.)(.*)(@.*)", mask_format=r"\1****\3")` + + > Field result: `'email': 'j****@example.com'` + +=== "masking_rules" + + (dict) Allows you to apply different masking rules (flags) for each data field. + ```python hl_lines="20" --8<-- "examples/data_masking/src/custom_data_masking.py" ``` -=== "generic_data_input.json" - ```json hl_lines="6 7 9 12" - --8<-- "examples/data_masking/src/generic_data_input.json" - ``` ### Encrypting data diff --git a/examples/data_masking/src/custom_data_masking.py b/examples/data_masking/src/custom_data_masking.py index a99b9045cac..24a5d51bc81 100644 --- a/examples/data_masking/src/custom_data_masking.py +++ b/examples/data_masking/src/custom_data_masking.py @@ -9,22 +9,6 @@ def lambda_handler(event: dict, context: LambdaContext) -> dict: data: dict = event.get("body", {}) - # Default erase (*****) - default_erased = data_masker.erase(data, fields=["address.zip"]) - # 'street': '*****' - - # dynamic_mask - dynamic_mask = data_masker.erase(data, fields=["address.zip"], dynamic_mask=True) - #'street': '*** **** **' - - # custom_mask - custom_mask = data_masker.erase(data, fields=["address.zip"], custom_mask="XX") - #'zip': 'XX' - - # regex_pattern and mask_format - regex_pattern = data_masker.erase(data, fields=["email"], regex_pattern=r"(.)(.*)(@.*)", mask_format=r"\1****\3") - #'email': 'j****@example.com' - # Masking rules for each field masking_rules = { "email": {"regex_pattern": "(.)(.*)(@.*)", "mask_format": r"\1****\3"}, @@ -33,6 +17,6 @@ def lambda_handler(event: dict, context: LambdaContext) -> dict: "address.street": {"dynamic_mask": False}, } - masking_rules_erase = data_masker.erase(data, masking_rules=masking_rules) + result = data_masker.erase(data, masking_rules=masking_rules) - return default_erased, dynamic_mask, custom_mask, regex_pattern, masking_rules_erase + return result From 7630b068ed0f9bffa2894f7dc8f0c86c04bba134 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Thu, 9 Jan 2025 08:30:26 -0300 Subject: [PATCH 06/12] style: format code with black --- .../utilities/data_masking/base.py | 2 +- .../utilities/data_masking/provider/base.py | 35 ++++++++++++------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 7695b41bd6b..8136c8bcaaf 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -335,7 +335,7 @@ def _mask_nested_field(self, data: dict, field_path: str, mask_function): for key in keys[:-1]: current = current.get(key, {}) if not isinstance(current, dict): - return # Caminho inválido + return if keys[-1] in current: current[keys[-1]] = mask_function(current[keys[-1]]) diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 382264c220e..6fa5648e7bc 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -91,11 +91,11 @@ def erase( result = None # Handle empty or None data - if not data: - result = DATA_MASKING_STRING if isinstance(data, (str, bytes)) else data + if data is None or (isinstance(data, (str, list, dict)) and not data): + return data # Handle string data - elif isinstance(data, str): + if isinstance(data, str): if regex_pattern and mask_format: result = self._regex_mask(data, regex_pattern, mask_format) elif custom_mask: @@ -107,15 +107,24 @@ def erase( # Handle dictionary data elif isinstance(data, dict): - result = ( - self._apply_masking_rules(data, masking_rules) - if masking_rules - else {k: DATA_MASKING_STRING for k in data} - ) + if masking_rules: + result = self._apply_masking_rules(data, masking_rules) + else: + result = {} + for k, v in data.items(): + result[str(k)] = self.erase( + str(v), + dynamic_mask=dynamic_mask, + custom_mask=custom_mask, + regex_pattern=regex_pattern, + mask_format=mask_format, + masking_rules=masking_rules, + **kwargs, + ) # Handle iterable data (list, tuple, set) elif isinstance(data, (list, tuple, set)): - masked_data = ( + masked_data = [ self.erase( item, dynamic_mask=dynamic_mask, @@ -126,16 +135,16 @@ def erase( **kwargs, ) for item in data - ) + ] result = type(data)(masked_data) - # Default case + # Handle other types (int, float, bool, etc.) else: - result = DATA_MASKING_STRING + result = str(data) return result - def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: + def _apply_masking_rules(self, data: dict, masking_rules: dict) -> Any: """Apply masking rules to dictionary data.""" return { key: self.erase(str(value), **masking_rules[key]) if key in masking_rules else str(value) From 6e2ec354612b44ceede4c56bc4d56bb602ecc7e4 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Thu, 9 Jan 2025 08:49:17 -0300 Subject: [PATCH 07/12] fix format base --- .../utilities/data_masking/provider/base.py | 138 +++++++++++------- 1 file changed, 85 insertions(+), 53 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 6fa5648e7bc..47079c42484 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -5,7 +5,6 @@ import re from typing import Any, Callable -# , Iterable from aws_lambda_powertools.utilities.data_masking.constants import DATA_MASKING_STRING PRESERVE_CHARS = set("-_. ") @@ -77,56 +76,72 @@ def erase( mask_format: str | None = None, masking_rules: dict | None = None, **kwargs, - ) -> str | dict | list | tuple | set: - """ - This method irreversibly erases data. - - If the data to be erased is of type `str`, `dict`, or `bytes`, - this method will return an erased string, i.e. "*****". - - If the data to be erased is of an iterable type like `list`, `tuple`, - or `set`, this method will return a new object of the same type as the - input data but with each element masked according to the specified rules. - """ - result = None - + ) -> Any: # Handle empty or None data if data is None or (isinstance(data, (str, list, dict)) and not data): return data - # Handle string data - if isinstance(data, str): - if regex_pattern and mask_format: - result = self._regex_mask(data, regex_pattern, mask_format) - elif custom_mask: - result = self._pattern_mask(data, custom_mask) - elif dynamic_mask: - result = self._custom_erase(data, **kwargs) - else: - result = DATA_MASKING_STRING - - # Handle dictionary data + result = data # Default to returning the original data + + if isinstance(data, (str, int, float)): + result = self._mask_primitive(str(data), dynamic_mask, custom_mask, regex_pattern, mask_format, **kwargs) elif isinstance(data, dict): - if masking_rules: - result = self._apply_masking_rules(data, masking_rules) - else: - result = {} - for k, v in data.items(): - result[str(k)] = self.erase( - str(v), - dynamic_mask=dynamic_mask, - custom_mask=custom_mask, - regex_pattern=regex_pattern, - mask_format=mask_format, - masking_rules=masking_rules, - **kwargs, - ) - - # Handle iterable data (list, tuple, set) + result = self._mask_dict( + data, + dynamic_mask, + custom_mask, + regex_pattern, + mask_format, + masking_rules, + **kwargs, + ) elif isinstance(data, (list, tuple, set)): - masked_data = [ - self.erase( - item, + result = self._mask_iterable( + data, + dynamic_mask, + custom_mask, + regex_pattern, + mask_format, + masking_rules, + **kwargs, + ) + + return result + + def _mask_primitive( + self, + data: str, + dynamic_mask: bool | None, + custom_mask: str | None, + regex_pattern: str | None, + mask_format: str | None, + **kwargs, + ) -> str: + if regex_pattern and mask_format: + return self._regex_mask(data, regex_pattern, mask_format) + elif custom_mask: + return self._pattern_mask(data, custom_mask) + elif dynamic_mask: + return self._custom_erase(data, **kwargs) + else: + return DATA_MASKING_STRING + + def _mask_dict( + self, + data: dict, + dynamic_mask: bool | None, + custom_mask: str | None, + regex_pattern: str | None, + mask_format: str | None, + masking_rules: dict | None, + **kwargs, + ) -> dict: + if masking_rules: + return self._apply_masking_rules(data, masking_rules) + else: + return { + k: self.erase( + v, dynamic_mask=dynamic_mask, custom_mask=custom_mask, regex_pattern=regex_pattern, @@ -134,15 +149,32 @@ def erase( masking_rules=masking_rules, **kwargs, ) - for item in data - ] - result = type(data)(masked_data) + for k, v in data.items() + } - # Handle other types (int, float, bool, etc.) - else: - result = str(data) - - return result + def _mask_iterable( + self, + data: list | tuple | set, + dynamic_mask: bool | None, + custom_mask: str | None, + regex_pattern: str | None, + mask_format: str | None, + masking_rules: dict | None, + **kwargs, + ) -> list | tuple | set: + masked_data = [ + self.erase( + item, + dynamic_mask=dynamic_mask, + custom_mask=custom_mask, + regex_pattern=regex_pattern, + mask_format=mask_format, + masking_rules=masking_rules, + **kwargs, + ) + for item in data + ] + return type(data)(masked_data) def _apply_masking_rules(self, data: dict, masking_rules: dict) -> Any: """Apply masking rules to dictionary data.""" From 93c1544fd31cb282ba15fc86de3b76fd53fbf02c Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Thu, 9 Jan 2025 11:39:39 -0300 Subject: [PATCH 08/12] add tests for new masks --- .../utilities/data_masking/base.py | 2 - .../utilities/data_masking/provider/base.py | 15 ++++--- .../test_unit_data_masking.py | 43 +++++++++++++++++++ 3 files changed, 53 insertions(+), 7 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 8136c8bcaaf..23b7a684dde 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -119,8 +119,6 @@ def erase( mask_format: str | None = None, masking_rules: dict | None = None, ) -> str | list[str] | tuple[str] | dict: - if not data: - return data if masking_rules: return self._apply_masking_rules(data, masking_rules) else: diff --git a/aws_lambda_powertools/utilities/data_masking/provider/base.py b/aws_lambda_powertools/utilities/data_masking/provider/base.py index 47079c42484..02e6406b862 100644 --- a/aws_lambda_powertools/utilities/data_masking/provider/base.py +++ b/aws_lambda_powertools/utilities/data_masking/provider/base.py @@ -77,11 +77,16 @@ def erase( masking_rules: dict | None = None, **kwargs, ) -> Any: - # Handle empty or None data - if data is None or (isinstance(data, (str, list, dict)) and not data): - return data - result = data # Default to returning the original data + result = DATA_MASKING_STRING + + if not any([dynamic_mask, custom_mask, regex_pattern, mask_format, masking_rules]): + if isinstance(data, (str, int, float, dict, bytes)): + return DATA_MASKING_STRING + elif isinstance(data, (list, tuple, set)): + return type(data)([DATA_MASKING_STRING] * len(data)) + else: + return DATA_MASKING_STRING if isinstance(data, (str, int, float)): result = self._mask_primitive(str(data), dynamic_mask, custom_mask, regex_pattern, mask_format, **kwargs) @@ -194,7 +199,7 @@ def _regex_mask(self, data: str, regex_pattern: str, mask_format: str) -> str: _regex_cache[regex_pattern] = re.compile(regex_pattern) return _regex_cache[regex_pattern].sub(mask_format, data) except re.error: - return DATA_MASKING_STRING + return data def _custom_erase(self, data: str, **kwargs) -> str: if not data: diff --git a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py index 4fbbc188ceb..cd728904cc7 100644 --- a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py +++ b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py @@ -25,6 +25,16 @@ def test_erase_int(data_masker): assert erased_string == DATA_MASKING_STRING +def test_erase_int_custom_mask(data_masker): + # GIVEN an int data type + + # WHEN erase is called with no fields argument + erased_string = data_masker.erase(42, custom_mask="XX") + + # THEN the result is the data masked + assert erased_string == "XX" + + def test_erase_float(data_masker): # GIVEN a float data type @@ -205,3 +215,36 @@ def test_parsing_nonexistent_fields_warning_on_missing_field(): # THEN the "erased" payload is the same of the original assert masked_json_string == data + + +def test_regex_mask(data_masker): + data = "Hello! My name is Fulano Ciclano" + regex_pattern = r"\b[A-Z][a-z]+ [A-Z][a-z]+\b" + mask_format = "XXXX XXXX" + + result = data_masker.erase(data, regex_pattern=regex_pattern, mask_format=mask_format) + + assert result == "Hello! My name is XXXX XXXX" + + +def test_erase_json_dict_with_fields_and_masks(data_masker): + # GIVEN the data type is a json representation of a dictionary + data = json.dumps( + { + "a": { + "1": {"None": "hello", "four": "world"}, + "b": {"3": {"4": "goodbye", "e": "world"}}, + }, + }, + ) + + # WHEN erase is called with a list of fields specified + masked_json_string = data_masker.erase(data, fields=["a.'1'.None", "a..'4'"], dynamic_mask=True) + + # THEN the result is only the specified fields are erased + assert masked_json_string == { + "a": { + "1": {"None": "*****", "four": "world"}, + "b": {"3": {"4": "*******", "e": "world"}}, + }, + } From 92d474020b65100fb23d4903fd51e42a5f38ce0c Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 15 Jan 2025 11:22:39 -0300 Subject: [PATCH 09/12] sub header for custom mask in docs --- docs/utilities/data_masking.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/utilities/data_masking.md b/docs/utilities/data_masking.md index c90abfc236e..596fa2c3fa3 100644 --- a/docs/utilities/data_masking.md +++ b/docs/utilities/data_masking.md @@ -117,6 +117,8 @@ Erasing will remove the original data and replace it with a `*****`. This means --8<-- "examples/data_masking/src/getting_started_erase_data_output.json" ``` +#### Custom masking + The `erase` method also supports additional flags for more advanced and flexible masking: === "dynamic_mask" From d9535d6ff78ee638c5ea9b4813c16b64c8dd8bdc Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 15 Jan 2025 13:03:04 -0300 Subject: [PATCH 10/12] masking rules to handle complex nest --- .../utilities/data_masking/base.py | 67 ++++++++++++------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 23b7a684dde..0dd41522d61 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -1,6 +1,5 @@ from __future__ import annotations -import ast import functools import logging import warnings @@ -296,33 +295,55 @@ def _apply_action_to_fields( def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: """ - Apply masking rules to data, supporting different rules for each field. + Apply masking rules to data, supporting both simple field names and complex path expressions. + + Args: + data: The dictionary containing data to mask + masking_rules: Dictionary mapping field names or path expressions to masking rules + + Returns: + dict: The masked data dictionary """ result = data.copy() for path, rule in masking_rules.items(): try: - # Handle nested paths (e.g., 'address.street') - parts = path.split(".") - current = result - - for part in parts[:-1]: - if isinstance(current[part], str) and current[part].startswith("{"): - try: - current[part] = ast.literal_eval(current[part]) - except (ValueError, SyntaxError): - continue - current = current[part] - - final_field = parts[-1] - - # Apply masking rule to the target field - if final_field in current: - current[final_field] = self.provider.erase(str(current[final_field]), **rule) - - except (KeyError, TypeError, AttributeError): - # Log warning if field not found or invalid path - warnings.warn(f"Could not apply masking rule for path: {path}", stacklevel=2) + if ".." in path: + # Handle recursive descent paths (e.g., "address..name") + base_path, field = path.split("..") + jsonpath_expr = parse(f"$.{base_path}..{field}") + elif "[" in path: + # Handle array notation paths (e.g., "address[*].street") + jsonpath_expr = parse(f"$.{path}") + else: + # Handle simple field names (e.g., "email") + jsonpath_expr = parse(f"$.{path}") + + matches = jsonpath_expr.find(result) + + if not matches: + warnings.warn(f"No matches found for path: {path}", stacklevel=2) + continue + + for match in matches: + try: + value = match.value + if value is not None: + if isinstance(value, dict): + # Handle dictionary values by masking each field + for k, v in value.items(): + if v is not None: + value[k] = self.provider.erase(str(v), **rule) + else: + masked_value = self.provider.erase(str(value), **rule) + match.full_path.update(result, masked_value) + + except Exception as e: + warnings.warn(f"Error masking value for path {path}: {str(e)}", stacklevel=2) + continue + + except Exception as e: + warnings.warn(f"Error processing path {path}: {str(e)}", stacklevel=2) continue return result From 9dc2b562e685d1ec1f37e691403c32eb7644f8c0 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Wed, 15 Jan 2025 13:18:25 -0300 Subject: [PATCH 11/12] add test for masking rules --- .../test_unit_data_masking.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py index cd728904cc7..8eb0f955958 100644 --- a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py +++ b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py @@ -248,3 +248,37 @@ def test_erase_json_dict_with_fields_and_masks(data_masker): "b": {"3": {"4": "*******", "e": "world"}}, }, } + + +def test_erase_json_dict_with_complex_masking_rules(data_masker): + # GIVEN the data type is a json representation of a dictionary with nested and filtered paths + data = json.dumps( + { + "email": "john.doe@example.com", + "age": 30, + "addres": [ + {"postcode": 13000, "street": "123 Main St", "details": {"name": "Home", "type": "Primary"}}, + {"postcode": 14000, "street": "456 Other Street", "details": {"name": "Office", "type": "Secondary"}}, + ], + }, + ) + + # WHEN erase is called with complex masking rules + masking_rules = { + "email": {"regex_pattern": "(.)(.*)(@.*)", "mask_format": r"\1****\3"}, + "age": {"dynamic_mask": True}, + "addres..name": {"custom_mask": "xxx"}, + "addres[?(@.postcode > 12000)]": {"dynamic_mask": True}, + } + + masked_json_string = data_masker.erase(data, masking_rules=masking_rules) + + # THEN the result should have all specified fields masked according to their rules + assert masked_json_string == { + "email": "j****@example.com", + "age": "*****", + "addres": [ + {"postcode": "*****", "street": "*** *** **", "details": {"name": "xxx", "type": "*******"}}, + {"postcode": "*****", "street": "*** ***** ******", "details": {"name": "xxx", "type": "********"}}, + ], + } From 63c7918876c596c1b1d890885b2cee19a9d80a29 Mon Sep 17 00:00:00 2001 From: Ana Falcao Date: Fri, 31 Jan 2025 09:33:55 -0300 Subject: [PATCH 12/12] modifications based on the feedback --- .../utilities/data_masking/base.py | 25 +++----------- docs/utilities/data_masking.md | 12 ++++++- .../data_masking/src/custom_data_masking.py | 4 +-- .../src/output_custom_masking.json | 29 ++++++++++++++++ .../src/payload_custom_masking.json | 34 +++++++++++++++++++ .../test_unit_data_masking.py | 6 +++- 6 files changed, 86 insertions(+), 24 deletions(-) create mode 100644 examples/data_masking/src/output_custom_masking.json create mode 100644 examples/data_masking/src/payload_custom_masking.json diff --git a/aws_lambda_powertools/utilities/data_masking/base.py b/aws_lambda_powertools/utilities/data_masking/base.py index 0dd41522d61..00650789696 100644 --- a/aws_lambda_powertools/utilities/data_masking/base.py +++ b/aws_lambda_powertools/utilities/data_masking/base.py @@ -3,6 +3,7 @@ import functools import logging import warnings +from copy import deepcopy from typing import TYPE_CHECKING, Any, Callable, Mapping, Sequence, overload from jsonpath_ng.ext import parse @@ -304,21 +305,11 @@ def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: Returns: dict: The masked data dictionary """ - result = data.copy() + result = deepcopy(data) for path, rule in masking_rules.items(): try: - if ".." in path: - # Handle recursive descent paths (e.g., "address..name") - base_path, field = path.split("..") - jsonpath_expr = parse(f"$.{base_path}..{field}") - elif "[" in path: - # Handle array notation paths (e.g., "address[*].street") - jsonpath_expr = parse(f"$.{path}") - else: - # Handle simple field names (e.g., "email") - jsonpath_expr = parse(f"$.{path}") - + jsonpath_expr = parse(f"$.{path}") matches = jsonpath_expr.find(result) if not matches: @@ -329,14 +320,8 @@ def _apply_masking_rules(self, data: dict, masking_rules: dict) -> dict: try: value = match.value if value is not None: - if isinstance(value, dict): - # Handle dictionary values by masking each field - for k, v in value.items(): - if v is not None: - value[k] = self.provider.erase(str(v), **rule) - else: - masked_value = self.provider.erase(str(value), **rule) - match.full_path.update(result, masked_value) + masked_value = self.provider.erase(str(value), **rule) + match.full_path.update(result, masked_value) except Exception as e: warnings.warn(f"Error masking value for path {path}: {str(e)}", stacklevel=2) diff --git a/docs/utilities/data_masking.md b/docs/utilities/data_masking.md index 596fa2c3fa3..94e470aa965 100644 --- a/docs/utilities/data_masking.md +++ b/docs/utilities/data_masking.md @@ -123,7 +123,7 @@ The `erase` method also supports additional flags for more advanced and flexible === "dynamic_mask" - (bool) When set to `True`, this flag enables custom masking behavior. It activates the use of advanced masking techniques such as pattern-based or regex-based masking. + (bool) Enables dynamic masking behavior when set to `True`, by maintaining the original length and structure of the text replacing with *. > Expression: `data_masker.erase(data, fields=["address.zip"], dynamic_mask=True)` @@ -152,6 +152,16 @@ The `erase` method also supports additional flags for more advanced and flexible ```python hl_lines="20" --8<-- "examples/data_masking/src/custom_data_masking.py" ``` +=== "Input example" + + ```json + --8<-- "examples/data_masking/src/payload_custom_masking.json" + ``` +=== "Masking rules output example" + + ```json hl_lines="4 5 10 21" + --8<-- "examples/data_masking/src/output_custom_masking.json" + ``` ### Encrypting data diff --git a/examples/data_masking/src/custom_data_masking.py b/examples/data_masking/src/custom_data_masking.py index 24a5d51bc81..7b96f6f379f 100644 --- a/examples/data_masking/src/custom_data_masking.py +++ b/examples/data_masking/src/custom_data_masking.py @@ -13,8 +13,8 @@ def lambda_handler(event: dict, context: LambdaContext) -> dict: masking_rules = { "email": {"regex_pattern": "(.)(.*)(@.*)", "mask_format": r"\1****\3"}, "age": {"dynamic_mask": True}, - "address.zip": {"dynamic_mask": True, "custom_mask": "xxx"}, - "address.street": {"dynamic_mask": False}, + "address.zip": {"custom_mask": "xxx"}, + "$.other_address[?(@.postcode > 12000)]": {"custom_mask": "Masked"}, } result = data_masker.erase(data, masking_rules=masking_rules) diff --git a/examples/data_masking/src/output_custom_masking.json b/examples/data_masking/src/output_custom_masking.json new file mode 100644 index 00000000000..0571da99808 --- /dev/null +++ b/examples/data_masking/src/output_custom_masking.json @@ -0,0 +1,29 @@ +{ + "id": 1, + "name": "John Doe", + "age": "**", + "email": "j****@example.com", + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + "zip": "xxx", + "postcode": 12345, + "product": { + "name": "Car" + } + }, + "other_address": [ + { + "postcode": 11345, + "street": "123 Any Drive" + }, + "Masked" + ], + "company_address": { + "street": "456 ACME Ave", + "city": "Anytown", + "state": "CA", + "zip": "12345" + } +} \ No newline at end of file diff --git a/examples/data_masking/src/payload_custom_masking.json b/examples/data_masking/src/payload_custom_masking.json new file mode 100644 index 00000000000..d50b715ffa4 --- /dev/null +++ b/examples/data_masking/src/payload_custom_masking.json @@ -0,0 +1,34 @@ +{ + "body": { + "id": 1, + "name": "Jane Doe", + "age": 30, + "email": "janedoe@example.com", + "address": { + "street": "123 Main St", + "city": "Anytown", + "state": "CA", + "zip": "12345", + "postcode": 12345, + "product": { + "name": "Car" + } + }, + "other_address": [ + { + "postcode": 11345, + "street": "123 Any Drive" + }, + { + "postcode": 67890, + "street": "100 Main Street," + } + ], + "company_address": { + "street": "456 ACME Ave", + "city": "Anytown", + "state": "CA", + "zip": "12345" + } + } +} \ No newline at end of file diff --git a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py index 8eb0f955958..93588445034 100644 --- a/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py +++ b/tests/unit/data_masking/_aws_encryption_sdk/test_unit_data_masking.py @@ -218,12 +218,16 @@ def test_parsing_nonexistent_fields_warning_on_missing_field(): def test_regex_mask(data_masker): - data = "Hello! My name is Fulano Ciclano" + # GIVEN a str data type + data = "Hello! My name is John Doe" + + # WHEN erase is called with regex pattern and mask format regex_pattern = r"\b[A-Z][a-z]+ [A-Z][a-z]+\b" mask_format = "XXXX XXXX" result = data_masker.erase(data, regex_pattern=regex_pattern, mask_format=mask_format) + # THEN the result is the regex part masked by the masked format assert result == "Hello! My name is XXXX XXXX" pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy