From d2813a1b5b563f37ecbdbf53ff4ad3443e7159c0 Mon Sep 17 00:00:00 2001 From: Jared Hancock Date: Mon, 25 Mar 2024 20:58:51 -0500 Subject: [PATCH] re: Add support for start- and endpos. Pattern objects have two additional parameters for the ::search and ::match methods to define the starting and ending position of the subject within the string to be searched. This allows for searching a sub-string without creating a slice. However, one caveat of using the start-pos rather than a slice is that the start anchor (`^`) remains anchored to the beginning of the text. Signed-off-by: Jared Hancock --- docs/library/re.rst | 14 +++++- extmod/modre.c | 25 +++++++++- tests/extmod/re_start_end_pos.py | 79 ++++++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+), 3 deletions(-) create mode 100644 tests/extmod/re_start_end_pos.py diff --git a/docs/library/re.rst b/docs/library/re.rst index 19b15d2d2c299..b8aeefd90cfa4 100644 --- a/docs/library/re.rst +++ b/docs/library/re.rst @@ -154,8 +154,8 @@ Regex objects Compiled regular expression. Instances of this class are created using `re.compile()`. -.. method:: regex.match(string) - regex.search(string) +.. method:: regex.match(string, [pos, [endpos]]) + regex.search(string, [pos, [endpos]]) regex.sub(replace, string, count=0, flags=0, /) Similar to the module-level functions :meth:`match`, :meth:`search` @@ -163,6 +163,16 @@ Compiled regular expression. Instances of this class are created using Using methods is (much) more efficient if the same regex is applied to multiple strings. + The optional second parameter *pos* gives an index in the string where the + search is to start; it defaults to ``0``. This is not completely equivalent + to slicing the string; the ``'^'`` pattern character matches at the real + beginning of the string and at positions just after a newline, but not + necessarily at the index where the search is to start. + + The optional parameter *endpos* limits how far the string will be searched; + it will be as if the string is *endpos* characters long, so only the + characters from *pos* to ``endpos - 1`` will be searched for a match. + .. method:: regex.split(string, max_split=-1, /) Split a *string* using regex. If *max_split* is given, it specifies diff --git a/extmod/modre.c b/extmod/modre.c index d17ec68d50eda..85e5d1b0f74f6 100644 --- a/extmod/modre.c +++ b/extmod/modre.c @@ -196,10 +196,11 @@ static void re_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t // Note: this function can't be named re_exec because it may clash with system headers, eg on FreeBSD static mp_obj_t re_exec_helper(bool is_anchored, uint n_args, const mp_obj_t *args) { - (void)n_args; mp_obj_re_t *self; + bool was_compiled = false; if (mp_obj_is_type(args[0], (mp_obj_type_t *)&re_type)) { self = MP_OBJ_TO_PTR(args[0]); + was_compiled = true; } else { self = MP_OBJ_TO_PTR(mod_re_compile(1, args)); } @@ -207,6 +208,28 @@ static mp_obj_t re_exec_helper(bool is_anchored, uint n_args, const mp_obj_t *ar size_t len; subj.begin_line = subj.begin = mp_obj_str_get_data(args[1], &len); subj.end = subj.begin + len; + + if (was_compiled && n_args > 2) { + // Arg #2 is starting-pos + mp_int_t startpos = mp_obj_get_int(args[2]); + if (startpos > (mp_int_t)len) { + startpos = len; + } else if (startpos < 0) { + startpos = 0; + } + subj.begin += startpos; + if (n_args > 3) { + // Arg #3 is ending-pos + mp_int_t endpos = mp_obj_get_int(args[3]); + if (endpos > (mp_int_t)len) { + endpos = len; + } else if (endpos < startpos) { + endpos = startpos; + } + subj.end = subj.begin_line + endpos; + } + } + int caps_num = (self->re.sub + 1) * 2; mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, caps, char *, caps_num); // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char diff --git a/tests/extmod/re_start_end_pos.py b/tests/extmod/re_start_end_pos.py new file mode 100644 index 0000000000000..6d10e340717df --- /dev/null +++ b/tests/extmod/re_start_end_pos.py @@ -0,0 +1,79 @@ +# test start and end pos specification + +try: + import re +except ImportError: + print("SKIP") + raise SystemExit + + +def print_groups(match): + print("----") + try: + if match is not None: + i = 0 + while True: + print(match.group(i)) + i += 1 + except IndexError: + pass + + +p = re.compile(r"o") +m = p.match("dog") +print_groups(m) + +m = p.match("dog", 1) +print_groups(m) + +m = p.match("dog", 2) +print_groups(m) + +# No match past end of input +m = p.match("dog", 5) +print_groups(m) + +m = p.match("dog", 0, 1) +print_groups(m) + +# Caret only matches the actual beginning +p = re.compile(r"^o") +m = p.match("dog", 1) +print_groups(m) + +# End at beginning means searching empty string +p = re.compile(r"o") +m = p.match("dog", 1, 1) +print_groups(m) + +# End before the beginning doesn't match anything +m = p.match("dog", 2, 1) +print_groups(m) + +# Negative starting values don't crash +m = p.search("dog", -2) +print_groups(m) + +m = p.search("dog", -2, -5) +print_groups(m) + +# Search also works +print("--search") + +p = re.compile(r"o") +m = p.search("dog") +print_groups(m) + +m = p.search("dog", 1) +print_groups(m) + +m = p.search("dog", 2) +print_groups(m) + +# Negative starting values don't crash +m = p.search("dog", -2) +print_groups(m) + +m = p.search("dog", -2, -5) +print_groups(m) + pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy