# File: 10-algo-ds-implementations/string_pattern_matching_rabin_karp.py
'''
Problem: Exact pattern matching
    - Input: a text (T), a pattern (P)
    - Output: one or all occurrences of P in T

Algorithm (Rabin-Karp):
    - Hashing function:
        H(S, n, m) = Sum(S[i] * x^(i - n)) mod p    for n <= i < m
    - Compute H(P, 0, |P|)
    - Compute H(T, 0, |P|)
    - Rolling step:
        H(S, n + 1, m + 1) =
            ((H(S, n, m) - S[n]) * x^-1 + S[m] * x^(m - n - 1)) mod p
      where x^-1 is the modular inverse of x modulo p. Plain integer
      division is NOT valid here: the hash is only known modulo p.
    - Loop through T:
        - Compute H(T, i + 1, i + 1 + |P|) from H(T, i, i + |P|) in O(1)
        - Compare it with H(P); on equality, confirm with a direct string
          comparison because different strings may share a hash.

Questions:
    - How to choose the multiplier and the prime number?
    - How to make the trade-off between the number of collisions and the
      cost of the recurrence used to roll the hash?
        - The bigger the prime, the fewer false positives we get, which is
          supposed to make the code faster.
        - On the other hand, the bigger the prime, the slower the related
          division and multiplication operations become.
'''

from typing import List, Optional


class Rabin_Karp:
    '''Exact substring search using a polynomial rolling hash.'''

    def __init__(self):
        self.__multiplier = 31
        self._prime = 1000000007

        # Modular inverse of the multiplier, needed to "divide by x" when the
        # window slides. The modulus is prime, so Fermat's little theorem
        # gives x^-1 = x^(p - 2) (mod p). Three-argument pow keeps every
        # intermediate value below p instead of building a huge integer.
        self.__multiplier_inverse = pow(
            self.__multiplier, self._prime - 2, self._prime
        )

    def __abs_hash__(self, s: str, length: int) -> int:
        '''Return the hash of the first ``length`` characters of ``s``.

        The hash is Sum(ord(s[i]) * x^i) mod p for 0 <= i < length.

        Raises:
            ValueError: if ``length`` is negative or exceeds ``len(s)``.
        '''
        if not 0 <= length <= len(s):
            raise ValueError('length must be between 0 and len(s)')

        hash_value, power_x = 0, 1
        for char in s[:length]:
            hash_value = (hash_value + ord(char) * power_x) % self._prime
            # Reduce the running power as well so it never grows beyond p.
            power_x = power_x * self.__multiplier % self._prime

        return hash_value

    def __rel_hash(
        self,
        s: str,
        prev_hash: int,
        idx: int,
        length: int,
        top_power: Optional[int] = None,
    ) -> int:
        '''Return the hash of ``s[idx:idx + length]`` from the previous window.

        Args:
            s: the text being scanned.
            prev_hash: hash of ``s[idx - 1:idx - 1 + length]``.
            idx: start index of the new window.
            length: window length (at least 1).
            top_power: optional precomputed ``x^(length - 1) mod p``; pass it
                when rolling repeatedly so each step does constant work.

        Raises:
            ValueError: if the new window does not fit inside ``s``.
        '''
        if length < 1 or not 0 < idx <= len(s) - length:
            raise ValueError('window [idx, idx + length) must lie inside s')

        if top_power is None:
            top_power = pow(self.__multiplier, length - 1, self._prime)

        # Drop the outgoing character (its coefficient is x^0), shift every
        # remaining term down one power of x, then add the incoming character
        # at the highest power.
        without_first = (prev_hash - ord(s[idx - 1])) * self.__multiplier_inverse
        with_last = without_first + ord(s[idx + length - 1]) * top_power
        return with_last % self._prime

    def find(self, t: str, p: str) -> List[int]:
        '''Return the start indices of every occurrence of ``p`` in ``t``.

        Overlapping occurrences are reported. An empty pattern matches at
        every position ``0..len(t)``. A pattern longer than the text yields
        an empty list.
        '''
        len_t = len(t)
        len_p = len(p)
        if len_p > len_t:
            return []
        if len_p == 0:
            return list(range(len_t + 1))

        # 1. hash the pattern and the first window of the text
        hash_p = self.__abs_hash__(p, len_p)
        hash_t = self.__abs_hash__(t, len_p)
        top_power = pow(self.__multiplier, len_p - 1, self._prime)

        # 2. find all occurrences; a hash hit is always confirmed against the
        #    actual characters to rule out collisions
        occurences = []
        if hash_p == hash_t and t.startswith(p):
            occurences.append(0)
        for idx in range(1, len_t - len_p + 1):
            hash_t = self.__rel_hash(t, hash_t, idx, len_p, top_power)
            if hash_p == hash_t and t.startswith(p, idx):
                occurences.append(idx)

        return occurences

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.

Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy