Source code for tryalgo.rabin_karp
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""\
Find substrings by Rabin-Karp
jill-jenn vie et christoph durr - 2015-2019
"""
# http://www.wolframalpha.com/input/?i=nearest+prime+number+to+2**56
# snip{ rabin_karp_roll_hash
PRIME = 72057594037927931 # < 2^{56}
DOMAIN = 128
[docs]
def roll_hash(old_val, out_digit, in_digit, last_pos):
"""roll_hash """
val = (old_val - out_digit * last_pos + DOMAIN * PRIME) % PRIME
val = (val * DOMAIN) % PRIME
return (val + in_digit) % PRIME
# snip}
# snip{ rabin_karp_matches
[docs]
def matches(s, t, i, j, k):
"""
Checks whether s[i:i + k] is equal to t[j:j + k].
We used a loop to ease the implementation in other languages.
"""
# tests if s[i:i + k] equals t[j:j + k]
for d in range(k):
if s[i + d] != t[j + d]:
return False
return True
# snip}
# snip{ rabin_karp_matching
[docs]
def rabin_karp_matching(s, t):
"""Find a substring by Rabin-Karp
:param s: the haystack string
:param t: the needle string
:returns: index i such that s[i: i + len(t)] == t, or -1
:complexity: O(len(s) + len(t)) in expected time,
and O(len(s) * len(t)) in worst case
"""
hash_s = 0
hash_t = 0
len_s = len(s)
len_t = len(t)
last_pos = pow(DOMAIN, len_t - 1) % PRIME
if len_s < len_t: # substring too long
return -1
for i in range(len_t): # preprocessing
hash_s = (DOMAIN * hash_s + ord(s[i])) % PRIME
hash_t = (DOMAIN * hash_t + ord(t[i])) % PRIME
for i in range(len_s - len_t + 1):
if hash_s == hash_t: # hashes match
# check character by character
if matches(s, t, i, 0, len_t):
return i
if i < len_s - len_t:
# shift window and calculate new hash on s
hash_s = roll_hash(hash_s, ord(s[i]), ord(s[i + len_t]),
last_pos)
return -1 # no match
# snip}
# snip{ rabin_karp_factor
[docs]
def rabin_karp_factor(s, t, k):
"""Find a common factor by Rabin-Karp
:param string s: haystack
:param string t: needle
:param int k: factor length
:returns: (i, j) such that s[i:i + k] == t[j:j + k] or None.
In case of tie, lexicographical minimum (i, j) is returned
:complexity: O(len(s) + len(t)) in expected time,
and O(len(s) + len(t) * k) in worst case
"""
last_pos = pow(DOMAIN, k - 1) % PRIME
pos = {}
assert k > 0
if len(s) < k or len(t) < k:
return None
hash_t = 0
# First calculate hash values of factors of t
for j in range(k):
hash_t = (DOMAIN * hash_t + ord(t[j])) % PRIME
for j in range(len(t) - k + 1):
# store the start position with the hash value
if hash_t in pos:
pos[hash_t].append(j)
else:
pos[hash_t] = [j]
if j < len(t) - k:
hash_t = roll_hash(hash_t, ord(t[j]), ord(t[j + k]), last_pos)
hash_s = 0
# Now check for matching factors in s
for i in range(k): # preprocessing
hash_s = (DOMAIN * hash_s + ord(s[i])) % PRIME
for i in range(len(s) - k + 1):
if hash_s in pos: # is this signature in s?
for j in pos[hash_s]:
if matches(s, t, i, j, k):
return (i, j)
if i < len(s) - k:
hash_s = roll_hash(hash_s, ord(s[i]), ord(s[i + k]), last_pos)
return None
# snip}