phase 1: tokenize
make re expression into a list of pairs (once_or_many, some_char_or_any), called unit.
e.g. "a*b.c*" > [('*', 'a'), ('1', 'b'), ('1', '.'), ('*', 'c')]
phase 2: dp
make a 2d array. dp[i][j]
means whether string[:i+1]
matches units[:j+1]
 add
<begin>
before the string and the units (conceptually).<begin>
has index of1
. dp[1][1] = True
sincematch("", "") == True
dp[i][1] = False
sincematch(anynonemptystring, "") == False
dp[1][j] = dp[1][j1] and units[j][0] == '*'
, sincematch("", re) == True
iffre
is are all*
unitsdp[i][j] =
units[j][0] != '*'
in this case, thej
th unit shall eat a char in string, provided the prev string and units are match.
dp[i][j] = dp[i1][j1] and atom_match(string[i], units[j])
units[j][0] == '*'

dp[i][j1] == True
, and the current*
unit matches an empty char. e.g."ab"
and"abc*"

dp[i1][j] == True
, the current*
unit has eaten the prev char. it must be able to eat current char, too. we addand atom_match(string[i], units[j])
.notice always we have
dp[i1][j1] == dp[i1][j]
ifunits[j][0] == '*'

return dp[len(string)][len(units)]
akadp[2][2]
import operator
#re unit type:
# unit = once atom  many atom
# atom = char c  any
def tokenize(re):
prev_c = None
for c in re:
if c == '*':
yield ('*', prev_c) #many atom
prev_c = None
else:
if prev_c != None:
yield ('1', prev_c) #once atom
prev_c = c
if prev_c:
yield ('1', prev_c)
def atom_match(c, unit):
return unit[1] == '.' or unit[1] == c
def unit_match_dp(string, units):
if not string and not units:
return True
if not units:
return False
if not string: #True if all units are '*'
return reduce(operator.and_, map(lambda p:p[0] == '*', units))
#has string and has units
dp = [[False for _ in range(len(units) + 1)] for _ in range(len(string) + 1)]
dp[1][1] = True
for j in range(len(units)):
dp[1][j] = dp[1][j1] and units[j][0] == '*'
for i in range(len(string)):
for j in range(len(units)):
if units[j][0] == '*':
dp[i][j] = (dp[i1][j] and atom_match(string[i], units[j])) or (dp[i][j1])
else:
dp[i][j] = dp[i1][j1] and atom_match(string[i], units[j])
return dp[2][2]
def match(string, re):
units = list(tokenize(re))
return unit_match_dp(string, units)
'''
<b> a* a a b c*
<b> 1 1 0 0 0 0
a 0 1 1 0 0 0
a 0 1 0 1 0 0
b 0 0 0 0 1 0
c 0 0 0 0 0 1
c 0 0 0 0 0 1
'''
assert match('aa', 'aa')
assert match('a', 'a*a')
assert match("abcd", "d*")
assert match("aaaaaaaaaaaaab", "a*a*a*a*a*a*a*a*a*a*b")
print('ok!')