Here is my python solution. ~ 500 ms. Not as fast as others' Java or C++ code. Not sure if it is because my implementation or the python interpreter.

```
class Solution:
# @param board, a 2D array
# Capture all regions by modifying the input board in-place.
# Do not return any value.
def solve(self, board):
N = len(board)
if N <= 2: return
M = len(board[0])
if M <= 2: return
def dfs(i, j):
S = [(i, j)] # dfs stack
while S:
p, q = S.pop()
board[p][q] = 'B'
# up, down, left, right
if p > 0 and board[p - 1][q] == 'O': S.append((p - 1, q))
if p < N - 1 and board[p + 1][q] == 'O': S.append((p + 1, q))
if q > 0 and board[p][q - 1] == 'O': S.append((p, q - 1))
if q < M - 1 and board[p][q + 1] == 'O': S.append((p, q + 1))
# top and bottom
for j in xrange(M):
if board[0][j] == 'O': dfs(0, j)
if board[N - 1][j] == 'O': dfs(N - 1, j)
# left and right
for i in xrange(1, N - 1):
if board[i][0] == 'O': dfs(i, 0)
if board[i][M - 1] == 'O': dfs(i, M - 1)
# final mark
for i in xrange(N):
for j in xrange(M):
if board[i][j] == 'O': board[i][j] = 'X'
elif board[i][j] == 'B': board[i][j] = 'O'
```