Strict O(n) solution with Aho Corasick [question]


  • 0
    A

    First compute for each position which word starts here, if any O(N), then try different offsets and see if any contains a window of length K that contains all the words.
    I've read a bunch of solutions that claimed to be O(N) but they were O(K*N). Are there any O(N) solutions that don't use aho corasick?

    struct AC {
        struct State {
            vector<int> tr;
            vector<int> au;
            int sl;
            int final;
            int par;
            State() : tr(26), au(26), sl(0), final(-1), par(0) {}
        };
        vector<State> states;
        AC(const vector<string>& words) : states(1) {
            int w = 0;
            for (auto && word : words) {
                int v = 0;
                for (auto c : word) {
                    int cc = c - 'a';
                    if (!states[v].tr[cc]) {
                        states.emplace_back();
                        states.back().par = v;
                        states[v].tr[cc] = states.size() - 1;
                    }
                    v = states[v].tr[cc];
                }
                states[v].final = w++;
            }
            build_automata();
        }
        
        void build_automata() {
            deque<int> q = {0};
            while (!q.empty()) {
                int u = q.front(); q.pop_front();
                for (int cc = 0; cc < 26; ++cc) {
                    int v = states[u].tr[cc];
                    if (v) {
                        states[u].au[cc] = v;
                        if (u)
                            states[v].sl = states[states[u].sl].au[cc];
                        q.push_back(v);
                    } else {
                        states[u].au[cc] = states[states[u].sl].au[cc];
                    }
                }
            }
        }
    };
    class Solution {
    public:
        vector<int> findSubstring(string s, vector<string>& words) {
            unordered_map<string, int> woo;
            int orw = words.size();
            vector<int> states(s.size(), -1);
            int tw = 0;
            for (auto && w : words) {
                woo[w] += 1;
                tw += w.size();
            }
            words.clear();
            vector<int> counts;
            for (auto && kw : woo) {words.emplace_back(kw.first); counts.emplace_back(kw.second);}
            AC ac(words);
            int v = 0;
            for (int i = 0; i < s.size(); ++i) {
                int cc = s[i] - 'a';
                v = ac.states[v].au[cc];
                if (ac.states[v].final != -1)
                states[i - words[0].size() + 1] = ac.states[v].final;
            }
            vector<int> ans;
            for (int offset = 0; offset < words[0].size(); ++offset) {
                vector<int> slice;
                for (int i = offset; i < states.size(); i += words[0].size()) {
                    slice.push_back(states[i]);
                }
                vector<int> hist(counts);
                int non_zero = hist.size();
                auto add = [&] (int wn, int cnt) {
                    if (wn != -1) {
                        if (hist[wn] == 0) non_zero++;
                        hist[wn] += cnt;
                        if (hist[wn] == 0) non_zero --;
                    }
                    return (non_zero == 0);
                };
                for (int i = 0; i < slice.size(); ++i) {
                    if (i >= orw) {
                        add(slice[i - orw], 1);
                    }
                    if (add(slice[i], -1)) {
                        ans.push_back(offset + (i + 1) * words[0].size() - tw);
                    }
                }
            }
            sort(ans.begin(), ans.end());
            return ans;
        }
    };
    

Log in to reply
 

Looks like your connection to LeetCode Discuss was lost, please wait while we try to reconnect.