~11ms Solution with Unified Hash Fxn


  • 22
    W

    Appricate for advice.

    vector<string> findRepeatedDnaSequences(string s)
    {
        vector<string> ret;
        if ( s.length() < 11 )
        {
        	return ret;
        }
        
        char table[1048576] = "";
        unsigned int hash = 0U;
        
        for ( size_t i = 0; i < 10; ++i )
        {
        	/** 'A' - 'A' + 1 = 1  = 1 (mod 5)
        	 *  'C' - 'A' + 1 = 3  = 3 (mod 5)
        	 *  'G' - 'A' + 1 = 7  = 2 (mod 5)
        	 *  'T' - 'A' + 1 = 20 = 0 (mod 5)
        	 */
        	hash = ( hash << 2 ) | ( ( s[i] - 'A' + 1 ) % 5 );
        }
        	
        table[hash] = 1;
        
        for ( size_t i = 10; i < s.length(); ++i )
        {
        	hash = ( ( hash << 2 )
                 ^ ( ( s[ i - 10 ] - 'A' + 1 ) % 5 ) << 20 )
                 | ( ( s[i] - 'A' + 1 ) % 5 );
        	         
        	if ( table[hash] == 0 )
        	{
        		table[hash] = 1;
        	}
        	else if ( table[hash] == 1 )
        	{
        		table[hash] = 2;
        		ret.push_back( string( s, i - 9, 10 ) );
        	}
        }
        
        return ret;
    }

  • 0
    Y

    the leedcode system accept the code, but i'm seeing the stack overflow here for very large stack variables char table[1048576], is this a bug in the leedcode?


  • 0

    Thank you for your concise solution! Here is my solution base on your idea.

    class Solution {
    public:
    	std::vector<std::string> findRepeatedDnaSequences(std::string s) {
    		std::vector<std::string> res;
    		if (s.size() < 11)
    			return res;
    		std::unordered_map<int, int> flag;
    		for (int t = 0, i = 0; i != s.size(); ++i)
    			if (flag[t = t << 2 & 0xfffff | (s[i] - 64) % 5]++ == 1)
    				res.push_back(s.substr(i - 9, 10));
    		return res;
    	}
    };

  • 0
    L

    I think your code is not right.
    when s start with "TT", just as "TTAAAAACCCCCAAAAACCCCC", the line "res.push_back(s.substr(i - 9, 10));" could out of range when i=2.


  • 0

    Thank you very much, I think the follow is the right:

    class Solution {
    public:
        std::vector<std::string> findRepeatedDnaSequences(std::string s) {
            std::vector<std::string> res;
            if (s.size() < 11)
                return res;
            std::unordered_map<int, int> flag;
    		int t = 0, i = 0;
    		for (; i != 9; ++i)
    			t = t << 2 | (s[i] - 64) % 5;
            for (; i != s.size(); ++i)
                if (flag[t = t << 2 & 0xfffff | (s[i] - 64) % 5]++ == 1)
                    res.push_back(s.substr(i - 9, 10));
            return res;
        }
    };

  • 0
    E

    Why do you have to convert ATGC to int? Can't we just use them as strings?


Log in to reply
 

Looks like your connection to LeetCode Discuss was lost, please wait while we try to reconnect.