Here's AC DFS+memorization (about 70ms), same algorithm as original post, except adding an unordered_set to memorize failed paths.

class Solution {
private:
unordered_set<string> visited;
bool dfs(vector<int>& nums, int idx, int target) {
if (target<0) return false;
if (target==0) return true;
if (visited.count(to_string(idx)+'-'+to_string(target))) return false;
for (int i=idx; i<nums.size(); i++) {
if (dfs(nums,i+1,target-nums[i])) return true;
}
visited.insert(to_string(idx)+'-'+to_string(target));
return false;
}
public:
bool canPartition(vector<int>& nums) {
int sum=accumulate(nums.begin(),nums.end(),0);
return sum%2==0 && dfs(nums,0,sum/2);
}
};

Looking at @zestypanda's solution, as well as other posts, I think sorting + aggressive trimming can be extremely fast. I added 2 lines to the original post's DFS solution to accomplish the 3ms run time reported by others.

class Solution {
private:
bool dfs(vector<int>& nums, int idx, int target) {
if (target<0) return false;
if (target==0) return true;
for (int i=idx; i<nums.size(); i++) {
if (target-nums[i]<0) break; // trims the dfs since we already sorted nums
if (dfs(nums,i+1,target-nums[i])) return true;
}
return false;
}
public:
bool canPartition(vector<int>& nums) {
int sum=accumulate(nums.begin(),nums.end(),0);
sort(nums.rbegin(),nums.rend()); // sort the nums
return sum%2==0 && dfs(nums,0,sum/2);
}
};