The most intuitive way is to try every perfect square not exceeding n and then solve the same subproblem on the remainder. The time complexity is much better than O(2^n), but I am not sure how to come up with a tighter bound.

```
// Brute-force recursion: the answer for n is 1 plus the best answer over
// all remainders n - k*k. Exponential time; shown only as the starting point.
int numSquares(int n) {
    int root = sqrt(n);
    if (root * root == n) return 1;   // n is itself a perfect square
    int best = n;                     // upper bound: n copies of 1
    for (int base = root; base >= 1; --base)
        best = min(best, numSquares(n - base * base));
    return best + 1;
}
```

As with most dp problems, we can immediately optimize it with memoization (top-down) or tabulation (bottom-up). The time complexity is O(n^(3/2)): n subproblems, each trying up to sqrt(n) squares.

```
// Memoized helper: mem[k] caches the answer for k (0 means "not computed yet").
// Declared before the one-argument entry point so the call below resolves.
int numSquares(int n, vector<int> &mem) {
    if (mem[n]) return mem[n];
    int sr = sqrt(n);
    if (sr * sr == n) return mem[n] = 1;   // perfect square: one term suffices
    int mn = n;
    // BUG FIX: recurse with the shared memo table. The original called the
    // one-argument overload, which allocated a fresh table on every call and
    // silently kept the exponential running time.
    for (int i = 1; i <= sr; i++) mn = min(mn, numSquares(n - i * i, mem));
    return mem[n] = mn + 1;
}
// Public entry point: allocates the memo table once and delegates.
int numSquares(int n) {
    vector<int> mem(n + 1);
    return numSquares(n, mem);
}
// Bottom-up DP: dp[i] is the fewest squares summing to i.
// Transition: dp[i] = 1 + min over all squares r*r <= i of dp[i - r*r].
int numSquares(int n) {
    vector<int> dp(n + 1, n);   // n ones is always a valid upper bound
    dp[0] = 0;
    for (int target = 1; target <= n; ++target) {
        for (int root = 1; root * root <= target; ++root) {
            int cand = 1 + dp[target - root * root];
            if (cand < dp[target]) dp[target] = cand;
        }
    }
    return dp[n];
}
```

For problems asking for a minimum count, BFS may be a good option because it finds the shortest path from the root. The time complexity is similar to dp, O(n^(3/2)). Test cases show BFS is faster than dp; I think this is because BFS only visits numbers that are sums of some squares, while dp visits all n numbers.

```
// BFS over partial sums of squares: every node at depth k is a sum of
// exactly k squares, so the first depth at which n appears is the answer.
int numSquares(int n) {
    vector<int> squares;
    for (int r = 1; r * r <= n; ++r) squares.push_back(r * r);
    queue<int> frontier({0});
    vector<bool> seen(n + 1);
    int depth = 0;
    while (!frontier.empty()) {
        ++depth;
        // Process the current level in full before moving one depth deeper.
        for (int cnt = frontier.size(); cnt > 0; --cnt) {
            int cur = frontier.front();
            frontier.pop();
            if (seen[cur]) continue;   // duplicates are filtered on pop
            seen[cur] = true;
            for (int sq : squares) {
                int sum = cur + sq;
                if (sum == n) return depth;
                if (sum > n) break;    // squares is ascending: rest overshoot too
                frontier.push(sum);
            }
        }
    }
    return depth;
}
```

Eventually, the optimal solution is actually O(n^1/2) using math theory. See details