This solution is almost the same as the nlogn solution for finding the max length of increasing sub-sequence. Based on the observation the end number for the optimal sub-sequence of length K is decreasing.

For a nums[i], first find it's optimal length K, this can be done by binary search.

```
dp[k][end_index] = sum_j(dp[k-1][j]), for all the index j, whose corresponding number < nums[i]
```

as the number for dp[k-1][j] is decreasing, this can also be done by binary search.

The summation is O(1)

This implementation using int64 to store the end number and the sum of count up to it. A std::pair<int, int> also works.

```
class Solution {
public:
#define lowbit(x) (x&0x00000000FFFFFFFF)
#define highbit(x) (int((x>>32)&0x00000000FFFFFFFF))
#define extbit(x) (int64(x)<<32)
typedef long long int64;
int backvalue[2000];
int findNumberOfLIS(vector<int>& nums) {
int sz = nums.size();
if (sz == 0) return 0;
vector<vector<int64>> dp(1, {extbit(INT_MAX), extbit(INT_MIN)|1});
backvalue[0] = INT_MIN;
for (int i = 0; i < nums.size(); i++) {
int maxlen = dp.size(), val = nums[i];
int64 extval = extbit(val);
int k = lower_bound(backvalue, backvalue + maxlen, val) - backvalue;
if (k == maxlen) {
dp.push_back(vector<int64>(1,extbit(INT_MAX)));
}
backvalue[k] = val;
auto p = upper_bound(dp[k-1].begin(), dp[k-1].end(), extval , greater<int64>());
int64 cnt = lowbit(dp[k-1].back()) - lowbit(*--p);
if (val == highbit(dp[k].back())) {
dp[k].back() = (extval | (lowbit(dp[k].back())+cnt));
} else {
dp[k].push_back(extval | (lowbit(dp[k].back())+cnt));
}
}
return lowbit(dp.back().back());
}
};
```