My implementation got time limit exceeded (for the last testcase) despite being extrememly similar to yours. Interesting behavior.

int ans = INT_MIN; for (int left = 0; left < matrix[0].size(); ++left) { // sum of all elements in a row in [left, right] std::vector<int> rowSum(matrix.size(), 0); for (int right = left; right < matrix[0].size(); ++right) { for (int i = 0; i < matrix.size(); ++i) { rowSum[i] += matrix[i][right]; } // find max sum <= k with current left & right std::set<int> accRowSum; accRowSum.insert(0); int sum = 0; // the accumulated sum from top for (int i = 0; i < rowSum.size(); ++i) { sum += rowSum[i]; std::set<int>::iterator top = std::lower_bound(accRowSum.begin(), accRowSum.end(), sum - k); if (top != accRowSum.end()) { ans = std::max(ans, sum - *top); } accRowSum.insert(sum); } } } return ans;Max Sum of Sub-Matrix No Larger Than K