I have read the paper. Could anyone tell me why k1 is picked by that way?

k1_ = floor(k1 / 4) + n + 1 (when n is even)
floor((k1 + 2 * n + 1) / 4) (when n is odd)

update_0:

After drawing some pictures, I can say the original author may have a mistake.

matrix = [
[ 1, 5, 9],
[10, 11, 13],
[12, 13, 15]
],
sub_matrix = [
[1,9],
[12,15]
]

let's say k_2=4,

k_2_hat would be (4+3)/4=1. Clearly, 1st elem in sub_matrix is 1 which is not largest integer smaller than 4th elem(10) in matrix.

The above code @zhiqing_xiao provided proves my thought.

update_1:

I still think k1 selection is non-sense. We don't need it to AC. Here is my code based on @zhiqing_xiao great code,

class Solution {
public:
int kthSmallest(const vector<vector<int>> & matrix, int nth) {
if (nth == 1) { // corner case
return matrix.front().front();
}
size_t n = matrix.size();
vector<int> indices(n);
iota(indices.begin(), indices.end(), 0); // {1, 2, 3...}
array<int, 2> nth_range{{nth - 1, nth - 1}}; // zero-based indices, {lower_bound, upper_bound}
array<int, 2> vals = bi_select(matrix, indices, nth_range);
return vals[0];
}
private:
// nth, nth => val, val
array<int, 2> bi_select(const vector<vector<int>> & matrix, const vector<int> & indices,
const array<int, 2> & nth_range) {
size_t n = indices.size();
if (n == 2) { // base case
return bi_select_base(matrix, indices, nth_range);
}
// indices for sub_matrix
vector<int> sub_indices;
for (int i = 0; i < n; i += 2) {
sub_indices.push_back(indices[i]);
}
if (n % 2 == 0) { // even
sub_indices.push_back(indices.back());
}
// lower bound in sub is guaranteed to be 1/4, because 4 elements are folded into 1
// upper bound in sub is always last element(I have a different opinion with the author here)
array<int, 2> sub_nth_range{{nth_range[0] / 4, (int) (sub_indices.size() * sub_indices.size()) - 1}};
// call recursively
array<int, 2> sub_vals = bi_select(matrix, sub_indices, sub_nth_range);
// use cardinality to relocate
array<int, 2> num_less_x{};
vector<int> elems_between;
// saddleback search
array<int, 2> cols{{(int) n, (int) n}};
for (int row = 0; row < n; ++row) {
int row_idx = indices[row];
for (int i : {0, 1}) {
while (cols[i] > 0 && (matrix[row_idx][indices[cols[i] - 1]] >= sub_vals[i])) {
--cols[i];
}
num_less_x[i] += cols[i];
}
for (int col = cols[0]; col < cols[1]; ++col) {
elems_between.push_back(matrix[row_idx][indices[col]]);
}
}
array<int, 2> vals{};
for (int i : {0, 1}) {
int nth = nth_range[i];
if (nth < num_less_x[0]) {
// cannot make it more precise
vals[i] = sub_vals[0];
} else if (nth < num_less_x[1]) {
// O(n) nth_element
int offset = nth - num_less_x[0];
nth_element(elems_between.begin(), elems_between.begin() + offset, elems_between.end());
vals[i] = elems_between[offset];
} else {
// cannot make it more precise
vals[i] = sub_vals[1];
}
}
return vals;
}
array<int, 2> bi_select_base(const vector<vector<int>> & matrix, const vector<int> & indices,
const array<int, 2> & nth_range) {
vector<int> all_elems;
for (int i : indices) {
for (int j : indices) {
all_elems.push_back(matrix[i][j]);
}
}
sort(all_elems.begin(), all_elems.end());
array<int, 2> vals;
for (int i : {0, 1}) {
vals[i] = all_elems[nth_range[i]];
}
return vals;
}
};

The main idea is simple and elegant, but the k1 selection is really hard to understand and unnecessary. I guess it is kind of optimization.