Amazing! And we could further improve performance from O(6^3 log n) to O(3^3 log n) by deviding matrix P into:

P =
| P1 |
| P2 P1 |
where
P = P1 = P2 =
| 0 0 1 0 0 0 | | 0 0 1 | | 0 0 1 |
| 1 0 1 0 0 0 | | 1 0 1 | | 0 0 1 |
| 0 1 1 0 0 0 | | 0 1 1 | | 0 0 1 |
| 0 0 1 0 0 1 |
| 0 0 1 1 0 1 |
| 0 0 1 0 1 1 |

So matrix addition / multiplication costs O(3^3) because:

* =
| A | | C | | AC |
| B A | | D C | | AD+BC AC |
+ =
| A | | C | | A+C |
| B A | | D C | | B+D A+C |

This is my C++ implementation below:

#include <stdlib.h>
#define M (1000000000 + 7)
#define D 3
void multiply(int(*A)[D], int(*B)[D], int(*C)[D])
{
int i, j, k;
for (i = 0; i < D; i++)
{
for (j = 0; j < D; j++)
{
C[i][j] = 0;
for (k = 0; k < D; k++)
{
C[i][j] = (C[i][j] + (long)A[i][k] * B[k][j] % M) % M;
}
}
}
}
void add(int(*A)[D], int(*B)[D])
{
int i, j;
for (i = 0; i < D; i++)
{
for (j = 0; j < D; j++)
{
B[i][j] = (A[i][j] + B[i][j]) % M;
}
}
}
void swap(int(**A)[D], int(**B)[D])
{
int(*temp)[D] = *A;
*A = *B;
*B = temp;
}
int power(int(*PA)[D], int(*PB)[D], int(*QA)[D], int(*QB)[D], int n)
{
int T1[D][D] =
{
{ 0, 0, 0 },
{ 0, 0, 0 },
{ 0, 0, 0 }
};
int T2[D][D] =
{
{ 0, 0, 0 },
{ 0, 0, 0 },
{ 0, 0, 0 }
};
int(*R)[D] = T1;
while (n)
{
if (n & 1)
{
multiply(PA, QB, R);
multiply(PB, QA, QB);
add(R, QB);
multiply(PA, QA, R);
swap(&QA, &R);
}
multiply(PA, PB, T2);
multiply(PB, PA, R);
add(T2, R);
swap(&PB, &R);
multiply(PA, PA, R);
swap(&PA, &R);
n = n >> 1;
}
return QB[D - 1][D - 1];
}
int checkRecord(int n)
{
int PA[D][D] =
{
{0, 0, 1},
{1, 0, 1},
{0, 1, 1}
};
int PB[D][D] =
{
{ 0, 0, 1 },
{ 0, 0, 1 },
{ 0, 0, 1 }
};
int QA[D][D] =
{
{ 1, 0, 0 },
{ 0, 1, 0 },
{ 0, 0, 1 }
};
int QB[D][D] =
{
{ 0, 0, 0 },
{ 0, 0, 0 },
{ 0, 0, 0 }
};
return power(PA, PB, QA, QB, n + 1);
}