CUDAプログラミングで各threadに割り当てられる処理について

以下のプログラムで、行列の添字（インデックス）i, j を下記のように計算できる理由が分かりません。
CUDAでは行列はGPUにどのように割り当てられるのでしょうか？
thread一つ一つに行列の各要素が割り当てられるのでしょうか？

// Kernel definition
//
// Element-wise matrix addition: C = A + B for N x N float matrices.
//
// Every thread builds one (row, col) pair from its block index, the block
// dimensions and its thread index within the block, then writes at most one
// element of C. Threads whose pair falls outside the N x N range do nothing.
//
// NOTE(review): the row index comes from the x dimension, so threads with
// consecutive threadIdx.x touch different rows — worth confirming whether
// that access stride matters for this workload.
__global__ void MatAdd(float A[N][N], float B[N][N],
                       float C[N][N])
{
    const int row = blockDim.x * blockIdx.x + threadIdx.x;
    const int col = blockDim.y * blockIdx.y + threadIdx.y;

    // Surplus threads beyond the matrix edge have no element to process.
    if (row >= N || col >= N)
        return;

    C[row][col] = A[row][col] + B[row][col];
}

int main()
{
    ...
    // Kernel invocation
    // Threads are grouped into 16 x 16 blocks (256 threads per block).
    dim3 threadsPerBlock(16, 16);
    // Round the block count up in each dimension so the grid still covers the
    // whole matrix when N is not a multiple of the block size. The extra
    // threads past the matrix edge are filtered out by the bounds check in
    // MatAdd.
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
    ...
}