As an exercise in GPGPU programming, I am trying to implement a self-organizing map (SOM) with PyCUDA.
However, I get the error below and cannot get any further.

pycuda._driver.LogicError: cuFuncSetBlockShape failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuModuleUnload failed: an illegal memory access was encountered

Is PyCUDA unable to handle multidimensional arrays?
I am using Python 2.7.12 and CUDA 7.5.
The source code follows.

import numpy as np
import matplotlib.pyplot as plt
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

# constants used below (example values; the actual values are not shown here)
N = 32             # side length of the node grid
n_teacher = 10000  # number of teacher signals

module_template = """
    #include <float.h>
    #include <stdio.h>

    #define N (%(N)s)
    #define step (%(step)s)
    #define teacher (%(teach)s)

    __global__ void train_kernel(float ***node, float **teach, int index, int index1, int ss){
        int x = blockIdx.x*blockDim.x + threadIdx.x;
        int y = blockIdx.y*blockDim.y + threadIdx.y;
        float lambda = teacher / 4.f;  // time constant of the exponential decay
        float l0 = 0.4f;               // initial learning rate L0
        float sigma0 = N / 2.f;        // initial neighborhood radius
        int t = ss;                    // current iteration
        float a = x - index, b = y - index1;
        float d = sqrtf(a*a + b*b);    // distance to the best matching unit
        float l = l0*expf(-t/lambda);          // L(t) = L0*exp(-t/lambda)
        float s = sigma0*expf(-t/lambda);      // sigma(t)
        float theta = expf(-(d*d)/(2.f*s*s));  // neighborhood function Theta
        for(int k=0; k<%(channel)s; k++){
            node[x][y][k] += l*theta*(teach[t][k] - node[x][y][k]); // line 22
        }
    }
"""

def main():
    # initialize node vectors: each node has a 3-dim weight vector
    nodes = np.random.rand(N, N, 3).astype(np.float32)
    # initial output
    # TODO: make an output function to simplify this
    plt.imshow(nodes, interpolation='none')
    plt.savefig("init.png")
    """"""
    """ Learning """
    """"""
    # teacher signals
    teachers = np.random.rand(n_teacher, 3).astype(np.float32)
    use_gpu = 1
    gpu_teachers = gpuarray.to_gpu(teachers)
    gpu_nodes = gpuarray.to_gpu(nodes)

    # one block covering the whole map; note that N*N may not exceed the
    # device's maximum threads per block (typically 1024)
    block = (N, N, 1)
    grid = (1, 1, 1)
    kernel = module_template % {'teach': n_teacher, 'N': N, 'step': N, 'channel': 3}
    module = SourceModule(kernel)
    train_k = module.get_function("train_kernel")

    for i in range(n_teacher):
        if use_gpu == 1:
            # pull the updated weights back from the GPU; otherwise the BMU
            # search and the plots below only ever see the initial state
            nodes = gpu_nodes.get()
            bmu = best_matching_unit(nodes, teachers[i])
            index_x = bmu[0]
            index_y = bmu[1]
            train_k(gpu_nodes, gpu_teachers, np.int32(index_x), np.int32(index_y), np.int32(i),
                    grid=grid, block=block)
        # intermediate output every 10000 iterations
        if i % 10000 == 0:
            plt.imshow(nodes, interpolation='none')
            plt.savefig(str(i) + ".png")

    # final output
    nodes = gpu_nodes.get()
    plt.imshow(nodes, interpolation='none')
    plt.savefig("final.png")

    """
    L(t)=L0exp(?t/λ),Θ(x ,t)=exp(?d^2/σ^2(t))
    """

def best_matching_unit(nodes, teacher):
    # compute the squared distance of every node's weight vector to the teacher
    # TODO: simplify with numpy functions (see the sketch below)
    norms = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            for k in range(3):
                norms[i, j] += (nodes[i, j, k] - teacher[k]) ** 2
    # then choose the minimum one
    bmu = np.argmin(norms)  # index of the minimum element
    # argmin returns a flattened, serial index,
    # so convert it back to (x, y) with unravel_index
    return np.unravel_index(bmu, (N, N))
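
# The TODO above: I believe the whole triple loop collapses to one broadcast
# expression. A sketch (best_matching_unit_vec is my own name; it should
# return the same (x, y) as the loop version):
def best_matching_unit_vec(nodes, teacher):
    # squared distance of every weight vector to the teacher, shape (N, N)
    norms = np.sum((nodes - teacher) ** 2, axis=2)
    # unravel the flat argmin index back to (x, y)
    return np.unravel_index(np.argmin(norms), norms.shape)


if __name__ == "__main__":
    main()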