PYCUDAの多次元配列について
GPGPUの練習として、PyCUDAを用いて自己組織化マップを実装しようとしています。
ところが下記のようなエラーが出て、先へ進めません。
pycuda._driver.LogicError: cuFuncSetBlockShape failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuModuleUnload failed: an illegal memory access was encountered
PyCUDAでは多次元配列を扱えないのでしょうか?
python2.7.12 cuda7.5を使用しています。
以下、ソースです。
module_template = """
#include <float.h>
#include <stdio.h>
#define N (%(N)s)
#define step (%(step)s)
#define teacher (%(teach)s)
__global__ void train_kernel(float ***node, float **teach, int index, int index1, int ss){
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
int z = blockIdx.z*blockDim.z + threadIdx.z;
float ramuda = teacher / 4.f;
float l0 = 0.4;
float sigma0 = N / 2.f;
int t = ss;
float a = x - index, b = y - index1;
float d = sqrt((a*a) + (b*b));
float l = l0*exp(-t/ramuda);
float s = sigma0*exp(-t/ramuda);
float sigma = exp(-((d*d)/(2*s*s)));
for(int k=0; k<%(channel)s; k++){
node[x][y][k] += l*sigma*(teach[t][k] - node[x][y][k]); //22 line
}
}
"""
def main():
# initialize node vectors
nodes = np.random.rand(N,N,3).astype(np.float32)# node array. each node has 3-dim weight vector
#initial out put
#TODO; make out put function to simplify here
plt.imshow(nodes, interpolation='none')
plt.savefig("init.png")
""""""
""" Learning """
""""""
# teacher signal
teachers = np.random.rand(n_teacher,3).astype(np.float32)
use_gpu = 1;
gpu_teatchers = gpuarray.to_gpu(teachers)
gpu_nodes = gpuarray.to_gpu(nodes)
block = (N,N,1)
grid = (1,1,1)
use_gpu = 1
kernel = module_template %{'teach':n_teacher, 'N':N, 'step':N, 'channel':3}
module = SourceModule(kernel)
train_k = module.get_function("train_kernel")
for i in range(n_teacher):
if use_gpu==1:
bmu = best_matching_unit(nodes, teachers[i])
index_x = bmu[0]
index_y = bmu[1]
train_k(gpu_nodes,gpu_teatchers,np.int32(index_x), np.int32(index_y), np.int32(i), grid=grid,block=block)
# intermediate out put
if i%10000 ==0 :#or i< 1: #out put for i<100 or each 1000 iteration
plt.imshow(nodes, interpolation='none')
plt.savefig(str(i)+".png")
#output
plt.imshow(nodes, interpolation='none')
plt.savefig("final.png")
"""
L(t)=L0exp(?t/λ),Θ(x ,t)=exp(?d^2/σ^2(t))
"""
def best_matching_unit(nodes, teacher):
#compute all norms (square)
#TODO simplify using numpy function
norms = np.zeros((N,N))
for i in range(N):
for j in range(N):
for k in range(3):
norms[i,j] += (nodes[i,j,k] - teacher[k])**2
#then, choose the minimum one
bmu = np.argmin(norms).astype(np.int) #argment with minimum element
# argmin returns just flatten, serial index,
# so convert it using unravel_index
return np.unravel_index(bmu,(N,N))