現在、SSDを用いた物体検出を行うために、このサイトを参考にして学習を進めています。
上記サイトのコードをコピーして実行したところ、Google Colaboratoryでは学習できましたが、
Azure上で実行すると、trainer.run()の実行時に下記のエラーが発生します。
何卒ご教示のほど、よろしくお願いいたします。

Exception in main training loop: merge_sort: failed to synchronize: unspecified launch failure
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/chainer/training/trainer.py", line 316, in run
    update()
  File "/usr/local/lib/python3.6/dist-packages/chainer/training/updaters/standard_updater.py", line 175, in update
    self.update_core()
  File "/usr/local/lib/python3.6/dist-packages/chainer/training/updaters/standard_updater.py", line 187, in update_core
    optimizer.update(loss_func, *in_arrays)
  File "/usr/local/lib/python3.6/dist-packages/chainer/optimizer.py", line 825, in update
    loss = lossfun(*args, **kwds)
  File "/usr/local/lib/python3.6/dist-packages/chainer/link.py", line 294, in __call__
    out = forward(*args, **kwargs)
  File "<ipython-input-7-393c695393be>", line 18, in forward
    mb_locs, mb_confs, gt_mb_locs, gt_mb_labels, self.k)
  File "/usr/local/lib/python3.6/dist-packages/chainercv/links/model/ssd/multibox_loss.py", line 91, in multibox_loss
    hard_negative = _hard_negative(conf_loss.array, positive, k)
  File "/usr/local/lib/python3.6/dist-packages/chainercv/links/model/ssd/multibox_loss.py", line 19, in _hard_negative
    rank = (x * (positive - 1)).argsort(axis=1).argsort(axis=1)
  File "cupy/core/core.pyx", line 627, in cupy.core.core.ndarray.argsort
  File "cupy/core/core.pyx", line 644, in cupy.core.core.ndarray.argsort
  File "cupy/core/_routines_sorting.pyx", line 101, in cupy.core._routines_sorting._ndarray_argsort
  File "cupy/cuda/thrust.pyx", line 135, in cupy.cuda.thrust.argsort
Will finalize trainer extensions and updater before reraising the exception.

RuntimeError                              Traceback (most recent call last)
<ipython-input-17-041e2033e90a> in <module>
----> 1 trainer.run()

/usr/local/lib/python3.6/dist-packages/chainer/training/trainer.py in run(self, show_loop_exception_msg)
    347                         f.write('Traceback (most recent call last):\n')
    348                         traceback.print_tb(sys.exc_info()[2])
--> 349             six.reraise(*exc_info)
    350         finally:
    351             for _, entry in extensions:

/usr/local/lib/python3.6/dist-packages/six.py in reraise(tp, value, tb)
    691             if value.__traceback__ is not tb:
    692                 raise value.with_traceback(tb)
--> 693             raise value
    694         finally:
    695             value = None

/usr/local/lib/python3.6/dist-packages/chainer/training/trainer.py in run(self, show_loop_exception_msg)
    314                 self.observation = {}
    315                 with reporter.scope(self.observation):
--> 316                     update()
    317                     for name, entry in extensions:
    318                         if entry.trigger(self):

/usr/local/lib/python3.6/dist-packages/chainer/training/updaters/standard_updater.py in update(self)
    173 
    174         """
--> 175         self.update_core()
    176         self.iteration += 1
    177 

/usr/local/lib/python3.6/dist-packages/chainer/training/updaters/standard_updater.py in update_core(self)
    185 
    186         if isinstance(in_arrays, tuple):
--> 187             optimizer.update(loss_func, *in_arrays)
    188         elif isinstance(in_arrays, dict):
    189             optimizer.update(loss_func, **in_arrays)

/usr/local/lib/python3.6/dist-packages/chainer/optimizer.py in update(self, lossfun, *args, **kwds)
    823         if lossfun is not None:
    824             use_cleargrads = getattr(self, '_use_cleargrads', True)
--> 825             loss = lossfun(*args, **kwds)
    826             if use_cleargrads:
    827                 self.target.cleargrads()

/usr/local/lib/python3.6/dist-packages/chainer/link.py in __call__(self, *args, **kwargs)
    292             # forward is implemented in the child classes
    293             forward = self.forward  # type: ignore
--> 294         out = forward(*args, **kwargs)
    295 
    296         # Call forward_postprocess hook

<ipython-input-7-393c695393be> in forward(self, imgs, gt_mb_locs, gt_mb_labels)
     16         mb_locs, mb_confs = self.model(imgs)
     17         loc_loss, conf_loss = multibox_loss(
---> 18             mb_locs, mb_confs, gt_mb_locs, gt_mb_labels, self.k)
     19         loss = loc_loss * self.alpha + conf_loss
     20 

/usr/local/lib/python3.6/dist-packages/chainercv/links/model/ssd/multibox_loss.py in multibox_loss(mb_locs, mb_confs, gt_mb_locs, gt_mb_labels, k, comm)
     89 
     90         conf_loss = _elementwise_softmax_cross_entropy(mb_confs, gt_mb_labels)
---> 91         hard_negative = _hard_negative(conf_loss.array, positive, k)
     92         conf_loss *= xp.logical_or(
     93             positive, hard_negative).astype(conf_loss.dtype)

/usr/local/lib/python3.6/dist-packages/chainercv/links/model/ssd/multibox_loss.py in _hard_negative(x, positive, k)
     17 
     18 def _hard_negative(x, positive, k):
---> 19     rank = (x * (positive - 1)).argsort(axis=1).argsort(axis=1)
     20     hard_negative = rank < (positive.sum(axis=1) * k)[:, np.newaxis]
     21     return hard_negative

cupy/core/core.pyx in cupy.core.core.ndarray.argsort()

cupy/core/core.pyx in cupy.core.core.ndarray.argsort()

cupy/core/_routines_sorting.pyx in cupy.core._routines_sorting._ndarray_argsort()

cupy/cuda/thrust.pyx in cupy.cuda.thrust.argsort()

RuntimeError: merge_sort: failed to synchronize: unspecified launch failure