Hello.
I am using TensorRT 2.1 and want to implement a simple custom layer. To practice, I wanted to make an “Inc” layer (just adding 1.0 to an input tensor's values and keeping the dimensions the same).
I kept everything almost the same as the “class Reshape : public IPlugin” in sampleFasterRCNN.cpp, except “getOutputDimensions()”, which keeps the same dimensions. (this seems fine.)
Where should I implement the “adding 1.0” part? I guess it should be in “enqueue()”. So, I tried
int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override { # the below is from the Reshape class. seems to copy from input to output CHECK(cudaMemcpyAsync(outputs[0], inputs[0], mCopySize * batchSize, cudaMemcpyDeviceToDevice, stream)); # add 1.0 to first ten values float* foutputs = (float*) outputs[0]; int i; for (i = 0; i < 10; i++) foutputs[i] += 1.0; return 0; } However, this part results in a “segmentation fault” error.
My questions are:
1) where and how can I implement some calculation between input and output (addition in this case)?
2) Can you provide a simple example?
** Just in case, I post the full code of this example API here. (almost the same as the Reshape class)
class Inc : public IPlugin { public: Inc() {} Inc(const void* buffer, size_t size) { assert(size == sizeof(mCopySize)); mCopySize = *reinterpret_cast<const size_t*>(buffer); } int getNbOutputs() const override { return 1; } Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override { assert(nbInputDims == 1); assert(index == 0); assert(inputs[index].nbDims == 3); return DimsCHW(inputs[0].d[0], inputs[0].d[1], inputs[0].d[2]); // same dimension } int initialize() override { return 0; } void terminate() override { } size_t getWorkspaceSize(int) const override { return 0; } // currently it is not possible for a plugin to execute "in place". Therefore we memcpy the data from the input to the output buffer int enqueue(int batchSize, const void*const *inputs, void** outputs, void*, cudaStream_t stream) override { CHECK(cudaMemcpyAsync(outputs[0], inputs[0], mCopySize * batchSize, cudaMemcpyDeviceToDevice, stream)); float* foutputs = (float*) outputs[0]; int i; for (i = 0; i < 10; i++) foutputs[i] += 1.0; return 0; } size_t getSerializationSize() override { return sizeof(mCopySize); } void serialize(void* buffer) override { *reinterpret_cast<size_t*>(buffer) = mCopySize; } void configure(const Dims*inputs, int nbInputs, const Dims* outputs, int nbOutputs, int) override { mCopySize = inputs[0].d[0] * inputs[0].d[1] * inputs[0].d[2] * sizeof(float); } protected: size_t mCopySize; };