2424 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2525 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626 */
27+ #include < NvInfer.h>
2728#include " cudaWrapper.h"
2829#include " ioHelper.h"
29- #include < NvInfer.h>
3030#include < NvOnnxParser.h>
3131#include < algorithm>
3232#include < cassert>
3333#include < iostream>
3434#include < memory>
3535#include < string>
3636#include < vector>
37+ #include < numeric>
38+ #include < math.h>
39+ #include < cmath>
3740
3841using namespace nvinfer1 ;
3942using namespace std ;
@@ -46,52 +49,49 @@ constexpr double ABS_EPSILON = 0.005;
4649// Maximum relative tolerance for output tensor comparison against reference.
4750constexpr double REL_EPSILON = 0.05 ;
4851
49- ICudaEngine* createCudaEngine (string const & onnxModelPath, int batchSize)
52+ nvinfer1:: ICudaEngine* createCudaEngine (string const & onnxModelPath, int batchSize)
5053{
51- unique_ptr<IBuilder, Destroy<IBuilder>> builder{createInferBuilder (gLogger )};
52- unique_ptr<INetworkDefinition, Destroy<INetworkDefinition>> network{builder->createNetwork ()};
54+ const auto explicitBatch = 1U << static_cast <uint32_t >(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH );
55+ unique_ptr<nvinfer1::IBuilder, Destroy<nvinfer1::IBuilder>> builder{nvinfer1::createInferBuilder (gLogger )};
56+ unique_ptr<nvinfer1::INetworkDefinition, Destroy<nvinfer1::INetworkDefinition>> network{builder->createNetworkV2 (explicitBatch)};
5357 unique_ptr<nvonnxparser::IParser, Destroy<nvonnxparser::IParser>> parser{nvonnxparser::createParser (*network, gLogger )};
58+ unique_ptr<nvinfer1::IBuilderConfig,Destroy<nvinfer1::IBuilderConfig>> config{builder->createBuilderConfig ()};
5459
5560 if (!parser->parseFromFile (onnxModelPath.c_str (), static_cast <int >(ILogger::Severity::kINFO )))
5661 {
5762 cout << " ERROR: could not parse input engine." << endl;
5863 return nullptr ;
5964 }
6065
61- return builder->buildCudaEngine (*network); // Build and return TensorRT engine.
66+ builder->setMaxBatchSize (batchSize);
67+ config->setMaxWorkspaceSize ((1 << 30 ));
68+
69+ auto profile = builder->createOptimizationProfile ();
70+ profile->setDimensions (network->getInput (0 )->getName (), OptProfileSelector::kMIN , Dims4{1 , 3 , 256 , 256 });
71+ profile->setDimensions (network->getInput (0 )->getName (), OptProfileSelector::kOPT , Dims4{1 , 3 , 256 , 256 });
72+ profile->setDimensions (network->getInput (0 )->getName (), OptProfileSelector::kMAX , Dims4{32 , 3 , 256 , 256 });
73+ config->addOptimizationProfile (profile);
74+
75+ return builder->buildEngineWithConfig (*network, *config);
6276}
6377
64- static int getBindingInputIndex (IExecutionContext* context)
78+ static int getBindingInputIndex (nvinfer1:: IExecutionContext* context)
6579{
6680 return !context->getEngine ().bindingIsInput (0 ); // 0 (false) if bindingIsInput(0), 1 (true) otherwise
6781}
6882
6983void launchInference (IExecutionContext* context, cudaStream_t stream, vector<float > const & inputTensor, vector<float >& outputTensor, void ** bindings, int batchSize)
7084{
7185 int inputId = getBindingInputIndex (context);
72-
7386 cudaMemcpyAsync (bindings[inputId], inputTensor.data (), inputTensor.size () * sizeof (float ), cudaMemcpyHostToDevice, stream);
74- context->enqueue (batchSize, bindings, stream, nullptr );
87+ context->enqueueV2 ( bindings, stream, nullptr );
7588 cudaMemcpyAsync (outputTensor.data (), bindings[1 - inputId], outputTensor.size () * sizeof (float ), cudaMemcpyDeviceToHost, stream);
76- }
7789
78- void softmax (vector<float >& tensor, int batchSize)
79- {
80- size_t batchElements = tensor.size () / batchSize;
81-
82- for (int i = 0 ; i < batchSize; ++i)
83- {
84- float * batchVector = &tensor[i * batchElements];
85- double maxValue = *max_element (batchVector, batchVector + batchElements);
86- double expSum = accumulate (batchVector, batchVector + batchElements, 0.0 , [=](double acc, float value) { return acc + exp (value - maxValue); });
87-
88- transform (batchVector, batchVector + batchElements, batchVector, [=](float input) { return static_cast <float >(std::exp (input - maxValue) / expSum); });
89- }
9090}
9191
92- void verifyOutput (vector<float > const & outputTensor, vector<float > const & referenceTensor)
92+ void verifyOutput (vector<float > const & outputTensor, vector<float > const & referenceTensor, int size )
9393{
94- for (size_t i = 0 ; i < referenceTensor. size () ; ++i)
94+ for (size_t i = 0 ; i < size; ++i)
9595 {
9696 double reference = static_cast <double >(referenceTensor[i]);
9797 // Check absolute and relative tolerance.
@@ -102,8 +102,31 @@ void verifyOutput(vector<float> const& outputTensor, vector<float> const& refere
102102 return ;
103103 }
104104 }
105+ cout << " OK" << endl;
106+ }
105107
106- cout << " OK" << endl;
// Writes the first H*W values of outputTensor to "output.pgm" as an ASCII
// (P2) grayscale image with a maximum gray value of 255.
//
// outputTensor: row-major pixel values; each is scaled by 255, rounded, and
//               clamped to [0, 255] so the file never exceeds its declared
//               maximum gray value.
// H:            image height in rows.
// W:            image width in columns.
//
// Prints an error and writes nothing when the dimensions are not positive,
// when the tensor holds fewer than H*W values, or when the file cannot be
// opened.
void saveImageAsPGM(std::vector<float>& outputTensor, int H, int W)
{
    if (H <= 0 || W <= 0)
    {
        std::cout << "ERROR: invalid image dimensions for output.pgm." << std::endl;
        return;
    }

    std::size_t const rows = static_cast<std::size_t>(H);
    std::size_t const cols = static_cast<std::size_t>(W);
    if (rows * cols > outputTensor.size())
    {
        std::cout << "ERROR: output tensor is smaller than the requested image." << std::endl;
        return;
    }

    FILE* pgmimg = fopen("output.pgm", "wb");
    if (pgmimg == nullptr)
    {
        std::cout << "ERROR: could not open output.pgm for writing." << std::endl;
        return;
    }

    fprintf(pgmimg, "P2\n");
    // The PGM header lists width first, then height.
    fprintf(pgmimg, "%d %d\n", W, H);
    // Writing the maximum gray value
    fprintf(pgmimg, "255\n");

    for (std::size_t i = 0; i < rows; ++i)
    {
        for (std::size_t j = 0; j < cols; ++j)
        {
            // Row-major layout: consecutive rows are W elements apart.
            float const scaled = 255.0f * outputTensor[i * cols + j];
            // std::max(0.0f, NaN) yields 0.0f, so NaN input is written as 0.
            float const clamped = std::min(255.0f, std::max(0.0f, scaled));
            fprintf(pgmimg, "%ld ", std::lround(clamped));
        }
        fprintf(pgmimg, "\n");
    }

    fclose(pgmimg);
}
108131
109132int main (int argc, char * argv[])
@@ -141,13 +164,14 @@ int main(int argc, char* argv[])
141164 for (int i = 0 ; i < engine->getNbBindings (); ++i)
142165 {
143166 Dims dims{engine->getBindingDimensions (i)};
144- size_t size = accumulate (dims.d , dims.d + dims.nbDims , batchSize, multiplies<size_t >());
167+ size_t size = accumulate (dims.d + 1 , dims.d + dims.nbDims , batchSize, multiplies<size_t >());
145168 // Create CUDA buffer for Tensor.
146- cudaMalloc (&bindings[i], size * sizeof (float ));
169+ cudaMalloc (&bindings[i], batchSize * size * sizeof (float ));
147170
148171 // Resize CPU buffers to fit Tensor.
149- if (engine->bindingIsInput (i))
172+ if (engine->bindingIsInput (i)){
150173 inputTensor.resize (size);
174+ }
151175 else
152176 outputTensor.resize (size);
153177 }
@@ -158,31 +182,39 @@ int main(int argc, char* argv[])
158182 cout << " Couldn't read input Tensor" << endl;
159183 return 1 ;
160184 }
185+
161186
162187 // Create Execution Context.
163188 context.reset (engine->createExecutionContext ());
189+
190+ Dims dims_i{engine->getBindingDimensions (0 )};
191+ Dims4 inputDims{batchSize, dims_i.d [1 ], dims_i.d [2 ], dims_i.d [3 ]};
192+ context->setBindingDimensions (0 , inputDims);
164193
165194 launchInference (context.get (), stream, inputTensor, outputTensor, bindings, batchSize);
195+
196+ Dims dims{engine->getBindingDimensions (1 )};
197+ saveImageAsPGM (outputTensor, dims.d [2 ], dims.d [3 ]);
166198 // Wait until the work is finished.
167199 cudaStreamSynchronize (stream);
168200
169201 vector<string> referenceFiles;
170202 for (string path : inputFiles)
171203 referenceFiles.push_back (path.replace (path.rfind (" input" ), 5 , " output" ));
172204 // Try to read and compare against reference tensor from protobuf file.
205+
206+
173207 referenceTensor.resize (outputTensor.size ());
174208 if (readTensor (referenceFiles, referenceTensor) != referenceTensor.size ())
175209 {
176210 cout << " Couldn't read reference Tensor" << endl;
177211 return 1 ;
178212 }
179213
180- // Apply a softmax on the CPU to create a normalized distribution suitable for measuring relative error in probabilities.
181- softmax (outputTensor, batchSize);
182- softmax (referenceTensor, batchSize);
183-
184- verifyOutput (outputTensor, referenceTensor);
185-
214+ Dims dims_o{engine->getBindingDimensions (1 )};
215+ int size = batchSize * dims_o.d [2 ] * dims_o.d [3 ];
216+ verifyOutput (outputTensor, referenceTensor, size);
217+
186218 for (void * ptr : bindings)
187219 cudaFree (ptr);
188220
0 commit comments