I am currently trying to run evaluation in TAO with the trainable PeopleNet model provided on NVIDIA NGC.
However, when I run the evaluation, the validation cost comes out as zero.
How can I solve this problem?
My spec file is below.
random_seed: 42
dataset_config {
  data_sources: {
    tfrecords_path: "/home/ssh/tao-experiments/data/tfrecords/kitti_trainval/kitti_trainval*"
    image_directory_path: "/home/ssh/tao-experiments/data/training/"
  }
  image_extension: "png"
  target_class_mapping {
    key: "person"
    value: "pedestrian"
  }
  validation_fold: 0
  # For evaluation on test set
  validation_data_source: {
    tfrecords_path: "/home/ssh/tao-experiments/data/tfrecords/kitti_trainval/kitti_trainval*"
    image_directory_path: "/home/ssh/tao-experiments/data/training/"
  }
}
model_config {
  pretrained_model_file: "/home/ssh/tao-experiments/detectnet_v2/pretrained_resnet34/pretrained_detectnet_v2_vresnet34/resnet_34.hdf5"
  num_layers: 34
  freeze_blocks: 0
  arch: "resnet"
  use_batch_norm: true
  objective_set {
    bbox {
      scale: 35.0
      offset: 0.5
    }
    cov {}
  }
  training_precision {
    backend_floatx: FLOAT32
  }
}
training_config {
  batch_size_per_gpu: 12
  num_epochs: 100
  learning_rate {
    soft_start_annealing_schedule {
      min_learning_rate: 5e-06
      max_learning_rate: 0.0005
      soft_start: 0.1
      annealing: 0.7
    }
  }
  regularizer {
    type: L1
    weight: 3e-09
  }
  optimizer {
    adam {
      epsilon: 9.9e-09
      beta1: 0.9
      beta2: 0.999
    }
  }
  cost_scaling {
    initial_exponent: 20.0
    increment: 0.005
    decrement: 1.0
  }
  checkpoint_interval: 10
}
augmentation_config {
  preprocessing {
    output_image_width: 960
    output_image_height: 544
    crop_right: 1244
    crop_left: 700
    crop_top: 0
    crop_bottom: 320
    # crop_right: 960
    # crop_bottom: 544
    min_bbox_width: 1.0
    min_bbox_height: 1.0
    output_image_channel: 3
  }
  spatial_augmentation {
    hflip_probability: 0.5
    zoom_min: 1.0
    zoom_max: 1.0
    translate_max_x: 8.0
    translate_max_y: 8.0
  }
  color_augmentation {
    color_shift_stddev: 0.0
    hue_rotation_max: 25.0
    saturation_shift_max: 0.20000000298
    contrast_scale_max: 0.10000000149
    contrast_center: 0.5
  }
}
postprocessing_config {
  target_class_config {
    key: "person"
    value: {
      clustering_config {
        coverage_threshold: 0.005
        dbscan_eps: 0.265
        dbscan_min_samples: 0.05
        minimum_bounding_box_height: 20
      }
    }
  }
}
cost_function_config {
  target_classes {
    name: "person"
    class_weight: 1.0
    coverage_foreground_weight: 0.05
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  enable_autoweighting: true
  max_objective_weight: 0.9999
  min_objective_weight: 0.0001
}
evaluation_config {
  validation_period_during_training: 10
  first_validation_epoch: 10
  minimum_detection_ground_truth_overlap {
    key: "person"
    value: 0.5
  }
  evaluation_box_config {
    key: "person"
    value {
      minimum_height: 4
      maximum_height: 9999
      minimum_width: 4
      maximum_width: 9999
    }
  }
}
bbox_rasterizer_config {
  target_class_config {
    key: "person"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  deadzone_radius: 0.2
}
I found that the key and value of target_class_mapping in dataset_config were set incorrectly; after correcting them, I confirmed that evaluation works properly.
However, I want to retrain afterwards on only the person class. How should I modify the config?
Can you share the latest spec file?
random_seed: 42
dataset_config {
  data_sources: {
    tfrecords_path: "/home/ssh/tao-experiments/data/tfrecords/kitti_trainval/kitti_trainval*"
    image_directory_path: "/home/ssh/tao-experiments/data/training/"
  }
  image_extension: "png"
  target_class_mapping {
    key: "pedestrian"
    value: "person"
  }
  target_class_mapping {
    key: "bag"
    value: "bag"
  }
  target_class_mapping {
    key: "face"
    value: "face"
  }
  validation_fold: 0
  # For evaluation on test set
  validation_data_source: {
    tfrecords_path: "/home/ssh/tao-experiments/data/tfrecords/kitti_trainval/kitti_trainval*"
    image_directory_path: "/home/ssh/tao-experiments/data/training/"
  }
}
model_config {
  pretrained_model_file: "/home/ssh/tao-experiments/detectnet_v2/pretrained_resnet34/pretrained_detectnet_v2_vresnet34/resnet_34.hdf5"
  num_layers: 34
  #freeze_blocks: 0
  arch: "resnet"
  use_batch_norm: true
  objective_set {
    bbox {
      scale: 35.0
      offset: 0.5
    }
    cov {}
  }
  training_precision {
    backend_floatx: FLOAT32
  }
}
training_config {
  batch_size_per_gpu: 32
  num_epochs: 10
  learning_rate {
    soft_start_annealing_schedule {
      min_learning_rate: 5e-06
      max_learning_rate: 0.0005
      soft_start: 0.1
      annealing: 0.7
    }
  }
  regularizer {
    type: L1
    weight: 3e-09
  }
  optimizer {
    adam {
      epsilon: 9.9e-09
      beta1: 0.9
      beta2: 0.999
    }
  }
  cost_scaling {
    initial_exponent: 20.0
    increment: 0.005
    decrement: 1.0
  }
  checkpoint_interval: 10
}
augmentation_config {
  preprocessing {
    output_image_width: 960
    output_image_height: 544
    min_bbox_width: 1.0
    min_bbox_height: 1.0
    output_image_channel: 3
  }
  spatial_augmentation {
    hflip_probability: 0.5
    zoom_min: 1.0
    zoom_max: 1.0
    translate_max_x: 8.0
    translate_max_y: 8.0
  }
  color_augmentation {
    hue_rotation_max: 25.0
    saturation_shift_max: 0.20000000298
    contrast_scale_max: 0.10000000149
    contrast_center: 0.5
  }
}
postprocessing_config {
  target_class_config {
    key: "person"
    value: {
      clustering_config {
        coverage_threshold: 0.005
        dbscan_eps: 0.265
        dbscan_min_samples: 0.05
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "bag"
    value: {
      clustering_config {
        coverage_threshold: 0.005
        dbscan_eps: 0.265
        dbscan_min_samples: 0.05
        minimum_bounding_box_height: 20
      }
    }
  }
  target_class_config {
    key: "face"
    value: {
      clustering_config {
        coverage_threshold: 0.005
        dbscan_eps: 0.265
        dbscan_min_samples: 0.05
        minimum_bounding_box_height: 20
      }
    }
  }
}
cost_function_config {
  target_classes {
    name: "person"
    class_weight: 1.0
    coverage_foreground_weight: 0.05
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "bag"
    class_weight: 1.0
    coverage_foreground_weight: 0.05
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  target_classes {
    name: "face"
    class_weight: 1.0
    coverage_foreground_weight: 0.05
    objectives {
      name: "cov"
      initial_weight: 1.0
      weight_target: 1.0
    }
    objectives {
      name: "bbox"
      initial_weight: 10.0
      weight_target: 10.0
    }
  }
  enable_autoweighting: true
  max_objective_weight: 0.9999
  min_objective_weight: 0.0001
}
evaluation_config {
  validation_period_during_training: 10
  first_validation_epoch: 10
  minimum_detection_ground_truth_overlap {
    key: "person"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "bag"
    value: 0.5
  }
  minimum_detection_ground_truth_overlap {
    key: "face"
    value: 0.5
  }
  evaluation_box_config {
    key: "person"
    value {
      minimum_height: 4
      maximum_height: 9999
      minimum_width: 4
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "bag"
    value {
      minimum_height: 4
      maximum_height: 9999
      minimum_width: 4
      maximum_width: 9999
    }
  }
  evaluation_box_config {
    key: "face"
    value {
      minimum_height: 4
      maximum_height: 9999
      minimum_width: 4
      maximum_width: 9999
    }
  }
  average_precision_mode: INTEGRATE
}
bbox_rasterizer_config {
  target_class_config {
    key: "person"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "bag"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  target_class_config {
    key: "face"
    value {
      cov_center_x: 0.5
      cov_center_y: 0.5
      cov_radius_x: 1.0
      cov_radius_y: 1.0
      bbox_min_radius: 1.0
    }
  }
  deadzone_radius: 0.2
}
How many classes are in your training dataset? Did you keep the log from when you generated the tfrecord files above?
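If that log is no longer available, regenerating the tfrecords will print the per-class object statistics again, which shows exactly which class names appear in your labels. A rough sketch of that command, following the default DetectNet_v2 notebook (the spec filename and output path here are assumptions, adjust them to your setup):

# Regenerate tfrecords; the log lists cumulative object counts per class
!tao detectnet_v2 dataset_convert \
                  -d $SPECS_DIR/detectnet_v2_tfrecords_kitti_trainval.txt \
                  -o $DATA_DOWNLOAD_DIR/tfrecords/kitti_trainval/kitti_trainval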
I will train only on the person class.
Do you mean there is only one class (person) in your training dataset?
Yes, there is only one class (person) in the training data. My question is how to modify the config to train with it.
Can you check several label files?
Is the class name “person” or “pedestrian”?
The class name is ‘person’.
Please change the mapping to the following and retry (the key must match the class name in your label files, and the value is the class name the network is trained on).
target_class_mapping {
  key: "person"
  value: "person"
}
Is it okay if I do not keep the other two classes in target_class_mapping?
I am wondering whether it matters that those two classes do not exist in the training data.
Yes, you can delete the entries for the other two classes.
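For illustration, a minimal person-only dataset_config could look like the sketch below (paths copied from your spec above; this assumes your label files use the class name "person"). The bag and face blocks would likewise be removed from postprocessing_config, cost_function_config, evaluation_config, and bbox_rasterizer_config so that only person remains in every section.

dataset_config {
  data_sources: {
    tfrecords_path: "/home/ssh/tao-experiments/data/tfrecords/kitti_trainval/kitti_trainval*"
    image_directory_path: "/home/ssh/tao-experiments/data/training/"
  }
  image_extension: "png"
  # Single mapping: the key must match the class name in the label files
  target_class_mapping {
    key: "person"
    value: "person"
  }
  validation_fold: 0
}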
Thank you for your reply.
I have an additional question.
I have 8 GPUs and I want to run the training on GPU 4.
I ran the command below, but I get an error.
# Retraining using the pruned model as pretrained weights
!tao detectnet_v2 train -e $SPECS_DIR/detectnet_v2_retrain_resnet34_kitti.txt \
                        -r $USER_EXPERIMENT_DIR/experiment_dir_retrain \
                        -k $KEY \
                        -n resnet34_detector \
                        --gpus 4
Error log:
[f66fb9fe45ba:00263] *** Process received signal ***
[f66fb9fe45ba:00263] Signal: Bus error (7)
[f66fb9fe45ba:00263] Signal code: (-6)
[f66fb9fe45ba:00263] Failing at address: 0x3ea00000107
[f66fb9fe45ba:00263] [ 0] /usr/lib/x86_64-linux-gnu/libc.so.6(+0x43090)[0x7fc9b1b44090]
[f66fb9fe45ba:00263] [ 1] /usr/lib/x86_64-linux-gnu/libc.so.6(+0x18bb41)[0x7fc9b1c8cb41]
[f66fb9fe45ba:00263] [ 2] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x755bd)[0x7fc8a34a85bd]
[f66fb9fe45ba:00263] [ 3] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x7a74f)[0x7fc8a34ad74f]
[f66fb9fe45ba:00263] [ 4] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x59e67)[0x7fc8a348ce67]
[f66fb9fe45ba:00263] [ 5] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x48985)[0x7fc8a347b985]
[f66fb9fe45ba:00263] [ 6] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x4a5c2)[0x7fc8a347d5c2]
[f66fb9fe45ba:00263] [ 7] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x64f66)[0x7fc8a3497f66]
[f66fb9fe45ba:00263] [ 8] /usr/lib/x86_64-linux-gnu/libnccl.so.2(+0x4ae0b)[0x7fc8a347de0b]
[f66fb9fe45ba:00263] [ 9] /usr/lib/x86_64-linux-gnu/libnccl.so.2(ncclCommInitRank+0xd8)[0x7fc8a347e068]
[f66fb9fe45ba:00263] [10] /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_lib.cpython-36m-x86_64-linux-gnu.so(_ZN7horovod6common13NCCLOpContext12InitNCCLCommERKSt6vectorINS0_16TensorTableEntryESaIS3_EERKS2_IiSaIiEE+0x284)[0x7fc87268f354]
[f66fb9fe45ba:00263] [11] /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_lib.cpython-36m-x86_64-linux-gnu.so(_ZN7horovod6common13NCCLAllreduce7ExecuteERSt6vectorINS0_16TensorTableEntryESaIS3_EERKNS0_8ResponseE+0x61)[0x7fc87268f581]
[f66fb9fe45ba:00263] [12] /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_lib.cpython-36m-x86_64-linux-gnu.so(_ZNK7horovod6common16OperationManager16ExecuteAllreduceERSt6vectorINS0_16TensorTableEntryESaIS3_EERKNS0_8ResponseE+0x7d)[0x7fc8726513cd]
[f66fb9fe45ba:00263] [13] /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_lib.cpython-36m-x86_64-linux-gnu.so(_ZNK7horovod6common16OperationManager16ExecuteOperationERSt6vectorINS0_16TensorTableEntryESaIS3_EERKNS0_8ResponseERNS0_10ProcessSetE+0x4c)[0x7fc8726517fc]
[f66fb9fe45ba:00263] [14] /usr/local/lib/python3.6/dist-packages/horovod/tensorflow/mpi_lib.cpython-36m-x86_64-linux-gnu.so(+0xa902d)[0x7fc87262002d]
[f66fb9fe45ba:00263] [15] /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xd6de4)[0x7fc9b0eacde4]
[f66fb9fe45ba:00263] [16] /usr/lib/x86_64-linux-gnu/libpthread.so.0(+0x8609)[0x7fc9b1ae6609]
[f66fb9fe45ba:00263] [17] /usr/lib/x86_64-linux-gnu/libc.so.6(clone+0x43)[0x7fc9b1c20133]
[f66fb9fe45ba:00263] *** End of error message ***
Can you upload the full log via the button below?

Please try increasing the Docker virtual memory size.
Reference:
https://github.com/microsoft/DeepSpeed/issues/2693#issuecomment-1473302302
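When using the tao launcher, one way to raise the container's shared memory is through the DockerOptions section of ~/.tao_mounts.json. The sketch below is only an example; the host source path and the sizes are assumptions to adjust for your machine.

{
    "Mounts": [
        {
            "source": "/path/on/host/tao-experiments",
            "destination": "/home/ssh/tao-experiments"
        }
    ],
    "DockerOptions": {
        "shm_size": "16G",
        "ulimits": {
            "memlock": -1,
            "stack": 67108864
        }
    }
}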
I solved the problem with the following command.
!tao detectnet_v2 train -e $SPECS_DIR/detectnet_v2_retrain_resnet34_kitti.txt \
                        -r $USER_EXPERIMENT_DIR/experiment_dir_retrain \
                        -k $KEY \
                        -n resnet34_detector \
                        --gpus 1 \
                        --gpu_index 4
However, the resulting AP was only 25.
What could be causing this?
It was my mistake: the pruned model (.tlt) was not included. Please tell me how to include the .tlt file in the retraining step.
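In the default DetectNet_v2 retrain flow, the pruned .tlt is referenced from the retrain spec's model_config via pretrained_model_file, together with load_graph: true so the pruned structure is reused. A sketch of that block, assuming a typical pruned-model location (the exact directory and filename are assumptions):

model_config {
  # Path to the model produced by "tao detectnet_v2 prune" -- example location only
  pretrained_model_file: "/home/ssh/tao-experiments/detectnet_v2/experiment_dir_pruned/resnet34_nopool_bn_detectnet_v2_pruned.tlt"
  # Load the pruned graph instead of rebuilding the unpruned template
  load_graph: true
  num_layers: 34
  arch: "resnet"
  use_batch_norm: true
  objective_set {
    bbox {
      scale: 35.0
      offset: 0.5
    }
    cov {}
  }
  training_precision {
    backend_floatx: FLOAT32
  }
}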