nms_kernel.cu

// ------------------------------------------------------------------
// Faster R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Shaoqing Ren
// ------------------------------------------------------------------
#include "gpu_nms.hpp"
#include <cstring>  // memset
#include <vector>
#include <iostream>
#define CUDA_CHECK(condition) \
  /* Code block avoids redefinition of cudaError_t error */ \
  do { \
    cudaError_t error = condition; \
    if (error != cudaSuccess) { \
      std::cout << cudaGetErrorString(error) << std::endl; \
    } \
  } while (0)

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))

int const threadsPerBlock = sizeof(unsigned long long) * 8;
// Intersection-over-union of two boxes laid out as [x1, y1, x2, y2, ...];
// the "+ 1" terms follow the pixel-inclusive box convention used by
// Fast/Faster R-CNN.
__device__ inline float devIoU(float const * const a, float const * const b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}
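
// Worked example (hypothetical inputs, for reference only): with the
// inclusive convention above, a = [0, 0, 9, 9] and b = [5, 5, 14, 14]
// give an intersection of 5 * 5 = 25 and areas Sa = Sb = 100, so
// IoU = 25 / (100 + 100 - 25) = 25 / 175, roughly 0.143.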
// Each thread block compares one chunk of up to 64 "row" boxes against one
// chunk of up to 64 "column" boxes and writes, for every row box, a 64-bit
// mask marking which column boxes it overlaps above nms_overlap_thresh.
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float *dev_boxes, unsigned long long *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size =
        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  // Stage the column chunk (5 floats per box) in shared memory.
  __shared__ float block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const float *cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    // On the diagonal block, only compare against later boxes so a box
    // never marks itself or an earlier (higher-scoring) box.
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}
void _set_device(int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
}
// Host entry point: copies boxes to the device, launches the pairwise
// suppression kernel, then greedily selects surviving boxes on the host.
// Boxes are expected to be sorted by descending score before this call.
void _nms(long* keep_out, int* num_out, const float* boxes_host, int boxes_num,
          int boxes_dim, float nms_overlap_thresh, int device_id) {
  _set_device(device_id);

  float* boxes_dev = NULL;
  unsigned long long* mask_dev = NULL;

  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);

  CUDA_CHECK(cudaMalloc(&boxes_dev,
                        boxes_num * boxes_dim * sizeof(float)));
  CUDA_CHECK(cudaMemcpy(boxes_dev,
                        boxes_host,
                        boxes_num * boxes_dim * sizeof(float),
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMalloc(&mask_dev,
                        boxes_num * col_blocks * sizeof(unsigned long long)));

  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
              DIVUP(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  nms_kernel<<<blocks, threads>>>(boxes_num,
                                  nms_overlap_thresh,
                                  boxes_dev,
                                  mask_dev);

  // Copy the per-box suppression bitmasks back to the host.
  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  CUDA_CHECK(cudaMemcpy(&mask_host[0],
                        mask_dev,
                        sizeof(unsigned long long) * boxes_num * col_blocks,
                        cudaMemcpyDeviceToHost));

  // Greedy selection: walk boxes in score order and keep each one unless
  // an already-kept box has marked it as suppressed in remv.
  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }
  *num_out = num_to_keep;

  CUDA_CHECK(cudaFree(boxes_dev));
  CUDA_CHECK(cudaFree(mask_dev));
}
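
// ------------------------------------------------------------------
// Illustrative usage sketch (not part of the original file): shows one
// way the _nms entry point above might be driven from host code.  The
// example main, the NMS_EXAMPLE_MAIN guard, and the chosen boxes and
// threshold are assumptions for demonstration only; GPU device 0 is
// assumed to exist.
// ------------------------------------------------------------------
#ifdef NMS_EXAMPLE_MAIN
#include <cstdio>

int main() {
  // Three boxes as [x1, y1, x2, y2, score], pre-sorted by descending score.
  const float boxes[3 * 5] = {
     0.f,  0.f,  9.f,  9.f, 0.9f,
     1.f,  1.f, 10.f, 10.f, 0.8f,   // overlaps box 0 heavily -> suppressed
    50.f, 50.f, 60.f, 60.f, 0.7f
  };
  long keep[3];
  int num_kept = 0;

  _nms(keep, &num_kept, boxes, /*boxes_num=*/3, /*boxes_dim=*/5,
       /*nms_overlap_thresh=*/0.3f, /*device_id=*/0);

  // With these inputs the expected survivors are boxes 0 and 2.
  for (int i = 0; i < num_kept; i++) {
    printf("kept box %ld\n", keep[i]);
  }
  return 0;
}
#endif  // NMS_EXAMPLE_MAIN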