Skip to content

Commit

Permalink
增加bilinear上采样
Browse files Browse the repository at this point in the history
  • Loading branch information
zjhellofss committed Aug 17, 2023
1 parent 7218e64 commit 22fcc33
Show file tree
Hide file tree
Showing 8 changed files with 288 additions and 94 deletions.
19 changes: 12 additions & 7 deletions bench/bench_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// Created by fss on 23-4-27.
#include <benchmark/benchmark.h>
#include "../source/layer/details/adaptive_avgpooling.hpp"
Expand Down Expand Up @@ -238,7 +238,6 @@ BENCHMARK(BM_SiLU)->Args({32, 160, 160})->Unit(benchmark::kMillisecond);
BENCHMARK(BM_SiLU)->Args({64, 80, 80})->Unit(benchmark::kMillisecond);
BENCHMARK(BM_SiLU)->Args({128, 40, 40})->Unit(benchmark::kMillisecond);


static void BM_ReLU(benchmark::State& state) {
using namespace kuiper_infer;

Expand Down Expand Up @@ -334,6 +333,7 @@ static void BM_Upsample(benchmark::State& state) {
uint32_t rows = state.range(1);
uint32_t cols = state.range(2);

UpSampleMode mode = UpSampleMode(state.range(3));
std::shared_ptr<Tensor<float>> input =
std::make_shared<Tensor<float>>(channels, rows, cols);
input->Rand();
Expand All @@ -342,17 +342,22 @@ static void BM_Upsample(benchmark::State& state) {
inputs.push_back(input);

std::vector<std::shared_ptr<Tensor<float>>> outputs(1);
UpSampleLayer layer(2.f, 2.f);
UpSampleLayer layer(3.f, 3.f, mode);

for (auto _ : state) {
const auto status = layer.Forward(inputs, outputs);
}
}

BENCHMARK(BM_Upsample)->Args({3, 320, 320})->Unit(benchmark::kMillisecond);
BENCHMARK(BM_Upsample)->Args({32, 160, 160})->Unit(benchmark::kMillisecond);
BENCHMARK(BM_Upsample)->Args({64, 80, 80})->Unit(benchmark::kMillisecond);
BENCHMARK(BM_Upsample)->Args({128, 40, 40})->Unit(benchmark::kMillisecond);
BENCHMARK(BM_Upsample)->Args({3, 320, 320, 0})->Unit(benchmark::kMillisecond);
BENCHMARK(BM_Upsample)->Args({32, 160, 160, 0})->Unit(benchmark::kMillisecond);
BENCHMARK(BM_Upsample)->Args({64, 80, 80, 0})->Unit(benchmark::kMillisecond);
BENCHMARK(BM_Upsample)->Args({128, 40, 40, 0})->Unit(benchmark::kMillisecond);

BENCHMARK(BM_Upsample)->Args({3, 320, 320, 1})->Unit(benchmark::kMillisecond);
BENCHMARK(BM_Upsample)->Args({32, 160, 160, 1})->Unit(benchmark::kMillisecond);
BENCHMARK(BM_Upsample)->Args({64, 80, 80, 1})->Unit(benchmark::kMillisecond);
BENCHMARK(BM_Upsample)->Args({128, 40, 40, 1})->Unit(benchmark::kMillisecond);

static void BM_AdaptivePooling(benchmark::State& state) {
using namespace kuiper_infer;
Expand Down
5 changes: 3 additions & 2 deletions include/status_code.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,9 @@ enum class ParseParameterAttrStatus {
kParameterMissingScale = 14,
kParameterMissingResizeMode = 15,
kParameterMissingDilation = 16,
kParameterMissingPaddingMode = 16,
kParameterMissingOutputPadding = 17,
kParameterMissingPaddingMode = 17,
kParameterMissingOutputPadding = 18,
kParameterMissingAlignCorner = 19,

kAttrMissingBias = 21,
kAttrMissingWeight = 22,
Expand Down
2 changes: 1 addition & 1 deletion source/layer/details/convolution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ InferStatus ConvolutionLayer::Forward(
CHECK(kernel_count % groups_ == 0);
CHECK(input_c % groups_ == 0);
}
uint32_t input_c_group = input_c / groups_;
const uint32_t input_c_group = input_c / groups_;
CHECK(input_c_group == kernel_c)
<< "The number of channel for the kernel "
"matrix and input tensor do not match";
Expand Down
177 changes: 136 additions & 41 deletions source/layer/details/upsample.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,32 @@
#include "layer/abstract/layer_factory.hpp"
namespace kuiper_infer {

UpSampleLayer::UpSampleLayer(float scale_h, float scale_w, UpSampleMode mode)
// Maps one output coordinate back onto the two neighboring input coordinates
// used for bilinear interpolation, together with their blending weights.
//
// Uses the half-pixel-center convention (align_corners == false):
//   src = div_scale * (dst + 0.5) - 0.5, where div_scale = 1 / scale_factor.
// On return the interpolated value along this axis is
//   lambda0 * in[input_index0] + lambda1 * in[input_index1].
//
// input_size / output_size : extent of this axis before / after resizing
// div_scale               : reciprocal of the scale factor for this axis
// output_idx              : output coordinate being produced
// lambda0, lambda1        : weights for input_index0 / input_index1 (sum to 1)
// input_index0, input_index1 : left and right source indices, both clamped
//                              to [0, input_size - 1]
static void CalcIndexAndLambda(int32_t input_size, int32_t output_size,
                               float div_scale, int32_t output_idx,
                               float& lambda0, float& lambda1,
                               int32_t& input_index0, int32_t& input_index1) {
  if (output_size == input_size) {
    // Identity resize: each output pixel maps exactly onto one input pixel.
    input_index0 = input_index1 = output_idx;
    lambda0 = 1;
    lambda1 = 0;
  } else {
    float real_input_idx =
        div_scale * (static_cast<float>(output_idx) + 0.5f) - 0.5f;
    // Half-pixel mapping can go slightly negative for the first output
    // pixels; clamp so we never sample before the first input element.
    if (real_input_idx < 0) {
      real_input_idx = 0;
    }

    input_index0 = static_cast<int32_t>(real_input_idx);
    // Right neighbor is clamped at the last valid index so the final output
    // pixels replicate the border instead of reading out of bounds.
    int32_t offset = (input_index0 < input_size - 1) ? 1 : 0;
    input_index1 = input_index0 + offset;

    lambda1 = real_input_idx - static_cast<float>(input_index0);
    lambda0 = 1.0f - lambda1;
  }
}

UpSampleLayer::UpSampleLayer(uint32_t scale_h, uint32_t scale_w,
UpSampleMode mode)
: NonParamLayer("upsample"),
scale_h_(scale_h),
scale_w_(scale_w),
Expand All @@ -46,14 +71,15 @@ InferStatus UpSampleLayer::Forward(
return InferStatus::kInferFailedInputOutSizeMatchError;
}

auto test_scale_factor = [](uint32_t origin, float scale_factor) {
float result = origin * scale_factor;
if (std::abs(result - std::round(result)) > 1e-4f) {
auto test_scale_factor = [](uint32_t origin, uint32_t scale_factor) {
float result = static_cast<float>(origin * scale_factor);
if (std::abs(result - std::round(result)) > 1e-5f) {
LOG(ERROR) << "The input scale_factor is wrong";
}
};

LOG_IF(FATAL, this->mode_ != UpSampleMode::kModeNearest)
LOG_IF(FATAL, this->mode_ != UpSampleMode::kModeNearest &&
this->mode_ != UpSampleMode::kModeBilinear)
<< "Unsupported upsample mode: " << int(mode_);

for (uint32_t i = 0; i < inputs.size(); ++i) {
Expand All @@ -69,25 +95,23 @@ InferStatus UpSampleLayer::Forward(
}

const uint32_t batch_size = inputs.size();
const uint32_t scale_w = uint32_t(scale_w_);
const uint32_t scale_h = uint32_t(scale_h_);

#pragma omp parallel for num_threads(batch_size)
for (uint32_t i = 0; i < batch_size; ++i) {
const arma::fcube& input_data = inputs.at(i)->data();
std::shared_ptr<Tensor<float>> output = outputs.at(i);
if (output == nullptr || output->empty()) {
output = std::make_shared<Tensor<float>>(
input_data.n_slices, uint32_t(input_data.n_rows * scale_h),
uint32_t(input_data.n_cols * scale_w));
output = std::make_shared<Tensor<float>>(input_data.n_slices,
input_data.n_rows * scale_h_,
input_data.n_cols * scale_w_);
outputs.at(i) = output;
}
auto& output_data = output->data();
CHECK(output_data.n_rows == input_data.n_rows * scale_h)
CHECK(output_data.n_rows == input_data.n_rows * scale_h_)
<< "The input and output tensor height of the upsample layer do not "
"match "
<< i << "th";
CHECK(output_data.n_cols == input_data.n_cols * scale_w)
CHECK(output_data.n_cols == input_data.n_cols * scale_w_)
<< "The input and output tensor width of the upsample layer do not "
"match "
<< i << "th";
Expand All @@ -97,36 +121,80 @@ InferStatus UpSampleLayer::Forward(
"match "
<< i << "th";

const float div_scale_h = 1.f / static_cast<float>(scale_h_);
const float div_scale_w = 1.f / static_cast<float>(scale_w_);
const uint32_t channels = input_data.n_slices;
for (uint32_t c = 0; c < channels; ++c) {
const arma::fmat& input_channel = input_data.slice(c);
arma::fmat& output_channel = output_data.slice(c);

const uint32_t input_w = input_channel.n_cols;
const uint32_t input_h = input_channel.n_rows;
const uint32_t output_w = output_channel.n_cols;
const uint32_t output_h = output_channel.n_rows;

for (uint32_t w = 0; w < input_w; ++w) {
const float* input_col_ptr = input_channel.colptr(w);
const uint32_t scaled_w = w * scale_w;
for (uint32_t sw = 0; sw < scale_w; ++sw) {
if (scaled_w + sw >= output_w) {
continue;
}
float* output_col_ptr = output_channel.colptr(scaled_w + sw);
for (uint32_t h = 0; h < input_h; ++h) {
const uint32_t scaled_h = h * scale_h;
float* output_ptr = output_col_ptr + scaled_h;
float input_value = *(input_col_ptr + h);
for (uint32_t sh = 0; sh < scale_h; ++sh) {
if (scaled_h + sh < output_h) {
*(output_ptr + sh) = input_value;
if (mode_ == UpSampleMode::kModeNearest) {
#pragma omp parallel for
for (uint32_t c = 0; c < channels; ++c) {
const arma::fmat& input_channel = input_data.slice(c);
arma::fmat& output_channel = output_data.slice(c);

const uint32_t input_w = input_channel.n_cols;
const uint32_t input_h = input_channel.n_rows;
const uint32_t output_w = output_channel.n_cols;
const uint32_t output_h = output_channel.n_rows;
for (uint32_t w = 0; w < input_w; ++w) {
const float* input_col_ptr = input_channel.colptr(w);
const uint32_t scaled_w = w * scale_w_;
for (uint32_t sw = 0; sw < scale_w_; ++sw) {
if (scaled_w + sw >= output_w) {
continue;
}
float* output_col_ptr = output_channel.colptr(scaled_w + sw);
for (uint32_t h = 0; h < input_h; ++h) {
const uint32_t scaled_h = h * scale_h_;
float* output_ptr = output_col_ptr + scaled_h;
float input_value = *(input_col_ptr + h);
for (uint32_t sh = 0; sh < scale_h_; ++sh) {
if (scaled_h + sh < output_h) {
*(output_ptr + sh) = input_value;
}
}
}
}
}
}
} else {
#pragma omp parallel for
for (uint32_t c = 0; c < channels; ++c) {
const arma::fmat& input_channel = input_data.slice(c);
arma::fmat& output_channel = output_data.slice(c);

const uint32_t input_w = input_channel.n_cols;
const uint32_t input_h = input_channel.n_rows;
const uint32_t output_w = output_channel.n_cols;
const uint32_t output_h = output_channel.n_rows;
for (uint32_t w = 0; w < output_w; ++w) {
float* output_ptr = output_channel.colptr(w);
float w0_lambda = 0.f;
float w1_lambda = 0.f;
int32_t input_w0 = 0;
int32_t input_w1 = 0;
CalcIndexAndLambda(static_cast<int32_t>(input_w),
static_cast<int32_t>(output_w), div_scale_w,
static_cast<int32_t>(w), w0_lambda, w1_lambda,
input_w0, input_w1);
const float* input_ptr0 = input_channel.colptr(input_w0);
const float* input_ptr1 = input_channel.colptr(input_w1);
for (uint32_t h = 0; h < output_h; ++h) {
float h0_lambda = 0.f;
float h1_lambda = 0.f;
int32_t input_h0 = 0;
int32_t input_h1 = 0;
CalcIndexAndLambda(static_cast<int32_t>(input_h),
static_cast<int32_t>(output_h), div_scale_h,
static_cast<int32_t>(h), h0_lambda, h1_lambda,
input_h0, input_h1);

*(output_ptr + h) =
h0_lambda * w0_lambda * (*(input_ptr0 + input_h0)) +
h0_lambda * w1_lambda * (*(input_ptr1 + input_h0)) +
h1_lambda * w0_lambda * (*(input_ptr0 + input_h1)) +
h1_lambda * w1_lambda * (*(input_ptr1 + input_h1));
}
}
}
}
}
return InferStatus::kInferSuccess;
Expand All @@ -143,7 +211,6 @@ ParseParameterAttrStatus UpSampleLayer::CreateInstance(
return ParseParameterAttrStatus::kParameterMissingScale;
}

const auto& scale_param = params.at("scale_factor");
auto scales = std::dynamic_pointer_cast<RuntimeParameterFloatArray>(
params.at("scale_factor"));
if (scales == nullptr) {
Expand All @@ -157,17 +224,45 @@ ParseParameterAttrStatus UpSampleLayer::CreateInstance(
return ParseParameterAttrStatus::kParameterMissingResizeMode;
}

auto mode =
auto mode_param =
std::dynamic_pointer_cast<RuntimeParameterString>(params.at("mode"));
CHECK(mode->value == "nearest")
<< "The mode " << mode->value << " is not supported!";

UpSampleMode mode;
if (mode_param->value == "nearest") {
mode = UpSampleMode::kModeNearest;
} else if (mode_param->value == "bilinear") {
mode = UpSampleMode::kModeBilinear;
} else {
LOG(FATAL) << "The mode " << mode_param->value << " is not supported!";
}

if (params.find("align_corners") != params.end()) {
auto align_corner_param = std::dynamic_pointer_cast<RuntimeParameterBool>(
params.at("align_corners"));
if (!align_corner_param) {
return ParseParameterAttrStatus::kParameterMissingAlignCorner;
}
bool align_corner = align_corner_param->value;
CHECK_EQ(align_corner, false);
}

float scale_h = scales->value.at(0);
float scale_w = scales->value.at(1);
upsample_layer = std::make_shared<UpSampleLayer>(scale_h, scale_w);
// scale放大的倍数大于0
CHECK_GT(scale_h, 0.f);
CHECK_GT(scale_w, 0.f);

// scale放大的倍数必须是整数
CHECK_LE(scale_h - static_cast<uint32_t>(scale_h), 1e-5f);
CHECK_LE(scale_w - static_cast<uint32_t>(scale_w), 1e-5f);

upsample_layer = std::make_shared<UpSampleLayer>(
static_cast<uint32_t>(scale_h), static_cast<uint32_t>(scale_w), mode);
return ParseParameterAttrStatus::kParameterAttrParseSuccess;
}

LayerRegistererWrapper kUpSamplerCreateInstance("nn.Upsample",
UpSampleLayer::CreateInstance);
LayerRegistererWrapper kUpSamplerFCreateInstance("F.upsample",
UpSampleLayer::CreateInstance);
} // namespace kuiper_infer
11 changes: 6 additions & 5 deletions source/layer/details/upsample.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

// Created by fss on 22-12-25.

#ifndef KUIPER_INFER_SOURCE_LAYER_DETAILS_UPSAMPLE_HPP_
Expand All @@ -27,12 +27,13 @@

namespace kuiper_infer {
enum class UpSampleMode {
kModeNearest = 0, // 目前上采样层只支持邻近采样
kModeNearest = 0,
kModeBilinear = 1, // 目前上采样支持这两种
};

class UpSampleLayer : public NonParamLayer {
public:
explicit UpSampleLayer(float scale_h, float scale_w,
explicit UpSampleLayer(uint32_t scale_h, uint32_t scale_w,
UpSampleMode mode = UpSampleMode::kModeNearest);

InferStatus Forward(
Expand All @@ -44,8 +45,8 @@ class UpSampleLayer : public NonParamLayer {
std::shared_ptr<Layer>& upsample_layer);

private:
float scale_h_ = 1.f;
float scale_w_ = 1.f;
uint32_t scale_h_ = 1;
uint32_t scale_w_ = 1;
UpSampleMode mode_ = UpSampleMode::kModeNearest;
};
} // namespace kuiper_infer
Expand Down
Loading

0 comments on commit 22fcc33

Please sign in to comment.