diff --git a/bench/bench_layer.cpp b/bench/bench_layer.cpp
index b4370456..a3bc2538 100644
--- a/bench/bench_layer.cpp
+++ b/bench/bench_layer.cpp
@@ -18,7 +18,7 @@
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
-
+
 // Created by fss on 23-4-27.
 #include <benchmark/benchmark.h>
 #include "../source/layer/details/adaptive_avgpooling.hpp"
@@ -238,7 +238,6 @@
 BENCHMARK(BM_SiLU)->Args({32, 160, 160})->Unit(benchmark::kMillisecond);
 BENCHMARK(BM_SiLU)->Args({64, 80, 80})->Unit(benchmark::kMillisecond);
 BENCHMARK(BM_SiLU)->Args({128, 40, 40})->Unit(benchmark::kMillisecond);
-
 
 static void BM_ReLU(benchmark::State& state) {
   using namespace kuiper_infer;
@@ -334,6 +333,7 @@ static void BM_Upsample(benchmark::State& state) {
   uint32_t rows = state.range(1);
   uint32_t cols = state.range(2);
+  UpSampleMode mode = UpSampleMode(state.range(3));
   std::shared_ptr<Tensor<float>> input =
       std::make_shared<Tensor<float>>(channels, rows, cols);
   input->Rand();
@@ -342,17 +342,22 @@ static void BM_Upsample(benchmark::State& state) {
   inputs.push_back(input);
   std::vector<std::shared_ptr<Tensor<float>>> outputs(1);
-  UpSampleLayer layer(2.f, 2.f);
+  UpSampleLayer layer(3.f, 3.f, mode);
   for (auto _ : state) {
     const auto status = layer.Forward(inputs, outputs);
   }
 }
-BENCHMARK(BM_Upsample)->Args({3, 320, 320})->Unit(benchmark::kMillisecond);
-BENCHMARK(BM_Upsample)->Args({32, 160, 160})->Unit(benchmark::kMillisecond);
-BENCHMARK(BM_Upsample)->Args({64, 80, 80})->Unit(benchmark::kMillisecond);
-BENCHMARK(BM_Upsample)->Args({128, 40, 40})->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_Upsample)->Args({3, 320, 320, 0})->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_Upsample)->Args({32, 160, 160, 0})->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_Upsample)->Args({64, 80, 80, 0})->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_Upsample)->Args({128, 40, 40, 0})->Unit(benchmark::kMillisecond);
+
+BENCHMARK(BM_Upsample)->Args({3, 320, 320, 1})->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_Upsample)->Args({32, 160, 160, 1})->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_Upsample)->Args({64, 80, 80, 1})->Unit(benchmark::kMillisecond);
+BENCHMARK(BM_Upsample)->Args({128, 40, 40, 1})->Unit(benchmark::kMillisecond);
 
 static void BM_AdaptivePooling(benchmark::State& state) {
   using namespace kuiper_infer;
diff --git a/include/status_code.hpp b/include/status_code.hpp
index a7119339..b243ab41 100644
--- a/include/status_code.hpp
+++ b/include/status_code.hpp
@@ -75,8 +75,9 @@ enum class ParseParameterAttrStatus {
   kParameterMissingScale = 14,
   kParameterMissingResizeMode = 15,
   kParameterMissingDilation = 16,
-  kParameterMissingPaddingMode = 16,
-  kParameterMissingOutputPadding = 17,
+  kParameterMissingPaddingMode = 17,
+  kParameterMissingOutputPadding = 18,
+  kParameterMissingAlignCorner = 19,
 
   kAttrMissingBias = 21,
   kAttrMissingWeight = 22,
diff --git a/source/layer/details/convolution.cpp b/source/layer/details/convolution.cpp
index a459a59d..7d96fbde 100644
--- a/source/layer/details/convolution.cpp
+++ b/source/layer/details/convolution.cpp
@@ -246,7 +246,7 @@ InferStatus ConvolutionLayer::Forward(
     CHECK(kernel_count % groups_ == 0);
     CHECK(input_c % groups_ == 0);
   }
-  uint32_t input_c_group = input_c / groups_;
+  const uint32_t input_c_group = input_c / groups_;
   CHECK(input_c_group == kernel_c) << "The number of channel for the kernel "
                                       "matrix and input tensor do not match";
diff --git a/source/layer/details/upsample.cpp b/source/layer/details/upsample.cpp
index 0786835c..546f14ef 100644
--- a/source/layer/details/upsample.cpp
+++ b/source/layer/details/upsample.cpp
@@ -25,7 +25,32 @@
 #include "layer/abstract/layer_factory.hpp"
 namespace kuiper_infer {
-UpSampleLayer::UpSampleLayer(float scale_h, float scale_w, UpSampleMode mode)
+static void CalcIndexAndLambda(int32_t input_size, int32_t output_size,
+                               float div_scale, int32_t output_idx,
+                               float& lambda0, float& lambda1,
+                               int32_t& input_index0, int32_t& input_index1) {
+  if (output_size == input_size) {
+    input_index0 = input_index1 = output_idx;
+    lambda0 = 1;
+    lambda1 = 0;
+  } else {
+    float real_input_idx =
+        div_scale * (static_cast<float>(output_idx) + 0.5f) - 0.5f;
+    if (real_input_idx < 0) {
+      real_input_idx = 0;
+    }
+
+    input_index0 = static_cast<int32_t>(real_input_idx);
+    int32_t offset = (input_index0 < input_size - 1) ? 1 : 0;
+    input_index1 = input_index0 + offset;
+
+    lambda1 = real_input_idx - static_cast<float>(input_index0);
+    lambda0 = 1.0f - lambda1;
+  }
+}
+
+UpSampleLayer::UpSampleLayer(uint32_t scale_h, uint32_t scale_w,
+                             UpSampleMode mode)
     : NonParamLayer("upsample"),
       scale_h_(scale_h),
       scale_w_(scale_w),
@@ -46,14 +71,15 @@ InferStatus UpSampleLayer::Forward(
     return InferStatus::kInferFailedInputOutSizeMatchError;
   }
 
-  auto test_scale_factor = [](uint32_t origin, float scale_factor) {
-    float result = origin * scale_factor;
-    if (std::abs(result - std::round(result)) > 1e-4f) {
+  auto test_scale_factor = [](uint32_t origin, uint32_t scale_factor) {
+    float result = static_cast<float>(origin * scale_factor);
+    if (std::abs(result - std::round(result)) > 1e-5f) {
       LOG(ERROR) << "The input scale_factor is wrong";
     }
   };
 
-  LOG_IF(FATAL, this->mode_ != UpSampleMode::kModeNearest)
+  LOG_IF(FATAL, this->mode_ != UpSampleMode::kModeNearest &&
+                    this->mode_ != UpSampleMode::kModeBilinear)
       << "Unsupported upsample mode: " << int(mode_);
 
   for (uint32_t i = 0; i < inputs.size(); ++i) {
@@ -69,25 +95,23 @@ InferStatus UpSampleLayer::Forward(
   }
 
   const uint32_t batch_size = inputs.size();
-  const uint32_t scale_w = uint32_t(scale_w_);
-  const uint32_t scale_h = uint32_t(scale_h_);
 
 #pragma omp parallel for num_threads(batch_size)
   for (uint32_t i = 0; i < batch_size; ++i) {
     const arma::fcube& input_data = inputs.at(i)->data();
     std::shared_ptr<Tensor<float>> output = outputs.at(i);
     if (output == nullptr || output->empty()) {
-      output = std::make_shared<Tensor<float>>(
-          input_data.n_slices, uint32_t(input_data.n_rows * scale_h),
-          uint32_t(input_data.n_cols * scale_w));
+      output = std::make_shared<Tensor<float>>(input_data.n_slices,
+                                               input_data.n_rows * scale_h_,
+                                               input_data.n_cols * scale_w_);
       outputs.at(i) = output;
     }
     auto& output_data = output->data();
-    CHECK(output_data.n_rows == input_data.n_rows * scale_h)
+    CHECK(output_data.n_rows == input_data.n_rows * scale_h_)
         << "The input and output tensor height of the upsample layer do not "
           "match "
        << i << "th";
-    CHECK(output_data.n_cols == input_data.n_cols * scale_w)
+    CHECK(output_data.n_cols == input_data.n_cols * scale_w_)
        << "The input and output tensor width of the upsample layer do not "
          "match "
        << i << "th";
@@ -97,36 +121,80 @@ InferStatus UpSampleLayer::Forward(
            "match "
        << i << "th";
 
+    const float div_scale_h = 1.f / static_cast<float>(scale_h_);
+    const float div_scale_w = 1.f / static_cast<float>(scale_w_);
     const uint32_t channels = input_data.n_slices;
-    for (uint32_t c = 0; c < channels; ++c) {
-      const arma::fmat& input_channel = input_data.slice(c);
-      arma::fmat& output_channel = output_data.slice(c);
-
-      const uint32_t input_w = input_channel.n_cols;
-      const uint32_t input_h = input_channel.n_rows;
-      const uint32_t output_w = output_channel.n_cols;
-      const uint32_t output_h = output_channel.n_rows;
-
-      for (uint32_t w = 0; w < input_w; ++w) {
-        const float* input_col_ptr = input_channel.colptr(w);
-        const uint32_t scaled_w = w * scale_w;
-        for (uint32_t sw = 0; sw < scale_w; ++sw) {
-          if (scaled_w + sw >= output_w) {
-            continue;
-          }
-          float* output_col_ptr = output_channel.colptr(scaled_w + sw);
-          for (uint32_t h = 0; h < input_h; ++h) {
-            const uint32_t scaled_h = h * scale_h;
-            float* output_ptr = output_col_ptr + scaled_h;
-            float input_value = *(input_col_ptr + h);
-            for (uint32_t sh = 0; sh < scale_h; ++sh) {
-              if (scaled_h + sh < output_h) {
-                *(output_ptr + sh) = input_value;
+    if (mode_ == UpSampleMode::kModeNearest) {
+#pragma omp parallel for
+      for (uint32_t c = 0; c < channels; ++c) {
+        const arma::fmat& input_channel = input_data.slice(c);
+        arma::fmat& output_channel = output_data.slice(c);
+
+        const uint32_t input_w = input_channel.n_cols;
+        const uint32_t input_h = input_channel.n_rows;
+        const uint32_t output_w = output_channel.n_cols;
+        const uint32_t output_h = output_channel.n_rows;
+        for (uint32_t w = 0; w < input_w; ++w) {
+          const float* input_col_ptr = input_channel.colptr(w);
+          const uint32_t scaled_w = w * scale_w_;
+          for (uint32_t sw = 0; sw < scale_w_; ++sw) {
+            if (scaled_w + sw >= output_w) {
+              continue;
+            }
+            float* output_col_ptr = output_channel.colptr(scaled_w + sw);
+            for (uint32_t h = 0; h < input_h; ++h) {
+              const uint32_t scaled_h = h * scale_h_;
+              float* output_ptr = output_col_ptr + scaled_h;
+              float input_value = *(input_col_ptr + h);
+              for (uint32_t sh = 0; sh < scale_h_; ++sh) {
+                if (scaled_h + sh < output_h) {
+                  *(output_ptr + sh) = input_value;
+                }
               }
             }
           }
         }
       }
+    } else {
+#pragma omp parallel for
+      for (uint32_t c = 0; c < channels; ++c) {
+        const arma::fmat& input_channel = input_data.slice(c);
+        arma::fmat& output_channel = output_data.slice(c);
+
+        const uint32_t input_w = input_channel.n_cols;
+        const uint32_t input_h = input_channel.n_rows;
+        const uint32_t output_w = output_channel.n_cols;
+        const uint32_t output_h = output_channel.n_rows;
+        for (uint32_t w = 0; w < output_w; ++w) {
+          float* output_ptr = output_channel.colptr(w);
+          float w0_lambda = 0.f;
+          float w1_lambda = 0.f;
+          int32_t input_w0 = 0;
+          int32_t input_w1 = 0;
+          CalcIndexAndLambda(static_cast<int32_t>(input_w),
+                             static_cast<int32_t>(output_w), div_scale_w,
+                             static_cast<int32_t>(w), w0_lambda, w1_lambda,
+                             input_w0, input_w1);
+          const float* input_ptr0 = input_channel.colptr(input_w0);
+          const float* input_ptr1 = input_channel.colptr(input_w1);
+          for (uint32_t h = 0; h < output_h; ++h) {
+            float h0_lambda = 0.f;
+            float h1_lambda = 0.f;
+            int32_t input_h0 = 0;
+            int32_t input_h1 = 0;
+            CalcIndexAndLambda(static_cast<int32_t>(input_h),
+                               static_cast<int32_t>(output_h), div_scale_h,
+                               static_cast<int32_t>(h), h0_lambda, h1_lambda,
+                               input_h0, input_h1);
+
+            *(output_ptr + h) =
+                h0_lambda * w0_lambda * (*(input_ptr0 + input_h0)) +
+                h0_lambda * w1_lambda * (*(input_ptr1 + input_h0)) +
+                h1_lambda * w0_lambda * (*(input_ptr0 + input_h1)) +
+                h1_lambda * w1_lambda * (*(input_ptr1 + input_h1));
+          }
+        }
+      }
+    }
   }
   return InferStatus::kInferSuccess;
@@ -143,7 +211,6 @@ ParseParameterAttrStatus UpSampleLayer::CreateInstance(
     return ParseParameterAttrStatus::kParameterMissingScale;
   }
 
-  const auto& scale_param = params.at("scale_factor");
   auto scales = std::dynamic_pointer_cast<RuntimeParameterFloatArray>(
       params.at("scale_factor"));
   if (scales == nullptr) {
@@ -157,17 +224,45 @@ ParseParameterAttrStatus UpSampleLayer::CreateInstance(
     return ParseParameterAttrStatus::kParameterMissingResizeMode;
   }
 
-  auto mode =
+  auto mode_param =
       std::dynamic_pointer_cast<RuntimeParameterString>(params.at("mode"));
-  CHECK(mode->value == "nearest")
-      << "The mode " << mode->value << " is not supported!";
+
+  UpSampleMode mode;
+  if (mode_param->value == "nearest") {
+    mode = UpSampleMode::kModeNearest;
+  } else if (mode_param->value == "bilinear") {
+    mode = UpSampleMode::kModeBilinear;
+  } else {
+    LOG(FATAL) << "The mode " << mode_param->value << " is not supported!";
+  }
+
+  if (params.find("align_corners") != params.end()) {
+    auto align_corner_param = std::dynamic_pointer_cast<RuntimeParameterBool>(
+        params.at("align_corners"));
+    if (!align_corner_param) {
+      return ParseParameterAttrStatus::kParameterMissingAlignCorner;
+    }
+    bool align_corner = align_corner_param->value;
+    CHECK_EQ(align_corner, false);
+  }
 
   float scale_h = scales->value.at(0);
   float scale_w = scales->value.at(1);
-  upsample_layer = std::make_shared<UpSampleLayer>(scale_h, scale_w);
+  // The scale factors must be greater than zero
+  CHECK_GT(scale_h, 0.f);
+  CHECK_GT(scale_w, 0.f);
+
+  // The scale factors must be integral values
+  CHECK_LE(scale_h - static_cast<uint32_t>(scale_h), 1e-5f);
+  CHECK_LE(scale_w - static_cast<uint32_t>(scale_w), 1e-5f);
+
+  upsample_layer = std::make_shared<UpSampleLayer>(
+      static_cast<uint32_t>(scale_h), static_cast<uint32_t>(scale_w), mode);
   return ParseParameterAttrStatus::kParameterAttrParseSuccess;
 }
 
 LayerRegistererWrapper kUpSamplerCreateInstance("nn.Upsample",
                                                 UpSampleLayer::CreateInstance);
+LayerRegistererWrapper kUpSamplerFCreateInstance("F.upsample",
+                                                 UpSampleLayer::CreateInstance);
 }  // namespace kuiper_infer
diff --git a/source/layer/details/upsample.hpp b/source/layer/details/upsample.hpp
index 750ef8c2..1f47acac 100644
--- a/source/layer/details/upsample.hpp
+++ b/source/layer/details/upsample.hpp
@@ -18,7 +18,7 @@
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
-
+
 // Created by fss on 22-12-25.
 #ifndef KUIPER_INFER_SOURCE_LAYER_DETAILS_UPSAMPLE_HPP_
@@ -27,12 +27,13 @@
 namespace kuiper_infer {
 enum class UpSampleMode {
-  kModeNearest = 0,  // currently only nearest-neighbour upsampling is supported
+  kModeNearest = 0,
+  kModeBilinear = 1,  // the upsample layer now supports these two modes
 };
 
 class UpSampleLayer : public NonParamLayer {
  public:
-  explicit UpSampleLayer(float scale_h, float scale_w,
+  explicit UpSampleLayer(uint32_t scale_h, uint32_t scale_w,
                          UpSampleMode mode = UpSampleMode::kModeNearest);
 
   InferStatus Forward(
@@ -44,8 +45,8 @@ class UpSampleLayer : public NonParamLayer {
       std::shared_ptr<Layer>& upsample_layer);
 
  private:
-  float scale_h_ = 1.f;
-  float scale_w_ = 1.f;
+  uint32_t scale_h_ = 1;
+  uint32_t scale_w_ = 1;
   UpSampleMode mode_ = UpSampleMode::kModeNearest;
 };
 }  // namespace kuiper_infer
diff --git a/test/test_data/test_load_data.cpp b/test/test_data/test_load_data.cpp
index 79c083ad..164d6e9c 100644
--- a/test/test_data/test_load_data.cpp
+++ b/test/test_data/test_load_data.cpp
@@ -18,15 +18,16 @@
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
-
+
 // Created by fss on 22-11-21.
-#include <gtest/gtest.h>
 #include <glog/logging.h>
+#include <gtest/gtest.h>
 #include "data/load_data.hpp"
 
 TEST(test_load, load_csv_data) {
   using namespace kuiper_infer;
-  const arma::fmat &data = CSVDataLoader::LoadData("./tmp/data_loader/data1.csv");
+  const arma::fmat& data =
+      CSVDataLoader::LoadData("./tmp/data_loader/data1.csv");
   ASSERT_NE(data.empty(), true);
   ASSERT_EQ(data.n_rows, 3);
   ASSERT_EQ(data.n_cols, 4);
@@ -41,7 +42,8 @@
 TEST(test_load, load_csv_arange) {
   using namespace kuiper_infer;
-  const arma::fmat &data = CSVDataLoader::LoadData("./tmp/data_loader/data2.csv");
+  const arma::fmat& data =
+      CSVDataLoader::LoadData("./tmp/data_loader/data2.csv");
   ASSERT_NE(data.empty(), true);
   ASSERT_EQ(data.n_rows, 3);
   ASSERT_EQ(data.n_cols, 4);
@@ -59,7 +61,8 @@
 TEST(test_load, load_csv_missing_data1) {
   using namespace kuiper_infer;
-  const arma::fmat &data = CSVDataLoader::LoadData("./tmp/data_loader/data4.csv");
+  const arma::fmat& data =
+      CSVDataLoader::LoadData("./tmp/data_loader/data4.csv");
   ASSERT_NE(data.empty(), true);
   ASSERT_EQ(data.n_rows, 3);
   ASSERT_EQ(data.n_cols, 11);
@@ -79,7 +82,8 @@
 TEST(test_load, load_csv_missing_data2) {
   using namespace kuiper_infer;
-  const arma::fmat &data = CSVDataLoader::LoadData("./tmp/data_loader/data3.csv");
+  const arma::fmat& data =
+      CSVDataLoader::LoadData("./tmp/data_loader/data3.csv");
   ASSERT_NE(data.empty(), true);
 
   ASSERT_EQ(data.n_rows, 3);
@@ -105,7 +109,8 @@
 TEST(test_load, split_char) {
   using namespace kuiper_infer;
-  const arma::fmat &data = CSVDataLoader::LoadData("./tmp/data_loader/data5.csv", '-');
+  const arma::fmat& data =
+      CSVDataLoader::LoadData("./tmp/data_loader/data5.csv", '-');
   ASSERT_NE(data.empty(), true);
 
   ASSERT_EQ(data.n_rows, 3);
@@ -122,7 +127,8 @@
 TEST(test_load, load_minus_data) {
   using namespace kuiper_infer;
-  const arma::fmat &data = CSVDataLoader::LoadData("./tmp/data_loader/data6.csv", ',');
+  const arma::fmat& data =
+      CSVDataLoader::LoadData("./tmp/data_loader/data6.csv", ',');
   ASSERT_NE(data.empty(), true);
 
   ASSERT_EQ(data.n_rows, 3);
@@ -144,7 +150,8 @@
 TEST(test_load, load_large_data) {
   using namespace kuiper_infer;
-  const arma::fmat &data = CSVDataLoader::LoadData("./tmp/data_loader/data7.csv", ',');
+  const arma::fmat& data =
+      CSVDataLoader::LoadData("./tmp/data_loader/data7.csv", ',');
   ASSERT_NE(data.empty(), true);
   ASSERT_EQ(data.n_rows, 1024);
   ASSERT_EQ(data.n_cols, 1024);
@@ -165,6 +172,7 @@
 TEST(test_load, load_empty_data) {
   using namespace kuiper_infer;
-  const arma::fmat &data = CSVDataLoader::LoadData("./tmp/data_loader/notexists.csv", ',');
+  const arma::fmat& data =
+      CSVDataLoader::LoadData("./tmp/data_loader/notexists.csv", ',');
   ASSERT_EQ(data.empty(), true);
 }
\ No newline at end of file
diff --git a/test/test_layer/test_upsample.cpp b/test/test_layer/test_upsample.cpp
index 72c89f25..860946d3 100644
--- a/test/test_layer/test_upsample.cpp
+++ b/test/test_layer/test_upsample.cpp
@@ -18,11 +18,13 @@
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
-
+
 // Created by fss on 22-12-25.
-#include <gtest/gtest.h>
 #include <glog/logging.h>
+#include <gtest/gtest.h>
 #include "../../source/layer/details/upsample.hpp"
+#include "data/load_data.hpp"
+#include "runtime/runtime_ir.hpp"
 
 TEST(test_layer, forward_upsample1) {
   using namespace kuiper_infer;
@@ -32,7 +34,8 @@ TEST(test_layer, forward_upsample1) {
   const uint32_t rows = 224;
   const uint32_t cols = 224;
-  std::shared_ptr<Tensor<float>> input = std::make_shared<Tensor<float>>(channels, rows, cols);
+  std::shared_ptr<Tensor<float>> input =
+      std::make_shared<Tensor<float>>(channels, rows, cols);
   input->Rand();
   std::vector<std::shared_ptr<Tensor<float>>> inputs;
@@ -43,16 +46,17 @@ TEST(test_layer, forward_upsample1) {
   ASSERT_EQ(status, InferStatus::kInferSuccess);
   for (int i = 0; i < outputs.size(); ++i) {
-    const auto &output = outputs.at(i);
+    const auto& output = outputs.at(i);
     for (int c = 0; c < channels; ++c) {
-      const auto &output_channel = output->slice(i);
-      const auto &input_channel = input->slice(i);
+      const auto& output_channel = output->slice(i);
+      const auto& input_channel = input->slice(i);
       ASSERT_EQ(output_channel.n_rows / input_channel.n_rows, 2);
       ASSERT_EQ(output_channel.n_cols / input_channel.n_cols, 2);
       for (int r = 0; r < output_channel.n_rows; ++r) {
         for (int c_ = 0; c_ < output_channel.n_cols; ++c_) {
-          ASSERT_EQ(input_channel.at(r / 2, c_ / 2), output_channel.at(r, c_)) << r << " " << c_;
+          ASSERT_EQ(input_channel.at(r / 2, c_ / 2), output_channel.at(r, c_))
+              << r << " " << c_;
         }
       }
     }
@@ -67,7 +71,8 @@ TEST(test_layer, forward_upsample2) {
   const uint32_t rows = 224;
   const uint32_t cols = 224;
-  std::shared_ptr<Tensor<float>> input = std::make_shared<Tensor<float>>(channels, rows, cols);
+  std::shared_ptr<Tensor<float>> input =
+      std::make_shared<Tensor<float>>(channels, rows, cols);
   input->Rand();
   std::vector<std::shared_ptr<Tensor<float>>> inputs;
@@ -78,16 +83,17 @@ TEST(test_layer, forward_upsample2) {
   ASSERT_EQ(status, InferStatus::kInferSuccess);
   for (int i = 0; i < outputs.size(); ++i) {
-    const auto &output = outputs.at(i);
+    const auto& output = outputs.at(i);
     for (int c = 0; c < channels; ++c) {
-      const auto &output_channel = output->slice(i);
-      const auto &input_channel = input->slice(i);
+      const auto& output_channel = output->slice(i);
+      const auto& input_channel = input->slice(i);
       ASSERT_EQ(output_channel.n_rows / input_channel.n_rows, 2);
       ASSERT_EQ(output_channel.n_cols / input_channel.n_cols, 3);
       for (int r = 0; r < output_channel.n_rows; ++r) {
         for (int c_ = 0; c_ < output_channel.n_cols; ++c_) {
-          ASSERT_EQ(input_channel.at(r / 2, c_ / 3), output_channel.at(r, c_)) << r << " " << c_;
+          ASSERT_EQ(input_channel.at(r / 2, c_ / 3), output_channel.at(r, c_))
+              << r << " " << c_;
         }
       }
     }
@@ -101,7 +107,8 @@ TEST(test_layer, forward_upsample3) {
   const uint32_t rows = 224;
   const uint32_t cols = 224;
-  std::shared_ptr<Tensor<float>> input = std::make_shared<Tensor<float>>(channels, rows, cols);
+  std::shared_ptr<Tensor<float>> input =
+      std::make_shared<Tensor<float>>(channels, rows, cols);
   input->Rand();
   std::vector<std::shared_ptr<Tensor<float>>> inputs;
@@ -112,16 +119,17 @@ TEST(test_layer, forward_upsample3) {
   ASSERT_EQ(status, InferStatus::kInferSuccess);
   for (int i = 0; i < outputs.size(); ++i) {
-    const auto &output = outputs.at(i);
+    const auto& output = outputs.at(i);
     for (int c = 0; c < channels; ++c) {
-      const auto &output_channel = output->slice(i);
-      const auto &input_channel = input->slice(i);
+      const auto& output_channel = output->slice(i);
+      const auto& input_channel = input->slice(i);
       ASSERT_EQ(output_channel.n_rows / input_channel.n_rows, 3);
       ASSERT_EQ(output_channel.n_cols / input_channel.n_cols, 2);
       for (int r = 0; r < output_channel.n_rows; ++r) {
         for (int c_ = 0; c_ < output_channel.n_cols; ++c_) {
-          ASSERT_EQ(input_channel.at(r / 3, c_ / 2), output_channel.at(r, c_)) << r << " " << c_;
+          ASSERT_EQ(input_channel.at(r / 3, c_ / 2), output_channel.at(r, c_))
+              << r << " " << c_;
         }
       }
     }
@@ -135,7 +143,8 @@ TEST(test_layer, forward_upsample4) {
   const uint32_t rows = 224;
   const uint32_t cols = 224;
-  std::shared_ptr<Tensor<float>> input = std::make_shared<Tensor<float>>(channels, rows, cols);
+  std::shared_ptr<Tensor<float>> input =
+      std::make_shared<Tensor<float>>(channels, rows, cols);
   input->Rand();
   std::vector<std::shared_ptr<Tensor<float>>> inputs;
@@ -146,16 +155,17 @@ TEST(test_layer, forward_upsample4) {
   ASSERT_EQ(status, InferStatus::kInferSuccess);
   for (int i = 0; i < outputs.size(); ++i) {
-    const auto &output = outputs.at(i);
+    const auto& output = outputs.at(i);
     for (int c = 0; c < channels; ++c) {
-      const auto &output_channel = output->slice(i);
-      const auto &input_channel = input->slice(i);
+      const auto& output_channel = output->slice(i);
+      const auto& input_channel = input->slice(i);
       ASSERT_EQ(output_channel.n_rows / input_channel.n_rows, 3);
       ASSERT_EQ(output_channel.n_cols / input_channel.n_cols, 3);
       for (int r = 0; r < output_channel.n_rows; ++r) {
         for (int c_ = 0; c_ < output_channel.n_cols; ++c_) {
-          ASSERT_EQ(input_channel.at(r / 3, c_ / 3), output_channel.at(r, c_)) << r << " " << c_;
+          ASSERT_EQ(input_channel.at(r / 3, c_ / 3), output_channel.at(r, c_))
+              << r << " " << c_;
         }
       }
     }
@@ -169,7 +179,8 @@ TEST(test_layer, forward_upsample5) {
   const uint32_t rows = 224;
   const uint32_t cols = 224;
-  std::shared_ptr<Tensor<float>> input = std::make_shared<Tensor<float>>(channels, rows, cols);
+  std::shared_ptr<Tensor<float>> input =
+      std::make_shared<Tensor<float>>(channels, rows, cols);
   input->Rand();
   std::vector<std::shared_ptr<Tensor<float>>> inputs;
@@ -180,19 +191,92 @@ TEST(test_layer, forward_upsample5) {
   ASSERT_EQ(status, InferStatus::kInferSuccess);
   for (int i = 0; i < outputs.size(); ++i) {
-    const auto &output = outputs.at(i);
+    const auto& output = outputs.at(i);
     for (int c = 0; c < channels; ++c) {
-      const auto &output_channel = output->slice(i);
-      const auto &input_channel = input->slice(i);
+      const auto& output_channel = output->slice(i);
+      const auto& input_channel = input->slice(i);
       ASSERT_EQ(output_channel.n_rows / input_channel.n_rows, 4);
       ASSERT_EQ(output_channel.n_cols / input_channel.n_cols, 4);
       for (int r = 0; r < output_channel.n_rows; ++r) {
         for (int c_ = 0; c_ < output_channel.n_cols; ++c_) {
-          ASSERT_EQ(input_channel.at(r / 4, c_ / 4), output_channel.at(r, c_)) << r << " " << c_;
+          ASSERT_EQ(input_channel.at(r / 4, c_ / 4), output_channel.at(r, c_))
+              << r << " " << c_;
         }
       }
     }
   }
 }
+
+TEST(test_layer, forward_upsample_bilinear_noalign1) {
+  using namespace kuiper_infer;
+  RuntimeGraph graph("tmp/up/up_layer_mod.pnnx.param",
+                     "tmp/up/up_layer_mod.pnnx.bin");
+
+  graph.Build();
+  const uint32_t batch_size = 1;
+  std::vector<std::shared_ptr<Tensor<float>>> inputs;
+
+  const uint32_t size = 3 * 16 * 31;
+  std::vector<float> input_values;
+  for (uint32_t i = 0; i < size; ++i) {
+    input_values.push_back(float(i));
+  }
+  sftensor input_tensor = std::make_shared<ftensor>(3, 16, 31);
+  input_tensor->Fill(input_values, true);
+  inputs.push_back(input_tensor);
+  graph.set_inputs("pnnx_input_0", inputs);
+
+  graph.Forward();
+  std::vector<std::shared_ptr<Tensor<float>>> outputs =
+      graph.get_outputs("pnnx_output_0");
+  ASSERT_EQ(outputs.size(), 1);
+
+  sftensor output = outputs.front();
+  arma::fmat real =
+      CSVDataLoader::LoadData("tmp/up/test_upsample_bilinear.csv");
+  auto output_values = output->values(true);
+  for (uint32_t i = 0; i < output->size(); ++i) {
+    float output1 = real.at(i);
+    float output2 = output_values.at(i);
+    ASSERT_LE(std::abs(output1 - output2), 5e-4f)
+        << i << " output1: " << output1 << " output2: " << output2;
+  }
+}
+
+
+TEST(test_layer, forward_upsample_bilinear_noalign2) {
+  using namespace kuiper_infer;
+  RuntimeGraph graph("tmp/up/up_layer_mod1.pnnx.param",
+                     "tmp/up/up_layer_mod1.pnnx.bin");
+
+  graph.Build();
+  const uint32_t batch_size = 1;
+  std::vector<std::shared_ptr<Tensor<float>>> inputs;
+
+  const uint32_t size = 7 * 16 * 31;
+  std::vector<float> input_values;
+  for (uint32_t i = 0; i < size; ++i) {
+    input_values.push_back(float(i));
+  }
+  sftensor input_tensor = std::make_shared<ftensor>(7, 16, 31);
+  input_tensor->Fill(input_values, true);
+  inputs.push_back(input_tensor);
+  graph.set_inputs("pnnx_input_0", inputs);
+
+  graph.Forward();
+  std::vector<std::shared_ptr<Tensor<float>>> outputs =
+      graph.get_outputs("pnnx_output_0");
+  ASSERT_EQ(outputs.size(), 1);
+
+  sftensor output = outputs.front();
+  arma::fmat real =
+      CSVDataLoader::LoadData("tmp/up/test_upsample_bilinear11.csv");
+  auto output_values = output->values(true);
+  for (uint32_t i = 0; i < output->size(); ++i) {
+    float output1 = real.at(i);
+    float output2 = output_values.at(i);
+    ASSERT_LE(std::abs(output1 - output2), 5e-4f)
+        << i << " output1: " << output1 << " output2: " << output2;
+  }
+}
diff --git a/tmp b/tmp
index 84f3ae85..f67f1833 160000
--- a/tmp
+++ b/tmp
@@ -1 +1 @@
-Subproject commit 84f3ae857901f05743014fc72f3931239e0e8625
+Subproject commit f67f1833da6a7e2a513f01b0f003ef9e9a2a8d29
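Reviewer note (not part of the patch): the new bilinear branch follows the half-pixel convention PyTorch uses when align_corners is false, i.e. the source coordinate for output index o at scale s is (o + 0.5) / s - 0.5, clamped at the borders, and the two neighbouring rows/columns are blended with weights (1 - frac) and frac. The standalone sketch below mirrors the patch's CalcIndexAndLambda logic on a plain row-major buffer so the arithmetic can be checked in isolation; the 2x2 -> 4x4 example and all names in it are illustrative only and not part of kuiper_infer. Its printed output can be compared against torch.nn.functional.interpolate(x, scale_factor=2, mode='bilinear', align_corners=False).

// Standalone sketch of half-pixel (align_corners = false) bilinear upsampling,
// mirroring the CalcIndexAndLambda logic from the patch. Illustrative only.
#include <cstdint>
#include <cstdio>
#include <vector>

static void CalcIndexAndLambda(int32_t in_size, int32_t out_size, float div_scale,
                               int32_t out_idx, float& lambda0, float& lambda1,
                               int32_t& idx0, int32_t& idx1) {
  if (out_size == in_size) {
    idx0 = idx1 = out_idx;
    lambda0 = 1.f;
    lambda1 = 0.f;
  } else {
    // Half-pixel source coordinate, clamped to 0 at the top/left border.
    float real = div_scale * (static_cast<float>(out_idx) + 0.5f) - 0.5f;
    if (real < 0) real = 0;
    idx0 = static_cast<int32_t>(real);
    idx1 = idx0 + ((idx0 < in_size - 1) ? 1 : 0);  // clamp at the bottom/right border
    lambda1 = real - static_cast<float>(idx0);     // weight of the second neighbour (idx1)
    lambda0 = 1.0f - lambda1;                      // weight of the first neighbour (idx0)
  }
}

int main() {
  // Upsample a 2x2 image to 4x4 (scale 2), stored row-major.
  const int32_t in_h = 2, in_w = 2, scale = 2;
  const int32_t out_h = in_h * scale, out_w = in_w * scale;
  const float input[in_h * in_w] = {1.f, 2.f, 3.f, 4.f};
  std::vector<float> output(out_h * out_w);

  const float div_scale = 1.f / static_cast<float>(scale);
  for (int32_t h = 0; h < out_h; ++h) {
    float h0, h1;
    int32_t ih0, ih1;
    CalcIndexAndLambda(in_h, out_h, div_scale, h, h0, h1, ih0, ih1);
    for (int32_t w = 0; w < out_w; ++w) {
      float w0, w1;
      int32_t iw0, iw1;
      CalcIndexAndLambda(in_w, out_w, div_scale, w, w0, w1, iw0, iw1);
      // Same four-term blend as the patch: weights pair (h, w) neighbours.
      output[h * out_w + w] = h0 * w0 * input[ih0 * in_w + iw0] +
                              h0 * w1 * input[ih0 * in_w + iw1] +
                              h1 * w0 * input[ih1 * in_w + iw0] +
                              h1 * w1 * input[ih1 * in_w + iw1];
    }
  }

  for (int32_t h = 0; h < out_h; ++h) {
    for (int32_t w = 0; w < out_w; ++w) printf("%.4f ", output[h * out_w + w]);
    printf("\n");
  }
  return 0;
}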