Commit 09d7823: Modify the SSE implementation (修改sse实现)

zjhellofss committed on Jul 26, 2023
1 parent 5427b72 · commit 09d7823
Showing 7 changed files with 86 additions and 50 deletions.
2 changes: 1 addition & 1 deletion bench/bench_exp.cpp
@@ -4,7 +4,7 @@

 #include <benchmark/benchmark.h>
 #include <armadillo>
-#include "../source/layer/details/arma_sse.hpp"
+#include "../source/layer/details/activation_sse.hpp"
 #include "data/tensor.hpp"
 static void BM_ExpSimd(benchmark::State& state) {
   using namespace kuiper_infer;
source/layer/details/{arma_sse.cpp → activation_sse.cpp}
@@ -22,23 +22,26 @@
 //
 // Created by fss on 23-7-26.
 //
-#include "arma_sse.hpp"
+#include "activation_sse.hpp"
 #include <glog/logging.h>

 namespace kuiper_infer {

 namespace math {
-void ArmaSigmoid(const arma::fcube& input_data, arma::fcube& output_data) {
-  CHECK(!input_data.empty() && !output_data.empty())
+
+static void SigmoidSSE(sftensor input, sftensor output) {
+  CHECK(input != nullptr && output != nullptr)
+      << "The input or output tensor is empty.";
+  CHECK(!input->empty() && !output->empty())
       << "The input or output tensor is empty.";
-  const uint32_t in_size = input_data.size();
-  const uint32_t out_size = output_data.size();
-  CHECK(in_size == out_size) << "The input and output sizes are not equal.";
+  CHECK(input->size() == output->size())
+      << "The input and output sizes are not equal.";
 #ifdef __SSE2__
-  int32_t packet_size = 4;
   int32_t index = 0;
-  const float* in_ptr = input_data.memptr();
-  float* out_ptr = output_data.memptr();
+  int32_t packet_size = 4;
+  const uint32_t in_size = input->size();
+  const float* in_ptr = input->raw_ptr();
+  float* out_ptr = output->raw_ptr();
 #ifdef __AVX2__
   packet_size = 8;
   __m256 _one = _mm256_set1_ps(1.f);
@@ -65,34 +68,36 @@ void ArmaSigmoid(const arma::fcube& input_data, arma::fcube& output_data) {
 #endif
   if (index < in_size) {
     while (index < in_size) {
-      float value = input_data.at(index);
-      output_data.at(index) = 1 / (1.f + fmath::exp(-value));
+      float value = input->index(index);
+      output->index(index) = 1 / (1.f + fmath::exp(-value));
       index += 1;
     }
   }
 #else
-  output_data = 1.f / (1.f + arma::exp(-input_data));
+  output->data() = 1.f / (1.f + arma::exp(-input->data()));
 #endif
 }

-void ArmaReLU(const arma::fcube& input_data, arma::fcube& output_data) {
-  CHECK(!input_data.empty() && !output_data.empty())
+static void ReluSSE(sftensor input, sftensor output) {
+  CHECK(input != nullptr && output != nullptr)
+      << "The input or output tensor is empty.";
+  CHECK(!input->empty() && !output->empty())
       << "The input or output tensor is empty.";
-  CHECK(input_data.size() == output_data.size())
+  CHECK(input->size() == output->size())
       << "The input and output sizes are not equal.";
 #ifndef __SSE2__
-  for (uint32_t j = 0; j < input_data.size(); ++j) {
-    float value = input_data.at(j);
-    output_data.at(j) = value > 0.f ? value : 0.f;
+  for (uint32_t j = 0; j < input->size(); ++j) {
+    float value = input->index(j);
+    output->index(j) = value > 0.f ? value : 0.f;
   }
 #else
   int32_t j = 0;
   int32_t packet_size = 4;
-  const uint32_t size = output_data.size();
+  const uint32_t size = input->size();
+  const float* in_ptr = input->raw_ptr();
+  float* out_ptr = output->raw_ptr();
 #ifdef __AVX2__
   packet_size = 8;
-  const float* in_ptr = input_data.memptr();
-  float* out_ptr = output_data.memptr();
   __m256 _zero = _mm256_setzero_ps();
   for (j = 0; j <= (int32_t)size - packet_size; j += packet_size) {
     __m256 _p = _mm256_loadu_ps(in_ptr);
@@ -102,8 +107,6 @@ void ArmaReLU(const arma::fcube& input_data, arma::fcube& output_data) {
     out_ptr += packet_size;
   }
 #else
-  const float* in_ptr = input_data.memptr();
-  float* out_ptr = output_data.memptr();
   __m128 _zero = _mm_setzero_ps();
   for (j = 0; j <= (int32_t)size - packet_size; j += packet_size) {
     __m128 _p = _mm_load_ps(in_ptr);
@@ -115,29 +118,31 @@ void ArmaReLU(const arma::fcube& input_data, arma::fcube& output_data) {
 #endif
   if (j < size) {
     while (j < size) {
-      float value = input_data.at(j);
-      output_data.at(j) = value > 0.f ? value : 0.f;
+      float value = input->index(j);
+      output->index(j) = value > 0.f ? value : 0.f;
       j += 1;
     }
   }
 #endif
 }

-void ArmaSiLU(const arma::fcube& input_data, arma::fcube& output_data) {
-  CHECK(!input_data.empty() && !output_data.empty())
+static void SiluSSE(sftensor input, sftensor output) {
+  CHECK(input != nullptr && output != nullptr)
+      << "The input or output tensor is empty.";
+  CHECK(!input->empty() && !output->empty())
       << "The input or output tensor is empty.";
-  CHECK(input_data.size() == output_data.size())
+  CHECK(input->size() == output->size())
       << "The input and output sizes are not equal.";
 #ifndef __SSE2__
-  output_data = input_data / (1 + arma::exp(-input_data));
+  output->data() = input->data() / (1 + arma::exp(-input->data()));
 #else
   int32_t j = 0;
   int32_t packet_size = 4;
-  const uint32_t size = output_data.size();
+  const uint32_t size = input->size();
+  const float* in_ptr = input->raw_ptr();
+  float* out_ptr = output->raw_ptr();
 #ifdef __AVX2__
   packet_size = 8;
-  const float* in_ptr = input_data.memptr();
-  float* out_ptr = output_data.memptr();
   __m256 _one = _mm256_set1_ps(1.f);
   __m256 _zero = _mm256_setzero_ps();

@@ -150,8 +155,6 @@ void ArmaSiLU(const arma::fcube& input_data, arma::fcube& output_data) {
     out_ptr += packet_size;
   }
 #else
-  const float* in_ptr = input_data.memptr();
-  float* out_ptr = output_data.memptr();
   __m128 _one = _mm_set1_ps(1.f);
   __m128 _zero = _mm_setzero_ps();

@@ -165,13 +168,33 @@ void ArmaSiLU(const arma::fcube& input_data, arma::fcube& output_data) {
 #endif
   if (j < size) {
     while (j < size) {
-      float value = input_data.at(j);
-      output_data.at(j) = value / (1.f + fmath::exp(-value));
+      float value = input->index(j);
+      output->index(j) = value / (1.f + fmath::exp(-value));
       j += 1;
     }
   }
 #endif
 }

+ActivationFunc ApplySSEActivation(ActivationType act_type) {
+  ActivationFunc function;
+  switch (act_type) {
+    case ActivationType::kActivationRelu: {
+      function = ReluSSE;
+      return function;
+    }
+    case ActivationType::kActivationSigmoid: {
+      function = SigmoidSSE;
+      return function;
+    }
+    case ActivationType::kActivationSilu: {
+      function = SiluSSE;
+      return function;
+    }
+    default: {
+      LOG(FATAL) << "Unknown activation type: " << int(act_type);
+    }
+  }
+}
 }  // namespace math
-}  // namespace kuiper_infer
+}  // namespace kuiper_infer
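
The hunks above show only the edges of each vectorized loop. For reference, below is a minimal standalone sketch of the structure the three kernels share: a packet-wide main loop followed by a scalar tail for the leftover elements. It is shown for ReLU, which needs no vectorized exp; the raw-array interface and the main() driver are illustrative assumptions, not repository code.

#include <immintrin.h>  // AVX2 intrinsics
#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch of the committed kernels' structure: vector main loop + scalar tail.
static void ReluAvx2Sketch(const float* in, float* out, uint32_t size) {
  int32_t j = 0;
#ifdef __AVX2__
  const int32_t packet_size = 8;
  const __m256 zero = _mm256_setzero_ps();
  // Main loop: process as many full 8-float packets as fit, computing max(x, 0).
  for (; j <= (int32_t)size - packet_size; j += packet_size) {
    __m256 p = _mm256_loadu_ps(in + j);
    _mm256_storeu_ps(out + j, _mm256_max_ps(p, zero));
  }
#endif
  // Scalar tail: the last size % packet_size elements
  // (or everything when AVX2 is unavailable).
  for (; j < (int32_t)size; ++j) {
    out[j] = in[j] > 0.f ? in[j] : 0.f;
  }
}

int main() {
  std::vector<float> in = {-2.f, -1.f, 0.f, 1.f, 2.f, 3.f, -4.f, 5.f, -6.f};
  std::vector<float> out(in.size());
  ReluAvx2Sketch(in.data(), out.data(), (uint32_t)in.size());
  for (float v : out) std::printf("%.1f ", v);  // 0.0 0.0 0.0 1.0 2.0 3.0 0.0 5.0 0.0
  std::printf("\n");
  return 0;
}

Compile with -mavx2 to exercise the vector path; without it, the same function falls back to the scalar loop.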
source/layer/details/{arma_sse.hpp → activation_sse.hpp}
@@ -25,14 +25,27 @@
 #ifndef KUIPER_INFER_INCLUDE_MATH_ARMA_SSE
 #define KUIPER_INFER_INCLUDE_MATH_ARMA_SSE
 #include <armadillo>
+#include "data/tensor.hpp"
 #include "utils/math/fmath.hpp"
 namespace kuiper_infer {
 namespace math {
-void ArmaSigmoid(const arma::fcube& input_data, arma::fcube& output_data);
+enum class ActivationType {
+  kActivatetionUnknown = -1,
+  kActivationRelu = 0,
+  kActivationSilu = 1,
+  kActivationSigmoid = 2,
+};

-void ArmaReLU(const arma::fcube& input_data, arma::fcube& output_data);
+using ActivationFunc = std::function<void(sftensor, sftensor)>;
+
+ActivationFunc ApplySSEActivation(ActivationType act_type);
+
+static void SigmoidSSE(sftensor input, sftensor output);
+
+static void ReluSSE(sftensor input, sftensor output);
+
+static void SiluSSE(sftensor input, sftensor output);

-void ArmaSiLU(const arma::fcube& input_data, arma::fcube& output_data);
 }  // namespace math
 }  // namespace kuiper_infer
 #endif  // KUIPER_INFER_INCLUDE_MATH_ARMA_SSE
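
With this header, layers select a kernel through ActivationType instead of calling ArmaReLU, ArmaSigmoid, or ArmaSiLU by name, as the three layer diffs below show. Here is a hedged sketch of a caller that hoists the functor lookup out of a batch loop; the SigmoidBatch helper is hypothetical, while ApplySSEActivation, ActivationType, ActivationFunc, and sftensor come from the header (sftensor is assumed to be the repository's shared-pointer tensor alias).

#include <cstddef>
#include <vector>
#include "activation_sse.hpp"

// Hypothetical batch helper: fetch the sigmoid functor once, then apply it
// tensor by tensor, so the switch inside ApplySSEActivation runs once per batch.
void SigmoidBatch(const std::vector<kuiper_infer::sftensor>& inputs,
                  std::vector<kuiper_infer::sftensor>& outputs) {
  using namespace kuiper_infer::math;
  const ActivationFunc sigmoid =
      ApplySSEActivation(ActivationType::kActivationSigmoid);
  for (std::size_t i = 0; i < inputs.size(); ++i) {
    sigmoid(inputs.at(i), outputs.at(i));
  }
}

Note that yolo_detect.cpp further down passes the same tensor as both arguments; for these elementwise kernels that aliasing works, since each element is read before its result is written back.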
4 changes: 2 additions & 2 deletions source/layer/details/relu.cpp
@@ -21,7 +21,7 @@

 // Created by fss on 22-11-18.
 #include "relu.hpp"
-#include "arma_sse.hpp"
+#include "activation_sse.hpp"
 #include "layer/abstract/layer_factory.hpp"

 namespace kuiper_infer {
@@ -77,7 +77,7 @@ InferStatus ReluLayer::Forward(
         << "The input and output tensor shapes of the relu layer do not match "
         << i << " th";
     using namespace kuiper_infer::math;
-    ArmaReLU(input->data(), output->data());
+    ApplySSEActivation(ActivationType::kActivationRelu)(input, output);
   }
   return InferStatus::kInferSuccess;
 }
4 changes: 2 additions & 2 deletions source/layer/details/sigmoid.cpp
@@ -23,7 +23,7 @@

 #include "sigmoid.hpp"
 #include <glog/logging.h>
-#include "arma_sse.hpp"
+#include "activation_sse.hpp"
 #include "layer/abstract/layer_factory.hpp"

 namespace kuiper_infer {
@@ -61,7 +61,7 @@ InferStatus SigmoidLayer::Forward(
            "match "
         << i << " th";
     using namespace kuiper_infer::math;
-    ArmaSigmoid(input->data(), output->data());
+    ApplySSEActivation(ActivationType::kActivationSigmoid)(input, output);
   }
   return InferStatus::kInferSuccess;
 }
4 changes: 2 additions & 2 deletions source/layer/details/silu.cpp
@@ -22,7 +22,7 @@
 // Created by fss on 22-12-25.

 #include "silu.hpp"
-#include "arma_sse.hpp"
+#include "activation_sse.hpp"
 #include "layer/abstract/layer_factory.hpp"
 #include "tick.hpp"

@@ -81,7 +81,7 @@ InferStatus SiLULayer::Forward(
         << "The input and output tensor shapes of the silu layer do not match "
         << i << " th";
     using namespace kuiper_infer::math;
-    ArmaSiLU(input->data(), output->data());
+    ApplySSEActivation(ActivationType::kActivationSilu)(input, output);
   }
   return InferStatus::kInferSuccess;
 }
6 changes: 3 additions & 3 deletions source/layer/details/yolo_detect.cpp
@@ -21,7 +21,7 @@

 // Created by fss on 22-12-26.
 #include "yolo_detect.hpp"
-#include "arma_sse.hpp"
+#include "activation_sse.hpp"
 #include "data/tensor_util.hpp"
 #include "layer/abstract/layer_factory.hpp"

@@ -125,9 +125,9 @@ InferStatus YoloDetectLayer::Forward(
     CHECK_EQ(stages_tensor->rows(), stages_ * nx * ny);
     CHECK_EQ(stages_tensor->cols(), classes_info);

-    arma::fcube input_data = input->data();
     using namespace kuiper_infer::math;
-    ArmaSigmoid(input_data, input_data);
+    ApplySSEActivation(ActivationType::kActivationSigmoid)(input, input);
+    const arma::fcube& input_data = input->data();

     arma::fmat& x_stages = stages_tensor->slice(b);
     for (uint32_t na = 0; na < num_anchors_; ++na) {
