From b239c3304c66767077956140f8671299d0d8e144 Mon Sep 17 00:00:00 2001
From: Chunseok Lee <chunseok.lee@samsung.com>
Date: Fri, 27 Dec 2024 15:17:39 +0900
Subject: [PATCH 1/4] [onert-micro] Enable UnidirectionalSequenceLSTM

ONE-DCO-1.0-Signed-off-by: Chunseok Lee <chunseok.lee@samsung.com>
---
 .../include/execute/OMRuntimeKernel.h         |   2 +-
 .../kernels/UnidirectionalSequenceLSTM.h      | 180 ++++++
 .../PALUnidirectionalSequenceLSTMCommon.h     | 597 ++++++++++++++++++
 .../include/pal/mcu/KernelsToBuild.lst        |   2 +-
 .../pal/mcu/PALUnidirectionalSequenceLSTM.h   |  23 +
 .../kernels/UnidirectionalSequenceLSTM.cpp    | 389 ++++++++++++
 .../tests/UnidirectionalSequenceLSTM.test.cpp |   0
 .../kernels/UnidirectionalSequenceLSTM.cpp    |  49 ++
 8 files changed, 1240 insertions(+), 2 deletions(-)
 create mode 100644 onert-micro/onert-micro/include/execute/kernels/UnidirectionalSequenceLSTM.h
 create mode 100644 onert-micro/onert-micro/include/pal/common/PALUnidirectionalSequenceLSTMCommon.h
 create mode 100644 onert-micro/onert-micro/include/pal/mcu/PALUnidirectionalSequenceLSTM.h
 create mode 100644 onert-micro/onert-micro/src/execute/kernels/UnidirectionalSequenceLSTM.cpp
 create mode 100644 onert-micro/onert-micro/src/execute/kernels/tests/UnidirectionalSequenceLSTM.test.cpp
 create mode 100644 onert-micro/onert-micro/src/import/kernels/UnidirectionalSequenceLSTM.cpp
diff --git a/onert-micro/onert-micro/include/execute/OMRuntimeKernel.h b/onert-micro/onert-micro/include/execute/OMRuntimeKernel.h
index e33239f7256..50814267d89 100644
--- a/onert-micro/onert-micro/include/execute/OMRuntimeKernel.h
+++ b/onert-micro/onert-micro/include/execute/OMRuntimeKernel.h
@@ -23,7 +23,7 @@
 
 #include <cstdint>
 
-constexpr static uint32_t maxInputSize = 6;
+constexpr static uint32_t maxInputSize = 24; // was 6, but lstm takes 24 inputs
 constexpr static uint32_t maxOutputSize = 5;
 
 namespace onert_micro
diff --git a/onert-micro/onert-micro/include/execute/kernels/UnidirectionalSequenceLSTM.h b/onert-micro/onert-micro/include/execute/kernels/UnidirectionalSequenceLSTM.h
new file mode 100644
index 00000000000..ff3672a62ea
--- /dev/null
+++ b/onert-micro/onert-micro/include/execute/kernels/UnidirectionalSequenceLSTM.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ONERT_MICRO_KERNELS_UNIDIRECTIONAL_SEQUENCE_LSTM_H
+#define ONERT_MICRO_KERNELS_UNIDIRECTIONAL_SEQUENCE_LSTM_H
+
+#include "OMStatus.h"
+
+#include "core/OMUtils.h"
+#include "core/OMKernelData.h"
+#include "core/OMDataType.h"
+
+#include "execute/OMKernelExecutionBuilder.h"
+#include "execute/OMUtils.h"
+#include "execute/OMRuntimeKernel.h"
+
+using namespace onert_micro::core;
+using namespace onert_micro::execute;
+
+namespace
+{
+
+int dim(const circle::Tensor *x, int index)
+{
+  onert_micro::core::OMRuntimeShape shape(x);
+  return shape.dims(index);
+}
+
+int num_elements(const circle::Tensor *x)
+{
+  onert_micro::core::OMRuntimeShape shape(x);
+  return shape.flatSize();
+}
+
+int num_dims(const circle::Tensor *x)
+{
+  onert_micro::core::OMRuntimeShape shape(x);
+  return shape.dimensionsCount();
+}
+
+} // namespace
+namespace onert_micro
+{
+namespace lstm
+{
+
+struct LSTMStruct
+{
+  LSTMStruct() = delete;
+  LSTMStruct(const LSTMStruct &) = delete;
+
+  explicit LSTMStruct(const OMExecuteArgs &execute_args)
+  {
+    core::OMRuntimeContext &runtime_context = execute_args.runtime_context;
+    core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage;
+    uint16_t op_index = execute_args.kernel_index;
+
+    execute::OMRuntimeKernel runtime_kernel;
+    runtime_kernel.readKernel(op_index, runtime_context);
+
+    for (int i; i < 24; i++)
+    {
+      internal_tensors[i] = runtime_kernel.inputs[i];
+    }
+
+    output_internal = runtime_kernel.outputs[0];
+
+    options = runtime_kernel.first_operator->builtin_options_as_UnidirectionalSequenceLSTMOptions();
+  }
+
+  void validateTensorTypes()
+  {
+    assert(input()->type() == (output_state()->type()));
+    assert(output()->type() == (input()->type()));
+
+    for (int32_t i = 1; i < 9; ++i)
+    {
+      assert(internal_tensors[i] == nullptr or
+             (input_to_forget_weights()->type()) == (internal_tensors[i])->type());
+    }
+
+    for (int32_t i = 12; i < 16; ++i)
+    {
+      assert(internal_tensors[i] == nullptr or
+             (forget_gate_bias()->type()) == (internal_tensors[i]->type()));
+    }
+  }
+
+  const circle::Tensor *input() { return internal_tensors[0]; };
+
+  const circle::Tensor *input_to_input_weights() { return internal_tensors[1]; };
+  const circle::Tensor *input_to_forget_weights() { return internal_tensors[2]; };
+  const circle::Tensor *input_to_cell_weights() { return internal_tensors[3]; };
+  const circle::Tensor *input_to_output_weights() { return internal_tensors[4]; };
+
+  const circle::Tensor *recurrent_to_input_weights() { return internal_tensors[5]; };
+  const circle::Tensor *recurrent_to_forget_weights() { return internal_tensors[6]; };
+  const circle::Tensor *recurrent_to_cell_weights() { return internal_tensors[7]; };
+  const circle::Tensor *recurrent_to_output_weights() { return internal_tensors[8]; };
+
+  const circle::Tensor *cell_to_input_weights() { return internal_tensors[9]; };
+  const circle::Tensor *cell_to_forget_weights() { return internal_tensors[10]; };
+  const circle::Tensor *cell_to_output_weights() { return internal_tensors[11]; };
+
+  const circle::Tensor *input_gate_bias() { return internal_tensors[12]; };
+  const circle::Tensor *forget_gate_bias() { return internal_tensors[13]; };
+  const circle::Tensor *cell_gate_bias() { return internal_tensors[14]; };
+  const circle::Tensor *output_gate_bias() { return internal_tensors[15]; };
+
+  const circle::Tensor *projection_weights() { return internal_tensors[16]; };
+  const circle::Tensor *projection_bias() { return internal_tensors[17]; };
+
+  const circle::Tensor *output_state() { return internal_tensors[18]; };
+  const circle::Tensor *cell_state() { return internal_tensors[19]; };
+
+  const circle::Tensor *input_layer_norm_coefficients() { return internal_tensors[20]; };
+  const circle::Tensor *forget_layer_norm_coefficients() { return internal_tensors[21]; };
+  const circle::Tensor *cell_layer_norm_coefficients() { return internal_tensors[22]; };
+  const circle::Tensor *output_layer_norm_coefficients() { return internal_tensors[23]; };
+  const circle::Tensor *output() { return output_internal; };
+
+  const circle::UnidirectionalSequenceLSTMOptions *options;
+
+  const circle::Tensor *get_internal_tensor(int i) { return internal_tensors[i]; }
+
+private:
+  const circle::Tensor *output_internal;
+  const circle::Tensor *internal_tensors[24];
+};
+
+struct GateParameters
+{
+  FullyConnectedParams input_fc_params;
+  FullyConnectedParams recurrent_fc_params;
+};
+
+struct InterGateParameters
+{
+  BinaryArithmeticBroadcastParams forget_cell_mul_params;
+  BinaryArithmeticBroadcastParams input_mul_params;
+  BinaryArithmeticBroadcastParams output_mul_params;
+};
+
+struct CellStateInfo
+{
+  float cell_clip;
+  // clipping range for cell state only 16 bits cell is supported (could be
+  // generalized through templatation)
+  int16_t quantized_cell_clip;
+  // 2^-cell_state_scale_power = cell state scale, required by integer tanh
+  // computation
+  int32_t cell_state_scale_power;
+};
+
+struct LSTMParameters
+{
+  GateParameters forget_gate_parameters;
+  GateParameters input_gate_parameters;
+  GateParameters cell_gate_parameters;
+  GateParameters output_gate_parameters;
+  InterGateParameters inter_gate_parameters;
+};
+
+} // namespace lstm
+} // namespace onert_micro
+
+#endif // ONERT_MICRO_KERNELS_UNIDIRECTIONAL_SEQUENCE_LSTM_H
diff --git a/onert-micro/onert-micro/include/pal/common/PALUnidirectionalSequenceLSTMCommon.h b/onert-micro/onert-micro/include/pal/common/PALUnidirectionalSequenceLSTMCommon.h
new file mode 100644
index 00000000000..399b97984c5
--- /dev/null
+++ b/onert-micro/onert-micro/include/pal/common/PALUnidirectionalSequenceLSTMCommon.h
@@ -0,0 +1,597 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ONERT_MICRO_PAL_COMMON_UNIDIRECTIONAL_SEQUENCE_LSTM_COMMON_H
+#define ONERT_MICRO_PAL_COMMON_UNIDIRECTIONAL_SEQUENCE_LSTM_COMMON_H
+
+#include "core/OMUtils.h"
+#include "execute/kernels/UnidirectionalSequenceLSTM.h"
+#include "PALTanh.h"
+#include "PALLogistic.h"
+#include "PALFullyConnected.h"
+#include "PALMul.h"
+#include "PALUtils.h"
+
+using namespace onert_micro::core::utils;
+using namespace onert_micro::execute::pal;
+
+namespace onert_micro
+{
+namespace lstm_internal
+{
+namespace
+{
+// Possible fused activation functions.
+typedef enum
+{
+  kTfLiteActNone = 0,
+  kTfLiteActRelu,
+  kTfLiteActReluN1To1, // min(max(-1, x), 1)
+  kTfLiteActRelu6,     // min(max(0, x), 6)
+  kTfLiteActTanh,
+  kTfLiteActSignBit,
+  kTfLiteActSigmoid,
+} FusedActivation;
+
+} // namespace
+
+// #ifndef DIS_QUANT
+
+// template <typename InputType, typename OutputType>
+// void mulElementwise(int size, const ArithmeticQuantParams *params, const InputType *input1_data,
+//                     const InputType *input2_data, OutputType *output_data)
+// {
+//   for (int i = 0; i < size; ++i)
+//   {
+//     const int32_t input1_val = params->input1_offset + input1_data[i];
+//     const int32_t input2_val = params->input2_offset + input2_data[i];
+//     const int32_t unclamped_result =
+//       params->output_offset + multiplyByQuantizedMultiplier(input1_val * input2_val,
+//                                                             params->output_multiplier,
+//                                                             params->output_shift);
+//     const int32_t clamped_output =
+//       std::min(params->quantized_activation_max,
+//                std::max(params->quantized_activation_min, unclamped_result));
+//     output_data[i] = static_cast<OutputType>(clamped_output);
+//   }
+// }
+
+// // Input and output have the same shape in LSTM
+// void mul(const onert_micro::RuntimeShape &shape, const ArithmeticQuantParams *params,
+//          const int16_t *input1_data, const int16_t *input2_data, int8_t *output_data)
+// {
+//   return mulElementwise<int16_t, int8_t>(shape.flatSize(), params, input1_data, input2_data,
+//                                          output_data);
+// }
+
+// // Input and output have the same shape in LSTM
+// void mul(const onert_micro::RuntimeShape &shape, const ArithmeticQuantParams *params,
+//          const int16_t *input1_data, const int16_t *input2_data, int16_t *output_data)
+// {
+//   return mulElementwise(shape.flatSize(), params, input1_data, input2_data, output_data);
+// }
+
+// void addElementWise(const int16_t *input_1, const int16_t *input_2, int n_batch, int n_input,
+//                     int16_t *output)
+// {
+//   for (int batch = 0; batch < n_batch; ++batch)
+//   {
+//     for (int i = 0; i < n_input; ++i)
+//     {
+//       const int index = batch * n_input + i;
+//       int32_t sum = input_1[index] + input_2[index];
+//       const int32_t sum_clamped =
+//         std::min(static_cast<int32_t>(std::numeric_limits<int16_t>::max()),
+//                  std::max(static_cast<int32_t>(std::numeric_limits<int16_t>::min()), sum));
+//       output[index] = static_cast<int16_t>(sum_clamped);
+//     }
+//   }
+// }
+
+// void tanh(int32_t cell_state_scale_power, const onert_micro::RuntimeShape &input_data_shape,
+//           int16_t *input_data, const onert_micro::RuntimeShape &output_data_shape,
+//           int16_t *output_data)
+// {
+//   int32_t tanh_input_left_shift = (15 + cell_state_scale_power) - 3;
+//   int32_t input_multiplier = 0;
+//   if (tanh_input_left_shift < 0) /* handling negative shift value */
+//   {
+//     tanh_input_left_shift = -tanh_input_left_shift;
+//     input_multiplier = 3;
+//   }
+//   const int flat_size = input_data_shape.flatSize();
+//   onert_micro_pal::Tanh(input_multiplier, tanh_input_left_shift, flat_size, input_data,
+//                              output_data);
+// }
+
+// void sigmoid(const onert_micro::RuntimeShape &data_shape, int16_t *data)
+// {
+//   onert_micro_pal::Logistic(0, 0, data_shape.flatSize(), data, data);
+// }
+
+// void clipping(const int v_size, const onert_micro::lstm::CellStateInfo *cell_state_info,
+//               int16_t *vector)
+// {
+//   for (int i = 0; i < v_size; i++)
+//   {
+//     vector[i] = std::max(std::min(cell_state_info->quantized_cell_clip, vector[i]),
+//                          static_cast<int16_t>(-cell_state_info->quantized_cell_clip));
+//   }
+// }
+// #endif // DIS_QUANT
+
+#ifndef DIS_FLOAT
+// Input and output have the same shape in LSTM
+void mul(const onert_micro::core::OMRuntimeShape &shape,
+         const BinaryArithmeticBroadcastParams *params, const float *input1_data,
+         const float *input2_data, float *output_data)
+{
+  const int flat_size = shape.flatSize();
+  onert_micro::execute::pal::Mul(*params, flat_size, input1_data, input2_data, output_data);
+}
+
+void addElementWise(const float *input_1, const float *input_2, int n_batch, int n_input,
+                    float *output)
+{
+  for (int batch = 0; batch < n_batch; ++batch)
+  {
+    for (int i = 0; i < n_input; ++i)
+    {
+      const int index = batch * n_input + i;
+      output[index] = input_1[index] + input_2[index];
+    }
+  }
+}
+
+void tanh(int32_t, const onert_micro::core::OMRuntimeShape &input_data_shape, float *input_data,
+          const onert_micro::core::OMRuntimeShape &output_data_shape, float *output_data)
+{
+  onert_micro::execute::pal::Tanh(input_data_shape, input_data, output_data_shape, output_data);
+}
+
+void sigmoid(const onert_micro::core::OMRuntimeShape &data_shape, float *data)
+{
+  const int flat_size = data_shape.flatSize();
+  onert_micro::execute::pal::Logistic(flat_size, data, data);
+}
+
+void clipping(const int v_size, const onert_micro::lstm::CellStateInfo *cell_state_info,
+              float *vector)
+{
+  for (int i = 0; i < v_size; i++)
+  {
+    vector[i] =
+      std::max(std::min(cell_state_info->cell_clip, vector[i]), -cell_state_info->cell_clip);
+  }
+}
+#endif // DIS_FLOAT
+
+// Size information about the LSTM kernel, which is deduced from tensors stored
+// in the flat buffer file.
+struct LstmSizeInfo
+{
+  bool time_major;
+  int32_t batch_size;
+  int32_t time_steps;
+  int32_t input_dimension;
+  int32_t state_dimension;
+};
+
+class LstmStepManager
+{
+public:
+  LstmStepManager() = delete;
+  // Does not take any ownership, and all pointers must refer to valid objects
+  // that outlive the one constructed.
+  explicit LstmStepManager(const LstmSizeInfo &size_info) : size_info_(size_info) {}
+
+  void updateTime()
+  {
+    current_time_ += 1;
+    // default as one batch per inference
+    int input_step = size_info_.input_dimension;
+    int output_step = size_info_.state_dimension;
+    // time major: batch inference
+    if (size_info_.time_major)
+    {
+      input_step = input_step * size_info_.batch_size;
+      output_step = output_step * size_info_.batch_size;
+    }
+
+    input_offset_ += input_step;
+    output_offset_ += output_step;
+  }
+
+  void updateBatch()
+  {
+    current_batch_ += 1;
+    // batch inference for time major: no action needed
+    if (size_info_.time_major)
+    {
+      return;
+    }
+    // otherwise: singe batch inference, go to the next batch
+    hidden_state_offset_ += size_info_.state_dimension;
+    cell_state_offset_ += size_info_.state_dimension;
+  }
+
+  void resetTime() { current_time_ = 0; }
+
+  onert_micro::core::OMRuntimeShape inputShape() const
+  {
+    int batch_size = 1;
+    if (size_info_.time_major)
+    {
+      batch_size = size_info_.batch_size;
+    }
+    const int dims[2] = {batch_size, size_info_.input_dimension};
+    const int32_t *dims_data = reinterpret_cast<const int32_t *>(dims);
+    return onert_micro::core::OMRuntimeShape(2, dims_data);
+  }
+
+  onert_micro::core::OMRuntimeShape stateShape() const
+  {
+    int batch_size = 1;
+    if (size_info_.time_major)
+    {
+      batch_size = size_info_.batch_size;
+    }
+    const int dims[2] = {batch_size, size_info_.state_dimension};
+    const int32_t *dims_data = reinterpret_cast<const int32_t *>(dims);
+    return onert_micro::core::OMRuntimeShape(2, dims_data);
+  }
+
+  int inputOffset() const { return input_offset_; }
+
+  int outputOffset() const { return output_offset_; }
+
+  int hiddenStateOffset() const { return hidden_state_offset_; }
+
+  int cellStateOffset() const { return cell_state_offset_; }
+
+private:
+  int32_t current_time_ = 0;
+  int32_t current_batch_ = 0;
+  int32_t input_offset_ = 0;
+  int32_t output_offset_ = 0;
+  int32_t hidden_state_offset_ = 0;
+  int32_t cell_state_offset_ = 0;
+
+  const LstmSizeInfo &size_info_;
+};
+
+// Calculates a single LSTM gate.
+// Implements the following formula:
+//   gate = activate(FC(input) + FC(recurrent))
+// Activation is sigmoid except for the "cell" gate (configurable, usually tanh)
+template <typename ActivationType, typename WeightType, typename CellType, typename BiasType>
+void calculateLstmGate(const LstmStepManager *step_info,
+                       const onert_micro::lstm::GateParameters *gate_params,
+                       // Input FC
+                       ActivationType *input_data, const WeightType *input_weight_data,
+                       const OMRuntimeShape &input_weight_shape, const BiasType *input_bias_data,
+                       // Recurrent FC
+                       ActivationType *recurrent_data, const WeightType *recurrent_weight_data,
+                       const OMRuntimeShape &recurrent_weight_shape,
+                       const BiasType *recurrent_bias_data,
+                       // Output
+                       CellType *gate_output,
+                       // Scratch arrays
+                       CellType *fc_output_buffer, const FusedActivation activation)
+{
+  // Input FC
+  const auto gate_output_shape = step_info->stateShape();
+  {
+    FullyConnectedParams op_params{};
+    // op_params.input_offset = gate_params->input_fc_params.input_offset;
+    // op_params.weights_offset = gate_params->input_fc_params.weights_offset;
+    // op_params.output_offset = gate_params->input_fc_params.output_offset;
+    // op_params.output_multiplier = gate_params->input_fc_params.output_multiplier;
+    // op_params.output_shift = gate_params->input_fc_params.output_shift;
+    // op_params.quantized_activation_min = gate_params->input_fc_params.quantized_activation_min;
+    // op_params.quantized_activation_max = gate_params->input_fc_params.quantized_activation_max;
+    op_params.float_activation_max = gate_params->input_fc_params.float_activation_max;
+    op_params.float_activation_min = gate_params->input_fc_params.float_activation_min;
+
+    FullyConnected(op_params, input_data + step_info->inputOffset(), input_weight_shape,
+                   input_weight_data, input_bias_data, gate_output_shape, gate_output);
+  }
+
+  // Recurrent FC
+  {
+    FullyConnectedParams op_params{};
+    // op_params.input_offset = gate_params->recurrent_fc_params.input_offset;
+    // op_params.weights_offset = gate_params->recurrent_fc_params.weights_offset;
+    // op_params.output_offset = gate_params->recurrent_fc_params.output_offset;
+    // op_params.output_multiplier = gate_params->recurrent_fc_params.output_multiplier;
+    // op_params.output_shift = gate_params->recurrent_fc_params.output_shift;
+    // op_params.quantized_activation_min =
+    // gate_params->recurrent_fc_params.quantized_activation_min; op_params.quantized_activation_max
+    // = gate_params->recurrent_fc_params.quantized_activation_max;
+    op_params.float_activation_max = gate_params->recurrent_fc_params.float_activation_max;
+    op_params.float_activation_min = gate_params->recurrent_fc_params.float_activation_min;
+
+    FullyConnected(op_params, recurrent_data + step_info->hiddenStateOffset(),
+                   recurrent_weight_shape, recurrent_weight_data, recurrent_bias_data,
+                   gate_output_shape, fc_output_buffer);
+
+    addElementWise(gate_output, fc_output_buffer, /*n_batch=*/gate_output_shape.dimsData()[0],
+                   /*n_state=*/gate_output_shape.dimsData()[1], gate_output);
+
+    switch (activation)
+    {
+      case FusedActivation::kTfLiteActSigmoid:
+        sigmoid(gate_output_shape, gate_output);
+        break;
+      case FusedActivation::kTfLiteActTanh:
+      {
+        // Set the scale power to -12 to avoid shift
+        tanh(/*cell_state_scale_power=*/-12, gate_output_shape, gate_output, gate_output_shape,
+             gate_output);
+      }
+      break;
+      default:
+        // Only Sigmoid or Tanh is used.
+        assert(false && "Only Sigmoid or Tanh is used");
+    }
+  }
+}
+
+// Update the hidden state of the LSTM kernel using the following formula:
+// updated_hidden_state = Tanh(updated_cell_state) * output_gate_output, * means
+// element wise multiplication
+template <typename CellType, typename ActivationType>
+void updateLstmHidden(const LstmStepManager *step_info, CellType *cell_state_data_base,
+                      ActivationType *hidden_state_data, const CellType *output_gate_output,
+                      const BinaryArithmeticBroadcastParams *mul_params,
+                      int32_t cell_state_scale_power, CellType *buffer)
+{
+  auto cell_state_shape = step_info->stateShape();
+  CellType *cell_state_data = cell_state_data_base + step_info->cellStateOffset();
+  // Tanh(cell_state)
+  tanh(cell_state_scale_power, cell_state_shape, cell_state_data, cell_state_shape, buffer);
+  // Update the hidden state
+  mul(cell_state_shape, mul_params, buffer, output_gate_output,
+      hidden_state_data + step_info->hiddenStateOffset());
+}
+
+// Update the cell state using the output from the forget gate, input gate, and
+// cell gate Formula: updated_cell_state = forget_gate_output*cell_state +
+// input_gate_output * cell_gate_output, where * denotes element wise
+// multiplication
+template <typename CellType>
+void updateLstmCell(const LstmStepManager *step_info, CellType *cell_state_data,
+                    // Gate outputs
+                    CellType *forget_gate_output, const CellType *input_gate_output,
+                    const CellType *cell_gate_output,
+                    // Mul parameters
+                    const BinaryArithmeticBroadcastParams &forget_cell_mul_params,
+                    const BinaryArithmeticBroadcastParams &input_mul_params,
+                    const onert_micro::lstm::CellStateInfo *cell_state_info, CellType *buffer)
+{
+  auto cell_state_shape = step_info->stateShape();
+  // Forget Gate x Cell State
+  mul(cell_state_shape, &forget_cell_mul_params, forget_gate_output,
+      cell_state_data + step_info->cellStateOffset(),
+      cell_state_data + step_info->cellStateOffset());
+  // Input Gate x Cell Gate
+  mul(cell_state_shape, &input_mul_params, input_gate_output, cell_gate_output, buffer);
+
+  // Update the cell state
+  addElementWise(cell_state_data + step_info->cellStateOffset(), buffer,
+                 /*n_batch=*/cell_state_shape.dimsData()[0],
+                 /*n_state=*/cell_state_shape.dimsData()[1],
+                 cell_state_data + step_info->cellStateOffset());
+
+  if (cell_state_info->cell_clip > 0)
+  {
+    clipping(cell_state_shape.flatSize(), cell_state_info,
+             cell_state_data + step_info->cellStateOffset());
+  }
+}
+
+template <typename ActivationType, typename WeightType, typename CellType, typename BiasType>
+void lstmStep(onert_micro::lstm::LSTMStruct *lstm_struct,
+              onert_micro::lstm::LSTMParameters *lstm_params, LstmStepManager *step_info,
+              onert_micro::lstm::CellStateInfo *cell_state_info, ActivationType *output_state_data,
+              CellType *cell_state_data, CellType *scratch0, CellType *scratch1, CellType *scratch2,
+              CellType *scratch3, const OMExecuteArgs &execute_args)
+{
+  core::OMRuntimeContext &runtime_context = execute_args.runtime_context;
+  core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage;
+  uint16_t op_index = execute_args.kernel_index;
+  execute::OMRuntimeKernel runtime_kernel;
+  runtime_kernel.readKernel(op_index, runtime_context);
+  runtime_kernel.getDataFromStorage(op_index, runtime_storage, runtime_context);
+
+  const WeightType *input_to_forget_weights_data =
+    core::utils::castInputData<WeightType>(runtime_kernel.inputs_data[2]);
+  const OMRuntimeShape input_to_forget_weights_shape(runtime_kernel.inputs[2]);
+  const BiasType *forget_gate_bias_data =
+    core::utils::castInputData<BiasType>(runtime_kernel.inputs_data[13]);
+  const WeightType *recurrent_to_forget_weights_data =
+    core::utils::castInputData<WeightType>(runtime_kernel.inputs_data[6]);
+  const OMRuntimeShape recurrent_to_forget_weights_shape(runtime_kernel.inputs[6]);
+
+  /*Step1: Calculate gate outputs to prepare cell state update*/
+  CellType *gate_internal_buffer = scratch3;
+  CellType *forget_gate_output = scratch0;
+
+  ActivationType *input_data =
+    core::utils::castOutputData<ActivationType>(runtime_kernel.inputs_data[0]);
+
+  calculateLstmGate<ActivationType, WeightType, CellType, BiasType>(
+    step_info, &lstm_params->forget_gate_parameters,
+    // Input FC
+    input_data, input_to_forget_weights_data, input_to_forget_weights_shape, forget_gate_bias_data,
+    // Recurrent FC
+    output_state_data, recurrent_to_forget_weights_data, recurrent_to_forget_weights_shape, nullptr,
+    // Output
+    forget_gate_output, gate_internal_buffer, FusedActivation::kTfLiteActSigmoid);
+
+  // Input Gate calculation;
+  const WeightType *input_to_input_weights_data =
+    core::utils::castInputData<WeightType>(runtime_kernel.inputs_data[1]);
+  const OMRuntimeShape input_to_input_weights_shape(runtime_kernel.inputs[1]);
+  const BiasType *input_gate_bias_data =
+    core::utils::castInputData<BiasType>(runtime_kernel.inputs_data[12]);
+  const WeightType *recurrent_to_input_weights_data =
+    core::utils::castInputData<WeightType>(runtime_kernel.inputs_data[5]);
+  const OMRuntimeShape recurrent_to_input_weights_shape(runtime_kernel.inputs[5]);
+
+  CellType *input_gate_output = scratch1;
+  calculateLstmGate<ActivationType, WeightType, CellType, BiasType>(
+    step_info, &lstm_params->input_gate_parameters,
+    // Input FC
+    input_data, input_to_input_weights_data, input_to_input_weights_shape, input_gate_bias_data,
+    // Recurrent FC
+    output_state_data, recurrent_to_input_weights_data, recurrent_to_input_weights_shape,
+    /*recurrent_bias*/ nullptr,
+    // Output
+    input_gate_output,
+    // Scratch arrays
+    gate_internal_buffer, FusedActivation::kTfLiteActSigmoid);
+
+  // Cell Gate calculation
+  const WeightType *input_to_cell_weights_data =
+    core::utils::castInputData<WeightType>(runtime_kernel.inputs_data[3]);
+  const OMRuntimeShape input_to_cell_weights_shape(runtime_kernel.inputs[3]);
+  const BiasType *cell_gate_bias_data =
+    core::utils::castInputData<BiasType>(runtime_kernel.inputs_data[14]);
+  const WeightType *recurrent_to_cell_weights_data =
+    core::utils::castInputData<WeightType>(runtime_kernel.inputs_data[7]);
+  const OMRuntimeShape recurrent_to_cell_weights_shape(runtime_kernel.inputs[7]);
+
+  CellType *cell_gate_output = scratch2;
+  calculateLstmGate<ActivationType, WeightType, CellType, BiasType>(
+    step_info, &lstm_params->cell_gate_parameters,
+    // Input FC
+    input_data, input_to_cell_weights_data, input_to_cell_weights_shape, cell_gate_bias_data,
+    // Recurrent FC
+    output_state_data, recurrent_to_cell_weights_data, recurrent_to_cell_weights_shape,
+    /*recurrent_bias*/ nullptr,
+    // Output
+    cell_gate_output,
+    // Scratch arrays
+    gate_internal_buffer, FusedActivation::kTfLiteActTanh);
+
+  /*Step2: update the cell state */
+  {
+    // const InterGateParameters& inter_gate_params = op_data.inter_gate_parameters;
+    CellType *updated_input_buffer = scratch1; // reuse buffer
+
+    updateLstmCell<CellType>(
+      step_info, cell_state_data, forget_gate_output, input_gate_output, cell_gate_output,
+      lstm_params->inter_gate_parameters.forget_cell_mul_params,
+      lstm_params->inter_gate_parameters.input_mul_params, cell_state_info, updated_input_buffer);
+  }
+
+  {
+    /*Step3: update the hidden state */
+    const WeightType *input_to_output_weights_data =
+      core::utils::castInputData<WeightType>(runtime_kernel.inputs_data[4]);
+    const OMRuntimeShape input_to_output_weights_shape(runtime_kernel.inputs[4]);
+    const BiasType *output_gate_bias_data =
+      core::utils::castInputData<BiasType>(runtime_kernel.inputs_data[15]);
+    const WeightType *recurrent_to_output_weights_data =
+      core::utils::castInputData<WeightType>(runtime_kernel.inputs_data[8]);
+    const OMRuntimeShape recurrent_to_output_weights_shape(runtime_kernel.inputs[8]);
+
+    CellType *output_gate_output = scratch1; // reuse buffer
+    calculateLstmGate<ActivationType, WeightType, CellType, BiasType>(
+      step_info, &lstm_params->output_gate_parameters,
+      // Input FC
+      input_data, input_to_output_weights_data, input_to_output_weights_shape,
+      output_gate_bias_data,
+      // Recurrent FC
+      output_state_data, recurrent_to_output_weights_data, recurrent_to_output_weights_shape,
+      nullptr,
+      // Output
+      output_gate_output,
+      // Scratch arrays
+      gate_internal_buffer, FusedActivation::kTfLiteActSigmoid);
+    CellType *tanh_activated_cell_buffer = scratch0; // reuse buffer
+    updateLstmHidden<CellType, ActivationType>(
+      step_info, cell_state_data, output_state_data, output_gate_output,
+      &lstm_params->inter_gate_parameters.output_mul_params,
+      cell_state_info->cell_state_scale_power, tanh_activated_cell_buffer);
+
+    ActivationType *output_ptr =
+      core::utils::castOutputData<ActivationType>(runtime_kernel.outputs_data[0]);
+    std::memcpy(output_ptr + step_info->outputOffset(),
+                output_state_data + step_info->hiddenStateOffset(),
+                step_info->stateShape().flatSize() * sizeof(ActivationType));
+  }
+}
+
+} // namespace lstm_internal
+
+// Evaluate the LSTM kernel with (potential) multi-steps and multi-batch input
+template <typename ActivationType, typename WeightType, typename CellType, typename BiasType>
+void evalLSTM(onert_micro::lstm::LSTMStruct *lstm_struct,
+              onert_micro::lstm::LSTMParameters *lstm_params,
+              onert_micro::lstm::CellStateInfo *cell_state_info, ActivationType *output_state_data,
+              CellType *cell_state_data, CellType *scratch0, CellType *scratch1, CellType *scratch2,
+              CellType *scratch3, const OMExecuteArgs &execute_args)
+{
+  lstm_internal::LstmSizeInfo size_info;
+
+  size_info.time_major = lstm_struct->options->time_major();
+  size_info.batch_size =
+    size_info.time_major ? dim(lstm_struct->input(), 1) : dim(lstm_struct->input(), 0);
+  size_info.time_steps =
+    size_info.time_major ? dim(lstm_struct->input(), 0) : dim(lstm_struct->input(), 1);
+  size_info.input_dimension = dim(lstm_struct->input(), 2);
+  size_info.state_dimension = dim(lstm_struct->output_state(), 1);
+
+  lstm_internal::LstmStepManager step_info(size_info);
+
+  // time is the first dimention, enable batch computation
+  if (size_info.time_major)
+  {
+    for (int t = 0; t < size_info.time_steps; t++)
+    {
+      lstm_internal::lstmStep<ActivationType, WeightType, CellType, BiasType>(
+        lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data,
+        scratch0, scratch1, scratch2, scratch3, execute_args);
+      // prepare for the next time step
+      step_info.updateTime();
+    }
+  }
+  else
+  {
+    // batch first, unable to size the input data. single batch inference
+    for (int b = 0; b < size_info.batch_size; b++)
+    {
+      for (int t = 0; t < size_info.time_steps; t++)
+      {
+        lstm_internal::lstmStep<ActivationType, WeightType, CellType, BiasType>(
+          lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data,
+          scratch0, scratch1, scratch2, scratch3, execute_args);
+        // prepare for the next time step
+        step_info.updateTime();
+      }
+      // prepare for the next batch
+      step_info.updateBatch();
+      step_info.resetTime();
+    }
+  }
+}
+
+} // namespace onert_micro
+
+#endif // ONERT_MICRO_PAL_COMMON_UNIDIRECTIONAL_SEQUENCE_LSTM_COMMON_H
diff --git a/onert-micro/onert-micro/include/pal/mcu/KernelsToBuild.lst b/onert-micro/onert-micro/include/pal/mcu/KernelsToBuild.lst
index ecbe79f5289..3036858b92a 100644
--- a/onert-micro/onert-micro/include/pal/mcu/KernelsToBuild.lst
+++ b/onert-micro/onert-micro/include/pal/mcu/KernelsToBuild.lst
@@ -78,7 +78,7 @@ REGISTER_KERNEL(SOFTMAX, Softmax)
 #/*REGISTER_KERNEL(SELECT_V2, SelectV2)*/
 REGISTER_KERNEL(SVDF, SVDF)
 REGISTER_KERNEL(WHILE, While)
-#/*REGISTER_KERNEL(UNIDIRECTIONAL_SEQUENCE_LSTM, UnidirectionalSequenceLSTM)*/
+REGISTER_KERNEL(UNIDIRECTIONAL_SEQUENCE_LSTM, UnidirectionalSequenceLSTM)
 #/*REGISTER_KERNEL(RESIZE_BILINEAR, ResizeBilinear)*/
 #/*REGISTER_KERNEL(RESIZE_NEAREST_NEIGHBOR, ResizeNearestNeighbor)*/
 REGISTER_KERNEL(RSQRT, Rsqrt)
diff --git a/onert-micro/onert-micro/include/pal/mcu/PALUnidirectionalSequenceLSTM.h b/onert-micro/onert-micro/include/pal/mcu/PALUnidirectionalSequenceLSTM.h
new file mode 100644
index 00000000000..474df3f87f2
--- /dev/null
+++ b/onert-micro/onert-micro/include/pal/mcu/PALUnidirectionalSequenceLSTM.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ONERT_MICRO_PAL_MCU_UNIDIRECTIONAL_SEQUENCE_LSTM_H
+#define ONERT_MICRO_PAL_MCU_UNIDIRECTIONAL_SEQUENCE_LSTM_H
+
+#include "PALUnidirectionalSequenceLSTMCommon.h"
+
+#endif
diff --git a/onert-micro/onert-micro/src/execute/kernels/UnidirectionalSequenceLSTM.cpp b/onert-micro/onert-micro/src/execute/kernels/UnidirectionalSequenceLSTM.cpp
new file mode 100644
index 00000000000..f36090b16e8
--- /dev/null
+++ b/onert-micro/onert-micro/src/execute/kernels/UnidirectionalSequenceLSTM.cpp
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "OMStatus.h"
+
+#include "core/OMUtils.h"
+#include "core/OMKernelData.h"
+#include "core/OMDataType.h"
+
+#include "execute/OMKernelExecutionBuilder.h"
+#include "execute/OMUtils.h"
+#include "execute/OMRuntimeKernel.h"
+
+#include "execute/kernels/UnidirectionalSequenceLSTM.h"
+#include "PALUnidirectionalSequenceLSTM.h"
+
+using namespace onert_micro;
+using namespace onert_micro::core;
+using namespace onert_micro::core::utils;
+using namespace onert_micro::execute;
+
+namespace onert_micro
+{
+namespace
+{
+
+bool checkedLog2(const float x, int *log2_result)
+{
+  // Using TfLiteRound instead of std::round and std::log instead of
+  // std::log2 to work around these functions being missing in a toolchain
+  // used in some TensorFlow tests as of May 2018.
+  const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
+  const float x_log2_rounded = std::round(x_log2);
+  const float x_log2_fracpart = x_log2 - x_log2_rounded;
+
+  *log2_result = static_cast<int>(x_log2_rounded);
+  return std::abs(x_log2_fracpart) < 1e-3f;
+}
+
+// Create parameters for element wise multiplication that happens in a) cell
+// state update ; b) hidden state update
+// Note that all the output of gates are symmetrically quantized so only scales
+// are required for input. However, during the hidden state update phase, the
+// output is the updated hidden state, which is asymmetrically quantized. Thus
+// output may require zero point
+// onert_micro::core::BinaryArithmeticBroadcastParams
+// createInterGateParams(const float input1_scale, const float input2_scale, const float
+// output_scale,
+//                       const OMDataType output_type, const int output_zp)
+// {
+//   onert_micro::core::BinaryArithmeticBroadcastParams op_params;
+// if (output_type == OMDataType::S16)
+// {
+//   op_params.quantized_activation_min = std::numeric_limits<int16_t>::min();
+//   op_params.quantized_activation_max = std::numeric_limits<int16_t>::max();
+// }
+// else if (output_type == OMDataType::S8)
+// {
+//   op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
+//   op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
+// }
+
+// op_params.input1_offset = 0; // symmetric
+// op_params.input2_offset = 0; // symmetric
+// op_params.output_offset = output_zp;
+
+// const double input_product_scale =
+//   static_cast<double>(input1_scale) * static_cast<double>(input2_scale);
+// double effective_scale = input_product_scale / static_cast<double>(output_scale);
+// auto output_shift = static_cast<int>(op_params.output_shift);
+// onert_micro::execute::quantizeMultiplier(effective_scale, &op_params.output_multiplier,
+//                                          &output_shift);
+// op_params.output_shift = output_shift;
+//   return op_params;
+// }
+
+// void createGateParams(const circle::Tensor *input, const circle::Tensor *input_weight,
+//                       const circle::Tensor *input_bias, const circle::Tensor *hidden_state,
+//                       const circle::Tensor *hidden_state_weight,
+//                       const float nonlinear_activation_input_scale, const OMDataType cell_type,
+//                       lstm::GateParameters *gate_params)
+// {
+//   // Input CalculateOpDataFullyConnected
+//   {
+//     FullyConnectedParams input_gate_params;
+//     double real_multiplier = 0.0;
+//     int output_shift;
+//     int32_t output_activation_min;
+//     int32_t output_activation_max;
+//     int32_t output_multiplier;
+//     float input_scale = input->quantization()->scale()->data()[0];
+//     float input_weight_scale = input_weight->quantization()->scale()->data()[0];
+//     real_multiplier = getQuantizedConvolutionMultipler(
+//       input_scale, input_weight_scale, nonlinear_activation_input_scale);
+//     quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
+//     calculateActivationRangeQuantized(FusedActFunc::NONE, 0, nonlinear_activation_input_scale,
+//                                       cell_type, &output_activation_min, &output_activation_max);
+
+//     input_gate_params.output_shift = output_shift;
+//     input_gate_params.output_multiplier = output_multiplier;
+//     input_gate_params.quantized_activation_max = output_activation_max;
+//     input_gate_params.quantized_activation_min = output_activation_min;
+//     input_gate_params.input_offset = (-1)*(input->quantization()->zero_point()->data()[0]);
+//     input_gate_params.weights_offset =
+//     (-1)*(input_weight->quantization()->zero_point()->data()[0]); input_gate_params.output_offset
+//     = 0;
+
+//     gate_params->input_fc_params = input_gate_params;
+//   }
+
+//   // Recurrent CalculateOpDataFullyConnected
+//   {
+//     FullyConnectedParams recurrent_gate_params;
+//     double real_multiplier = 0.0;
+//     int output_shift;
+//     int32_t output_activation_min;
+//     int32_t output_activation_max;
+//     int32_t output_multiplier;
+//     float hidden_state_scale = hidden_state->quantization()->scale()->data()[0];
+//     float hidden_state_weight_scale = hidden_state_weight->quantization()->scale()->data()[0];
+//     float hidden_state_zeropoint = hidden_state->quantization()->zero_point()->data()[0];
+//     float hidden_state_weight_zeropoint =
+//     hidden_state_weight->quantization()->zero_point()->data()[0]; real_multiplier =
+//     getQuantizedConvolutionMultipler(hidden_state_scale,
+//                                                        hidden_state_weight_scale,
+//                                                        nonlinear_activation_input_scale);
+//     quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
+//     calculateActivationRangeQuantized(FusedActFunc::NONE, 0, nonlinear_activation_input_scale,
+//                                       cell_type, &output_activation_min, &output_activation_max);
+
+//     recurrent_gate_params.output_shift = output_shift;
+//     recurrent_gate_params.output_multiplier = output_multiplier;
+//     recurrent_gate_params.quantized_activation_max = output_activation_max;
+//     recurrent_gate_params.quantized_activation_min = output_activation_min;
+//     recurrent_gate_params.input_offset = -(hidden_state_zeropoint);
+//     recurrent_gate_params.weights_offset = -(hidden_state_weight_zeropoint);
+//     recurrent_gate_params.output_offset = 0;
+
+//     gate_params->recurrent_fc_params = recurrent_gate_params;
+//   }
+// }
+
+// void prepareGateParamsInteger(lstm::LSTMStruct *lstm_struct,
+//                               lstm::LSTMParameters *quant_lstm_params)
+// {
+//   float nonlinear_input_scale = 0.00024414062; // 2^-12 Q3.12 -> Q0.15
+
+//   createGateParams(lstm_struct->input(), lstm_struct->input_to_forget_weights(),
+//                    lstm_struct->forget_gate_bias(), lstm_struct->output_state(),
+//                    lstm_struct->recurrent_to_forget_weights(), nonlinear_input_scale,
+//                    OMDataType::S16, &quant_lstm_params->forget_gate_parameters);
+
+//   createGateParams(lstm_struct->input(), lstm_struct->input_to_input_weights(),
+//                    lstm_struct->input_gate_bias(), lstm_struct->output_state(),
+//                    lstm_struct->recurrent_to_input_weights(), nonlinear_input_scale,
+//                    OMDataType::S16, &quant_lstm_params->input_gate_parameters);
+
+//   // lstm::GateParameters cell_gate_parameters;
+//   createGateParams(lstm_struct->input(), lstm_struct->input_to_cell_weights(),
+//                    lstm_struct->cell_gate_bias(), lstm_struct->output_state(),
+//                    lstm_struct->recurrent_to_cell_weights(), nonlinear_input_scale,
+//                    OMDataType::S16, &quant_lstm_params->cell_gate_parameters);
+
+//   // lstm::GateParameters output_gate_parameters;
+//   createGateParams(lstm_struct->input(), lstm_struct->input_to_output_weights(),
+//                    lstm_struct->output_gate_bias(), lstm_struct->output_state(),
+//                    lstm_struct->recurrent_to_output_weights(), nonlinear_input_scale,
+//                    OMDataType::S16, &quant_lstm_params->output_gate_parameters);
+
+//   // Inter gate multiplication parameters
+//   float nonlinear_output_scale = 0.00003051757; // 2^-15 Q3.12 -> Q0.15
+//   float cell_state_scale =
+//     Tensor::scale(lstm_struct->cell_state()); // lstm_tensors.CellStateTensor()->params.scale;
+//   // forget gate output (nonlinear output) x cell state -> cell state
+//   quant_lstm_params->inter_gate_parameters.forget_cell_mul_params = createInterGateParams(
+//     nonlinear_output_scale, cell_state_scale, cell_state_scale, OMDataType::S16, 0);
+
+//   // input gate output x cell gate output -> cell state
+//   quant_lstm_params->inter_gate_parameters.input_mul_params = createInterGateParams(
+//     nonlinear_output_scale, nonlinear_output_scale, cell_state_scale, OMDataType::S16, 0);
+
+//   // tanh output x output gate output -> hidden state (potentially asymmetric)
+//   quant_lstm_params->inter_gate_parameters.output_mul_params = createInterGateParams(
+//     nonlinear_output_scale, nonlinear_output_scale, Tensor::scale(lstm_struct->output_state()),
+//     Tensor::element_type(lstm_struct->output_state()),
+//     Tensor::zero_point(lstm_struct->output_state()));
+// }
+
+// Create the additional information about the cell state, which include:
+// cell_state_scale_power: used in integer nonlinear function (e.g., tanh)
+// quantized_cell_clip: quantized cell clip range
+lstm::CellStateInfo createLstmCellStateInfo(const float cell_state_scale, const float cell_clip)
+{
+  lstm::CellStateInfo cell_state_info;
+  // cell_state_scale_power: 2^-cell_state_scale_power = cell state scale
+  int buffer;
+  checkedLog2(cell_state_scale, &buffer);
+  cell_state_info.cell_state_scale_power = buffer;
+  // Cell state specifics
+  cell_state_info.cell_clip = cell_clip;
+  cell_state_info.quantized_cell_clip = static_cast<int16_t>(std::min(
+    std::max(static_cast<double>(cell_clip) / static_cast<double>(cell_state_scale), -32768.0),
+    32767.0));
+  return cell_state_info;
+}
+
+#ifndef DIS_FLOAT
+FullyConnectedParams createFcParamsFloat()
+{
+  FullyConnectedParams op_params;
+  calculateActivationRange(circle::ActivationFunctionType::ActivationFunctionType_NONE,
+                           &op_params.float_activation_min, &op_params.float_activation_max);
+  op_params.quantized_activation_max = op_params.float_activation_max;
+  op_params.quantized_activation_min = op_params.float_activation_min;
+  return op_params;
+}
+
+lstm::GateParameters createGateParamsFloat()
+{
+  lstm::GateParameters gate_params;
+
+  gate_params.input_fc_params = createFcParamsFloat();
+  gate_params.recurrent_fc_params = createFcParamsFloat();
+
+  return gate_params;
+}
+
+lstm::CellStateInfo createLstmCellStateInfoFloat(const float cell_clip)
+{
+  lstm::CellStateInfo cell_state_info;
+  cell_state_info.cell_clip = cell_clip;
+  cell_state_info.cell_state_scale_power = 0; // no quantization
+  cell_state_info.quantized_cell_clip = 0;    // no quantization
+  return cell_state_info;
+}
+
+void prepareGateParamsFloat(lstm::LSTMParameters *float_lstm_params)
+{
+  // Gate Parameters
+  float_lstm_params->forget_gate_parameters = createGateParamsFloat();
+  float_lstm_params->input_gate_parameters = createGateParamsFloat();
+  float_lstm_params->cell_gate_parameters = createGateParamsFloat();
+  float_lstm_params->output_gate_parameters = createGateParamsFloat();
+
+  // Inter gate multiplication parameters
+  BinaryArithmeticBroadcastParams op_params;
+  calculateActivationRange(circle::ActivationFunctionType::ActivationFunctionType_NONE,
+                           &op_params.float_activation_min, &op_params.float_activation_max);
+  // op_params.quantized_activation_max = op_params.float_activation_max;
+  // op_params.quantized_activation_min = op_params.float_activation_min;
+  float_lstm_params->inter_gate_parameters.forget_cell_mul_params = op_params;
+  float_lstm_params->inter_gate_parameters.input_mul_params = op_params;
+  float_lstm_params->inter_gate_parameters.output_mul_params = op_params;
+}
+
+void evalFloat(const OMExecuteArgs &execute_args)
+{
+  lstm::LSTMStruct lstm_struct(execute_args);
+
+  lstm::CellStateInfo cell_state_info =
+    createLstmCellStateInfoFloat(lstm_struct.options->cell_clip());
+
+  lstm::LSTMParameters lstm_params;
+  prepareGateParamsFloat(&lstm_params);
+
+  const bool time_major = lstm_struct.options->time_major();
+  const auto batch_size = time_major ? dim(lstm_struct.input(), 1) : dim(lstm_struct.input(), 0);
+  const auto state_dimension = dim(lstm_struct.output_state(), 1);
+  const auto cell_state_type_size =
+    getOMDataTypeSize(onertMicroDatatype(lstm_struct.cell_state()->type()));
+
+  auto scratch_0_data =
+    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
+  auto scratch_1_data =
+    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
+  auto scratch_2_data =
+    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
+  auto scratch_3_data =
+    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
+
+  // Create and fill with 0 output state tensor
+  auto output_state_data = std::make_unique<float[]>(num_elements(lstm_struct.output_state()));
+  std::fill_n(output_state_data.get(), num_elements(lstm_struct.output_state()), 0);
+
+  // Create and fill with 0 cell state tensor
+  auto cell_state_data = std::make_unique<float[]>(num_elements(lstm_struct.cell_state()));
+  std::fill_n(cell_state_data.get(), num_elements(lstm_struct.cell_state()), 0);
+
+  evalLSTM<float, float, float, float>(
+    &lstm_struct, &lstm_params, &cell_state_info, output_state_data.get(), cell_state_data.get(),
+    core::utils::castOutputData<float>(scratch_0_data.get()),
+    core::utils::castOutputData<float>(scratch_1_data.get()),
+    core::utils::castOutputData<float>(scratch_2_data.get()),
+    core::utils::castOutputData<float>(scratch_3_data.get()), execute_args);
+}
+#endif // DIS_FLOAT
+
+void validateWeightTensorSize(const circle::Tensor *weight_tensor, int dim1_size, int dim2_size)
+{
+  assert(num_dims(weight_tensor) == 2);
+  assert(dim(weight_tensor, 0) == dim1_size);
+  assert(dim(weight_tensor, 1) == dim2_size);
+}
+
+void validateTensorsSize(lstm::LSTMStruct *lstm_struct, const bool time_major)
+{
+  const auto batch_size = time_major ? dim(lstm_struct->input(), 1) : dim(lstm_struct->input(), 0);
+
+  const auto input_dimension = dim(lstm_struct->input(), 2);
+  const auto state_dimension = dim(lstm_struct->output_state(), 1);
+
+  // Input FC weights
+  for (int32_t i = 1; i < 5; i++)
+  {
+    validateWeightTensorSize(lstm_struct->get_internal_tensor(i), state_dimension, input_dimension);
+  }
+
+  // Recurrent FC weights
+  for (int32_t i = 5; i < 9; i++)
+  {
+    validateWeightTensorSize(lstm_struct->get_internal_tensor(i), state_dimension, state_dimension);
+  }
+
+  // Biases
+  for (int32_t i = 12; i < 16; i++)
+  {
+    assert(num_dims(lstm_struct->get_internal_tensor(i)) == 1);
+    assert(dim(lstm_struct->get_internal_tensor(i), 0) == state_dimension);
+  }
+
+  // Check the shape of input state tensors.
+  // These tensor may be 1D or 2D. It's fine as long as the total size is
+  // correct.
+  assert(num_elements(lstm_struct->output_state()) == batch_size * state_dimension);
+  assert(num_elements(lstm_struct->cell_state()) == batch_size * state_dimension);
+
+  // Check the shape of output tensor against that of input tensor
+  assert(num_dims(lstm_struct->output()) == 3);
+  assert(dim(lstm_struct->input(), 0) == dim(lstm_struct->output(), 0));
+  assert(dim(lstm_struct->input(), 1) == dim(lstm_struct->output(), 1));
+  assert(dim(lstm_struct->output(), 2) == state_dimension);
+}
+
+} // namespace
+
+namespace execute
+{
+
+OMStatus execute_kernel_CircleUnidirectionalSequenceLSTM(const OMExecuteArgs &execute_args)
+{
+  core::OMRuntimeContext &runtime_context = execute_args.runtime_context;
+  core::OMRuntimeStorage &runtime_storage = execute_args.runtime_storage;
+  uint16_t op_index = execute_args.kernel_index;
+
+  const circle::Tensor *input;
+
+  execute::OMRuntimeKernel runtime_kernel;
+  runtime_kernel.readKernel(op_index, runtime_context);
+  input = runtime_kernel.inputs[0];
+
+  switch (input->type())
+  {
+#ifndef DIS_FLOAT
+    case circle::TensorType_FLOAT32:
+      evalFloat(execute_args);
+      break;
+#endif // DIS_FLOAT
+    default:
+      assert(false && "Unsupported type.");
+  }
+  return OMStatus::Ok;
+}
+
+} // namespace execute
+} // namespace onert_micro
diff --git a/onert-micro/onert-micro/src/execute/kernels/tests/UnidirectionalSequenceLSTM.test.cpp b/onert-micro/onert-micro/src/execute/kernels/tests/UnidirectionalSequenceLSTM.test.cpp
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/onert-micro/onert-micro/src/import/kernels/UnidirectionalSequenceLSTM.cpp b/onert-micro/onert-micro/src/import/kernels/UnidirectionalSequenceLSTM.cpp
new file mode 100644
index 00000000000..7b69729c0de
--- /dev/null
+++ b/onert-micro/onert-micro/src/import/kernels/UnidirectionalSequenceLSTM.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "OMStatus.h"
+
+#include "import/OMKernelConfigureBuilder.h"
+
+#include "core/OMUtils.h"
+#include "core/OMKernelData.h"
+
+#include "execute/OMRuntimeKernel.h"
+
+using namespace onert_micro;
+using namespace onert_micro::core;
+
+namespace
+{
+
+constexpr uint32_t inputTensorIdx = 0;
+constexpr uint32_t outputTensorIdx = 0;
+
+} // namespace
+
+namespace onert_micro
+{
+namespace import
+{
+
+OMStatus configure_kernel_CircleUnidirectionalSequenceLSTM(const OMConfigureArgs &config_args)
+{
+  OMStatus status = OMStatus::Ok;
+  return status;
+}
+
+} // namespace import
+} // namespace onert_micro

From e9b491eeb29bf4b51e299ab87e031240e8b82eff Mon Sep 17 00:00:00 2001
From: Chunseok Lee <chunseok.lee@samsung.com>
Date: Tue, 7 Jan 2025 13:46:33 +0900
Subject: [PATCH 2/4] lstm f32 test

---
 .../FloatUnidirectionalLSTMKernel.h           | 164 ++++++++++++++++++
 .../TestDataUnidirectionalLSTMBase.h          |  60 +++++++
 .../tests/UnidirectionalSequenceLSTM.test.cpp |  67 +++++++
 3 files changed, 291 insertions(+)
 create mode 100644 onert-micro/onert-micro/include/test_models/unidirectional_lstm/FloatUnidirectionalLSTMKernel.h
 create mode 100644 onert-micro/onert-micro/include/test_models/unidirectional_lstm/TestDataUnidirectionalLSTMBase.h

diff --git a/onert-micro/onert-micro/include/test_models/unidirectional_lstm/FloatUnidirectionalLSTMKernel.h b/onert-micro/onert-micro/include/test_models/unidirectional_lstm/FloatUnidirectionalLSTMKernel.h
new file mode 100644
index 00000000000..88a9a17b2f5
--- /dev/null
+++ b/onert-micro/onert-micro/include/test_models/unidirectional_lstm/FloatUnidirectionalLSTMKernel.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ONERT_MICRO_TEST_MODELS_FLOAT_UNIDIRECTIONAL_LSTM_KERNEL_H
+#define ONERT_MICRO_TEST_MODELS_FLOAT_UNIDIRECTIONAL_LSTM_KERNEL_H
+
+#include "TestDataUnidirectionalLSTMBase.h"
+
+namespace onert_micro
+{
+namespace test_model
+{
+namespace unidir_lstm_float
+{
+/*
+ * UnidirectionalLSTM Kernel:
+ *
+ *      Input(1, 4, 4)
+ *            |
+ *      UnidirectionalLSTM
+ *            |
+ *      Output(1, 4, 2)
+ */
+const unsigned char test_kernel_model_circle[] = {
+  0x18, 0x00, 0x00, 0x00, 0x43, 0x49, 0x52, 0x30, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x00, 0x00,
+  0x0c, 0x00, 0x08, 0x00, 0x10, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+  0xb4, 0x01, 0x00, 0x00, 0xd0, 0x05, 0x00, 0x00, 0xec, 0x05, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00,
+  0xa0, 0x01, 0x00, 0x00, 0x94, 0x01, 0x00, 0x00, 0x8c, 0x01, 0x00, 0x00, 0x84, 0x01, 0x00, 0x00,
+  0x64, 0x01, 0x00, 0x00, 0x3c, 0x01, 0x00, 0x00, 0x1c, 0x01, 0x00, 0x00, 0xfc, 0x00, 0x00, 0x00,
+  0xe4, 0x00, 0x00, 0x00, 0xcc, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00,
+  0x3c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xa0, 0xfe, 0xff, 0xff,
+  0xd2, 0xfe, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x84, 0x9b, 0x3c, 0xbe,
+  0x48, 0xfc, 0x22, 0xbf, 0x35, 0x43, 0xba, 0x3e, 0x18, 0x5c, 0xdb, 0x3e, 0x4b, 0x05, 0xdd, 0xbe,
+  0xf5, 0x0f, 0x1e, 0xbf, 0x1f, 0x2e, 0x09, 0x3f, 0x9e, 0xb5, 0x2f, 0x3f, 0xfe, 0xfe, 0xff, 0xff,
+  0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0xca, 0xfe, 0x05, 0x3f, 0xea, 0x91, 0x06, 0xbe,
+  0x77, 0xf4, 0xa7, 0xbe, 0x3f, 0x02, 0x23, 0xbf, 0xd1, 0xdc, 0x94, 0xbd, 0xc2, 0xdd, 0xb1, 0xbe,
+  0x45, 0x13, 0xc8, 0x3e, 0x7f, 0x6b, 0xef, 0x3e, 0x2a, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+  0x20, 0x00, 0x00, 0x00, 0xec, 0xde, 0xe2, 0xbe, 0xf6, 0x46, 0x01, 0xbf, 0xed, 0x4d, 0x97, 0xbd,
+  0xf2, 0xed, 0x09, 0xbf, 0x88, 0x4c, 0xe1, 0x3e, 0x60, 0x74, 0x89, 0x3e, 0x29, 0x79, 0x75, 0x3c,
+  0x9b, 0x8f, 0xdb, 0xbe, 0x56, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+  0xe7, 0xc8, 0x6a, 0x3e, 0x16, 0x06, 0x8b, 0xbd, 0x49, 0xf5, 0xe5, 0x3e, 0x01, 0xfb, 0xf0, 0x3e,
+  0x7c, 0x48, 0x10, 0xbf, 0x12, 0xd8, 0x94, 0xbe, 0x9a, 0xec, 0xaf, 0x3e, 0x4c, 0x1a, 0xdb, 0xbe,
+  0x82, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f,
+  0x00, 0x00, 0x80, 0x3f, 0x96, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,
+  0x10, 0x00, 0x00, 0x00, 0x8d, 0xd4, 0xdb, 0xbd, 0xfe, 0x63, 0xd1, 0x3e, 0x9f, 0x60, 0x1a, 0x3d,
+  0xa1, 0x48, 0x0b, 0xbf, 0xc6, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+  0x3a, 0xb2, 0xca, 0x3e, 0x58, 0x1a, 0x04, 0xbf, 0xe6, 0x76, 0x9f, 0x3e, 0x61, 0xa7, 0xd8, 0x3e,
+  0xe2, 0xff, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xf1, 0x48, 0x5c, 0xbe,
+  0xcd, 0x54, 0xad, 0x3c, 0x9f, 0x8e, 0xbf, 0x3e, 0x69, 0xac, 0xfd, 0x3d, 0x00, 0x00, 0x06, 0x00,
+  0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+  0x93, 0x70, 0x21, 0xbf, 0x76, 0x27, 0x8e, 0x3c, 0x97, 0xe3, 0xc5, 0x3e, 0xe5, 0x7d, 0x8c, 0x3e,
+  0xf4, 0xff, 0xff, 0xff, 0xf8, 0xff, 0xff, 0xff, 0xfc, 0xff, 0xff, 0xff, 0x04, 0x00, 0x04, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00,
+  0x18, 0x00, 0x14, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00,
+  0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00,
+  0xcc, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x14, 0x00, 0x00, 0x00,
+  0x10, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x08, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47,
+  0x14, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00,
+  0x0b, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x41, 0x00, 0x00, 0x00, 0x04,
+  0x01, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x0b, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
+  0x05, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x06, 0x00, 0x00, 0x00,
+  0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
+  0x0d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00,
+  0xe4, 0x02, 0x00, 0x00, 0x98, 0x02, 0x00, 0x00, 0x54, 0x02, 0x00, 0x00, 0x20, 0x02, 0x00, 0x00,
+  0xec, 0x01, 0x00, 0x00, 0xb8, 0x01, 0x00, 0x00, 0x88, 0x01, 0x00, 0x00, 0x58, 0x01, 0x00, 0x00,
+  0x24, 0x01, 0x00, 0x00, 0xf0, 0x00, 0x00, 0x00, 0xbc, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00,
+  0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x60, 0xfd, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00,
+  0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x53, 0x74, 0x61, 0x74,
+  0x65, 0x66, 0x75, 0x6c, 0x50, 0x61, 0x72, 0x74, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x65, 0x64, 0x43,
+  0x61, 0x6c, 0x6c, 0x3a, 0x30, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xec, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+  0x0c, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00,
+  0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x2f, 0x6c, 0x73, 0x74, 0x6d, 0x2f,
+  0x7a, 0x65, 0x72, 0x6f, 0x73, 0x31, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x02, 0x00, 0x00, 0x00, 0xdc, 0xfd, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00,
+  0x18, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x61, 0x72, 0x69, 0x74, 0x68, 0x2e, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x39, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x0c, 0xfe, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
+  0x18, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x61, 0x72, 0x69, 0x74, 0x68, 0x2e, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x38, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x3c, 0xfe, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+  0x18, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x61, 0x72, 0x69, 0x74, 0x68, 0x2e, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x37, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00,
+  0x18, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x61, 0x72, 0x69, 0x74, 0x68, 0x2e, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x36, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x9c, 0xfe, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00,
+  0x18, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00, 0x61, 0x72, 0x69, 0x74, 0x68, 0x2e, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x35, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0xc8, 0xfe, 0xff, 0xff, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
+  0x0f, 0x00, 0x00, 0x00, 0x61, 0x72, 0x69, 0x74, 0x68, 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x61,
+  0x6e, 0x74, 0x34, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0xf4, 0xfe, 0xff, 0xff,
+  0x0c, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00,
+  0x61, 0x72, 0x69, 0x74, 0x68, 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x33, 0x00,
+  0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x24, 0xff, 0xff, 0xff,
+  0x0c, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00,
+  0x61, 0x72, 0x69, 0x74, 0x68, 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x32, 0x00,
+  0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x54, 0xff, 0xff, 0xff,
+  0x0c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x00, 0x00,
+  0x61, 0x72, 0x69, 0x74, 0x68, 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x31, 0x00,
+  0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x84, 0xff, 0xff, 0xff,
+  0x0c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00,
+  0x61, 0x72, 0x69, 0x74, 0x68, 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00,
+  0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x14, 0x00,
+  0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x00, 0x00, 0x07, 0x00, 0x10, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x01, 0x0c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+  0x15, 0x00, 0x00, 0x00, 0x73, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x74, 0x69, 0x61, 0x6c, 0x2f, 0x6c,
+  0x73, 0x74, 0x6d, 0x2f, 0x7a, 0x65, 0x72, 0x6f, 0x73, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+  0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00,
+  0x08, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+  0x24, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x73, 0x65, 0x72, 0x76, 0x69, 0x6e, 0x67, 0x5f,
+  0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x5f, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x5f, 0x31, 0x3a,
+  0x30, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+  0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x0c, 0x00,
+  0x0b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x2c, 0x11, 0x00, 0x00, 0x00, 0x4f, 0x4e, 0x45, 0x2d, 0x74, 0x66, 0x6c, 0x69,
+  0x74, 0x65, 0x32, 0x63, 0x69, 0x72, 0x63, 0x6c, 0x65, 0x00, 0x00, 0x00};
+
+const std::vector<float> input_data = {
+  -3.509163, -18.256927, 6.4799614, 10.296598, 30.371328, 18.692572, 10.12867,   -26.44944,
+  25.324795, 3.8303719,  20.93112,  22.603086, -4.308655, 2.3276749, -5.9565907, 25.611776};
+
+const std::vector<float> reference_output_data = {0.7613201,      -0.7570043,   0.0480366,
+                                                  -4.3364323e-11, -0.7613433,   1.3437739e-08,
+                                                  -0.7613537,     -7.000451e-08};
+
+} // namespace unidir_lstm_float
+
+class TestDataFloatUnidirectionalLSTM : public TestDataUnidirectionalLSTMBase<float>
+{
+public:
+  TestDataFloatUnidirectionalLSTM()
+  {
+    _input_data = unidir_lstm_float::input_data;
+    _reference_output_data = unidir_lstm_float::reference_output_data;
+    _test_kernel_model_circle = unidir_lstm_float::test_kernel_model_circle;
+  }
+
+  ~TestDataFloatUnidirectionalLSTM() override = default;
+};
+
+} // namespace test_model
+} // namespace onert_micro
+
+#endif // ONERT_MICRO_TEST_MODELS_FLOAT_UNIDIRECTIONAL_LSTM_KERNEL_H
diff --git a/onert-micro/onert-micro/include/test_models/unidirectional_lstm/TestDataUnidirectionalLSTMBase.h b/onert-micro/onert-micro/include/test_models/unidirectional_lstm/TestDataUnidirectionalLSTMBase.h
new file mode 100644
index 00000000000..921eda3d7d8
--- /dev/null
+++ b/onert-micro/onert-micro/include/test_models/unidirectional_lstm/TestDataUnidirectionalLSTMBase.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ONERT_MICRO_TEST_MODELS_UNIDIRECTIONAL_LSTM_KERNEL_BASE_H
+#define ONERT_MICRO_TEST_MODELS_UNIDIRECTIONAL_LSTM_KERNEL_BASE_H
+
+#include "test_models/TestDataBase.h"
+
+namespace onert_micro
+{
+namespace test_model
+{
+
+template <typename T> class TestDataUnidirectionalLSTMBase : public TestDataBase<T>
+{
+public:
+  TestDataUnidirectionalLSTMBase() = default;
+
+  const unsigned char *get_model_ptr() override final { return _test_kernel_model_circle; }
+
+  const std::vector<T> &get_input_data_by_index(int i) override final
+  {
+    switch (i)
+    {
+      case 0:
+        return _input_data;
+      default:
+        assert(false && "Wrong input index");
+    }
+  }
+
+  const std::vector<T> &get_output_data_by_index(int i) override final
+  {
+    assert(i == 0);
+    return _reference_output_data;
+  }
+
+protected:
+  std::vector<T> _input_data;
+  std::vector<T> _reference_output_data;
+  const unsigned char *_test_kernel_model_circle;
+};
+
+} // namespace test_model
+} // namespace onert_micro
+
+#endif // ONERT_MICRO_TEST_MODELS_UNIDIRECTIONAL_LSTM_KERNEL_BASE_H
diff --git a/onert-micro/onert-micro/src/execute/kernels/tests/UnidirectionalSequenceLSTM.test.cpp b/onert-micro/onert-micro/src/execute/kernels/tests/UnidirectionalSequenceLSTM.test.cpp
index e69de29bb2d..bae0fbf670e 100644
--- a/onert-micro/onert-micro/src/execute/kernels/tests/UnidirectionalSequenceLSTM.test.cpp
+++ b/onert-micro/onert-micro/src/execute/kernels/tests/UnidirectionalSequenceLSTM.test.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "execute/OMTestUtils.h"
+#include "test_models/unidirectional_lstm/FloatUnidirectionalLSTMKernel.h"
+
+namespace onert_micro
+{
+namespace execute
+{
+namespace testing
+{
+
+using namespace testing;
+
+template <typename T>
+std::vector<T> checkLSTMKernel(test_model::TestDataUnidirectionalLSTMBase<T> *test_data_base)
+{
+  onert_micro::OMInterpreter interpreter;
+  onert_micro::OMConfig config;
+
+  interpreter.importModel(reinterpret_cast<const char *>(test_data_base->get_model_ptr()), config);
+
+  interpreter.reset();
+  interpreter.allocateInputs();
+
+  T *input_data = reinterpret_cast<T *>(interpreter.getInputDataAt(0));
+
+  std::copy(test_data_base->get_input_data_by_index(0).begin(),
+            test_data_base->get_input_data_by_index(0).end(), input_data);
+  interpreter.run(config);
+
+  T *output_data = reinterpret_cast<T *>(interpreter.getOutputDataAt(0));
+  const size_t num_elements = interpreter.getOutputSizeAt(0);
+  std::vector<T> output_data_vector(output_data, output_data + num_elements);
+  return output_data_vector;
+}
+
+class UnidirectionalLSTMTest : public ::testing::Test
+{
+  // Do nothing
+};
+
+TEST_F(UnidirectionalLSTMTest, Float_P)
+{
+  onert_micro::test_model::TestDataFloatUnidirectionalLSTM test_data_kernel;
+  std::vector<float> output_data_vector = checkLSTMKernel<float>(&test_data_kernel);
+  EXPECT_THAT(output_data_vector,
+              FloatArrayNear(test_data_kernel.get_output_data_by_index(0), 0.0000001f));
+}
+
+} // namespace testing
+} // namespace execute
+} // namespace onert_micro

From 842903257c428bde7c443f2fd4308d19bc2aa0a3 Mon Sep 17 00:00:00 2001
From: Chunseok Lee <chunseok.lee@samsung.com>
Date: Tue, 7 Jan 2025 15:58:12 +0900
Subject: [PATCH 3/4] fix svace overflow

---
 .../src/execute/kernels/UnidirectionalSequenceLSTM.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/onert-micro/onert-micro/src/execute/kernels/UnidirectionalSequenceLSTM.cpp b/onert-micro/onert-micro/src/execute/kernels/UnidirectionalSequenceLSTM.cpp
index f36090b16e8..88a76e647ae 100644
--- a/onert-micro/onert-micro/src/execute/kernels/UnidirectionalSequenceLSTM.cpp
+++ b/onert-micro/onert-micro/src/execute/kernels/UnidirectionalSequenceLSTM.cpp
@@ -283,14 +283,16 @@ void evalFloat(const OMExecuteArgs &execute_args)
   const auto cell_state_type_size =
     getOMDataTypeSize(onertMicroDatatype(lstm_struct.cell_state()->type()));
 
+  size_t scratch_buf_size = (long)batch_size * (long)state_dimension * (long)cell_state_type_size;
+
   auto scratch_0_data =
-    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
+    std::make_unique<uint8_t[]>(scratch_buf_size);
   auto scratch_1_data =
-    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
+    std::make_unique<uint8_t[]>(scratch_buf_size);
   auto scratch_2_data =
-    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
+    std::make_unique<uint8_t[]>(scratch_buf_size);
   auto scratch_3_data =
-    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
+    std::make_unique<uint8_t[]>(scratch_buf_size);
 
   // Create and fill with 0 output state tensor
   auto output_state_data = std::make_unique<float[]>(num_elements(lstm_struct.output_state()));

From 2e8294ca6518a7dc76b9260d6180b46c4c2152a0 Mon Sep 17 00:00:00 2001
From: Chunseok Lee <chunseok.lee@samsung.com>
Date: Mon, 3 Feb 2025 14:45:19 +0900
Subject: [PATCH 4/4] fix format

---
 .../execute/kernels/UnidirectionalSequenceLSTM.cpp   | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/onert-micro/onert-micro/src/execute/kernels/UnidirectionalSequenceLSTM.cpp b/onert-micro/onert-micro/src/execute/kernels/UnidirectionalSequenceLSTM.cpp
index 88a76e647ae..548269aa51c 100644
--- a/onert-micro/onert-micro/src/execute/kernels/UnidirectionalSequenceLSTM.cpp
+++ b/onert-micro/onert-micro/src/execute/kernels/UnidirectionalSequenceLSTM.cpp
@@ -285,14 +285,10 @@ void evalFloat(const OMExecuteArgs &execute_args)
 
   size_t scratch_buf_size = (long)batch_size * (long)state_dimension * (long)cell_state_type_size;
 
-  auto scratch_0_data =
-    std::make_unique<uint8_t[]>(scratch_buf_size);
-  auto scratch_1_data =
-    std::make_unique<uint8_t[]>(scratch_buf_size);
-  auto scratch_2_data =
-    std::make_unique<uint8_t[]>(scratch_buf_size);
-  auto scratch_3_data =
-    std::make_unique<uint8_t[]>(scratch_buf_size);
+  auto scratch_0_data = std::make_unique<uint8_t[]>(scratch_buf_size);
+  auto scratch_1_data = std::make_unique<uint8_t[]>(scratch_buf_size);
+  auto scratch_2_data = std::make_unique<uint8_t[]>(scratch_buf_size);
+  auto scratch_3_data = std::make_unique<uint8_t[]>(scratch_buf_size);
 
   // Create and fill with 0 output state tensor
   auto output_state_data = std::make_unique<float[]>(num_elements(lstm_struct.output_state()));