commit 157639e9e3420c85915183897007303aa756028e
Author: Samuel Dobron
Date:   Sat Jul 13 13:15:28 2024 +0000

    init commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..429782b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,17 @@
+# ide related
+.vs/*
+.idea/*
+# build
+.build/*
+x64/*
+cmake-build-debug/*
+cmake-build-debug-visual-studio/*
+# ignore CMakeLists.txt but add example file
+CMakeLists.txt
+# dlls:
+*.dll
+# vs-like objects
+*.sln
+*vcxproj*
+packages.config
+packages/*
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..fa6e2fa
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,64 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 3.0.0)
+project(YOLOv8CPP)
+
+SET(CMAKE_CXX_STANDARD 17)
+SET(CMAKE_CXX_STANDARD_REQUIRED ON)
+# SET(OpenCV_DIR your/path/to/opencv/build/x64/vc16/lib)  # opencv lib root
+# SET(OpenCV_BIN_DIR your/path/to/opencv/build/x64/vc16/bin)  # opencv bin root
+
+# SET(OpenCV_DEBUG_DLL_FILENAME opencv_world480d.dll)  # change filenames
+# SET(OpenCV_RELEASE_DLL_FILENAME opencv_world480.dll)  # change filenames
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
+SET(ONNXRUNTIME_DIR /home/xdobro23/PP1/onnxruntime/onnxruntime-linux-x64-gpu-1.18.1)  # onnxruntime root
+
+FIND_PACKAGE(OpenCV REQUIRED)
+set(CMAKE_BUILD_TYPE Debug)
+# --- Configure your project files ---
+include_directories(include)  # Include your header files directory
+
+# Recursively collect all source files under 'src' directory
+file(GLOB_RECURSE CURR_SOURCES src/*.cpp)
+
+# Create the executable
+add_executable(YOLOv8CPP ${CURR_SOURCES})
+
+target_compile_features(YOLOv8CPP PRIVATE cxx_std_17)
+
+TARGET_INCLUDE_DIRECTORIES(YOLOv8CPP PRIVATE "${ONNXRUNTIME_DIR}/include")
+
+
+TARGET_LINK_LIBRARIES(YOLOv8CPP ${OpenCV_LIBS})
+
+if (WIN32)
+    TARGET_LINK_LIBRARIES(YOLOv8CPP "${ONNXRUNTIME_DIR}/lib/onnxruntime.lib")
+
+    # some changes to the original version:
+    # copy onnxruntime dll
+    add_custom_command(TARGET YOLOv8CPP POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        "${ONNXRUNTIME_DIR}/lib/onnxruntime.dll"
+        "$<TARGET_FILE_DIR:YOLOv8CPP>"
+    )
+    # copy opencv
+#[[  add_custom_command(TARGET YOLOv8CPP POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        "${OpenCV_DIR}/${OpenCV_DLL_FILENAME}"
+        "$<TARGET_FILE_DIR:YOLOv8CPP>"
+    )]]
+    add_custom_command(TARGET YOLOv8CPP POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        "${OpenCV_BIN_DIR}/${OpenCV_DEBUG_DLL_FILENAME}"
+        "$<TARGET_FILE_DIR:YOLOv8CPP>"
+    )
+    # add release
+    add_custom_command(TARGET YOLOv8CPP POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        "${OpenCV_BIN_DIR}/${OpenCV_RELEASE_DLL_FILENAME}"
+        "$<TARGET_FILE_DIR:YOLOv8CPP>"
+    )
+
+endif(WIN32)
+
+if (UNIX)
+    TARGET_LINK_LIBRARIES(YOLOv8CPP "${ONNXRUNTIME_DIR}/lib/libonnxruntime.so")
+endif(UNIX)
diff --git a/include/constants.h b/include/constants.h
new file mode 100644
index 0000000..5f4f673
--- /dev/null
+++ b/include/constants.h
@@ -0,0 +1,34 @@
+#pragma once
+#include <string>
+
+namespace MetadataConstants {
+    inline const std::string IMGSZ = "imgsz";
+    inline const std::string STRIDE = "stride";
+    inline const std::string NC = "nc";
+    inline const std::string CH = "ch";
+    inline const std::string DATE = "date";
+    inline const std::string VERSION = "version";
+    inline const std::string TASK = "task";
+    inline const std::string BATCH = "batch";
+    inline const std::string NAMES = "names";
+}
+
+namespace OnnxProviders {
+    inline const std::string CPU = "cpu";
+    inline const std::string CUDA = "cuda";
+}
+
+namespace OnnxInitializers
+{
+    inline const int UNINITIALIZED_STRIDE = -1;
+    inline const int UNINITIALIZED_NC = -1;
+}
+
+
+namespace YoloTasks
+{
+    inline const std::string SEGMENT = "segment";
+    inline const std::string DETECT = "detect";
+    inline const std::string POSE = "pose";
+    inline const std::string CLASSIFY = "classify";
+}
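
These string constants exist so that postprocessing can be selected by comparing the model's exported metadata values instead of scattering hard-coded literals. A minimal sketch of that dispatch pattern, mirroring the branch in `predict_once` further down (the function name here is hypothetical, not part of this commit):

// Illustrative only: route to a postprocessing branch by task string.
#include <stdexcept>
#include <string>
#include "constants.h"

void dispatch_example(const std::string& task) {
    if (task == YoloTasks::SEGMENT)      { /* postprocess_masks(...) */ }
    else if (task == YoloTasks::DETECT)  { /* postprocess_detects(...) */ }
    else if (task == YoloTasks::POSE)    { /* postprocess_kpts(...) */ }
    else throw std::runtime_error("NotImplementedError: task: " + task);
}
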
diff --git a/include/nn/autobackend.h b/include/nn/autobackend.h
new file mode 100644
index 0000000..f7f0cf5
--- /dev/null
+++ b/include/nn/autobackend.h
@@ -0,0 +1,90 @@
+#pragma once
+#include <filesystem>
+#include <string>
+#include <vector>
+#include <opencv2/core.hpp>
+
+#include "onnx_model_base.h"
+#include "constants.h"
+
+/**
+ * @brief Represents the results of YOLO prediction.
+ *
+ * This structure stores information about a detected object, including its class index,
+ * confidence score, bounding box, semantic segmentation mask, and keypoints (if available).
+ */
+struct YoloResults {
+    int class_idx{};                 ///< The class index of the detected object.
+    float conf{};                    ///< The confidence score of the detection.
+    cv::Rect_<float> bbox;           ///< The bounding box of the detected object.
+    cv::Mat mask;                    ///< The semantic segmentation mask (if available).
+    std::vector<float> keypoints{};  ///< Keypoints representing the object's pose (if available).
+};
+
+struct ImageInfo {
+    cv::Size raw_size;  // add additional attrs if you need
+};
+
+
+class AutoBackendOnnx : public OnnxModelBase {
+public:
+    // constructors
+    AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider,
+                    const std::vector<int>& imgsz, const int& stride,
+                    const int& nc, std::unordered_map<int, std::string> names);
+
+    AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider);
+
+    // getters
+    virtual const std::vector<int>& getImgsz();
+    virtual const int& getStride();
+    virtual const int& getCh();
+    virtual const int& getNc();
+    virtual const std::unordered_map<int, std::string>& getNames();
+    virtual const std::vector<int64_t>& getInputTensorShape();
+    virtual const int& getWidth();
+    virtual const int& getHeight();
+    virtual const cv::Size& getCvSize();
+    virtual const std::string& getTask();
+    /**
+     * @brief Runs object detection on an input image.
+     *
+     * This method performs object detection on the input image and returns the detected objects as YoloResults.
+     *
+     * @param image The input image to run object detection on.
+     * @param conf The confidence threshold for object detection.
+     * @param iou The intersection-over-union (IoU) threshold for non-maximum suppression.
+     * @param mask_threshold The threshold for the semantic segmentation mask.
+     * @param conversionCode An optional conversion code for image format conversion (e.g., cv::COLOR_BGR2RGB).
+     *                       Default value is -1, indicating no conversion.
+     *                       TODO: use some constant from some namespace rather than hardcoded values here
+     *
+     * @return A vector of YoloResults representing the detected objects.
+     */
+    virtual std::vector<YoloResults> predict_once(cv::Mat& image, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
+    virtual std::vector<YoloResults> predict_once(const std::filesystem::path& imagePath, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
+    virtual std::vector<YoloResults> predict_once(const std::string& imagePath, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
+
+    virtual void fill_blob(cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
+    virtual void postprocess_masks(cv::Mat& output0, cv::Mat& output1, ImageInfo para, std::vector<YoloResults>& output,
+                                   int& class_names_num, float& conf_threshold, float& iou_threshold,
+                                   int& iw, int& ih, int& mw, int& mh, int& masks_features_num, float mask_threshold = 0.50f);
+
+    virtual void postprocess_detects(cv::Mat& output0, ImageInfo image_info, std::vector<YoloResults>& output,
+                                     int& class_names_num, float& conf_threshold, float& iou_threshold);
+    virtual void postprocess_kpts(cv::Mat& output0, ImageInfo& image_info, std::vector<YoloResults>& output,
+                                  int& class_names_num, float& conf_threshold, float& iou_threshold);
+    static void _get_mask2(const cv::Mat& mask_info, const cv::Mat& mask_data, const ImageInfo& image_info, cv::Rect bound, cv::Mat& mask_out,
+                           float& mask_thresh, int& iw, int& ih, int& mw, int& mh, int& masks_features_num, bool round_downsampled = false);
+
+protected:
+    std::vector<int> imgsz_;
+    int stride_ = OnnxInitializers::UNINITIALIZED_STRIDE;
+    int nc_ = OnnxInitializers::UNINITIALIZED_NC;
+    int ch_ = 3;
+    std::unordered_map<int, std::string> names_;
+    std::vector<int64_t> inputTensorShape_;
+    cv::Size cvSize_;
+    std::string task_;
+    //cv::MatSize cvMatSize_;
+};
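
This header is the whole user-facing surface of the backend. A minimal, illustrative call sequence (the model path, image path, and function name are placeholders; `src/main.cpp` below does the same thing with real arguments):

// Illustrative only: construct the backend and run one prediction.
#include <opencv2/imgcodecs.hpp>
#include "nn/autobackend.h"

int example() {
    AutoBackendOnnx model("yolov8n.onnx", "logid", "cpu");
    cv::Mat img = cv::imread("image.png", cv::IMREAD_UNCHANGED);
    float conf = 0.30f, iou = 0.45f, mask_thr = 0.50f;  // thresholds are lvalues: the API takes float&
    std::vector<YoloResults> objs = model.predict_once(img, conf, iou, mask_thr);
    for (const YoloResults& r : objs) {
        // r.class_idx, r.conf, r.bbox; r.mask / r.keypoints are filled when the task provides them
    }
    return static_cast<int>(objs.size());
}
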
diff --git a/include/nn/onnx_model_base.h b/include/nn/onnx_model_base.h
new file mode 100644
index 0000000..dcd8e20
--- /dev/null
+++ b/include/nn/onnx_model_base.h
@@ -0,0 +1,39 @@
+#pragma once
+#include <onnxruntime_cxx_api.h>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+/*
+ * This interface must provide only the arguments required to load any onnx model, regardless of model-specific info -
+ * i.e. modelPath will always be required, and so will a provider like "cpu" or "cuda", since these are the parameters needed
+ * to set up the `sessionOptions` and `session` objects properly; but image size is not needed for a pure onnx graph to be loaded, so do NOT include it here
+ */
+class OnnxModelBase {
+public:
+    OnnxModelBase(const char* modelPath, const char* logid, const char* provider);
+    //OnnxModelBase(); // no default constructor should be there
+    //virtual ~OnnxModelBase();
+    virtual const std::vector<std::string>& getInputNames();  // = 0
+    virtual const std::vector<std::string>& getOutputNames();
+    virtual const std::vector<const char*> getOutputNamesCStr();
+    virtual const std::vector<const char*> getInputNamesCStr();
+    virtual const Ort::ModelMetadata& getModelMetadata();
+    virtual const std::unordered_map<std::string, std::string>& getMetadata();
+    virtual const char* getModelPath();
+    virtual const Ort::Session& getSession();
+    //virtual std::vector<Ort::Value> forward(std::vector<Ort::Value> inputTensors);
+    virtual std::vector<Ort::Value> forward(std::vector<Ort::Value>& inputTensors);
+    Ort::Session session{ nullptr };
+
+protected:
+    const char* modelPath_;
+    Ort::Env env{ nullptr };
+
+    std::vector<std::string> inputNodeNames;
+    std::vector<std::string> outputNodeNames;
+    Ort::ModelMetadata model_metadata{ nullptr };
+    std::unordered_map<std::string, std::string> metadata;
+    std::vector<const char*> outputNamesCStr;
+    std::vector<const char*> inputNamesCStr;
+};
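
`getMetadata()` exposes the custom-metadata map that ultralytics export writes into the onnx file; the derived `AutoBackendOnnx` reads `imgsz`, `stride`, `names`, and `task` out of it. A minimal sketch of pulling one key out, assuming the model actually carries it (the function name is hypothetical):

// Illustrative only: read one custom-metadata key exported by ultralytics.
#include <iostream>
#include "nn/onnx_model_base.h"
#include "constants.h"

void print_task_example(OnnxModelBase& model) {
    const auto& meta = model.getMetadata();
    auto it = meta.find(MetadataConstants::TASK);
    if (it != meta.end())
        std::cout << "task: " << it->second << std::endl;  // e.g. "detect"
    else
        std::cerr << "model carries no 'task' metadata" << std::endl;
}
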
diff --git a/include/utils/augment.h b/include/utils/augment.h
new file mode 100644
index 0000000..c5e21d8
--- /dev/null
+++ b/include/utils/augment.h
@@ -0,0 +1,20 @@
+#pragma once
+#include <opencv2/opencv.hpp>
+
+void letterbox(const cv::Mat& image,
+               cv::Mat& outImage,
+               const cv::Size& newShape = cv::Size(640, 640),
+               cv::Scalar_<double> color = cv::Scalar(), bool auto_ = true,
+               bool scaleFill = false,
+               bool scaleUp = true,
+               int stride = 32
+);
+
+
+cv::Mat scale_image(const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair<float, cv::Point2f>& ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)));
+
+void scale_image2(
+    cv::Mat& scaled_mask, const cv::Mat& resized_mask, const cv::Size& im0_shape,
+    const std::pair<float, cv::Point2f>& ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f))
+);
diff --git a/include/utils/common.h b/include/utils/common.h
new file mode 100644
index 0000000..52d08a8
--- /dev/null
+++ b/include/utils/common.h
@@ -0,0 +1,26 @@
+#ifndef COMMON_UTILS_H
+#define COMMON_UTILS_H
+
+#include <chrono>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+
+class Timer {
+public:
+    Timer(double& accumulator, bool isEnabled = true);
+    void Stop();
+
+private:
+    double& accumulator;
+    bool isEnabled;
+    std::chrono::time_point<std::chrono::high_resolution_clock> start;
+};
+
+std::wstring get_win_path(const std::string& path);
+std::vector<std::string> parseVectorString(const std::string& input);
+std::vector<int> convertStringVectorToInts(const std::vector<std::string>& input);
+std::unordered_map<int, std::string> parseNames(const std::string& input);
+int64_t vector_product(const std::vector<int64_t>& vec);
+#endif // COMMON_UTILS_H
\ No newline at end of file
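
`Timer` is a scoped accumulator: construction records a start time and `Stop()` adds the elapsed seconds to the referenced double, which is how `predict_once` later reports per-stage speed. A minimal sketch of that pattern (the surrounding function is hypothetical):

// Illustrative only: accumulate elapsed seconds across repeated stages.
#include <iostream>
#include "utils/common.h"

void timing_example() {
    double preprocess_time = 0.0;
    for (int i = 0; i < 3; ++i) {
        Timer t(preprocess_time, /*isEnabled=*/true);
        // ... the work being measured ...
        t.Stop();  // adds this iteration's duration to preprocess_time
    }
    std::cout << preprocess_time * 1000.0 << "ms total" << std::endl;
}
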
diff --git a/include/utils/ops.h b/include/utils/ops.h
new file mode 100644
index 0000000..df4c035
--- /dev/null
+++ b/include/utils/ops.h
@@ -0,0 +1,44 @@
+#pragma once
+#include <opencv2/opencv.hpp>
+
+//cv::Rect scaleCoords(const cv::Size& imageShape, const cv::Rect& coords, const cv::Size& imageOriginalShape);
+/**
+ * Scales a bounding box from the shape of the input image to the shape of an original image.
+ *
+ * @param img1_shape The shape (height, width) of the input image for the model.
+ * @param box The bounding box to be scaled, specified as cv::Rect_<float>.
+ * @param img0_shape The shape (height, width) of the original target image.
+ * @param ratio_pad An optional parameter that specifies scaling and padding factors as a pair of values.
+ *                  The first value (ratio) is used for scaling, and the second value (pad) is used for padding.
+ *                  If not provided, default values will be used.
+ * @param padding An optional boolean parameter that specifies whether padding should be applied.
+ *                If set to true, padding will be applied to the bounding box.
+ *
+ * @return A scaled bounding box specified as cv::Rect_<float>.
+ *
+ * This function rescales a bounding box from the shape of the input image (img1_shape) to the shape of an original image (img0_shape).
+ */
+cv::Rect_<float> scale_boxes(const cv::Size& img1_shape, cv::Rect_<float>& box, const cv::Size& img0_shape, std::pair<float, cv::Point2f> ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)), bool padding = true);
+void clip_boxes(cv::Rect& box, const cv::Size& shape);
+void clip_boxes(cv::Rect_<float>& box, const cv::Size& shape);
+void clip_boxes(std::vector<cv::Rect>& boxes, const cv::Size& shape);
+void clip_boxes(std::vector<cv::Rect_<float>>& boxes, const cv::Size& shape);
+
+//void clip_coords(cv::Mat& coords, const cv::Size& shape);
+//cv::Mat scale_coords(const cv::Size& img1_shape, cv::Mat& coords, const cv::Size& img0_shape);
+void clip_coords(std::vector<float>& coords, const cv::Size& shape);
+std::vector<float> scale_coords(const cv::Size& img1_shape, std::vector<float>& coords, const cv::Size& img0_shape);
+
+cv::Mat crop_mask(const cv::Mat& mask, const cv::Rect& box);
+
+
+struct NMSResult {
+    std::vector<cv::Rect> bboxes;
+    std::vector<float> confidences;
+    std::vector<int> classes;
+    std::vector<std::vector<float>> rest;
+};
+
+//std::tuple<std::vector<cv::Rect_<float>>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
+std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
+non_max_suppression(const cv::Mat& output0, int class_names_num, int total_features_num, double conf_threshold, float iou_threshold);
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
new file mode 100644
index 0000000..f2efa3e
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,359 @@
+#include <iostream>
+
+#include <filesystem>
+#include "nn/onnx_model_base.h"
+#include "nn/autobackend.h"
+#include <opencv2/opencv.hpp>
+#include <random>
+
+#include "utils/augment.h"
+#include "constants.h"
+#include "utils/common.h"
+
+
+namespace fs = std::filesystem;
+
+
+// Define the skeleton and color mappings
+std::vector<std::vector<int>> skeleton = {{16, 14}, {14, 12}, {17, 15}, {15, 13}, {12, 13}, {6, 12}, {7, 13}, {6, 7},
+                                          {6, 8}, {7, 9}, {8, 10}, {9, 11}, {2, 3}, {1, 2}, {1, 3}, {2, 4}, {3, 5}, {4, 6}, {5, 7}};
+
+std::vector<cv::Scalar> posePalette = {
+    cv::Scalar(255, 128, 0), cv::Scalar(255, 153, 51), cv::Scalar(255, 178, 102), cv::Scalar(230, 230, 0), cv::Scalar(255, 153, 255),
+    cv::Scalar(153, 204, 255), cv::Scalar(255, 102, 255), cv::Scalar(255, 51, 255), cv::Scalar(102, 178, 255), cv::Scalar(51, 153, 255),
+    cv::Scalar(255, 153, 153), cv::Scalar(255, 102, 102), cv::Scalar(255, 51, 51), cv::Scalar(153, 255, 153), cv::Scalar(102, 255, 102),
+    cv::Scalar(51, 255, 51), cv::Scalar(0, 255, 0), cv::Scalar(0, 0, 255), cv::Scalar(255, 0, 0), cv::Scalar(255, 255, 255)
+};
+
+std::vector<int> limbColorIndices = {9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16};
+std::vector<int> kptColorIndices = {16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9};
+
+
+
+cv::Scalar generateRandomColor(int numChannels) {
+    if (numChannels < 1 || numChannels > 3) {
+        throw std::invalid_argument("Invalid number of channels.
Must be between 1 and 3."); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, 255); + + cv::Scalar color; + for (int i = 0; i < numChannels; i++) { + color[i] = dis(gen); // for each channel separately generate value + } + + return color; +} + +std::vector generateRandomColors(int class_names_num, int numChannels) { + std::vector colors; + for (int i = 0; i < class_names_num; i++) { + cv::Scalar color = generateRandomColor(numChannels); + colors.push_back(color); + } + return colors; +} + +void plot_masks(cv::Mat img, std::vector& result, std::vector color, + std::unordered_map& names) +{ + cv::Mat mask = img.clone(); + for (int i = 0; i < result.size(); i++) + { + float left, top; + left = result[i].bbox.x; + top = result[i].bbox.y; + int color_num = i; + int& class_idx = result[i].class_idx; + rectangle(img, result[i].bbox, color[result[i].class_idx], 2); + + // try to get string value corresponding to given class_idx + std::string class_name; + auto it = names.find(class_idx); + if (it != names.end()) { + class_name = it->second; + } + else { + std::cerr << "Warning: class_idx not found in names for class_idx = " << class_idx << std::endl; + // then convert it to string anyway + class_name = std::to_string(class_idx); + } + + if (result[i].mask.rows && result[i].mask.cols > 0) + { + mask(result[i].bbox).setTo(color[result[i].class_idx], result[i].mask); + } + std::stringstream labelStream; + labelStream << class_name << " " << std::fixed << std::setprecision(2) << result[i].conf; + std::string label = labelStream.str(); + + cv::Size text_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.6, 2, nullptr); + cv::Rect rect_to_fill(left - 1, top - text_size.height - 5, text_size.width + 2, text_size.height + 5); + cv::Scalar text_color = cv::Scalar(255.0, 255.0, 255.0); + rectangle(img, rect_to_fill, color[result[i].class_idx], -1); + + putText(img, label, cv::Point(left - 1.5, top - 2.5), cv::FONT_HERSHEY_SIMPLEX, 0.6, text_color, 2); + } + addWeighted(img, 0.6, mask, 0.4, 0, img); //add mask to src + resize(img, img, img.size()); + imshow("img", img); + cv::waitKey(); +} + + +//void plot_keypoints(cv::Mat& image, const std::vector>& keypoints, const cv::Size& shape) { +void plot_keypoints(cv::Mat& image, const std::vector& results, const cv::Size& shape) { + + int radius = 5; + bool drawLines = true; + + if (results.empty()) { + return; + } + + std::vector limbColorPalette; + std::vector kptColorPalette; + + for (int index : limbColorIndices) { + limbColorPalette.push_back(posePalette[index]); + } + + for (int index : kptColorIndices) { + kptColorPalette.push_back(posePalette[index]); + } + + for (const auto& res: results) { + auto keypoint = res.keypoints; + bool isPose = keypoint.size() == 51; // numKeypoints == 17 && keypoints[0].size() == 3; + drawLines &= isPose; + + // draw points + for (int i = 0; i < 17; i++) { + int idx = i * 3; + int x_coord = static_cast(keypoint[idx]); + int y_coord = static_cast(keypoint[idx + 1]); + + if (x_coord % shape.width != 0 && y_coord % shape.height != 0) { + if (keypoint.size() == 3) { + float conf = keypoint[2]; + if (conf < 0.5) { + continue; + } + } + cv::Scalar color_k = isPose ? 
kptColorPalette[i] : cv::Scalar(0, 0, + 255); // Default to red if not in pose mode + cv::circle(image, cv::Point(x_coord, y_coord), radius, color_k, -1, cv::LINE_AA); + } + } + // draw lines + if (drawLines) { + for (int i = 0; i < skeleton.size(); i++) { + const std::vector &sk = skeleton[i]; + int idx1 = sk[0] - 1; + int idx2 = sk[1] - 1; + + int idx1_x_pos = idx1 * 3; + int idx2_x_pos = idx2 * 3; + + int x1 = static_cast(keypoint[idx1_x_pos]); + int y1 = static_cast(keypoint[idx1_x_pos + 1]); + int x2 = static_cast(keypoint[idx2_x_pos]); + int y2 = static_cast(keypoint[idx2_x_pos + 1]); + + float conf1 = keypoint[idx1_x_pos + 2]; + float conf2 = keypoint[idx2_x_pos + 2]; + + // Check confidence thresholds + if (conf1 < 0.5 || conf2 < 0.5) { + continue; + } + + // Check if positions are within bounds + if (x1 % shape.width == 0 || y1 % shape.height == 0 || x1 < 0 || y1 < 0 || + x2 % shape.width == 0 || y2 % shape.height == 0 || x2 < 0 || y2 < 0) { + continue; + } + + // Draw a line between keypoints + cv::Scalar color_limb = limbColorPalette[i]; + cv::line(image, cv::Point(x1, y1), cv::Point(x2, y2), color_limb, 2, cv::LINE_AA); + } + } + } +} + +void plot_results(cv::Mat img, std::vector& results, + std::vector color, std::unordered_map& names, + const cv::Size& shape + ) { + + cv::Mat mask = img.clone(); + + int radius = 5; + bool drawLines = true; + + auto raw_image_shape = img.size(); + std::vector limbColorPalette; + std::vector kptColorPalette; + + for (int index : limbColorIndices) { + limbColorPalette.push_back(posePalette[index]); + } + + for (int index : kptColorIndices) { + kptColorPalette.push_back(posePalette[index]); + } + + for (const auto& res : results) { + float left = res.bbox.x; + float top = res.bbox.y; + int color_num = res.class_idx; + + // Draw bounding box + rectangle(img, res.bbox, color[res.class_idx], 2); + + // Try to get the class name corresponding to the given class_idx + std::string class_name; + auto it = names.find(res.class_idx); + if (it != names.end()) { + class_name = it->second; + } + else { + std::cerr << "Warning: class_idx not found in names for class_idx = " << res.class_idx << std::endl; + // Then convert it to a string anyway + class_name = std::to_string(res.class_idx); + } + + // Draw mask if available + if (res.mask.rows && res.mask.cols > 0) { + mask(res.bbox).setTo(color[res.class_idx], res.mask); + } + + // Create label + std::stringstream labelStream; + labelStream << class_name << " " << std::fixed << std::setprecision(2) << res.conf; + std::string label = labelStream.str(); + + cv::Size text_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.6, 2, nullptr); + cv::Rect rect_to_fill(left - 1, top - text_size.height - 5, text_size.width + 2, text_size.height + 5); + cv::Scalar text_color = cv::Scalar(255.0, 255.0, 255.0); + rectangle(img, rect_to_fill, color[res.class_idx], -1); + putText(img, label, cv::Point(left - 1.5, top - 2.5), cv::FONT_HERSHEY_SIMPLEX, 0.6, text_color, 2); + + // Check if keypoints are available + if (!res.keypoints.empty()) { + auto keypoint = res.keypoints; + bool isPose = keypoint.size() == 51; // numKeypoints == 17 && keypoints[0].size() == 3; + drawLines &= isPose; + + // draw points + for (int i = 0; i < 17; i++) { + int idx = i * 3; + int x_coord = static_cast(keypoint[idx]); + int y_coord = static_cast(keypoint[idx + 1]); + + if (x_coord % raw_image_shape.width != 0 && y_coord % raw_image_shape.height != 0) { + if (keypoint.size() == 3) { + float conf = keypoint[2]; + if (conf < 0.5) { + continue; 
+ } + } + cv::Scalar color_k = isPose ? kptColorPalette[i] : cv::Scalar(0, 0, + 255); // Default to red if not in pose mode + cv::circle(img, cv::Point(x_coord, y_coord), radius, color_k, -1, cv::LINE_AA); + } + } + // draw lines + if (drawLines) { + for (int i = 0; i < skeleton.size(); i++) { + const std::vector &sk = skeleton[i]; + int idx1 = sk[0] - 1; + int idx2 = sk[1] - 1; + + int idx1_x_pos = idx1 * 3; + int idx2_x_pos = idx2 * 3; + + int x1 = static_cast(keypoint[idx1_x_pos]); + int y1 = static_cast(keypoint[idx1_x_pos + 1]); + int x2 = static_cast(keypoint[idx2_x_pos]); + int y2 = static_cast(keypoint[idx2_x_pos + 1]); + + float conf1 = keypoint[idx1_x_pos + 2]; + float conf2 = keypoint[idx2_x_pos + 2]; + + // Check confidence thresholds + if (conf1 < 0.5 || conf2 < 0.5) { + continue; + } + + // Check if positions are within bounds + if (x1 % raw_image_shape.width == 0 || y1 % raw_image_shape.height == 0 || x1 < 0 || y1 < 0 || + x2 % raw_image_shape.width == 0 || y2 % raw_image_shape.height == 0 || x2 < 0 || y2 < 0) { + continue; + } + + // Draw a line between keypoints + cv::Scalar color_limb = limbColorPalette[i]; + cv::line(img, cv::Point(x1, y1), cv::Point(x2, y2), color_limb, 2, cv::LINE_AA); + } + } + } + } + + // Combine the image and mask + addWeighted(img, 0.6, mask, 0.4, 0, img); +// resize(img, img, img.size()); +// resize(img, img, shape); +// // Show the image +// imshow("img", img); +// cv::waitKey(); +} + + + +int main() +{ + std::string img_path = "/home/xdobro23/path_to_16bit_3_channel_image.png"; + const std::string& modelPath = "/home/xdobro23/PP1/ultralytics/runs/detect/train35/weights/best.onnx"; // pose + + fs::path imageFilePath(img_path); + fs::path newFilePath = imageFilePath.stem(); + newFilePath += "-kpt-cpp"; + newFilePath += imageFilePath.extension(); + assert(newFilePath != imageFilePath); + std::cout << "newFilePath: " << newFilePath << std::endl; + + const std::string& onnx_provider = OnnxProviders::CPU; // "cpu"; + const std::string& onnx_logid = "yolov8_inference2"; + float mask_threshold = 0.5f; // in python it's 0.5 and you can see that at ultralytics/utils/ops.process_mask line 705 (ultralytics.__version__ == .160) + float conf_threshold = 0.30f; + float iou_threshold = 0.45f; // 0.70f; + int conversion_code = -1; + cv::Mat img = cv::imread(img_path, cv::IMREAD_UNCHANGED); + if (img.empty()) { + std::cerr << "Error: Unable to load image" << std::endl; + return 1; + } + AutoBackendOnnx model(modelPath.c_str(), onnx_logid.c_str(), onnx_provider.c_str()); + std::cout << "Model loaded" << std::endl; + std::vector objs = model.predict_once(img, conf_threshold, iou_threshold, mask_threshold); + std::cout << "Got results" << std::endl; + std::vector colors = generateRandomColors(model.getNc(), model.getCh()); + std::unordered_map names = model.getNames(); + + std::vector> keypointsVector; + for (const YoloResults& result : objs) { + keypointsVector.push_back(result.keypoints); + } + + cv::cvtColor(img, img, cv::COLOR_RGB2BGR); + cv::Size show_shape = img.size(); // cv::Size(1280, 720); // img.size() + plot_results(img, objs, colors, names, show_shape); +// plot_masks(img, objs, colors, names); + cv::imshow("img", img); + cv::waitKey(); + return -1; +} diff --git a/src/nn/autobackend.cpp b/src/nn/autobackend.cpp new file mode 100644 index 0000000..da48ae3 --- /dev/null +++ b/src/nn/autobackend.cpp @@ -0,0 +1,520 @@ +#pragma once + +#include "nn/autobackend.h" + +#include +#include +#include + +#include +#include +#include +#include + +#include 
"utils/augment.h" +#include "constants.h" +#include "utils/common.h" +#include "utils/ops.h" + + +namespace fs = std::filesystem; + + +AutoBackendOnnx::AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider, + const std::vector& imgsz, const int& stride, + const int& nc, const std::unordered_map names) + : OnnxModelBase(modelPath, logid, provider), imgsz_(imgsz), stride_(stride), nc_(nc), names_(names), + inputTensorShape_() +{ +} + +AutoBackendOnnx::AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider) + : OnnxModelBase(modelPath, logid, provider) { + // init metadata etc + OnnxModelBase(modelPath, logid, provider); + // then try to get additional info from metadata like imgsz, stride etc; + // ideally you should get all of them but you'll raise error if smth is not in metadata (or not under the appropriate keys) + const std::unordered_map& base_metadata = OnnxModelBase::getMetadata(); + + // post init imgsz + auto imgsz_iterator = base_metadata.find(MetadataConstants::IMGSZ); + if (imgsz_iterator != base_metadata.end()) { + // parse it and convert to int iterable + std::vector imgsz = convertStringVectorToInts(parseVectorString(imgsz_iterator->second)); + // set it here: + if (imgsz_.empty()) { + imgsz_ = imgsz; + } + } + else { + std::cerr << "Warning: Cannot get imgsz value from metadata" << std::endl; + } + + // post init stride + auto stride_item = base_metadata.find(MetadataConstants::STRIDE); + if (stride_item != base_metadata.end()) { + // parse it and convert to int iterable + int stide_int = std::stoi(stride_item->second); + // set it here: + if (stride_ == OnnxInitializers::UNINITIALIZED_STRIDE) { + stride_ = stide_int; + } + } + else { + std::cerr << "Warning: Cannot get stride value from metadata" << std::endl; + } + + // post init names + auto names_item = base_metadata.find(MetadataConstants::NAMES); + if (names_item != base_metadata.end()) { + // parse it and convert to int iterable + std::unordered_map names = parseNames(names_item->second); + std::cout << "***Names from metadata***" << std::endl; + for (const auto& pair : names) { + std::cout << "Key: " << pair.first << ", Value: " << pair.second << std::endl; + } + // set it here: + if (names_.empty()) { + names_ = names; + } + } + else { + std::cerr << "Warning: Cannot get names value from metadata" << std::endl; + } + + // post init number of classes - you can do that only and only if names_ is not empty and nc was not initialized previously + if (nc_ == OnnxInitializers::UNINITIALIZED_NC && !names_.empty()) { + nc_ = names_.size(); + } + else { + std::cerr << "Warning: Cannot get nc value from metadata (probably names wasn't set)" << std::endl; + } + + if (!imgsz_.empty() && inputTensorShape_.empty()) + { + inputTensorShape_ = { 1, ch_, getHeight(), getWidth() }; + } + + if (!imgsz_.empty()) + { + // Initialize cvSize_ using getHeight() and getWidth() + //cvSize_ = cv::MatSize() + cvSize_ = cv::Size(getWidth(), getHeight()); + //cvMatSize_ = cv::MatSize(cvSize_.width, cvSize_.height); + } + + // task init: + auto task_item = base_metadata.find(MetadataConstants::TASK); + if (task_item != base_metadata.end()) { + // parse it and convert to int iterable + std::string task = std::string(task_item->second); + // set it here: + if (task_.empty()) + { + task_ = task; + } + } + else { + std::cerr << "Warning: Cannot get task value from metadata" << std::endl; + } + + // TODO: raise assert if imgsz_ and task_ were not initialized (since you don't know in that case which 
postprocessing to use) + +} + + + +const std::vector& AutoBackendOnnx::getImgsz() { + return imgsz_; +} + +const int& AutoBackendOnnx::getHeight() +{ + return imgsz_[0]; +} + +const int& AutoBackendOnnx::getWidth() +{ + return imgsz_[1]; +} + +const int& AutoBackendOnnx::getStride() { + return stride_; +} + +const int& AutoBackendOnnx::getCh() { + return ch_; +} + +const int& AutoBackendOnnx::getNc() { + return nc_; +} + +const std::unordered_map& AutoBackendOnnx::getNames() { + return names_; +} + + +const cv::Size& AutoBackendOnnx::getCvSize() +{ + return cvSize_; +} + +const std::vector& AutoBackendOnnx::getInputTensorShape() +{ + return inputTensorShape_; +} + +const std::string& AutoBackendOnnx::getTask() +{ + return task_; +} + +std::vector AutoBackendOnnx::predict_once(const std::string& imagePath, float& conf, float& iou, float& mask_threshold, + int conversionCode, bool verbose) { + // Convert the string imagePath to an object of type std::filesystem::path + fs::path imageFilePath(imagePath); + // Call the predict_once method, converting the image to a cv::Mat + return predict_once(imageFilePath, conf, iou, mask_threshold, conversionCode); +} + +std::vector AutoBackendOnnx::predict_once(const fs::path& imagePath, float& conf, float& iou, float& mask_threshold, + int conversionCode, bool verbose) { + // Check if the specified path exists + if (!fs::exists(imagePath)) { + std::cerr << "Error: File does not exist: " << imagePath << std::endl; + // Return an empty vector or throw an exception, depending on your logic + return {}; + } + + // Load the image into a cv::Mat + cv::Mat image = cv::imread(imagePath.string(), cv::IMREAD_UNCHANGED); + + // Check if loading the image was successful + if (image.empty()) { + std::cerr << "Error: Failed to load image: " << imagePath << std::endl; + // Return an empty vector or throw an exception, depending on your logic + return {}; + } + + // now do some preprocessing based on channels info: + int required_image_channels = this->getCh(); + /*assert(required_image_channels == image.channels() && "");*/ + // Assert that the number of channels in the input image matches the required number of channels for the model + if (required_image_channels != image.channels()) { + const std::string& errorMessage = "Error: Number of image channels does not match the required channels.\n" + "Number of channels in the image: " + std::to_string(image.channels()); + throw std::runtime_error(errorMessage); + } + + // Call overloaded one + return predict_once(image, conf, iou, mask_threshold, conversionCode); +} + + +std::vector AutoBackendOnnx::predict_once(cv::Mat& image, float& conf, float& iou, float& mask_threshold, int conversionCode, bool verbose) { + double preprocess_time = 0.0; + double inference_time = 0.0; + double postprocess_time = 0.0; + Timer preprocess_timer = Timer(preprocess_time, verbose); + // 1. 
preprocess + float* blob = nullptr; + //double* blob = nullptr; + std::vector inputTensors; + if (conversionCode >= 0) { + cv::cvtColor(image, image, conversionCode); + } + std::vector inputTensorShape; + // TODO: for classify task preprocessed image will be different (!): + cv::Mat preprocessed_img; + cv::Size new_shape = cv::Size(getWidth(), getHeight()); + const bool& scaleFill = false; // false + const bool& auto_ = false; // true + letterbox(image, preprocessed_img, new_shape, cv::Scalar(), auto_, scaleFill, true, getStride()); + fill_blob(preprocessed_img, blob, inputTensorShape); + int64_t inputTensorSize = vector_product(inputTensorShape); + std::vector inputTensorValues(blob, blob + inputTensorSize); + + Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu( + OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); + + inputTensors.push_back(Ort::Value::CreateTensor( + memoryInfo, inputTensorValues.data(), inputTensorSize, + inputTensorShape.data(), inputTensorShape.size() + )); + preprocess_timer.Stop(); + Timer inference_timer = Timer(inference_time, verbose); + // 2. inference + std::vector outputTensors = forward(inputTensors); + inference_timer.Stop(); + Timer postprocess_timer = Timer(postprocess_time, verbose); + // create container for the results + std::vector results; + // 3. postprocess based on task: + std::unordered_map names = this->getNames(); + // 4. cleanup blob since it was created using the "new" keyword during the `fill_blob` func call + delete[] blob; + + int class_names_num = names.size(); + if (task_ == YoloTasks::SEGMENT) { + + // get outputs info + std::vector outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); + std::vector outputTensor1Shape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape(); + // get outputs + float* all_data0 = outputTensors[0].GetTensorMutableData(); + + cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t(); // [bs, features, preds_num]=>[bs, preds_num, features] + auto mask_shape = outputTensor1Shape; + std::vector mask_sz = { 1,(int)mask_shape[1],(int)mask_shape[2],(int)mask_shape[3] }; + cv::Mat output1 = cv::Mat(mask_sz, CV_32F, outputTensors[1].GetTensorMutableData()); + + int iw = this->getWidth(); + int ih = this->getHeight(); + int mask_features_num = outputTensor1Shape[1]; + int mh = outputTensor1Shape[2]; + int mw = outputTensor1Shape[3]; + ImageInfo img_info = { image.size() }; + postprocess_masks(output0, output1, img_info, results, class_names_num, conf, iou, + iw, ih, mw, mh, mask_features_num, mask_threshold); + } + else if (task_ == YoloTasks::DETECT) { + ImageInfo img_info = { image.size() }; + std::vector outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); + float* all_data0 = outputTensors[0].GetTensorMutableData(); + cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t(); // [bs, features, preds_num]=>[bs, preds_num, features] + postprocess_detects(output0, img_info, results, class_names_num, conf, iou); + } + else if (task_ == YoloTasks::POSE) { + ImageInfo image_info = { image.size() }; + std::vector outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); + float* all_data0 = outputTensors[0].GetTensorMutableData(); + cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t(); // [bs, features, preds_num]=>[bs, preds_num, features] + 
postprocess_kpts(output0, image_info, results, class_names_num, conf, iou); + } + else { + throw std::runtime_error("NotImplementedError: task: " + task_); + } + + postprocess_timer.Stop(); + if (verbose) { + std::cout << std::fixed << std::setprecision(1); + std::cout << "image: " << preprocessed_img.rows << "x" << preprocessed_img.cols << " " << results.size() << " objs, "; + std::cout << (preprocess_time + inference_time + postprocess_time) * 1000.0 << "ms" << std::endl; + std::cout << "Speed: " << (preprocess_time * 1000.0) << "ms preprocess, "; + std::cout << (inference_time * 1000.0) << "ms inference, "; + std::cout << (postprocess_time * 1000.0) << "ms postprocess per image "; + std::cout << "at shape (1, " << image.channels() << ", " << preprocessed_img.rows << ", " << preprocessed_img.cols << ")" << std::endl; + } + + return results; +} + + +void AutoBackendOnnx::postprocess_masks(cv::Mat& output0, cv::Mat& output1, ImageInfo image_info, std::vector& output, + int& class_names_num, float& conf_threshold, float& iou_threshold, + int& iw, int& ih, int& mw, int& mh, int& masks_features_num, float mask_threshold /* = 0.5f */) +{ + output.clear(); + std::vector class_ids; + std::vector confidences; + std::vector boxes; + std::vector> masks; + // 4 - your default number of rect parameters {x, y, w, h} + int data_width = class_names_num + 4 + masks_features_num; + int rows = output0.rows; + float* pdata = (float*)output0.data; + for (int r = 0; r < rows; ++r) + { + cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4); + cv::Point class_id; + double max_conf; + minMaxLoc(scores, 0, &max_conf, 0, &class_id); + if (max_conf > conf_threshold) + { + masks.push_back(std::vector(pdata + 4 + class_names_num, pdata + data_width)); + class_ids.push_back(class_id.x); + confidences.push_back(max_conf); + + float out_w = pdata[2]; + float out_h = pdata[3]; + float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0); + float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0); + cv::Rect_ bbox = cv::Rect(out_left, out_top, (out_w + 0.5), (out_h + 0.5)); + cv::Rect_ scaled_bbox = scale_boxes(getCvSize(), bbox, image_info.raw_size); + boxes.push_back(scaled_bbox); + } + pdata += data_width; // next pred + } + + // + //float masks_threshold = 0.50; + //int top_k = 500; + //const float& nmsde_eta = 1.0f; + std::vector nms_result; + cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result); // , nms_eta, top_k); + + // select all of the protos tensor + cv::Size downsampled_size = cv::Size(mw, mh); + std::vector roi_rangs = { cv::Range(0, 1), cv::Range::all(), + cv::Range(0, downsampled_size.height), cv::Range(0, downsampled_size.width) }; + cv::Mat temp_mask = output1(roi_rangs).clone(); + cv::Mat proto = temp_mask.reshape(0, { masks_features_num, downsampled_size.width * downsampled_size.height }); + + for (int i = 0; i < nms_result.size(); ++i) + { + int idx = nms_result[i]; + boxes[idx] = boxes[idx] & cv::Rect(0, 0, image_info.raw_size.width, image_info.raw_size.height); + YoloResults result = { class_ids[idx] ,confidences[idx] ,boxes[idx] }; + _get_mask2(cv::Mat(masks[idx]).t(), proto, image_info, boxes[idx], result.mask, mask_threshold, + iw, ih, mw, mh, masks_features_num); + output.push_back(result); + } +} + + +void AutoBackendOnnx::postprocess_detects(cv::Mat& output0, ImageInfo image_info, std::vector& output, + int& class_names_num, float& conf_threshold, float& iou_threshold) +{ + output.clear(); + std::vector class_ids; + std::vector confidences; + std::vector boxes; + 
std::vector<std::vector<float>> masks;
+    // 4 - your default number of rect parameters {x, y, w, h}
+    int data_width = class_names_num + 4;
+    int rows = output0.rows;
+    float* pdata = (float*)output0.data;
+
+    for (int r = 0; r < rows; ++r)
+    {
+        cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4);
+        cv::Point class_id;
+        double max_conf;
+        minMaxLoc(scores, nullptr, &max_conf, nullptr, &class_id);
+
+        if (max_conf > conf_threshold)
+        {
+            masks.emplace_back(pdata + 4 + class_names_num, pdata + data_width);
+            class_ids.push_back(class_id.x);
+            confidences.push_back((float) max_conf);
+
+            float out_w = pdata[2];
+            float out_h = pdata[3];
+            float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0);
+            float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0);
+
+            cv::Rect_<float> bbox = cv::Rect_<float>(out_left, out_top, (out_w + 0.5), (out_h + 0.5));
+            cv::Rect_<float> scaled_bbox = scale_boxes(getCvSize(), bbox, image_info.raw_size);
+
+            boxes.push_back(scaled_bbox);
+        }
+        pdata += data_width;  // next pred
+    }
+
+    std::vector<int> nms_result;
+    cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result);  // , nms_eta, top_k);
+    for (int idx : nms_result)
+    {
+        boxes[idx] = boxes[idx] & cv::Rect(0, 0, image_info.raw_size.width, image_info.raw_size.height);
+        YoloResults result = { class_ids[idx], confidences[idx], boxes[idx] };
+        output.push_back(result);
+    }
+}
+
+void AutoBackendOnnx::postprocess_kpts(cv::Mat& output0, ImageInfo& image_info, std::vector<YoloResults>& output,
+                                       int& class_names_num, float& conf_threshold, float& iou_threshold)
+{
+    std::vector<cv::Rect> boxes;
+    std::vector<float> confidences;
+    std::vector<int> class_ids;
+    std::vector<std::vector<float>> rest;
+    std::tie(boxes, confidences, class_ids, rest) = non_max_suppression(output0, class_names_num, output0.cols, conf_threshold, iou_threshold);
+    cv::Size img1_shape = getCvSize();
+    auto bound_bbox = cv::Rect_<float>(0, 0, image_info.raw_size.width, image_info.raw_size.height);
+    for (int i = 0; i < boxes.size(); i++) {
+        // pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], shape).round()
+        // pred_kpts = pred[:, 6:].view(len(pred), *self.model.kpt_shape) if len(pred) else pred[:, 6:]
+        // pred_kpts = ops.scale_coords(img.shape[2:], pred_kpts, shape)
+        // path = self.batch[0]
+        // img_path = path[i] if isinstance(path, list) else path
+        // results.append(
+        //     Results(orig_img=orig_img,
+        //             path=img_path,
+        //             names=self.model.names,
+        //             boxes=pred[:, :6],
+        //             keypoints=pred_kpts))
+        cv::Rect_<float> bbox = boxes[i];
+        auto scaled_bbox = scale_boxes(img1_shape, bbox, image_info.raw_size);
+        scaled_bbox = scaled_bbox & bound_bbox;
+//        cv::Mat kpt = cv::Mat(rest[i]).t();
+//        scale_coords(img1_shape, kpt, image_info.raw_size);
+        // TODO: overload scale_coords so that it will accept cv::Mat of shape [17, 3]
+        //  so that it will be more similar to what we have in python
+        std::vector<float> kpt = scale_coords(img1_shape, rest[i], image_info.raw_size);
+        YoloResults tmp_res = { class_ids[i], confidences[i], scaled_bbox, {}, kpt };
+        output.push_back(tmp_res);
+    }
+}
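
For the pose task, `rest[i]` (and therefore `YoloResults::keypoints`) is the flattened [17 x 3] tail of each prediction row: consecutive (x, y, confidence) triples, the layout that `plot_keypoints` above indexes as `keypoint[i * 3 + k]`. A minimal standalone sketch of unpacking it (function name hypothetical):

// Illustrative only: unpack the flattened [17 x 3] keypoint layout
// stored in YoloResults::keypoints as (x, y, confidence) triples.
#include <cstdio>
#include <vector>

void print_kpts_example(const std::vector<float>& keypoints) {
    for (int i = 0; i + 2 < (int)keypoints.size(); i += 3) {
        float x = keypoints[i], y = keypoints[i + 1], conf = keypoints[i + 2];
        if (conf >= 0.5f)  // same threshold the plotting code uses
            std::printf("kpt %d: (%.1f, %.1f) conf=%.2f\n", i / 3, x, y, conf);
    }
}
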
+
+void AutoBackendOnnx::_get_mask2(const cv::Mat& masks_features,
+                                 const cv::Mat& proto,
+                                 const ImageInfo& image_info, const cv::Rect bound, cv::Mat& mask_out,
+                                 float& mask_thresh, int& iw, int& ih, int& mw, int& mh, int& masks_features_num,
+                                 bool round_downsampled)
+
+{
+    cv::Size img0_shape = image_info.raw_size;
+    cv::Size img1_shape = cv::Size(iw, ih);
+    cv::Size downsampled_size = cv::Size(mw, mh);
+
+    cv::Rect_<float> bound_float(
+        static_cast<float>(bound.x),
+        static_cast<float>(bound.y),
+        static_cast<float>(bound.width),
+        static_cast<float>(bound.height)
+    );
+
+    cv::Rect_<float> downsampled_bbox = scale_boxes(img0_shape, bound_float, downsampled_size);
+    cv::Size bound_size = cv::Size(mw, mh);
+    clip_boxes(downsampled_bbox, bound_size);
+
+    cv::Mat matmul_res = (masks_features * proto).t();
+    matmul_res = matmul_res.reshape(1, { downsampled_size.height, downsampled_size.width });
+    // apply sigmoid to the mask:
+    cv::Mat sigmoid_mask;
+    exp(-matmul_res, sigmoid_mask);
+    sigmoid_mask = 1.0 / (1.0 + sigmoid_mask);
+    cv::Mat resized_mask;
+    cv::Rect_<float> input_bbox = scale_boxes(img0_shape, bound_float, img1_shape);
+    cv::resize(sigmoid_mask, resized_mask, img1_shape, 0, 0, cv::INTER_LANCZOS4);
+    cv::Mat pre_out_mask = resized_mask(input_bbox);
+    cv::Mat scaled_mask;
+    scale_image2(scaled_mask, resized_mask, img0_shape);
+    cv::resize(scaled_mask, mask_out, img0_shape);
+    mask_out = mask_out(bound) > mask_thresh;
+}
+
+
+void AutoBackendOnnx::fill_blob(cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape) {
+
+    cv::Mat floatImage;
+    if (inputTensorShape.empty())
+    {
+        inputTensorShape = getInputTensorShape();
+    }
+    int inputChannelsNum = inputTensorShape[1];
+    int rtype = CV_32FC3;
+    image.convertTo(floatImage, rtype, 1.0f / 65535.0);  // normalize 16-bit input to [0, 1] (65535 = 2^16 - 1)
+    blob = new float[floatImage.cols * floatImage.rows * floatImage.channels()];
+    cv::Size floatImageSize{ floatImage.cols, floatImage.rows };
+
+    // hwc -> chw
+    std::vector<cv::Mat> chw(floatImage.channels());
+    for (int i = 0; i < floatImage.channels(); ++i)
+    {
+        chw[i] = cv::Mat(floatImageSize, CV_32FC1, blob + i * floatImageSize.width * floatImageSize.height);
+    }
+    cv::split(floatImage, chw);
+}
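
`fill_blob` writes the tensor in planar CHW order: channel i occupies one contiguous width*height block of the blob, and `cv::split` scatters the interleaved HWC pixels into those planes. A minimal sketch of the index arithmetic behind that layout (pure illustration, not code from this commit):

// Illustrative only: index of pixel (x, y) in channel c of a CHW float blob.
#include <cstddef>

std::size_t chw_index(int c, int y, int x, int width, int height) {
    return static_cast<std::size_t>(c) * width * height
         + static_cast<std::size_t>(y) * width
         + static_cast<std::size_t>(x);
}
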
diff --git a/src/nn/onnx_model_base.cpp b/src/nn/onnx_model_base.cpp
new file mode 100644
index 0000000..e6c579b
--- /dev/null
+++ b/src/nn/onnx_model_base.cpp
@@ -0,0 +1,181 @@
+#include "nn/onnx_model_base.h"
+
+#include <algorithm>
+#include <iostream>
+#include <string>
+
+#include "constants.h"
+#include "utils/common.h"
+
+
+/**
+ * @brief Base class for any onnx model regarding the target.
+ *
+ * Wraps OrtApi.
+ *
+ * The caller provides a model path, logid, and provider.
+ *
+ * See the output logs for more information on warnings/errors that occur while processing the model.
+ *
+ * @param[in] modelPath Path to the model file.
+ * @param[in] logid Log identifier.
+ * @param[in] provider Provider (e.g., "CPU" or "CUDA"). (NOTE: for now only CPU is supported)
+ */
+
+OnnxModelBase::OnnxModelBase(const char* modelPath, const char* logid, const char* provider)
+//    : modelPath_(modelPath), env(std::move(env)), session(std::move(session))
+    : modelPath_(modelPath)
+{
+
+    // TODO: too bad passing `ORT_LOGGING_LEVEL_WARNING` by default - for some cases
+    //  info level would make sense too
+    env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, logid);
+    Ort::SessionOptions sessionOptions = Ort::SessionOptions();
+
+    std::vector<std::string> availableProviders = Ort::GetAvailableProviders();
+    auto cudaAvailable = std::find(availableProviders.begin(), availableProviders.end(), "CUDAExecutionProvider");
+    OrtCUDAProviderOptions cudaOption;
+
+    if (std::string(provider) == OnnxProviders::CUDA) {  // compare by value: `provider == ....c_str()` would compare pointers
+        if (cudaAvailable == availableProviders.end()) {
+            std::cout << "CUDA is not supported by your ONNXRuntime build. Fallback to CPU." << std::endl;
+            //std::cout << "Inference device: CPU" << std::endl;
+        }
+        else {
+            //std::cout << "Inference device: GPU" << std::endl;
+            sessionOptions.AppendExecutionProvider_CUDA(cudaOption);
+        }
+    }
+
+    else if (std::string(provider) == OnnxProviders::CPU) {
+        // "cpu" by default
+    }
+    else
+    {
+        throw std::runtime_error("NotImplemented provider=" + std::string(provider));
+    }
+
+    std::cout << "Inference device: " << std::string(provider) << std::endl;
+#ifdef _WIN32
+    auto modelPathW = get_win_path(modelPath);  // For Windows (wstring)
+    session = Ort::Session(env, modelPathW.c_str(), sessionOptions);
+#else
+    session = Ort::Session(env, modelPath, sessionOptions);  // For Linux (string)
+#endif
+    //session = Ort::Session(env)
+    // https://github.com/microsoft/onnxruntime/issues/14157
+    // ----------------
+    // init input names
+    std::vector<Ort::AllocatedStringPtr> inputNodeNameAllocatedStrings;  // <-- newly added
+    Ort::AllocatorWithDefaultOptions allocator;
+    auto inputNodesNum = session.GetInputCount();
+    for (int i = 0; i < inputNodesNum; i++) {
+        auto input_name = session.GetInputNameAllocated(i, allocator);
+        inputNodeNameAllocatedStrings.push_back(std::move(input_name));
+        inputNodeNames.push_back(inputNodeNameAllocatedStrings.back().get());
+    }
+    // -----------------
+    // init output names
+    auto outputNodesNum = session.GetOutputCount();
+    std::vector<Ort::AllocatedStringPtr> outputNodeNameAllocatedStrings;  // <-- newly added
+    Ort::AllocatorWithDefaultOptions output_names_allocator;
+    for (int i = 0; i < outputNodesNum; i++)
+    {
+        auto output_name = session.GetOutputNameAllocated(i, output_names_allocator);
+        outputNodeNameAllocatedStrings.push_back(std::move(output_name));
+        outputNodeNames.push_back(outputNodeNameAllocatedStrings.back().get());
+    }
+    // -------------------------
+    // initialize model metadata
+    model_metadata = session.GetModelMetadata();
+    Ort::AllocatorWithDefaultOptions metadata_allocator;
+
+    std::vector<Ort::AllocatedStringPtr> metadataAllocatedKeys = model_metadata.GetCustomMetadataMapKeysAllocated(metadata_allocator);
+    std::vector<std::string> metadata_keys;
+    metadata_keys.reserve(metadataAllocatedKeys.size());
+
+    for (const Ort::AllocatedStringPtr& allocatedString : metadataAllocatedKeys) {
+        metadata_keys.emplace_back(allocatedString.get());
+    }
+
+    // -------------------------
+    // initialize metadata as the dict
+    // even though we know exactly what metadata we intend to use
+    // base onnx class should not have any ultralytics yolo-specific attributes like stride, task etc, so keep it clean as much as possible
+    for (const std::string& key : metadata_keys) {
+        Ort::AllocatedStringPtr metadata_value = model_metadata.LookupCustomMetadataMapAllocated(key.c_str(), metadata_allocator);
+        if (metadata_value != nullptr) {
+            auto raw_metadata_value = metadata_value.get();
+            metadata[key] = std::string(raw_metadata_value);
+        }
+    }
+
+    // initialize cstr
+    for (const std::string& name : outputNodeNames) {
+        outputNamesCStr.push_back(name.c_str());
+    }
+
+    for (const std::string& name : inputNodeNames)
+    {
+        inputNamesCStr.push_back(name.c_str());
+    }
+
+}
+
+const std::vector<std::string>& OnnxModelBase::getInputNames() {
+    return inputNodeNames;
+}
+
+const std::vector<std::string>& OnnxModelBase::getOutputNames() {
+    return outputNodeNames;
+}
+
+const Ort::ModelMetadata& OnnxModelBase::getModelMetadata()
+{
+    return model_metadata;
+}
+
+const std::unordered_map<std::string, std::string>& OnnxModelBase::getMetadata()
+{
+    return
metadata; +} + + +const Ort::Session& OnnxModelBase::getSession() +{ + return session; +} + +const char* OnnxModelBase::getModelPath() +{ + return modelPath_; +} + +const std::vector OnnxModelBase::getOutputNamesCStr() +{ + return outputNamesCStr; +} + +const std::vector OnnxModelBase::getInputNamesCStr() +{ + return inputNamesCStr; +} + +std::vector OnnxModelBase::forward(std::vector& inputTensors) +{ + // todo: make runOptions parameter here + + return session.Run(Ort::RunOptions{ nullptr }, + inputNamesCStr.data(), + inputTensors.data(), + inputNamesCStr.size(), + outputNamesCStr.data(), + outputNamesCStr.size()); +} + +//OnnxModelBase::~OnnxModelBase() { +// // empty body +//} diff --git a/src/utils/augment.cpp b/src/utils/augment.cpp new file mode 100644 index 0000000..7a3e064 --- /dev/null +++ b/src/utils/augment.cpp @@ -0,0 +1,145 @@ +#include +#include +#include +#include + + +/** + * \brief padding value when letterbox changes image size ratio + */ +const int& DEFAULT_LETTERBOX_PAD_VALUE = 114; + + +void letterbox(const cv::Mat& image, + cv::Mat& outImage, + const cv::Size& newShape, + cv::Scalar_ color, + bool auto_, + bool scaleFill, + bool scaleUp, int stride +) { + cv::Size shape = image.size(); + float r = std::min(static_cast(newShape.height) / static_cast(shape.height), + static_cast(newShape.width) / static_cast(shape.width)); + if (!scaleUp) + r = std::min(r, 1.0f); + + float ratio[2]{ r, r }; + int newUnpad[2]{ static_cast(std::round(static_cast(shape.width) * r)), + static_cast(std::round(static_cast(shape.height) * r)) }; + + auto dw = static_cast(newShape.width - newUnpad[0]); + auto dh = static_cast(newShape.height - newUnpad[1]); + + if (auto_) + { + dw = static_cast((static_cast(dw) % stride)); + dh = static_cast((static_cast(dh) % stride)); + } + else if (scaleFill) + { + dw = 0.0f; + dh = 0.0f; + newUnpad[0] = newShape.width; + newUnpad[1] = newShape.height; + ratio[0] = static_cast(newShape.width) / static_cast(shape.width); + ratio[1] = static_cast(newShape.height) / static_cast(shape.height); + } + + dw /= 2.0f; + dh /= 2.0f; + + //cv::Mat outImage; + if (shape.width != newUnpad[0] || shape.height != newUnpad[1]) + { + cv::resize(image, outImage, cv::Size(newUnpad[0], newUnpad[1])); + } + else + { + outImage = image.clone(); + } + + int top = static_cast(std::round(dh - 0.1f)); + int bottom = static_cast(std::round(dh + 0.1f)); + int left = static_cast(std::round(dw - 0.1f)); + int right = static_cast(std::round(dw + 0.1f)); + + + if (color == cv::Scalar()) { + color = cv::Scalar(DEFAULT_LETTERBOX_PAD_VALUE, DEFAULT_LETTERBOX_PAD_VALUE, DEFAULT_LETTERBOX_PAD_VALUE); + } + + cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color); + +} + +cv::Mat scale_image(const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair& ratio_pad) { + cv::Size im1_shape = resized_mask.size(); + + // Check if resizing is needed + if (im1_shape == im0_shape) { + return resized_mask.clone(); + } + + float gain, pad_x, pad_y; + + if (ratio_pad.first < 0.0f) { + gain = std::min(static_cast(im1_shape.height) / static_cast(im0_shape.height), + static_cast(im1_shape.width) / static_cast(im0_shape.width)); + pad_x = (im1_shape.width - im0_shape.width * gain) / 2.0f; + pad_y = (im1_shape.height - im0_shape.height * gain) / 2.0f; + } + else { + gain = ratio_pad.first; + pad_x = ratio_pad.second.x; + pad_y = ratio_pad.second.y; + } + + int top = static_cast(pad_y); + int left = static_cast(pad_x); + int bottom = static_cast(im1_shape.height - 
pad_y); + int right = static_cast(im1_shape.width - pad_x); + + // Clip and resize the mask + cv::Rect clipped_rect(left, top, right - left, bottom - top); + cv::Mat clipped_mask = resized_mask(clipped_rect); + cv::Mat scaled_mask; + cv::resize(clipped_mask, scaled_mask, im0_shape); + + return scaled_mask; +} + + +void scale_image2(cv::Mat& scaled_mask, const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair& ratio_pad) { + cv::Size im1_shape = resized_mask.size(); + + // Check if resizing is needed + if (im1_shape == im0_shape) { + scaled_mask = resized_mask.clone(); + return; + } + + float gain, pad_x, pad_y; + + if (ratio_pad.first < 0.0f) { + gain = std::min(static_cast(im1_shape.height) / static_cast(im0_shape.height), + static_cast(im1_shape.width) / static_cast(im0_shape.width)); + pad_x = (im1_shape.width - im0_shape.width * gain) / 2.0f; + pad_y = (im1_shape.height - im0_shape.height * gain) / 2.0f; + } + else { + gain = ratio_pad.first; + pad_x = ratio_pad.second.x; + pad_y = ratio_pad.second.y; + } + + int top = static_cast(pad_y); + int left = static_cast(pad_x); + int bottom = static_cast(im1_shape.height - pad_y); + int right = static_cast(im1_shape.width - pad_x); + + // Clip and resize the mask + cv::Rect clipped_rect(left, top, right - left, bottom - top); + cv::Mat clipped_mask = resized_mask(clipped_rect); + cv::resize(clipped_mask, scaled_mask, im0_shape); +} diff --git a/src/utils/common.cpp b/src/utils/common.cpp new file mode 100644 index 0000000..d25388e --- /dev/null +++ b/src/utils/common.cpp @@ -0,0 +1,147 @@ +#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING +#include +#include +#include +#include "utils/common.h" + +#include +//#include + +#include +#include + +#include + +#include +#include + + + +Timer::Timer(double& accumulator, bool isEnabled) + : accumulator(accumulator), isEnabled(isEnabled) { + if (isEnabled) { + start = std::chrono::high_resolution_clock::now(); + } +} + +// Stop the timer and update the accumulator +void Timer::Stop() { + if (isEnabled) { + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - start).count(); + accumulator += duration; + } +} + +// С++ 14 version +//#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING +std::wstring get_win_path(const std::string& modelPath) { +#ifdef _WIN32 + return std::wstring_convert>().from_bytes(modelPath); +#else + // return modelPath; + return std::wstring(modelPath.begin(), modelPath.end()); +#endif +} + + +std::vector parseVectorString(const std::string& input) { + /* Main purpose of this function is to parse `imgsz` key value of model metadata + * and from [height, width] get height, width values in the vector of strings + * Args: + * input: + * expected to be something like [544, 960] or [3,544, 960] + * output: + * iterable of strings, representing integers + */ + std::regex number_pattern(R"(\d+)"); + + std::vector result; + std::sregex_iterator it(input.begin(), input.end(), number_pattern); + std::sregex_iterator end; + + while (it != end) { + result.push_back(it->str()); + ++it; + } + + return result; +} + +std::vector convertStringVectorToInts(const std::vector& input) { + std::vector result; + + for (const std::string& str : input) { + try { + int value = std::stoi(str); + result.push_back(value); + } + catch (const std::invalid_argument& e) { + // raise explicit exception + throw std::invalid_argument("Bad argument (cannot cast): value=" + str); + } + catch (const std::out_of_range& e) { + // check bounds + 
throw std::out_of_range("Value out of range: " + str); + } + } + + return result; +} + + +/* +std::unordered_map parseNames(const std::string& input) { + std::unordered_map result; + + std::string cleanedInput = input; + boost::erase_all(cleanedInput, "{"); + boost::erase_all(cleanedInput, "}"); + + std::vector elements; + boost::split(elements, cleanedInput, boost::is_any_of(",")); + + for (const std::string& element : elements) { + std::vector keyValue; + boost::split(keyValue, element, boost::is_any_of(":")); + + if (keyValue.size() == 2) { + int key = std::stoi(boost::trim_copy(keyValue[0])); + std::string value = boost::trim_copy(keyValue[1]); + + result[key] = value; + } + } + + return result; +} +*/ + +std::unordered_map parseNames(const std::string& input) { + std::unordered_map result; + + std::string cleanedInput = input; + cleanedInput.erase(std::remove(cleanedInput.begin(), cleanedInput.end(), '{'), cleanedInput.end()); + cleanedInput.erase(std::remove(cleanedInput.begin(), cleanedInput.end(), '}'), cleanedInput.end()); + + std::istringstream elementStream(cleanedInput); + std::string element; + while (std::getline(elementStream, element, ',')) { + std::istringstream keyValueStream(element); + std::string keyStr, value; + if (std::getline(keyValueStream, keyStr, ':') && std::getline(keyValueStream, value)) { + int key = std::stoi(keyStr); + result[key] = value; + } + } + + return result; +} + +int64_t vector_product(const std::vector& vec) { + int64_t result = 1; + for (int64_t value : vec) { + result *= value; + } + return result; +} diff --git a/src/utils/ops.cpp b/src/utils/ops.cpp new file mode 100644 index 0000000..9110565 --- /dev/null +++ b/src/utils/ops.cpp @@ -0,0 +1,217 @@ +#pragma once + +#include +//#include +//#include + +#include +#include +#include +#include + + + +void clip_boxes(cv::Rect& box, const cv::Size& shape) { + box.x = std::max(0, std::min(box.x, shape.width)); + box.y = std::max(0, std::min(box.y, shape.height)); + box.width = std::max(0, std::min(box.width, shape.width - box.x)); + box.height = std::max(0, std::min(box.height, shape.height - box.y)); +} + +void clip_boxes(cv::Rect_& box, const cv::Size& shape) { + box.x = std::max(0.0f, std::min(box.x, static_cast(shape.width))); + box.y = std::max(0.0f, std::min(box.y, static_cast(shape.height))); + box.width = std::max(0.0f, std::min(box.width, static_cast(shape.width - box.x))); + box.height = std::max(0.0f, std::min(box.height, static_cast(shape.height - box.y))); +} + + +void clip_boxes(std::vector& boxes, const cv::Size& shape) { + for (cv::Rect& box : boxes) { + clip_boxes(box, shape); + } +} + +void clip_boxes(std::vector>& boxes, const cv::Size& shape) { + for (cv::Rect_& box : boxes) { + clip_boxes(box, shape); + } +} + +// source: ultralytics/utils/ops.py scale_boxes lines 99+ (ultralytics==8.0.160) +cv::Rect_ scale_boxes(const cv::Size& img1_shape, cv::Rect_& box, const cv::Size& img0_shape, + std::pair ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)), bool padding = true) { + + float gain, pad_x, pad_y; + + if (ratio_pad.first < 0.0f) { + gain = std::min(static_cast(img1_shape.height) / static_cast(img0_shape.height), + static_cast(img1_shape.width) / static_cast(img0_shape.width)); + pad_x = roundf((img1_shape.width - img0_shape.width * gain) / 2.0f - 0.1f); + pad_y = roundf((img1_shape.height - img0_shape.height * gain) / 2.0f - 0.1f); + } + else { + gain = ratio_pad.first; + pad_x = ratio_pad.second.x; + pad_y = ratio_pad.second.y; + } + + //cv::Rect scaledCoords(box); + 
+
+// source: ultralytics/utils/ops.py scale_boxes lines 99+ (ultralytics==8.0.160)
+cv::Rect_<float> scale_boxes(const cv::Size& img1_shape, cv::Rect_<float>& box, const cv::Size& img0_shape,
+                             std::pair<float, cv::Point2f> ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)),
+                             bool padding = true) {
+
+    float gain, pad_x, pad_y;
+
+    if (ratio_pad.first < 0.0f) {
+        gain = std::min(static_cast<float>(img1_shape.height) / static_cast<float>(img0_shape.height),
+                        static_cast<float>(img1_shape.width) / static_cast<float>(img0_shape.width));
+        pad_x = roundf((img1_shape.width - img0_shape.width * gain) / 2.0f - 0.1f);
+        pad_y = roundf((img1_shape.height - img0_shape.height * gain) / 2.0f - 0.1f);
+    }
+    else {
+        gain = ratio_pad.first;
+        pad_x = ratio_pad.second.x;
+        pad_y = ratio_pad.second.y;
+    }
+
+    //cv::Rect scaledCoords(box);
+    cv::Rect_<float> scaledCoords(box);
+
+    if (padding) {
+        scaledCoords.x -= pad_x;
+        scaledCoords.y -= pad_y;
+    }
+
+    scaledCoords.x /= gain;
+    scaledCoords.y /= gain;
+    scaledCoords.width /= gain;
+    scaledCoords.height /= gain;
+
+    // Clip the box to the bounds of the image
+    clip_boxes(scaledCoords, img0_shape);
+
+    return scaledCoords;
+}
+
+
+//void clip_coords(cv::Mat& coords, const cv::Size& shape) {
+//    // Clip x- and y-coordinates to the image bounds
+//    cv::Mat xCoords = coords.col(0);
+//    cv::Mat yCoords = coords.col(1);
+//
+//    for (int i = 0; i < coords.rows; ++i) {
+//        xCoords.at<float>(i) = std::max(std::min(xCoords.at<float>(i), static_cast<float>(shape.width - 1)), 0.0f);
+//        yCoords.at<float>(i) = std::max(std::min(yCoords.at<float>(i), static_cast<float>(shape.height - 1)), 0.0f);
+//    }
+//}
+
+void clip_coords(std::vector<float>& coords, const cv::Size& shape) {
+    // Assuming coords is a flattened [1, 17, 3] array of (x, y, conf) triplets
+    for (size_t i = 0; i < coords.size(); i += 3) {
+        coords[i] = std::min(std::max(coords[i], 0.0f), static_cast<float>(shape.width - 1));          // x
+        coords[i + 1] = std::min(std::max(coords[i + 1], 0.0f), static_cast<float>(shape.height - 1)); // y
+    }
+}
+
+// source: ultralytics/utils/ops.py scale_coords lines 753+ (ultralytics==8.0.160)
+//cv::Mat scale_coords(const cv::Size& img1_shape, cv::Mat& coords, const cv::Size& img0_shape)
+//cv::Mat scale_coords(const cv::Size& img1_shape, std::vector<float> coords, const cv::Size& img0_shape)
+std::vector<float> scale_coords(const cv::Size& img1_shape, std::vector<float>& coords, const cv::Size& img0_shape)
+{
+//  cv::Mat scaledCoords = coords.clone();
+    std::vector<float> scaledCoords = coords;
+
+    // Calculate gain and padding
+    double gain = std::min(static_cast<double>(img1_shape.width) / img0_shape.width,
+                           static_cast<double>(img1_shape.height) / img0_shape.height);
+    cv::Point2d pad((img1_shape.width - img0_shape.width * gain) / 2, (img1_shape.height - img0_shape.height * gain) / 2);
+
+    // Remove padding; coords is a flattened array of (x, y, conf) triplets, e.g. shape [1, 17, 3]
+//  scaledCoords.col(0) = (scaledCoords.col(0) - pad.x);
+//  scaledCoords.col(1) = (scaledCoords.col(1) - pad.y);
+    for (size_t i = 0; i < scaledCoords.size(); i += 3) {
+        scaledCoords[i] -= pad.x;      // x padding
+        scaledCoords[i + 1] -= pad.y;  // y padding
+    }
+
+    // Scale coordinates
+//  scaledCoords.col(0) /= gain;
+//  scaledCoords.col(1) /= gain;
+    for (size_t i = 0; i < scaledCoords.size(); i += 3) {
+        scaledCoords[i] /= gain;
+        scaledCoords[i + 1] /= gain;
+    }
+
+    clip_coords(scaledCoords, img0_shape);
+    return scaledCoords;
+}
+
+
+cv::Mat crop_mask(const cv::Mat& mask, const cv::Rect& box) {
+    int h = mask.rows;
+    int w = mask.cols;
+
+    int x1 = box.x;
+    int y1 = box.y;
+    int x2 = box.x + box.width;
+    int y2 = box.y + box.height;
+
+    // Zero out everything outside the box; only pixels inside [x1, x2) x [y1, y2) are kept
+    cv::Mat cropped_mask = cv::Mat::zeros(h, w, mask.type());
+
+    for (int r = 0; r < h; ++r) {
+        for (int c = 0; c < w; ++c) {
+            if (r >= y1 && r < y2 && c >= x1 && c < x2) {
+                cropped_mask.at<float>(r, c) = mask.at<float>(r, c);
+            }
+        }
+    }
+
+    return cropped_mask;
+}
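
// Editor's sketch (not part of this commit): mapping a detection from the 640x640
// letterboxed network input back onto a 1920x1080 source frame with scale_boxes().
// All sizes and coordinates here are hypothetical.
static void demo_scale_boxes() {
    cv::Size net_input(640, 640);    // img1_shape: letterboxed model input
    cv::Size original(1920, 1080);   // img0_shape: source frame
    cv::Rect_<float> box(100.0f, 200.0f, 50.0f, 80.0f);  // detection in net-input coords
    // gain = min(640/1080, 640/1920) = 1/3 and pad = (0, 140), so the result
    // is approximately (300, 180, 150, 240) in source-frame coordinates.
    cv::Rect_<float> restored = scale_boxes(net_input, box, original);
    (void)restored;
}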
+
+
+//std::tuple<std::vector<cv::Rect_<float>>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
+std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
+non_max_suppression(const cv::Mat& output0, int class_names_num, int data_width, double conf_threshold,
+                    float iou_threshold) {
+
+    std::vector<int> class_ids;
+    std::vector<float> confidences;
+//  std::vector<cv::Rect_<float>> boxes;
+    std::vector<cv::Rect> boxes;
+    std::vector<std::vector<float>> rest;
+
+    // Layout of one prediction row: [cx, cy, w, h, class scores..., extra features (e.g. mask coefficients)]
+    int rest_start_pos = class_names_num + 4;
+    int rest_features = data_width - rest_start_pos;
+//  int data_width = rest_start_pos + total_features_num;
+
+    int rows = output0.rows;
+    float* pdata = (float*) output0.data;
+
+    for (int r = 0; r < rows; ++r) {
+        // Class scores start right after the 4 box coordinates
+        cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4);
+        cv::Point class_id;
+        double max_conf;
+        minMaxLoc(scores, nullptr, &max_conf, nullptr, &class_id);
+
+        if (max_conf > conf_threshold) {
+            // std::vector<float> mask_data(pdata + 4 + class_names_num, pdata + data_width); // unused; superseded by rest_data below
+            class_ids.push_back(class_id.x);
+            confidences.push_back((float) max_conf);
+
+            // Convert (cx, cy, w, h) to a top-left-anchored box, rounding to the nearest pixel
+            float out_w = pdata[2];
+            float out_h = pdata[3];
+            float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0);
+            float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0);
+            cv::Rect_<float> bbox(out_left, out_top, (out_w + 0.5), (out_h + 0.5));
+            boxes.push_back(bbox);
+            if (rest_features > 0) {
+                std::vector<float> rest_data(pdata + rest_start_pos, pdata + data_width);
+                rest.push_back(rest_data);
+            }
+        }
+        pdata += data_width; // next prediction
+    }
+
+    //float masks_threshold = 0.50;
+    //int top_k = 500;
+    //const float& nms_eta = 1.0f;
+    std::vector<int> nms_result;
+    cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result); // , nms_eta, top_k);
+
+    std::vector<int> nms_class_ids;
+    std::vector<float> nms_confidences;
+    std::vector<cv::Rect> nms_boxes;
+    std::vector<std::vector<float>> nms_rest;
+    for (int idx : nms_result) {
+        nms_class_ids.push_back(class_ids[idx]);
+        nms_confidences.push_back(confidences[idx]);
+        nms_boxes.push_back(boxes[idx]);
+        if (rest_features > 0) {  // `rest` is only populated when extra features are present
+            nms_rest.push_back(rest[idx]);
+        }
+    }
+    return std::make_tuple(nms_boxes, nms_confidences, nms_class_ids, nms_rest);
+}
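
// Editor's sketch (not part of this commit): decoding a YOLOv8 detection head with
// non_max_suppression(). Assumes output0 was already transposed to one row per
// candidate: [cx, cy, w, h, class scores..., optional mask coefficients]. The demo
// function name, class count, and thresholds are hypothetical.
static void demo_nms(const cv::Mat& output0) {
    int num_classes = 80;              // e.g. a COCO-trained model
    int data_width = 4 + num_classes;  // plain detection: no extra mask coefficients
    auto [boxes, scores, class_ids, rest] =
        non_max_suppression(output0, num_classes, data_width, 0.25, 0.45f);
    // boxes are still in network-input coordinates; map each one back to the
    // source frame with scale_boxes() before drawing.
    (void)boxes; (void)scores; (void)class_ids; (void)rest;
}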