init commit

master
Samuel Dobron 5 months ago
commit 157639e9e3

.gitignore vendored

@@ -0,0 +1,17 @@
# ide related
.vs/*
.idea/*
# build
.build/*
x64/*
cmake-build-debug/*
cmake-build-debug-visual-studio/*
# ignore CMakeLists.txt but add example file
CMakeLists.txt
# dlls:
*.dll
# vs-like objects
*.sln
*vcxproj*
packages.config
packages/*

@@ -0,0 +1,64 @@
CMAKE_MINIMUM_REQUIRED(VERSION 3.0.0)
project(YOLOv8CPP)
SET(CMAKE_CXX_STANDARD 17)
SET(CMAKE_CXX_STANDARD_REQUIRED ON)
# SET (OpenCV_DIR your/path/to/opencv/build/x64/vc16/lib) # opencv lib root
# SET (OpenCV_BIN_DIR your/path/to/opencv/build/x64/vc16/bin) #opencv bin root
# SET (OpenCV_DEBUG_DLL_FILENAME opencv_world480d.dll) # change filenames
# SET (OpenCV_RELEASE_DLL_FILENAME opencv_world480.dll) # change filenames
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
SET (ONNXRUNTIME_DIR /home/xdobro23/PP1/onnxruntime/onnxruntime-linux-x64-gpu-1.18.1) # onnxruntime root
FIND_PACKAGE(OpenCV REQUIRED)
set(CMAKE_BUILD_TYPE Debug)
# --- Configure your project files ---
include_directories(include) # Include your header files directory
# Recursively collect all source files under 'src' directory
file(GLOB_RECURSE CURR_SOURCES src/*.cpp)
# Create the executable
add_executable(YOLOv8CPP ${CURR_SOURCES})
target_compile_features(YOLOv8CPP PRIVATE cxx_std_17)
TARGET_INCLUDE_DIRECTORIES(YOLOv8CPP PRIVATE "${ONNXRUNTIME_DIR}/include")
TARGET_LINK_LIBRARIES(YOLOv8CPP ${OpenCV_LIBS})
if (WIN32)
TARGET_LINK_LIBRARIES(YOLOv8CPP "${ONNXRUNTIME_DIR}/lib/onnxruntime.lib")
# some changes to the original version:
# copy onnxruntime dll
add_custom_command(TARGET YOLOv8CPP POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${ONNXRUNTIME_DIR}/lib/onnxruntime.dll"
"$<TARGET_FILE_DIR:YOLOv8CPP>"
)
# copy opencv
#[[ add_custom_command(TARGET YOLOv8CPP POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${OpenCV_DIR}/${OpenCV_DLL_FILENAME}"
"$<TARGET_FILE_DIR:YOLOv8CPP>"
)]]
add_custom_command(TARGET YOLOv8CPP POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${OpenCV_BIN_DIR}/${OpenCV_DEBUG_DLL_FILENAME}"
"$<TARGET_FILE_DIR:YOLOv8CPP>"
)
# add release
add_custom_command(TARGET YOLOv8CPP POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${OpenCV_BIN_DIR}/${OpenCV_RELEASE_DLL_FILENAME}"
"$<TARGET_FILE_DIR:YOLOv8CPP>"
)
endif(WIN32)
if (UNIX)
TARGET_LINK_LIBRARIES(YOLOv8CPP "${ONNXRUNTIME_DIR}/lib/libonnxruntime.so")
endif(UNIX)
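A quick way to verify that both dependencies were resolved and linked correctly is a throwaway C++ translation unit built against the same targets. A minimal sketch (only standard ONNX Runtime C API and OpenCV version macros are used, nothing project-specific):

#include <iostream>
#include <onnxruntime_cxx_api.h>
#include <opencv2/core/version.hpp>

int main() {
    // OrtGetApiBase() is part of the ONNX Runtime C API and reports the runtime version.
    std::cout << "ONNX Runtime: " << OrtGetApiBase()->GetVersionString() << std::endl;
    // CV_VERSION is a macro provided by OpenCV's version header.
    std::cout << "OpenCV: " << CV_VERSION << std::endl;
    return 0;
}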

@@ -0,0 +1,34 @@
#pragma once
#include <string>
namespace MetadataConstants {
inline const std::string IMGSZ = "imgsz";
inline const std::string STRIDE = "stride";
inline const std::string NC = "nc";
inline const std::string CH = "ch";
inline const std::string DATE = "date";
inline const std::string VERSION = "version";
inline const std::string TASK = "task";
inline const std::string BATCH = "batch";
inline const std::string NAMES = "names";
}
namespace OnnxProviders {
inline const std::string CPU = "cpu";
inline const std::string CUDA = "cuda";
}
namespace OnnxInitializers
{
inline const int UNINITIALIZED_STRIDE = -1;
inline const int UNINITIALIZED_NC = -1;
}
namespace YoloTasks
{
inline const std::string SEGMENT = "segment";
inline const std::string DETECT = "detect";
inline const std::string POSE = "pose";
inline const std::string CLASSIFY = "classify";
}
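These keys mirror the custom metadata map that Ultralytics writes into exported ONNX files. A hedged sketch of the intended lookup pattern, assuming this header is included (the `resolve_task` helper is hypothetical, not part of this commit):

#include <stdexcept>
#include <string>
#include <unordered_map>

// Hypothetical helper: select the postprocessing branch from the exported "task" metadata key.
std::string resolve_task(const std::unordered_map<std::string, std::string>& metadata) {
    auto it = metadata.find(MetadataConstants::TASK);
    if (it == metadata.end()) {
        throw std::runtime_error("model metadata has no 'task' key");
    }
    return it->second; // expected to be one of the YoloTasks constants, e.g. "detect"
}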

@@ -0,0 +1,90 @@
#pragma once
#include <filesystem>
#include <vector>
#include <unordered_map>
#include <opencv2/core/mat.hpp>
#include "onnx_model_base.h"
#include "constants.h"
/**
* @brief Represents the results of YOLO prediction.
*
* This structure stores information about a detected object, including its class index,
* confidence score, bounding box, semantic segmentation mask, and keypoints (if available).
*/
struct YoloResults {
int class_idx{}; ///< The class index of the detected object.
float conf{}; ///< The confidence score of the detection.
cv::Rect_<float> bbox; ///< The bounding box of the detected object.
cv::Mat mask; ///< The semantic segmentation mask (if available).
std::vector<float> keypoints{}; ///< Keypoints representing the object's pose (if available).
};
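For orientation, a minimal consumer of this struct might look like the sketch below. `results` is assumed to come from AutoBackendOnnx::predict_once and `names` from getNames(); the helper itself is illustrative, not code from this commit:

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

void print_detections(const std::vector<YoloResults>& results,
                      const std::unordered_map<int, std::string>& names) {
    for (const YoloResults& r : results) {
        auto it = names.find(r.class_idx);
        std::string label = (it != names.end()) ? it->second : std::to_string(r.class_idx);
        // bbox is already scaled back to original-image coordinates by postprocessing
        std::cout << label << " conf=" << r.conf
                  << " box=[" << r.bbox.x << ", " << r.bbox.y << ", "
                  << r.bbox.width << "x" << r.bbox.height << "]" << std::endl;
    }
}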
struct ImageInfo {
cv::Size raw_size; // add additional attrs if you need
};
class AutoBackendOnnx : public OnnxModelBase {
public:
// constructors
AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider,
const std::vector<int>& imgsz, const int& stride,
const int& nc, std::unordered_map<int, std::string> names);
AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider);
// getters
virtual const std::vector<int>& getImgsz();
virtual const int& getStride();
virtual const int& getCh();
virtual const int& getNc();
virtual const std::unordered_map<int, std::string>& getNames();
virtual const std::vector<int64_t>& getInputTensorShape();
virtual const int& getWidth();
virtual const int& getHeight();
virtual const cv::Size& getCvSize();
virtual const std::string& getTask();
/**
* @brief Runs object detection on an input image.
*
* This method performs object detection on the input image and returns the detected objects as YoloResults.
*
* @param image The input image to run object detection on.
* @param conf The confidence threshold for object detection.
* @param iou The intersection-over-union (IoU) threshold for non-maximum suppression.
* @param mask_threshold The threshold for the semantic segmentation mask.
* @param conversionCode An optional conversion code for image format conversion (e.g., cv::COLOR_BGR2RGB).
* Default value is -1, indicating no conversion.
* TODO: use some constant from some namespace rather than hardcoded values here
*
* @return A vector of YoloResults representing the detected objects.
*/
virtual std::vector<YoloResults> predict_once(cv::Mat& image, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
virtual std::vector<YoloResults> predict_once(const std::filesystem::path& imagePath, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
virtual std::vector<YoloResults> predict_once(const std::string& imagePath, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
virtual void fill_blob(cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
virtual void postprocess_masks(cv::Mat& output0, cv::Mat& output1, ImageInfo image_info, std::vector<YoloResults>& output,
int& class_names_num, float& conf_threshold, float& iou_threshold,
int& iw, int& ih, int& mw, int& mh, int& masks_features_num, float mask_threshold = 0.50f);
virtual void postprocess_detects(cv::Mat& output0, ImageInfo image_info, std::vector<YoloResults>& output,
int& class_names_num, float& conf_threshold, float& iou_threshold);
virtual void postprocess_kpts(cv::Mat& output0, ImageInfo& image_info, std::vector<YoloResults>& output,
int& class_names_num, float& conf_threshold, float& iou_threshold);
static void _get_mask2(const cv::Mat& mask_info, const cv::Mat& mask_data, const ImageInfo& image_info, cv::Rect bound, cv::Mat& mask_out,
float& mask_thresh, int& iw, int& ih, int& mw, int& mh, int& masks_features_num, bool round_downsampled = false);
protected:
std::vector<int> imgsz_;
int stride_ = OnnxInitializers::UNINITIALIZED_STRIDE;
int nc_ = OnnxInitializers::UNINITIALIZED_NC; //
int ch_ = 3;
std::unordered_map<int, std::string> names_;
std::vector<int64_t> inputTensorShape_;
cv::Size cvSize_;
std::string task_;
//cv::MatSize cvMatSize_;
};
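The doc comment above describes predict_once; end to end it is typically driven like this short sketch ("model.onnx" and "image.jpg" are placeholder paths; note the thresholds must be lvalues because the API takes them by non-const reference):

#include <opencv2/opencv.hpp>
#include "nn/autobackend.h"

int main() {
    AutoBackendOnnx model("model.onnx", "yolo", "cpu"); // placeholder model path
    cv::Mat img = cv::imread("image.jpg", cv::IMREAD_UNCHANGED);
    float conf = 0.30f, iou = 0.45f, mask_thr = 0.50f;  // lvalues, passed by reference
    std::vector<YoloResults> results = model.predict_once(img, conf, iou, mask_thr);
    return results.empty() ? 1 : 0;
}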

@@ -0,0 +1,39 @@
#pragma once
#include <onnxruntime_cxx_api.h>
#include <string>
#include <unordered_map>
#include <vector>
/*
* This interface must expose only the arguments required to load any ONNX model, regardless of model-specific info:
* modelPath will always be required, and so will a provider such as "cpu" or "cuda", since these are needed
* to set up the `sessionOptions` / `session` objects properly. Image size, however, is not needed to load a
* pure ONNX graph, so do NOT include it here.
*/
class OnnxModelBase {
public:
OnnxModelBase(const char* modelPath, const char* logid, const char* provider);
//OnnxModelBase(); // no default constructor should be there
//virtual ~OnnxModelBase();
virtual const std::vector<std::string>& getInputNames(); // = 0
virtual const std::vector<std::string>& getOutputNames();
virtual const std::vector<const char*> getOutputNamesCStr();
virtual const std::vector<const char*> getInputNamesCStr();
virtual const Ort::ModelMetadata& getModelMetadata();
virtual const std::unordered_map<std::string, std::string>& getMetadata();
virtual const char* getModelPath();
virtual const Ort::Session& getSession();
//virtual std::vector<Ort::Value> forward(std::vector<Ort::Value> inputTensors);
virtual std::vector<Ort::Value> forward(std::vector<Ort::Value>& inputTensors);
Ort::Session session{ nullptr };
protected:
const char* modelPath_;
Ort::Env env{ nullptr };
std::vector<std::string> inputNodeNames;
std::vector<std::string> outputNodeNames;
Ort::ModelMetadata model_metadata{ nullptr };
std::unordered_map<std::string, std::string> metadata;
std::vector<const char*> outputNamesCStr;
std::vector<const char*> inputNamesCStr;
};
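Given the comment at the top of this header, the base class on its own is already useful for inspecting a model. A minimal sketch (the model path is a placeholder):

#include <iostream>
#include "nn/onnx_model_base.h"

int main() {
    OnnxModelBase base("model.onnx", "inspect", "cpu"); // placeholder model path
    // Dump the custom metadata map embedded in the ONNX file.
    for (const auto& kv : base.getMetadata()) {
        std::cout << kv.first << " = " << kv.second << std::endl;
    }
    return 0;
}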

@@ -0,0 +1,20 @@
#pragma once
#include <opencv2/core/types.hpp>
void letterbox(const cv::Mat& image,
cv::Mat& outImage,
const cv::Size& newShape = cv::Size(640, 640),
cv::Scalar_<double> color = cv::Scalar(), bool auto_ = true,
bool scaleFill = false,
bool scaleUp = true,
int stride = 32
);
cv::Mat scale_image(const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair<float,
cv::Point2f>& ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)));
void scale_image2(
cv::Mat& scaled_mask, const cv::Mat& resized_mask, const cv::Size& im0_shape,
const std::pair<float, cv::Point2f>& ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f))
);
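A hedged usage sketch of letterbox as declared above: resize an arbitrary frame to a 640x640 model input while preserving aspect ratio ("frame.jpg" is a placeholder; the default cv::Scalar() color is replaced by the 114-gray pad inside the implementation):

#include <opencv2/opencv.hpp>
#include "utils/augment.h"

int main() {
    cv::Mat frame = cv::imread("frame.jpg"); // placeholder path
    cv::Mat boxed;
    // auto_=false and scaleFill=false pad to exactly 640x640, matching how predict_once calls it.
    letterbox(frame, boxed, cv::Size(640, 640), cv::Scalar(), false, false, true, 32);
    return 0;
}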

@@ -0,0 +1,26 @@
#ifndef COMMON_UTILS_H
#define COMMON_UTILS_H
#include <chrono>
#include <string>
#include <unordered_map>
#include <vector>
class Timer {
public:
Timer(double& accumulator, bool isEnabled = true);
void Stop();
private:
double& accumulator;
bool isEnabled;
std::chrono::time_point<std::chrono::high_resolution_clock> start;
};
std::wstring get_win_path(const std::string& path);
std::vector<std::string> parseVectorString(const std::string& input);
std::vector<int> convertStringVectorToInts(const std::vector<std::string>& input);
std::unordered_map<int, std::string> parseNames(const std::string& input);
int64_t vector_product(const std::vector<int64_t>& vec);
#endif // COMMON_UTILS_H

@@ -0,0 +1,44 @@
#pragma once
#include <opencv2/core/types.hpp>
//cv::Rect scaleCoords(const cv::Size& imageShape, const cv::Rect& coords, const cv::Size& imageOriginalShape);
/**
* Scales a bounding box from the shape of the input image to the shape of an original image.
*
* @param img1_shape The shape (height, width) of the input image for the model.
* @param box The bounding box to be scaled, specified as cv::Rect_<float>.
* @param img0_shape The shape (height, width) of the original target image.
* @param ratio_pad An optional parameter that specifies scaling and padding factors as a pair of values.
* The first value (ratio) is used for scaling, and the second value (pad) is used for padding.
* If not provided, default values will be used.
* @param padding An optional boolean parameter that specifies whether padding should be applied.
* If set to true, padding will be applied to the bounding box.
*
* @return A scaled bounding box specified as cv::Rect_<float>.
*
* This function rescales a bounding box from the shape of the input image (img1_shape) to the shape of an original image (img0_shape).
*/
cv::Rect_<float> scale_boxes(const cv::Size& img1_shape, cv::Rect_<float>& box, const cv::Size& img0_shape, std::pair<float, cv::Point2f> ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)), bool padding = true);
void clip_boxes(cv::Rect& box, const cv::Size& shape);
void clip_boxes(cv::Rect_<float>& box, const cv::Size& shape);
void clip_boxes(std::vector<cv::Rect>& boxes, const cv::Size& shape);
void clip_boxes(std::vector<cv::Rect_<float>>& boxes, const cv::Size& shape);
//void clip_coords(cv::Mat& coords, const cv::Size& shape);
//cv::Mat scale_coords(const cv::Size& img1_shape, cv::Mat& coords, const cv::Size& img0_shape);
void clip_coords(std::vector<float>& coords, const cv::Size& shape);
std::vector<float> scale_coords(const cv::Size& img1_shape, std::vector<float>& coords, const cv::Size& img0_shape);
cv::Mat crop_mask(const cv::Mat& mask, const cv::Rect& box);
struct NMSResult{
std::vector<cv::Rect> bboxes;
std::vector<float> confidences;
std::vector<int> classes;
std::vector<std::vector<float>> rest;
};
//std::tuple<std::vector<cv::Rect_<float>>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
non_max_suppression(const cv::Mat& output0, int class_names_num, int total_features_num, double conf_threshold, float iou_threshold);

@@ -0,0 +1,359 @@
#include <random>
#include <filesystem>
#include "nn/onnx_model_base.h"
#include "nn/autobackend.h"
#include <opencv2/opencv.hpp>
#include <vector>
#include "utils/augment.h"
#include "constants.h"
#include "utils/common.h"
namespace fs = std::filesystem;
// Define the skeleton and color mappings
std::vector<std::vector<int>> skeleton = {{16, 14}, {14, 12}, {17, 15}, {15, 13}, {12, 13}, {6, 12}, {7, 13}, {6, 7},
{6, 8}, {7, 9}, {8, 10}, {9, 11}, {2, 3}, {1, 2}, {1, 3}, {2, 4}, {3, 5}, {4, 6}, {5, 7}};
std::vector<cv::Scalar> posePalette = {
cv::Scalar(255, 128, 0), cv::Scalar(255, 153, 51), cv::Scalar(255, 178, 102), cv::Scalar(230, 230, 0), cv::Scalar(255, 153, 255),
cv::Scalar(153, 204, 255), cv::Scalar(255, 102, 255), cv::Scalar(255, 51, 255), cv::Scalar(102, 178, 255), cv::Scalar(51, 153, 255),
cv::Scalar(255, 153, 153), cv::Scalar(255, 102, 102), cv::Scalar(255, 51, 51), cv::Scalar(153, 255, 153), cv::Scalar(102, 255, 102),
cv::Scalar(51, 255, 51), cv::Scalar(0, 255, 0), cv::Scalar(0, 0, 255), cv::Scalar(255, 0, 0), cv::Scalar(255, 255, 255)
};
std::vector<int> limbColorIndices = {9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16};
std::vector<int> kptColorIndices = {16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9};
cv::Scalar generateRandomColor(int numChannels) {
if (numChannels < 1 || numChannels > 3) {
throw std::invalid_argument("Invalid number of channels. Must be between 1 and 3.");
}
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<int> dis(0, 255);
cv::Scalar color;
for (int i = 0; i < numChannels; i++) {
color[i] = dis(gen); // for each channel separately generate value
}
return color;
}
std::vector<cv::Scalar> generateRandomColors(int class_names_num, int numChannels) {
std::vector<cv::Scalar> colors;
for (int i = 0; i < class_names_num; i++) {
cv::Scalar color = generateRandomColor(numChannels);
colors.push_back(color);
}
return colors;
}
void plot_masks(cv::Mat img, std::vector<YoloResults>& result, std::vector<cv::Scalar> color,
std::unordered_map<int, std::string>& names)
{
cv::Mat mask = img.clone();
for (int i = 0; i < result.size(); i++)
{
float left, top;
left = result[i].bbox.x;
top = result[i].bbox.y;
int color_num = i;
int& class_idx = result[i].class_idx;
rectangle(img, result[i].bbox, color[result[i].class_idx], 2);
// try to get string value corresponding to given class_idx
std::string class_name;
auto it = names.find(class_idx);
if (it != names.end()) {
class_name = it->second;
}
else {
std::cerr << "Warning: class_idx not found in names for class_idx = " << class_idx << std::endl;
// then convert it to string anyway
class_name = std::to_string(class_idx);
}
if (result[i].mask.rows > 0 && result[i].mask.cols > 0)
{
mask(result[i].bbox).setTo(color[result[i].class_idx], result[i].mask);
}
std::stringstream labelStream;
labelStream << class_name << " " << std::fixed << std::setprecision(2) << result[i].conf;
std::string label = labelStream.str();
cv::Size text_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.6, 2, nullptr);
cv::Rect rect_to_fill(left - 1, top - text_size.height - 5, text_size.width + 2, text_size.height + 5);
cv::Scalar text_color = cv::Scalar(255.0, 255.0, 255.0);
rectangle(img, rect_to_fill, color[result[i].class_idx], -1);
putText(img, label, cv::Point(left - 1.5, top - 2.5), cv::FONT_HERSHEY_SIMPLEX, 0.6, text_color, 2);
}
addWeighted(img, 0.6, mask, 0.4, 0, img); //add mask to src
resize(img, img, img.size());
imshow("img", img);
cv::waitKey();
}
//void plot_keypoints(cv::Mat& image, const std::vector<std::vector<float>>& keypoints, const cv::Size& shape) {
void plot_keypoints(cv::Mat& image, const std::vector<YoloResults>& results, const cv::Size& shape) {
int radius = 5;
bool drawLines = true;
if (results.empty()) {
return;
}
std::vector<cv::Scalar> limbColorPalette;
std::vector<cv::Scalar> kptColorPalette;
for (int index : limbColorIndices) {
limbColorPalette.push_back(posePalette[index]);
}
for (int index : kptColorIndices) {
kptColorPalette.push_back(posePalette[index]);
}
for (const auto& res: results) {
auto keypoint = res.keypoints;
bool isPose = keypoint.size() == 51; // numKeypoints == 17 && keypoints[0].size() == 3;
drawLines &= isPose;
// draw points
for (int i = 0; i < 17; i++) {
int idx = i * 3;
int x_coord = static_cast<int>(keypoint[idx]);
int y_coord = static_cast<int>(keypoint[idx + 1]);
if (x_coord % shape.width != 0 && y_coord % shape.height != 0) {
if (keypoint.size() == 3) {
float conf = keypoint[2];
if (conf < 0.5) {
continue;
}
}
cv::Scalar color_k = isPose ? kptColorPalette[i] : cv::Scalar(0, 0,
255); // Default to red if not in pose mode
cv::circle(image, cv::Point(x_coord, y_coord), radius, color_k, -1, cv::LINE_AA);
}
}
// draw lines
if (drawLines) {
for (int i = 0; i < skeleton.size(); i++) {
const std::vector<int> &sk = skeleton[i];
int idx1 = sk[0] - 1;
int idx2 = sk[1] - 1;
int idx1_x_pos = idx1 * 3;
int idx2_x_pos = idx2 * 3;
int x1 = static_cast<int>(keypoint[idx1_x_pos]);
int y1 = static_cast<int>(keypoint[idx1_x_pos + 1]);
int x2 = static_cast<int>(keypoint[idx2_x_pos]);
int y2 = static_cast<int>(keypoint[idx2_x_pos + 1]);
float conf1 = keypoint[idx1_x_pos + 2];
float conf2 = keypoint[idx2_x_pos + 2];
// Check confidence thresholds
if (conf1 < 0.5 || conf2 < 0.5) {
continue;
}
// Check if positions are within bounds
if (x1 % shape.width == 0 || y1 % shape.height == 0 || x1 < 0 || y1 < 0 ||
x2 % shape.width == 0 || y2 % shape.height == 0 || x2 < 0 || y2 < 0) {
continue;
}
// Draw a line between keypoints
cv::Scalar color_limb = limbColorPalette[i];
cv::line(image, cv::Point(x1, y1), cv::Point(x2, y2), color_limb, 2, cv::LINE_AA);
}
}
}
}
void plot_results(cv::Mat img, std::vector<YoloResults>& results,
std::vector<cv::Scalar> color, std::unordered_map<int, std::string>& names,
const cv::Size& shape
) {
cv::Mat mask = img.clone();
int radius = 5;
bool drawLines = true;
auto raw_image_shape = img.size();
std::vector<cv::Scalar> limbColorPalette;
std::vector<cv::Scalar> kptColorPalette;
for (int index : limbColorIndices) {
limbColorPalette.push_back(posePalette[index]);
}
for (int index : kptColorIndices) {
kptColorPalette.push_back(posePalette[index]);
}
for (const auto& res : results) {
float left = res.bbox.x;
float top = res.bbox.y;
int color_num = res.class_idx;
// Draw bounding box
rectangle(img, res.bbox, color[res.class_idx], 2);
// Try to get the class name corresponding to the given class_idx
std::string class_name;
auto it = names.find(res.class_idx);
if (it != names.end()) {
class_name = it->second;
}
else {
std::cerr << "Warning: class_idx not found in names for class_idx = " << res.class_idx << std::endl;
// Then convert it to a string anyway
class_name = std::to_string(res.class_idx);
}
// Draw mask if available
if (res.mask.rows > 0 && res.mask.cols > 0) {
mask(res.bbox).setTo(color[res.class_idx], res.mask);
}
// Create label
std::stringstream labelStream;
labelStream << class_name << " " << std::fixed << std::setprecision(2) << res.conf;
std::string label = labelStream.str();
cv::Size text_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.6, 2, nullptr);
cv::Rect rect_to_fill(left - 1, top - text_size.height - 5, text_size.width + 2, text_size.height + 5);
cv::Scalar text_color = cv::Scalar(255.0, 255.0, 255.0);
rectangle(img, rect_to_fill, color[res.class_idx], -1);
putText(img, label, cv::Point(left - 1.5, top - 2.5), cv::FONT_HERSHEY_SIMPLEX, 0.6, text_color, 2);
// Check if keypoints are available
if (!res.keypoints.empty()) {
auto keypoint = res.keypoints;
bool isPose = keypoint.size() == 51; // numKeypoints == 17 && keypoints[0].size() == 3;
drawLines &= isPose;
// draw points
for (int i = 0; i < 17; i++) {
int idx = i * 3;
int x_coord = static_cast<int>(keypoint[idx]);
int y_coord = static_cast<int>(keypoint[idx + 1]);
if (x_coord % raw_image_shape.width != 0 && y_coord % raw_image_shape.height != 0) {
if (keypoint.size() == 3) {
float conf = keypoint[2];
if (conf < 0.5) {
continue;
}
}
cv::Scalar color_k = isPose ? kptColorPalette[i] : cv::Scalar(0, 0,
255); // Default to red if not in pose mode
cv::circle(img, cv::Point(x_coord, y_coord), radius, color_k, -1, cv::LINE_AA);
}
}
// draw lines
if (drawLines) {
for (int i = 0; i < skeleton.size(); i++) {
const std::vector<int> &sk = skeleton[i];
int idx1 = sk[0] - 1;
int idx2 = sk[1] - 1;
int idx1_x_pos = idx1 * 3;
int idx2_x_pos = idx2 * 3;
int x1 = static_cast<int>(keypoint[idx1_x_pos]);
int y1 = static_cast<int>(keypoint[idx1_x_pos + 1]);
int x2 = static_cast<int>(keypoint[idx2_x_pos]);
int y2 = static_cast<int>(keypoint[idx2_x_pos + 1]);
float conf1 = keypoint[idx1_x_pos + 2];
float conf2 = keypoint[idx2_x_pos + 2];
// Check confidence thresholds
if (conf1 < 0.5 || conf2 < 0.5) {
continue;
}
// Check if positions are within bounds
if (x1 % raw_image_shape.width == 0 || y1 % raw_image_shape.height == 0 || x1 < 0 || y1 < 0 ||
x2 % raw_image_shape.width == 0 || y2 % raw_image_shape.height == 0 || x2 < 0 || y2 < 0) {
continue;
}
// Draw a line between keypoints
cv::Scalar color_limb = limbColorPalette[i];
cv::line(img, cv::Point(x1, y1), cv::Point(x2, y2), color_limb, 2, cv::LINE_AA);
}
}
}
}
// Combine the image and mask
addWeighted(img, 0.6, mask, 0.4, 0, img);
// resize(img, img, img.size());
// resize(img, img, shape);
// // Show the image
// imshow("img", img);
// cv::waitKey();
}
int main()
{
std::string img_path = "/home/xdobro23/path_to_16bit_3_channel_image.png";
const std::string& modelPath = "/home/xdobro23/PP1/ultralytics/runs/detect/train35/weights/best.onnx"; // pose
fs::path imageFilePath(img_path);
fs::path newFilePath = imageFilePath.stem();
newFilePath += "-kpt-cpp";
newFilePath += imageFilePath.extension();
assert(newFilePath != imageFilePath);
std::cout << "newFilePath: " << newFilePath << std::endl;
const std::string& onnx_provider = OnnxProviders::CPU; // "cpu";
const std::string& onnx_logid = "yolov8_inference2";
float mask_threshold = 0.5f; // in python it's 0.5 too - see ultralytics/utils/ops.process_mask, line 705 (ultralytics.__version__ == 8.0.160)
float conf_threshold = 0.30f;
float iou_threshold = 0.45f; // 0.70f;
int conversion_code = -1;
cv::Mat img = cv::imread(img_path, cv::IMREAD_UNCHANGED);
if (img.empty()) {
std::cerr << "Error: Unable to load image" << std::endl;
return 1;
}
AutoBackendOnnx model(modelPath.c_str(), onnx_logid.c_str(), onnx_provider.c_str());
std::cout << "Model loaded" << std::endl;
std::vector<YoloResults> objs = model.predict_once(img, conf_threshold, iou_threshold, mask_threshold);
std::cout << "Got results" << std::endl;
std::vector<cv::Scalar> colors = generateRandomColors(model.getNc(), model.getCh());
std::unordered_map<int, std::string> names = model.getNames();
std::vector<std::vector<float>> keypointsVector;
for (const YoloResults& result : objs) {
keypointsVector.push_back(result.keypoints);
}
cv::cvtColor(img, img, cv::COLOR_RGB2BGR);
cv::Size show_shape = img.size(); // cv::Size(1280, 720); // img.size()
plot_results(img, objs, colors, names, show_shape);
// plot_masks(img, objs, colors, names);
cv::imshow("img", img);
cv::waitKey();
return 0;
}

@@ -0,0 +1,520 @@
#pragma once
#include "nn/autobackend.h"
#include <iostream>
#include <ostream>
#include <filesystem>
#include <opencv2/opencv.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/core/mat.hpp>
#include "utils/augment.h"
#include "constants.h"
#include "utils/common.h"
#include "utils/ops.h"
namespace fs = std::filesystem;
AutoBackendOnnx::AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider,
const std::vector<int>& imgsz, const int& stride,
const int& nc, const std::unordered_map<int, std::string> names)
: OnnxModelBase(modelPath, logid, provider), imgsz_(imgsz), stride_(stride), nc_(nc), names_(names),
inputTensorShape_()
{
}
AutoBackendOnnx::AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider)
: OnnxModelBase(modelPath, logid, provider) {
// the base class is fully initialized by the member-initializer list above;
// now try to get additional info from metadata, like imgsz, stride etc.
// Ideally all of them are present, but an error is raised later if something is missing
// from the metadata (or is not stored under the appropriate keys)
const std::unordered_map<std::string, std::string>& base_metadata = OnnxModelBase::getMetadata();
// post init imgsz
auto imgsz_iterator = base_metadata.find(MetadataConstants::IMGSZ);
if (imgsz_iterator != base_metadata.end()) {
// parse it and convert to int iterable
std::vector<int> imgsz = convertStringVectorToInts(parseVectorString(imgsz_iterator->second));
// set it here:
if (imgsz_.empty()) {
imgsz_ = imgsz;
}
}
else {
std::cerr << "Warning: Cannot get imgsz value from metadata" << std::endl;
}
// post init stride
auto stride_item = base_metadata.find(MetadataConstants::STRIDE);
if (stride_item != base_metadata.end()) {
// parse it and convert to int
int stride_int = std::stoi(stride_item->second);
// set it here:
if (stride_ == OnnxInitializers::UNINITIALIZED_STRIDE) {
stride_ = stride_int;
}
}
else {
std::cerr << "Warning: Cannot get stride value from metadata" << std::endl;
}
// post init names
auto names_item = base_metadata.find(MetadataConstants::NAMES);
if (names_item != base_metadata.end()) {
// parse it and convert to int iterable
std::unordered_map<int, std::string> names = parseNames(names_item->second);
std::cout << "***Names from metadata***" << std::endl;
for (const auto& pair : names) {
std::cout << "Key: " << pair.first << ", Value: " << pair.second << std::endl;
}
// set it here:
if (names_.empty()) {
names_ = names;
}
}
else {
std::cerr << "Warning: Cannot get names value from metadata" << std::endl;
}
// post init number of classes - this can be done only if names_ is not empty and nc was not initialized previously
if (nc_ == OnnxInitializers::UNINITIALIZED_NC && !names_.empty()) {
nc_ = names_.size();
}
else if (nc_ == OnnxInitializers::UNINITIALIZED_NC) {
std::cerr << "Warning: Cannot get nc value from metadata (probably names weren't set)" << std::endl;
}
if (!imgsz_.empty() && inputTensorShape_.empty())
{
inputTensorShape_ = { 1, ch_, getHeight(), getWidth() };
}
if (!imgsz_.empty())
{
// Initialize cvSize_ using getHeight() and getWidth()
//cvSize_ = cv::MatSize()
cvSize_ = cv::Size(getWidth(), getHeight());
//cvMatSize_ = cv::MatSize(cvSize_.width, cvSize_.height);
}
// task init:
auto task_item = base_metadata.find(MetadataConstants::TASK);
if (task_item != base_metadata.end()) {
// parse it and convert to int iterable
std::string task = std::string(task_item->second);
// set it here:
if (task_.empty())
{
task_ = task;
}
}
else {
std::cerr << "Warning: Cannot get task value from metadata" << std::endl;
}
// TODO: raise assert if imgsz_ and task_ were not initialized (since you don't know in that case which postprocessing to use)
}
const std::vector<int>& AutoBackendOnnx::getImgsz() {
return imgsz_;
}
const int& AutoBackendOnnx::getHeight()
{
return imgsz_[0];
}
const int& AutoBackendOnnx::getWidth()
{
return imgsz_[1];
}
const int& AutoBackendOnnx::getStride() {
return stride_;
}
const int& AutoBackendOnnx::getCh() {
return ch_;
}
const int& AutoBackendOnnx::getNc() {
return nc_;
}
const std::unordered_map<int, std::string>& AutoBackendOnnx::getNames() {
return names_;
}
const cv::Size& AutoBackendOnnx::getCvSize()
{
return cvSize_;
}
const std::vector<int64_t>& AutoBackendOnnx::getInputTensorShape()
{
return inputTensorShape_;
}
const std::string& AutoBackendOnnx::getTask()
{
return task_;
}
std::vector<YoloResults> AutoBackendOnnx::predict_once(const std::string& imagePath, float& conf, float& iou, float& mask_threshold,
int conversionCode, bool verbose) {
// Convert the string imagePath to an object of type std::filesystem::path
fs::path imageFilePath(imagePath);
// Call the predict_once method, converting the image to a cv::Mat
return predict_once(imageFilePath, conf, iou, mask_threshold, conversionCode, verbose);
}
std::vector<YoloResults> AutoBackendOnnx::predict_once(const fs::path& imagePath, float& conf, float& iou, float& mask_threshold,
int conversionCode, bool verbose) {
// Check if the specified path exists
if (!fs::exists(imagePath)) {
std::cerr << "Error: File does not exist: " << imagePath << std::endl;
// Return an empty vector or throw an exception, depending on your logic
return {};
}
// Load the image into a cv::Mat
cv::Mat image = cv::imread(imagePath.string(), cv::IMREAD_UNCHANGED);
// Check if loading the image was successful
if (image.empty()) {
std::cerr << "Error: Failed to load image: " << imagePath << std::endl;
// Return an empty vector or throw an exception, depending on your logic
return {};
}
// now do some preprocessing based on channels info:
int required_image_channels = this->getCh();
/*assert(required_image_channels == image.channels() && "");*/
// Assert that the number of channels in the input image matches the required number of channels for the model
if (required_image_channels != image.channels()) {
const std::string& errorMessage = "Error: Number of image channels does not match the required channels.\n"
"Number of channels in the image: " + std::to_string(image.channels());
throw std::runtime_error(errorMessage);
}
// Call overloaded one
return predict_once(image, conf, iou, mask_threshold, conversionCode, verbose);
}
std::vector<YoloResults> AutoBackendOnnx::predict_once(cv::Mat& image, float& conf, float& iou, float& mask_threshold, int conversionCode, bool verbose) {
double preprocess_time = 0.0;
double inference_time = 0.0;
double postprocess_time = 0.0;
Timer preprocess_timer = Timer(preprocess_time, verbose);
// 1. preprocess
float* blob = nullptr;
//double* blob = nullptr;
std::vector<Ort::Value> inputTensors;
if (conversionCode >= 0) {
cv::cvtColor(image, image, conversionCode);
}
std::vector<int64_t> inputTensorShape;
// TODO: for classify task preprocessed image will be different (!):
cv::Mat preprocessed_img;
cv::Size new_shape = cv::Size(getWidth(), getHeight());
const bool& scaleFill = false; // false
const bool& auto_ = false; // true
letterbox(image, preprocessed_img, new_shape, cv::Scalar(), auto_, scaleFill, true, getStride());
fill_blob(preprocessed_img, blob, inputTensorShape);
int64_t inputTensorSize = vector_product(inputTensorShape);
std::vector<float> inputTensorValues(blob, blob + inputTensorSize);
Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
inputTensors.push_back(Ort::Value::CreateTensor<float>(
memoryInfo, inputTensorValues.data(), inputTensorSize,
inputTensorShape.data(), inputTensorShape.size()
));
preprocess_timer.Stop();
Timer inference_timer = Timer(inference_time, verbose);
// 2. inference
std::vector<Ort::Value> outputTensors = forward(inputTensors);
inference_timer.Stop();
Timer postprocess_timer = Timer(postprocess_time, verbose);
// create container for the results
std::vector<YoloResults> results;
// 3. postprocess based on task:
std::unordered_map<int, std::string> names = this->getNames();
// 4. cleanup blob since it was created using the "new" keyword during the `fill_blob` func call
delete[] blob;
int class_names_num = names.size();
if (task_ == YoloTasks::SEGMENT) {
// get outputs info
std::vector<int64_t> outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
std::vector<int64_t> outputTensor1Shape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
// get outputs
float* all_data0 = outputTensors[0].GetTensorMutableData<float>();
cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t(); // [bs, features, preds_num]=>[bs, preds_num, features]
auto mask_shape = outputTensor1Shape;
std::vector<int> mask_sz = { 1,(int)mask_shape[1],(int)mask_shape[2],(int)mask_shape[3] };
cv::Mat output1 = cv::Mat(mask_sz, CV_32F, outputTensors[1].GetTensorMutableData<float>());
int iw = this->getWidth();
int ih = this->getHeight();
int mask_features_num = outputTensor1Shape[1];
int mh = outputTensor1Shape[2];
int mw = outputTensor1Shape[3];
ImageInfo img_info = { image.size() };
postprocess_masks(output0, output1, img_info, results, class_names_num, conf, iou,
iw, ih, mw, mh, mask_features_num, mask_threshold);
}
else if (task_ == YoloTasks::DETECT) {
ImageInfo img_info = { image.size() };
std::vector<int64_t> outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
float* all_data0 = outputTensors[0].GetTensorMutableData<float>();
cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t(); // [bs, features, preds_num]=>[bs, preds_num, features]
postprocess_detects(output0, img_info, results, class_names_num, conf, iou);
}
else if (task_ == YoloTasks::POSE) {
ImageInfo image_info = { image.size() };
std::vector<int64_t> outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
float* all_data0 = outputTensors[0].GetTensorMutableData<float>();
cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t(); // [bs, features, preds_num]=>[bs, preds_num, features]
postprocess_kpts(output0, image_info, results, class_names_num, conf, iou);
}
else {
throw std::runtime_error("NotImplementedError: task: " + task_);
}
postprocess_timer.Stop();
if (verbose) {
std::cout << std::fixed << std::setprecision(1);
std::cout << "image: " << preprocessed_img.rows << "x" << preprocessed_img.cols << " " << results.size() << " objs, ";
std::cout << (preprocess_time + inference_time + postprocess_time) * 1000.0 << "ms" << std::endl;
std::cout << "Speed: " << (preprocess_time * 1000.0) << "ms preprocess, ";
std::cout << (inference_time * 1000.0) << "ms inference, ";
std::cout << (postprocess_time * 1000.0) << "ms postprocess per image ";
std::cout << "at shape (1, " << image.channels() << ", " << preprocessed_img.rows << ", " << preprocessed_img.cols << ")" << std::endl;
}
return results;
}
void AutoBackendOnnx::postprocess_masks(cv::Mat& output0, cv::Mat& output1, ImageInfo image_info, std::vector<YoloResults>& output,
int& class_names_num, float& conf_threshold, float& iou_threshold,
int& iw, int& ih, int& mw, int& mh, int& masks_features_num, float mask_threshold /* = 0.5f */)
{
output.clear();
std::vector<int> class_ids;
std::vector<float> confidences;
std::vector<cv::Rect> boxes;
std::vector<std::vector<float>> masks;
// 4 - your default number of rect parameters {x, y, w, h}
int data_width = class_names_num + 4 + masks_features_num;
int rows = output0.rows;
float* pdata = (float*)output0.data;
for (int r = 0; r < rows; ++r)
{
cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4);
cv::Point class_id;
double max_conf;
minMaxLoc(scores, 0, &max_conf, 0, &class_id);
if (max_conf > conf_threshold)
{
masks.push_back(std::vector<float>(pdata + 4 + class_names_num, pdata + data_width));
class_ids.push_back(class_id.x);
confidences.push_back(max_conf);
float out_w = pdata[2];
float out_h = pdata[3];
float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0);
float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0);
cv::Rect_<float> bbox = cv::Rect_<float>(out_left, out_top, (out_w + 0.5), (out_h + 0.5));
cv::Rect_<float> scaled_bbox = scale_boxes(getCvSize(), bbox, image_info.raw_size);
boxes.push_back(scaled_bbox);
}
pdata += data_width; // next pred
}
//
//float masks_threshold = 0.50;
//int top_k = 500;
//const float& nmsde_eta = 1.0f;
std::vector<int> nms_result;
cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result); // , nms_eta, top_k);
// select all of the protos tensor
cv::Size downsampled_size = cv::Size(mw, mh);
std::vector<cv::Range> roi_rangs = { cv::Range(0, 1), cv::Range::all(),
cv::Range(0, downsampled_size.height), cv::Range(0, downsampled_size.width) };
cv::Mat temp_mask = output1(roi_rangs).clone();
cv::Mat proto = temp_mask.reshape(0, { masks_features_num, downsampled_size.width * downsampled_size.height });
for (int i = 0; i < nms_result.size(); ++i)
{
int idx = nms_result[i];
boxes[idx] = boxes[idx] & cv::Rect(0, 0, image_info.raw_size.width, image_info.raw_size.height);
YoloResults result = { class_ids[idx] ,confidences[idx] ,boxes[idx] };
_get_mask2(cv::Mat(masks[idx]).t(), proto, image_info, boxes[idx], result.mask, mask_threshold,
iw, ih, mw, mh, masks_features_num);
output.push_back(result);
}
}
void AutoBackendOnnx::postprocess_detects(cv::Mat& output0, ImageInfo image_info, std::vector<YoloResults>& output,
int& class_names_num, float& conf_threshold, float& iou_threshold)
{
output.clear();
std::vector<int> class_ids;
std::vector<float> confidences;
std::vector<cv::Rect> boxes;
std::vector<std::vector<float>> masks;
// 4 - your default number of rect parameters {x, y, w, h}
int data_width = class_names_num + 4;
int rows = output0.rows;
float* pdata = (float*)output0.data;
for (int r = 0; r < rows; ++r)
{
cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4);
cv::Point class_id;
double max_conf;
minMaxLoc(scores, nullptr, &max_conf, nullptr, &class_id);
if (max_conf > conf_threshold)
{
masks.emplace_back(pdata + 4 + class_names_num, pdata + data_width);
class_ids.push_back(class_id.x);
confidences.push_back((float) max_conf);
float out_w = pdata[2];
float out_h = pdata[3];
float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0);
float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0);
cv::Rect_ <float> bbox = cv::Rect_ <float> (out_left, out_top, (out_w + 0.5), (out_h + 0.5));
cv::Rect_<float> scaled_bbox = scale_boxes(getCvSize(), bbox, image_info.raw_size);
boxes.push_back(scaled_bbox);
}
pdata += data_width; // next pred
}
std::vector<int> nms_result;
cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result); // , nms_eta, top_k);
for (int idx : nms_result)
{
boxes[idx] = boxes[idx] & cv::Rect(0, 0, image_info.raw_size.width, image_info.raw_size.height);
YoloResults result = { class_ids[idx] ,confidences[idx] ,boxes[idx] };
output.push_back(result);
}
}
void AutoBackendOnnx::postprocess_kpts(cv::Mat& output0, ImageInfo& image_info, std::vector<YoloResults>& output,
int& class_names_num, float& conf_threshold, float& iou_threshold)
{
std::vector<cv::Rect> boxes;
std::vector<float> confidences;
std::vector<int> class_ids;
std::vector<std::vector<float>> rest;
std::tie(boxes, confidences, class_ids, rest) = non_max_suppression(output0, class_names_num, output0.cols, conf_threshold, iou_threshold);
cv::Size img1_shape = getCvSize();
auto bound_bbox = cv::Rect_ <float> (0, 0, image_info.raw_size.width, image_info.raw_size.height);
for (int i = 0; i < boxes.size(); i++) {
// pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], shape).round()
// pred_kpts = pred[:, 6:].view(len(pred), *self.model.kpt_shape) if len(pred) else pred[:, 6:]
// pred_kpts = ops.scale_coords(img.shape[2:], pred_kpts, shape)
// path = self.batch[0]
// img_path = path[i] if isinstance(path, list) else path
// results.append(
// Results(orig_img=orig_img,
// path=img_path,
// names=self.model.names,
// boxes=pred[:, :6],
// keypoints=pred_kpts))
cv::Rect_<float> bbox = boxes[i];
auto scaled_bbox = scale_boxes(img1_shape, bbox, image_info.raw_size);
scaled_bbox = scaled_bbox & bound_bbox;
// cv::Mat kpt = cv::Mat(rest[i]).t();
// scale_coords(img1_shape, kpt, image_info.raw_size);
// TODO: overload scale_coords so that will accept cv::Mat of shape [17, 3]
// so that it will be more similar to what we have in python
std::vector<float> kpt = scale_coords(img1_shape, rest[i], image_info.raw_size);
YoloResults tmp_res = { class_ids[i], confidences[i], scaled_bbox, {}, kpt};
output.push_back(tmp_res);
}
}
void AutoBackendOnnx::_get_mask2(const cv::Mat& masks_features,
const cv::Mat& proto,
const ImageInfo& image_info, const cv::Rect bound, cv::Mat& mask_out,
float& mask_thresh, int& iw, int& ih, int& mw, int& mh, int& masks_features_num,
bool round_downsampled)
{
cv::Size img0_shape = image_info.raw_size;
cv::Size img1_shape = cv::Size(iw, ih);
cv::Size downsampled_size = cv::Size(mw, mh);
cv::Rect_<float> bound_float(
static_cast<float>(bound.x),
static_cast<float>(bound.y),
static_cast<float>(bound.width),
static_cast<float>(bound.height)
);
cv::Rect_<float> downsampled_bbox = scale_boxes(img0_shape, bound_float, downsampled_size);
cv::Size bound_size = cv::Size(mw, mh);
clip_boxes(downsampled_bbox, bound_size);
cv::Mat matmul_res = (masks_features * proto).t();
matmul_res = matmul_res.reshape(1, { downsampled_size.height, downsampled_size.width });
// apply sigmoid to the mask:
cv::Mat sigmoid_mask;
exp(-matmul_res, sigmoid_mask);
sigmoid_mask = 1.0 / (1.0 + sigmoid_mask);
cv::Mat resized_mask;
cv::Rect_<float> input_bbox = scale_boxes(img0_shape, bound_float, img1_shape);
cv::resize(sigmoid_mask, resized_mask, img1_shape, 0, 0, cv::INTER_LANCZOS4);
cv::Mat pre_out_mask = resized_mask(input_bbox);
cv::Mat scaled_mask;
scale_image2(scaled_mask, resized_mask, img0_shape);
cv::resize(scaled_mask, mask_out, img0_shape);
mask_out = mask_out(bound) > mask_thresh;
}
void AutoBackendOnnx::fill_blob(cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape) {
cv::Mat floatImage;
if (inputTensorShape.empty())
{
inputTensorShape = getInputTensorShape();
}
int inputChannelsNum = inputTensorShape[1];
int rtype = CV_32FC3;
image.convertTo(floatImage, rtype, 1.0f / 65535.0); // normalize 16-bit input to [0, 1]
blob = new float[floatImage.cols * floatImage.rows * floatImage.channels()];
cv::Size floatImageSize{ floatImage.cols, floatImage.rows };
// hwc -> chw
std::vector<cv::Mat> chw(floatImage.channels());
for (int i = 0; i < floatImage.channels(); ++i)
{
chw[i] = cv::Mat(floatImageSize, CV_32FC1, blob + i * floatImageSize.width * floatImageSize.height);
}
cv::split(floatImage, chw);
}

@@ -0,0 +1,181 @@
#include "nn/onnx_model_base.h"
#include <iostream>
#include <onnxruntime_cxx_api.h>
#include <onnxruntime_c_api.h>
#include "constants.h"
#include "utils/common.h"
/**
* @brief Base class for any ONNX model, regardless of the target task.
*
* Wraps the OrtApi.
*
* The caller provides a model path, a log id, and a provider.
*
* See the output logs for more information on warnings/errors that occur while processing the model.
*
* @param[in] modelPath Path to the model file.
* @param[in] logid Log identifier.
* @param[in] provider Provider (e.g., "cpu" or "cuda"). (NOTE: for now only CPU is supported)
*/
OnnxModelBase::OnnxModelBase(const char* modelPath, const char* logid, const char* provider)
//: modelPath_(modelPath), env(std::move(env)), session(std::move(session))
: modelPath_(modelPath)
{
// TODO: hard-coding `ORT_LOGGING_LEVEL_WARNING` by default is not ideal - for some cases
// the info level would make sense too
env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, logid);
Ort::SessionOptions sessionOptions = Ort::SessionOptions();
std::vector<std::string> availableProviders = Ort::GetAvailableProviders();
auto cudaAvailable = std::find(availableProviders.begin(), availableProviders.end(), "CUDAExecutionProvider");
OrtCUDAProviderOptions cudaOption;
if (std::string(provider) == OnnxProviders::CUDA) { // compare by value, not by pointer
if (cudaAvailable == availableProviders.end()) {
std::cout << "CUDA is not supported by your ONNXRuntime build. Fallback to CPU." << std::endl;
//std::cout << "Inference device: CPU" << std::endl;
}
else {
//std::cout << "Inference device: GPU" << std::endl;
sessionOptions.AppendExecutionProvider_CUDA(cudaOption);
}
}
else if (std::string(provider) == OnnxProviders::CPU) {
// "cpu" by default
}
else
{
throw std::runtime_error("NotImplemented provider=" + std::string(provider));
}
std::cout << "Inference device: " << std::string(provider) << std::endl;
#ifdef _WIN32
auto modelPathW = get_win_path(modelPath); // For Windows (wstring)
session = Ort::Session(env, modelPathW.c_str(), sessionOptions);
#else
session = Ort::Session(env, modelPath, sessionOptions); // For Linux (string)
#endif
//session = Ort::Session(env)
// https://github.com/microsoft/onnxruntime/issues/14157
//std::vector<const char*> inputNodeNames; //
// ----------------
// init input names
std::vector<Ort::AllocatedStringPtr> inputNodeNameAllocatedStrings; // <-- newly added
Ort::AllocatorWithDefaultOptions allocator;
auto inputNodesNum = session.GetInputCount();
for (int i = 0; i < inputNodesNum; i++) {
auto input_name = session.GetInputNameAllocated(i, allocator);
inputNodeNameAllocatedStrings.push_back(std::move(input_name));
inputNodeNames.push_back(inputNodeNameAllocatedStrings.back().get());
}
// -----------------
// init output names
auto outputNodesNum = session.GetOutputCount();
std::vector<Ort::AllocatedStringPtr> outputNodeNameAllocatedStrings; // <-- newly added
Ort::AllocatorWithDefaultOptions output_names_allocator;
for (int i = 0; i < outputNodesNum; i++)
{
auto output_name = session.GetOutputNameAllocated(i, output_names_allocator);
outputNodeNameAllocatedStrings.push_back(std::move(output_name));
outputNodeNames.push_back(outputNodeNameAllocatedStrings.back().get());
}
// -------------------------
// initialize model metadata
model_metadata = session.GetModelMetadata();
Ort::AllocatorWithDefaultOptions metadata_allocator;
std::vector<Ort::AllocatedStringPtr> metadataAllocatedKeys = model_metadata.GetCustomMetadataMapKeysAllocated(metadata_allocator);
std::vector<std::string> metadata_keys;
metadata_keys.reserve(metadataAllocatedKeys.size());
for (const Ort::AllocatedStringPtr& allocatedString : metadataAllocatedKeys) {
metadata_keys.emplace_back(allocatedString.get());
}
// -------------------------
// initialize metadata as the dict
// even though we know exactly what metadata we intend to use
// base onnx class should not have any ultralytics yolo-specific attributes like stride, task etc, so keep it clean as much as possible
for (const std::string& key : metadata_keys) {
Ort::AllocatedStringPtr metadata_value = model_metadata.LookupCustomMetadataMapAllocated(key.c_str(), metadata_allocator);
if (metadata_value != nullptr) {
auto raw_metadata_value = metadata_value.get();
metadata[key] = std::string(raw_metadata_value);
}
}
// initialize cstr
for (const std::string& name : outputNodeNames) {
outputNamesCStr.push_back(name.c_str());
}
for (const std::string& name : inputNodeNames)
{
inputNamesCStr.push_back(name.c_str());
}
}
const std::vector<std::string>& OnnxModelBase::getInputNames() {
return inputNodeNames;
}
const std::vector<std::string>& OnnxModelBase::getOutputNames() {
return outputNodeNames;
}
const Ort::ModelMetadata& OnnxModelBase::getModelMetadata()
{
return model_metadata;
}
const std::unordered_map<std::string, std::string>& OnnxModelBase::getMetadata()
{
return metadata;
}
const Ort::Session& OnnxModelBase::getSession()
{
return session;
}
const char* OnnxModelBase::getModelPath()
{
return modelPath_;
}
const std::vector<const char*> OnnxModelBase::getOutputNamesCStr()
{
return outputNamesCStr;
}
const std::vector<const char*> OnnxModelBase::getInputNamesCStr()
{
return inputNamesCStr;
}
std::vector<Ort::Value> OnnxModelBase::forward(std::vector<Ort::Value>& inputTensors)
{
// todo: make runOptions parameter here
return session.Run(Ort::RunOptions{ nullptr },
inputNamesCStr.data(),
inputTensors.data(),
inputNamesCStr.size(),
outputNamesCStr.data(),
outputNamesCStr.size());
}
//OnnxModelBase::~OnnxModelBase() {
// // empty body
//}

@@ -0,0 +1,145 @@
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/core/mat.hpp>
#include <opencv2/core/types.hpp>
/**
* \brief padding value when letterbox changes image size ratio
*/
const int DEFAULT_LETTERBOX_PAD_VALUE = 114;
void letterbox(const cv::Mat& image,
cv::Mat& outImage,
const cv::Size& newShape,
cv::Scalar_<double> color,
bool auto_,
bool scaleFill,
bool scaleUp, int stride
) {
cv::Size shape = image.size();
float r = std::min(static_cast<float>(newShape.height) / static_cast<float>(shape.height),
static_cast<float>(newShape.width) / static_cast<float>(shape.width));
if (!scaleUp)
r = std::min(r, 1.0f);
float ratio[2]{ r, r };
int newUnpad[2]{ static_cast<int>(std::round(static_cast<float>(shape.width) * r)),
static_cast<int>(std::round(static_cast<float>(shape.height) * r)) };
auto dw = static_cast<float>(newShape.width - newUnpad[0]);
auto dh = static_cast<float>(newShape.height - newUnpad[1]);
if (auto_)
{
dw = static_cast<float>((static_cast<int>(dw) % stride));
dh = static_cast<float>((static_cast<int>(dh) % stride));
}
else if (scaleFill)
{
dw = 0.0f;
dh = 0.0f;
newUnpad[0] = newShape.width;
newUnpad[1] = newShape.height;
ratio[0] = static_cast<float>(newShape.width) / static_cast<float>(shape.width);
ratio[1] = static_cast<float>(newShape.height) / static_cast<float>(shape.height);
}
dw /= 2.0f;
dh /= 2.0f;
//cv::Mat outImage;
if (shape.width != newUnpad[0] || shape.height != newUnpad[1])
{
cv::resize(image, outImage, cv::Size(newUnpad[0], newUnpad[1]));
}
else
{
outImage = image.clone();
}
int top = static_cast<int>(std::round(dh - 0.1f));
int bottom = static_cast<int>(std::round(dh + 0.1f));
int left = static_cast<int>(std::round(dw - 0.1f));
int right = static_cast<int>(std::round(dw + 0.1f));
if (color == cv::Scalar()) {
color = cv::Scalar(DEFAULT_LETTERBOX_PAD_VALUE, DEFAULT_LETTERBOX_PAD_VALUE, DEFAULT_LETTERBOX_PAD_VALUE);
}
cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color);
}
cv::Mat scale_image(const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair<float, cv::Point2f>& ratio_pad) {
cv::Size im1_shape = resized_mask.size();
// Check if resizing is needed
if (im1_shape == im0_shape) {
return resized_mask.clone();
}
float gain, pad_x, pad_y;
if (ratio_pad.first < 0.0f) {
gain = std::min(static_cast<float>(im1_shape.height) / static_cast<float>(im0_shape.height),
static_cast<float>(im1_shape.width) / static_cast<float>(im0_shape.width));
pad_x = (im1_shape.width - im0_shape.width * gain) / 2.0f;
pad_y = (im1_shape.height - im0_shape.height * gain) / 2.0f;
}
else {
gain = ratio_pad.first;
pad_x = ratio_pad.second.x;
pad_y = ratio_pad.second.y;
}
int top = static_cast<int>(pad_y);
int left = static_cast<int>(pad_x);
int bottom = static_cast<int>(im1_shape.height - pad_y);
int right = static_cast<int>(im1_shape.width - pad_x);
// Clip and resize the mask
cv::Rect clipped_rect(left, top, right - left, bottom - top);
cv::Mat clipped_mask = resized_mask(clipped_rect);
cv::Mat scaled_mask;
cv::resize(clipped_mask, scaled_mask, im0_shape);
return scaled_mask;
}
void scale_image2(cv::Mat& scaled_mask, const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair<float, cv::Point2f>& ratio_pad) {
cv::Size im1_shape = resized_mask.size();
// Check if resizing is needed
if (im1_shape == im0_shape) {
scaled_mask = resized_mask.clone();
return;
}
float gain, pad_x, pad_y;
if (ratio_pad.first < 0.0f) {
gain = std::min(static_cast<float>(im1_shape.height) / static_cast<float>(im0_shape.height),
static_cast<float>(im1_shape.width) / static_cast<float>(im0_shape.width));
pad_x = (im1_shape.width - im0_shape.width * gain) / 2.0f;
pad_y = (im1_shape.height - im0_shape.height * gain) / 2.0f;
}
else {
gain = ratio_pad.first;
pad_x = ratio_pad.second.x;
pad_y = ratio_pad.second.y;
}
int top = static_cast<int>(pad_y);
int left = static_cast<int>(pad_x);
int bottom = static_cast<int>(im1_shape.height - pad_y);
int right = static_cast<int>(im1_shape.width - pad_x);
// Clip and resize the mask
cv::Rect clipped_rect(left, top, right - left, bottom - top);
cv::Mat clipped_mask = resized_mask(clipped_rect);
cv::resize(clipped_mask, scaled_mask, im0_shape);
}

@@ -0,0 +1,147 @@
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#include <string>
#include <sstream>
#include <codecvt>
#include "utils/common.h"
#include <chrono>
//#include <boost/algorithm/string.hpp>
#include <iostream>
#include <stdio.h>
#include <string>
#include <regex>
#include <vector>
Timer::Timer(double& accumulator, bool isEnabled)
: accumulator(accumulator), isEnabled(isEnabled) {
if (isEnabled) {
start = std::chrono::high_resolution_clock::now();
}
}
// Stop the timer and update the accumulator
void Timer::Stop() {
if (isEnabled) {
auto end = std::chrono::high_resolution_clock::now();
double duration = std::chrono::duration<double>(end - start).count();
accumulator += duration;
}
}
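Usage pattern for this Timer, as a sketch: the accumulator outlives the timer and collects elapsed seconds on each Stop() call.

#include <iostream>
#include "utils/common.h"

void timed_work() {
    double total_s = 0.0;             // accumulator outlives the timer
    Timer t(total_s, /*isEnabled=*/true);
    // ... timed work goes here ...
    t.Stop();                         // adds elapsed seconds to total_s
    std::cout << total_s * 1000.0 << " ms" << std::endl;
}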
// C++14 version
//#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
std::wstring get_win_path(const std::string& modelPath) {
#ifdef _WIN32
return std::wstring_convert<std::codecvt_utf8<wchar_t>>().from_bytes(modelPath);
#else
// return modelPath;
return std::wstring(modelPath.begin(), modelPath.end());
#endif
}
std::vector<std::string> parseVectorString(const std::string& input) {
/* The main purpose of this function is to parse the `imgsz` value of the model metadata
* and extract the height and width from a string like "[height, width]" as a vector of strings
* Args:
* input:
* expected to be something like [544, 960] or [3, 544, 960]
* output:
* iterable of strings representing integers
*/
std::regex number_pattern(R"(\d+)");
std::vector<std::string> result;
std::sregex_iterator it(input.begin(), input.end(), number_pattern);
std::sregex_iterator end;
while (it != end) {
result.push_back(it->str());
++it;
}
return result;
}
std::vector<int> convertStringVectorToInts(const std::vector<std::string>& input) {
std::vector<int> result;
for (const std::string& str : input) {
try {
int value = std::stoi(str);
result.push_back(value);
}
catch (const std::invalid_argument& e) {
// raise explicit exception
throw std::invalid_argument("Bad argument (cannot cast): value=" + str);
}
catch (const std::out_of_range& e) {
// check bounds
throw std::out_of_range("Value out of range: " + str);
}
}
return result;
}
/*
std::unordered_map<int, std::string> parseNames(const std::string& input) {
std::unordered_map<int, std::string> result;
std::string cleanedInput = input;
boost::erase_all(cleanedInput, "{");
boost::erase_all(cleanedInput, "}");
std::vector<std::string> elements;
boost::split(elements, cleanedInput, boost::is_any_of(","));
for (const std::string& element : elements) {
std::vector<std::string> keyValue;
boost::split(keyValue, element, boost::is_any_of(":"));
if (keyValue.size() == 2) {
int key = std::stoi(boost::trim_copy(keyValue[0]));
std::string value = boost::trim_copy(keyValue[1]);
result[key] = value;
}
}
return result;
}
*/
std::unordered_map<int, std::string> parseNames(const std::string& input) {
std::unordered_map<int, std::string> result;
std::string cleanedInput = input;
cleanedInput.erase(std::remove(cleanedInput.begin(), cleanedInput.end(), '{'), cleanedInput.end());
cleanedInput.erase(std::remove(cleanedInput.begin(), cleanedInput.end(), '}'), cleanedInput.end());
std::istringstream elementStream(cleanedInput);
std::string element;
while (std::getline(elementStream, element, ',')) {
std::istringstream keyValueStream(element);
std::string keyStr, value;
if (std::getline(keyValueStream, keyStr, ':') && std::getline(keyValueStream, value)) {
int key = std::stoi(keyStr);
result[key] = value;
}
}
return result;
}
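As a concrete (hypothetical) input, the Ultralytics `names` metadata looks like `{0: 'person', 1: 'bicycle'}`. Note that this parser keeps everything after the first ':' verbatim, so leading spaces and quotes survive in the value:

#include <cassert>
#include "utils/common.h"

int main() {
    auto names = parseNames("{0: 'person', 1: 'bicycle'}");
    // Whitespace and quotes are not trimmed from the value part:
    assert(names.at(0) == " 'person'");
    assert(names.at(1) == " 'bicycle'");
    return 0;
}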
int64_t vector_product(const std::vector<int64_t>& vec) {
int64_t result = 1;
for (int64_t value : vec) {
result *= value;
}
return result;
}

@@ -0,0 +1,217 @@
#pragma once
#include <opencv2/opencv.hpp>
//#include <opencv2/imgcodecs.hpp>
//#include <opencv2/imgproc.hpp>
#include <opencv2/core/mat.hpp>
#include <opencv2/core/types.hpp>
#include <opencv2/core.hpp>
#include <vector>
void clip_boxes(cv::Rect& box, const cv::Size& shape) {
box.x = std::max(0, std::min(box.x, shape.width));
box.y = std::max(0, std::min(box.y, shape.height));
box.width = std::max(0, std::min(box.width, shape.width - box.x));
box.height = std::max(0, std::min(box.height, shape.height - box.y));
}
void clip_boxes(cv::Rect_<float>& box, const cv::Size& shape) {
box.x = std::max(0.0f, std::min(box.x, static_cast<float>(shape.width)));
box.y = std::max(0.0f, std::min(box.y, static_cast<float>(shape.height)));
box.width = std::max(0.0f, std::min(box.width, static_cast<float>(shape.width - box.x)));
box.height = std::max(0.0f, std::min(box.height, static_cast<float>(shape.height - box.y)));
}
void clip_boxes(std::vector<cv::Rect>& boxes, const cv::Size& shape) {
for (cv::Rect& box : boxes) {
clip_boxes(box, shape);
}
}
void clip_boxes(std::vector<cv::Rect_<float>>& boxes, const cv::Size& shape) {
for (cv::Rect_<float>& box : boxes) {
clip_boxes(box, shape);
}
}
// source: ultralytics/utils/ops.py scale_boxes lines 99+ (ultralytics==8.0.160)
cv::Rect_<float> scale_boxes(const cv::Size& img1_shape, cv::Rect_<float>& box, const cv::Size& img0_shape,
std::pair<float, cv::Point2f> ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)), bool padding = true) {
float gain, pad_x, pad_y;
if (ratio_pad.first < 0.0f) {
gain = std::min(static_cast<float>(img1_shape.height) / static_cast<float>(img0_shape.height),
static_cast<float>(img1_shape.width) / static_cast<float>(img0_shape.width));
pad_x = roundf((img1_shape.width - img0_shape.width * gain) / 2.0f - 0.1f);
pad_y = roundf((img1_shape.height - img0_shape.height * gain) / 2.0f - 0.1f);
}
else {
gain = ratio_pad.first;
pad_x = ratio_pad.second.x;
pad_y = ratio_pad.second.y;
}
//cv::Rect scaledCoords(box);
cv::Rect_<float> scaledCoords(box);
if (padding) {
scaledCoords.x -= pad_x;
scaledCoords.y -= pad_y;
}
scaledCoords.x /= gain;
scaledCoords.y /= gain;
scaledCoords.width /= gain;
scaledCoords.height /= gain;
// Clip the box to the bounds of the image
clip_boxes(scaledCoords, img0_shape);
return scaledCoords;
}
//void clip_coords(cv::Mat& coords, const cv::Size& shape) {
// // Clip x-coordinates to the image width
// cv::Mat xCoords = coords.col(0);
// cv::Mat yCoords = coords.col(1);
//
// for (int i = 0; i < coords.rows; ++i) {
// xCoords.at<float>(i) = std::max(std::min(xCoords.at<float>(i), static_cast<float>(shape.width - 1)), 0.0f);
// yCoords.at<float>(i) = std::max(std::min(yCoords.at<float>(i), static_cast<float>(shape.height - 1)), 0.0f);
// }
//}
void clip_coords(std::vector<float>& coords, const cv::Size& shape) {
// Assuming coords are of shape [1, 17, 3]
for (int i = 0; i < coords.size(); i += 3) {
coords[i] = std::min(std::max(coords[i], 0.0f), static_cast<float>(shape.width - 1)); // x
coords[i + 1] = std::min(std::max(coords[i + 1], 0.0f), static_cast<float>(shape.height - 1)); // y
}
}
// source: ultralytics/utils/ops.py scale_coords lines 753+ (ultralytics==8.0.160)
//cv::Mat scale_coords(const cv::Size& img1_shape, cv::Mat& coords, const cv::Size& img0_shape)
//cv::Mat scale_coords(const cv::Size& img1_shape, std::vector<float> coords, const cv::Size& img0_shape)
std::vector<float> scale_coords(const cv::Size& img1_shape, std::vector<float>& coords, const cv::Size& img0_shape)
{
// cv::Mat scaledCoords = coords.clone();
std::vector<float> scaledCoords = coords;
// Calculate gain and padding
double gain = std::min(static_cast<double>(img1_shape.width) / img0_shape.width, static_cast<double>(img1_shape.height) / img0_shape.height);
cv::Point2d pad((img1_shape.width - img0_shape.width * gain) / 2, (img1_shape.height - img0_shape.height * gain) / 2);
// Apply padding
// scaledCoords.col(0) = (scaledCoords.col(0) - pad.x);
// scaledCoords.col(1) = (scaledCoords.col(1) - pad.y);
// Assuming coords are of shape [1, 17, 3]
for (int i = 0; i < scaledCoords.size(); i += 3) {
scaledCoords[i] -= pad.x; // x padding
scaledCoords[i + 1] -= pad.y; // y padding
}
// Scale coordinates
// scaledCoords.col(0) /= gain;
// scaledCoords.col(1) /= gain;
// Assuming coords are of shape [1, 17, 3]
for (int i = 0; i < scaledCoords.size(); i += 3) {
scaledCoords[i] /= gain;
scaledCoords[i + 1] /= gain;
}
clip_coords(scaledCoords, img0_shape);
return scaledCoords;
}
cv::Mat crop_mask(const cv::Mat& mask, const cv::Rect& box) {
int h = mask.rows;
int w = mask.cols;
int x1 = box.x;
int y1 = box.y;
int x2 = box.x + box.width;
int y2 = box.y + box.height;
cv::Mat cropped_mask = cv::Mat::zeros(h, w, mask.type());
for (int r = 0; r < h; ++r) {
for (int c = 0; c < w; ++c) {
if (r >= y1 && r < y2 && c >= x1 && c < x2) {
cropped_mask.at<float>(r, c) = mask.at<float>(r, c);
}
}
}
return cropped_mask;
}
//std::tuple<std::vector<cv::Rect_<float>>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
non_max_suppression(const cv::Mat& output0, int class_names_num, int data_width, double conf_threshold,
float iou_threshold) {
std::vector<int> class_ids;
std::vector<float> confidences;
// std::vector<cv::Rect_<float>> boxes;
std::vector<cv::Rect> boxes;
std::vector<std::vector<float>> rest;
int rest_start_pos = class_names_num + 4;
int rest_features = data_width - rest_start_pos;
// int data_width = rest_start_pos + total_features_num;
int rows = output0.rows;
float* pdata = (float*) output0.data;
for (int r = 0; r < rows; ++r) {
cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4);
cv::Point class_id;
double max_conf;
minMaxLoc(scores, nullptr, &max_conf, nullptr, &class_id);
if (max_conf > conf_threshold) {
class_ids.push_back(class_id.x);
confidences.push_back((float) max_conf);
float out_w = pdata[2];
float out_h = pdata[3];
float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0);
float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0);
cv::Rect_<float> bbox(out_left, out_top, (out_w + 0.5), (out_h + 0.5));
boxes.push_back(bbox);
if (rest_features > 0) {
std::vector<float> rest_data(pdata + rest_start_pos, pdata + data_width);
rest.push_back(rest_data);
}
}
pdata += data_width; // next prediction
}
//
//float masks_threshold = 0.50;
//int top_k = 500;
//const float& nmsde_eta = 1.0f;
std::vector<int> nms_result;
cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result); // , nms_eta, top_k);
// cv::dnn::NMSBoxes(boxes, confidences, );
std::vector<int> nms_class_ids;
std::vector<float> nms_confidences;
// std::vector<cv::Rect_<float>> boxes;
std::vector<cv::Rect> nms_boxes;
std::vector<std::vector<float>> nms_rest;
for (int idx: nms_result) {
nms_class_ids.push_back(class_ids[idx]);
nms_confidences.push_back(confidences[idx]);
nms_boxes.push_back(boxes[idx]);
if (!rest.empty()) { // rest is empty when the model has no extra features (plain detect)
nms_rest.push_back(rest[idx]);
}
}
return std::make_tuple(nms_boxes, nms_confidences, nms_class_ids, nms_rest);
}