commit 157639e9e3420c85915183897007303aa756028e
Author: Samuel Dobron
Date:   Sat Jul 13 13:15:28 2024 +0000

    init commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..429782b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,17 @@
+# ide related
+.vs/*
+.idea/*
+# build
+.build/*
+x64/*
+cmake-build-debug/*
+cmake-build-debug-visual-studio/*
+# ignore CMakeLists.txt but add example file
+CMakeLists.txt
+# dlls:
+*.dll
+# vs-like objects
+*.sln
+*vcxproj*
+packages.config
+packages/*
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..fa6e2fa
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,64 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 3.0.0)
+project(YOLOv8CPP)
+
+SET(CMAKE_CXX_STANDARD 17)
+SET(CMAKE_CXX_STANDARD_REQUIRED ON)
+# SET(OpenCV_DIR your/path/to/opencv/build/x64/vc16/lib)  # opencv lib root
+# SET(OpenCV_BIN_DIR your/path/to/opencv/build/x64/vc16/bin)  # opencv bin root
+
+# SET(OpenCV_DEBUG_DLL_FILENAME opencv_world480d.dll)  # change filenames
+# SET(OpenCV_RELEASE_DLL_FILENAME opencv_world480.dll)  # change filenames
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
+SET(ONNXRUNTIME_DIR /home/xdobro23/PP1/onnxruntime/onnxruntime-linux-x64-gpu-1.18.1)  # onnxruntime root
+
+FIND_PACKAGE(OpenCV REQUIRED)
+set(CMAKE_BUILD_TYPE Debug)
+# --- Configure your project files ---
+include_directories(include)  # Include your header files directory
+
+# Recursively collect all source files under 'src' directory
+file(GLOB_RECURSE CURR_SOURCES src/*.cpp)
+
+# Create the executable
+add_executable(YOLOv8CPP ${CURR_SOURCES})
+
+target_compile_features(YOLOv8CPP PRIVATE cxx_std_17)
+
+TARGET_INCLUDE_DIRECTORIES(YOLOv8CPP PRIVATE "${ONNXRUNTIME_DIR}/include")
+
+
+TARGET_LINK_LIBRARIES(YOLOv8CPP ${OpenCV_LIBS})
+
+if (WIN32)
+    TARGET_LINK_LIBRARIES(YOLOv8CPP "${ONNXRUNTIME_DIR}/lib/onnxruntime.lib")
+
+    # some changes to the original version:
+    # copy onnxruntime dll
+    add_custom_command(TARGET YOLOv8CPP POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        "${ONNXRUNTIME_DIR}/lib/onnxruntime.dll"
+        "$<TARGET_FILE_DIR:YOLOv8CPP>"
+    )
+    # copy opencv
+#[[  add_custom_command(TARGET YOLOv8CPP POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        "${OpenCV_DIR}/${OpenCV_DLL_FILENAME}"
+        "$<TARGET_FILE_DIR:YOLOv8CPP>"
+    )]]
+    add_custom_command(TARGET YOLOv8CPP POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        "${OpenCV_BIN_DIR}/${OpenCV_DEBUG_DLL_FILENAME}"
+        "$<TARGET_FILE_DIR:YOLOv8CPP>"
+    )
+    # add release
+    add_custom_command(TARGET YOLOv8CPP POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+        "${OpenCV_BIN_DIR}/${OpenCV_RELEASE_DLL_FILENAME}"
+        "$<TARGET_FILE_DIR:YOLOv8CPP>"
+    )
+
+endif(WIN32)
+
+if (UNIX)
+    TARGET_LINK_LIBRARIES(YOLOv8CPP "${ONNXRUNTIME_DIR}/lib/libonnxruntime.so")
+endif(UNIX)
diff --git a/include/constants.h b/include/constants.h
new file mode 100644
index 0000000..5f4f673
--- /dev/null
+++ b/include/constants.h
@@ -0,0 +1,34 @@
+#pragma once
+#include <string>
+
+namespace MetadataConstants {
+    inline const std::string IMGSZ = "imgsz";
+    inline const std::string STRIDE = "stride";
+    inline const std::string NC = "nc";
+    inline const std::string CH = "ch";
+    inline const std::string DATE = "date";
+    inline const std::string VERSION = "version";
+    inline const std::string TASK = "task";
+    inline const std::string BATCH = "batch";
+    inline const std::string NAMES = "names";
+}
+
+namespace OnnxProviders {
+    inline const std::string CPU = "cpu";
+    inline const std::string CUDA = "cuda";
+}
+
+namespace OnnxInitializers
+{
+    inline const int UNINITIALIZED_STRIDE = -1;
+    inline const int UNINITIALIZED_NC = -1;
+}
+
+
+namespace YoloTasks
+{
+    inline const std::string SEGMENT = "segment";
+    inline const std::string DETECT = "detect";
+    inline const std::string POSE = "pose";
+    inline const std::string CLASSIFY = "classify";
+}
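
These string constants exist so that postprocessing can be selected by comparing the model's exported metadata values instead of scattering hard-coded literals. A minimal sketch of that dispatch pattern, mirroring the branch in `predict_once` further down (the function name here is hypothetical, not part of this commit):

// Illustrative only: route to a postprocessing branch by task string.
#include <stdexcept>
#include <string>
#include "constants.h"

void dispatch_example(const std::string& task) {
    if (task == YoloTasks::SEGMENT)      { /* postprocess_masks(...) */ }
    else if (task == YoloTasks::DETECT)  { /* postprocess_detects(...) */ }
    else if (task == YoloTasks::POSE)    { /* postprocess_kpts(...) */ }
    else throw std::runtime_error("NotImplementedError: task: " + task);
}
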
diff --git a/include/nn/autobackend.h b/include/nn/autobackend.h
new file mode 100644
index 0000000..f7f0cf5
--- /dev/null
+++ b/include/nn/autobackend.h
@@ -0,0 +1,90 @@
+#pragma once
+#include <filesystem>
+#include <string>
+#include <vector>
+#include <opencv2/core.hpp>
+
+#include "onnx_model_base.h"
+#include "constants.h"
+
+/**
+ * @brief Represents the results of YOLO prediction.
+ *
+ * This structure stores information about a detected object, including its class index,
+ * confidence score, bounding box, semantic segmentation mask, and keypoints (if available).
+ */
+struct YoloResults {
+    int class_idx{};                 ///< The class index of the detected object.
+    float conf{};                    ///< The confidence score of the detection.
+    cv::Rect_<float> bbox;           ///< The bounding box of the detected object.
+    cv::Mat mask;                    ///< The semantic segmentation mask (if available).
+    std::vector<float> keypoints{};  ///< Keypoints representing the object's pose (if available).
+};
+
+struct ImageInfo {
+    cv::Size raw_size;  // add additional attrs if you need
+};
+
+
+class AutoBackendOnnx : public OnnxModelBase {
+public:
+    // constructors
+    AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider,
+                    const std::vector<int>& imgsz, const int& stride,
+                    const int& nc, std::unordered_map<int, std::string> names);
+
+    AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider);
+
+    // getters
+    virtual const std::vector<int>& getImgsz();
+    virtual const int& getStride();
+    virtual const int& getCh();
+    virtual const int& getNc();
+    virtual const std::unordered_map<int, std::string>& getNames();
+    virtual const std::vector<int64_t>& getInputTensorShape();
+    virtual const int& getWidth();
+    virtual const int& getHeight();
+    virtual const cv::Size& getCvSize();
+    virtual const std::string& getTask();
+    /**
+     * @brief Runs object detection on an input image.
+     *
+     * This method performs object detection on the input image and returns the detected objects as YoloResults.
+     *
+     * @param image The input image to run object detection on.
+     * @param conf The confidence threshold for object detection.
+     * @param iou The intersection-over-union (IoU) threshold for non-maximum suppression.
+     * @param mask_threshold The threshold for the semantic segmentation mask.
+     * @param conversionCode An optional conversion code for image format conversion (e.g., cv::COLOR_BGR2RGB).
+     *                       Default value is -1, indicating no conversion.
+     *                       TODO: use some constant from some namespace rather than hardcoded values here
+     *
+     * @return A vector of YoloResults representing the detected objects.
+     */
+    virtual std::vector<YoloResults> predict_once(cv::Mat& image, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
+    virtual std::vector<YoloResults> predict_once(const std::filesystem::path& imagePath, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
+    virtual std::vector<YoloResults> predict_once(const std::string& imagePath, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
+
+    virtual void fill_blob(cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
+    virtual void postprocess_masks(cv::Mat& output0, cv::Mat& output1, ImageInfo para, std::vector<YoloResults>& output,
+                                   int& class_names_num, float& conf_threshold, float& iou_threshold,
+                                   int& iw, int& ih, int& mw, int& mh, int& masks_features_num, float mask_threshold = 0.50f);
+
+    virtual void postprocess_detects(cv::Mat& output0, ImageInfo image_info, std::vector<YoloResults>& output,
+                                     int& class_names_num, float& conf_threshold, float& iou_threshold);
+    virtual void postprocess_kpts(cv::Mat& output0, ImageInfo& image_info, std::vector<YoloResults>& output,
+                                  int& class_names_num, float& conf_threshold, float& iou_threshold);
+    static void _get_mask2(const cv::Mat& mask_info, const cv::Mat& mask_data, const ImageInfo& image_info, cv::Rect bound, cv::Mat& mask_out,
+                           float& mask_thresh, int& iw, int& ih, int& mw, int& mh, int& masks_features_num, bool round_downsampled = false);
+
+protected:
+    std::vector<int> imgsz_;
+    int stride_ = OnnxInitializers::UNINITIALIZED_STRIDE;
+    int nc_ = OnnxInitializers::UNINITIALIZED_NC;
+    int ch_ = 3;
+    std::unordered_map<int, std::string> names_;
+    std::vector<int64_t> inputTensorShape_;
+    cv::Size cvSize_;
+    std::string task_;
+    //cv::MatSize cvMatSize_;
+};
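
This header is the whole user-facing surface of the backend. A minimal, illustrative call sequence (the model path, image path, and function name are placeholders; `src/main.cpp` below does the same thing with real arguments):

// Illustrative only: construct the backend and run one prediction.
#include <opencv2/imgcodecs.hpp>
#include "nn/autobackend.h"

int example() {
    AutoBackendOnnx model("yolov8n.onnx", "logid", "cpu");
    cv::Mat img = cv::imread("image.png", cv::IMREAD_UNCHANGED);
    float conf = 0.30f, iou = 0.45f, mask_thr = 0.50f;  // thresholds are lvalues: the API takes float&
    std::vector<YoloResults> objs = model.predict_once(img, conf, iou, mask_thr);
    for (const YoloResults& r : objs) {
        // r.class_idx, r.conf, r.bbox; r.mask / r.keypoints are filled when the task provides them
    }
    return static_cast<int>(objs.size());
}
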
diff --git a/include/nn/onnx_model_base.h b/include/nn/onnx_model_base.h
new file mode 100644
index 0000000..dcd8e20
--- /dev/null
+++ b/include/nn/onnx_model_base.h
@@ -0,0 +1,39 @@
+#pragma once
+#include <onnxruntime_cxx_api.h>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+/*
+ * This interface must provide only the arguments required to load any onnx model, regardless of model-specific info -
+ * i.e. modelPath will always be required, and so will a provider like "cpu" or "cuda", since these are the parameters needed
+ * to set up the `sessionOptions` and `session` objects properly; but image size is not needed for a pure onnx graph to be loaded, so do NOT include it here
+ */
+class OnnxModelBase {
+public:
+    OnnxModelBase(const char* modelPath, const char* logid, const char* provider);
+    //OnnxModelBase(); // no default constructor should be there
+    //virtual ~OnnxModelBase();
+    virtual const std::vector<std::string>& getInputNames();  // = 0
+    virtual const std::vector<std::string>& getOutputNames();
+    virtual const std::vector<const char*> getOutputNamesCStr();
+    virtual const std::vector<const char*> getInputNamesCStr();
+    virtual const Ort::ModelMetadata& getModelMetadata();
+    virtual const std::unordered_map<std::string, std::string>& getMetadata();
+    virtual const char* getModelPath();
+    virtual const Ort::Session& getSession();
+    //virtual std::vector<Ort::Value> forward(std::vector<Ort::Value> inputTensors);
+    virtual std::vector<Ort::Value> forward(std::vector<Ort::Value>& inputTensors);
+    Ort::Session session{ nullptr };
+
+protected:
+    const char* modelPath_;
+    Ort::Env env{ nullptr };
+
+    std::vector<std::string> inputNodeNames;
+    std::vector<std::string> outputNodeNames;
+    Ort::ModelMetadata model_metadata{ nullptr };
+    std::unordered_map<std::string, std::string> metadata;
+    std::vector<const char*> outputNamesCStr;
+    std::vector<const char*> inputNamesCStr;
+};
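
`getMetadata()` exposes the custom-metadata map that ultralytics export writes into the onnx file; the derived `AutoBackendOnnx` reads `imgsz`, `stride`, `names`, and `task` out of it. A minimal sketch of pulling one key out, assuming the model actually carries it (the function name is hypothetical):

// Illustrative only: read one custom-metadata key exported by ultralytics.
#include <iostream>
#include "nn/onnx_model_base.h"
#include "constants.h"

void print_task_example(OnnxModelBase& model) {
    const auto& meta = model.getMetadata();
    auto it = meta.find(MetadataConstants::TASK);
    if (it != meta.end())
        std::cout << "task: " << it->second << std::endl;  // e.g. "detect"
    else
        std::cerr << "model carries no 'task' metadata" << std::endl;
}
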
diff --git a/include/utils/augment.h b/include/utils/augment.h
new file mode 100644
index 0000000..c5e21d8
--- /dev/null
+++ b/include/utils/augment.h
@@ -0,0 +1,20 @@
+#pragma once
+#include <opencv2/opencv.hpp>
+
+void letterbox(const cv::Mat& image,
+               cv::Mat& outImage,
+               const cv::Size& newShape = cv::Size(640, 640),
+               cv::Scalar_<double> color = cv::Scalar(), bool auto_ = true,
+               bool scaleFill = false,
+               bool scaleUp = true,
+               int stride = 32
+);
+
+
+cv::Mat scale_image(const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair<float, cv::Point2f>& ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)));
+
+void scale_image2(
+    cv::Mat& scaled_mask, const cv::Mat& resized_mask, const cv::Size& im0_shape,
+    const std::pair<float, cv::Point2f>& ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f))
+);
diff --git a/include/utils/common.h b/include/utils/common.h
new file mode 100644
index 0000000..52d08a8
--- /dev/null
+++ b/include/utils/common.h
@@ -0,0 +1,26 @@
+#ifndef COMMON_UTILS_H
+#define COMMON_UTILS_H
+
+#include <chrono>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+
+class Timer {
+public:
+    Timer(double& accumulator, bool isEnabled = true);
+    void Stop();
+
+private:
+    double& accumulator;
+    bool isEnabled;
+    std::chrono::time_point<std::chrono::high_resolution_clock> start;
+};
+
+std::wstring get_win_path(const std::string& path);
+std::vector<std::string> parseVectorString(const std::string& input);
+std::vector<int> convertStringVectorToInts(const std::vector<std::string>& input);
+std::unordered_map<int, std::string> parseNames(const std::string& input);
+int64_t vector_product(const std::vector<int64_t>& vec);
+#endif // COMMON_UTILS_H
\ No newline at end of file
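
`Timer` is a scoped accumulator: construction records a start time and `Stop()` adds the elapsed seconds to the referenced double, which is how `predict_once` later reports per-stage speed. A minimal sketch of that pattern (the surrounding function is hypothetical):

// Illustrative only: accumulate elapsed seconds across repeated stages.
#include <iostream>
#include "utils/common.h"

void timing_example() {
    double preprocess_time = 0.0;
    for (int i = 0; i < 3; ++i) {
        Timer t(preprocess_time, /*isEnabled=*/true);
        // ... the work being measured ...
        t.Stop();  // adds this iteration's duration to preprocess_time
    }
    std::cout << preprocess_time * 1000.0 << "ms total" << std::endl;
}
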
diff --git a/include/utils/ops.h b/include/utils/ops.h
new file mode 100644
index 0000000..df4c035
--- /dev/null
+++ b/include/utils/ops.h
@@ -0,0 +1,44 @@
+#pragma once
+#include <opencv2/opencv.hpp>
+
+//cv::Rect scaleCoords(const cv::Size& imageShape, const cv::Rect& coords, const cv::Size& imageOriginalShape);
+/**
+ * Scales a bounding box from the shape of the input image to the shape of an original image.
+ *
+ * @param img1_shape The shape (height, width) of the input image for the model.
+ * @param box The bounding box to be scaled, specified as cv::Rect_<float>.
+ * @param img0_shape The shape (height, width) of the original target image.
+ * @param ratio_pad An optional parameter that specifies scaling and padding factors as a pair of values.
+ *                  The first value (ratio) is used for scaling, and the second value (pad) is used for padding.
+ *                  If not provided, default values will be used.
+ * @param padding An optional boolean parameter that specifies whether padding should be applied.
+ *                If set to true, padding will be applied to the bounding box.
+ *
+ * @return A scaled bounding box specified as cv::Rect_<float>.
+ *
+ * This function rescales a bounding box from the shape of the input image (img1_shape) to the shape of an original image (img0_shape).
+ */
+cv::Rect_<float> scale_boxes(const cv::Size& img1_shape, cv::Rect_<float>& box, const cv::Size& img0_shape, std::pair<float, cv::Point2f> ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)), bool padding = true);
+void clip_boxes(cv::Rect& box, const cv::Size& shape);
+void clip_boxes(cv::Rect_<float>& box, const cv::Size& shape);
+void clip_boxes(std::vector<cv::Rect>& boxes, const cv::Size& shape);
+void clip_boxes(std::vector<cv::Rect_<float>>& boxes, const cv::Size& shape);
+
+//void clip_coords(cv::Mat& coords, const cv::Size& shape);
+//cv::Mat scale_coords(const cv::Size& img1_shape, cv::Mat& coords, const cv::Size& img0_shape);
+void clip_coords(std::vector<float>& coords, const cv::Size& shape);
+std::vector<float> scale_coords(const cv::Size& img1_shape, std::vector<float>& coords, const cv::Size& img0_shape);
+
+cv::Mat crop_mask(const cv::Mat& mask, const cv::Rect& box);
+
+
+struct NMSResult {
+    std::vector<cv::Rect> bboxes;
+    std::vector<float> confidences;
+    std::vector<int> classes;
+    std::vector<std::vector<float>> rest;
+};
+
+//std::tuple<std::vector<cv::Rect_<float>>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
+std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
+non_max_suppression(const cv::Mat& output0, int class_names_num, int total_features_num, double conf_threshold, float iou_threshold);
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
new file mode 100644
index 0000000..f2efa3e
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,359 @@
+#include <iostream>
+
+#include <filesystem>
+#include "nn/onnx_model_base.h"
+#include "nn/autobackend.h"
+#include <opencv2/opencv.hpp>
+#include <random>
+
+#include "utils/augment.h"
+#include "constants.h"
+#include "utils/common.h"
+
+
+namespace fs = std::filesystem;
+
+
+// Define the skeleton and color mappings
+std::vector<std::vector<int>> skeleton = {{16, 14}, {14, 12}, {17, 15}, {15, 13}, {12, 13}, {6, 12}, {7, 13}, {6, 7},
+                                          {6, 8}, {7, 9}, {8, 10}, {9, 11}, {2, 3}, {1, 2}, {1, 3}, {2, 4}, {3, 5}, {4, 6}, {5, 7}};
+
+std::vector<cv::Scalar> posePalette = {
+    cv::Scalar(255, 128, 0), cv::Scalar(255, 153, 51), cv::Scalar(255, 178, 102), cv::Scalar(230, 230, 0), cv::Scalar(255, 153, 255),
+    cv::Scalar(153, 204, 255), cv::Scalar(255, 102, 255), cv::Scalar(255, 51, 255), cv::Scalar(102, 178, 255), cv::Scalar(51, 153, 255),
+    cv::Scalar(255, 153, 153), cv::Scalar(255, 102, 102), cv::Scalar(255, 51, 51), cv::Scalar(153, 255, 153), cv::Scalar(102, 255, 102),
+    cv::Scalar(51, 255, 51), cv::Scalar(0, 255, 0), cv::Scalar(0, 0, 255), cv::Scalar(255, 0, 0), cv::Scalar(255, 255, 255)
+};
+
+std::vector<int> limbColorIndices = {9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16};
+std::vector<int> kptColorIndices = {16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9};
+
+
+
+cv::Scalar generateRandomColor(int numChannels) {
+    if (numChannels < 1 || numChannels > 3) {
+        throw std::invalid_argument("Invalid number of channels.
Must be between 1 and 3."); + } + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution dis(0, 255); + + cv::Scalar color; + for (int i = 0; i < numChannels; i++) { + color[i] = dis(gen); // for each channel separately generate value + } + + return color; +} + +std::vector generateRandomColors(int class_names_num, int numChannels) { + std::vector colors; + for (int i = 0; i < class_names_num; i++) { + cv::Scalar color = generateRandomColor(numChannels); + colors.push_back(color); + } + return colors; +} + +void plot_masks(cv::Mat img, std::vector& result, std::vector color, + std::unordered_map& names) +{ + cv::Mat mask = img.clone(); + for (int i = 0; i < result.size(); i++) + { + float left, top; + left = result[i].bbox.x; + top = result[i].bbox.y; + int color_num = i; + int& class_idx = result[i].class_idx; + rectangle(img, result[i].bbox, color[result[i].class_idx], 2); + + // try to get string value corresponding to given class_idx + std::string class_name; + auto it = names.find(class_idx); + if (it != names.end()) { + class_name = it->second; + } + else { + std::cerr << "Warning: class_idx not found in names for class_idx = " << class_idx << std::endl; + // then convert it to string anyway + class_name = std::to_string(class_idx); + } + + if (result[i].mask.rows && result[i].mask.cols > 0) + { + mask(result[i].bbox).setTo(color[result[i].class_idx], result[i].mask); + } + std::stringstream labelStream; + labelStream << class_name << " " << std::fixed << std::setprecision(2) << result[i].conf; + std::string label = labelStream.str(); + + cv::Size text_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.6, 2, nullptr); + cv::Rect rect_to_fill(left - 1, top - text_size.height - 5, text_size.width + 2, text_size.height + 5); + cv::Scalar text_color = cv::Scalar(255.0, 255.0, 255.0); + rectangle(img, rect_to_fill, color[result[i].class_idx], -1); + + putText(img, label, cv::Point(left - 1.5, top - 2.5), cv::FONT_HERSHEY_SIMPLEX, 0.6, text_color, 2); + } + addWeighted(img, 0.6, mask, 0.4, 0, img); //add mask to src + resize(img, img, img.size()); + imshow("img", img); + cv::waitKey(); +} + + +//void plot_keypoints(cv::Mat& image, const std::vector>& keypoints, const cv::Size& shape) { +void plot_keypoints(cv::Mat& image, const std::vector& results, const cv::Size& shape) { + + int radius = 5; + bool drawLines = true; + + if (results.empty()) { + return; + } + + std::vector limbColorPalette; + std::vector kptColorPalette; + + for (int index : limbColorIndices) { + limbColorPalette.push_back(posePalette[index]); + } + + for (int index : kptColorIndices) { + kptColorPalette.push_back(posePalette[index]); + } + + for (const auto& res: results) { + auto keypoint = res.keypoints; + bool isPose = keypoint.size() == 51; // numKeypoints == 17 && keypoints[0].size() == 3; + drawLines &= isPose; + + // draw points + for (int i = 0; i < 17; i++) { + int idx = i * 3; + int x_coord = static_cast(keypoint[idx]); + int y_coord = static_cast(keypoint[idx + 1]); + + if (x_coord % shape.width != 0 && y_coord % shape.height != 0) { + if (keypoint.size() == 3) { + float conf = keypoint[2]; + if (conf < 0.5) { + continue; + } + } + cv::Scalar color_k = isPose ? 
kptColorPalette[i] : cv::Scalar(0, 0, + 255); // Default to red if not in pose mode + cv::circle(image, cv::Point(x_coord, y_coord), radius, color_k, -1, cv::LINE_AA); + } + } + // draw lines + if (drawLines) { + for (int i = 0; i < skeleton.size(); i++) { + const std::vector &sk = skeleton[i]; + int idx1 = sk[0] - 1; + int idx2 = sk[1] - 1; + + int idx1_x_pos = idx1 * 3; + int idx2_x_pos = idx2 * 3; + + int x1 = static_cast(keypoint[idx1_x_pos]); + int y1 = static_cast(keypoint[idx1_x_pos + 1]); + int x2 = static_cast(keypoint[idx2_x_pos]); + int y2 = static_cast(keypoint[idx2_x_pos + 1]); + + float conf1 = keypoint[idx1_x_pos + 2]; + float conf2 = keypoint[idx2_x_pos + 2]; + + // Check confidence thresholds + if (conf1 < 0.5 || conf2 < 0.5) { + continue; + } + + // Check if positions are within bounds + if (x1 % shape.width == 0 || y1 % shape.height == 0 || x1 < 0 || y1 < 0 || + x2 % shape.width == 0 || y2 % shape.height == 0 || x2 < 0 || y2 < 0) { + continue; + } + + // Draw a line between keypoints + cv::Scalar color_limb = limbColorPalette[i]; + cv::line(image, cv::Point(x1, y1), cv::Point(x2, y2), color_limb, 2, cv::LINE_AA); + } + } + } +} + +void plot_results(cv::Mat img, std::vector& results, + std::vector color, std::unordered_map& names, + const cv::Size& shape + ) { + + cv::Mat mask = img.clone(); + + int radius = 5; + bool drawLines = true; + + auto raw_image_shape = img.size(); + std::vector limbColorPalette; + std::vector kptColorPalette; + + for (int index : limbColorIndices) { + limbColorPalette.push_back(posePalette[index]); + } + + for (int index : kptColorIndices) { + kptColorPalette.push_back(posePalette[index]); + } + + for (const auto& res : results) { + float left = res.bbox.x; + float top = res.bbox.y; + int color_num = res.class_idx; + + // Draw bounding box + rectangle(img, res.bbox, color[res.class_idx], 2); + + // Try to get the class name corresponding to the given class_idx + std::string class_name; + auto it = names.find(res.class_idx); + if (it != names.end()) { + class_name = it->second; + } + else { + std::cerr << "Warning: class_idx not found in names for class_idx = " << res.class_idx << std::endl; + // Then convert it to a string anyway + class_name = std::to_string(res.class_idx); + } + + // Draw mask if available + if (res.mask.rows && res.mask.cols > 0) { + mask(res.bbox).setTo(color[res.class_idx], res.mask); + } + + // Create label + std::stringstream labelStream; + labelStream << class_name << " " << std::fixed << std::setprecision(2) << res.conf; + std::string label = labelStream.str(); + + cv::Size text_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.6, 2, nullptr); + cv::Rect rect_to_fill(left - 1, top - text_size.height - 5, text_size.width + 2, text_size.height + 5); + cv::Scalar text_color = cv::Scalar(255.0, 255.0, 255.0); + rectangle(img, rect_to_fill, color[res.class_idx], -1); + putText(img, label, cv::Point(left - 1.5, top - 2.5), cv::FONT_HERSHEY_SIMPLEX, 0.6, text_color, 2); + + // Check if keypoints are available + if (!res.keypoints.empty()) { + auto keypoint = res.keypoints; + bool isPose = keypoint.size() == 51; // numKeypoints == 17 && keypoints[0].size() == 3; + drawLines &= isPose; + + // draw points + for (int i = 0; i < 17; i++) { + int idx = i * 3; + int x_coord = static_cast(keypoint[idx]); + int y_coord = static_cast(keypoint[idx + 1]); + + if (x_coord % raw_image_shape.width != 0 && y_coord % raw_image_shape.height != 0) { + if (keypoint.size() == 3) { + float conf = keypoint[2]; + if (conf < 0.5) { + continue; 
+ } + } + cv::Scalar color_k = isPose ? kptColorPalette[i] : cv::Scalar(0, 0, + 255); // Default to red if not in pose mode + cv::circle(img, cv::Point(x_coord, y_coord), radius, color_k, -1, cv::LINE_AA); + } + } + // draw lines + if (drawLines) { + for (int i = 0; i < skeleton.size(); i++) { + const std::vector &sk = skeleton[i]; + int idx1 = sk[0] - 1; + int idx2 = sk[1] - 1; + + int idx1_x_pos = idx1 * 3; + int idx2_x_pos = idx2 * 3; + + int x1 = static_cast(keypoint[idx1_x_pos]); + int y1 = static_cast(keypoint[idx1_x_pos + 1]); + int x2 = static_cast(keypoint[idx2_x_pos]); + int y2 = static_cast(keypoint[idx2_x_pos + 1]); + + float conf1 = keypoint[idx1_x_pos + 2]; + float conf2 = keypoint[idx2_x_pos + 2]; + + // Check confidence thresholds + if (conf1 < 0.5 || conf2 < 0.5) { + continue; + } + + // Check if positions are within bounds + if (x1 % raw_image_shape.width == 0 || y1 % raw_image_shape.height == 0 || x1 < 0 || y1 < 0 || + x2 % raw_image_shape.width == 0 || y2 % raw_image_shape.height == 0 || x2 < 0 || y2 < 0) { + continue; + } + + // Draw a line between keypoints + cv::Scalar color_limb = limbColorPalette[i]; + cv::line(img, cv::Point(x1, y1), cv::Point(x2, y2), color_limb, 2, cv::LINE_AA); + } + } + } + } + + // Combine the image and mask + addWeighted(img, 0.6, mask, 0.4, 0, img); +// resize(img, img, img.size()); +// resize(img, img, shape); +// // Show the image +// imshow("img", img); +// cv::waitKey(); +} + + + +int main() +{ + std::string img_path = "/home/xdobro23/path_to_16bit_3_channel_image.png"; + const std::string& modelPath = "/home/xdobro23/PP1/ultralytics/runs/detect/train35/weights/best.onnx"; // pose + + fs::path imageFilePath(img_path); + fs::path newFilePath = imageFilePath.stem(); + newFilePath += "-kpt-cpp"; + newFilePath += imageFilePath.extension(); + assert(newFilePath != imageFilePath); + std::cout << "newFilePath: " << newFilePath << std::endl; + + const std::string& onnx_provider = OnnxProviders::CPU; // "cpu"; + const std::string& onnx_logid = "yolov8_inference2"; + float mask_threshold = 0.5f; // in python it's 0.5 and you can see that at ultralytics/utils/ops.process_mask line 705 (ultralytics.__version__ == .160) + float conf_threshold = 0.30f; + float iou_threshold = 0.45f; // 0.70f; + int conversion_code = -1; + cv::Mat img = cv::imread(img_path, cv::IMREAD_UNCHANGED); + if (img.empty()) { + std::cerr << "Error: Unable to load image" << std::endl; + return 1; + } + AutoBackendOnnx model(modelPath.c_str(), onnx_logid.c_str(), onnx_provider.c_str()); + std::cout << "Model loaded" << std::endl; + std::vector objs = model.predict_once(img, conf_threshold, iou_threshold, mask_threshold); + std::cout << "Got results" << std::endl; + std::vector colors = generateRandomColors(model.getNc(), model.getCh()); + std::unordered_map names = model.getNames(); + + std::vector> keypointsVector; + for (const YoloResults& result : objs) { + keypointsVector.push_back(result.keypoints); + } + + cv::cvtColor(img, img, cv::COLOR_RGB2BGR); + cv::Size show_shape = img.size(); // cv::Size(1280, 720); // img.size() + plot_results(img, objs, colors, names, show_shape); +// plot_masks(img, objs, colors, names); + cv::imshow("img", img); + cv::waitKey(); + return -1; +} diff --git a/src/nn/autobackend.cpp b/src/nn/autobackend.cpp new file mode 100644 index 0000000..da48ae3 --- /dev/null +++ b/src/nn/autobackend.cpp @@ -0,0 +1,520 @@ +#pragma once + +#include "nn/autobackend.h" + +#include +#include +#include + +#include +#include +#include +#include + +#include 
"utils/augment.h" +#include "constants.h" +#include "utils/common.h" +#include "utils/ops.h" + + +namespace fs = std::filesystem; + + +AutoBackendOnnx::AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider, + const std::vector& imgsz, const int& stride, + const int& nc, const std::unordered_map names) + : OnnxModelBase(modelPath, logid, provider), imgsz_(imgsz), stride_(stride), nc_(nc), names_(names), + inputTensorShape_() +{ +} + +AutoBackendOnnx::AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider) + : OnnxModelBase(modelPath, logid, provider) { + // init metadata etc + OnnxModelBase(modelPath, logid, provider); + // then try to get additional info from metadata like imgsz, stride etc; + // ideally you should get all of them but you'll raise error if smth is not in metadata (or not under the appropriate keys) + const std::unordered_map& base_metadata = OnnxModelBase::getMetadata(); + + // post init imgsz + auto imgsz_iterator = base_metadata.find(MetadataConstants::IMGSZ); + if (imgsz_iterator != base_metadata.end()) { + // parse it and convert to int iterable + std::vector imgsz = convertStringVectorToInts(parseVectorString(imgsz_iterator->second)); + // set it here: + if (imgsz_.empty()) { + imgsz_ = imgsz; + } + } + else { + std::cerr << "Warning: Cannot get imgsz value from metadata" << std::endl; + } + + // post init stride + auto stride_item = base_metadata.find(MetadataConstants::STRIDE); + if (stride_item != base_metadata.end()) { + // parse it and convert to int iterable + int stide_int = std::stoi(stride_item->second); + // set it here: + if (stride_ == OnnxInitializers::UNINITIALIZED_STRIDE) { + stride_ = stide_int; + } + } + else { + std::cerr << "Warning: Cannot get stride value from metadata" << std::endl; + } + + // post init names + auto names_item = base_metadata.find(MetadataConstants::NAMES); + if (names_item != base_metadata.end()) { + // parse it and convert to int iterable + std::unordered_map names = parseNames(names_item->second); + std::cout << "***Names from metadata***" << std::endl; + for (const auto& pair : names) { + std::cout << "Key: " << pair.first << ", Value: " << pair.second << std::endl; + } + // set it here: + if (names_.empty()) { + names_ = names; + } + } + else { + std::cerr << "Warning: Cannot get names value from metadata" << std::endl; + } + + // post init number of classes - you can do that only and only if names_ is not empty and nc was not initialized previously + if (nc_ == OnnxInitializers::UNINITIALIZED_NC && !names_.empty()) { + nc_ = names_.size(); + } + else { + std::cerr << "Warning: Cannot get nc value from metadata (probably names wasn't set)" << std::endl; + } + + if (!imgsz_.empty() && inputTensorShape_.empty()) + { + inputTensorShape_ = { 1, ch_, getHeight(), getWidth() }; + } + + if (!imgsz_.empty()) + { + // Initialize cvSize_ using getHeight() and getWidth() + //cvSize_ = cv::MatSize() + cvSize_ = cv::Size(getWidth(), getHeight()); + //cvMatSize_ = cv::MatSize(cvSize_.width, cvSize_.height); + } + + // task init: + auto task_item = base_metadata.find(MetadataConstants::TASK); + if (task_item != base_metadata.end()) { + // parse it and convert to int iterable + std::string task = std::string(task_item->second); + // set it here: + if (task_.empty()) + { + task_ = task; + } + } + else { + std::cerr << "Warning: Cannot get task value from metadata" << std::endl; + } + + // TODO: raise assert if imgsz_ and task_ were not initialized (since you don't know in that case which 
postprocessing to use) + +} + + + +const std::vector& AutoBackendOnnx::getImgsz() { + return imgsz_; +} + +const int& AutoBackendOnnx::getHeight() +{ + return imgsz_[0]; +} + +const int& AutoBackendOnnx::getWidth() +{ + return imgsz_[1]; +} + +const int& AutoBackendOnnx::getStride() { + return stride_; +} + +const int& AutoBackendOnnx::getCh() { + return ch_; +} + +const int& AutoBackendOnnx::getNc() { + return nc_; +} + +const std::unordered_map& AutoBackendOnnx::getNames() { + return names_; +} + + +const cv::Size& AutoBackendOnnx::getCvSize() +{ + return cvSize_; +} + +const std::vector& AutoBackendOnnx::getInputTensorShape() +{ + return inputTensorShape_; +} + +const std::string& AutoBackendOnnx::getTask() +{ + return task_; +} + +std::vector AutoBackendOnnx::predict_once(const std::string& imagePath, float& conf, float& iou, float& mask_threshold, + int conversionCode, bool verbose) { + // Convert the string imagePath to an object of type std::filesystem::path + fs::path imageFilePath(imagePath); + // Call the predict_once method, converting the image to a cv::Mat + return predict_once(imageFilePath, conf, iou, mask_threshold, conversionCode); +} + +std::vector AutoBackendOnnx::predict_once(const fs::path& imagePath, float& conf, float& iou, float& mask_threshold, + int conversionCode, bool verbose) { + // Check if the specified path exists + if (!fs::exists(imagePath)) { + std::cerr << "Error: File does not exist: " << imagePath << std::endl; + // Return an empty vector or throw an exception, depending on your logic + return {}; + } + + // Load the image into a cv::Mat + cv::Mat image = cv::imread(imagePath.string(), cv::IMREAD_UNCHANGED); + + // Check if loading the image was successful + if (image.empty()) { + std::cerr << "Error: Failed to load image: " << imagePath << std::endl; + // Return an empty vector or throw an exception, depending on your logic + return {}; + } + + // now do some preprocessing based on channels info: + int required_image_channels = this->getCh(); + /*assert(required_image_channels == image.channels() && "");*/ + // Assert that the number of channels in the input image matches the required number of channels for the model + if (required_image_channels != image.channels()) { + const std::string& errorMessage = "Error: Number of image channels does not match the required channels.\n" + "Number of channels in the image: " + std::to_string(image.channels()); + throw std::runtime_error(errorMessage); + } + + // Call overloaded one + return predict_once(image, conf, iou, mask_threshold, conversionCode); +} + + +std::vector AutoBackendOnnx::predict_once(cv::Mat& image, float& conf, float& iou, float& mask_threshold, int conversionCode, bool verbose) { + double preprocess_time = 0.0; + double inference_time = 0.0; + double postprocess_time = 0.0; + Timer preprocess_timer = Timer(preprocess_time, verbose); + // 1. 
preprocess + float* blob = nullptr; + //double* blob = nullptr; + std::vector inputTensors; + if (conversionCode >= 0) { + cv::cvtColor(image, image, conversionCode); + } + std::vector inputTensorShape; + // TODO: for classify task preprocessed image will be different (!): + cv::Mat preprocessed_img; + cv::Size new_shape = cv::Size(getWidth(), getHeight()); + const bool& scaleFill = false; // false + const bool& auto_ = false; // true + letterbox(image, preprocessed_img, new_shape, cv::Scalar(), auto_, scaleFill, true, getStride()); + fill_blob(preprocessed_img, blob, inputTensorShape); + int64_t inputTensorSize = vector_product(inputTensorShape); + std::vector inputTensorValues(blob, blob + inputTensorSize); + + Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu( + OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); + + inputTensors.push_back(Ort::Value::CreateTensor( + memoryInfo, inputTensorValues.data(), inputTensorSize, + inputTensorShape.data(), inputTensorShape.size() + )); + preprocess_timer.Stop(); + Timer inference_timer = Timer(inference_time, verbose); + // 2. inference + std::vector outputTensors = forward(inputTensors); + inference_timer.Stop(); + Timer postprocess_timer = Timer(postprocess_time, verbose); + // create container for the results + std::vector results; + // 3. postprocess based on task: + std::unordered_map names = this->getNames(); + // 4. cleanup blob since it was created using the "new" keyword during the `fill_blob` func call + delete[] blob; + + int class_names_num = names.size(); + if (task_ == YoloTasks::SEGMENT) { + + // get outputs info + std::vector outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); + std::vector outputTensor1Shape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape(); + // get outputs + float* all_data0 = outputTensors[0].GetTensorMutableData(); + + cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t(); // [bs, features, preds_num]=>[bs, preds_num, features] + auto mask_shape = outputTensor1Shape; + std::vector mask_sz = { 1,(int)mask_shape[1],(int)mask_shape[2],(int)mask_shape[3] }; + cv::Mat output1 = cv::Mat(mask_sz, CV_32F, outputTensors[1].GetTensorMutableData()); + + int iw = this->getWidth(); + int ih = this->getHeight(); + int mask_features_num = outputTensor1Shape[1]; + int mh = outputTensor1Shape[2]; + int mw = outputTensor1Shape[3]; + ImageInfo img_info = { image.size() }; + postprocess_masks(output0, output1, img_info, results, class_names_num, conf, iou, + iw, ih, mw, mh, mask_features_num, mask_threshold); + } + else if (task_ == YoloTasks::DETECT) { + ImageInfo img_info = { image.size() }; + std::vector outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); + float* all_data0 = outputTensors[0].GetTensorMutableData(); + cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t(); // [bs, features, preds_num]=>[bs, preds_num, features] + postprocess_detects(output0, img_info, results, class_names_num, conf, iou); + } + else if (task_ == YoloTasks::POSE) { + ImageInfo image_info = { image.size() }; + std::vector outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); + float* all_data0 = outputTensors[0].GetTensorMutableData(); + cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t(); // [bs, features, preds_num]=>[bs, preds_num, features] + 
postprocess_kpts(output0, image_info, results, class_names_num, conf, iou); + } + else { + throw std::runtime_error("NotImplementedError: task: " + task_); + } + + postprocess_timer.Stop(); + if (verbose) { + std::cout << std::fixed << std::setprecision(1); + std::cout << "image: " << preprocessed_img.rows << "x" << preprocessed_img.cols << " " << results.size() << " objs, "; + std::cout << (preprocess_time + inference_time + postprocess_time) * 1000.0 << "ms" << std::endl; + std::cout << "Speed: " << (preprocess_time * 1000.0) << "ms preprocess, "; + std::cout << (inference_time * 1000.0) << "ms inference, "; + std::cout << (postprocess_time * 1000.0) << "ms postprocess per image "; + std::cout << "at shape (1, " << image.channels() << ", " << preprocessed_img.rows << ", " << preprocessed_img.cols << ")" << std::endl; + } + + return results; +} + + +void AutoBackendOnnx::postprocess_masks(cv::Mat& output0, cv::Mat& output1, ImageInfo image_info, std::vector& output, + int& class_names_num, float& conf_threshold, float& iou_threshold, + int& iw, int& ih, int& mw, int& mh, int& masks_features_num, float mask_threshold /* = 0.5f */) +{ + output.clear(); + std::vector class_ids; + std::vector confidences; + std::vector boxes; + std::vector> masks; + // 4 - your default number of rect parameters {x, y, w, h} + int data_width = class_names_num + 4 + masks_features_num; + int rows = output0.rows; + float* pdata = (float*)output0.data; + for (int r = 0; r < rows; ++r) + { + cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4); + cv::Point class_id; + double max_conf; + minMaxLoc(scores, 0, &max_conf, 0, &class_id); + if (max_conf > conf_threshold) + { + masks.push_back(std::vector(pdata + 4 + class_names_num, pdata + data_width)); + class_ids.push_back(class_id.x); + confidences.push_back(max_conf); + + float out_w = pdata[2]; + float out_h = pdata[3]; + float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0); + float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0); + cv::Rect_ bbox = cv::Rect(out_left, out_top, (out_w + 0.5), (out_h + 0.5)); + cv::Rect_ scaled_bbox = scale_boxes(getCvSize(), bbox, image_info.raw_size); + boxes.push_back(scaled_bbox); + } + pdata += data_width; // next pred + } + + // + //float masks_threshold = 0.50; + //int top_k = 500; + //const float& nmsde_eta = 1.0f; + std::vector nms_result; + cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result); // , nms_eta, top_k); + + // select all of the protos tensor + cv::Size downsampled_size = cv::Size(mw, mh); + std::vector roi_rangs = { cv::Range(0, 1), cv::Range::all(), + cv::Range(0, downsampled_size.height), cv::Range(0, downsampled_size.width) }; + cv::Mat temp_mask = output1(roi_rangs).clone(); + cv::Mat proto = temp_mask.reshape(0, { masks_features_num, downsampled_size.width * downsampled_size.height }); + + for (int i = 0; i < nms_result.size(); ++i) + { + int idx = nms_result[i]; + boxes[idx] = boxes[idx] & cv::Rect(0, 0, image_info.raw_size.width, image_info.raw_size.height); + YoloResults result = { class_ids[idx] ,confidences[idx] ,boxes[idx] }; + _get_mask2(cv::Mat(masks[idx]).t(), proto, image_info, boxes[idx], result.mask, mask_threshold, + iw, ih, mw, mh, masks_features_num); + output.push_back(result); + } +} + + +void AutoBackendOnnx::postprocess_detects(cv::Mat& output0, ImageInfo image_info, std::vector& output, + int& class_names_num, float& conf_threshold, float& iou_threshold) +{ + output.clear(); + std::vector class_ids; + std::vector confidences; + std::vector boxes; + 
std::vector<std::vector<float>> masks;
+    // 4 - your default number of rect parameters {x, y, w, h}
+    int data_width = class_names_num + 4;
+    int rows = output0.rows;
+    float* pdata = (float*)output0.data;
+
+    for (int r = 0; r < rows; ++r)
+    {
+        cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4);
+        cv::Point class_id;
+        double max_conf;
+        minMaxLoc(scores, nullptr, &max_conf, nullptr, &class_id);
+
+        if (max_conf > conf_threshold)
+        {
+            masks.emplace_back(pdata + 4 + class_names_num, pdata + data_width);
+            class_ids.push_back(class_id.x);
+            confidences.push_back((float) max_conf);
+
+            float out_w = pdata[2];
+            float out_h = pdata[3];
+            float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0);
+            float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0);
+
+            cv::Rect_<float> bbox = cv::Rect_<float>(out_left, out_top, (out_w + 0.5), (out_h + 0.5));
+            cv::Rect_<float> scaled_bbox = scale_boxes(getCvSize(), bbox, image_info.raw_size);
+
+            boxes.push_back(scaled_bbox);
+        }
+        pdata += data_width;  // next pred
+    }
+
+    std::vector<int> nms_result;
+    cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result);  // , nms_eta, top_k);
+    for (int idx : nms_result)
+    {
+        boxes[idx] = boxes[idx] & cv::Rect(0, 0, image_info.raw_size.width, image_info.raw_size.height);
+        YoloResults result = { class_ids[idx], confidences[idx], boxes[idx] };
+        output.push_back(result);
+    }
+}
+
+void AutoBackendOnnx::postprocess_kpts(cv::Mat& output0, ImageInfo& image_info, std::vector<YoloResults>& output,
+                                       int& class_names_num, float& conf_threshold, float& iou_threshold)
+{
+    std::vector<cv::Rect> boxes;
+    std::vector<float> confidences;
+    std::vector<int> class_ids;
+    std::vector<std::vector<float>> rest;
+    std::tie(boxes, confidences, class_ids, rest) = non_max_suppression(output0, class_names_num, output0.cols, conf_threshold, iou_threshold);
+    cv::Size img1_shape = getCvSize();
+    auto bound_bbox = cv::Rect_<float>(0, 0, image_info.raw_size.width, image_info.raw_size.height);
+    for (int i = 0; i < boxes.size(); i++) {
+        // pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], shape).round()
+        // pred_kpts = pred[:, 6:].view(len(pred), *self.model.kpt_shape) if len(pred) else pred[:, 6:]
+        // pred_kpts = ops.scale_coords(img.shape[2:], pred_kpts, shape)
+        // path = self.batch[0]
+        // img_path = path[i] if isinstance(path, list) else path
+        // results.append(
+        //     Results(orig_img=orig_img,
+        //             path=img_path,
+        //             names=self.model.names,
+        //             boxes=pred[:, :6],
+        //             keypoints=pred_kpts))
+        cv::Rect_<float> bbox = boxes[i];
+        auto scaled_bbox = scale_boxes(img1_shape, bbox, image_info.raw_size);
+        scaled_bbox = scaled_bbox & bound_bbox;
+//        cv::Mat kpt = cv::Mat(rest[i]).t();
+//        scale_coords(img1_shape, kpt, image_info.raw_size);
+        // TODO: overload scale_coords so that it will accept cv::Mat of shape [17, 3]
+        //  so that it will be more similar to what we have in python
+        std::vector<float> kpt = scale_coords(img1_shape, rest[i], image_info.raw_size);
+        YoloResults tmp_res = { class_ids[i], confidences[i], scaled_bbox, {}, kpt };
+        output.push_back(tmp_res);
+    }
+}
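
For the pose task, `rest[i]` (and therefore `YoloResults::keypoints`) is the flattened [17 x 3] tail of each prediction row: consecutive (x, y, confidence) triples, the layout that `plot_keypoints` above indexes as `keypoint[i * 3 + k]`. A minimal standalone sketch of unpacking it (function name hypothetical):

// Illustrative only: unpack the flattened [17 x 3] keypoint layout
// stored in YoloResults::keypoints as (x, y, confidence) triples.
#include <cstdio>
#include <vector>

void print_kpts_example(const std::vector<float>& keypoints) {
    for (int i = 0; i + 2 < (int)keypoints.size(); i += 3) {
        float x = keypoints[i], y = keypoints[i + 1], conf = keypoints[i + 2];
        if (conf >= 0.5f)  // same threshold the plotting code uses
            std::printf("kpt %d: (%.1f, %.1f) conf=%.2f\n", i / 3, x, y, conf);
    }
}
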
+
+void AutoBackendOnnx::_get_mask2(const cv::Mat& masks_features,
+                                 const cv::Mat& proto,
+                                 const ImageInfo& image_info, const cv::Rect bound, cv::Mat& mask_out,
+                                 float& mask_thresh, int& iw, int& ih, int& mw, int& mh, int& masks_features_num,
+                                 bool round_downsampled)
+
+{
+    cv::Size img0_shape = image_info.raw_size;
+    cv::Size img1_shape = cv::Size(iw, ih);
+    cv::Size downsampled_size = cv::Size(mw, mh);
+
+    cv::Rect_<float> bound_float(
+        static_cast<float>(bound.x),
+        static_cast<float>(bound.y),
+        static_cast<float>(bound.width),
+        static_cast<float>(bound.height)
+    );
+
+    cv::Rect_<float> downsampled_bbox = scale_boxes(img0_shape, bound_float, downsampled_size);
+    cv::Size bound_size = cv::Size(mw, mh);
+    clip_boxes(downsampled_bbox, bound_size);
+
+    cv::Mat matmul_res = (masks_features * proto).t();
+    matmul_res = matmul_res.reshape(1, { downsampled_size.height, downsampled_size.width });
+    // apply sigmoid to the mask:
+    cv::Mat sigmoid_mask;
+    exp(-matmul_res, sigmoid_mask);
+    sigmoid_mask = 1.0 / (1.0 + sigmoid_mask);
+    cv::Mat resized_mask;
+    cv::Rect_<float> input_bbox = scale_boxes(img0_shape, bound_float, img1_shape);
+    cv::resize(sigmoid_mask, resized_mask, img1_shape, 0, 0, cv::INTER_LANCZOS4);
+    cv::Mat pre_out_mask = resized_mask(input_bbox);
+    cv::Mat scaled_mask;
+    scale_image2(scaled_mask, resized_mask, img0_shape);
+    cv::resize(scaled_mask, mask_out, img0_shape);
+    mask_out = mask_out(bound) > mask_thresh;
+}
+
+
+void AutoBackendOnnx::fill_blob(cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape) {
+
+    cv::Mat floatImage;
+    if (inputTensorShape.empty())
+    {
+        inputTensorShape = getInputTensorShape();
+    }
+    int inputChannelsNum = inputTensorShape[1];
+    int rtype = CV_32FC3;
+    image.convertTo(floatImage, rtype, 1.0f / 65535.0);  // normalize 16-bit input to [0, 1] (65535 = 2^16 - 1)
+    blob = new float[floatImage.cols * floatImage.rows * floatImage.channels()];
+    cv::Size floatImageSize{ floatImage.cols, floatImage.rows };
+
+    // hwc -> chw
+    std::vector<cv::Mat> chw(floatImage.channels());
+    for (int i = 0; i < floatImage.channels(); ++i)
+    {
+        chw[i] = cv::Mat(floatImageSize, CV_32FC1, blob + i * floatImageSize.width * floatImageSize.height);
+    }
+    cv::split(floatImage, chw);
+}
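
`fill_blob` writes the tensor in planar CHW order: channel i occupies one contiguous width*height block of the blob, and `cv::split` scatters the interleaved HWC pixels into those planes. A minimal sketch of the index arithmetic behind that layout (pure illustration, not code from this commit):

// Illustrative only: index of pixel (x, y) in channel c of a CHW float blob.
#include <cstddef>

std::size_t chw_index(int c, int y, int x, int width, int height) {
    return static_cast<std::size_t>(c) * width * height
         + static_cast<std::size_t>(y) * width
         + static_cast<std::size_t>(x);
}
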
diff --git a/src/nn/onnx_model_base.cpp b/src/nn/onnx_model_base.cpp
new file mode 100644
index 0000000..e6c579b
--- /dev/null
+++ b/src/nn/onnx_model_base.cpp
@@ -0,0 +1,181 @@
+#include "nn/onnx_model_base.h"
+
+#include <algorithm>
+#include <iostream>
+#include <string>
+
+#include "constants.h"
+#include "utils/common.h"
+
+
+/**
+ * @brief Base class for any onnx model regarding the target.
+ *
+ * Wraps OrtApi.
+ *
+ * The caller provides a model path, logid, and provider.
+ *
+ * See the output logs for more information on warnings/errors that occur while processing the model.
+ *
+ * @param[in] modelPath Path to the model file.
+ * @param[in] logid Log identifier.
+ * @param[in] provider Provider (e.g., "CPU" or "CUDA"). (NOTE: for now only CPU is supported)
+ */
+
+OnnxModelBase::OnnxModelBase(const char* modelPath, const char* logid, const char* provider)
+//    : modelPath_(modelPath), env(std::move(env)), session(std::move(session))
+    : modelPath_(modelPath)
+{
+
+    // TODO: too bad passing `ORT_LOGGING_LEVEL_WARNING` by default - for some cases
+    //  info level would make sense too
+    env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, logid);
+    Ort::SessionOptions sessionOptions = Ort::SessionOptions();
+
+    std::vector<std::string> availableProviders = Ort::GetAvailableProviders();
+    auto cudaAvailable = std::find(availableProviders.begin(), availableProviders.end(), "CUDAExecutionProvider");
+    OrtCUDAProviderOptions cudaOption;
+
+    if (std::string(provider) == OnnxProviders::CUDA) {  // compare by value: `provider == ....c_str()` would compare pointers
+        if (cudaAvailable == availableProviders.end()) {
+            std::cout << "CUDA is not supported by your ONNXRuntime build. Fallback to CPU." << std::endl;
+            //std::cout << "Inference device: CPU" << std::endl;
+        }
+        else {
+            //std::cout << "Inference device: GPU" << std::endl;
+            sessionOptions.AppendExecutionProvider_CUDA(cudaOption);
+        }
+    }
+
+    else if (std::string(provider) == OnnxProviders::CPU) {
+        // "cpu" by default
+    }
+    else
+    {
+        throw std::runtime_error("NotImplemented provider=" + std::string(provider));
+    }
+
+    std::cout << "Inference device: " << std::string(provider) << std::endl;
+#ifdef _WIN32
+    auto modelPathW = get_win_path(modelPath);  // For Windows (wstring)
+    session = Ort::Session(env, modelPathW.c_str(), sessionOptions);
+#else
+    session = Ort::Session(env, modelPath, sessionOptions);  // For Linux (string)
+#endif
+    //session = Ort::Session(env)
+    // https://github.com/microsoft/onnxruntime/issues/14157
+    // ----------------
+    // init input names
+    std::vector<Ort::AllocatedStringPtr> inputNodeNameAllocatedStrings;  // <-- newly added
+    Ort::AllocatorWithDefaultOptions allocator;
+    auto inputNodesNum = session.GetInputCount();
+    for (int i = 0; i < inputNodesNum; i++) {
+        auto input_name = session.GetInputNameAllocated(i, allocator);
+        inputNodeNameAllocatedStrings.push_back(std::move(input_name));
+        inputNodeNames.push_back(inputNodeNameAllocatedStrings.back().get());
+    }
+    // -----------------
+    // init output names
+    auto outputNodesNum = session.GetOutputCount();
+    std::vector<Ort::AllocatedStringPtr> outputNodeNameAllocatedStrings;  // <-- newly added
+    Ort::AllocatorWithDefaultOptions output_names_allocator;
+    for (int i = 0; i < outputNodesNum; i++)
+    {
+        auto output_name = session.GetOutputNameAllocated(i, output_names_allocator);
+        outputNodeNameAllocatedStrings.push_back(std::move(output_name));
+        outputNodeNames.push_back(outputNodeNameAllocatedStrings.back().get());
+    }
+    // -------------------------
+    // initialize model metadata
+    model_metadata = session.GetModelMetadata();
+    Ort::AllocatorWithDefaultOptions metadata_allocator;
+
+    std::vector<Ort::AllocatedStringPtr> metadataAllocatedKeys = model_metadata.GetCustomMetadataMapKeysAllocated(metadata_allocator);
+    std::vector<std::string> metadata_keys;
+    metadata_keys.reserve(metadataAllocatedKeys.size());
+
+    for (const Ort::AllocatedStringPtr& allocatedString : metadataAllocatedKeys) {
+        metadata_keys.emplace_back(allocatedString.get());
+    }
+
+    // -------------------------
+    // initialize metadata as the dict
+    // even though we know exactly what metadata we intend to use
+    // base onnx class should not have any ultralytics yolo-specific attributes like stride, task etc, so keep it clean as much as possible
+    for (const std::string& key : metadata_keys) {
+        Ort::AllocatedStringPtr metadata_value = model_metadata.LookupCustomMetadataMapAllocated(key.c_str(), metadata_allocator);
+        if (metadata_value != nullptr) {
+            auto raw_metadata_value = metadata_value.get();
+            metadata[key] = std::string(raw_metadata_value);
+        }
+    }
+
+    // initialize cstr
+    for (const std::string& name : outputNodeNames) {
+        outputNamesCStr.push_back(name.c_str());
+    }
+
+    for (const std::string& name : inputNodeNames)
+    {
+        inputNamesCStr.push_back(name.c_str());
+    }
+
+}
+
+const std::vector<std::string>& OnnxModelBase::getInputNames() {
+    return inputNodeNames;
+}
+
+const std::vector<std::string>& OnnxModelBase::getOutputNames() {
+    return outputNodeNames;
+}
+
+const Ort::ModelMetadata& OnnxModelBase::getModelMetadata()
+{
+    return model_metadata;
+}
+
+const std::unordered_map<std::string, std::string>& OnnxModelBase::getMetadata()
+{
+    return
metadata; +} + + +const Ort::Session& OnnxModelBase::getSession() +{ + return session; +} + +const char* OnnxModelBase::getModelPath() +{ + return modelPath_; +} + +const std::vector OnnxModelBase::getOutputNamesCStr() +{ + return outputNamesCStr; +} + +const std::vector OnnxModelBase::getInputNamesCStr() +{ + return inputNamesCStr; +} + +std::vector OnnxModelBase::forward(std::vector& inputTensors) +{ + // todo: make runOptions parameter here + + return session.Run(Ort::RunOptions{ nullptr }, + inputNamesCStr.data(), + inputTensors.data(), + inputNamesCStr.size(), + outputNamesCStr.data(), + outputNamesCStr.size()); +} + +//OnnxModelBase::~OnnxModelBase() { +// // empty body +//} diff --git a/src/utils/augment.cpp b/src/utils/augment.cpp new file mode 100644 index 0000000..7a3e064 --- /dev/null +++ b/src/utils/augment.cpp @@ -0,0 +1,145 @@ +#include +#include +#include +#include + + +/** + * \brief padding value when letterbox changes image size ratio + */ +const int& DEFAULT_LETTERBOX_PAD_VALUE = 114; + + +void letterbox(const cv::Mat& image, + cv::Mat& outImage, + const cv::Size& newShape, + cv::Scalar_ color, + bool auto_, + bool scaleFill, + bool scaleUp, int stride +) { + cv::Size shape = image.size(); + float r = std::min(static_cast(newShape.height) / static_cast(shape.height), + static_cast(newShape.width) / static_cast(shape.width)); + if (!scaleUp) + r = std::min(r, 1.0f); + + float ratio[2]{ r, r }; + int newUnpad[2]{ static_cast(std::round(static_cast(shape.width) * r)), + static_cast(std::round(static_cast(shape.height) * r)) }; + + auto dw = static_cast(newShape.width - newUnpad[0]); + auto dh = static_cast(newShape.height - newUnpad[1]); + + if (auto_) + { + dw = static_cast((static_cast(dw) % stride)); + dh = static_cast((static_cast(dh) % stride)); + } + else if (scaleFill) + { + dw = 0.0f; + dh = 0.0f; + newUnpad[0] = newShape.width; + newUnpad[1] = newShape.height; + ratio[0] = static_cast(newShape.width) / static_cast(shape.width); + ratio[1] = static_cast(newShape.height) / static_cast(shape.height); + } + + dw /= 2.0f; + dh /= 2.0f; + + //cv::Mat outImage; + if (shape.width != newUnpad[0] || shape.height != newUnpad[1]) + { + cv::resize(image, outImage, cv::Size(newUnpad[0], newUnpad[1])); + } + else + { + outImage = image.clone(); + } + + int top = static_cast(std::round(dh - 0.1f)); + int bottom = static_cast(std::round(dh + 0.1f)); + int left = static_cast(std::round(dw - 0.1f)); + int right = static_cast(std::round(dw + 0.1f)); + + + if (color == cv::Scalar()) { + color = cv::Scalar(DEFAULT_LETTERBOX_PAD_VALUE, DEFAULT_LETTERBOX_PAD_VALUE, DEFAULT_LETTERBOX_PAD_VALUE); + } + + cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color); + +} + +cv::Mat scale_image(const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair& ratio_pad) { + cv::Size im1_shape = resized_mask.size(); + + // Check if resizing is needed + if (im1_shape == im0_shape) { + return resized_mask.clone(); + } + + float gain, pad_x, pad_y; + + if (ratio_pad.first < 0.0f) { + gain = std::min(static_cast(im1_shape.height) / static_cast(im0_shape.height), + static_cast(im1_shape.width) / static_cast(im0_shape.width)); + pad_x = (im1_shape.width - im0_shape.width * gain) / 2.0f; + pad_y = (im1_shape.height - im0_shape.height * gain) / 2.0f; + } + else { + gain = ratio_pad.first; + pad_x = ratio_pad.second.x; + pad_y = ratio_pad.second.y; + } + + int top = static_cast(pad_y); + int left = static_cast(pad_x); + int bottom = static_cast(im1_shape.height - 
pad_y); + int right = static_cast(im1_shape.width - pad_x); + + // Clip and resize the mask + cv::Rect clipped_rect(left, top, right - left, bottom - top); + cv::Mat clipped_mask = resized_mask(clipped_rect); + cv::Mat scaled_mask; + cv::resize(clipped_mask, scaled_mask, im0_shape); + + return scaled_mask; +} + + +void scale_image2(cv::Mat& scaled_mask, const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair& ratio_pad) { + cv::Size im1_shape = resized_mask.size(); + + // Check if resizing is needed + if (im1_shape == im0_shape) { + scaled_mask = resized_mask.clone(); + return; + } + + float gain, pad_x, pad_y; + + if (ratio_pad.first < 0.0f) { + gain = std::min(static_cast(im1_shape.height) / static_cast(im0_shape.height), + static_cast(im1_shape.width) / static_cast(im0_shape.width)); + pad_x = (im1_shape.width - im0_shape.width * gain) / 2.0f; + pad_y = (im1_shape.height - im0_shape.height * gain) / 2.0f; + } + else { + gain = ratio_pad.first; + pad_x = ratio_pad.second.x; + pad_y = ratio_pad.second.y; + } + + int top = static_cast(pad_y); + int left = static_cast(pad_x); + int bottom = static_cast(im1_shape.height - pad_y); + int right = static_cast(im1_shape.width - pad_x); + + // Clip and resize the mask + cv::Rect clipped_rect(left, top, right - left, bottom - top); + cv::Mat clipped_mask = resized_mask(clipped_rect); + cv::resize(clipped_mask, scaled_mask, im0_shape); +} diff --git a/src/utils/common.cpp b/src/utils/common.cpp new file mode 100644 index 0000000..d25388e --- /dev/null +++ b/src/utils/common.cpp @@ -0,0 +1,147 @@ +#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING +#include +#include +#include +#include "utils/common.h" + +#include +//#include + +#include +#include + +#include + +#include +#include + + + +Timer::Timer(double& accumulator, bool isEnabled) + : accumulator(accumulator), isEnabled(isEnabled) { + if (isEnabled) { + start = std::chrono::high_resolution_clock::now(); + } +} + +// Stop the timer and update the accumulator +void Timer::Stop() { + if (isEnabled) { + auto end = std::chrono::high_resolution_clock::now(); + double duration = std::chrono::duration(end - start).count(); + accumulator += duration; + } +} + +// С++ 14 version +//#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING +std::wstring get_win_path(const std::string& modelPath) { +#ifdef _WIN32 + return std::wstring_convert>().from_bytes(modelPath); +#else + // return modelPath; + return std::wstring(modelPath.begin(), modelPath.end()); +#endif +} + + +std::vector parseVectorString(const std::string& input) { + /* Main purpose of this function is to parse `imgsz` key value of model metadata + * and from [height, width] get height, width values in the vector of strings + * Args: + * input: + * expected to be something like [544, 960] or [3,544, 960] + * output: + * iterable of strings, representing integers + */ + std::regex number_pattern(R"(\d+)"); + + std::vector result; + std::sregex_iterator it(input.begin(), input.end(), number_pattern); + std::sregex_iterator end; + + while (it != end) { + result.push_back(it->str()); + ++it; + } + + return result; +} + +std::vector convertStringVectorToInts(const std::vector& input) { + std::vector result; + + for (const std::string& str : input) { + try { + int value = std::stoi(str); + result.push_back(value); + } + catch (const std::invalid_argument& e) { + // raise explicit exception + throw std::invalid_argument("Bad argument (cannot cast): value=" + str); + } + catch (const std::out_of_range& e) { + // check bounds + 
throw std::out_of_range("Value out of range: " + str); + } + } + + return result; +} + + +/* +std::unordered_map parseNames(const std::string& input) { + std::unordered_map result; + + std::string cleanedInput = input; + boost::erase_all(cleanedInput, "{"); + boost::erase_all(cleanedInput, "}"); + + std::vector elements; + boost::split(elements, cleanedInput, boost::is_any_of(",")); + + for (const std::string& element : elements) { + std::vector keyValue; + boost::split(keyValue, element, boost::is_any_of(":")); + + if (keyValue.size() == 2) { + int key = std::stoi(boost::trim_copy(keyValue[0])); + std::string value = boost::trim_copy(keyValue[1]); + + result[key] = value; + } + } + + return result; +} +*/ + +std::unordered_map parseNames(const std::string& input) { + std::unordered_map result; + + std::string cleanedInput = input; + cleanedInput.erase(std::remove(cleanedInput.begin(), cleanedInput.end(), '{'), cleanedInput.end()); + cleanedInput.erase(std::remove(cleanedInput.begin(), cleanedInput.end(), '}'), cleanedInput.end()); + + std::istringstream elementStream(cleanedInput); + std::string element; + while (std::getline(elementStream, element, ',')) { + std::istringstream keyValueStream(element); + std::string keyStr, value; + if (std::getline(keyValueStream, keyStr, ':') && std::getline(keyValueStream, value)) { + int key = std::stoi(keyStr); + result[key] = value; + } + } + + return result; +} + +int64_t vector_product(const std::vector& vec) { + int64_t result = 1; + for (int64_t value : vec) { + result *= value; + } + return result; +} diff --git a/src/utils/ops.cpp b/src/utils/ops.cpp new file mode 100644 index 0000000..9110565 --- /dev/null +++ b/src/utils/ops.cpp @@ -0,0 +1,217 @@ +#pragma once + +#include +//#include +//#include + +#include +#include +#include +#include + + + +void clip_boxes(cv::Rect& box, const cv::Size& shape) { + box.x = std::max(0, std::min(box.x, shape.width)); + box.y = std::max(0, std::min(box.y, shape.height)); + box.width = std::max(0, std::min(box.width, shape.width - box.x)); + box.height = std::max(0, std::min(box.height, shape.height - box.y)); +} + +void clip_boxes(cv::Rect_& box, const cv::Size& shape) { + box.x = std::max(0.0f, std::min(box.x, static_cast(shape.width))); + box.y = std::max(0.0f, std::min(box.y, static_cast(shape.height))); + box.width = std::max(0.0f, std::min(box.width, static_cast(shape.width - box.x))); + box.height = std::max(0.0f, std::min(box.height, static_cast(shape.height - box.y))); +} + + +void clip_boxes(std::vector& boxes, const cv::Size& shape) { + for (cv::Rect& box : boxes) { + clip_boxes(box, shape); + } +} + +void clip_boxes(std::vector>& boxes, const cv::Size& shape) { + for (cv::Rect_& box : boxes) { + clip_boxes(box, shape); + } +} + +// source: ultralytics/utils/ops.py scale_boxes lines 99+ (ultralytics==8.0.160) +cv::Rect_ scale_boxes(const cv::Size& img1_shape, cv::Rect_& box, const cv::Size& img0_shape, + std::pair ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)), bool padding = true) { + + float gain, pad_x, pad_y; + + if (ratio_pad.first < 0.0f) { + gain = std::min(static_cast(img1_shape.height) / static_cast(img0_shape.height), + static_cast(img1_shape.width) / static_cast(img0_shape.width)); + pad_x = roundf((img1_shape.width - img0_shape.width * gain) / 2.0f - 0.1f); + pad_y = roundf((img1_shape.height - img0_shape.height * gain) / 2.0f - 0.1f); + } + else { + gain = ratio_pad.first; + pad_x = ratio_pad.second.x; + pad_y = ratio_pad.second.y; + } + + //cv::Rect scaledCoords(box); + 
+
+// source: ultralytics/utils/ops.py scale_boxes lines 99+ (ultralytics==8.0.160)
+cv::Rect_<float> scale_boxes(const cv::Size& img1_shape, cv::Rect_<float>& box, const cv::Size& img0_shape,
+                             std::pair<float, cv::Point2f> ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)),
+                             bool padding = true) {
+
+    float gain, pad_x, pad_y;
+
+    if (ratio_pad.first < 0.0f) {
+        gain = std::min(static_cast<float>(img1_shape.height) / static_cast<float>(img0_shape.height),
+                        static_cast<float>(img1_shape.width) / static_cast<float>(img0_shape.width));
+        pad_x = roundf((img1_shape.width - img0_shape.width * gain) / 2.0f - 0.1f);
+        pad_y = roundf((img1_shape.height - img0_shape.height * gain) / 2.0f - 0.1f);
+    }
+    else {
+        gain = ratio_pad.first;
+        pad_x = ratio_pad.second.x;
+        pad_y = ratio_pad.second.y;
+    }
+
+    //cv::Rect scaledCoords(box);
+    cv::Rect_<float> scaledCoords(box);
+
+    if (padding) {
+        scaledCoords.x -= pad_x;
+        scaledCoords.y -= pad_y;
+    }
+
+    scaledCoords.x /= gain;
+    scaledCoords.y /= gain;
+    scaledCoords.width /= gain;
+    scaledCoords.height /= gain;
+
+    // Clip the box to the bounds of the image
+    clip_boxes(scaledCoords, img0_shape);
+
+    return scaledCoords;
+}
+
+
+//void clip_coords(cv::Mat& coords, const cv::Size& shape) {
+//    // Clip x- and y-coordinates to the image bounds
+//    cv::Mat xCoords = coords.col(0);
+//    cv::Mat yCoords = coords.col(1);
+//
+//    for (int i = 0; i < coords.rows; ++i) {
+//        xCoords.at<float>(i) = std::max(std::min(xCoords.at<float>(i), static_cast<float>(shape.width - 1)), 0.0f);
+//        yCoords.at<float>(i) = std::max(std::min(yCoords.at<float>(i), static_cast<float>(shape.height - 1)), 0.0f);
+//    }
+//}
+
+void clip_coords(std::vector<float>& coords, const cv::Size& shape) {
+    // Assuming coords is a flattened [1, 17, 3] array of (x, y, conf) triplets
+    for (size_t i = 0; i < coords.size(); i += 3) {
+        coords[i] = std::min(std::max(coords[i], 0.0f), static_cast<float>(shape.width - 1));          // x
+        coords[i + 1] = std::min(std::max(coords[i + 1], 0.0f), static_cast<float>(shape.height - 1)); // y
+    }
+}
+
+// source: ultralytics/utils/ops.py scale_coords lines 753+ (ultralytics==8.0.160)
+//cv::Mat scale_coords(const cv::Size& img1_shape, cv::Mat& coords, const cv::Size& img0_shape)
+//cv::Mat scale_coords(const cv::Size& img1_shape, std::vector<float> coords, const cv::Size& img0_shape)
+std::vector<float> scale_coords(const cv::Size& img1_shape, std::vector<float>& coords, const cv::Size& img0_shape)
+{
+//  cv::Mat scaledCoords = coords.clone();
+    std::vector<float> scaledCoords = coords;
+
+    // Calculate gain and padding
+    double gain = std::min(static_cast<double>(img1_shape.width) / img0_shape.width,
+                           static_cast<double>(img1_shape.height) / img0_shape.height);
+    cv::Point2d pad((img1_shape.width - img0_shape.width * gain) / 2, (img1_shape.height - img0_shape.height * gain) / 2);
+
+    // Remove padding; coords is a flattened array of (x, y, conf) triplets, e.g. shape [1, 17, 3]
+//  scaledCoords.col(0) = (scaledCoords.col(0) - pad.x);
+//  scaledCoords.col(1) = (scaledCoords.col(1) - pad.y);
+    for (size_t i = 0; i < scaledCoords.size(); i += 3) {
+        scaledCoords[i] -= pad.x;      // x padding
+        scaledCoords[i + 1] -= pad.y;  // y padding
+    }
+
+    // Scale coordinates
+//  scaledCoords.col(0) /= gain;
+//  scaledCoords.col(1) /= gain;
+    for (size_t i = 0; i < scaledCoords.size(); i += 3) {
+        scaledCoords[i] /= gain;
+        scaledCoords[i + 1] /= gain;
+    }
+
+    clip_coords(scaledCoords, img0_shape);
+    return scaledCoords;
+}
+
+
+cv::Mat crop_mask(const cv::Mat& mask, const cv::Rect& box) {
+    int h = mask.rows;
+    int w = mask.cols;
+
+    int x1 = box.x;
+    int y1 = box.y;
+    int x2 = box.x + box.width;
+    int y2 = box.y + box.height;
+
+    // Zero out everything outside the box; only pixels inside [x1, x2) x [y1, y2) are kept
+    cv::Mat cropped_mask = cv::Mat::zeros(h, w, mask.type());
+
+    for (int r = 0; r < h; ++r) {
+        for (int c = 0; c < w; ++c) {
+            if (r >= y1 && r < y2 && c >= x1 && c < x2) {
+                cropped_mask.at<float>(r, c) = mask.at<float>(r, c);
+            }
+        }
+    }
+
+    return cropped_mask;
+}
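
// Editor's sketch (not part of this commit): mapping a detection from the 640x640
// letterboxed network input back onto a 1920x1080 source frame with scale_boxes().
// All sizes and coordinates here are hypothetical.
static void demo_scale_boxes() {
    cv::Size net_input(640, 640);    // img1_shape: letterboxed model input
    cv::Size original(1920, 1080);   // img0_shape: source frame
    cv::Rect_<float> box(100.0f, 200.0f, 50.0f, 80.0f);  // detection in net-input coords
    // gain = min(640/1080, 640/1920) = 1/3 and pad = (0, 140), so the result
    // is approximately (300, 180, 150, 240) in source-frame coordinates.
    cv::Rect_<float> restored = scale_boxes(net_input, box, original);
    (void)restored;
}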
+
+
+//std::tuple<std::vector<cv::Rect_<float>>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
+std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
+non_max_suppression(const cv::Mat& output0, int class_names_num, int data_width, double conf_threshold,
+                    float iou_threshold) {
+
+    std::vector<int> class_ids;
+    std::vector<float> confidences;
+//  std::vector<cv::Rect_<float>> boxes;
+    std::vector<cv::Rect> boxes;
+    std::vector<std::vector<float>> rest;
+
+    // Layout of one prediction row: [cx, cy, w, h, class scores..., extra features (e.g. mask coefficients)]
+    int rest_start_pos = class_names_num + 4;
+    int rest_features = data_width - rest_start_pos;
+//  int data_width = rest_start_pos + total_features_num;
+
+    int rows = output0.rows;
+    float* pdata = (float*) output0.data;
+
+    for (int r = 0; r < rows; ++r) {
+        // Class scores start right after the 4 box coordinates
+        cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4);
+        cv::Point class_id;
+        double max_conf;
+        minMaxLoc(scores, nullptr, &max_conf, nullptr, &class_id);
+
+        if (max_conf > conf_threshold) {
+            // std::vector<float> mask_data(pdata + 4 + class_names_num, pdata + data_width); // unused; superseded by rest_data below
+            class_ids.push_back(class_id.x);
+            confidences.push_back((float) max_conf);
+
+            // Convert (cx, cy, w, h) to a top-left-anchored box, rounding to the nearest pixel
+            float out_w = pdata[2];
+            float out_h = pdata[3];
+            float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0);
+            float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0);
+            cv::Rect_<float> bbox(out_left, out_top, (out_w + 0.5), (out_h + 0.5));
+            boxes.push_back(bbox);
+            if (rest_features > 0) {
+                std::vector<float> rest_data(pdata + rest_start_pos, pdata + data_width);
+                rest.push_back(rest_data);
+            }
+        }
+        pdata += data_width; // next prediction
+    }
+
+    //float masks_threshold = 0.50;
+    //int top_k = 500;
+    //const float& nms_eta = 1.0f;
+    std::vector<int> nms_result;
+    cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result); // , nms_eta, top_k);
+
+    std::vector<int> nms_class_ids;
+    std::vector<float> nms_confidences;
+    std::vector<cv::Rect> nms_boxes;
+    std::vector<std::vector<float>> nms_rest;
+    for (int idx : nms_result) {
+        nms_class_ids.push_back(class_ids[idx]);
+        nms_confidences.push_back(confidences[idx]);
+        nms_boxes.push_back(boxes[idx]);
+        if (rest_features > 0) {  // `rest` is only populated when extra features are present
+            nms_rest.push_back(rest[idx]);
+        }
+    }
+    return std::make_tuple(nms_boxes, nms_confidences, nms_class_ids, nms_rest);
+}
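
// Editor's sketch (not part of this commit): decoding a YOLOv8 detection head with
// non_max_suppression(). Assumes output0 was already transposed to one row per
// candidate: [cx, cy, w, h, class scores..., optional mask coefficients]. The demo
// function name, class count, and thresholds are hypothetical.
static void demo_nms(const cv::Mat& output0) {
    int num_classes = 80;              // e.g. a COCO-trained model
    int data_width = 4 + num_classes;  // plain detection: no extra mask coefficients
    auto [boxes, scores, class_ids, rest] =
        non_max_suppression(output0, num_classes, data_width, 0.25, 0.45f);
    // boxes are still in network-input coordinates; map each one back to the
    // source frame with scale_boxes() before drawing.
    (void)boxes; (void)scores; (void)class_ids; (void)rest;
}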