init commit

master
Samuel Dobron 5 months ago
commit 157639e9e3

.gitignore vendored

@@ -0,0 +1,17 @@
# ide related
.vs/*
.idea/*
# build
.build/*
x64/*
cmake-build-debug/*
cmake-build-debug-visual-studio/*
# ignore CMakeLists.txt but add example file
CMakeLists.txt
# dlls:
*.dll
# vs-like objects
*.sln
*vcxproj*
packages.config
packages/*

@@ -0,0 +1,64 @@
CMAKE_MINIMUM_REQUIRED(VERSION 3.0.0)
project(YOLOv8CPP)
SET(CMAKE_CXX_STANDARD 17)
SET(CMAKE_CXX_STANDARD_REQUIRED ON)
# SET (OpenCV_DIR your/path/to/opencv/build/x64/vc16/lib) # opencv lib root
# SET (OpenCV_BIN_DIR your/path/to/opencv/build/x64/vc16/bin) #opencv bin root
# SET (OpenCV_DEBUG_DLL_FILENAME opencv_world480d.dll) # change filenames
# SET (OpenCV_RELEASE_DLL_FILENAME opencv_world480.dll) # change filenames
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
SET (ONNXRUNTIME_DIR /home/xdobro23/PP1/onnxruntime/onnxruntime-linux-x64-gpu-1.18.1) # onnxruntime root
FIND_PACKAGE(OpenCV REQUIRED)
set(CMAKE_BUILD_TYPE Debug)
# --- Configure your project files ---
include_directories(include) # Include your header files directory
# Recursively collect all source files under 'src' directory
file(GLOB_RECURSE CURR_SOURCES src/*.cpp)
# Create the executable
add_executable(YOLOv8CPP ${CURR_SOURCES})
target_compile_features(YOLOv8CPP PRIVATE cxx_std_17)
TARGET_INCLUDE_DIRECTORIES(YOLOv8CPP PRIVATE "${ONNXRUNTIME_DIR}/include")
TARGET_LINK_LIBRARIES(YOLOv8CPP ${OpenCV_LIBS})
if (WIN32)
TARGET_LINK_LIBRARIES(YOLOv8CPP "${ONNXRUNTIME_DIR}/lib/onnxruntime.lib")
# some changes to the original version:
# copy onnxruntime dll
add_custom_command(TARGET YOLOv8CPP POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${ONNXRUNTIME_DIR}/lib/onnxruntime.dll"
"$<TARGET_FILE_DIR:YOLOv8CPP>"
)
# copy opencv
#[[ add_custom_command(TARGET YOLOv8CPP POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${OpenCV_DIR}/${OpenCV_DLL_FILENAME}"
"$<TARGET_FILE_DIR:YOLOv8CPP>"
)]]
add_custom_command(TARGET YOLOv8CPP POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${OpenCV_BIN_DIR}/${OpenCV_DEBUG_DLL_FILENAME}"
"$<TARGET_FILE_DIR:YOLOv8CPP>"
)
# add release
add_custom_command(TARGET YOLOv8CPP POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
"${OpenCV_BIN_DIR}/${OpenCV_RELEASE_DLL_FILENAME}"
"$<TARGET_FILE_DIR:YOLOv8CPP>"
)
endif(WIN32)
if (UNIX)
TARGET_LINK_LIBRARIES(YOLOv8CPP "${ONNXRUNTIME_DIR}/lib/libonnxruntime.so")
endif(UNIX)
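A quick way to verify that both dependencies were resolved and linked correctly is a throwaway C++ translation unit built against the same targets. A minimal sketch (only standard ONNX Runtime C API and OpenCV version macros are used, nothing project-specific):

#include <iostream>
#include <onnxruntime_cxx_api.h>
#include <opencv2/core/version.hpp>

int main() {
    // OrtGetApiBase() is part of the ONNX Runtime C API and reports the runtime version.
    std::cout << "ONNX Runtime: " << OrtGetApiBase()->GetVersionString() << std::endl;
    // CV_VERSION is a macro provided by OpenCV's version header.
    std::cout << "OpenCV: " << CV_VERSION << std::endl;
    return 0;
}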

@@ -0,0 +1,34 @@
#pragma once
#include <string>
namespace MetadataConstants {
inline const std::string IMGSZ = "imgsz";
inline const std::string STRIDE = "stride";
inline const std::string NC = "nc";
inline const std::string CH = "ch";
inline const std::string DATE = "date";
inline const std::string VERSION = "version";
inline const std::string TASK = "task";
inline const std::string BATCH = "batch";
inline const std::string NAMES = "names";
}
namespace OnnxProviders {
inline const std::string CPU = "cpu";
inline const std::string CUDA = "cuda";
}
namespace OnnxInitializers
{
inline const int UNINITIALIZED_STRIDE = -1;
inline const int UNINITIALIZED_NC = -1;
}
namespace YoloTasks
{
inline const std::string SEGMENT = "segment";
inline const std::string DETECT = "detect";
inline const std::string POSE = "pose";
inline const std::string CLASSIFY = "classify";
}
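These keys mirror the custom metadata map that Ultralytics writes into exported ONNX files. A hedged sketch of the intended lookup pattern, assuming this header is included (the `resolve_task` helper is hypothetical, not part of this commit):

#include <stdexcept>
#include <string>
#include <unordered_map>

// Hypothetical helper: select the postprocessing branch from the exported "task" metadata key.
std::string resolve_task(const std::unordered_map<std::string, std::string>& metadata) {
    auto it = metadata.find(MetadataConstants::TASK);
    if (it == metadata.end()) {
        throw std::runtime_error("model metadata has no 'task' key");
    }
    return it->second; // expected to be one of the YoloTasks constants, e.g. "detect"
}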

@@ -0,0 +1,90 @@
#pragma once
#include <filesystem>
#include <vector>
#include <unordered_map>
#include <opencv2/core/mat.hpp>
#include "onnx_model_base.h"
#include "constants.h"
/**
* @brief Represents the results of YOLO prediction.
*
* This structure stores information about a detected object, including its class index,
* confidence score, bounding box, semantic segmentation mask, and keypoints (if available).
*/
struct YoloResults {
int class_idx{}; ///< The class index of the detected object.
float conf{}; ///< The confidence score of the detection.
cv::Rect_<float> bbox; ///< The bounding box of the detected object.
cv::Mat mask; ///< The semantic segmentation mask (if available).
std::vector<float> keypoints{}; ///< Keypoints representing the object's pose (if available).
};
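For orientation, a minimal consumer of this struct might look like the sketch below. `results` is assumed to come from AutoBackendOnnx::predict_once and `names` from getNames(); the helper itself is illustrative, not code from this commit:

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

void print_detections(const std::vector<YoloResults>& results,
                      const std::unordered_map<int, std::string>& names) {
    for (const YoloResults& r : results) {
        auto it = names.find(r.class_idx);
        std::string label = (it != names.end()) ? it->second : std::to_string(r.class_idx);
        // bbox is already scaled back to original-image coordinates by postprocessing
        std::cout << label << " conf=" << r.conf
                  << " box=[" << r.bbox.x << ", " << r.bbox.y << ", "
                  << r.bbox.width << "x" << r.bbox.height << "]" << std::endl;
    }
}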
struct ImageInfo {
cv::Size raw_size; // add additional attrs if you need
};
class AutoBackendOnnx : public OnnxModelBase {
public:
// constructors
AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider,
const std::vector<int>& imgsz, const int& stride,
const int& nc, std::unordered_map<int, std::string> names);
AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider);
// getters
virtual const std::vector<int>& getImgsz();
virtual const int& getStride();
virtual const int& getCh();
virtual const int& getNc();
virtual const std::unordered_map<int, std::string>& getNames();
virtual const std::vector<int64_t>& getInputTensorShape();
virtual const int& getWidth();
virtual const int& getHeight();
virtual const cv::Size& getCvSize();
virtual const std::string& getTask();
/**
* @brief Runs object detection on an input image.
*
* This method performs object detection on the input image and returns the detected objects as YoloResults.
*
* @param image The input image to run object detection on.
* @param conf The confidence threshold for object detection.
* @param iou The intersection-over-union (IoU) threshold for non-maximum suppression.
* @param mask_threshold The threshold for the semantic segmentation mask.
* @param conversionCode An optional conversion code for image format conversion (e.g., cv::COLOR_BGR2RGB).
* Default value is -1, indicating no conversion.
* TODO: use some constant from some namespace rather than hardcoded values here
*
* @return A vector of YoloResults representing the detected objects.
*/
virtual std::vector<YoloResults> predict_once(cv::Mat& image, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
virtual std::vector<YoloResults> predict_once(const std::filesystem::path& imagePath, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
virtual std::vector<YoloResults> predict_once(const std::string& imagePath, float& conf, float& iou, float& mask_threshold, int conversionCode = -1, bool verbose = true);
virtual void fill_blob(cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape);
virtual void postprocess_masks(cv::Mat& output0, cv::Mat& output1, ImageInfo image_info, std::vector<YoloResults>& output,
int& class_names_num, float& conf_threshold, float& iou_threshold,
int& iw, int& ih, int& mw, int& mh, int& masks_features_num, float mask_threshold = 0.50f);
virtual void postprocess_detects(cv::Mat& output0, ImageInfo image_info, std::vector<YoloResults>& output,
int& class_names_num, float& conf_threshold, float& iou_threshold);
virtual void postprocess_kpts(cv::Mat& output0, ImageInfo& image_info, std::vector<YoloResults>& output,
int& class_names_num, float& conf_threshold, float& iou_threshold);
static void _get_mask2(const cv::Mat& mask_info, const cv::Mat& mask_data, const ImageInfo& image_info, cv::Rect bound, cv::Mat& mask_out,
float& mask_thresh, int& iw, int& ih, int& mw, int& mh, int& masks_features_num, bool round_downsampled = false);
protected:
std::vector<int> imgsz_;
int stride_ = OnnxInitializers::UNINITIALIZED_STRIDE;
int nc_ = OnnxInitializers::UNINITIALIZED_NC; //
int ch_ = 3;
std::unordered_map<int, std::string> names_;
std::vector<int64_t> inputTensorShape_;
cv::Size cvSize_;
std::string task_;
//cv::MatSize cvMatSize_;
};
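The doc comment above describes predict_once; end to end it is typically driven like this short sketch ("model.onnx" and "image.jpg" are placeholder paths; note the thresholds must be lvalues because the API takes them by non-const reference):

#include <opencv2/opencv.hpp>
#include "nn/autobackend.h"

int main() {
    AutoBackendOnnx model("model.onnx", "yolo", "cpu"); // placeholder model path
    cv::Mat img = cv::imread("image.jpg", cv::IMREAD_UNCHANGED);
    float conf = 0.30f, iou = 0.45f, mask_thr = 0.50f;  // lvalues, passed by reference
    std::vector<YoloResults> results = model.predict_once(img, conf, iou, mask_thr);
    return results.empty() ? 1 : 0;
}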

@@ -0,0 +1,39 @@
#pragma once
#include <onnxruntime_cxx_api.h>
#include <string>
#include <unordered_map>
#include <vector>
/*
* This interface must expose only the arguments required to load any ONNX model, regardless of model-specific info:
* modelPath will always be required, and so will a provider such as "cpu" or "cuda", since these are needed
* to set up the `sessionOptions` / `session` objects properly. Image size, however, is not needed to load a
* pure ONNX graph, so do NOT include it here.
*/
class OnnxModelBase {
public:
OnnxModelBase(const char* modelPath, const char* logid, const char* provider);
//OnnxModelBase(); // no default constructor should be there
//virtual ~OnnxModelBase();
virtual const std::vector<std::string>& getInputNames(); // = 0
virtual const std::vector<std::string>& getOutputNames();
virtual const std::vector<const char*> getOutputNamesCStr();
virtual const std::vector<const char*> getInputNamesCStr();
virtual const Ort::ModelMetadata& getModelMetadata();
virtual const std::unordered_map<std::string, std::string>& getMetadata();
virtual const char* getModelPath();
virtual const Ort::Session& getSession();
//virtual std::vector<Ort::Value> forward(std::vector<Ort::Value> inputTensors);
virtual std::vector<Ort::Value> forward(std::vector<Ort::Value>& inputTensors);
Ort::Session session{ nullptr };
protected:
const char* modelPath_;
Ort::Env env{ nullptr };
std::vector<std::string> inputNodeNames;
std::vector<std::string> outputNodeNames;
Ort::ModelMetadata model_metadata{ nullptr };
std::unordered_map<std::string, std::string> metadata;
std::vector<const char*> outputNamesCStr;
std::vector<const char*> inputNamesCStr;
};
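Given the comment at the top of this header, the base class on its own is already useful for inspecting a model. A minimal sketch (the model path is a placeholder):

#include <iostream>
#include "nn/onnx_model_base.h"

int main() {
    OnnxModelBase base("model.onnx", "inspect", "cpu"); // placeholder model path
    // Dump the custom metadata map embedded in the ONNX file.
    for (const auto& kv : base.getMetadata()) {
        std::cout << kv.first << " = " << kv.second << std::endl;
    }
    return 0;
}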

@@ -0,0 +1,20 @@
#pragma once
#include <opencv2/core/types.hpp>
void letterbox(const cv::Mat& image,
cv::Mat& outImage,
const cv::Size& newShape = cv::Size(640, 640),
cv::Scalar_<double> color = cv::Scalar(), bool auto_ = true,
bool scaleFill = false,
bool scaleUp = true,
int stride = 32
);
cv::Mat scale_image(const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair<float,
cv::Point2f>& ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)));
void scale_image2(
cv::Mat& scaled_mask, const cv::Mat& resized_mask, const cv::Size& im0_shape,
const std::pair<float, cv::Point2f>& ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f))
);
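A hedged usage sketch of letterbox as declared above: resize an arbitrary frame to a 640x640 model input while preserving aspect ratio ("frame.jpg" is a placeholder; the default cv::Scalar() color is replaced by the 114-gray pad inside the implementation):

#include <opencv2/opencv.hpp>
#include "utils/augment.h"

int main() {
    cv::Mat frame = cv::imread("frame.jpg"); // placeholder path
    cv::Mat boxed;
    // auto_=false and scaleFill=false pad to exactly 640x640, matching how predict_once calls it.
    letterbox(frame, boxed, cv::Size(640, 640), cv::Scalar(), false, false, true, 32);
    return 0;
}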

@@ -0,0 +1,26 @@
#ifndef COMMON_UTILS_H
#define COMMON_UTILS_H
#include <chrono>
#include <string>
#include <unordered_map>
#include <vector>
class Timer {
public:
Timer(double& accumulator, bool isEnabled = true);
void Stop();
private:
double& accumulator;
bool isEnabled;
std::chrono::time_point<std::chrono::high_resolution_clock> start;
};
std::wstring get_win_path(const std::string& path);
std::vector<std::string> parseVectorString(const std::string& input);
std::vector<int> convertStringVectorToInts(const std::vector<std::string>& input);
std::unordered_map<int, std::string> parseNames(const std::string& input);
int64_t vector_product(const std::vector<int64_t>& vec);
#endif // COMMON_UTILS_H

@@ -0,0 +1,44 @@
#pragma once
#include <opencv2/core/types.hpp>
//cv::Rect scaleCoords(const cv::Size& imageShape, const cv::Rect& coords, const cv::Size& imageOriginalShape);
/**
* Scales a bounding box from the shape of the input image to the shape of an original image.
*
* @param img1_shape The shape (height, width) of the input image for the model.
* @param box The bounding box to be scaled, specified as cv::Rect_<float>.
* @param img0_shape The shape (height, width) of the original target image.
* @param ratio_pad An optional parameter that specifies scaling and padding factors as a pair of values.
* The first value (ratio) is used for scaling, and the second value (pad) is used for padding.
* If not provided, default values will be used.
* @param padding An optional boolean parameter that specifies whether padding should be applied.
* If set to true, padding will be applied to the bounding box.
*
* @return A scaled bounding box specified as cv::Rect_<float>.
*
* This function rescales a bounding box from the shape of the input image (img1_shape) to the shape of an original image (img0_shape).
*/
cv::Rect_<float> scale_boxes(const cv::Size& img1_shape, cv::Rect_<float>& box, const cv::Size& img0_shape, std::pair<float, cv::Point2f> ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)), bool padding = true);
void clip_boxes(cv::Rect& box, const cv::Size& shape);
void clip_boxes(cv::Rect_<float>& box, const cv::Size& shape);
void clip_boxes(std::vector<cv::Rect>& boxes, const cv::Size& shape);
void clip_boxes(std::vector<cv::Rect_<float>>& boxes, const cv::Size& shape);
//void clip_coords(cv::Mat& coords, const cv::Size& shape);
//cv::Mat scale_coords(const cv::Size& img1_shape, cv::Mat& coords, const cv::Size& img0_shape);
void clip_coords(std::vector<float>& coords, const cv::Size& shape);
std::vector<float> scale_coords(const cv::Size& img1_shape, std::vector<float>& coords, const cv::Size& img0_shape);
cv::Mat crop_mask(const cv::Mat& mask, const cv::Rect& box);
struct NMSResult{
std::vector<cv::Rect> bboxes;
std::vector<float> confidences;
std::vector<int> classes;
std::vector<std::vector<float>> rest;
};
//std::tuple<std::vector<cv::Rect_<float>>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
non_max_suppression(const cv::Mat& output0, int class_names_num, int total_features_num, double conf_threshold, float iou_threshold);

@@ -0,0 +1,359 @@
#include <random>
#include <filesystem>
#include "nn/onnx_model_base.h"
#include "nn/autobackend.h"
#include <opencv2/opencv.hpp>
#include <vector>
#include "utils/augment.h"
#include "constants.h"
#include "utils/common.h"
namespace fs = std::filesystem;
// Define the skeleton and color mappings
std::vector<std::vector<int>> skeleton = {{16, 14}, {14, 12}, {17, 15}, {15, 13}, {12, 13}, {6, 12}, {7, 13}, {6, 7},
{6, 8}, {7, 9}, {8, 10}, {9, 11}, {2, 3}, {1, 2}, {1, 3}, {2, 4}, {3, 5}, {4, 6}, {5, 7}};
std::vector<cv::Scalar> posePalette = {
cv::Scalar(255, 128, 0), cv::Scalar(255, 153, 51), cv::Scalar(255, 178, 102), cv::Scalar(230, 230, 0), cv::Scalar(255, 153, 255),
cv::Scalar(153, 204, 255), cv::Scalar(255, 102, 255), cv::Scalar(255, 51, 255), cv::Scalar(102, 178, 255), cv::Scalar(51, 153, 255),
cv::Scalar(255, 153, 153), cv::Scalar(255, 102, 102), cv::Scalar(255, 51, 51), cv::Scalar(153, 255, 153), cv::Scalar(102, 255, 102),
cv::Scalar(51, 255, 51), cv::Scalar(0, 255, 0), cv::Scalar(0, 0, 255), cv::Scalar(255, 0, 0), cv::Scalar(255, 255, 255)
};
std::vector<int> limbColorIndices = {9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16};
std::vector<int> kptColorIndices = {16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9};
cv::Scalar generateRandomColor(int numChannels) {
if (numChannels < 1 || numChannels > 3) {
throw std::invalid_argument("Invalid number of channels. Must be between 1 and 3.");
}
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<int> dis(0, 255);
cv::Scalar color;
for (int i = 0; i < numChannels; i++) {
color[i] = dis(gen); // for each channel separately generate value
}
return color;
}
std::vector<cv::Scalar> generateRandomColors(int class_names_num, int numChannels) {
std::vector<cv::Scalar> colors;
for (int i = 0; i < class_names_num; i++) {
cv::Scalar color = generateRandomColor(numChannels);
colors.push_back(color);
}
return colors;
}
void plot_masks(cv::Mat img, std::vector<YoloResults>& result, std::vector<cv::Scalar> color,
std::unordered_map<int, std::string>& names)
{
cv::Mat mask = img.clone();
for (int i = 0; i < result.size(); i++)
{
float left, top;
left = result[i].bbox.x;
top = result[i].bbox.y;
int color_num = i;
int& class_idx = result[i].class_idx;
rectangle(img, result[i].bbox, color[result[i].class_idx], 2);
// try to get string value corresponding to given class_idx
std::string class_name;
auto it = names.find(class_idx);
if (it != names.end()) {
class_name = it->second;
}
else {
std::cerr << "Warning: class_idx not found in names for class_idx = " << class_idx << std::endl;
// then convert it to string anyway
class_name = std::to_string(class_idx);
}
if (result[i].mask.rows > 0 && result[i].mask.cols > 0)
{
mask(result[i].bbox).setTo(color[result[i].class_idx], result[i].mask);
}
std::stringstream labelStream;
labelStream << class_name << " " << std::fixed << std::setprecision(2) << result[i].conf;
std::string label = labelStream.str();
cv::Size text_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.6, 2, nullptr);
cv::Rect rect_to_fill(left - 1, top - text_size.height - 5, text_size.width + 2, text_size.height + 5);
cv::Scalar text_color = cv::Scalar(255.0, 255.0, 255.0);
rectangle(img, rect_to_fill, color[result[i].class_idx], -1);
putText(img, label, cv::Point(left - 1.5, top - 2.5), cv::FONT_HERSHEY_SIMPLEX, 0.6, text_color, 2);
}
addWeighted(img, 0.6, mask, 0.4, 0, img); //add mask to src
resize(img, img, img.size());
imshow("img", img);
cv::waitKey();
}
//void plot_keypoints(cv::Mat& image, const std::vector<std::vector<float>>& keypoints, const cv::Size& shape) {
void plot_keypoints(cv::Mat& image, const std::vector<YoloResults>& results, const cv::Size& shape) {
int radius = 5;
bool drawLines = true;
if (results.empty()) {
return;
}
std::vector<cv::Scalar> limbColorPalette;
std::vector<cv::Scalar> kptColorPalette;
for (int index : limbColorIndices) {
limbColorPalette.push_back(posePalette[index]);
}
for (int index : kptColorIndices) {
kptColorPalette.push_back(posePalette[index]);
}
for (const auto& res: results) {
auto keypoint = res.keypoints;
bool isPose = keypoint.size() == 51; // numKeypoints == 17 && keypoints[0].size() == 3;
drawLines &= isPose;
// draw points
for (int i = 0; i < 17; i++) {
int idx = i * 3;
int x_coord = static_cast<int>(keypoint[idx]);
int y_coord = static_cast<int>(keypoint[idx + 1]);
if (x_coord % shape.width != 0 && y_coord % shape.height != 0) {
if (keypoint.size() == 3) {
float conf = keypoint[2];
if (conf < 0.5) {
continue;
}
}
cv::Scalar color_k = isPose ? kptColorPalette[i] : cv::Scalar(0, 0,
255); // Default to red if not in pose mode
cv::circle(image, cv::Point(x_coord, y_coord), radius, color_k, -1, cv::LINE_AA);
}
}
// draw lines
if (drawLines) {
for (int i = 0; i < skeleton.size(); i++) {
const std::vector<int> &sk = skeleton[i];
int idx1 = sk[0] - 1;
int idx2 = sk[1] - 1;
int idx1_x_pos = idx1 * 3;
int idx2_x_pos = idx2 * 3;
int x1 = static_cast<int>(keypoint[idx1_x_pos]);
int y1 = static_cast<int>(keypoint[idx1_x_pos + 1]);
int x2 = static_cast<int>(keypoint[idx2_x_pos]);
int y2 = static_cast<int>(keypoint[idx2_x_pos + 1]);
float conf1 = keypoint[idx1_x_pos + 2];
float conf2 = keypoint[idx2_x_pos + 2];
// Check confidence thresholds
if (conf1 < 0.5 || conf2 < 0.5) {
continue;
}
// Check if positions are within bounds
if (x1 % shape.width == 0 || y1 % shape.height == 0 || x1 < 0 || y1 < 0 ||
x2 % shape.width == 0 || y2 % shape.height == 0 || x2 < 0 || y2 < 0) {
continue;
}
// Draw a line between keypoints
cv::Scalar color_limb = limbColorPalette[i];
cv::line(image, cv::Point(x1, y1), cv::Point(x2, y2), color_limb, 2, cv::LINE_AA);
}
}
}
}
void plot_results(cv::Mat img, std::vector<YoloResults>& results,
std::vector<cv::Scalar> color, std::unordered_map<int, std::string>& names,
const cv::Size& shape
) {
cv::Mat mask = img.clone();
int radius = 5;
bool drawLines = true;
auto raw_image_shape = img.size();
std::vector<cv::Scalar> limbColorPalette;
std::vector<cv::Scalar> kptColorPalette;
for (int index : limbColorIndices) {
limbColorPalette.push_back(posePalette[index]);
}
for (int index : kptColorIndices) {
kptColorPalette.push_back(posePalette[index]);
}
for (const auto& res : results) {
float left = res.bbox.x;
float top = res.bbox.y;
int color_num = res.class_idx;
// Draw bounding box
rectangle(img, res.bbox, color[res.class_idx], 2);
// Try to get the class name corresponding to the given class_idx
std::string class_name;
auto it = names.find(res.class_idx);
if (it != names.end()) {
class_name = it->second;
}
else {
std::cerr << "Warning: class_idx not found in names for class_idx = " << res.class_idx << std::endl;
// Then convert it to a string anyway
class_name = std::to_string(res.class_idx);
}
// Draw mask if available
if (res.mask.rows > 0 && res.mask.cols > 0) {
mask(res.bbox).setTo(color[res.class_idx], res.mask);
}
// Create label
std::stringstream labelStream;
labelStream << class_name << " " << std::fixed << std::setprecision(2) << res.conf;
std::string label = labelStream.str();
cv::Size text_size = cv::getTextSize(label, cv::FONT_HERSHEY_SIMPLEX, 0.6, 2, nullptr);
cv::Rect rect_to_fill(left - 1, top - text_size.height - 5, text_size.width + 2, text_size.height + 5);
cv::Scalar text_color = cv::Scalar(255.0, 255.0, 255.0);
rectangle(img, rect_to_fill, color[res.class_idx], -1);
putText(img, label, cv::Point(left - 1.5, top - 2.5), cv::FONT_HERSHEY_SIMPLEX, 0.6, text_color, 2);
// Check if keypoints are available
if (!res.keypoints.empty()) {
auto keypoint = res.keypoints;
bool isPose = keypoint.size() == 51; // numKeypoints == 17 && keypoints[0].size() == 3;
drawLines &= isPose;
// draw points
for (int i = 0; i < 17; i++) {
int idx = i * 3;
int x_coord = static_cast<int>(keypoint[idx]);
int y_coord = static_cast<int>(keypoint[idx + 1]);
if (x_coord % raw_image_shape.width != 0 && y_coord % raw_image_shape.height != 0) {
if (keypoint.size() == 3) {
float conf = keypoint[2];
if (conf < 0.5) {
continue;
}
}
cv::Scalar color_k = isPose ? kptColorPalette[i] : cv::Scalar(0, 0,
255); // Default to red if not in pose mode
cv::circle(img, cv::Point(x_coord, y_coord), radius, color_k, -1, cv::LINE_AA);
}
}
// draw lines
if (drawLines) {
for (int i = 0; i < skeleton.size(); i++) {
const std::vector<int> &sk = skeleton[i];
int idx1 = sk[0] - 1;
int idx2 = sk[1] - 1;
int idx1_x_pos = idx1 * 3;
int idx2_x_pos = idx2 * 3;
int x1 = static_cast<int>(keypoint[idx1_x_pos]);
int y1 = static_cast<int>(keypoint[idx1_x_pos + 1]);
int x2 = static_cast<int>(keypoint[idx2_x_pos]);
int y2 = static_cast<int>(keypoint[idx2_x_pos + 1]);
float conf1 = keypoint[idx1_x_pos + 2];
float conf2 = keypoint[idx2_x_pos + 2];
// Check confidence thresholds
if (conf1 < 0.5 || conf2 < 0.5) {
continue;
}
// Check if positions are within bounds
if (x1 % raw_image_shape.width == 0 || y1 % raw_image_shape.height == 0 || x1 < 0 || y1 < 0 ||
x2 % raw_image_shape.width == 0 || y2 % raw_image_shape.height == 0 || x2 < 0 || y2 < 0) {
continue;
}
// Draw a line between keypoints
cv::Scalar color_limb = limbColorPalette[i];
cv::line(img, cv::Point(x1, y1), cv::Point(x2, y2), color_limb, 2, cv::LINE_AA);
}
}
}
}
// Combine the image and mask
addWeighted(img, 0.6, mask, 0.4, 0, img);
// resize(img, img, img.size());
// resize(img, img, shape);
// // Show the image
// imshow("img", img);
// cv::waitKey();
}
int main()
{
std::string img_path = "/home/xdobro23/path_to_16bit_3_channel_image.png";
const std::string& modelPath = "/home/xdobro23/PP1/ultralytics/runs/detect/train35/weights/best.onnx"; // pose
fs::path imageFilePath(img_path);
fs::path newFilePath = imageFilePath.stem();
newFilePath += "-kpt-cpp";
newFilePath += imageFilePath.extension();
assert(newFilePath != imageFilePath);
std::cout << "newFilePath: " << newFilePath << std::endl;
const std::string& onnx_provider = OnnxProviders::CPU; // "cpu";
const std::string& onnx_logid = "yolov8_inference2";
float mask_threshold = 0.5f; // in python it's 0.5 too - see ultralytics/utils/ops.process_mask, line 705 (ultralytics.__version__ == 8.0.160)
float conf_threshold = 0.30f;
float iou_threshold = 0.45f; // 0.70f;
int conversion_code = -1;
cv::Mat img = cv::imread(img_path, cv::IMREAD_UNCHANGED);
if (img.empty()) {
std::cerr << "Error: Unable to load image" << std::endl;
return 1;
}
AutoBackendOnnx model(modelPath.c_str(), onnx_logid.c_str(), onnx_provider.c_str());
std::cout << "Model loaded" << std::endl;
std::vector<YoloResults> objs = model.predict_once(img, conf_threshold, iou_threshold, mask_threshold);
std::cout << "Got results" << std::endl;
std::vector<cv::Scalar> colors = generateRandomColors(model.getNc(), model.getCh());
std::unordered_map<int, std::string> names = model.getNames();
std::vector<std::vector<float>> keypointsVector;
for (const YoloResults& result : objs) {
keypointsVector.push_back(result.keypoints);
}
cv::cvtColor(img, img, cv::COLOR_RGB2BGR);
cv::Size show_shape = img.size(); // cv::Size(1280, 720); // img.size()
plot_results(img, objs, colors, names, show_shape);
// plot_masks(img, objs, colors, names);
cv::imshow("img", img);
cv::waitKey();
return 0;
}

@@ -0,0 +1,520 @@
#pragma once
#include "nn/autobackend.h"
#include <iostream>
#include <ostream>
#include <filesystem>
#include <opencv2/opencv.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/core/mat.hpp>
#include "utils/augment.h"
#include "constants.h"
#include "utils/common.h"
#include "utils/ops.h"
namespace fs = std::filesystem;
AutoBackendOnnx::AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider,
const std::vector<int>& imgsz, const int& stride,
const int& nc, const std::unordered_map<int, std::string> names)
: OnnxModelBase(modelPath, logid, provider), imgsz_(imgsz), stride_(stride), nc_(nc), names_(names),
inputTensorShape_()
{
}
AutoBackendOnnx::AutoBackendOnnx(const char* modelPath, const char* logid, const char* provider)
: OnnxModelBase(modelPath, logid, provider) {
// the base class is fully initialized by the member-initializer list above;
// now try to get additional info from metadata, like imgsz, stride etc.
// Ideally all of them are present, but an error is raised later if something is missing
// from the metadata (or is not stored under the appropriate keys)
const std::unordered_map<std::string, std::string>& base_metadata = OnnxModelBase::getMetadata();
// post init imgsz
auto imgsz_iterator = base_metadata.find(MetadataConstants::IMGSZ);
if (imgsz_iterator != base_metadata.end()) {
// parse it and convert to int iterable
std::vector<int> imgsz = convertStringVectorToInts(parseVectorString(imgsz_iterator->second));
// set it here:
if (imgsz_.empty()) {
imgsz_ = imgsz;
}
}
else {
std::cerr << "Warning: Cannot get imgsz value from metadata" << std::endl;
}
// post init stride
auto stride_item = base_metadata.find(MetadataConstants::STRIDE);
if (stride_item != base_metadata.end()) {
// parse it and convert to int
int stride_int = std::stoi(stride_item->second);
// set it here:
if (stride_ == OnnxInitializers::UNINITIALIZED_STRIDE) {
stride_ = stride_int;
}
}
else {
std::cerr << "Warning: Cannot get stride value from metadata" << std::endl;
}
// post init names
auto names_item = base_metadata.find(MetadataConstants::NAMES);
if (names_item != base_metadata.end()) {
// parse it and convert to int iterable
std::unordered_map<int, std::string> names = parseNames(names_item->second);
std::cout << "***Names from metadata***" << std::endl;
for (const auto& pair : names) {
std::cout << "Key: " << pair.first << ", Value: " << pair.second << std::endl;
}
// set it here:
if (names_.empty()) {
names_ = names;
}
}
else {
std::cerr << "Warning: Cannot get names value from metadata" << std::endl;
}
// post init number of classes - this can be done only if names_ is not empty and nc was not initialized previously
if (nc_ == OnnxInitializers::UNINITIALIZED_NC && !names_.empty()) {
nc_ = names_.size();
}
else if (nc_ == OnnxInitializers::UNINITIALIZED_NC) {
std::cerr << "Warning: Cannot get nc value from metadata (probably names weren't set)" << std::endl;
}
if (!imgsz_.empty() && inputTensorShape_.empty())
{
inputTensorShape_ = { 1, ch_, getHeight(), getWidth() };
}
if (!imgsz_.empty())
{
// Initialize cvSize_ using getHeight() and getWidth()
//cvSize_ = cv::MatSize()
cvSize_ = cv::Size(getWidth(), getHeight());
//cvMatSize_ = cv::MatSize(cvSize_.width, cvSize_.height);
}
// task init:
auto task_item = base_metadata.find(MetadataConstants::TASK);
if (task_item != base_metadata.end()) {
// parse it and convert to int iterable
std::string task = std::string(task_item->second);
// set it here:
if (task_.empty())
{
task_ = task;
}
}
else {
std::cerr << "Warning: Cannot get task value from metadata" << std::endl;
}
// TODO: raise assert if imgsz_ and task_ were not initialized (since you don't know in that case which postprocessing to use)
}
const std::vector<int>& AutoBackendOnnx::getImgsz() {
return imgsz_;
}
const int& AutoBackendOnnx::getHeight()
{
return imgsz_[0];
}
const int& AutoBackendOnnx::getWidth()
{
return imgsz_[1];
}
const int& AutoBackendOnnx::getStride() {
return stride_;
}
const int& AutoBackendOnnx::getCh() {
return ch_;
}
const int& AutoBackendOnnx::getNc() {
return nc_;
}
const std::unordered_map<int, std::string>& AutoBackendOnnx::getNames() {
return names_;
}
const cv::Size& AutoBackendOnnx::getCvSize()
{
return cvSize_;
}
const std::vector<int64_t>& AutoBackendOnnx::getInputTensorShape()
{
return inputTensorShape_;
}
const std::string& AutoBackendOnnx::getTask()
{
return task_;
}
std::vector<YoloResults> AutoBackendOnnx::predict_once(const std::string& imagePath, float& conf, float& iou, float& mask_threshold,
int conversionCode, bool verbose) {
// Convert the string imagePath to an object of type std::filesystem::path
fs::path imageFilePath(imagePath);
// Call the predict_once method, converting the image to a cv::Mat
return predict_once(imageFilePath, conf, iou, mask_threshold, conversionCode, verbose);
}
std::vector<YoloResults> AutoBackendOnnx::predict_once(const fs::path& imagePath, float& conf, float& iou, float& mask_threshold,
int conversionCode, bool verbose) {
// Check if the specified path exists
if (!fs::exists(imagePath)) {
std::cerr << "Error: File does not exist: " << imagePath << std::endl;
// Return an empty vector or throw an exception, depending on your logic
return {};
}
// Load the image into a cv::Mat
cv::Mat image = cv::imread(imagePath.string(), cv::IMREAD_UNCHANGED);
// Check if loading the image was successful
if (image.empty()) {
std::cerr << "Error: Failed to load image: " << imagePath << std::endl;
// Return an empty vector or throw an exception, depending on your logic
return {};
}
// now do some preprocessing based on channels info:
int required_image_channels = this->getCh();
/*assert(required_image_channels == image.channels() && "");*/
// Assert that the number of channels in the input image matches the required number of channels for the model
if (required_image_channels != image.channels()) {
const std::string& errorMessage = "Error: Number of image channels does not match the required channels.\n"
"Number of channels in the image: " + std::to_string(image.channels());
throw std::runtime_error(errorMessage);
}
// Call overloaded one
return predict_once(image, conf, iou, mask_threshold, conversionCode, verbose);
}
std::vector<YoloResults> AutoBackendOnnx::predict_once(cv::Mat& image, float& conf, float& iou, float& mask_threshold, int conversionCode, bool verbose) {
double preprocess_time = 0.0;
double inference_time = 0.0;
double postprocess_time = 0.0;
Timer preprocess_timer = Timer(preprocess_time, verbose);
// 1. preprocess
float* blob = nullptr;
//double* blob = nullptr;
std::vector<Ort::Value> inputTensors;
if (conversionCode >= 0) {
cv::cvtColor(image, image, conversionCode);
}
std::vector<int64_t> inputTensorShape;
// TODO: for classify task preprocessed image will be different (!):
cv::Mat preprocessed_img;
cv::Size new_shape = cv::Size(getWidth(), getHeight());
const bool& scaleFill = false; // false
const bool& auto_ = false; // true
letterbox(image, preprocessed_img, new_shape, cv::Scalar(), auto_, scaleFill, true, getStride());
fill_blob(preprocessed_img, blob, inputTensorShape);
int64_t inputTensorSize = vector_product(inputTensorShape);
std::vector<float> inputTensorValues(blob, blob + inputTensorSize);
Ort::MemoryInfo memoryInfo = Ort::MemoryInfo::CreateCpu(
OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault);
inputTensors.push_back(Ort::Value::CreateTensor<float>(
memoryInfo, inputTensorValues.data(), inputTensorSize,
inputTensorShape.data(), inputTensorShape.size()
));
preprocess_timer.Stop();
Timer inference_timer = Timer(inference_time, verbose);
// 2. inference
std::vector<Ort::Value> outputTensors = forward(inputTensors);
inference_timer.Stop();
Timer postprocess_timer = Timer(postprocess_time, verbose);
// create container for the results
std::vector<YoloResults> results;
// 3. postprocess based on task:
std::unordered_map<int, std::string> names = this->getNames();
// 4. cleanup blob since it was created using the "new" keyword during the `fill_blob` func call
delete[] blob;
int class_names_num = names.size();
if (task_ == YoloTasks::SEGMENT) {
// get outputs info
std::vector<int64_t> outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
std::vector<int64_t> outputTensor1Shape = outputTensors[1].GetTensorTypeAndShapeInfo().GetShape();
// get outputs
float* all_data0 = outputTensors[0].GetTensorMutableData<float>();
cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t(); // [bs, features, preds_num]=>[bs, preds_num, features]
auto mask_shape = outputTensor1Shape;
std::vector<int> mask_sz = { 1,(int)mask_shape[1],(int)mask_shape[2],(int)mask_shape[3] };
cv::Mat output1 = cv::Mat(mask_sz, CV_32F, outputTensors[1].GetTensorMutableData<float>());
int iw = this->getWidth();
int ih = this->getHeight();
int mask_features_num = outputTensor1Shape[1];
int mh = outputTensor1Shape[2];
int mw = outputTensor1Shape[3];
ImageInfo img_info = { image.size() };
postprocess_masks(output0, output1, img_info, results, class_names_num, conf, iou,
iw, ih, mw, mh, mask_features_num, mask_threshold);
}
else if (task_ == YoloTasks::DETECT) {
ImageInfo img_info = { image.size() };
std::vector<int64_t> outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
float* all_data0 = outputTensors[0].GetTensorMutableData<float>();
cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t(); // [bs, features, preds_num]=>[bs, preds_num, features]
postprocess_detects(output0, img_info, results, class_names_num, conf, iou);
}
else if (task_ == YoloTasks::POSE) {
ImageInfo image_info = { image.size() };
std::vector<int64_t> outputTensor0Shape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
float* all_data0 = outputTensors[0].GetTensorMutableData<float>();
cv::Mat output0 = cv::Mat(cv::Size((int)outputTensor0Shape[2], (int)outputTensor0Shape[1]), CV_32F, all_data0).t(); // [bs, features, preds_num]=>[bs, preds_num, features]
postprocess_kpts(output0, image_info, results, class_names_num, conf, iou);
}
else {
throw std::runtime_error("NotImplementedError: task: " + task_);
}
postprocess_timer.Stop();
if (verbose) {
std::cout << std::fixed << std::setprecision(1);
std::cout << "image: " << preprocessed_img.rows << "x" << preprocessed_img.cols << " " << results.size() << " objs, ";
std::cout << (preprocess_time + inference_time + postprocess_time) * 1000.0 << "ms" << std::endl;
std::cout << "Speed: " << (preprocess_time * 1000.0) << "ms preprocess, ";
std::cout << (inference_time * 1000.0) << "ms inference, ";
std::cout << (postprocess_time * 1000.0) << "ms postprocess per image ";
std::cout << "at shape (1, " << image.channels() << ", " << preprocessed_img.rows << ", " << preprocessed_img.cols << ")" << std::endl;
}
return results;
}
void AutoBackendOnnx::postprocess_masks(cv::Mat& output0, cv::Mat& output1, ImageInfo image_info, std::vector<YoloResults>& output,
int& class_names_num, float& conf_threshold, float& iou_threshold,
int& iw, int& ih, int& mw, int& mh, int& masks_features_num, float mask_threshold /* = 0.5f */)
{
output.clear();
std::vector<int> class_ids;
std::vector<float> confidences;
std::vector<cv::Rect> boxes;
std::vector<std::vector<float>> masks;
// 4 - your default number of rect parameters {x, y, w, h}
int data_width = class_names_num + 4 + masks_features_num;
int rows = output0.rows;
float* pdata = (float*)output0.data;
for (int r = 0; r < rows; ++r)
{
cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4);
cv::Point class_id;
double max_conf;
minMaxLoc(scores, 0, &max_conf, 0, &class_id);
if (max_conf > conf_threshold)
{
masks.push_back(std::vector<float>(pdata + 4 + class_names_num, pdata + data_width));
class_ids.push_back(class_id.x);
confidences.push_back(max_conf);
float out_w = pdata[2];
float out_h = pdata[3];
float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0);
float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0);
cv::Rect_<float> bbox = cv::Rect_<float>(out_left, out_top, (out_w + 0.5), (out_h + 0.5));
cv::Rect_<float> scaled_bbox = scale_boxes(getCvSize(), bbox, image_info.raw_size);
boxes.push_back(scaled_bbox);
}
pdata += data_width; // next pred
}
//
//float masks_threshold = 0.50;
//int top_k = 500;
//const float& nmsde_eta = 1.0f;
std::vector<int> nms_result;
cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result); // , nms_eta, top_k);
// select all of the protos tensor
cv::Size downsampled_size = cv::Size(mw, mh);
std::vector<cv::Range> roi_rangs = { cv::Range(0, 1), cv::Range::all(),
cv::Range(0, downsampled_size.height), cv::Range(0, downsampled_size.width) };
cv::Mat temp_mask = output1(roi_rangs).clone();
cv::Mat proto = temp_mask.reshape(0, { masks_features_num, downsampled_size.width * downsampled_size.height });
for (int i = 0; i < nms_result.size(); ++i)
{
int idx = nms_result[i];
boxes[idx] = boxes[idx] & cv::Rect(0, 0, image_info.raw_size.width, image_info.raw_size.height);
YoloResults result = { class_ids[idx] ,confidences[idx] ,boxes[idx] };
_get_mask2(cv::Mat(masks[idx]).t(), proto, image_info, boxes[idx], result.mask, mask_threshold,
iw, ih, mw, mh, masks_features_num);
output.push_back(result);
}
}
void AutoBackendOnnx::postprocess_detects(cv::Mat& output0, ImageInfo image_info, std::vector<YoloResults>& output,
int& class_names_num, float& conf_threshold, float& iou_threshold)
{
output.clear();
std::vector<int> class_ids;
std::vector<float> confidences;
std::vector<cv::Rect> boxes;
std::vector<std::vector<float>> masks;
// 4 - your default number of rect parameters {x, y, w, h}
int data_width = class_names_num + 4;
int rows = output0.rows;
float* pdata = (float*)output0.data;
for (int r = 0; r < rows; ++r)
{
cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4);
cv::Point class_id;
double max_conf;
minMaxLoc(scores, nullptr, &max_conf, nullptr, &class_id);
if (max_conf > conf_threshold)
{
masks.emplace_back(pdata + 4 + class_names_num, pdata + data_width);
class_ids.push_back(class_id.x);
confidences.push_back((float) max_conf);
float out_w = pdata[2];
float out_h = pdata[3];
float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0);
float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0);
cv::Rect_ <float> bbox = cv::Rect_ <float> (out_left, out_top, (out_w + 0.5), (out_h + 0.5));
cv::Rect_<float> scaled_bbox = scale_boxes(getCvSize(), bbox, image_info.raw_size);
boxes.push_back(scaled_bbox);
}
pdata += data_width; // next pred
}
std::vector<int> nms_result;
cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result); // , nms_eta, top_k);
for (int idx : nms_result)
{
boxes[idx] = boxes[idx] & cv::Rect(0, 0, image_info.raw_size.width, image_info.raw_size.height);
YoloResults result = { class_ids[idx] ,confidences[idx] ,boxes[idx] };
output.push_back(result);
}
}
void AutoBackendOnnx::postprocess_kpts(cv::Mat& output0, ImageInfo& image_info, std::vector<YoloResults>& output,
int& class_names_num, float& conf_threshold, float& iou_threshold)
{
std::vector<cv::Rect> boxes;
std::vector<float> confidences;
std::vector<int> class_ids;
std::vector<std::vector<float>> rest;
std::tie(boxes, confidences, class_ids, rest) = non_max_suppression(output0, class_names_num, output0.cols, conf_threshold, iou_threshold);
cv::Size img1_shape = getCvSize();
auto bound_bbox = cv::Rect_ <float> (0, 0, image_info.raw_size.width, image_info.raw_size.height);
for (int i = 0; i < boxes.size(); i++) {
// pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], shape).round()
// pred_kpts = pred[:, 6:].view(len(pred), *self.model.kpt_shape) if len(pred) else pred[:, 6:]
// pred_kpts = ops.scale_coords(img.shape[2:], pred_kpts, shape)
// path = self.batch[0]
// img_path = path[i] if isinstance(path, list) else path
// results.append(
// Results(orig_img=orig_img,
// path=img_path,
// names=self.model.names,
// boxes=pred[:, :6],
// keypoints=pred_kpts))
cv::Rect_<float> bbox = boxes[i];
auto scaled_bbox = scale_boxes(img1_shape, bbox, image_info.raw_size);
scaled_bbox = scaled_bbox & bound_bbox;
// cv::Mat kpt = cv::Mat(rest[i]).t();
// scale_coords(img1_shape, kpt, image_info.raw_size);
// TODO: overload scale_coords so that will accept cv::Mat of shape [17, 3]
// so that it will be more similar to what we have in python
std::vector<float> kpt = scale_coords(img1_shape, rest[i], image_info.raw_size);
YoloResults tmp_res = { class_ids[i], confidences[i], scaled_bbox, {}, kpt};
output.push_back(tmp_res);
}
}
void AutoBackendOnnx::_get_mask2(const cv::Mat& masks_features,
const cv::Mat& proto,
const ImageInfo& image_info, const cv::Rect bound, cv::Mat& mask_out,
float& mask_thresh, int& iw, int& ih, int& mw, int& mh, int& masks_features_num,
bool round_downsampled)
{
cv::Size img0_shape = image_info.raw_size;
cv::Size img1_shape = cv::Size(iw, ih);
cv::Size downsampled_size = cv::Size(mw, mh);
cv::Rect_<float> bound_float(
static_cast<float>(bound.x),
static_cast<float>(bound.y),
static_cast<float>(bound.width),
static_cast<float>(bound.height)
);
cv::Rect_<float> downsampled_bbox = scale_boxes(img0_shape, bound_float, downsampled_size);
cv::Size bound_size = cv::Size(mw, mh);
clip_boxes(downsampled_bbox, bound_size);
cv::Mat matmul_res = (masks_features * proto).t();
matmul_res = matmul_res.reshape(1, { downsampled_size.height, downsampled_size.width });
// apply sigmoid to the mask:
cv::Mat sigmoid_mask;
exp(-matmul_res, sigmoid_mask);
sigmoid_mask = 1.0 / (1.0 + sigmoid_mask);
cv::Mat resized_mask;
cv::Rect_<float> input_bbox = scale_boxes(img0_shape, bound_float, img1_shape);
cv::resize(sigmoid_mask, resized_mask, img1_shape, 0, 0, cv::INTER_LANCZOS4);
cv::Mat pre_out_mask = resized_mask(input_bbox);
cv::Mat scaled_mask;
scale_image2(scaled_mask, resized_mask, img0_shape);
cv::resize(scaled_mask, mask_out, img0_shape);
mask_out = mask_out(bound) > mask_thresh;
}
void AutoBackendOnnx::fill_blob(cv::Mat& image, float*& blob, std::vector<int64_t>& inputTensorShape) {
cv::Mat floatImage;
if (inputTensorShape.empty())
{
inputTensorShape = getInputTensorShape();
}
int inputChannelsNum = inputTensorShape[1];
int rtype = CV_32FC3;
image.convertTo(floatImage, rtype, 1.0f / 65535.0); // normalize 16-bit input to [0, 1]
blob = new float[floatImage.cols * floatImage.rows * floatImage.channels()];
cv::Size floatImageSize{ floatImage.cols, floatImage.rows };
// hwc -> chw
std::vector<cv::Mat> chw(floatImage.channels());
for (int i = 0; i < floatImage.channels(); ++i)
{
chw[i] = cv::Mat(floatImageSize, CV_32FC1, blob + i * floatImageSize.width * floatImageSize.height);
}
cv::split(floatImage, chw);
}

@@ -0,0 +1,181 @@
#include "nn/onnx_model_base.h"
#include <iostream>
#include <onnxruntime_cxx_api.h>
#include <onnxruntime_c_api.h>
#include "constants.h"
#include "utils/common.h"
/**
* @brief Base class for any ONNX model, regardless of the target task.
*
* Wraps the OrtApi.
*
* The caller provides a model path, a log id, and a provider.
*
* See the output logs for more information on warnings/errors that occur while processing the model.
*
* @param[in] modelPath Path to the model file.
* @param[in] logid Log identifier.
* @param[in] provider Provider (e.g., "cpu" or "cuda"). (NOTE: for now only CPU is supported)
*/
OnnxModelBase::OnnxModelBase(const char* modelPath, const char* logid, const char* provider)
//: modelPath_(modelPath), env(std::move(env)), session(std::move(session))
: modelPath_(modelPath)
{
// TODO: hard-coding `ORT_LOGGING_LEVEL_WARNING` by default is not ideal - for some cases
// the info level would make sense too
env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, logid);
Ort::SessionOptions sessionOptions = Ort::SessionOptions();
std::vector<std::string> availableProviders = Ort::GetAvailableProviders();
auto cudaAvailable = std::find(availableProviders.begin(), availableProviders.end(), "CUDAExecutionProvider");
OrtCUDAProviderOptions cudaOption;
if (std::string(provider) == OnnxProviders::CUDA) { // compare by value, not by pointer
if (cudaAvailable == availableProviders.end()) {
std::cout << "CUDA is not supported by your ONNXRuntime build. Fallback to CPU." << std::endl;
//std::cout << "Inference device: CPU" << std::endl;
}
else {
//std::cout << "Inference device: GPU" << std::endl;
sessionOptions.AppendExecutionProvider_CUDA(cudaOption);
}
}
else if (std::string(provider) == OnnxProviders::CPU) {
// "cpu" by default
}
else
{
throw std::runtime_error("NotImplemented provider=" + std::string(provider));
}
std::cout << "Inference device: " << std::string(provider) << std::endl;
#ifdef _WIN32
auto modelPathW = get_win_path(modelPath); // For Windows (wstring)
session = Ort::Session(env, modelPathW.c_str(), sessionOptions);
#else
session = Ort::Session(env, modelPath, sessionOptions); // For Linux (string)
#endif
//session = Ort::Session(env)
// https://github.com/microsoft/onnxruntime/issues/14157
//std::vector<const char*> inputNodeNames; //
// ----------------
// init input names
std::vector<Ort::AllocatedStringPtr> inputNodeNameAllocatedStrings; // <-- newly added
Ort::AllocatorWithDefaultOptions allocator;
auto inputNodesNum = session.GetInputCount();
for (int i = 0; i < inputNodesNum; i++) {
auto input_name = session.GetInputNameAllocated(i, allocator);
inputNodeNameAllocatedStrings.push_back(std::move(input_name));
inputNodeNames.push_back(inputNodeNameAllocatedStrings.back().get());
}
// -----------------
// init output names
auto outputNodesNum = session.GetOutputCount();
std::vector<Ort::AllocatedStringPtr> outputNodeNameAllocatedStrings; // <-- newly added
Ort::AllocatorWithDefaultOptions output_names_allocator;
for (int i = 0; i < outputNodesNum; i++)
{
auto output_name = session.GetOutputNameAllocated(i, output_names_allocator);
outputNodeNameAllocatedStrings.push_back(std::move(output_name));
outputNodeNames.push_back(outputNodeNameAllocatedStrings.back().get());
}
// -------------------------
// initialize model metadata
model_metadata = session.GetModelMetadata();
Ort::AllocatorWithDefaultOptions metadata_allocator;
std::vector<Ort::AllocatedStringPtr> metadataAllocatedKeys = model_metadata.GetCustomMetadataMapKeysAllocated(metadata_allocator);
std::vector<std::string> metadata_keys;
metadata_keys.reserve(metadataAllocatedKeys.size());
for (const Ort::AllocatedStringPtr& allocatedString : metadataAllocatedKeys) {
metadata_keys.emplace_back(allocatedString.get());
}
// -------------------------
// initialize metadata as the dict
// even though we know exactly what metadata we intend to use
// base onnx class should not have any ultralytics yolo-specific attributes like stride, task etc, so keep it clean as much as possible
for (const std::string& key : metadata_keys) {
Ort::AllocatedStringPtr metadata_value = model_metadata.LookupCustomMetadataMapAllocated(key.c_str(), metadata_allocator);
if (metadata_value != nullptr) {
auto raw_metadata_value = metadata_value.get();
metadata[key] = std::string(raw_metadata_value);
}
}
// initialize cstr
for (const std::string& name : outputNodeNames) {
outputNamesCStr.push_back(name.c_str());
}
for (const std::string& name : inputNodeNames)
{
inputNamesCStr.push_back(name.c_str());
}
}
const std::vector<std::string>& OnnxModelBase::getInputNames() {
return inputNodeNames;
}
const std::vector<std::string>& OnnxModelBase::getOutputNames() {
return outputNodeNames;
}
const Ort::ModelMetadata& OnnxModelBase::getModelMetadata()
{
return model_metadata;
}
const std::unordered_map<std::string, std::string>& OnnxModelBase::getMetadata()
{
return metadata;
}
const Ort::Session& OnnxModelBase::getSession()
{
return session;
}
const char* OnnxModelBase::getModelPath()
{
return modelPath_;
}
const std::vector<const char*> OnnxModelBase::getOutputNamesCStr()
{
return outputNamesCStr;
}
const std::vector<const char*> OnnxModelBase::getInputNamesCStr()
{
return inputNamesCStr;
}
std::vector<Ort::Value> OnnxModelBase::forward(std::vector<Ort::Value>& inputTensors)
{
// todo: make runOptions parameter here
return session.Run(Ort::RunOptions{ nullptr },
inputNamesCStr.data(),
inputTensors.data(),
inputNamesCStr.size(),
outputNamesCStr.data(),
outputNamesCStr.size());
}
//OnnxModelBase::~OnnxModelBase() {
// // empty body
//}

@@ -0,0 +1,145 @@
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/core/mat.hpp>
#include <opencv2/core/types.hpp>
/**
* \brief padding value when letterbox changes image size ratio
*/
const int DEFAULT_LETTERBOX_PAD_VALUE = 114;
void letterbox(const cv::Mat& image,
cv::Mat& outImage,
const cv::Size& newShape,
cv::Scalar_<double> color,
bool auto_,
bool scaleFill,
bool scaleUp, int stride
) {
cv::Size shape = image.size();
float r = std::min(static_cast<float>(newShape.height) / static_cast<float>(shape.height),
static_cast<float>(newShape.width) / static_cast<float>(shape.width));
if (!scaleUp)
r = std::min(r, 1.0f);
float ratio[2]{ r, r };
int newUnpad[2]{ static_cast<int>(std::round(static_cast<float>(shape.width) * r)),
static_cast<int>(std::round(static_cast<float>(shape.height) * r)) };
auto dw = static_cast<float>(newShape.width - newUnpad[0]);
auto dh = static_cast<float>(newShape.height - newUnpad[1]);
if (auto_)
{
dw = static_cast<float>((static_cast<int>(dw) % stride));
dh = static_cast<float>((static_cast<int>(dh) % stride));
}
else if (scaleFill)
{
dw = 0.0f;
dh = 0.0f;
newUnpad[0] = newShape.width;
newUnpad[1] = newShape.height;
ratio[0] = static_cast<float>(newShape.width) / static_cast<float>(shape.width);
ratio[1] = static_cast<float>(newShape.height) / static_cast<float>(shape.height);
}
dw /= 2.0f;
dh /= 2.0f;
//cv::Mat outImage;
if (shape.width != newUnpad[0] || shape.height != newUnpad[1])
{
cv::resize(image, outImage, cv::Size(newUnpad[0], newUnpad[1]));
}
else
{
outImage = image.clone();
}
int top = static_cast<int>(std::round(dh - 0.1f));
int bottom = static_cast<int>(std::round(dh + 0.1f));
int left = static_cast<int>(std::round(dw - 0.1f));
int right = static_cast<int>(std::round(dw + 0.1f));
if (color == cv::Scalar()) {
color = cv::Scalar(DEFAULT_LETTERBOX_PAD_VALUE, DEFAULT_LETTERBOX_PAD_VALUE, DEFAULT_LETTERBOX_PAD_VALUE);
}
cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color);
}
cv::Mat scale_image(const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair<float, cv::Point2f>& ratio_pad) {
cv::Size im1_shape = resized_mask.size();
// Check if resizing is needed
if (im1_shape == im0_shape) {
return resized_mask.clone();
}
float gain, pad_x, pad_y;
if (ratio_pad.first < 0.0f) {
gain = std::min(static_cast<float>(im1_shape.height) / static_cast<float>(im0_shape.height),
static_cast<float>(im1_shape.width) / static_cast<float>(im0_shape.width));
pad_x = (im1_shape.width - im0_shape.width * gain) / 2.0f;
pad_y = (im1_shape.height - im0_shape.height * gain) / 2.0f;
}
else {
gain = ratio_pad.first;
pad_x = ratio_pad.second.x;
pad_y = ratio_pad.second.y;
}
int top = static_cast<int>(pad_y);
int left = static_cast<int>(pad_x);
int bottom = static_cast<int>(im1_shape.height - pad_y);
int right = static_cast<int>(im1_shape.width - pad_x);
// Clip and resize the mask
cv::Rect clipped_rect(left, top, right - left, bottom - top);
cv::Mat clipped_mask = resized_mask(clipped_rect);
cv::Mat scaled_mask;
cv::resize(clipped_mask, scaled_mask, im0_shape);
return scaled_mask;
}
void scale_image2(cv::Mat& scaled_mask, const cv::Mat& resized_mask, const cv::Size& im0_shape, const std::pair<float, cv::Point2f>& ratio_pad) {
cv::Size im1_shape = resized_mask.size();
// Check if resizing is needed
if (im1_shape == im0_shape) {
scaled_mask = resized_mask.clone();
return;
}
float gain, pad_x, pad_y;
if (ratio_pad.first < 0.0f) {
gain = std::min(static_cast<float>(im1_shape.height) / static_cast<float>(im0_shape.height),
static_cast<float>(im1_shape.width) / static_cast<float>(im0_shape.width));
pad_x = (im1_shape.width - im0_shape.width * gain) / 2.0f;
pad_y = (im1_shape.height - im0_shape.height * gain) / 2.0f;
}
else {
gain = ratio_pad.first;
pad_x = ratio_pad.second.x;
pad_y = ratio_pad.second.y;
}
int top = static_cast<int>(pad_y);
int left = static_cast<int>(pad_x);
int bottom = static_cast<int>(im1_shape.height - pad_y);
int right = static_cast<int>(im1_shape.width - pad_x);
// Clip and resize the mask
cv::Rect clipped_rect(left, top, right - left, bottom - top);
cv::Mat clipped_mask = resized_mask(clipped_rect);
cv::resize(clipped_mask, scaled_mask, im0_shape);
}

@@ -0,0 +1,147 @@
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#include <string>
#include <sstream>
#include <codecvt>
#include "utils/common.h"
#include <chrono>
//#include <boost/algorithm/string.hpp>
#include <iostream>
#include <stdio.h>
#include <string>
#include <regex>
#include <vector>
Timer::Timer(double& accumulator, bool isEnabled)
: accumulator(accumulator), isEnabled(isEnabled) {
if (isEnabled) {
start = std::chrono::high_resolution_clock::now();
}
}
// Stop the timer and update the accumulator
void Timer::Stop() {
if (isEnabled) {
auto end = std::chrono::high_resolution_clock::now();
double duration = std::chrono::duration<double>(end - start).count();
accumulator += duration;
}
}
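Usage pattern for this Timer, as a sketch: the accumulator outlives the timer and collects elapsed seconds on each Stop() call.

#include <iostream>
#include "utils/common.h"

void timed_work() {
    double total_s = 0.0;             // accumulator outlives the timer
    Timer t(total_s, /*isEnabled=*/true);
    // ... timed work goes here ...
    t.Stop();                         // adds elapsed seconds to total_s
    std::cout << total_s * 1000.0 << " ms" << std::endl;
}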
// C++14 version
//#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
std::wstring get_win_path(const std::string& modelPath) {
#ifdef _WIN32
return std::wstring_convert<std::codecvt_utf8<wchar_t>>().from_bytes(modelPath);
#else
// return modelPath;
return std::wstring(modelPath.begin(), modelPath.end());
#endif
}
std::vector<std::string> parseVectorString(const std::string& input) {
/* The main purpose of this function is to parse the `imgsz` value of the model metadata
* and extract the height and width from a string like "[height, width]" as a vector of strings
* Args:
* input:
* expected to be something like [544, 960] or [3, 544, 960]
* output:
* iterable of strings representing integers
*/
std::regex number_pattern(R"(\d+)");
std::vector<std::string> result;
std::sregex_iterator it(input.begin(), input.end(), number_pattern);
std::sregex_iterator end;
while (it != end) {
result.push_back(it->str());
++it;
}
return result;
}
std::vector<int> convertStringVectorToInts(const std::vector<std::string>& input) {
std::vector<int> result;
for (const std::string& str : input) {
try {
int value = std::stoi(str);
result.push_back(value);
}
catch (const std::invalid_argument& e) {
// raise explicit exception
throw std::invalid_argument("Bad argument (cannot cast): value=" + str);
}
catch (const std::out_of_range& e) {
// check bounds
throw std::out_of_range("Value out of range: " + str);
}
}
return result;
}
/*
std::unordered_map<int, std::string> parseNames(const std::string& input) {
std::unordered_map<int, std::string> result;
std::string cleanedInput = input;
boost::erase_all(cleanedInput, "{");
boost::erase_all(cleanedInput, "}");
std::vector<std::string> elements;
boost::split(elements, cleanedInput, boost::is_any_of(","));
for (const std::string& element : elements) {
std::vector<std::string> keyValue;
boost::split(keyValue, element, boost::is_any_of(":"));
if (keyValue.size() == 2) {
int key = std::stoi(boost::trim_copy(keyValue[0]));
std::string value = boost::trim_copy(keyValue[1]);
result[key] = value;
}
}
return result;
}
*/
std::unordered_map<int, std::string> parseNames(const std::string& input) {
std::unordered_map<int, std::string> result;
std::string cleanedInput = input;
cleanedInput.erase(std::remove(cleanedInput.begin(), cleanedInput.end(), '{'), cleanedInput.end());
cleanedInput.erase(std::remove(cleanedInput.begin(), cleanedInput.end(), '}'), cleanedInput.end());
std::istringstream elementStream(cleanedInput);
std::string element;
while (std::getline(elementStream, element, ',')) {
std::istringstream keyValueStream(element);
std::string keyStr, value;
if (std::getline(keyValueStream, keyStr, ':') && std::getline(keyValueStream, value)) {
int key = std::stoi(keyStr);
result[key] = value;
}
}
return result;
}
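As a concrete (hypothetical) input, the Ultralytics `names` metadata looks like `{0: 'person', 1: 'bicycle'}`. Note that this parser keeps everything after the first ':' verbatim, so leading spaces and quotes survive in the value:

#include <cassert>
#include "utils/common.h"

int main() {
    auto names = parseNames("{0: 'person', 1: 'bicycle'}");
    // Whitespace and quotes are not trimmed from the value part:
    assert(names.at(0) == " 'person'");
    assert(names.at(1) == " 'bicycle'");
    return 0;
}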
int64_t vector_product(const std::vector<int64_t>& vec) {
int64_t result = 1;
for (int64_t value : vec) {
result *= value;
}
return result;
}

@@ -0,0 +1,217 @@
#pragma once
#include <opencv2/opencv.hpp>
//#include <opencv2/imgcodecs.hpp>
//#include <opencv2/imgproc.hpp>
#include <opencv2/core/mat.hpp>
#include <opencv2/core/types.hpp>
#include <opencv2/core.hpp>
#include <vector>
void clip_boxes(cv::Rect& box, const cv::Size& shape) {
box.x = std::max(0, std::min(box.x, shape.width));
box.y = std::max(0, std::min(box.y, shape.height));
box.width = std::max(0, std::min(box.width, shape.width - box.x));
box.height = std::max(0, std::min(box.height, shape.height - box.y));
}
void clip_boxes(cv::Rect_<float>& box, const cv::Size& shape) {
box.x = std::max(0.0f, std::min(box.x, static_cast<float>(shape.width)));
box.y = std::max(0.0f, std::min(box.y, static_cast<float>(shape.height)));
box.width = std::max(0.0f, std::min(box.width, static_cast<float>(shape.width - box.x)));
box.height = std::max(0.0f, std::min(box.height, static_cast<float>(shape.height - box.y)));
}
void clip_boxes(std::vector<cv::Rect>& boxes, const cv::Size& shape) {
for (cv::Rect& box : boxes) {
clip_boxes(box, shape);
}
}
void clip_boxes(std::vector<cv::Rect_<float>>& boxes, const cv::Size& shape) {
for (cv::Rect_<float>& box : boxes) {
clip_boxes(box, shape);
}
}
// source: ultralytics/utils/ops.py scale_boxes lines 99+ (ultralytics==8.0.160)
cv::Rect_<float> scale_boxes(const cv::Size& img1_shape, cv::Rect_<float>& box, const cv::Size& img0_shape,
std::pair<float, cv::Point2f> ratio_pad = std::make_pair(-1.0f, cv::Point2f(-1.0f, -1.0f)), bool padding = true) {
float gain, pad_x, pad_y;
if (ratio_pad.first < 0.0f) {
gain = std::min(static_cast<float>(img1_shape.height) / static_cast<float>(img0_shape.height),
static_cast<float>(img1_shape.width) / static_cast<float>(img0_shape.width));
pad_x = roundf((img1_shape.width - img0_shape.width * gain) / 2.0f - 0.1f);
pad_y = roundf((img1_shape.height - img0_shape.height * gain) / 2.0f - 0.1f);
}
else {
gain = ratio_pad.first;
pad_x = ratio_pad.second.x;
pad_y = ratio_pad.second.y;
}
//cv::Rect scaledCoords(box);
cv::Rect_<float> scaledCoords(box);
if (padding) {
scaledCoords.x -= pad_x;
scaledCoords.y -= pad_y;
}
scaledCoords.x /= gain;
scaledCoords.y /= gain;
scaledCoords.width /= gain;
scaledCoords.height /= gain;
// Clip the box to the bounds of the image
clip_boxes(scaledCoords, img0_shape);
return scaledCoords;
}
//void clip_coords(cv::Mat& coords, const cv::Size& shape) {
// // Clip x-coordinates to the image width
// cv::Mat xCoords = coords.col(0);
// cv::Mat yCoords = coords.col(1);
//
// for (int i = 0; i < coords.rows; ++i) {
// xCoords.at<float>(i) = std::max(std::min(xCoords.at<float>(i), static_cast<float>(shape.width - 1)), 0.0f);
// yCoords.at<float>(i) = std::max(std::min(yCoords.at<float>(i), static_cast<float>(shape.height - 1)), 0.0f);
// }
//}
void clip_coords(std::vector<float>& coords, const cv::Size& shape) {
// Assuming coords are of shape [1, 17, 3]
for (int i = 0; i < coords.size(); i += 3) {
coords[i] = std::min(std::max(coords[i], 0.0f), static_cast<float>(shape.width - 1)); // x
coords[i + 1] = std::min(std::max(coords[i + 1], 0.0f), static_cast<float>(shape.height - 1)); // y
}
}
// source: ultralytics/utils/ops.py scale_coords lines 753+ (ultralytics==8.0.160)
//cv::Mat scale_coords(const cv::Size& img1_shape, cv::Mat& coords, const cv::Size& img0_shape)
//cv::Mat scale_coords(const cv::Size& img1_shape, std::vector<float> coords, const cv::Size& img0_shape)
std::vector<float> scale_coords(const cv::Size& img1_shape, std::vector<float>& coords, const cv::Size& img0_shape)
{
// cv::Mat scaledCoords = coords.clone();
std::vector<float> scaledCoords = coords;
// Calculate gain and padding
double gain = std::min(static_cast<double>(img1_shape.width) / img0_shape.width, static_cast<double>(img1_shape.height) / img0_shape.height);
cv::Point2d pad((img1_shape.width - img0_shape.width * gain) / 2, (img1_shape.height - img0_shape.height * gain) / 2);
// Apply padding
// scaledCoords.col(0) = (scaledCoords.col(0) - pad.x);
// scaledCoords.col(1) = (scaledCoords.col(1) - pad.y);
// Assuming coords are of shape [1, 17, 3]
for (int i = 0; i < scaledCoords.size(); i += 3) {
scaledCoords[i] -= pad.x; // x padding
scaledCoords[i + 1] -= pad.y; // y padding
}
// Scale coordinates
// scaledCoords.col(0) /= gain;
// scaledCoords.col(1) /= gain;
// Assuming coords are of shape [1, 17, 3]
for (int i = 0; i < scaledCoords.size(); i += 3) {
scaledCoords[i] /= gain;
scaledCoords[i + 1] /= gain;
}
clip_coords(scaledCoords, img0_shape);
return scaledCoords;
}
cv::Mat crop_mask(const cv::Mat& mask, const cv::Rect& box) {
int h = mask.rows;
int w = mask.cols;
int x1 = box.x;
int y1 = box.y;
int x2 = box.x + box.width;
int y2 = box.y + box.height;
cv::Mat cropped_mask = cv::Mat::zeros(h, w, mask.type());
for (int r = 0; r < h; ++r) {
for (int c = 0; c < w; ++c) {
if (r >= y1 && r < y2 && c >= x1 && c < x2) {
cropped_mask.at<float>(r, c) = mask.at<float>(r, c);
}
}
}
return cropped_mask;
}
//std::tuple<std::vector<cv::Rect_<float>>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
std::tuple<std::vector<cv::Rect>, std::vector<float>, std::vector<int>, std::vector<std::vector<float>>>
non_max_suppression(const cv::Mat& output0, int class_names_num, int data_width, double conf_threshold,
float iou_threshold) {
std::vector<int> class_ids;
std::vector<float> confidences;
// std::vector<cv::Rect_<float>> boxes;
std::vector<cv::Rect> boxes;
std::vector<std::vector<float>> rest;
int rest_start_pos = class_names_num + 4;
int rest_features = data_width - rest_start_pos;
// int data_width = rest_start_pos + total_features_num;
int rows = output0.rows;
float* pdata = (float*) output0.data;
for (int r = 0; r < rows; ++r) {
cv::Mat scores(1, class_names_num, CV_32FC1, pdata + 4);
cv::Point class_id;
double max_conf;
minMaxLoc(scores, nullptr, &max_conf, nullptr, &class_id);
if (max_conf > conf_threshold) {
class_ids.push_back(class_id.x);
confidences.push_back((float) max_conf);
float out_w = pdata[2];
float out_h = pdata[3];
float out_left = MAX((pdata[0] - 0.5 * out_w + 0.5), 0);
float out_top = MAX((pdata[1] - 0.5 * out_h + 0.5), 0);
cv::Rect_<float> bbox(out_left, out_top, (out_w + 0.5), (out_h + 0.5));
boxes.push_back(bbox);
if (rest_features > 0) {
std::vector<float> rest_data(pdata + rest_start_pos, pdata + data_width);
rest.push_back(rest_data);
}
}
pdata += data_width; // next prediction
}
//
//float masks_threshold = 0.50;
//int top_k = 500;
//const float& nmsde_eta = 1.0f;
std::vector<int> nms_result;
cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, iou_threshold, nms_result); // , nms_eta, top_k);
// cv::dnn::NMSBoxes(boxes, confidences, );
std::vector<int> nms_class_ids;
std::vector<float> nms_confidences;
// std::vector<cv::Rect_<float>> boxes;
std::vector<cv::Rect> nms_boxes;
std::vector<std::vector<float>> nms_rest;
for (int idx: nms_result) {
nms_class_ids.push_back(class_ids[idx]);
nms_confidences.push_back(confidences[idx]);
nms_boxes.push_back(boxes[idx]);
if (!rest.empty()) { // rest is empty when the model has no extra features (plain detect)
nms_rest.push_back(rest[idx]);
}
}
return std::make_tuple(nms_boxes, nms_confidences, nms_class_ids, nms_rest);
}