diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..21edb06 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,13 @@ +name: CI + +on: [push, pull_request] + +jobs: + tue-ci: + name: TUe CI - ${{ github.event_name }} + runs-on: ubuntu-latest + steps: + - name: TUe CI + uses: tue-robotics/tue-env/ci/main@master + with: + package: ${{ github.event.repository.name }} \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ddd4b43 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +build/ +images/* +onnxruntime*/ +onnxruntime/* +docker/* +CMakefile +CMakeCache.txt +CMakeFiles/* +cmake_install.cmake +Makefile +SPEED-SAM-C-TENSORRT/ +sam_inference/model/FastSAM-x.onnx +mask* +segmentation_results* diff --git a/CMakeLists.txt b/CMakeLists.txt index 8036a89..12ca294 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,59 +1,119 @@ -cmake_minimum_required(VERSION 3.5) +cmake_minimum_required(VERSION 3.14) -set(PROJECT_NAME SAMOnnxRuntimeCPPInference) -project(${PROJECT_NAME} VERSION 0.0.1 LANGUAGES CXX) +project(sam_onnx_ros) + +# -------------- CMake Policies ------------------# +#add_compile_options(-Wall -Werror=all) +#add_compile_options(-Wextra -Werror=extra) # -------------- Support C++17 for using filesystem ------------------# set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS ON) -set(CMAKE_INCLUDE_CURRENT_DIR ON) - +#set(CMAKE_INCLUDE_CURRENT_DIR ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # -------------- OpenCV ------------------# find_package(OpenCV REQUIRED) include_directories(${OpenCV_INCLUDE_DIRS}) + # -------------- ONNXRuntime ------------------# set(ONNXRUNTIME_VERSION 1.21.0) -set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../onnxruntime-linux-x64-gpu-1.21.1") +set(ONNXRUNTIME_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../hero_sam.bak/onnxruntime-linux-x64-gpu-1.21.1") include_directories(${ONNXRUNTIME_ROOT}/include) # -------------- Cuda ------------------# add_definitions(-DUSE_CUDA=1) include_directories(/usr/local/cuda/include) +find_package(catkin REQUIRED + COMPONENTS + # roscpp + # tue_config + # tue_filesystem + # code_profiler + + #onnxruntime_ros +) + +# ------------------------------------------------------------------------------------------------ +# CATKIN EXPORT +# ------------------------------------------------------------------------------------------------ + +catkin_package( + INCLUDE_DIRS include + #LIBRARIES ${PROJECT_NAME} + LIBRARIES sam_onnx_ros_core + CATKIN_DEPENDS + DEPENDS OpenCV +) + +# ------------------------------------------------------------------------------------------------ +# BUILD +# ------------------------------------------------------------------------------------------------ + +include_directories( + include + SYSTEM + ${OpenCV_INCLUDE_DIRS} + ${catkin_INCLUDE_DIRS} +) + set(PROJECT_SOURCES src/main.cpp src/sam_inference.cpp + src/segmentation.cpp src/utils.cpp ) -add_executable(${PROJECT_NAME} ${PROJECT_SOURCES}) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/inc) - -# Link OpenCV libraries along with ONNX Runtime -target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS} ${ONNXRUNTIME_ROOT}/lib/libonnxruntime.so) - -# For Windows system, copy onnxruntime.dll to the same folder of the executable file -if (WIN32) - add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different - "${ONNXRUNTIME_ROOT}/lib/onnxruntime.dll" - $) -endif () +# Build core library (no main.cpp 
here)
+add_library(sam_onnx_ros_core
+  src/sam_inference.cpp
+  src/segmentation.cpp
+  src/utils.cpp
+)
+target_link_libraries(sam_onnx_ros_core
+  ${OpenCV_LIBS}
+  ${catkin_LIBRARIES}
+  ${ONNXRUNTIME_ROOT}/lib/libonnxruntime.so
+)
+target_include_directories(sam_onnx_ros_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
 
-# Download https://raw.githubusercontent.com/ultralytics/ultralytics/main/ultralytics/cfg/datasets/coco.yaml
-# and put it in the same folder of the executable file
-configure_file(./model/SAM_mask_decoder.onnx ${CMAKE_CURRENT_BINARY_DIR}/SAM_mask_decoder.onnx COPYONLY)
+# Main executable links the core lib
+add_executable(${PROJECT_NAME} src/main.cpp)
+target_link_libraries(${PROJECT_NAME} sam_onnx_ros_core)
 
-# Copy yolov8n.onnx file to the same folder of the executable file
-configure_file(./model/SAM_encoder.onnx ${CMAKE_CURRENT_BINARY_DIR}/SAM_encoder.onnx COPYONLY)
+# Copy the SAM .onnx model files to the same folder as the executable
+configure_file(~/Documents/repos/hero_sam.bak/sam_inference/model/SAM_mask_decoder.onnx ${CMAKE_CURRENT_BINARY_DIR}/SAM_mask_decoder.onnx COPYONLY)
+configure_file(~/Documents/repos/hero_sam.bak/sam_inference/model/SAM_encoder.onnx ${CMAKE_CURRENT_BINARY_DIR}/SAM_encoder.onnx COPYONLY)
 
 # Create folder name images in the same folder of the executable file
 add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/images
 )
 
+# Enable testing
+if (CATKIN_ENABLE_TESTING)
+#  find_package(catkin_lint_cmake REQUIRED)
+#  catkin_add_catkin_lint_test("-W2 --ignore HEADER_OUTSIDE_PACKAGE_INCLUDE_PATH")
+
+  # Utils unit tests (no models needed)
+  catkin_add_gtest(utils_tests test/test_utils.cpp)
+  if(TARGET utils_tests)
+    target_link_libraries(utils_tests sam_onnx_ros_core GTest::gtest_main ${catkin_LIBRARIES})
+    target_include_directories(utils_tests PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+  endif()
+
+  # SAM integration-ish tests (may need models)
+  catkin_add_gtest(sam_tests test/sam_test.cpp)
+  if(TARGET sam_tests)
+    target_link_libraries(sam_tests sam_onnx_ros_core GTest::gtest_main ${catkin_LIBRARIES})
+    target_include_directories(sam_tests PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
+  endif()
+endif()
+
+
+
 #If you want to debug
-set(CMAKE_BUILD_TYPE Debug)
-set(CMAKE_CXX_FLAGS_DEBUG "-g")
diff --git a/LICENSE b/LICENSE
index 7121e4a..c640c63 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,22 +1,25 @@
-Custom License Agreement
+BSD 2-Clause License
 
-1. License Grant You are hereby granted a non-exclusive, non-transferable license to use, reproduce, and distribute the code (hereinafter referred to as "the Software") under the following conditions:
+Copyright (c) 2021, Eindhoven University of Technology - CST Robotics Group
+All rights reserved.
 
-2. Conditions of Use
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
 
-Non-Commercial Use: You may use the Software for personal, educational, or non-commercial purposes without any additional permissions.
-Commercial Use: Any commercial use of the Software, including but not limited to selling, licensing, or using it in a commercial product, requires prior written permission from the original developer.
-3. Contact Requirement
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
-If you wish to use the Software for commercial purposes, you must contact the original developer at [https://www.linkedin.com/in/hamdi-boukamcha/] to obtain a commercial license. -The terms of any commercial license will be mutually agreed upon and may involve a licensing fee. -4. Attribution +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. -Regardless of whether you are using the Software for commercial or non-commercial purposes, you must provide appropriate credit to the original developer in any distributions or products that use the Software. -5. Disclaimer of Warranty - -The Software is provided "as is," without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose, and non-infringement. In no event shall the original developer be liable for any claim, damages, or other liability, whether in an action of contract, tort, or otherwise, arising from, out of, or in connection with the Software or the use or other dealings in the Software. -6. Governing Law - -This License Agreement shall be governed by and construed in accordance with the laws of France. -By using the Software, you agree to abide by the terms outlined in this License Agreement. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 0efca14..4692d73 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,8 @@ A high-performance C++ implementation for SAM (segment anything model) using Ten Dynamic Shape Support: Efficient handling of variable input sizes using optimization profiles. CUDA Optimization: Leverage CUDA for preprocessing and efficient memory handling. 
-## 📢 Performance
-  ### Infernce Time
+## 📢 Performance
+ ### Inference Time
 
 | Component                  | SpeedSAM  |
 |----------------------------|-----------|
@@ -51,7 +51,7 @@ A high-performance C++ implementation for SAM (segment anything model) using Ten
 │   ├── main.cpp                # Main entry point
 │   └── speedSam.cpp            # Implementation of the SpeedSam class
 └── CMakeLists.txt              # CMake configuration
-
+
 # 🚀 Installation
 ## Prerequisites
     git clone https://github.com/hamdiboukamcha/SPEED-SAM-C-TENSORRT.git
@@ -94,8 +94,3 @@ If you use this code in your research, please cite the repository as follows:
         publisher = {GitHub},
         howpublished = {\url{https://github.com/hamdiboukamcha/SPEED-SAM-C-TENSORRT//}},
     }
-
-
-
-
-
diff --git a/inc/dl_types.h b/inc/dl_types.h
deleted file mode 100644
index 54bd60f..0000000
--- a/inc/dl_types.h
+++ /dev/null
@@ -1,68 +0,0 @@
-#pragma once
-namespace SEG
-{
-enum MODEL_TYPE
-{
-    //FLOAT32 MODEL
-    SAM_SEGMENT_ENCODER = 1,
-    SAM_SEGMENT_DECODER = 2,
-    //YOLO_CLS = 3,
-
-    //FLOAT16 MODEL
-    //YOLO_DETECT_V8_HALF = 4,
-    //YOLO_POSE_V8_HALF = 5,
-    //YOLO_CLS_HALF = 6
-};
-
-
-typedef struct _DL_INIT_PARAM
-{
-    // Yolo & Common Part
-    std::string modelPath;
-    MODEL_TYPE modelType = SAM_SEGMENT_ENCODER;
-    std::vector<int> imgSize = { 640, 640 };
-    float rectConfidenceThreshold = 0.6;
-    float iouThreshold = 0.5;
-    int keyPointsNum = 2; //Note:kpt number for pose
-    bool cudaEnable = false;
-    int logSeverityLevel = 3;
-    int intraOpNumThreads = 1;
-    //std::vector<cv::Rect> boxes; // For SAM encoder model, this will be filled with detected boxes
-
-    friend std::ostream& operator<<(std::ostream& os, _DL_INIT_PARAM& param)
-    {
-        os << "modelPath: " << param.modelPath << "\n";
-        os << "modelType: " << param.modelType << "\n";
-        os << "imgSize: ";
-        for (const auto& size : param.imgSize)
-            os << size << " ";
-        os << "\n";
-        os << "rectConfidenceThreshold: " << param.rectConfidenceThreshold << "\n";
-        os << "iouThreshold: " << param.iouThreshold << "\n";
-        os << "keyPointsNum: " << param.keyPointsNum << "\n";
-        os << "cudaEnable: " << (param.cudaEnable ? "true" : "false") << "\n";
-        os << "logSeverityLevel: " << param.logSeverityLevel << "\n";
-        os << "intraOpNumThreads: " << param.intraOpNumThreads;
-        return os;
-    }
-
-} DL_INIT_PARAM;
-
-
-typedef struct _DL_RESULT
-{
-
-    //Yolo Part
-    int classId;
-    float confidence;
-    std::vector<cv::Rect> boxes; // For SAM encoder model, this will be filled with detected boxes
-    std::vector<cv::Point2f> keyPoints;
-
-    // Sam Part
-    std::vector<float> embeddings;
-    // Masks for SAM decoder model output
-    std::vector<cv::Mat> masks; // Each cv::Mat represents a mask
-
-
-} DL_RESULT;
-} // namespace SEG
\ No newline at end of file
diff --git a/inc/sam_inference.h b/inc/sam_inference.h
deleted file mode 100644
index 8a15c38..0000000
--- a/inc/sam_inference.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#pragma once
-
-#define RET_OK nullptr
-
-#ifdef _WIN32
-#include <Windows.h>
-#include <direct.h>
-#include <io.h>
-#endif
-
-#include <string>
-#include <vector>
-#include <cstdio>
-#include <opencv2/opencv.hpp>
-#include "onnxruntime_cxx_api.h"
-#include "utils.h"
-#ifdef USE_CUDA
-#include <cuda_fp16.h>
-#endif
-
-
-
-
-
-class SAM
-{
-public:
-    SAM();
-
-    ~SAM();
-
-public:
-
-    const char* CreateSession(SEG::DL_INIT_PARAM& iParams);
-
-    const char* RunSession(const cv::Mat& iImg, std::vector<SEG::DL_RESULT>& oResult, SEG::MODEL_TYPE modelType, SEG::DL_RESULT& result);
-
-    char* WarmUpSession(SEG::MODEL_TYPE modelType);
-
-    template<typename N>
-    char* TensorProcess(clock_t& starttime_1, const cv::Mat& iImg, N& blob, std::vector<int64_t>& inputNodeDims,
-        SEG::MODEL_TYPE modelType, std::vector<SEG::DL_RESULT>& oResult, Utils& utilities, SEG::DL_RESULT& result);
-
-
-
-    std::vector<std::string> classes{};
-
-private:
-    Ort::Env env;
-    std::unique_ptr<Ort::Session> session;
-    bool cudaEnable;
-    Ort::RunOptions options;
-    std::vector<const char*> inputNodeNames;
-    std::vector<const char*> outputNodeNames;
-
-    SEG::MODEL_TYPE modelType;
-    std::vector<int> imgSize;
-    float rectConfidenceThreshold;
-    float iouThreshold;
-
-};
\ No newline at end of file
diff --git a/inc/utils.h b/inc/utils.h
deleted file mode 100644
index 0e7a8d7..0000000
--- a/inc/utils.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#pragma once
-
-#define RET_OK nullptr
-
-#ifdef _WIN32
-#include <Windows.h>
-#include <direct.h>
-#include <io.h>
-#endif
-
-#include <string>
-#include <vector>
-#include <cstdio>
-#include <opencv2/opencv.hpp>
-#include "onnxruntime_cxx_api.h"
-#include "dl_types.h"
-#ifdef USE_CUDA
-#include <cuda_fp16.h>
-#endif
-
-class Utils
-{
-  public:
-    Utils();
-    ~Utils();
-
-    void overlay(std::vector<Ort::Value>& output_tensors, const cv::Mat& iImg, std::vector<int> iImgSize, SEG::DL_RESULT& result);
-    char* PreProcess(const cv::Mat& iImg, std::vector<int> iImgSize, cv::Mat& oImg);
-    void ScaleBboxPoints(const cv::Mat& iImg, std::vector<int> iImgSize, std::vector<float>& pointCoords, std::vector<float>& PointsCoordsScaled);
-
-    std::vector<Ort::Value> PrepareInputTensor(Ort::Value& decoderInputTensor, std::vector<float>& pointCoordsScaled, std::vector<int64_t> pointCoordsDims,
-        std::vector<float>& pointLabels, std::vector<int64_t> pointLabelsDims, std::vector<float>& maskInput,
-        std::vector<int64_t> maskInputDims, std::vector<float>& hasMaskInput, std::vector<int64_t> hasMaskInputDims);
-
-    // Definition: Flattened image to blob (and normalizaed) for deep learning inference. Also reorganize from HWC to CHW.
-    // Note: Code in header file since it is used outside of this utils src code.
-    template<typename T>
-    char* BlobFromImage(const cv::Mat& iImg, T& iBlob) {
-        int channels = iImg.channels();
-        int imgHeight = iImg.rows;
-        int imgWidth = iImg.cols;
-
-        for (int c = 0; c < channels; c++)
-        {
-            for (int h = 0; h < imgHeight; h++)
-            {
-                for (int w = 0; w < imgWidth; w++)
-                {
-                    iBlob[c * imgWidth * imgHeight + h * imgWidth + w] = typename std::remove_pointer<T>::type(
-                        (iImg.at<cv::Vec3b>(h, w)[c]) / 255.0f);
-                }
-            }
-        }
-        return RET_OK;
-    }
-  private:
-    float resizeScales;
-    float resizeScalesBbox; //letterbox scale
-
-
-  };
diff --git a/include/dl_types.h b/include/dl_types.h
new file mode 100644
index 0000000..033df56
--- /dev/null
+++ b/include/dl_types.h
@@ -0,0 +1,71 @@
+#ifndef DL_TYPES_H
+#define DL_TYPES_H
+
+#include <string>
+#include <vector>
+#include <iostream>
+#include <opencv2/opencv.hpp>
+
+namespace SEG
+{
+    enum MODEL_TYPE
+    {
+        // FLOAT32 MODEL
+        SAM_SEGMENT_ENCODER = 1,
+        SAM_SEGMENT_DECODER = 2,
+        // YOLO_CLS = 3,
+
+        // FLOAT16 MODEL
+        // YOLO_DETECT_V8_HALF = 4,
+        // YOLO_POSE_V8_HALF = 5,
+        // YOLO_CLS_HALF = 6
+    };
+
+    typedef struct _DL_INIT_PARAM
+    {
+        // Yolo & Common Part
+        std::string modelPath;
+        MODEL_TYPE modelType = SAM_SEGMENT_ENCODER;
+        std::vector<int> imgSize = {640, 640};
+        float rectConfidenceThreshold = 0.6;
+        float iouThreshold = 0.5;
+        int keyPointsNum = 2; // Note:kpt number for pose
+        bool cudaEnable = false;
+        int logSeverityLevel = 3;
+        int intraOpNumThreads = 1;
+        // std::vector<cv::Rect> boxes; // For SAM encoder model, this will be filled with detected boxes
+
+        // Overloaded output operator for _DL_INIT_PARAM to print its contents
+        friend std::ostream &operator<<(std::ostream &os, const _DL_INIT_PARAM &param)
+        {
+            os << "modelPath: " << param.modelPath << "\n";
+            os << "modelType: " << param.modelType << "\n";
+            os << "imgSize: ";
+            for (const auto &size : param.imgSize)
+                os << size << " ";
+            os << "\n";
+            os << "rectConfidenceThreshold: " << param.rectConfidenceThreshold << "\n";
+            os << "iouThreshold: " << param.iouThreshold << "\n";
+            os << "keyPointsNum: " << param.keyPointsNum << "\n";
+            os << "cudaEnable: " << (param.cudaEnable ? "true" : "false") << "\n";
+            os << "logSeverityLevel: " << param.logSeverityLevel << "\n";
+            os << "intraOpNumThreads: " << param.intraOpNumThreads;
+            return os;
+        }
+
+    } DL_INIT_PARAM;
+
+    typedef struct _DL_RESULT
+    {
+
+        std::vector<cv::Rect> boxes; // For SAM encoder model, this will be filled with detected boxes
+        std::vector<cv::Point2f> keyPoints;
+
+        // Sam Part
+        std::vector<float> embeddings;
+        // Masks for SAM decoder model output
+        std::vector<cv::Mat> masks; // Each cv::Mat represents a mask
+
+    } DL_RESULT;
+} // namespace SEG
+#endif // DL_TYPES_H
diff --git a/include/sam_inference.h b/include/sam_inference.h
new file mode 100644
index 0000000..6b4713e
--- /dev/null
+++ b/include/sam_inference.h
@@ -0,0 +1,49 @@
+#ifndef SAMINFERENCE_H
+#define SAMINFERENCE_H
+
+
+#define RET_OK nullptr
+#include <memory>
+#include <string>
+#include <vector>
+#include <opencv2/opencv.hpp>
+#include "utils.h"
+#ifdef USE_CUDA
+#include <cuda_fp16.h>
+#endif
+
+class SAM
+{
+public:
+    SAM();
+
+    ~SAM();
+
+public:
+    const char *CreateSession(SEG::DL_INIT_PARAM &iParams);
+
+    const char *RunSession(const cv::Mat &iImg, std::vector<SEG::DL_RESULT> &oResult, SEG::MODEL_TYPE modelType, SEG::DL_RESULT &result);
+
+    char *WarmUpSession(SEG::MODEL_TYPE modelType);
+
+    template<typename N>
+    const char *TensorProcess(clock_t &starttime_1, const cv::Mat &iImg, N &blob, std::vector<int64_t> &inputNodeDims,
+                              SEG::MODEL_TYPE modelType, std::vector<SEG::DL_RESULT> &oResult, Utils &utilities, SEG::DL_RESULT &result);
+
+    std::vector<std::string> classes{};
+
+private:
+    Ort::Env _env;
+    std::unique_ptr<Ort::Session> _session;
+    bool _cudaEnable;
+    Ort::RunOptions _options;
+    std::vector<const char *> _inputNodeNames;
+    std::vector<const char *> _outputNodeNames;
+
+    SEG::MODEL_TYPE _modelType;
+    std::vector<int> _imgSize;
+    float _rectConfidenceThreshold;
+    float _iouThreshold;
+};
+
+#endif // SAMINFERENCE_H
diff --git a/include/segmentation.h b/include/segmentation.h
new file mode 100644
index 0000000..83102e0
--- /dev/null
+++ b/include/segmentation.h
@@ -0,0 +1,12 @@
+#ifndef SEGMENTATION_H
+#define SEGMENTATION_H
+
+#include <tuple>
+
+#include "sam_inference.h"
+std::tuple<std::vector<std::unique_ptr<SAM>>, SEG::_DL_INIT_PARAM, SEG::_DL_INIT_PARAM, SEG::DL_RESULT, std::vector<SEG::DL_RESULT>> Initializer();
+void SegmentAnything(std::vector<std::unique_ptr<SAM>> &samSegmentors, const SEG::_DL_INIT_PARAM &params_encoder, const SEG::_DL_INIT_PARAM &params_decoder, const cv::Mat &img,
+                     std::vector<SEG::DL_RESULT> &resSam,
+                     SEG::DL_RESULT &res);
+
+#endif // SEGMENTATION_H
diff --git a/include/utils.h b/include/utils.h
new file mode 100644
index 0000000..a471512
--- /dev/null
+++ b/include/utils.h
@@ -0,0 +1,58 @@
+#ifndef UTILS_H
+#define UTILS_H
+
+#define RET_OK nullptr
+
+#include <iostream>
+#include <vector>
+#include <opencv2/opencv.hpp>
+#include "onnxruntime_cxx_api.h"
+#include "dl_types.h"
+#ifdef USE_CUDA
+#include <cuda_fp16.h>
+#endif
+
+class Utils
+{
+public:
+    Utils();
+    ~Utils();
+
+    char *PreProcess(const cv::Mat &iImg, std::vector<int> iImgSize, cv::Mat &oImg);
+    void ScaleBboxPoints(const cv::Mat &iImg, std::vector<int> iImgSize, std::vector<float> &pointCoords, std::vector<float> &PointsCoordsScaled);
+
+    std::vector<Ort::Value> PrepareInputTensor(Ort::Value &decoderInputTensor, std::vector<float> &pointCoordsScaled, std::vector<int64_t> pointCoordsDims,
+                                               std::vector<float> &pointLabels, std::vector<int64_t> pointLabelsDims, std::vector<float> &maskInput,
+                                               std::vector<int64_t> maskInputDims, std::vector<float> &hasMaskInput, std::vector<int64_t> hasMaskInputDims);
+
+    void PostProcess(std::vector<Ort::Value> &output_tensors, const cv::Mat &iImg, std::vector<int> iImgSize, SEG::DL_RESULT &result);
+
+    // Definition: Flattened image to blob (and normalized) for deep learning inference. Also reorganize from HWC to CHW.
+    // Note: Code in header file since it is used outside of this utils src code.
+    template<typename T>
+    char *BlobFromImage(const cv::Mat &iImg, T &iBlob)
+    {
+        int channels = iImg.channels();
+        int imgHeight = iImg.rows;
+        int imgWidth = iImg.cols;
+
+        for (int c = 0; c < channels; c++)
+        {
+            for (int h = 0; h < imgHeight; h++)
+            {
+                for (int w = 0; w < imgWidth; w++)
+                {
+                    iBlob[c * imgWidth * imgHeight + h * imgWidth + w] = typename std::remove_pointer<T>::type(
+                        (iImg.at<cv::Vec3b>(h, w)[c]) / 255.0f);
+                }
+            }
+        }
+        return RET_OK;
+    }
+
+private:
+    float _resizeScales;
+    float _resizeScalesBbox; // letterbox scale
+};
+
+#endif // UTILS_H
diff --git a/package.xml b/package.xml
new file mode 100644
index 0000000..7d17b8d
--- /dev/null
+++ b/package.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0"?>
+<package format="2">
+
+  <name>sam_onnx_ros</name>
+  <version>0.0.0</version>
+  <description>Segment Anything Model (SAM) segmentation</description>
+
+  <maintainer>Iason Theodorou</maintainer>
+  <author>Iason Theodorou</author>
+
+  <license>BSD</license>
+
+  <buildtool_depend>catkin</buildtool_depend>
+
+  <build_depend>libopencv-dev</build_depend>
+  <build_depend>onnxruntime_ros</build_depend>
+
+  <exec_depend>libopencv-dev</exec_depend>
+  <exec_depend>onnxruntime_ros</exec_depend>
+
+  <test_depend>catkin_lint_cmake</test_depend>
+
+  <doc_depend>doxygen</doc_depend>
+
+  <export>
+  </export>
+
+</package>
diff --git a/src/main.cpp b/src/main.cpp
index 7481567..cd0f9dd 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,67 +1,30 @@
+#include "segmentation.h"
 #include
-#include
-#include "sam_inference.h"
+#include
 #include
-#include
-#include
-
-
-void SegmentAnything() {
-
-    SAM* samSegmentor = new SAM;
-    SEG::DL_INIT_PARAM params;
-    SEG::DL_INIT_PARAM params1;
-
-    params.rectConfidenceThreshold = 0.1;
-    params.iouThreshold = 0.5;
-    params.modelPath = "SAM_encoder.onnx";
-    params.imgSize = { 1024, 1024 };
-
-    params1 = params;
-    params1.modelType = SEG::SAM_SEGMENT_DECODER;
-    params1.modelPath = "SAM_mask_decoder.onnx";
-
-
-    #ifdef USE_CUDA
-    params.cudaEnable = true;
-    #else
-    params.cudaEnable = false;
-    #endif
-
-
-
-    //Running inference
-    std::filesystem::path current_path = std::filesystem::current_path();
-    std::filesystem::path imgs_path = current_path / "../../pipeline/build/images";
+int main()
+{
+    // Running inference
+    std::vector<std::unique_ptr<SAM>> samSegmentors;
+    SEG::DL_INIT_PARAM params_encoder;
+    SEG::DL_INIT_PARAM params_decoder;
     std::vector<SEG::DL_RESULT> resSam;
-    for (auto& i : std::filesystem::directory_iterator(imgs_path))
+    SEG::DL_RESULT res;
+    std::tie(samSegmentors, params_encoder, params_decoder, res, resSam) = Initializer();
+    std::filesystem::path current_path = std::filesystem::current_path();
+    std::filesystem::path imgs_path = "/home/amigo/Documents/repos/hero_sam.bak/sam_inference/build/images"; // current_path / <- you could use
+    for (auto &i : std::filesystem::directory_iterator(imgs_path))
     {
        if (i.path().extension() == ".jpg" || i.path().extension() == ".png" || i.path().extension() == ".jpeg")
        {
            std::string img_path = i.path().string();
            cv::Mat img = cv::imread(img_path);
-           SEG::DL_RESULT res;
-           samSegmentor->CreateSession(params);
-           SEG::MODEL_TYPE modelTypeRef = params.modelType;
-           samSegmentor->RunSession(img, resSam, modelTypeRef, res);
-
+           SegmentAnything(samSegmentors, params_encoder, params_decoder, img, resSam, res);
-
-
-
-           samSegmentor->CreateSession(params1);
-           modelTypeRef = params1.modelType;
-           samSegmentor->RunSession(img, resSam, modelTypeRef, res);
-           std::cout << "Press any key to exit" << std::endl;
-           cv::imshow("Result of Detection", img);
-           cv::waitKey(0);
-           cv::destroyAllWindows();
        }
    }
+    return 0;
 }
-
-int main()
-{
-    SegmentAnything();
-}
\ No newline at end of file
diff --git a/src/sam_inference.cpp b/src/sam_inference.cpp
index 3820338..3ae5677 100644
--- a/src/sam_inference.cpp
+++ b/src/sam_inference.cpp
@@ -1,508 +1,427 @@
 #include "sam_inference.h"
 #include "utils.h"
 #include <regex>
-#include
 
 #define benchmark
 //#define ROI
-
-// #define min(a,b)
(((a) < (b)) ? (a) : (b)) - -SAM::SAM() { - -} +SAM::SAM() {} SAM::~SAM() { - // Clean up input/output node names - for (auto& name : inputNodeNames) { - delete[] name; - } - for (auto& name : outputNodeNames) { - delete[] name; - } + // Clean up input/output node names + for (auto &name : _inputNodeNames) { + delete[] name; + } + for (auto &name : _outputNodeNames) { + delete[] name; + } } #ifdef USE_CUDA -namespace Ort -{ - template<> - struct TypeToTensorType { static constexpr ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; }; -} +namespace Ort { +template <> struct TypeToTensorType { + static constexpr ONNXTensorElementDataType type = + ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; +}; +} // namespace Ort #endif +const char *SAM::CreateSession(SEG::DL_INIT_PARAM &iParams) { + const char *Ret = RET_OK; + if (_session) { + _session.reset(); // Release previous _session -const char* SAM::CreateSession(SEG::DL_INIT_PARAM& iParams) { - const char* Ret = RET_OK; - if (session) { - session.reset(); // Release previous session - - // Clear node names - for (auto& name : inputNodeNames) { - delete[] name; - } - inputNodeNames.clear(); + // Clear node names + for (auto &name : _inputNodeNames) { + delete[] name; + } + _inputNodeNames.clear(); - for (auto& name : outputNodeNames) { - delete[] name; - } - outputNodeNames.clear(); + for (auto &name : _outputNodeNames) { + delete[] name; } - std::regex pattern("[\u4e00-\u9fa5]"); - bool result = std::regex_search(iParams.modelPath, pattern); - if (result) - { - Ret = "[SAM]:Your model path is error.Change your model path without chinese characters."; - std::cout << Ret << std::endl; - return Ret; + _outputNodeNames.clear(); + } + std::regex pattern("[\u4e00-\u9fa5]"); + bool result = std::regex_search(iParams.modelPath, pattern); + if (result) { + Ret = "[SAM]:Your model path is error.Change your model path without " + "chinese characters."; + std::cout << Ret << std::endl; + return Ret; + } + try { + _rectConfidenceThreshold = iParams.rectConfidenceThreshold; + _iouThreshold = iParams.iouThreshold; + _imgSize = iParams.imgSize; + _modelType = iParams.modelType; + _cudaEnable = iParams.cudaEnable; + _env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "Sam"); + Ort::SessionOptions _sessionOption; + if (iParams.cudaEnable) { + OrtCUDAProviderOptions cudaOption; + cudaOption.device_id = 0; + _sessionOption.AppendExecutionProvider_CUDA(cudaOption); } - try - { - rectConfidenceThreshold = iParams.rectConfidenceThreshold; - iouThreshold = iParams.iouThreshold; - imgSize = iParams.imgSize; - modelType = iParams.modelType; - cudaEnable = iParams.cudaEnable; - env = Ort::Env(ORT_LOGGING_LEVEL_WARNING, "Sam"); - Ort::SessionOptions sessionOption; - if (iParams.cudaEnable) - { - OrtCUDAProviderOptions cudaOption; - cudaOption.device_id = 0; - sessionOption.AppendExecutionProvider_CUDA(cudaOption); - } - - //OrtTensorRTProviderOptions trtOptions{}; - //trtOptions.device_id = 0; - //trtOptions.trt_fp16_enable = true; - //sessionOption.AppendExecutionProvider_TensorRT(trtOptions); - - sessionOption.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_ALL); - sessionOption.SetIntraOpNumThreads(iParams.intraOpNumThreads); - sessionOption.SetLogSeverityLevel(iParams.logSeverityLevel); - -#ifdef _WIN32 - int ModelPathSize = MultiByteToWideChar(CP_UTF8, 0, iParams.modelPath.c_str(), static_cast(iParams.modelPath.length()), nullptr, 0); - wchar_t* wide_cstr = new wchar_t[ModelPathSize + 1]; - MultiByteToWideChar(CP_UTF8, 0, iParams.modelPath.c_str(), 
static_cast(iParams.modelPath.length()), wide_cstr, ModelPathSize); - wide_cstr[ModelPathSize] = L'\0'; - const wchar_t* modelPath = wide_cstr; -#else - const char* modelPath = iParams.modelPath.c_str(); -#endif // _WIN32 - - //session = new Ort::Session(env, modelPath, sessionOption); - session = std::make_unique(env, modelPath, sessionOption); - Ort::AllocatorWithDefaultOptions allocator; - size_t inputNodesNum = session->GetInputCount(); - for (size_t i = 0; i < inputNodesNum; i++) - { - Ort::AllocatedStringPtr input_node_name = session->GetInputNameAllocated(i, allocator); - char* temp_buf = new char[50]; - strcpy(temp_buf, input_node_name.get()); - inputNodeNames.push_back(temp_buf); - } - size_t OutputNodesNum = session->GetOutputCount(); - for (size_t i = 0; i < OutputNodesNum; i++) - { - Ort::AllocatedStringPtr output_node_name = session->GetOutputNameAllocated(i, allocator); - char* temp_buf = new char[10]; - strcpy(temp_buf, output_node_name.get()); - outputNodeNames.push_back(temp_buf); - } - options = Ort::RunOptions{ nullptr }; - - //std::vector input_shape; - //std::vector output_shape; - //size_t input_tensor_size = 0; - //size_t output_tensor_size = 0; - //Get input and output tensor size - - //auto input_tensor_size = session->GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetElementCount(); - //auto output_tensor_size = session->GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetElementCount(); - auto input_shape = session->GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); - auto output_shape = session->GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); - auto output_type = session->GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetElementType(); - - WarmUpSession(modelType); - return RET_OK; + + _sessionOption.SetGraphOptimizationLevel( + GraphOptimizationLevel::ORT_ENABLE_ALL); + _sessionOption.SetIntraOpNumThreads(iParams.intraOpNumThreads); + _sessionOption.SetLogSeverityLevel(iParams.logSeverityLevel); + + const char *modelPath = iParams.modelPath.c_str(); + + _session = std::make_unique(_env, modelPath, _sessionOption); + Ort::AllocatorWithDefaultOptions allocator; + size_t inputNodesNum = _session->GetInputCount(); + for (size_t i = 0; i < inputNodesNum; i++) { + Ort::AllocatedStringPtr input_node_name = + _session->GetInputNameAllocated(i, allocator); + char *temp_buf = new char[50]; + strcpy(temp_buf, input_node_name.get()); + _inputNodeNames.push_back(temp_buf); } - catch (const std::exception& e) - { - const char* str1 = "[SAM]:"; - const char* str2 = e.what(); - std::string result = std::string(str1) + std::string(str2); - char* merged = new char[result.length() + 1]; - std::strcpy(merged, result.c_str()); - std::cout << merged << std::endl; - delete[] merged; - return "[SAM]:Create session failed."; + size_t OutputNodesNum = _session->GetOutputCount(); + for (size_t i = 0; i < OutputNodesNum; i++) { + Ort::AllocatedStringPtr output_node_name = + _session->GetOutputNameAllocated(i, allocator); + char *temp_buf = new char[50]; + strcpy(temp_buf, output_node_name.get()); + _outputNodeNames.push_back(temp_buf); } + _options = Ort::RunOptions{nullptr}; + auto input_shape = + _session->GetInputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); + + auto output_shape = + _session->GetOutputTypeInfo(0).GetTensorTypeAndShapeInfo().GetShape(); + auto output_type = _session->GetOutputTypeInfo(0) + .GetTensorTypeAndShapeInfo() + .GetElementType(); + + WarmUpSession(_modelType); + return RET_OK; + } catch (const std::exception &e) { + const char 
*str1 = "[SAM]:"; + const char *str2 = e.what(); + std::string str_result = std::string(str1) + std::string(str2); + char *merged = new char[str_result.length() + 1]; + std::strcpy(merged, str_result.c_str()); + std::cout << merged << std::endl; + delete[] merged; + return "[SAM]:Create _session failed."; + } } -const char* SAM::RunSession(const cv::Mat& iImg, std::vector& oResult, SEG::MODEL_TYPE modelType, SEG::DL_RESULT& result) { - #ifdef benchmark - clock_t starttime_1 = clock(); - #endif // benchmark - Utils utilities; - const char* Ret = RET_OK; - cv::Mat processedImg; - utilities.PreProcess(iImg, imgSize, processedImg); - if (modelType < 4) - { - float* blob = new float[processedImg.total() * 3]; - utilities.BlobFromImage(processedImg, blob); - std::vector inputNodeDims; - if (modelType == SEG::SAM_SEGMENT_ENCODER) - { - inputNodeDims = { 1, 3, imgSize.at(0), imgSize.at(1) }; - } - else if (modelType == SEG::SAM_SEGMENT_DECODER) - { - // For SAM decoder model, the input size is different - // Assuming the input size is 236x64x64 for the decoder - // You can adjust this based on your actual model requirements - // For example, if the input size is 1x3x236x64, you can set it as follows: - // inputNodeDims = { 1, 3, 236, 64 }; - // But here we are using 1x236x64x64 as per your original code - - inputNodeDims = { 1, 256, 64, 64 }; - } - TensorProcess(starttime_1, iImg, blob, inputNodeDims, modelType, oResult, utilities, result); - } - else - { - #ifdef USE_CUDA - half* blob = new half[processedImg.total() * 3]; - utilities.BlobFromImage(processedImg, blob); - std::vector inputNodeDims = { 1,3,imgSize.at(0),imgSize.at(1) }; - TensorProcess(starttime_1, iImg, blob, inputNodeDims, modelType, oResult, utilities, result); - #endif - } - - return Ret; - } +const char *SAM::RunSession(const cv::Mat &iImg, + std::vector &oResult, + SEG::MODEL_TYPE _modelType, SEG::DL_RESULT &result) { +#ifdef benchmark + clock_t starttime_1 = clock(); +#endif // benchmark + Utils utilities; + const char *Ret = RET_OK; + cv::Mat processedImg; + utilities.PreProcess(iImg, _imgSize, processedImg); + float *blob = new float[processedImg.total() * 3]; + utilities.BlobFromImage(processedImg, blob); + std::vector inputNodeDims; + if (_modelType == SEG::SAM_SEGMENT_ENCODER) { + // NCHW: H = imgSize[1], W = imgSize[0] + inputNodeDims = {1, 3, _imgSize.at(1), _imgSize.at(0)}; + } else if (_modelType == SEG::SAM_SEGMENT_DECODER) { + inputNodeDims = {1, 256, 64, 64}; + } + TensorProcess(starttime_1, iImg, blob, inputNodeDims, _modelType, oResult, + utilities, result); + + return Ret; +} + +template +const char *SAM::TensorProcess(clock_t &starttime_1, const cv::Mat &iImg, + N &blob, std::vector &inputNodeDims, + SEG::MODEL_TYPE _modelType, + std::vector &oResult, + Utils &utilities, SEG::DL_RESULT &result) { - template - char* SAM::TensorProcess(clock_t& starttime_1, const cv::Mat& iImg, N& blob, std::vector& inputNodeDims, - SEG::MODEL_TYPE modelType, std::vector& oResult, Utils& utilities, SEG::DL_RESULT& result) { - - switch (modelType) - { - case SEG::SAM_SEGMENT_ENCODER: - // case OTHER_SAM_MODEL: - { - - Ort::Value inputTensor = Ort::Value::CreateTensor::type>( - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), blob, 3 * imgSize.at(0) * imgSize.at(1), - inputNodeDims.data(), inputNodeDims.size()); - #ifdef benchmark - clock_t starttime_2 = clock(); - #endif // benchmark - auto outputTensor = session->Run(options, inputNodeNames.data(), &inputTensor, 1, outputNodeNames.data(), - outputNodeNames.size()); 
- #ifdef benchmark - clock_t starttime_3 = clock(); - #endif // benchmark - - Ort::TypeInfo typeInfo = outputTensor.front().GetTypeInfo(); - auto tensor_info = typeInfo.GetTensorTypeAndShapeInfo(); - std::vector outputNodeDims = tensor_info.GetShape(); - auto output = outputTensor.front().GetTensorMutableData::type>(); - //std::vector outputNodeDims = outputTensor.front().GetTensorTypeAndShapeInfo().GetShape(); - delete[] blob; - - int embeddingSize = outputNodeDims[1] * outputNodeDims[2] * outputNodeDims[3]; // Flattened size - result.embeddings.assign(output, output + embeddingSize); // Save embeddings - - - #ifdef benchmark - clock_t starttime_4 = clock(); - double pre_process_time = (double)(starttime_2 - starttime_1) / CLOCKS_PER_SEC * 1000; - double process_time = (double)(starttime_3 - starttime_2) / CLOCKS_PER_SEC * 1000; - double post_process_time = (double)(starttime_4 - starttime_3) / CLOCKS_PER_SEC * 1000; - if (cudaEnable) - { - std::cout << "[SAM(CUDA)]: " << pre_process_time << "ms pre-process, " << process_time << "ms inference, " << post_process_time << "ms post-process." << std::endl; - } - else - { - std::cout << "[SAM(CPU)]: " << pre_process_time << "ms pre-process, " << process_time << "ms inference, " << post_process_time << "ms post-process." << std::endl; - } - #endif // benchmark - - break; - } - case SEG::SAM_SEGMENT_DECODER: - //case : - { - // Use embeddings from the last result - std::vector embeddings = result.embeddings; - // Create tensor for decoder - std::vector decoderInputDims = { 1, 256, 64, 64 }; // Adjust based on your decoder's requirements - - - // Create point coordinates and labels - #ifdef ROI - - // Create a window for user interaction - namedWindow("Select and View Result", cv::WINDOW_AUTOSIZE); - - // Let the user select the bounding box - cv::Rect bbox = selectROI("Select and View Result", iImg, false, false); - - // Check if a valid bounding box was selected - if (bbox.width == 0 || bbox.height == 0) - { - std::cerr << "No valid bounding box selected." 
<< std::endl; - return "[SAM]: NO valid Box."; - } - - //cv::Rect bbox1(138, 29, 170, 301); - - std::vector boundingBoxes; - boundingBoxes.push_back(bbox); - #endif // ROI - //boundingBoxes.push_back(bbox1); - // Declare timing variables BEFORE the loop - #ifdef benchmark - clock_t starttime_2 = 0; - clock_t starttime_3 = 0; - #endif // benchmark - - #ifdef ROI - for (const auto &bbox : boundingBoxes) - #else - for (const auto &bbox : result.boxes) - #endif // ROI - { - Ort::Value decoderInputTensor = Ort::Value::CreateTensor( - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), - embeddings.data(), // Use the embeddings from the encoder - embeddings.size(), // Total number of elements - decoderInputDims.data(), - decoderInputDims.size() - ); - // Use center of bounding box as foreground point - float centerX = bbox.x + bbox.width/2; - float centerY = bbox.y + bbox.height/2; - - // Convert bounding box to points - std::vector pointCoords = { - (float)bbox.x, (float)bbox.y, // Top-left - (float)(bbox.x + bbox.width), (float)(bbox.y + bbox.height) // Bottom-right - }; - - - std::vector pointCoordsScaled; - - std::vector pointCoordsDims = {1, 2, 2}; // 2 points, each with (x, y) - - // Labels for the points - std::vector pointLabels = {2.0f, 3.0f}; // Box prompt labels - std::vector pointLabelsDims = {1, 2}; - - // Create dummy mask_input and has_mask_input - std::vector maskInput(256 * 256, 0.0f); // Fill with zeros - std::vector maskInputDims = {1, 1, 256, 256}; - - - std::vector hasMaskInput = {0.0f}; // No mask provided - std::vector hasMaskInputDims = {1}; - - utilities.ScaleBboxPoints(iImg, imgSize, pointCoords, pointCoordsScaled); - - - - - std::vector inputTensors = utilities.PrepareInputTensor( - decoderInputTensor, - pointCoordsScaled, - pointCoordsDims, - pointLabels, - pointLabelsDims, - maskInput, - maskInputDims, - hasMaskInput, - hasMaskInputDims - ); - - #ifdef benchmark - starttime_2 = clock(); - #endif // benchmark - auto output_tensors = session->Run( - options, - inputNodeNames.data(), - inputTensors.data(), - inputTensors.size(), - outputNodeNames.data(), - outputNodeNames.size()); - - #ifdef benchmark - starttime_3 = clock(); - #endif // benchmark - - - utilities.overlay(output_tensors, iImg, imgSize, result); - //std::cout << "Press any key to exit" << std::endl; - //cv::imshow("Result of INTERMEDIATE Detection", iImg); - //cv::waitKey(0); - //cv::destroyAllWindows(); - } - // Add the result to oResult - oResult.push_back(result); - - delete[] blob; - - #ifdef benchmark - clock_t starttime_4 = clock(); - double pre_process_time = (double)(starttime_2 - starttime_1) / CLOCKS_PER_SEC * 1000; - double process_time = (double)(starttime_3 - starttime_2) / CLOCKS_PER_SEC * 1000; - double post_process_time = (double)(starttime_4 - starttime_3) / CLOCKS_PER_SEC * 1000; - if (cudaEnable) - { - std::cout << "[SAM(CUDA)]: " << pre_process_time << "ms pre-process, " << process_time << "ms inference, " << post_process_time << "ms post-process." << std::endl; - } - else - { - std::cout << "[SAM(CPU)]: " << pre_process_time << "ms pre-process, " << process_time << "ms inference, " << post_process_time << "ms post-process." << std::endl; - } - #endif // benchmark - break; - } - - default: - std::cout << "[SAM]: " << "Not support model type." 
<< std::endl; - } - return RET_OK; + switch (_modelType) { + case SEG::SAM_SEGMENT_ENCODER: + // case OTHER_SAM_MODEL: + { + Ort::Value inputTensor = + Ort::Value::CreateTensor::type>( + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), + blob, 3 * _imgSize.at(0) * _imgSize.at(1), inputNodeDims.data(), + inputNodeDims.size()); +#ifdef benchmark + clock_t starttime_2 = clock(); +#endif // benchmark + auto outputTensor = + _session->Run(_options, _inputNodeNames.data(), &inputTensor, 1, + _outputNodeNames.data(), _outputNodeNames.size()); +#ifdef benchmark + clock_t starttime_3 = clock(); +#endif // benchmark + + Ort::TypeInfo typeInfo = outputTensor.front().GetTypeInfo(); + auto tensor_info = typeInfo.GetTensorTypeAndShapeInfo(); + std::vector outputNodeDims = tensor_info.GetShape(); + auto output = + outputTensor.front() + .GetTensorMutableData::type>(); + delete[] blob; + + int embeddingSize = outputNodeDims[1] * outputNodeDims[2] * + outputNodeDims[3]; // Flattened size + result.embeddings.assign(output, + output + embeddingSize); // Save embeddings + +#ifdef benchmark + clock_t starttime_4 = clock(); + double pre_process_time = + (double)(starttime_2 - starttime_1) / CLOCKS_PER_SEC * 1000; + double process_time = + (double)(starttime_3 - starttime_2) / CLOCKS_PER_SEC * 1000; + double post_process_time = + (double)(starttime_4 - starttime_3) / CLOCKS_PER_SEC * 1000; + if (_cudaEnable) { + std::cout << "[SAM(CUDA)]: " << pre_process_time << "ms pre-process, " + << process_time << "ms inference, " << post_process_time + << "ms post-process." << std::endl; + } else { + std::cout << "[SAM(CPU)]: " << pre_process_time << "ms pre-process, " + << process_time << "ms inference, " << post_process_time + << "ms post-process." << std::endl; + } +#endif // benchmark + + break; } + case SEG::SAM_SEGMENT_DECODER: { + // Use embeddings from the last result + std::vector embeddings = result.embeddings; + // Create tensor for decoder + std::vector decoderInputDims = { + 1, 256, 64, 64}; // Adjust based on your decoder's requirements + + // Create point coordinates for testing purposes +#ifdef ROI + + // Create a window for user interaction + namedWindow("Select and View Result", cv::WINDOW_AUTOSIZE); + + // Let the user select the bounding box + cv::Rect bbox = selectROI("Select and View Result", iImg, false, false); + + // Check if a valid bounding box was selected + if (bbox.width == 0 || bbox.height == 0) { + std::cerr << "No valid bounding box selected." 
<< std::endl; + return "[SAM]: NO valid Box."; + } + + std::vector boundingBoxes; + boundingBoxes.push_back(bbox); +#endif // ROI -char* SAM::WarmUpSession(SEG::MODEL_TYPE modelType) { - clock_t starttime_1 = clock(); - Utils utilities; - cv::Mat iImg = cv::Mat(cv::Size(imgSize.at(0), imgSize.at(1)), CV_8UC3); - cv::Mat processedImg; - utilities.PreProcess(iImg, imgSize, processedImg); - if (modelType < 4) +#ifdef benchmark + clock_t starttime_2 = 0; + clock_t starttime_3 = 0; +#endif // benchmark + +#ifdef ROI + for (const auto &box : boundingBoxes) +#else + for (const auto &box : result.boxes) +#endif // ROI { - float* blob = new float[iImg.total() * 3]; - utilities.BlobFromImage(processedImg, blob); - std::vector SAM_input_node_dims = { 1, 3, imgSize.at(0), imgSize.at(1) }; - switch (modelType) - { - case SEG::SAM_SEGMENT_ENCODER: { - Ort::Value input_tensor = Ort::Value::CreateTensor( - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), blob, 3 * imgSize.at(0) * imgSize.at(1), - SAM_input_node_dims.data(), SAM_input_node_dims.size()); - auto output_tensors = session->Run(options, inputNodeNames.data(), &input_tensor, 1, outputNodeNames.data(), - outputNodeNames.size()); - delete[] blob; - clock_t starttime_4 = clock(); - double post_process_time = (double)(starttime_4 - starttime_1) / CLOCKS_PER_SEC * 1000; - if (cudaEnable) - { - std::cout << "[SAM(CUDA)]: " << "Cuda warm-up cost " << post_process_time << " ms. " << std::endl; - } - break; - } - - case SEG::SAM_SEGMENT_DECODER: { - std::vector inputNodeDims = { 1, 256, 64, 64 }; // BUG: That was 236 instead of 256 - // Use embeddings from the last result - std::vector dummyEmbeddings(256 * 64 * 64, 1.0f); // Fill with zeros or any dummy values - std::vector decoderInputDims = { 1, 256, 64, 64 }; // Adjust based on your decoder's requirements - - - // Create dummy point coordinates and labels - std::vector boundingBoxes = { - cv::Rect(0, 0, 100, 100), // Example bounding box with (x, y, width, height) - //cv::Rect(0, 0, 473, 359) // Another example bounding box - }; - for (const auto& bbox : boundingBoxes) { - Ort::Value decoderInputTensor = Ort::Value::CreateTensor( - Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), - dummyEmbeddings.data(), // Use the embeddings from the encoder - dummyEmbeddings.size(), // Total number of elements - decoderInputDims.data(), - decoderInputDims.size() - ); - // Convert bounding box to points - // Use center of bounding box as foreground point - float centerX = bbox.x + bbox.width/2; - float centerY = bbox.y + bbox.height/2; - - std::vector pointCoords = { - centerX, centerY // Center point (foreground) - }; - - std::vector pointCoordsDims = { 1, 1, 2 }; // 2 points, each with (x, y) - - std::vector pointCoordsScaled; - - utilities.ScaleBboxPoints(iImg, imgSize, pointCoords, pointCoordsScaled); - - // Labels for the points - std::vector pointLabels = {1.0f}; // All points are foreground - std::vector pointLabelsDims = { 1, 1}; - // Create dummy mask_input and has_mask_input - std::vector maskInput(256 * 256, 0.0f); // Fill with zeros - std::vector maskInputDims = { 1, 1, 256, 256 }; - std::vector hasMaskInput = { 0.0f }; // No mask provided - std::vector hasMaskInputDims = { 1 }; - - std::vector inputTensors = utilities.PrepareInputTensor( - decoderInputTensor, - pointCoordsScaled, - pointCoordsDims, - pointLabels, - pointLabelsDims, - maskInput, - maskInputDims, - hasMaskInput, - hasMaskInputDims - ); - - auto output_tensors = session->Run( - options, - inputNodeNames.data(), 
- inputTensors.data(), - inputTensors.size(), - outputNodeNames.data(), - outputNodeNames.size() - ); } - - outputNodeNames.size(); - delete[] blob; - clock_t starttime_4 = clock(); - double post_process_time = (double)(starttime_4 - starttime_1) / CLOCKS_PER_SEC * 1000; - if (cudaEnable) - { - std::cout << "[SAM(CUDA)]: " << "Cuda warm-up cost " << post_process_time << " ms. " << std::endl; - } - - break; - } + Ort::Value decoderInputTensor = Ort::Value::CreateTensor( + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), + embeddings.data(), // Use the embeddings from the encoder + embeddings.size(), // Total number of elements + decoderInputDims.data(), decoderInputDims.size()); + // Use center of bounding box as foreground point + float centerX = box.x + box.width / 2.0; + float centerY = box.y + box.height / 2.0; + + // Convert bounding box to points + std::vector pointCoords = { + (float)box.x, (float)box.y, // Top-left + (float)(box.x + box.width), + (float)(box.y + box.height) // Bottom-right + }; + + std::vector pointCoordsScaled; + + std::vector pointCoordsDims = {1, 2, 2}; // 2 points, each with (x, y) + + // Labels for the points + std::vector pointLabels = {2.0f, 3.0f}; // Box prompt labels + std::vector pointLabelsDims = {1, 2}; + + // Create dummy mask_input and has_mask_input + std::vector maskInput(256 * 256, 0.0f); // Fill with zeros + std::vector maskInputDims = {1, 1, 256, 256}; + + std::vector hasMaskInput = {0.0f}; // No mask provided + std::vector hasMaskInputDims = {1}; + + utilities.ScaleBboxPoints(iImg, _imgSize, pointCoords, pointCoordsScaled); + + std::vector inputTensors = utilities.PrepareInputTensor( + decoderInputTensor, pointCoordsScaled, pointCoordsDims, pointLabels, + pointLabelsDims, maskInput, maskInputDims, hasMaskInput, + hasMaskInputDims); + +#ifdef benchmark + starttime_2 = clock(); +#endif // benchmark + auto output_tensors = _session->Run( + _options, _inputNodeNames.data(), inputTensors.data(), + inputTensors.size(), _outputNodeNames.data(), _outputNodeNames.size()); + +#ifdef benchmark + starttime_3 = clock(); +#endif // benchmark + + utilities.PostProcess(output_tensors, iImg, _imgSize, result); } + // Add the result to oResult + oResult.push_back(result); + + delete[] blob; + +#ifdef benchmark + clock_t starttime_4 = clock(); + double pre_process_time = + (double)(starttime_2 - starttime_1) / CLOCKS_PER_SEC * 1000; + double process_time = + (double)(starttime_3 - starttime_2) / CLOCKS_PER_SEC * 1000; + double post_process_time = + (double)(starttime_4 - starttime_3) / CLOCKS_PER_SEC * 1000; + if (_cudaEnable) { + std::cout << "[SAM(CUDA)]: " << pre_process_time << "ms pre-process, " + << process_time << "ms inference, " << post_process_time + << "ms post-process." << std::endl; + } else { + std::cout << "[SAM(CPU)]: " << pre_process_time << "ms pre-process, " + << process_time << "ms inference, " << post_process_time + << "ms post-process." << std::endl; + } +#endif // benchmark + break; + } + + default: + std::cout << "[SAM]: " << "Not support model type." 
<< std::endl; + } + return RET_OK; +} +char *SAM::WarmUpSession(SEG::MODEL_TYPE _modelType) { + clock_t starttime_1 = clock(); + Utils utilities; + cv::Mat iImg = cv::Mat(cv::Size(_imgSize.at(0), _imgSize.at(1)), CV_8UC3); + cv::Mat processedImg; + utilities.PreProcess(iImg, _imgSize, processedImg); + + float *blob = new float[iImg.total() * 3]; + utilities.BlobFromImage(processedImg, blob); + + // NCHW: H = imgSize[1], W = imgSize[0] + std::vector SAM_input_node_dims = {1, 3, _imgSize.at(1), _imgSize.at(0)}; + switch (_modelType) { + case SEG::SAM_SEGMENT_ENCODER: { + Ort::Value input_tensor = Ort::Value::CreateTensor( + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), blob, + 3 * _imgSize.at(0) * _imgSize.at(1), SAM_input_node_dims.data(), + SAM_input_node_dims.size()); + auto output_tensors = + _session->Run(_options, _inputNodeNames.data(), &input_tensor, 1, + _outputNodeNames.data(), _outputNodeNames.size()); + delete[] blob; + clock_t starttime_4 = clock(); + double post_process_time = + (double)(starttime_4 - starttime_1) / CLOCKS_PER_SEC * 1000; + if (_cudaEnable) { + std::cout << "[SAM(CUDA)]: " << "Cuda warm-up cost " << post_process_time + << " ms. " << std::endl; } - else - { -#ifdef USE_CUDA - half* blob = new half[iImg.total() * 3]; - utilities.BlobFromImage(processedImg, blob); - std::vector SAM_input_node_dims = { 1,3,imgSize.at(0),imgSize.at(1) }; - Ort::Value input_tensor = Ort::Value::CreateTensor(Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), blob, 3 * imgSize.at(0) * imgSize.at(1), SAM_input_node_dims.data(), SAM_input_node_dims.size()); - auto output_tensors = session->Run(options, inputNodeNames.data(), &input_tensor, 1, outputNodeNames.data(), outputNodeNames.size()); - delete[] blob; - clock_t starttime_4 = clock(); - double post_process_time = (double)(starttime_4 - starttime_1) / CLOCKS_PER_SEC * 1000; - if (cudaEnable) - { - std::cout << "[SAM(CUDA)]: " << "Cuda warm-up cost " << post_process_time << " ms. 
" << std::endl; - } -#endif + break; + } + + case SEG::SAM_SEGMENT_DECODER: { + std::vector inputNodeDims = { + 1, 256, 64, 64}; // BUG: That was 236 instead of 256 + // Use embeddings from the last result + std::vector dummyEmbeddings( + 256 * 64 * 64, 1.0f); // Fill with zeros or any dummy values + std::vector decoderInputDims = { + 1, 256, 64, 64}; // Adjust based on your decoder's requirements + + // Create dummy point coordinates and labels + std::vector boundingBoxes = { + cv::Rect(0, 0, 100, + 100), // Example bounding box with (x, y, width, height) + // cv::Rect(0, 0, 473, 359) // Another example bounding box + }; + for (const auto &bbox : boundingBoxes) { + Ort::Value decoderInputTensor = Ort::Value::CreateTensor( + Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU), + dummyEmbeddings.data(), // Use the embeddings from the encoder + dummyEmbeddings.size(), // Total number of elements + decoderInputDims.data(), decoderInputDims.size()); + // Convert bounding box to points + // Use center of bounding box as foreground point + float centerX = bbox.x + bbox.width / 2.0; + float centerY = bbox.y + bbox.height / 2.0; + + std::vector pointCoords = { + centerX, centerY // Center point (foreground) + }; + + std::vector pointCoordsDims = {1, 1, + 2}; // 2 points, each with (x, y) + + std::vector pointCoordsScaled; + + utilities.ScaleBboxPoints(iImg, _imgSize, pointCoords, pointCoordsScaled); + + // Labels for the points + std::vector pointLabels = {1.0f}; // All points are foreground + std::vector pointLabelsDims = {1, 1}; + // Create dummy mask_input and has_mask_input + std::vector maskInput(256 * 256, 0.0f); // Fill with zeros + std::vector maskInputDims = {1, 1, 256, 256}; + std::vector hasMaskInput = {0.0f}; // No mask provided + std::vector hasMaskInputDims = {1}; + + std::vector inputTensors = utilities.PrepareInputTensor( + decoderInputTensor, pointCoordsScaled, pointCoordsDims, pointLabels, + pointLabelsDims, maskInput, maskInputDims, hasMaskInput, + hasMaskInputDims); + + auto output_tensors = _session->Run( + _options, _inputNodeNames.data(), inputTensors.data(), + inputTensors.size(), _outputNodeNames.data(), _outputNodeNames.size()); } - return RET_OK; + + _outputNodeNames.size(); + delete[] blob; + clock_t starttime_4 = clock(); + double post_process_time = + (double)(starttime_4 - starttime_1) / CLOCKS_PER_SEC * 1000; + if (_cudaEnable) { + std::cout << "[SAM(CUDA)]: " << "Cuda warm-up cost " << post_process_time + << " ms. 
" << std::endl; + } + + break; + } + } + + return RET_OK; } diff --git a/src/segmentation.cpp b/src/segmentation.cpp new file mode 100644 index 0000000..41176de --- /dev/null +++ b/src/segmentation.cpp @@ -0,0 +1,54 @@ +#include "segmentation.h" + +std::tuple>, SEG::DL_INIT_PARAM, + SEG::DL_INIT_PARAM, SEG::DL_RESULT, std::vector> +Initializer() { + std::vector> samSegmentors; + samSegmentors.push_back(std::make_unique()); + samSegmentors.push_back(std::make_unique()); + + std::unique_ptr samSegmentorEncoder = std::make_unique(); + std::unique_ptr samSegmentorDecoder = std::make_unique(); + SEG::DL_INIT_PARAM params_encoder; + SEG::DL_INIT_PARAM params_decoder; + SEG::DL_RESULT res; + std::vector resSam; + params_encoder.rectConfidenceThreshold = 0.1; + params_encoder.iouThreshold = 0.5; + params_encoder.modelPath = "/home/amigo/Documents/repos/sam_onnx_ros/build/SAM_encoder.onnx"; + params_encoder.imgSize = {1024, 1024}; + + params_decoder = params_encoder; + params_decoder.modelType = SEG::SAM_SEGMENT_DECODER; + params_decoder.modelPath = "/home/amigo/Documents/repos/sam_onnx_ros/build/SAM_mask_decoder.onnx"; + +#ifdef USE_CUDA + params_encoder.cudaEnable = true; + params_decoder.cudaEnable = true; + +#else + params_encoder.cudaEnable = false; +#endif + + samSegmentorEncoder->CreateSession(params_encoder); + samSegmentorDecoder->CreateSession(params_decoder); + samSegmentors[0] = std::move(samSegmentorEncoder); + samSegmentors[1] = std::move(samSegmentorDecoder); + return {std::move(samSegmentors), params_encoder, params_decoder, res, resSam}; +} + +void SegmentAnything(std::vector> &samSegmentors, + const SEG::DL_INIT_PARAM ¶ms_encoder, + const SEG::DL_INIT_PARAM ¶ms_decoder, const cv::Mat &img, std::vector &resSam, + SEG::DL_RESULT &res) { + + + + SEG::MODEL_TYPE modelTypeRef = params_encoder.modelType; + samSegmentors[0]->RunSession(img, resSam, modelTypeRef, res); + + modelTypeRef = params_decoder.modelType; + samSegmentors[1]->RunSession(img, resSam, modelTypeRef, res); + + // return std::move(res.masks); +} diff --git a/src/utils.cpp b/src/utils.cpp index ce75a0b..643dba4 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -1,16 +1,18 @@ #include "utils.h" #include // for guided filter +//#define LOGGING // Constructor -Utils::Utils(){ - +Utils::Utils() +{ } // Destructor -Utils::~Utils(){ +Utils::~Utils() +{ } -char* Utils::PreProcess(const cv::Mat& iImg, std::vector iImgSize, cv::Mat& oImg) +char *Utils::PreProcess(const cv::Mat &iImg, std::vector iImgSize, cv::Mat &oImg) { if (iImg.channels() == 3) { @@ -22,41 +24,47 @@ char* Utils::PreProcess(const cv::Mat& iImg, std::vector iImgSize, cv::Mat& cv::cvtColor(iImg, oImg, cv::COLOR_GRAY2RGB); } - - if (iImg.cols >= iImg.rows) - { - resizeScales = iImg.cols / (float)iImgSize.at(0); - cv::resize(oImg, oImg, cv::Size(iImgSize.at(0), int(iImg.rows / resizeScales))); - } - else - { - resizeScales = iImg.rows / (float)iImgSize.at(0); - cv::resize(oImg, oImg, cv::Size(int(iImg.cols / resizeScales), iImgSize.at(1))); - } - cv::Mat tempImg = cv::Mat::zeros(iImgSize.at(0), iImgSize.at(1), CV_8UC3); - oImg.copyTo(tempImg(cv::Rect(0, 0, oImg.cols, oImg.rows))); - oImg = tempImg; + if (iImg.cols >= iImg.rows) + { + _resizeScales = iImg.cols / (float)iImgSize.at(0); + cv::resize(oImg, oImg, cv::Size(iImgSize.at(0), int(iImg.rows / _resizeScales))); + } + else + { + _resizeScales = iImg.rows / (float)iImgSize.at(1); + cv::resize(oImg, oImg, cv::Size(int(iImg.cols / _resizeScales), iImgSize.at(1))); + } + //cv::Mat tempImg = 
diff --git a/src/utils.cpp b/src/utils.cpp
index ce75a0b..643dba4 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -1,16 +1,18 @@
 #include "utils.h"
 #include <opencv2/ximgproc.hpp> // for guided filter
+//#define LOGGING

 // Constructor
-Utils::Utils(){
-
+Utils::Utils()
+{
 }

 // Destructor
-Utils::~Utils(){
+Utils::~Utils()
+{
 }

-char* Utils::PreProcess(const cv::Mat& iImg, std::vector<int> iImgSize, cv::Mat& oImg)
+char *Utils::PreProcess(const cv::Mat &iImg, std::vector<int> iImgSize, cv::Mat &oImg)
 {
     if (iImg.channels() == 3)
     {
@@ -22,41 +24,47 @@ char* Utils::PreProcess(const cv::Mat& iImg, std::vector<int> iImgSize, cv::Mat&
         cv::cvtColor(iImg, oImg, cv::COLOR_GRAY2RGB);
     }

-
-    if (iImg.cols >= iImg.rows)
-    {
-        resizeScales = iImg.cols / (float)iImgSize.at(0);
-        cv::resize(oImg, oImg, cv::Size(iImgSize.at(0), int(iImg.rows / resizeScales)));
-    }
-    else
-    {
-        resizeScales = iImg.rows / (float)iImgSize.at(0);
-        cv::resize(oImg, oImg, cv::Size(int(iImg.cols / resizeScales), iImgSize.at(1)));
-    }
-    cv::Mat tempImg = cv::Mat::zeros(iImgSize.at(0), iImgSize.at(1), CV_8UC3);
-    oImg.copyTo(tempImg(cv::Rect(0, 0, oImg.cols, oImg.rows)));
-    oImg = tempImg;
+    if (iImg.cols >= iImg.rows)
+    {
+        _resizeScales = iImg.cols / (float)iImgSize.at(0);
+        cv::resize(oImg, oImg, cv::Size(iImgSize.at(0), int(iImg.rows / _resizeScales)));
+    }
+    else
+    {
+        _resizeScales = iImg.rows / (float)iImgSize.at(1);
+        cv::resize(oImg, oImg, cv::Size(int(iImg.cols / _resizeScales), iImgSize.at(1)));
+    }
+    //cv::Mat tempImg = cv::Mat::zeros(iImgSize.at(0), iImgSize.at(1), CV_8UC3);
+    cv::Mat tempImg = cv::Mat::zeros(iImgSize.at(1), iImgSize.at(0), CV_8UC3);
+    oImg.copyTo(tempImg(cv::Rect(0, 0, oImg.cols, oImg.rows)));
+    oImg = tempImg;
     return RET_OK;
 }

-void Utils::ScaleBboxPoints(const cv::Mat& iImg, std::vector<int> imgSize, std::vector<float>& pointCoords, std::vector<float>& pointCoordsScaled){
+void Utils::ScaleBboxPoints(const cv::Mat &iImg, std::vector<int> imgSize, std::vector<float> &pointCoords, std::vector<float> &pointCoordsScaled)
+{
     pointCoordsScaled.clear();

     // Calculate same scale as preprocessing
     float scale;
-    if (iImg.cols >= iImg.rows) {
+    if (iImg.cols >= iImg.rows)
+    {
         scale = imgSize[0] / (float)iImg.cols;
-        resizeScalesBbox = iImg.cols / (float)imgSize[0];
-    } else {
+        _resizeScalesBbox = iImg.cols / (float)imgSize[0];
+    }
+    else
+    {
         scale = imgSize[1] / (float)iImg.rows;
-        resizeScalesBbox = iImg.rows / (float)imgSize[1];
+        _resizeScalesBbox = iImg.rows / (float)imgSize[1];
     }

-    // TOP-LEFT placement (matching PreProcess)
-    for (size_t i = 0; i < pointCoords.size(); i += 2) {
-        if (i + 1 < pointCoords.size()) {
+    // Top-Left placement (matching PreProcess)
+    for (size_t i = 0; i < pointCoords.size(); i += 2)
+    {
+        if (i + 1 < pointCoords.size())
+        {
             float x = pointCoords[i];
             float y = pointCoords[i + 1];
@@ -70,200 +78,145 @@ void Utils::ScaleBboxPoints(const cv::Mat& iImg, std::vector<int> imgSize, std::
         }
     }
 }

-std::vector<Ort::Value> Utils::PrepareInputTensor(Ort::Value& decoderInputTensor, std::vector<float>& pointCoordsScaled, std::vector<int64_t> pointCoordsDims, std::vector<float>& pointLabels,
-    std::vector<int64_t> pointLabelsDims, std::vector<float>& maskInput, std::vector<int64_t> maskInputDims, std::vector<float>& hasMaskInput, std::vector<int64_t> hasMaskInputDims){
-
-Ort::Value pointCoordsTensor = Ort::Value::CreateTensor<float>(
-    Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU),
-    pointCoordsScaled.data(),
-    pointCoordsScaled.size(),
-    pointCoordsDims.data(),
-    pointCoordsDims.size()
-);
-
-
-
-Ort::Value pointLabelsTensor = Ort::Value::CreateTensor<float>(
-    Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU),
-    pointLabels.data(),
-    pointLabels.size(),
-    pointLabelsDims.data(),
-    pointLabelsDims.size()
-);
+std::vector<Ort::Value> Utils::PrepareInputTensor(Ort::Value &decoderInputTensor, std::vector<float> &pointCoordsScaled, std::vector<int64_t> pointCoordsDims, std::vector<float> &pointLabels,
+                                                  std::vector<int64_t> pointLabelsDims, std::vector<float> &maskInput, std::vector<int64_t> maskInputDims, std::vector<float> &hasMaskInput, std::vector<int64_t> hasMaskInputDims)
+{
+    Ort::Value pointCoordsTensor = Ort::Value::CreateTensor<float>(
+        Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU),
+        pointCoordsScaled.data(),
+        pointCoordsScaled.size(),
+        pointCoordsDims.data(),
+        pointCoordsDims.size());
+
+    Ort::Value pointLabelsTensor = Ort::Value::CreateTensor<float>(
+        Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU),
+        pointLabels.data(),
+        pointLabels.size(),
+        pointLabelsDims.data(),
+        pointLabelsDims.size());
+
+    Ort::Value maskInputTensor = Ort::Value::CreateTensor<float>(
+        Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU),
+        maskInput.data(),
+        maskInput.size(),
+        maskInputDims.data(),
+        maskInputDims.size());
+
+    Ort::Value hasMaskInputTensor = Ort::Value::CreateTensor<float>(
+        Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU),
+        hasMaskInput.data(),
+        hasMaskInput.size(),
+        hasMaskInputDims.data(),
+        hasMaskInputDims.size());
+
+    // Pass all inputs to the decoder
+    std::vector<Ort::Value> inputTensors;
+    inputTensors.push_back(std::move(decoderInputTensor));
+    inputTensors.push_back(std::move(pointCoordsTensor));
+    inputTensors.push_back(std::move(pointLabelsTensor));
+    inputTensors.push_back(std::move(maskInputTensor));
+    inputTensors.push_back(std::move(hasMaskInputTensor));
+
+    return inputTensors;
+}
+void Utils::PostProcess(std::vector<Ort::Value> &output_tensors, const cv::Mat &iImg, std::vector<int> imgSize, SEG::DL_RESULT &result)
+{
+    if (output_tensors.size() < 2)
+    {
+        std::cerr << "[SAM]: Decoder returned insufficient outputs." << std::endl;
+        return;
+    }
+    // Assume [scores, masks]; consider shape-based detection later
+    auto scoresTensor = std::move(output_tensors[0]);
+    auto masksTensor = std::move(output_tensors[1]);

-Ort::Value maskInputTensor = Ort::Value::CreateTensor<float>(
-    Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU),
-    maskInput.data(),
-    maskInput.size(),
-    maskInputDims.data(),
-    maskInputDims.size()
-);
+    auto masksInfo = masksTensor.GetTensorTypeAndShapeInfo();
+    auto masksShape = masksInfo.GetShape();

+    if (masksShape.size() == 4)
+    {
+        auto masksData = masksTensor.GetTensorMutableData<float>();
+        auto scoresData = scoresTensor.GetTensorMutableData<float>();
+        const size_t numMasks = static_cast<size_t>(masksShape[1]);
+        const size_t height = static_cast<size_t>(masksShape[2]);
+        const size_t width = static_cast<size_t>(masksShape[3]);

-Ort::Value hasMaskInputTensor = Ort::Value::CreateTensor<float>(
-    Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU),
-    hasMaskInput.data(),
-    hasMaskInput.size(),
-    hasMaskInputDims.data(),
-    hasMaskInputDims.size()
-);
+        // Pick best mask by score
+        float bestScore = -1.0f;
+        size_t bestMaskIndex = 0;
+        for (size_t i = 0; i < numMasks; ++i)
+        {
+            const float s = scoresData ? scoresData[i] : 0.0f;
+            if (s > bestScore) { bestScore = s; bestMaskIndex = i; }
+        }

-// Pass all inputs to the decoder
-std::vector<Ort::Value> inputTensors;
-inputTensors.push_back(std::move(decoderInputTensor));
-inputTensors.push_back(std::move(pointCoordsTensor));
-inputTensors.push_back(std::move(pointLabelsTensor));
-inputTensors.push_back(std::move(maskInputTensor));
-inputTensors.push_back(std::move(hasMaskInputTensor));
+        // Compute preprocessed region (top-left anchored)
+        float scale;
+        int processedWidth, processedHeight;
+        if (iImg.cols >= iImg.rows)
+        {
+            scale = static_cast<float>(imgSize[0]) / static_cast<float>(iImg.cols);
+            processedWidth = imgSize[0];
+            processedHeight = static_cast<int>(iImg.rows * scale);
+        }
+        else
+        {
+            scale = static_cast<float>(imgSize[1]) / static_cast<float>(iImg.rows);
+            processedWidth = static_cast<int>(iImg.cols * scale);
+            processedHeight = imgSize[1];
+        }

-return inputTensors;
-}
-void Utils::overlay(std::vector<Ort::Value>& output_tensors, const cv::Mat& iImg, std::vector<int> imgSize, SEG::DL_RESULT& result){
-    // Process decoder output (masks)
-    if (output_tensors.size() > 0)
+        auto clampDim = [](int v, int lo, int hi) { return std::max(lo, std::min(v, hi)); };
+
+        // Wrap selected mask plane as float prob map
+        const size_t planeOffset = bestMaskIndex * height * width;
+        cv::Mat prob32f(static_cast<int>(height), static_cast<int>(width), CV_32F,
+                        const_cast<float *>(masksData + planeOffset));
+
+        // Crop in mask space using proportional dimensions (no hardcoded 256)
+        const int cropW = clampDim(static_cast<int>(std::round(static_cast<float>(width) * processedWidth / static_cast<float>(imgSize[0]))), 1, static_cast<int>(width));
+        const int cropH = clampDim(static_cast<int>(std::round(static_cast<float>(height) * processedHeight / static_cast<float>(imgSize[1]))), 1, static_cast<int>(height));
+        cv::Mat probCropped = prob32f(cv::Rect(0, 0, cropW, cropH));
+
+        // Resize probabilities to original image (linear)
+        cv::Mat probResized;
+        cv::resize(probCropped, probResized, cv::Size(iImg.cols, iImg.rows), 0, 0, cv::INTER_LINEAR);
+
+        // Threshold once to binary mask
+        cv::Mat finalMask;
+        cv::compare(probResized, 0.5f, finalMask, cv::CMP_GT); // CV_8U 0/255
+
+        // Morphological cleanup (light, then ensure binary)
+        int kernelSize = std::max(5, std::min(iImg.cols, iImg.rows) / 100);
+        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_ELLIPSE, cv::Size(kernelSize, kernelSize));
+        cv::morphologyEx(finalMask, finalMask, cv::MORPH_CLOSE, kernel);
+        cv::morphologyEx(finalMask, finalMask, cv::MORPH_OPEN, kernel);
+        cv::threshold(finalMask, finalMask, 127, 255, cv::THRESH_BINARY);
+
+        // Save mask
+        result.masks.push_back(finalMask);
+
+        // Overlay for display on a copy (iImg is const)
+        #ifdef LOGGING
+        cv::Mat overlay = iImg.clone();
+        std::vector<std::vector<cv::Point>> contours;
+        cv::findContours(finalMask, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE);
+
+        cv::Mat colorMask = cv::Mat::zeros(overlay.size(), CV_8UC3);
+        colorMask.setTo(cv::Scalar(0, 200, 0), finalMask);
+        cv::addWeighted(overlay, 0.7, colorMask, 0.3, 0, overlay);
+        cv::drawContours(overlay, contours, -1, cv::Scalar(0, 255, 255), 2);
+
+        cv::imshow("SAM Segmentation", overlay);
+        cv::waitKey(0);
+        cv::destroyAllWindows();
+        #endif // LOGGING
+    }
+    else
     {
-        // Get the masks from the output tensor
-        auto scoresTensor = std::move(output_tensors[0]); // IoU scores
-        auto masksTensor = std::move(output_tensors[1]); // First output should be the masks PROBABLY WRONG
-        auto masksInfo = masksTensor.GetTensorTypeAndShapeInfo();
-        auto masksShape = masksInfo.GetShape();
-
-
-        if (masksShape.size() == 4)
-        {
-            auto masksData = masksTensor.GetTensorMutableData<float>();
-            auto scoresData = scoresTensor.GetTensorMutableData<float>();
-
-            size_t batchSize = masksShape[0]; // Usually 1
-            size_t numMasks = masksShape[1];  // Number of masks (typically 1)
-            size_t height = masksShape[2];    // Height of mask
-            size_t width = masksShape[3];     // Width of mask
-
-
-            // Find the best mask (highest IoU score)
-            float bestScore = -1;
-            size_t bestMaskIndex = 0;
-
-            for (size_t i = 0; i < numMasks; ++i)
-            {
-
-                float score = scoresData[i];
-
-                if (score > bestScore) {
-                    bestScore = score;
-                    bestMaskIndex = i;
-                }
-            }
-            std::cout << "Best mask index: " << bestMaskIndex << ", Score: " << bestScore << std::endl;
-            // Create OpenCV Mat for the mask
-            cv::Mat mask = cv::Mat::zeros(height, width, CV_8UC1);
-
-            // Convert float mask to binary mask
-            for (size_t h = 0; h < height; ++h)
-            {
-                for (size_t w = 0; w < width; ++w)
-                {
-                    size_t idx = (bestMaskIndex * height * width) + (h * width) + w;
-                    float value = masksData[idx];
-                    mask.at<uchar>(h, w) = (value > 0.5f) ? 255 : 0; // Threshold at 0.5
-                }
-            }
-
-            // 1. Calculate the dimensions the image had during preprocessing
-            float scale;
-            int processedWidth, processedHeight;
-            if (iImg.cols >= iImg.rows) {
-                scale = (float)imgSize[0] / iImg.cols;
-                processedWidth = imgSize[0];
-                processedHeight = int(iImg.rows * scale);
-            } else {
-                scale = (float)imgSize[1] / iImg.rows;
-                processedWidth = int(iImg.cols * scale);
-                processedHeight = imgSize[1];
-            }
-            // 2. Resize mask to match the SAM input dimensions
-            //cv::Mat resizedMask;
-            //cv::resize(mask, resizedMask, cv::Size(256, 256));
-
-            // 3. Extract the portion that corresponds to the actual image (no padding)
-            int cropWidth = std::min(256, int(256 * processedWidth / (float)imgSize[0]));
-            int cropHeight = std::min(256, int(256 * processedHeight / (float)imgSize[1]));
-            cv::Mat croppedMask = mask(cv::Rect(0, 0, cropWidth, cropHeight));
-
-            // 4. Resize directly to original image dimensions in one step
-            cv::Mat finalMask;
-
-            // Use INTER_NEAREST for binary masks - preserves hard edges
-            cv::resize(croppedMask, finalMask, cv::Size(iImg.cols, iImg.rows), 0, 0, cv::INTER_NEAREST);
-
-            ////////////////////// GUIDED BILATERAL FILTER /////////////////////////
-            // Convert the upscaled mask to CV_8UC1 if necessary
-            if (finalMask.type() != CV_8UC1)
-            {
-                finalMask.convertTo(finalMask, CV_8UC1);
-            }
-
-            // Apply the Guided Filter
-            // cv::Mat filteredMask;
-            int radius = 2;
-            double eps = 0.01;
-            cv::ximgproc::guidedFilter(iImg, finalMask, finalMask, radius, eps);
-            ////////////////////// END: GUIDED BILATERAL FILTER /////////////////////////
-
-            ////////////////////// MORPHOLOGICAN OPERATIONS /////////////////////////
-            // Morphological operations to clean up the mask
-            int kernelSize = std::max(5, std::min(iImg.cols, iImg.rows) / 100); // Adaptive size
-            cv::Mat kernel = cv::getStructuringElement(cv::MORPH_ELLIPSE, cv::Size(kernelSize, kernelSize));
-
-            // CLOSE operation: fills small holes in the mask
-            cv::morphologyEx(finalMask, finalMask, cv::MORPH_CLOSE, kernel);
-
-            // OPEN operation: removes small noise
-            cv::morphologyEx(finalMask, finalMask, cv::MORPH_OPEN, kernel);
-
-            ////////////////////// END: MORPHOLOGICAN OPERATIONS /////////////////////////
-
-            // Re-threshold after resizing to ensure binary mask (critical step)
-
-            cv::threshold(finalMask, finalMask, 127, 255, cv::THRESH_BINARY);
-            result.masks.push_back(finalMask);
-
-            /*// Add IoU scores if available (typically second tensor)
-            if (output_tensors.size() > 1) {
-                auto scoresTensor = std::move(output_tensors[1]);
-                auto scoresData = scoresTensor.GetTensorMutableData<float>();
-                if (i < scoresTensor.GetTensorTypeAndShapeInfo().GetShape()[1]) {
-                    result.confidence = scoresData[i];
-                    std::cout << "Mask confidence: " << result.confidence << std::endl;
-                }
-            }*/
-
-
-            // Find contours of the mask
-            std::vector<std::vector<cv::Point>> contours;
-            cv::findContours(finalMask, contours, cv::RETR_EXTERNAL, cv::CHAIN_APPROX_SIMPLE);
-
-            // Create a semi-transparent overlay
-            cv::Mat colorMask = cv::Mat::zeros(iImg.size(), CV_8UC3);
-            colorMask.setTo(cv::Scalar(0, 200, 0), finalMask); // Green fill
-            cv::addWeighted(iImg, 0.7, colorMask, 0.3, 0, iImg);
-
-            // Draw contours with a thick, high-contrast outline
-            cv::drawContours(iImg, contours, -1, cv::Scalar(0, 255, 255), 2); // Yellow outline
-
-
-            // Save or display the result
-            cv::imwrite("segmentation_result_" + std::to_string(bestMaskIndex) + ".jpg", iImg);
-            cv::imwrite("mask_" + std::to_string(bestMaskIndex) + ".jpg", finalMask);
-        }else
-        {
-            std::cerr << "[SAM]: Unexpected mask tensor shape." << std::endl;
-        }
-    }else
-    {
-        std::cerr << "[SAM]: No masks found in the output tensor." << std::endl;
-    }
-}
\ No newline at end of file
+        std::cerr << "[SAM]: Unexpected mask tensor shape." << std::endl;
+    }
+}
diff --git a/test/sam_test.cpp b/test/sam_test.cpp
new file mode 100644
index 0000000..cd54d7e
--- /dev/null
+++ b/test/sam_test.cpp
@@ -0,0 +1,94 @@
+#include <gtest/gtest.h>
+#include <filesystem>
+#include <opencv2/opencv.hpp>
+#include "segmentation.h"
+#include "sam_inference.h"
+#include "dl_types.h"
+
+// This file contains higher-level (integration-ish) tests.
+// They cover object/session creation and a full pipeline run using synthetic images.
+// These tests may require the .onnx model files to be present next to the binary or in a known dir.
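+//
+// One possible way to run them (an assumption based on the CMakeLists.txt
+// configure_file() steps, not something this file enforces):
+//
+//   cd build && make tests
+//   ./devel/lib/<pkg>/sam_tests   # skips if SAM_encoder.onnx /
+//                                 # SAM_mask_decoder.onnx are not found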
+
+class SamInferenceTest : public ::testing::Test
+{
+protected:
+    void SetUp() override
+    {
+        // Create simple synthetic images:
+        // - a white 640x640 (square)
+        // - a gray 800x600 (non-square)
+        // (cv::Scalar::all() is used because cv::Mat::ones * 255 would only
+        // fill the first channel of a CV_8UC3 image.)
+        testImage_640x640 = cv::Mat(640, 640, CV_8UC3, cv::Scalar::all(255));
+        testImage_800x600 = cv::Mat(600, 800, CV_8UC3, cv::Scalar::all(128));
+
+        // A "random noise" image to simulate realistic content for end-to-end checks.
+        testImage_realistic = cv::Mat(640, 640, CV_8UC3);
+        cv::randu(testImage_realistic, cv::Scalar(0, 0, 0), cv::Scalar(255, 255, 255));
+
+        // Cache non-square size for preprocessing helpers.
+        NonSquareImgSize = {testImage_800x600.cols, testImage_800x600.rows};
+
+        // Use package helpers to build default params and SAM objects.
+        std::tie(samSegmentors, params_encoder, params_decoder, res, resSam) = Initializer();
+
+#ifdef USE_CUDA
+        params_encoder.cudaEnable = true;  // Enable CUDA if compiled with it
+#else
+        params_encoder.cudaEnable = false; // Otherwise run on CPU
+#endif
+    }
+
+    // Clean up the SAM objects after each test.
+    void TearDown() override { samSegmentors[0].reset(); samSegmentors[1].reset(); }
+
+    // Test data and objects shared across tests.
+    Utils utilities;
+    cv::Mat testImage_640x640, testImage_800x600, testImage_realistic;
+    std::vector<int> NonSquareImgSize;
+    std::vector<std::unique_ptr<SAM>> samSegmentors;
+    SEG::DL_INIT_PARAM params_encoder, params_decoder;
+    SEG::DL_RESULT res;
+    std::vector<SEG::DL_RESULT> resSam;
+};
+
+// Simple smoke test: we can construct a SAM object without throwing.
+TEST_F(SamInferenceTest, ObjectCreation)
+{
+    EXPECT_NO_THROW({
+        SAM localSam;
+    });
+}
+
+// Confirms that with a present encoder model we can initialize a session.
+// Skips if the model file is not available.
+TEST_F(SamInferenceTest, CreateSessionWithValidModel)
+{
+    if (!std::filesystem::exists("SAM_encoder.onnx")) {
+        GTEST_SKIP() << "Model not found in build dir";
+    }
+
+    EXPECT_NE(samSegmentors[0], nullptr) << "CreateSession should succeed with valid parameters";
+}
+
+// Confirms that giving an invalid model path returns an error (no crash).
+TEST_F(SamInferenceTest, CreateSessionWithInvalidModel)
+{
+    params_encoder.modelPath = "nonexistent_model.onnx";
+    const char* result = samSegmentors[0]->CreateSession(params_encoder);
+    EXPECT_NE(result, nullptr) << "CreateSession should fail with invalid model path";
+}
+
+// End-to-end check: with both encoder/decoder models present, the pipeline runs
+// and returns a mask vector. Skips if models are not available.
+TEST_F(SamInferenceTest, FullInferencePipeline)
+{
+    if (!std::filesystem::exists("SAM_encoder.onnx") ||
+        !std::filesystem::exists("SAM_mask_decoder.onnx")) {
+        GTEST_SKIP() << "Models not found in build dir";
+    }
+
+    SegmentAnything(samSegmentors, params_encoder, params_decoder, testImage_realistic, resSam, res);
+
+    // We only check that a vector is returned. (You can strengthen this to EXPECT_FALSE(res.masks.empty()).)
+    EXPECT_TRUE(res.masks.size() >= 0) << "Masks should be a valid output vector";
+}
diff --git a/test/test_utils.cpp b/test/test_utils.cpp
new file mode 100644
index 0000000..92d20ab
--- /dev/null
+++ b/test/test_utils.cpp
@@ -0,0 +1,191 @@
+#include <gtest/gtest.h>
+#include <opencv2/opencv.hpp>
+#include "utils.h"
+
+// This file contains small, focused unit tests for Utils.
+// We verify image preprocessing (channel conversion, aspect-preserving resize, padding)
+// and coordinate scaling to match preprocessing.
+
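+// The letterbox relation assumed throughout (mirroring Utils::PreProcess):
+//   landscape (w >= h): scale = target_w / w, content fills target_w x round(h * scale)
+//   portrait  (w <  h): scale = target_h / h, content fills round(w * scale) x target_h
+// with the resized content anchored at (0, 0) and zero (black) padding elsewhere.
+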
+// Lightweight fixture: gives each test a fresh Utils instance.
+class UtilsTest : public ::testing::Test {
+protected:
+    Utils u;
+};
+
+// Checks that a grayscale (1-channel) image is converted to RGB (3-channel)
+// and the output image is exactly the requested target size (letterboxed).
+TEST_F(UtilsTest, GrayscaleToRGBKeepsSize) {
+    cv::Mat gray = cv::Mat::zeros(300, 500, CV_8UC1);
+    cv::Mat out;
+    std::vector<int> target{1024, 1024};
+
+    // Call PreProcess and expect no error.
+    const char* err = u.PreProcess(gray, target, out);
+    ASSERT_EQ(err, nullptr);
+
+    // After preprocessing, we must have 3 channels (RGB).
+    EXPECT_EQ(out.channels(), 3);
+
+    // The letterboxed output must match the target canvas size.
+    EXPECT_EQ(out.size(), cv::Size(target[0], target[1]));
+}
+
+// Verifies three things:
+// 1) Aspect ratio is preserved when resizing to the target.
+// 2) The resized image is placed at the top-left (0,0).
+// 3) The padding area is zero (black).
+TEST_F(UtilsTest, PreprocessTopLeftPaddingAndAspect) {
+    const cv::Scalar fill(10, 20, 30); // Input color in BGR
+    cv::Mat img(720, 1280, CV_8UC3, fill);
+    cv::Mat out;
+    std::vector<int> target{1024, 1024};
+
+    ASSERT_EQ(u.PreProcess(img, target, out), nullptr);
+    ASSERT_EQ(out.size(), cv::Size(target[0], target[1]));
+    ASSERT_EQ(out.channels(), 3);
+
+    // Width drives resizing here (landscape). Width becomes 1024, height scales accordingly.
+    int resized_w = target[0];
+    int resized_h = static_cast<int>(img.rows / (img.cols / static_cast<float>(target[0])));
+
+    // PreProcess converts BGR -> RGB, so the expected color is swapped.
+    cv::Scalar expected_rgb(fill[2], fill[1], fill[0]);
+
+    // The top-left region (resized content) should keep the image color.
+    cv::Mat roi_top = out(cv::Rect(0, 0, resized_w, resized_h));
+    cv::Scalar mean_top = cv::mean(roi_top);
+    EXPECT_NEAR(mean_top[0], expected_rgb[0], 1.0);
+    EXPECT_NEAR(mean_top[1], expected_rgb[1], 1.0);
+    EXPECT_NEAR(mean_top[2], expected_rgb[2], 1.0);
+
+    // The area below the resized content (padding) must be zeros.
+    if (resized_h < target[1]) {
+        cv::Mat roi_pad = out(cv::Rect(0, resized_h, target[0], target[1] - resized_h));
+        cv::Mat gray; cv::cvtColor(roi_pad, gray, cv::COLOR_BGR2GRAY);
+        EXPECT_EQ(cv::countNonZero(gray), 0);
+    }
+}
+
+// Explicitly ensure imgSize is interpreted as [W, H] in PreProcess for non-square targets.
+TEST_F(UtilsTest, PreprocessNonSquareWidthHeightOrder) {
+    // Input image: H=300, W=500
+    cv::Mat img(300, 500, CV_8UC3, cv::Scalar(5, 6, 7));
+
+    // Target canvas (W,H) with non-square dims
+    std::vector<int> target{640, 480};
+    cv::Mat out;
+
+    ASSERT_EQ(u.PreProcess(img, target, out), nullptr);
+    // cols = width, rows = height
+    EXPECT_EQ(out.cols, target[0]);
+    EXPECT_EQ(out.rows, target[1]);
+    EXPECT_EQ(out.size(), cv::Size(target[0], target[1]));
+}
+
+// Parameterized fixture: used with TEST_P to run the same test body
+// for many (input size, target size) pairs.
+class UtilsPreprocessParamTest
+    : public ::testing::TestWithParam<std::tuple<cv::Size, cv::Size>> {
+protected:
+    Utils u;
+};
+
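+// Note: each tuple registered via INSTANTIATE_TEST_SUITE_P below becomes its
+// own test instance; gtest reports them as, e.g.,
+// ManySizes/UtilsPreprocessParamTest.LetterboxWithinBoundsAndChannels3/0.
+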
+// TEST_P defines a parameterized test. It runs once per parameter set.
+// We assert that:
+// - Output size equals the target canvas.
+// - Output has 3 channels (RGB).
+// - The padding area (bottom or right) is zero, depending on which side letterboxes.
+TEST_P(UtilsPreprocessParamTest, LetterboxWithinBoundsAndChannels3) {
+    const auto [inSize, target] = GetParam();
+    cv::Mat img(inSize, CV_8UC3, cv::Scalar(1, 2, 3));
+    cv::Mat out;
+
+    ASSERT_EQ(u.PreProcess(img, {target.width, target.height}, out), nullptr);
+    EXPECT_EQ(out.size(), target);
+    EXPECT_EQ(out.channels(), 3);
+
+    // Detect which side letterboxes and check that the padded region is zeros.
+    if (inSize.width >= inSize.height) {
+        int resized_h = static_cast<int>(inSize.height / (inSize.width / static_cast<float>(target.width)));
+        if (resized_h < target.height) {
+            cv::Mat roi_pad = out(cv::Rect(0, resized_h, target.width, target.height - resized_h));
+            cv::Mat gray; cv::cvtColor(roi_pad, gray, cv::COLOR_BGR2GRAY);
+            EXPECT_EQ(cv::countNonZero(gray), 0);
+        }
+    } else {
+        int resized_w = static_cast<int>(inSize.width / (inSize.height / static_cast<float>(target.height)));
+        if (resized_w < target.width) {
+            cv::Mat roi_pad = out(cv::Rect(resized_w, 0, target.width - resized_w, target.height));
+            cv::Mat gray; cv::cvtColor(roi_pad, gray, cv::COLOR_BGR2GRAY);
+            EXPECT_EQ(cv::countNonZero(gray), 0);
+        }
+    }
+}
+
+// INSTANTIATE_TEST_SUITE_P provides the concrete parameter values.
+// Each pair (input size, target size) creates a separate test instance.
+INSTANTIATE_TEST_SUITE_P(
+    ManySizes,
+    UtilsPreprocessParamTest,
+    ::testing::Values(
+        std::make_tuple(cv::Size(640, 640), cv::Size(1024, 1024)),  // square -> square
+        std::make_tuple(cv::Size(800, 600), cv::Size(800, 600)),    // same size (no resize)
+        std::make_tuple(cv::Size(600, 800), cv::Size(800, 600)),    // portrait -> landscape
+        std::make_tuple(cv::Size(1280, 720), cv::Size(1024, 1024))  // wide -> square
+    )
+);
+
+// Separate fixture for point scaling tests.
+class UtilsScaleBboxPointsTest : public ::testing::Test {
+protected:
+    Utils u;
+};
+
+// If the input size and target size are the same, scaling should do nothing.
+TEST_F(UtilsScaleBboxPointsTest, IdentityWhenSameSize) {
+    cv::Mat img(600, 800, CV_8UC3);
+    std::vector<int> target{800, 600};
+    std::vector<float> pts{100.f, 100.f, 700.f, 500.f};
+    std::vector<float> scaled;
+
+    u.ScaleBboxPoints(img, target, pts, scaled);
+    ASSERT_EQ(scaled.size(), pts.size());
+    EXPECT_NEAR(scaled[0], pts[0], 1e-3);
+    EXPECT_NEAR(scaled[1], pts[1], 1e-3);
+    EXPECT_NEAR(scaled[2], pts[2], 1e-3);
+    EXPECT_NEAR(scaled[3], pts[3], 1e-3);
+}
+
+// When width drives the resize (landscape), both x and y are scaled by the same factor.
+// We expect coordinates to be multiplied by target_width / input_width.
+TEST_F(UtilsScaleBboxPointsTest, ScalesWidthDominant) {
+    cv::Mat img(300, 600, CV_8UC3);      // h=300, w=600 (w >= h)
+    std::vector<int> target{1200, 600};  // width doubles
+    std::vector<float> pts{100.f, 50.f, 500.f, 250.f};
+    std::vector<float> scaled;
+
+    u.ScaleBboxPoints(img, target, pts, scaled);
+    ASSERT_EQ(scaled.size(), pts.size());
+    const float scale = target[0] / static_cast<float>(img.cols); // 1200/600 = 2
+    EXPECT_NEAR(scaled[0], pts[0] * scale, 1e-3);
+    EXPECT_NEAR(scaled[1], pts[1] * scale, 1e-3);
+    EXPECT_NEAR(scaled[2], pts[2] * scale, 1e-3);
+    EXPECT_NEAR(scaled[3], pts[3] * scale, 1e-3);
+}
+
+// When height drives the resize (portrait), both x and y are scaled by the same factor.
+// We expect coordinates to be multiplied by target_height / input_height.
+TEST_F(UtilsScaleBboxPointsTest, ScalesHeightDominant) {
+    cv::Mat img(600, 300, CV_8UC3);      // h=600, w=300 (h > w)
+    std::vector<int> target{600, 1200};  // height doubles
+    std::vector<float> pts{100.f, 50.f, 200.f, 500.f};
+    std::vector<float> scaled;
+
+    u.ScaleBboxPoints(img, target, pts, scaled);
+    ASSERT_EQ(scaled.size(), pts.size());
+    const float scale = target[1] / static_cast<float>(img.rows); // 1200/600 = 2
+    EXPECT_NEAR(scaled[0], pts[0] * scale, 1e-3);
+    EXPECT_NEAR(scaled[1], pts[1] * scale, 1e-3);
+    EXPECT_NEAR(scaled[2], pts[2] * scale, 1e-3);
+    EXPECT_NEAR(scaled[3], pts[3] * scale, 1e-3);
+}
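+
+// Worked example of the relation exercised above (values are from this test):
+// img is 300 wide x 600 high, target is {600, 1200}, so height dominates and
+// scale = 1200 / 600 = 2; the point (100, 50) therefore maps to (200, 100).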