From 229119c376070d3c7bc0170646e3651155024d38 Mon Sep 17 00:00:00 2001 From: Glenn Jocher Date: Thu, 11 May 2023 01:15:20 +0200 Subject: [PATCH] `ultralytics 8.0.98` add Baidu RT-DETR models (#2527) Co-authored-by: Kalen Michael Co-authored-by: Laughing <61612323+Laughing-q@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Yonghye Kwon Co-authored-by: Dowon --- docs/hub/app/index.md | 24 +- docs/reference/nn/modules.md | 173 ------ docs/reference/nn/modules/block.md | 84 +++ docs/reference/nn/modules/conv.md | 64 +++ docs/reference/nn/modules/head.md | 24 + docs/reference/nn/modules/transformer.md | 49 ++ docs/reference/nn/modules/utils.md | 24 + docs/reference/nn/tasks.md | 5 + mkdocs.yml | 19 +- tests/test_python.py | 2 +- ultralytics/__init__.py | 5 +- ultralytics/hub/__init__.py | 8 +- ultralytics/models/rt-detr/rt-detr-l.yaml | 49 ++ ultralytics/models/rt-detr/rt-detr-x.yaml | 53 ++ ultralytics/nn/autobackend.py | 1 + ultralytics/nn/modules.py | 616 ---------------------- ultralytics/nn/modules/__init__.py | 17 + ultralytics/nn/modules/block.py | 305 +++++++++++ ultralytics/nn/modules/conv.py | 277 ++++++++++ ultralytics/nn/modules/head.py | 382 ++++++++++++++ ultralytics/nn/modules/transformer.py | 390 ++++++++++++++ ultralytics/nn/modules/utils.py | 78 +++ ultralytics/nn/tasks.py | 43 +- ultralytics/vit/__init__.py | 7 +- ultralytics/vit/rtdetr/__init__.py | 7 + ultralytics/vit/rtdetr/model.py | 104 ++++ ultralytics/vit/rtdetr/predict.py | 42 ++ ultralytics/vit/rtdetr/val.py | 114 ++++ ultralytics/vit/sam/modules/decoders.py | 2 +- ultralytics/yolo/engine/predictor.py | 16 +- ultralytics/yolo/utils/downloads.py | 3 +- ultralytics/yolo/utils/files.py | 2 +- ultralytics/yolo/utils/torch_utils.py | 17 +- 33 files changed, 2172 insertions(+), 834 deletions(-) delete mode 100644 docs/reference/nn/modules.md create mode 100644 docs/reference/nn/modules/block.md create mode 100644 docs/reference/nn/modules/conv.md create mode 100644 docs/reference/nn/modules/head.md create mode 100644 docs/reference/nn/modules/transformer.md create mode 100644 docs/reference/nn/modules/utils.md create mode 100644 ultralytics/models/rt-detr/rt-detr-l.yaml create mode 100644 ultralytics/models/rt-detr/rt-detr-x.yaml delete mode 100644 ultralytics/nn/modules.py create mode 100644 ultralytics/nn/modules/__init__.py create mode 100644 ultralytics/nn/modules/block.py create mode 100644 ultralytics/nn/modules/conv.py create mode 100644 ultralytics/nn/modules/head.py create mode 100644 ultralytics/nn/modules/transformer.py create mode 100644 ultralytics/nn/modules/utils.py create mode 100644 ultralytics/vit/rtdetr/__init__.py create mode 100644 ultralytics/vit/rtdetr/model.py create mode 100644 ultralytics/vit/rtdetr/predict.py create mode 100644 ultralytics/vit/rtdetr/val.py diff --git a/docs/hub/app/index.md b/docs/hub/app/index.md index 8fb977e..96b7f9d 100644 --- a/docs/hub/app/index.md +++ b/docs/hub/app/index.md @@ -34,20 +34,18 @@ description: Experience the power of YOLOv5 and YOLOv8 models with Ultralytics H -Welcome to the Ultralytics HUB app, which is designed to demonstrate the power and capabilities of the YOLOv5 and YOLOv8 -models. This app is available for download on -the [Apple App Store](https://apps.apple.com/xk/app/ultralytics/id1583935240) and -the [Google Play Store](https://play.google.com/store/apps/details?id=com.ultralytics.ultralytics_app). +Welcome to the Ultralytics HUB App! 
We are excited to introduce this powerful mobile app that allows you to run YOLOv5 and YOLOv8 models directly on your [iOS](https://apps.apple.com/xk/app/ultralytics/id1583935240) and [Android](https://play.google.com/store/apps/details?id=com.ultralytics.ultralytics_app) devices. With the HUB App, you can utilize hardware acceleration features like Apple's Neural Engine (ANE) or Android GPU and Neural Network API (NNAPI) delegates to achieve impressive performance on your mobile device. -**To install the app, simply scan the QR code provided above**. At the moment, the app features YOLOv5 models, with -YOLOv8 models set to be available soon. +## Features -With the YOLOv5 model, you can easily detect and classify objects in images and videos with high accuracy and speed. The -model has been trained on a vast dataset and can recognize a wide range of objects, including pedestrians, traffic -signs, and cars. +- **Run YOLOv5 and YOLOv8 models**: Experience the power of YOLO models on your mobile device for real-time object detection and image recognition tasks. +- **Hardware Acceleration**: Benefit from Apple ANE on iOS devices or Android GPU and NNAPI delegates for optimized performance. +- **Custom Model Training**: Train custom models with the Ultralytics HUB platform and preview them live using the HUB App. +- **Mobile Compatibility**: The HUB App supports both iOS and Android devices, bringing the power of YOLO models to a wide range of users. -Using this app, you can try out YOLOv5 on your images and videos, and observe how the model works in real-time. -Additionally, you can learn more about YOLOv5's functionality and how it can be integrated into real-world applications. +## App Documentation -We are confident that you will enjoy using YOLOv5 and be amazed at its capabilities. Thank you for choosing Ultralytics -for your AI solutions. \ No newline at end of file +- [**iOS**](./ios.md): Learn about YOLO CoreML models accelerated on Apple's Neural Engine for iPhones and iPads. +- [**Android**](./android.md): Explore TFLite acceleration on Android mobile devices. + +Get started today by downloading the Ultralytics HUB App on your mobile device and unlock the potential of YOLOv5 and YOLOv8 models on-the-go. Don't forget to check out our comprehensive [HUB Docs](../) for more information on training, deploying, and using your custom models with the Ultralytics HUB platform. \ No newline at end of file diff --git a/docs/reference/nn/modules.md b/docs/reference/nn/modules.md deleted file mode 100644 index 9bfbdcf..0000000 --- a/docs/reference/nn/modules.md +++ /dev/null @@ -1,173 +0,0 @@ ---- -description: Explore Ultralytics neural network modules for convolution, attention, detection, pose, and classification in PyTorch. ---- - -# Conv ---- -:::ultralytics.nn.modules.Conv -
- -# DWConv ---- -:::ultralytics.nn.modules.DWConv -
- -# DWConvTranspose2d ---- -:::ultralytics.nn.modules.DWConvTranspose2d -
- -# ConvTranspose ---- -:::ultralytics.nn.modules.ConvTranspose -
- -# DFL ---- -:::ultralytics.nn.modules.DFL -
- -# TransformerLayer ---- -:::ultralytics.nn.modules.TransformerLayer -
- -# TransformerBlock ---- -:::ultralytics.nn.modules.TransformerBlock -
- -# Bottleneck ---- -:::ultralytics.nn.modules.Bottleneck -
- -# BottleneckCSP ---- -:::ultralytics.nn.modules.BottleneckCSP -
- -# C3 ---- -:::ultralytics.nn.modules.C3 -
- -# C2 ---- -:::ultralytics.nn.modules.C2 -
- -# C2f ---- -:::ultralytics.nn.modules.C2f -
- -# ChannelAttention ---- -:::ultralytics.nn.modules.ChannelAttention -
- -# SpatialAttention ---- -:::ultralytics.nn.modules.SpatialAttention -
- -# CBAM ---- -:::ultralytics.nn.modules.CBAM -
- -# C1 ---- -:::ultralytics.nn.modules.C1 -
- -# C3x ---- -:::ultralytics.nn.modules.C3x -
- -# C3TR ---- -:::ultralytics.nn.modules.C3TR -
- -# C3Ghost ---- -:::ultralytics.nn.modules.C3Ghost -
- -# SPP ---- -:::ultralytics.nn.modules.SPP -
- -# SPPF ---- -:::ultralytics.nn.modules.SPPF -
- -# Focus ---- -:::ultralytics.nn.modules.Focus -
- -# GhostConv ---- -:::ultralytics.nn.modules.GhostConv -
- -# GhostBottleneck ---- -:::ultralytics.nn.modules.GhostBottleneck -
- -# Concat ---- -:::ultralytics.nn.modules.Concat -
- -# Proto ---- -:::ultralytics.nn.modules.Proto -
- -# Ensemble ---- -:::ultralytics.nn.modules.Ensemble -
- -# Detect ---- -:::ultralytics.nn.modules.Detect -
- -# MLPBlock ---- -:::ultralytics.nn.modules.MLPBlock -
- -# LayerNorm2d ---- -:::ultralytics.nn.modules.LayerNorm2d -
- -# Segment ---- -:::ultralytics.nn.modules.Segment -
- -# Pose ---- -:::ultralytics.nn.modules.Pose -
- -# Classify ---- -:::ultralytics.nn.modules.Classify -
- -# autopad ---- -:::ultralytics.nn.modules.autopad -
\ No newline at end of file diff --git a/docs/reference/nn/modules/block.md b/docs/reference/nn/modules/block.md new file mode 100644 index 0000000..0eb5470 --- /dev/null +++ b/docs/reference/nn/modules/block.md @@ -0,0 +1,84 @@ +# DFL +--- +:::ultralytics.nn.modules.block.DFL +
+ +# Proto +--- +:::ultralytics.nn.modules.block.Proto +
+ +# HGStem +--- +:::ultralytics.nn.modules.block.HGStem +
+ +# HGBlock +--- +:::ultralytics.nn.modules.block.HGBlock +
+ +# SPP +--- +:::ultralytics.nn.modules.block.SPP +
+ +# SPPF +--- +:::ultralytics.nn.modules.block.SPPF +
+ +# C1 +--- +:::ultralytics.nn.modules.block.C1 +
+ +# C2 +--- +:::ultralytics.nn.modules.block.C2 +
+ +# C2f +--- +:::ultralytics.nn.modules.block.C2f +
+ +# C3 +--- +:::ultralytics.nn.modules.block.C3 +
+ +# C3x +--- +:::ultralytics.nn.modules.block.C3x +
+ +# RepC3 +--- +:::ultralytics.nn.modules.block.RepC3 +
+ +# C3TR +--- +:::ultralytics.nn.modules.block.C3TR +
+ +# C3Ghost +--- +:::ultralytics.nn.modules.block.C3Ghost +
+ +# GhostBottleneck +--- +:::ultralytics.nn.modules.block.GhostBottleneck +
+ +# Bottleneck +--- +:::ultralytics.nn.modules.block.Bottleneck +
+ +# BottleneckCSP +--- +:::ultralytics.nn.modules.block.BottleneckCSP +
diff --git a/docs/reference/nn/modules/conv.md b/docs/reference/nn/modules/conv.md new file mode 100644 index 0000000..779dcd8 --- /dev/null +++ b/docs/reference/nn/modules/conv.md @@ -0,0 +1,64 @@ +# Conv +--- +:::ultralytics.nn.modules.conv.Conv +
+ +# LightConv +--- +:::ultralytics.nn.modules.conv.LightConv +
+ +# DWConv +--- +:::ultralytics.nn.modules.conv.DWConv +
+ +# DWConvTranspose2d +--- +:::ultralytics.nn.modules.conv.DWConvTranspose2d +
+ +# ConvTranspose +--- +:::ultralytics.nn.modules.conv.ConvTranspose +
+ +# Focus +--- +:::ultralytics.nn.modules.conv.Focus +
+ +# GhostConv +--- +:::ultralytics.nn.modules.conv.GhostConv +
+ +# RepConv +--- +:::ultralytics.nn.modules.conv.RepConv +
+ +# ChannelAttention +--- +:::ultralytics.nn.modules.conv.ChannelAttention +
+ +# SpatialAttention +--- +:::ultralytics.nn.modules.conv.SpatialAttention +
+ +# CBAM +--- +:::ultralytics.nn.modules.conv.CBAM +
+ +# Concat +--- +:::ultralytics.nn.modules.conv.Concat +
+ +# autopad +--- +:::ultralytics.nn.modules.conv.autopad +
diff --git a/docs/reference/nn/modules/head.md b/docs/reference/nn/modules/head.md new file mode 100644 index 0000000..2515ca8 --- /dev/null +++ b/docs/reference/nn/modules/head.md @@ -0,0 +1,24 @@ +# Detect +--- +:::ultralytics.nn.modules.head.Detect +
+ +# Segment +--- +:::ultralytics.nn.modules.head.Segment +
+ +# Pose +--- +:::ultralytics.nn.modules.head.Pose +
+ +# Classify +--- +:::ultralytics.nn.modules.head.Classify +
+ +# RTDETRDecoder +--- +:::ultralytics.nn.modules.head.RTDETRDecoder +
diff --git a/docs/reference/nn/modules/transformer.md b/docs/reference/nn/modules/transformer.md new file mode 100644 index 0000000..c607b93 --- /dev/null +++ b/docs/reference/nn/modules/transformer.md @@ -0,0 +1,49 @@ +# TransformerEncoderLayer +--- +:::ultralytics.nn.modules.transformer.TransformerEncoderLayer +
+ +# AIFI +--- +:::ultralytics.nn.modules.transformer.AIFI +
+ +# TransformerLayer +--- +:::ultralytics.nn.modules.transformer.TransformerLayer +
+ +# TransformerBlock +--- +:::ultralytics.nn.modules.transformer.TransformerBlock +
+ +# MLPBlock +--- +:::ultralytics.nn.modules.transformer.MLPBlock +
+ +# MLP +--- +:::ultralytics.nn.modules.transformer.MLP +
+ +# LayerNorm2d +--- +:::ultralytics.nn.modules.transformer.LayerNorm2d +
+ +# MSDeformAttn +--- +:::ultralytics.nn.modules.transformer.MSDeformAttn +
+ +# DeformableTransformerDecoderLayer +--- +:::ultralytics.nn.modules.transformer.DeformableTransformerDecoderLayer +
+ +# DeformableTransformerDecoder +--- +:::ultralytics.nn.modules.transformer.DeformableTransformerDecoder +
diff --git a/docs/reference/nn/modules/utils.md b/docs/reference/nn/modules/utils.md new file mode 100644 index 0000000..8ee2577 --- /dev/null +++ b/docs/reference/nn/modules/utils.md @@ -0,0 +1,24 @@ +# _get_clones +--- +:::ultralytics.nn.modules.utils._get_clones +
+ +# bias_init_with_prob +--- +:::ultralytics.nn.modules.utils.bias_init_with_prob +
+ +# linear_init_ +--- +:::ultralytics.nn.modules.utils.linear_init_ +
+ +# inverse_sigmoid +--- +:::ultralytics.nn.modules.utils.inverse_sigmoid +
+ +# multi_scale_deformable_attn_pytorch +--- +:::ultralytics.nn.modules.utils.multi_scale_deformable_attn_pytorch +
diff --git a/docs/reference/nn/tasks.md b/docs/reference/nn/tasks.md index 977ed65..2263779 100644 --- a/docs/reference/nn/tasks.md +++ b/docs/reference/nn/tasks.md @@ -27,6 +27,11 @@ description: Learn how to work with Ultralytics YOLO Detection, Segmentation & C :::ultralytics.nn.tasks.ClassificationModel
+# Ensemble +--- +:::ultralytics.nn.tasks.Ensemble +
+ # torch_safe_load --- :::ultralytics.nn.tasks.torch_safe_load diff --git a/mkdocs.yml b/mkdocs.yml index 79b2e02..5f50397 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -95,6 +95,7 @@ extra: extra_css: - stylesheets/style.css + - https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css markdown_extensions: # Div text decorators @@ -249,7 +250,12 @@ nav: - nn: - autobackend: reference/nn/autobackend.md - autoshape: reference/nn/autoshape.md - - modules: reference/nn/modules.md + - modules: + - blocks: reference/nn/modules/block.md + - convs: reference/nn/modules/conv.md + - head: reference/nn/modules/head.md + - transformer: reference/nn/modules/transformer.md + - utils: reference/nn/modules/utils.md - tasks: reference/nn/tasks.md - tracker: - track: reference/tracker/track.md @@ -340,7 +346,8 @@ plugins: - ultralytics: add_desc: False add_image: True - default_image: https://raw.githubusercontent.com/ultralytics/assets/main/yolov8/banner-yolov8.png + add_share_buttons: True + default_image: https://github.com/ultralytics/ultralytics/assets/26833433/6d09221c-c52a-4234-9a5d-b862e93c6529 - git-revision-date-localized: type: timeago enable_creation_date: true @@ -366,7 +373,7 @@ plugins: reference/base_trainer.md: reference/yolo/engine/trainer.md reference/exporter.md: reference/yolo/engine/exporter.md reference/model.md: reference/yolo/engine/model.md - reference/nn.md: reference/nn/modules.md + reference/nn.md: reference/nn/modules/head.md reference/ops.md: reference/yolo/utils/ops.md reference/results.md: reference/yolo/engine/results.md reference/base_val.md: index.md @@ -427,6 +434,12 @@ plugins: yolov5/tutorials/comet_integration_tutorial.md: yolov5/tutorials/comet_logging_integration.md yolov5/tutorials/yolov5_pruning_and_sparsity_tutorial.md: yolov5/tutorials/model_pruning_and_sparsity.md yolov5/tutorials/yolov5_jetson_nano_tutorial.md: yolov5/tutorials/running_on_jetson_nano.md + yolov5/tutorials/yolov5_roboflow_integration.md: yolov5/tutorials/roboflow_datasets_integration.md + yolov5/tutorials/hyperparameter_evolution_tutorial.md: yolov5/tutorials/hyperparameter_evolution.md + yolov5/tutorials/yolov5_hyperparameter_evolution_tutorial.md: yolov5/tutorials/hyperparameter_evolution.md + yolov5/tutorials/clearml_integration_tutorial.md: yolov5/tutorials/clearml_logging_integration.md + yolov5/tutorials/test_time_augmentation_tutorial.md: yolov5/tutorials/test_time_augmentation.md + yolov5/tutorials/yolov5_test_time_augmentation_tutorial.md: yolov5/tutorials/test_time_augmentation.md yolov5/environments/yolov5_amazon_web_services_quickstart_tutorial.md: yolov5/environments/aws_quickstart_tutorial.md yolov5/environments/yolov5_google_cloud_platform_quickstart_tutorial.md: yolov5/environments/google_cloud_quickstart_tutorial.md yolov5/environments/yolov5_docker_image_quickstart_tutorial.md: yolov5/environments/docker_image_quickstart_tutorial.md diff --git a/tests/test_python.py b/tests/test_python.py index 09fb9e6..0e8d9f1 100644 --- a/tests/test_python.py +++ b/tests/test_python.py @@ -173,7 +173,7 @@ def test_export_paddle(enabled=False): def test_all_model_yamls(): - for m in list((ROOT / 'models').rglob('*.yaml')): + for m in list((ROOT / 'models').rglob('yolo*.yaml')): YOLO(m.name) diff --git a/ultralytics/__init__.py b/ultralytics/__init__.py index c3f35a2..0e1c4da 100644 --- a/ultralytics/__init__.py +++ b/ultralytics/__init__.py @@ -1,10 +1,11 @@ # Ultralytics YOLO 🚀, AGPL-3.0 license -__version__ = '8.0.97' +__version__ = '8.0.98' from ultralytics.hub import start 
+from ultralytics.vit.rtdetr import RTDETR from ultralytics.vit.sam import SAM from ultralytics.yolo.engine.model import YOLO from ultralytics.yolo.utils.checks import check_yolo as checks -__all__ = '__version__', 'YOLO', 'SAM', 'checks', 'start' # allow simpler import +__all__ = '__version__', 'YOLO', 'SAM', 'RTDETR', 'checks', 'start' # allow simpler import diff --git a/ultralytics/hub/__init__.py b/ultralytics/hub/__init__.py index 707563f..59c7692 100644 --- a/ultralytics/hub/__init__.py +++ b/ultralytics/hub/__init__.py @@ -71,11 +71,9 @@ def export_fmts_hub(): def export_model(model_id='', format='torchscript'): """Export a model to all formats.""" assert format in export_fmts_hub(), f"Unsupported export format '{format}', valid formats are {export_fmts_hub()}" - r = requests.post('https://api.ultralytics.com/export', - json={ - 'apiKey': Auth().api_key, - 'modelId': model_id, - 'format': format}) + r = requests.post(f'https://api.ultralytics.com/v1/models/{model_id}/export', + json={'format': format}, + headers={'x-api-key': Auth().api_key}) assert r.status_code == 200, f'{PREFIX}{format} export failure {r.status_code} {r.reason}' LOGGER.info(f'{PREFIX}{format} export started ✅') diff --git a/ultralytics/models/rt-detr/rt-detr-l.yaml b/ultralytics/models/rt-detr/rt-detr-l.yaml new file mode 100644 index 0000000..37299fa --- /dev/null +++ b/ultralytics/models/rt-detr/rt-detr-l.yaml @@ -0,0 +1,49 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +# Parameters +nc: 80 # number of classes +scales: # model compound scaling constants, i.e. 'model=yolov8n-cls.yaml' will call yolov8-cls.yaml with scale 'n' + # [depth, width, max_channels] + l: [1.00, 1.00, 1024] + +backbone: + # [from, repeats, module, args] + - [-1, 1, HGStem, [32, 48]] # 0-P2/4 + - [-1, 6, HGBlock, [48, 128, 3]] # stage 1 + + - [-1, 1, DWConv, [128, 3, 2, 1, False]] # 2-P3/8 + - [-1, 6, HGBlock, [96, 512, 3]] # stage 2 + + - [-1, 1, DWConv, [512, 3, 2, 1, False]] # 4-P3/16 + - [-1, 6, HGBlock, [192, 1024, 5, True, False]] # cm, c2, k, light, shortcut + - [-1, 6, HGBlock, [192, 1024, 5, True, True]] + - [-1, 6, HGBlock, [192, 1024, 5, True, True]] # stage 3 + + - [-1, 1, DWConv, [1024, 3, 2, 1, False]] # 8-P4/32 + - [-1, 6, HGBlock, [384, 2048, 5, True, False]] # stage 4 + +head: + - [-1, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 10 input_proj.2 + - [-1, 1, AIFI, [1024, 8]] + - [-1, 1, Conv, [256, 1, 1]] # 12, Y5, lateral_convs.0 + + - [-1, 1, nn.Upsample, [None, 2, 'nearest']] + - [7, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 14 input_proj.1 + - [[-2, -1], 1, Concat, [1]] + - [-1, 3, RepC3, [256]] # 16, fpn_blocks.0 + - [-1, 1, Conv, [256, 1, 1]] # 17, Y4, lateral_convs.1 + + - [-1, 1, nn.Upsample, [None, 2, 'nearest']] + - [3, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 19 input_proj.0 + - [[-2, -1], 1, Concat, [1]] # cat backbone P4 + - [-1, 3, RepC3, [256]] # X3 (21), fpn_blocks.1 + + - [-1, 1, Conv, [256, 3, 2]] # 22, downsample_convs.0 + - [[-1, 17], 1, Concat, [1]] # cat Y4 + - [-1, 3, RepC3, [256]] # F4 (24), pan_blocks.0 + + - [-1, 1, Conv, [256, 3, 2]] # 25, downsample_convs.1 + - [[-1, 12], 1, Concat, [1]] # cat Y5 + - [-1, 3, RepC3, [256]] # F5 (27), pan_blocks.1 + + - [[21, 24, 27], 1, RTDETRDecoder, [nc]] # Detect(P3, P4, P5) diff --git a/ultralytics/models/rt-detr/rt-detr-x.yaml b/ultralytics/models/rt-detr/rt-detr-x.yaml new file mode 100644 index 0000000..e5b0b67 --- /dev/null +++ b/ultralytics/models/rt-detr/rt-detr-x.yaml @@ -0,0 +1,53 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +# Parameters +nc: 80 # 
number of classes +scales: # model compound scaling constants, i.e. 'model=yolov8n-cls.yaml' will call yolov8-cls.yaml with scale 'n' + # [depth, width, max_channels] + x: [1.00, 1.00, 2048] + +backbone: + # [from, repeats, module, args] + - [-1, 1, HGStem, [32, 64]] # 0-P2/4 + - [-1, 6, HGBlock, [64, 128, 3]] # stage 1 + + - [-1, 1, DWConv, [128, 3, 2, 1, False]] # 2-P3/8 + - [-1, 6, HGBlock, [128, 512, 3]] + - [-1, 6, HGBlock, [128, 512, 3, False, True]] # 4-stage 2 + + - [-1, 1, DWConv, [512, 3, 2, 1, False]] # 5-P3/16 + - [-1, 6, HGBlock, [256, 1024, 5, True, False]] # cm, c2, k, light, shortcut + - [-1, 6, HGBlock, [256, 1024, 5, True, True]] + - [-1, 6, HGBlock, [256, 1024, 5, True, True]] + - [-1, 6, HGBlock, [256, 1024, 5, True, True]] + - [-1, 6, HGBlock, [256, 1024, 5, True, True]] # 10-stage 3 + + - [-1, 1, DWConv, [1024, 3, 2, 1, False]] # 11-P4/32 + - [-1, 6, HGBlock, [512, 2048, 5, True, False]] + - [-1, 6, HGBlock, [512, 2048, 5, True, True]] # 13-stage 4 + +head: + - [-1, 1, Conv, [384, 1, 1, None, 1, 1, False]] # 14 input_proj.2 + - [-1, 1, AIFI, [2048, 8]] + - [-1, 1, Conv, [384, 1, 1]] # 16, Y5, lateral_convs.0 + + - [-1, 1, nn.Upsample, [None, 2, 'nearest']] + - [10, 1, Conv, [384, 1, 1, None, 1, 1, False]] # 18 input_proj.1 + - [[-2, -1], 1, Concat, [1]] + - [-1, 3, RepC3, [384]] # 20, fpn_blocks.0 + - [-1, 1, Conv, [384, 1, 1]] # 21, Y4, lateral_convs.1 + + - [-1, 1, nn.Upsample, [None, 2, 'nearest']] + - [4, 1, Conv, [384, 1, 1, None, 1, 1, False]] # 23 input_proj.0 + - [[-2, -1], 1, Concat, [1]] # cat backbone P4 + - [-1, 3, RepC3, [384]] # X3 (25), fpn_blocks.1 + + - [-1, 1, Conv, [384, 3, 2]] # 26, downsample_convs.0 + - [[-1, 21], 1, Concat, [1]] # cat Y4 + - [-1, 3, RepC3, [384]] # F4 (28), pan_blocks.0 + + - [-1, 1, Conv, [384, 3, 2]] # 29, downsample_convs.1 + - [[-1, 16], 1, Concat, [1]] # cat Y5 + - [-1, 3, RepC3, [384]] # F5 (31), pan_blocks.1 + + - [[25, 28, 31], 1, RTDETRDecoder, [nc]] # Detect(P3, P4, P5) diff --git a/ultralytics/nn/autobackend.py b/ultralytics/nn/autobackend.py index 5e61049..7a94d1b 100644 --- a/ultralytics/nn/autobackend.py +++ b/ultralytics/nn/autobackend.py @@ -1,4 +1,5 @@ # Ultralytics YOLO 🚀, AGPL-3.0 license + import ast import contextlib import json diff --git a/ultralytics/nn/modules.py b/ultralytics/nn/modules.py deleted file mode 100644 index 8ad7672..0000000 --- a/ultralytics/nn/modules.py +++ /dev/null @@ -1,616 +0,0 @@ -# Ultralytics YOLO 🚀, AGPL-3.0 license -""" -Common modules -""" - -import math - -import torch -import torch.nn as nn - -from ultralytics.yolo.utils.tal import dist2bbox, make_anchors - - -def autopad(k, p=None, d=1): # kernel, padding, dilation - """Pad to 'same' shape outputs.""" - if d > 1: - k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size - if p is None: - p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad - return p - - -class Conv(nn.Module): - """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation).""" - default_act = nn.SiLU() # default activation - - def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): - """Initialize Conv layer with given arguments including activation.""" - super().__init__() - self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) - self.bn = nn.BatchNorm2d(c2) - self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() - - def forward(self, x): - """Apply convolution, batch 
normalization and activation to input tensor.""" - return self.act(self.bn(self.conv(x))) - - def forward_fuse(self, x): - """Perform transposed convolution of 2D data.""" - return self.act(self.conv(x)) - - -class DWConv(Conv): - """Depth-wise convolution.""" - - def __init__(self, c1, c2, k=1, s=1, d=1, act=True): # ch_in, ch_out, kernel, stride, dilation, activation - super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act) - - -class DWConvTranspose2d(nn.ConvTranspose2d): - """Depth-wise transpose convolution.""" - - def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0): # ch_in, ch_out, kernel, stride, padding, padding_out - super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2)) - - -class ConvTranspose(nn.Module): - """Convolution transpose 2d layer.""" - default_act = nn.SiLU() # default activation - - def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True): - """Initialize ConvTranspose2d layer with batch normalization and activation function.""" - super().__init__() - self.conv_transpose = nn.ConvTranspose2d(c1, c2, k, s, p, bias=not bn) - self.bn = nn.BatchNorm2d(c2) if bn else nn.Identity() - self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() - - def forward(self, x): - """Applies transposed convolutions, batch normalization and activation to input.""" - return self.act(self.bn(self.conv_transpose(x))) - - def forward_fuse(self, x): - """Applies activation and convolution transpose operation to input.""" - return self.act(self.conv_transpose(x)) - - -class DFL(nn.Module): - """ - Integral module of Distribution Focal Loss (DFL). - Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391 - """ - - def __init__(self, c1=16): - """Initialize a convolutional layer with a given number of input channels.""" - super().__init__() - self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False) - x = torch.arange(c1, dtype=torch.float) - self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1)) - self.c1 = c1 - - def forward(self, x): - """Applies a transformer layer on input tensor 'x' and returns a tensor.""" - b, c, a = x.shape # batch, channels, anchors - return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a) - # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a) - - -class TransformerLayer(nn.Module): - """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance).""" - - def __init__(self, c, num_heads): - """Initializes a self-attention mechanism using linear transformations and multi-head attention.""" - super().__init__() - self.q = nn.Linear(c, c, bias=False) - self.k = nn.Linear(c, c, bias=False) - self.v = nn.Linear(c, c, bias=False) - self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads) - self.fc1 = nn.Linear(c, c, bias=False) - self.fc2 = nn.Linear(c, c, bias=False) - - def forward(self, x): - """Apply a transformer block to the input x and return the output.""" - x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x - x = self.fc2(self.fc1(x)) + x - return x - - -class TransformerBlock(nn.Module): - """Vision Transformer https://arxiv.org/abs/2010.11929.""" - - def __init__(self, c1, c2, num_heads, num_layers): - """Initialize a Transformer module with position embedding and specified number of heads and layers.""" - super().__init__() - self.conv = None - if c1 != c2: - self.conv = Conv(c1, c2) - self.linear = nn.Linear(c2, c2) # learnable position embedding - self.tr = 
nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers))) - self.c2 = c2 - - def forward(self, x): - """Forward propagates the input through the bottleneck module.""" - if self.conv is not None: - x = self.conv(x) - b, _, w, h = x.shape - p = x.flatten(2).permute(2, 0, 1) - return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, w, h) - - -class Bottleneck(nn.Module): - """Standard bottleneck.""" - - def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): # ch_in, ch_out, shortcut, groups, kernels, expand - super().__init__() - c_ = int(c2 * e) # hidden channels - self.cv1 = Conv(c1, c_, k[0], 1) - self.cv2 = Conv(c_, c2, k[1], 1, g=g) - self.add = shortcut and c1 == c2 - - def forward(self, x): - """'forward()' applies the YOLOv5 FPN to input data.""" - return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) - - -class BottleneckCSP(nn.Module): - """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks.""" - - def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion - super().__init__() - c_ = int(c2 * e) # hidden channels - self.cv1 = Conv(c1, c_, 1, 1) - self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) - self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) - self.cv4 = Conv(2 * c_, c2, 1, 1) - self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) - self.act = nn.SiLU() - self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n))) - - def forward(self, x): - """Applies a CSP bottleneck with 3 convolutions.""" - y1 = self.cv3(self.m(self.cv1(x))) - y2 = self.cv2(x) - return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1)))) - - -class C3(nn.Module): - """CSP Bottleneck with 3 convolutions.""" - - def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion - super().__init__() - c_ = int(c2 * e) # hidden channels - self.cv1 = Conv(c1, c_, 1, 1) - self.cv2 = Conv(c1, c_, 1, 1) - self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2) - self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n))) - - def forward(self, x): - """Forward pass through the CSP bottleneck with 2 convolutions.""" - return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) - - -class C2(nn.Module): - """CSP Bottleneck with 2 convolutions.""" - - def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion - super().__init__() - self.c = int(c2 * e) # hidden channels - self.cv1 = Conv(c1, 2 * self.c, 1, 1) - self.cv2 = Conv(2 * self.c, c2, 1) # optional act=FReLU(c2) - # self.attention = ChannelAttention(2 * self.c) # or SpatialAttention() - self.m = nn.Sequential(*(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))) - - def forward(self, x): - """Forward pass through the CSP bottleneck with 2 convolutions.""" - a, b = self.cv1(x).chunk(2, 1) - return self.cv2(torch.cat((self.m(a), b), 1)) - - -class C2f(nn.Module): - """CSP Bottleneck with 2 convolutions.""" - - def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion - super().__init__() - self.c = int(c2 * e) # hidden channels - self.cv1 = Conv(c1, 2 * self.c, 1, 1) - self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2) - self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)) - - def forward(self, x): 
- """Forward pass of a YOLOv5 CSPDarknet backbone layer.""" - y = list(self.cv1(x).chunk(2, 1)) - y.extend(m(y[-1]) for m in self.m) - return self.cv2(torch.cat(y, 1)) - - def forward_split(self, x): - """Applies spatial attention to module's input.""" - y = list(self.cv1(x).split((self.c, self.c), 1)) - y.extend(m(y[-1]) for m in self.m) - return self.cv2(torch.cat(y, 1)) - - -class ChannelAttention(nn.Module): - """Channel-attention module https://github.com/open-mmlab/mmdetection/tree/v3.0.0rc1/configs/rtmdet.""" - - def __init__(self, channels: int) -> None: - super().__init__() - self.pool = nn.AdaptiveAvgPool2d(1) - self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True) - self.act = nn.Sigmoid() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return x * self.act(self.fc(self.pool(x))) - - -class SpatialAttention(nn.Module): - """Spatial-attention module.""" - - def __init__(self, kernel_size=7): - """Initialize Spatial-attention module with kernel size argument.""" - super().__init__() - assert kernel_size in (3, 7), 'kernel size must be 3 or 7' - padding = 3 if kernel_size == 7 else 1 - self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False) - self.act = nn.Sigmoid() - - def forward(self, x): - """Apply channel and spatial attention on input for feature recalibration.""" - return x * self.act(self.cv1(torch.cat([torch.mean(x, 1, keepdim=True), torch.max(x, 1, keepdim=True)[0]], 1))) - - -class CBAM(nn.Module): - """Convolutional Block Attention Module.""" - - def __init__(self, c1, kernel_size=7): # ch_in, kernels - super().__init__() - self.channel_attention = ChannelAttention(c1) - self.spatial_attention = SpatialAttention(kernel_size) - - def forward(self, x): - """Applies the forward pass through C1 module.""" - return self.spatial_attention(self.channel_attention(x)) - - -class C1(nn.Module): - """CSP Bottleneck with 1 convolution.""" - - def __init__(self, c1, c2, n=1): # ch_in, ch_out, number - super().__init__() - self.cv1 = Conv(c1, c2, 1, 1) - self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n))) - - def forward(self, x): - """Applies cross-convolutions to input in the C3 module.""" - y = self.cv1(x) - return self.m(y) + y - - -class C3x(C3): - """C3 module with cross-convolutions.""" - - def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): - """Initialize C3TR instance and set default parameters.""" - super().__init__(c1, c2, n, shortcut, g, e) - self.c_ = int(c2 * e) - self.m = nn.Sequential(*(Bottleneck(self.c_, self.c_, shortcut, g, k=((1, 3), (3, 1)), e=1) for _ in range(n))) - - -class C3TR(C3): - """C3 module with TransformerBlock().""" - - def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): - """Initialize C3Ghost module with GhostBottleneck().""" - super().__init__(c1, c2, n, shortcut, g, e) - c_ = int(c2 * e) - self.m = TransformerBlock(c_, c_, 4, n) - - -class C3Ghost(C3): - """C3 module with GhostBottleneck().""" - - def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): - """Initialize 'SPP' module with various pooling sizes for spatial pyramid pooling.""" - super().__init__(c1, c2, n, shortcut, g, e) - c_ = int(c2 * e) # hidden channels - self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n))) - - -class SPP(nn.Module): - """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729.""" - - def __init__(self, c1, c2, k=(5, 9, 13)): - """Initialize the SPP layer with input/output channels and pooling kernel sizes.""" - super().__init__() - c_ = c1 // 2 # hidden channels - self.cv1 = 
Conv(c1, c_, 1, 1) - self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1) - self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k]) - - def forward(self, x): - """Forward pass of the SPP layer, performing spatial pyramid pooling.""" - x = self.cv1(x) - return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1)) - - -class SPPF(nn.Module): - """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher.""" - - def __init__(self, c1, c2, k=5): # equivalent to SPP(k=(5, 9, 13)) - super().__init__() - c_ = c1 // 2 # hidden channels - self.cv1 = Conv(c1, c_, 1, 1) - self.cv2 = Conv(c_ * 4, c2, 1, 1) - self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) - - def forward(self, x): - """Forward pass through Ghost Convolution block.""" - x = self.cv1(x) - y1 = self.m(x) - y2 = self.m(y1) - return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1)) - - -class Focus(nn.Module): - """Focus wh information into c-space.""" - - def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups - super().__init__() - self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act) - # self.contract = Contract(gain=2) - - def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2) - return self.conv(torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1)) - # return self.conv(self.contract(x)) - - -class GhostConv(nn.Module): - """Ghost Convolution https://github.com/huawei-noah/ghostnet.""" - - def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups - super().__init__() - c_ = c2 // 2 # hidden channels - self.cv1 = Conv(c1, c_, k, s, None, g, act=act) - self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act) - - def forward(self, x): - """Forward propagation through a Ghost Bottleneck layer with skip connection.""" - y = self.cv1(x) - return torch.cat((y, self.cv2(y)), 1) - - -class GhostBottleneck(nn.Module): - """Ghost Bottleneck https://github.com/huawei-noah/ghostnet.""" - - def __init__(self, c1, c2, k=3, s=1): # ch_in, ch_out, kernel, stride - super().__init__() - c_ = c2 // 2 - self.conv = nn.Sequential( - GhostConv(c1, c_, 1, 1), # pw - DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw - GhostConv(c_, c2, 1, 1, act=False)) # pw-linear - self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, - act=False)) if s == 2 else nn.Identity() - - def forward(self, x): - """Applies skip connection and concatenation to input tensor.""" - return self.conv(x) + self.shortcut(x) - - -class Concat(nn.Module): - """Concatenate a list of tensors along dimension.""" - - def __init__(self, dimension=1): - """Concatenates a list of tensors along a specified dimension.""" - super().__init__() - self.d = dimension - - def forward(self, x): - """Forward pass for the YOLOv8 mask Proto module.""" - return torch.cat(x, self.d) - - -class Proto(nn.Module): - """YOLOv8 mask Proto module for segmentation models.""" - - def __init__(self, c1, c_=256, c2=32): # ch_in, number of protos, number of masks - super().__init__() - self.cv1 = Conv(c1, c_, k=3) - self.upsample = nn.ConvTranspose2d(c_, c_, 2, 2, 0, bias=True) # nn.Upsample(scale_factor=2, mode='nearest') - self.cv2 = Conv(c_, c_, k=3) - self.cv3 = Conv(c_, c2) - - def forward(self, x): - """Performs a forward pass through layers using an upsampled input image.""" - return self.cv3(self.cv2(self.upsample(self.cv1(x)))) - - -class Ensemble(nn.ModuleList): - """Ensemble of models.""" - - def 
__init__(self): - """Initialize an ensemble of models.""" - super().__init__() - - def forward(self, x, augment=False, profile=False, visualize=False): - """Function generates the YOLOv5 network's final layer.""" - y = [module(x, augment, profile, visualize)[0] for module in self] - # y = torch.stack(y).max(0)[0] # max ensemble - # y = torch.stack(y).mean(0) # mean ensemble - y = torch.cat(y, 2) # nms ensemble, y shape(B, HW, C) - return y, None # inference, train output - - -# Model heads below ---------------------------------------------------------------------------------------------------- - - -class Detect(nn.Module): - """YOLOv8 Detect head for detection models.""" - dynamic = False # force grid reconstruction - export = False # export mode - shape = None - anchors = torch.empty(0) # init - strides = torch.empty(0) # init - - def __init__(self, nc=80, ch=()): # detection layer - super().__init__() - self.nc = nc # number of classes - self.nl = len(ch) # number of detection layers - self.reg_max = 16 # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x) - self.no = nc + self.reg_max * 4 # number of outputs per anchor - self.stride = torch.zeros(self.nl) # strides computed during build - c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], self.nc) # channels - self.cv2 = nn.ModuleList( - nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch) - self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch) - self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity() - - def forward(self, x): - """Concatenates and returns predicted bounding boxes and class probabilities.""" - shape = x[0].shape # BCHW - for i in range(self.nl): - x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1) - if self.training: - return x - elif self.dynamic or self.shape != shape: - self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5)) - self.shape = shape - - x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2) - if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'): # avoid TF FlexSplitV ops - box = x_cat[:, :self.reg_max * 4] - cls = x_cat[:, self.reg_max * 4:] - else: - box, cls = x_cat.split((self.reg_max * 4, self.nc), 1) - dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides - y = torch.cat((dbox, cls.sigmoid()), 1) - return y if self.export else (y, x) - - def bias_init(self): - """Initialize Detect() biases, WARNING: requires stride availability.""" - m = self # self.model[-1] # Detect() module - # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1 - # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency - for a, b, s in zip(m.cv2, m.cv3, m.stride): # from - a[-1].bias.data[:] = 1.0 # box - b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img) - - -class MLPBlock(nn.Module): - - def __init__( - self, - embedding_dim, - mlp_dim, - act=nn.GELU, - ): - super().__init__() - self.lin1 = nn.Linear(embedding_dim, mlp_dim) - self.lin2 = nn.Linear(mlp_dim, embedding_dim) - self.act = act() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.lin2(self.act(self.lin1(x))) - - -# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa -# Itself from 
https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa -class LayerNorm2d(nn.Module): - - def __init__(self, num_channels, eps=1e-6): - super().__init__() - self.weight = nn.Parameter(torch.ones(num_channels)) - self.bias = nn.Parameter(torch.zeros(num_channels)) - self.eps = eps - - def forward(self, x): - u = x.mean(1, keepdim=True) - s = (x - u).pow(2).mean(1, keepdim=True) - x = (x - u) / torch.sqrt(s + self.eps) - x = self.weight[:, None, None] * x + self.bias[:, None, None] - return x - - -class Segment(Detect): - """YOLOv8 Segment head for segmentation models.""" - - def __init__(self, nc=80, nm=32, npr=256, ch=()): - """Initialize the YOLO model attributes such as the number of masks, prototypes, and the convolution layers.""" - super().__init__(nc, ch) - self.nm = nm # number of masks - self.npr = npr # number of protos - self.proto = Proto(ch[0], self.npr, self.nm) # protos - self.detect = Detect.forward - - c4 = max(ch[0] // 4, self.nm) - self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nm, 1)) for x in ch) - - def forward(self, x): - """Return model outputs and mask coefficients if training, otherwise return outputs and mask coefficients.""" - p = self.proto(x[0]) # mask protos - bs = p.shape[0] # batch size - - mc = torch.cat([self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 2) # mask coefficients - x = self.detect(self, x) - if self.training: - return x, mc, p - return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p)) - - -class Pose(Detect): - """YOLOv8 Pose head for keypoints models.""" - - def __init__(self, nc=80, kpt_shape=(17, 3), ch=()): - """Initialize YOLO network with default parameters and Convolutional Layers.""" - super().__init__(nc, ch) - self.kpt_shape = kpt_shape # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible) - self.nk = kpt_shape[0] * kpt_shape[1] # number of keypoints total - self.detect = Detect.forward - - c4 = max(ch[0] // 4, self.nk) - self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nk, 1)) for x in ch) - - def forward(self, x): - """Perform forward pass through YOLO model and return predictions.""" - bs = x[0].shape[0] # batch size - kpt = torch.cat([self.cv4[i](x[i]).view(bs, self.nk, -1) for i in range(self.nl)], -1) # (bs, 17*3, h*w) - x = self.detect(self, x) - if self.training: - return x, kpt - pred_kpt = self.kpts_decode(bs, kpt) - return torch.cat([x, pred_kpt], 1) if self.export else (torch.cat([x[0], pred_kpt], 1), (x[1], kpt)) - - def kpts_decode(self, bs, kpts): - """Decodes keypoints.""" - ndim = self.kpt_shape[1] - if self.export: # required for TFLite export to avoid 'PLACEHOLDER_FOR_GREATER_OP_CODES' bug - y = kpts.view(bs, *self.kpt_shape, -1) - a = (y[:, :, :2] * 2.0 + (self.anchors - 0.5)) * self.strides - if ndim == 3: - a = torch.cat((a, y[:, :, 1:2].sigmoid()), 2) - return a.view(bs, self.nk, -1) - else: - y = kpts.clone() - if ndim == 3: - y[:, 2::3].sigmoid_() # inplace sigmoid - y[:, 0::ndim] = (y[:, 0::ndim] * 2.0 + (self.anchors[0] - 0.5)) * self.strides - y[:, 1::ndim] = (y[:, 1::ndim] * 2.0 + (self.anchors[1] - 0.5)) * self.strides - return y - - -class Classify(nn.Module): - """YOLOv8 classification head, i.e. 
x(b,c1,20,20) to x(b,c2).""" - - def __init__(self, c1, c2, k=1, s=1, p=None, g=1): # ch_in, ch_out, kernel, stride, padding, groups - super().__init__() - c_ = 1280 # efficientnet_b0 size - self.conv = Conv(c1, c_, k, s, autopad(k, p), g) - self.pool = nn.AdaptiveAvgPool2d(1) # to x(b,c_,1,1) - self.drop = nn.Dropout(p=0.0, inplace=True) - self.linear = nn.Linear(c_, c2) # to x(b,c2) - - def forward(self, x): - """Performs a forward pass of the YOLO model on input image data.""" - if isinstance(x, list): - x = torch.cat(x, 1) - x = self.linear(self.drop(self.pool(self.conv(x)).flatten(1))) - return x if self.training else x.softmax(1) diff --git a/ultralytics/nn/modules/__init__.py b/ultralytics/nn/modules/__init__.py new file mode 100644 index 0000000..b148cbf --- /dev/null +++ b/ultralytics/nn/modules/__init__.py @@ -0,0 +1,17 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from .block import (C1, C2, C3, C3TR, DFL, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost, C3x, GhostBottleneck, + HGBlock, HGStem, Proto, RepC3) +from .conv import (CBAM, ChannelAttention, Concat, Conv, ConvTranspose, DWConv, DWConvTranspose2d, Focus, GhostConv, + LightConv, RepConv, SpatialAttention) +from .head import Classify, Detect, Pose, RTDETRDecoder, Segment +from .transformer import (AIFI, MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer, LayerNorm2d, + MLPBlock, MSDeformAttn, TransformerBlock, TransformerEncoderLayer, TransformerLayer) + +__all__ = [ + 'Conv', 'LightConv', 'RepConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus', 'GhostConv', + 'ChannelAttention', 'SpatialAttention', 'CBAM', 'Concat', 'TransformerLayer', 'TransformerBlock', 'MLPBlock', + 'LayerNorm2d', 'DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', 'C3x', 'C3TR', 'C3Ghost', + 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'Detect', 'Segment', 'Pose', 'Classify', + 'TransformerEncoderLayer', 'RepC3', 'RTDETRDecoder', 'AIFI', 'DeformableTransformerDecoder', + 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP'] diff --git a/ultralytics/nn/modules/block.py b/ultralytics/nn/modules/block.py new file mode 100644 index 0000000..58ff1de --- /dev/null +++ b/ultralytics/nn/modules/block.py @@ -0,0 +1,305 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +""" +Block modules +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .conv import Conv, DWConv, GhostConv, LightConv, RepConv +from .transformer import TransformerBlock + +__all__ = [ + 'DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', 'C3x', 'C3TR', 'C3Ghost', 'GhostBottleneck', + 'Bottleneck', 'BottleneckCSP', 'Proto', 'RepC3'] + + +class DFL(nn.Module): + """ + Integral module of Distribution Focal Loss (DFL). 
+ Proposed in Generalized Focal Loss https://ieeexplore.ieee.org/document/9792391 + """ + + def __init__(self, c1=16): + """Initialize a convolutional layer with a given number of input channels.""" + super().__init__() + self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False) + x = torch.arange(c1, dtype=torch.float) + self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1)) + self.c1 = c1 + + def forward(self, x): + """Applies a transformer layer on input tensor 'x' and returns a tensor.""" + b, c, a = x.shape # batch, channels, anchors + return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a) + # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a) + + +class Proto(nn.Module): + """YOLOv8 mask Proto module for segmentation models.""" + + def __init__(self, c1, c_=256, c2=32): # ch_in, number of protos, number of masks + super().__init__() + self.cv1 = Conv(c1, c_, k=3) + self.upsample = nn.ConvTranspose2d(c_, c_, 2, 2, 0, bias=True) # nn.Upsample(scale_factor=2, mode='nearest') + self.cv2 = Conv(c_, c_, k=3) + self.cv3 = Conv(c_, c2) + + def forward(self, x): + """Performs a forward pass through layers using an upsampled input image.""" + return self.cv3(self.cv2(self.upsample(self.cv1(x)))) + + +class HGStem(nn.Module): + """StemBlock of PPHGNetV2 with 5 convolutions and one maxpool2d. + https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py + """ + + def __init__(self, c1, cm, c2): + super().__init__() + self.stem1 = Conv(c1, cm, 3, 2, act=nn.ReLU()) + self.stem2a = Conv(cm, cm // 2, 2, 1, 0, act=nn.ReLU()) + self.stem2b = Conv(cm // 2, cm, 2, 1, 0, act=nn.ReLU()) + self.stem3 = Conv(cm * 2, cm, 3, 2, act=nn.ReLU()) + self.stem4 = Conv(cm, c2, 1, 1, act=nn.ReLU()) + self.pool = nn.MaxPool2d(kernel_size=2, stride=1, padding=0, ceil_mode=True) + + def forward(self, x): + """Forward pass of a PPHGNetV2 backbone layer.""" + x = self.stem1(x) + x = F.pad(x, [0, 1, 0, 1]) + x2 = self.stem2a(x) + x2 = F.pad(x2, [0, 1, 0, 1]) + x2 = self.stem2b(x2) + x1 = self.pool(x) + x = torch.cat([x1, x2], dim=1) + x = self.stem3(x) + x = self.stem4(x) + return x + + +class HGBlock(nn.Module): + """HG_Block of PPHGNetV2 with 2 convolutions and LightConv. 
+ https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py + """ + + def __init__(self, c1, cm, c2, k=3, n=6, lightconv=False, shortcut=False, act=nn.ReLU()): + super().__init__() + block = LightConv if lightconv else Conv + self.m = nn.ModuleList(block(c1 if i == 0 else cm, cm, k=k, act=act) for i in range(n)) + self.sc = Conv(c1 + n * cm, c2 // 2, 1, 1, act=act) # squeeze conv + self.ec = Conv(c2 // 2, c2, 1, 1, act=act) # excitation conv + self.add = shortcut and c1 == c2 + + def forward(self, x): + """Forward pass of a PPHGNetV2 backbone layer.""" + y = [x] + y.extend(m(y[-1]) for m in self.m) + y = self.ec(self.sc(torch.cat(y, 1))) + return y + x if self.add else y + + +class SPP(nn.Module): + """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729.""" + + def __init__(self, c1, c2, k=(5, 9, 13)): + """Initialize the SPP layer with input/output channels and pooling kernel sizes.""" + super().__init__() + c_ = c1 // 2 # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1) + self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k]) + + def forward(self, x): + """Forward pass of the SPP layer, performing spatial pyramid pooling.""" + x = self.cv1(x) + return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1)) + + +class SPPF(nn.Module): + """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher.""" + + def __init__(self, c1, c2, k=5): # equivalent to SPP(k=(5, 9, 13)) + super().__init__() + c_ = c1 // 2 # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = Conv(c_ * 4, c2, 1, 1) + self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2) + + def forward(self, x): + """Forward pass through Ghost Convolution block.""" + x = self.cv1(x) + y1 = self.m(x) + y2 = self.m(y1) + return self.cv2(torch.cat((x, y1, y2, self.m(y2)), 1)) + + +class C1(nn.Module): + """CSP Bottleneck with 1 convolution.""" + + def __init__(self, c1, c2, n=1): # ch_in, ch_out, number + super().__init__() + self.cv1 = Conv(c1, c2, 1, 1) + self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n))) + + def forward(self, x): + """Applies cross-convolutions to input in the C3 module.""" + y = self.cv1(x) + return self.m(y) + y + + +class C2(nn.Module): + """CSP Bottleneck with 2 convolutions.""" + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + self.c = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, 2 * self.c, 1, 1) + self.cv2 = Conv(2 * self.c, c2, 1) # optional act=FReLU(c2) + # self.attention = ChannelAttention(2 * self.c) # or SpatialAttention() + self.m = nn.Sequential(*(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))) + + def forward(self, x): + """Forward pass through the CSP bottleneck with 2 convolutions.""" + a, b = self.cv1(x).chunk(2, 1) + return self.cv2(torch.cat((self.m(a), b), 1)) + + +class C2f(nn.Module): + """CSP Bottleneck with 2 convolutions.""" + + def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + self.c = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, 2 * self.c, 1, 1) + self.cv2 = Conv((2 + n) * self.c, c2, 1) # optional act=FReLU(c2) + self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n)) + + def forward(self, x): + """Forward pass of a YOLOv5 CSPDarknet backbone 
layer.""" + y = list(self.cv1(x).chunk(2, 1)) + y.extend(m(y[-1]) for m in self.m) + return self.cv2(torch.cat(y, 1)) + + def forward_split(self, x): + """Applies spatial attention to module's input.""" + y = list(self.cv1(x).split((self.c, self.c), 1)) + y.extend(m(y[-1]) for m in self.m) + return self.cv2(torch.cat(y, 1)) + + +class C3(nn.Module): + """CSP Bottleneck with 3 convolutions.""" + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = Conv(c1, c_, 1, 1) + self.cv3 = Conv(2 * c_, c2, 1) # optional act=FReLU(c2) + self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n))) + + def forward(self, x): + """Forward pass through the CSP bottleneck with 2 convolutions.""" + return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1)) + + +class C3x(C3): + """C3 module with cross-convolutions.""" + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): + """Initialize C3TR instance and set default parameters.""" + super().__init__(c1, c2, n, shortcut, g, e) + self.c_ = int(c2 * e) + self.m = nn.Sequential(*(Bottleneck(self.c_, self.c_, shortcut, g, k=((1, 3), (3, 1)), e=1) for _ in range(n))) + + +class RepC3(nn.Module): + """Rep C3.""" + + def __init__(self, c1, c2, n=3, e=1.0): + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c2, 1, 1) + self.cv2 = Conv(c1, c2, 1, 1) + self.m = nn.Sequential(*[RepConv(c_, c_) for _ in range(n)]) + self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity() + + def forward(self, x): + """Forward pass of RT-DETR neck layer.""" + return self.cv3(self.m(self.cv1(x)) + self.cv2(x)) + + +class C3TR(C3): + """C3 module with TransformerBlock().""" + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): + """Initialize C3Ghost module with GhostBottleneck().""" + super().__init__(c1, c2, n, shortcut, g, e) + c_ = int(c2 * e) + self.m = TransformerBlock(c_, c_, 4, n) + + +class C3Ghost(C3): + """C3 module with GhostBottleneck().""" + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): + """Initialize 'SPP' module with various pooling sizes for spatial pyramid pooling.""" + super().__init__(c1, c2, n, shortcut, g, e) + c_ = int(c2 * e) # hidden channels + self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n))) + + +class GhostBottleneck(nn.Module): + """Ghost Bottleneck https://github.com/huawei-noah/ghostnet.""" + + def __init__(self, c1, c2, k=3, s=1): # ch_in, ch_out, kernel, stride + super().__init__() + c_ = c2 // 2 + self.conv = nn.Sequential( + GhostConv(c1, c_, 1, 1), # pw + DWConv(c_, c_, k, s, act=False) if s == 2 else nn.Identity(), # dw + GhostConv(c_, c2, 1, 1, act=False)) # pw-linear + self.shortcut = nn.Sequential(DWConv(c1, c1, k, s, act=False), Conv(c1, c2, 1, 1, + act=False)) if s == 2 else nn.Identity() + + def forward(self, x): + """Applies skip connection and concatenation to input tensor.""" + return self.conv(x) + self.shortcut(x) + + +class Bottleneck(nn.Module): + """Standard bottleneck.""" + + def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5): # ch_in, ch_out, shortcut, groups, kernels, expand + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, k[0], 1) + self.cv2 = Conv(c_, c2, k[1], 1, g=g) + self.add = shortcut and c1 == c2 + + def forward(self, x): + """'forward()' applies the YOLOv5 FPN to input data.""" + 
return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x)) + + +class BottleneckCSP(nn.Module): + """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks.""" + + def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion + super().__init__() + c_ = int(c2 * e) # hidden channels + self.cv1 = Conv(c1, c_, 1, 1) + self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False) + self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False) + self.cv4 = Conv(2 * c_, c2, 1, 1) + self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3) + self.act = nn.SiLU() + self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n))) + + def forward(self, x): + """Applies a CSP bottleneck with 3 convolutions.""" + y1 = self.cv3(self.m(self.cv1(x))) + y2 = self.cv2(x) + return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1)))) diff --git a/ultralytics/nn/modules/conv.py b/ultralytics/nn/modules/conv.py new file mode 100644 index 0000000..4f2836b --- /dev/null +++ b/ultralytics/nn/modules/conv.py @@ -0,0 +1,277 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +""" +Convolution modules +""" + +import math + +import numpy as np +import torch +import torch.nn as nn + +__all__ = [ + 'Conv', 'LightConv', 'DWConv', 'DWConvTranspose2d', 'ConvTranspose', 'Focus', 'GhostConv', 'ChannelAttention', + 'SpatialAttention', 'CBAM', 'Concat', 'RepConv'] + + +def autopad(k, p=None, d=1): # kernel, padding, dilation + """Pad to 'same' shape outputs.""" + if d > 1: + k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k] # actual kernel-size + if p is None: + p = k // 2 if isinstance(k, int) else [x // 2 for x in k] # auto-pad + return p + + +class Conv(nn.Module): + """Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation).""" + default_act = nn.SiLU() # default activation + + def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True): + """Initialize Conv layer with given arguments including activation.""" + super().__init__() + self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False) + self.bn = nn.BatchNorm2d(c2) + self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() + + def forward(self, x): + """Apply convolution, batch normalization and activation to input tensor.""" + return self.act(self.bn(self.conv(x))) + + def forward_fuse(self, x): + """Perform transposed convolution of 2D data.""" + return self.act(self.conv(x)) + + +class LightConv(nn.Module): + """Light convolution with args(ch_in, ch_out, kernel). 
+ https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py + """ + + def __init__(self, c1, c2, k=1, act=nn.ReLU()): + """Initialize Conv layer with given arguments including activation.""" + super().__init__() + self.conv1 = Conv(c1, c2, 1, act=False) + self.conv2 = DWConv(c2, c2, k, act=act) + + def forward(self, x): + """Apply 2 convolutions to input tensor.""" + return self.conv2(self.conv1(x)) + + +class DWConv(Conv): + """Depth-wise convolution.""" + + def __init__(self, c1, c2, k=1, s=1, d=1, act=True): # ch_in, ch_out, kernel, stride, dilation, activation + super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), d=d, act=act) + + +class DWConvTranspose2d(nn.ConvTranspose2d): + """Depth-wise transpose convolution.""" + + def __init__(self, c1, c2, k=1, s=1, p1=0, p2=0): # ch_in, ch_out, kernel, stride, padding, padding_out + super().__init__(c1, c2, k, s, p1, p2, groups=math.gcd(c1, c2)) + + +class ConvTranspose(nn.Module): + """Convolution transpose 2d layer.""" + default_act = nn.SiLU() # default activation + + def __init__(self, c1, c2, k=2, s=2, p=0, bn=True, act=True): + """Initialize ConvTranspose2d layer with batch normalization and activation function.""" + super().__init__() + self.conv_transpose = nn.ConvTranspose2d(c1, c2, k, s, p, bias=not bn) + self.bn = nn.BatchNorm2d(c2) if bn else nn.Identity() + self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() + + def forward(self, x): + """Applies transposed convolutions, batch normalization and activation to input.""" + return self.act(self.bn(self.conv_transpose(x))) + + def forward_fuse(self, x): + """Applies activation and convolution transpose operation to input.""" + return self.act(self.conv_transpose(x)) + + +class Focus(nn.Module): + """Focus wh information into c-space.""" + + def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True): # ch_in, ch_out, kernel, stride, padding, groups + super().__init__() + self.conv = Conv(c1 * 4, c2, k, s, p, g, act=act) + # self.contract = Contract(gain=2) + + def forward(self, x): # x(b,c,w,h) -> y(b,4c,w/2,h/2) + return self.conv(torch.cat((x[..., ::2, ::2], x[..., 1::2, ::2], x[..., ::2, 1::2], x[..., 1::2, 1::2]), 1)) + # return self.conv(self.contract(x)) + + +class GhostConv(nn.Module): + """Ghost Convolution https://github.com/huawei-noah/ghostnet.""" + + def __init__(self, c1, c2, k=1, s=1, g=1, act=True): # ch_in, ch_out, kernel, stride, groups + super().__init__() + c_ = c2 // 2 # hidden channels + self.cv1 = Conv(c1, c_, k, s, None, g, act=act) + self.cv2 = Conv(c_, c_, 5, 1, None, c_, act=act) + + def forward(self, x): + """Forward propagation through a Ghost Bottleneck layer with skip connection.""" + y = self.cv1(x) + return torch.cat((y, self.cv2(y)), 1) + + +class RepConv(nn.Module): + """RepConv is a basic rep-style block, including training and deploy status + This code is based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py + """ + default_act = nn.SiLU() # default activation + + def __init__(self, c1, c2, k=3, s=1, p=1, g=1, d=1, act=True, bn=False, deploy=False): + super().__init__() + assert k == 3 and p == 1 + self.g = g + self.c1 = c1 + self.c2 = c2 + self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity() + + self.bn = nn.BatchNorm2d(num_features=c1) if bn and c2 == c1 and s == 1 else None + self.conv1 = Conv(c1, c2, k, s, p=p, g=g, act=False) + self.conv2 = Conv(c1, c2, 1, s, p=(p - k // 2), g=g, act=False) + + def 
forward_fuse(self, x): + """Forward process""" + return self.act(self.conv(x)) + + def forward(self, x): + """Forward process""" + id_out = 0 if self.bn is None else self.bn(x) + return self.act(self.conv1(x) + self.conv2(x) + id_out) + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) + kernelid, biasid = self._fuse_bn_tensor(self.bn) + return kernel3x3 + self._pad_1x1_to_3x3_tensor(kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _avg_to_3x3_tensor(self, avgp): + channels = self.c1 + groups = self.g + kernel_size = avgp.kernel_size + input_dim = channels // groups + k = torch.zeros((channels, input_dim, kernel_size, kernel_size)) + k[np.arange(channels), np.tile(np.arange(input_dim), groups), :, :] = 1.0 / kernel_size ** 2 + return k + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return torch.nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + if isinstance(branch, Conv): + kernel = branch.conv.weight + running_mean = branch.bn.running_mean + running_var = branch.bn.running_var + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn.eps + elif isinstance(branch, nn.BatchNorm2d): + if not hasattr(self, 'id_tensor'): + input_dim = self.c1 // self.g + kernel_value = np.zeros((self.c1, input_dim, 3, 3), dtype=np.float32) + for i in range(self.c1): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device) + kernel = self.id_tensor + running_mean = branch.running_mean + running_var = branch.running_var + gamma = branch.weight + beta = branch.bias + eps = branch.eps + std = (running_var + eps).sqrt() + t = (gamma / std).reshape(-1, 1, 1, 1) + return kernel * t, beta - running_mean * gamma / std + + def fuse_convs(self): + if hasattr(self, 'conv'): + return + kernel, bias = self.get_equivalent_kernel_bias() + self.conv = nn.Conv2d(in_channels=self.conv1.conv.in_channels, + out_channels=self.conv1.conv.out_channels, + kernel_size=self.conv1.conv.kernel_size, + stride=self.conv1.conv.stride, + padding=self.conv1.conv.padding, + dilation=self.conv1.conv.dilation, + groups=self.conv1.conv.groups, + bias=True).requires_grad_(False) + self.conv.weight.data = kernel + self.conv.bias.data = bias + for para in self.parameters(): + para.detach_() + self.__delattr__('conv1') + self.__delattr__('conv2') + if hasattr(self, 'nm'): + self.__delattr__('nm') + if hasattr(self, 'bn'): + self.__delattr__('bn') + if hasattr(self, 'id_tensor'): + self.__delattr__('id_tensor') + + +class ChannelAttention(nn.Module): + """Channel-attention module https://github.com/open-mmlab/mmdetection/tree/v3.0.0rc1/configs/rtmdet.""" + + def __init__(self, channels: int) -> None: + super().__init__() + self.pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Conv2d(channels, channels, 1, 1, 0, bias=True) + self.act = nn.Sigmoid() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x * self.act(self.fc(self.pool(x))) + + +class SpatialAttention(nn.Module): + """Spatial-attention module.""" + + def __init__(self, kernel_size=7): + """Initialize Spatial-attention module with kernel size argument.""" + super().__init__() + assert kernel_size in (3, 7), 'kernel size must be 3 or 7' + padding = 3 if kernel_size == 7 else 1 + self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False) + self.act = nn.Sigmoid() + + def 
forward(self, x): + """Apply channel and spatial attention on input for feature recalibration.""" + return x * self.act(self.cv1(torch.cat([torch.mean(x, 1, keepdim=True), torch.max(x, 1, keepdim=True)[0]], 1))) + + +class CBAM(nn.Module): + """Convolutional Block Attention Module.""" + + def __init__(self, c1, kernel_size=7): # ch_in, kernels + super().__init__() + self.channel_attention = ChannelAttention(c1) + self.spatial_attention = SpatialAttention(kernel_size) + + def forward(self, x): + """Applies the forward pass through C1 module.""" + return self.spatial_attention(self.channel_attention(x)) + + +class Concat(nn.Module): + """Concatenate a list of tensors along dimension.""" + + def __init__(self, dimension=1): + """Concatenates a list of tensors along a specified dimension.""" + super().__init__() + self.d = dimension + + def forward(self, x): + """Forward pass for the YOLOv8 mask Proto module.""" + return torch.cat(x, self.d) diff --git a/ultralytics/nn/modules/head.py b/ultralytics/nn/modules/head.py new file mode 100644 index 0000000..3bd13cd --- /dev/null +++ b/ultralytics/nn/modules/head.py @@ -0,0 +1,382 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +""" +Model head modules +""" + +import math + +import torch +import torch.nn as nn +from torch.nn.init import constant_, xavier_uniform_ + +from ultralytics.yolo.utils.tal import dist2bbox, make_anchors + +from .block import DFL, Proto +from .conv import Conv +from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer +from .utils import bias_init_with_prob, linear_init_ + +__all__ = ['Detect', 'Segment', 'Pose', 'Classify', 'RTDETRDecoder'] + + +class Detect(nn.Module): + """YOLOv8 Detect head for detection models.""" + dynamic = False # force grid reconstruction + export = False # export mode + shape = None + anchors = torch.empty(0) # init + strides = torch.empty(0) # init + + def __init__(self, nc=80, ch=()): # detection layer + super().__init__() + self.nc = nc # number of classes + self.nl = len(ch) # number of detection layers + self.reg_max = 16 # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x) + self.no = nc + self.reg_max * 4 # number of outputs per anchor + self.stride = torch.zeros(self.nl) # strides computed during build + c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], self.nc) # channels + self.cv2 = nn.ModuleList( + nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch) + self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch) + self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity() + + def forward(self, x): + """Concatenates and returns predicted bounding boxes and class probabilities.""" + shape = x[0].shape # BCHW + for i in range(self.nl): + x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1) + if self.training: + return x + elif self.dynamic or self.shape != shape: + self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5)) + self.shape = shape + + x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2) + if self.export and self.format in ('saved_model', 'pb', 'tflite', 'edgetpu', 'tfjs'): # avoid TF FlexSplitV ops + box = x_cat[:, :self.reg_max * 4] + cls = x_cat[:, self.reg_max * 4:] + else: + box, cls = x_cat.split((self.reg_max * 4, self.nc), 1) + dbox = dist2bbox(self.dfl(box), self.anchors.unsqueeze(0), xywh=True, dim=1) * self.strides + y = torch.cat((dbox, cls.sigmoid()), 1) + 
return y if self.export else (y, x) + + def bias_init(self): + """Initialize Detect() biases, WARNING: requires stride availability.""" + m = self # self.model[-1] # Detect() module + # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1 + # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency + for a, b, s in zip(m.cv2, m.cv3, m.stride): # from + a[-1].bias.data[:] = 1.0 # box + b[-1].bias.data[:m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img) + + +class Segment(Detect): + """YOLOv8 Segment head for segmentation models.""" + + def __init__(self, nc=80, nm=32, npr=256, ch=()): + """Initialize the YOLO model attributes such as the number of masks, prototypes, and the convolution layers.""" + super().__init__(nc, ch) + self.nm = nm # number of masks + self.npr = npr # number of protos + self.proto = Proto(ch[0], self.npr, self.nm) # protos + self.detect = Detect.forward + + c4 = max(ch[0] // 4, self.nm) + self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nm, 1)) for x in ch) + + def forward(self, x): + """Return model outputs and mask coefficients if training, otherwise return outputs and mask coefficients.""" + p = self.proto(x[0]) # mask protos + bs = p.shape[0] # batch size + + mc = torch.cat([self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 2) # mask coefficients + x = self.detect(self, x) + if self.training: + return x, mc, p + return (torch.cat([x, mc], 1), p) if self.export else (torch.cat([x[0], mc], 1), (x[1], mc, p)) + + +class Pose(Detect): + """YOLOv8 Pose head for keypoints models.""" + + def __init__(self, nc=80, kpt_shape=(17, 3), ch=()): + """Initialize YOLO network with default parameters and Convolutional Layers.""" + super().__init__(nc, ch) + self.kpt_shape = kpt_shape # number of keypoints, number of dims (2 for x,y or 3 for x,y,visible) + self.nk = kpt_shape[0] * kpt_shape[1] # number of keypoints total + self.detect = Detect.forward + + c4 = max(ch[0] // 4, self.nk) + self.cv4 = nn.ModuleList(nn.Sequential(Conv(x, c4, 3), Conv(c4, c4, 3), nn.Conv2d(c4, self.nk, 1)) for x in ch) + + def forward(self, x): + """Perform forward pass through YOLO model and return predictions.""" + bs = x[0].shape[0] # batch size + kpt = torch.cat([self.cv4[i](x[i]).view(bs, self.nk, -1) for i in range(self.nl)], -1) # (bs, 17*3, h*w) + x = self.detect(self, x) + if self.training: + return x, kpt + pred_kpt = self.kpts_decode(bs, kpt) + return torch.cat([x, pred_kpt], 1) if self.export else (torch.cat([x[0], pred_kpt], 1), (x[1], kpt)) + + def kpts_decode(self, bs, kpts): + """Decodes keypoints.""" + ndim = self.kpt_shape[1] + if self.export: # required for TFLite export to avoid 'PLACEHOLDER_FOR_GREATER_OP_CODES' bug + y = kpts.view(bs, *self.kpt_shape, -1) + a = (y[:, :, :2] * 2.0 + (self.anchors - 0.5)) * self.strides + if ndim == 3: + a = torch.cat((a, y[:, :, 1:2].sigmoid()), 2) + return a.view(bs, self.nk, -1) + else: + y = kpts.clone() + if ndim == 3: + y[:, 2::3].sigmoid_() # inplace sigmoid + y[:, 0::ndim] = (y[:, 0::ndim] * 2.0 + (self.anchors[0] - 0.5)) * self.strides + y[:, 1::ndim] = (y[:, 1::ndim] * 2.0 + (self.anchors[1] - 0.5)) * self.strides + return y + + +class Classify(nn.Module): + """YOLOv8 classification head, i.e. 
x(b,c1,20,20) to x(b,c2).""" + + def __init__(self, c1, c2, k=1, s=1, p=None, g=1): # ch_in, ch_out, kernel, stride, padding, groups + super().__init__() + c_ = 1280 # efficientnet_b0 size + self.conv = Conv(c1, c_, k, s, p, g) + self.pool = nn.AdaptiveAvgPool2d(1) # to x(b,c_,1,1) + self.drop = nn.Dropout(p=0.0, inplace=True) + self.linear = nn.Linear(c_, c2) # to x(b,c2) + + def forward(self, x): + """Performs a forward pass of the YOLO model on input image data.""" + if isinstance(x, list): + x = torch.cat(x, 1) + x = self.linear(self.drop(self.pool(self.conv(x)).flatten(1))) + return x if self.training else x.softmax(1) + + +class RTDETRDecoder(nn.Module): + + def __init__( + self, + nc=80, + ch=(512, 1024, 2048), + hidden_dim=256, + num_queries=300, + strides=(8, 16, 32), # TODO + nl=3, + num_decoder_points=4, + nhead=8, + num_decoder_layers=6, + dim_feedforward=1024, + dropout=0., + act=nn.ReLU(), + eval_idx=-1, + # training args + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, + learnt_init_query=False): + super().__init__() + assert len(ch) <= nl + assert len(strides) == len(ch) + for _ in range(nl - len(strides)): + strides.append(strides[-1] * 2) + + self.hidden_dim = hidden_dim + self.nhead = nhead + self.feat_strides = strides + self.nl = nl + self.nc = nc + self.num_queries = num_queries + self.num_decoder_layers = num_decoder_layers + + # backbone feature projection + self._build_input_proj_layer(ch) + + # Transformer module + decoder_layer = DeformableTransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout, act, nl, + num_decoder_points) + self.decoder = DeformableTransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx) + + # denoising part + self.denoising_class_embed = nn.Embedding(nc, hidden_dim) + self.num_denoising = num_denoising + self.label_noise_ratio = label_noise_ratio + self.box_noise_scale = box_noise_scale + + # decoder embedding + self.learnt_init_query = learnt_init_query + if learnt_init_query: + self.tgt_embed = nn.Embedding(num_queries, hidden_dim) + self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) + + # encoder head + self.enc_output = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim)) + self.enc_score_head = nn.Linear(hidden_dim, nc) + self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) + + # decoder head + self.dec_score_head = nn.ModuleList([nn.Linear(hidden_dim, nc) for _ in range(num_decoder_layers)]) + self.dec_bbox_head = nn.ModuleList([ + MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers)]) + + self._reset_parameters() + + def forward(self, feats, gt_meta=None): + # input projection and embedding + memory, spatial_shapes, _ = self._get_encoder_input(feats) + + # prepare denoising training + if self.training: + raise NotImplementedError + # denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ + # get_contrastive_denoising_training_group(gt_meta, + # self.num_classes, + # self.num_queries, + # self.denoising_class_embed.weight, + # self.num_denoising, + # self.label_noise_ratio, + # self.box_noise_scale) + else: + denoising_class, denoising_bbox_unact, attn_mask = None, None, None + + target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ + self._get_decoder_input(memory, spatial_shapes, denoising_class, denoising_bbox_unact) + + # decoder + out_bboxes, out_logits = self.decoder(target, + init_ref_points_unact, + memory, + spatial_shapes, + self.dec_bbox_head, + self.dec_score_head, + 
self.query_pos_head, + attn_mask=attn_mask) + if not self.training: + out_logits = out_logits.sigmoid_() + return out_bboxes, out_logits # enc_topk_bboxes, enc_topk_logits, dn_meta + + def _reset_parameters(self): + # class and bbox head init + bias_cls = bias_init_with_prob(0.01) + linear_init_(self.enc_score_head) + constant_(self.enc_score_head.bias, bias_cls) + constant_(self.enc_bbox_head.layers[-1].weight, 0.) + constant_(self.enc_bbox_head.layers[-1].bias, 0.) + for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): + linear_init_(cls_) + constant_(cls_.bias, bias_cls) + constant_(reg_.layers[-1].weight, 0.) + constant_(reg_.layers[-1].bias, 0.) + + linear_init_(self.enc_output[0]) + xavier_uniform_(self.enc_output[0].weight) + if self.learnt_init_query: + xavier_uniform_(self.tgt_embed.weight) + xavier_uniform_(self.query_pos_head.layers[0].weight) + xavier_uniform_(self.query_pos_head.layers[1].weight) + for layer in self.input_proj: + xavier_uniform_(layer[0].weight) + + def _build_input_proj_layer(self, ch): + self.input_proj = nn.ModuleList() + for in_channels in ch: + self.input_proj.append( + nn.Sequential(nn.Conv2d(in_channels, self.hidden_dim, kernel_size=1, bias=False), + nn.BatchNorm2d(self.hidden_dim))) + in_channels = ch[-1] + for _ in range(self.nl - len(ch)): + self.input_proj.append( + nn.Sequential(nn.Conv2D(in_channels, self.hidden_dim, kernel_size=3, stride=2, padding=1, bias=False), + nn.BatchNorm2d(self.hidden_dim))) + in_channels = self.hidden_dim + + def _generate_anchors(self, spatial_shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2): + anchors = [] + for lvl, (h, w) in enumerate(spatial_shapes): + grid_y, grid_x = torch.meshgrid(torch.arange(end=h, dtype=torch.float32), + torch.arange(end=w, dtype=torch.float32), + indexing='ij') + grid_xy = torch.stack([grid_x, grid_y], -1) + + valid_WH = torch.tensor([h, w]).to(torch.float32) + grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH + wh = torch.ones_like(grid_xy) * grid_size * (2.0 ** lvl) + anchors.append(torch.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) + + anchors = torch.concat(anchors, 1) + valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True) + anchors = torch.log(anchors / (1 - anchors)) + anchors = torch.where(valid_mask, anchors, torch.inf) + return anchors.to(device=device, dtype=dtype), valid_mask.to(device=device) + + def _get_encoder_input(self, feats): + # get projection features + proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] + if self.nl > len(proj_feats): + len_srcs = len(proj_feats) + for i in range(len_srcs, self.nl): + if i == len_srcs: + proj_feats.append(self.input_proj[i](feats[-1])) + else: + proj_feats.append(self.input_proj[i](proj_feats[-1])) + + # get encoder inputs + feat_flatten = [] + spatial_shapes = [] + level_start_index = [0] + for feat in proj_feats: + _, _, h, w = feat.shape + # [b, c, h, w] -> [b, h*w, c] + feat_flatten.append(feat.flatten(2).permute(0, 2, 1)) + # [nl, 2] + spatial_shapes.append([h, w]) + # [l], start index of each level + level_start_index.append(h * w + level_start_index[-1]) + + # [b, l, c] + feat_flatten = torch.concat(feat_flatten, 1) + level_start_index.pop() + return feat_flatten, spatial_shapes, level_start_index + + def _get_decoder_input(self, memory, spatial_shapes, denoising_class=None, denoising_bbox_unact=None): + bs, _, _ = memory.shape + # prepare input for decoder + anchors, valid_mask = self._generate_anchors(spatial_shapes, dtype=memory.dtype, 
device=memory.device) + memory = torch.where(valid_mask, memory, torch.tensor(0.)) + output_memory = self.enc_output(memory) + + enc_outputs_class = self.enc_score_head(output_memory) # (bs, h*w, nc) + enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors # (bs, h*w, 4) + + # (bs, topk) + _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.num_queries, dim=1) + # extract region proposal boxes + # (bs, topk_ind) + batch_ind = torch.arange(end=bs, dtype=topk_ind.dtype).unsqueeze(-1).repeat(1, self.num_queries).view(-1) + topk_ind = topk_ind.view(-1) + + # Unsigmoided + reference_points_unact = enc_outputs_coord_unact[batch_ind, topk_ind].view(bs, self.num_queries, -1) + + enc_topk_bboxes = torch.sigmoid(reference_points_unact) + if denoising_bbox_unact is not None: + reference_points_unact = torch.concat([denoising_bbox_unact, reference_points_unact], 1) + if self.training: + reference_points_unact = reference_points_unact.detach() + enc_topk_logits = enc_outputs_class[batch_ind, topk_ind].view(bs, self.num_queries, -1) + + # extract region features + if self.learnt_init_query: + target = self.tgt_embed.weight.unsqueeze(0).repeat(bs, 1, 1) + else: + target = output_memory[batch_ind, topk_ind].view(bs, self.num_queries, -1) + if self.training: + target = target.detach() + if denoising_class is not None: + target = torch.concat([denoising_class, target], 1) + + return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits diff --git a/ultralytics/nn/modules/transformer.py b/ultralytics/nn/modules/transformer.py new file mode 100644 index 0000000..4ae946a --- /dev/null +++ b/ultralytics/nn/modules/transformer.py @@ -0,0 +1,390 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +""" +Transformer modules +""" + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.init import constant_, xavier_uniform_ + +from .conv import Conv +from .utils import _get_clones, inverse_sigmoid, multi_scale_deformable_attn_pytorch + +__all__ = [ + 'TransformerEncoderLayer', 'TransformerLayer', 'TransformerBlock', 'MLPBlock', 'LayerNorm2d', 'AIFI', + 'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP'] + + +class TransformerEncoderLayer(nn.Module): + """Transformer Encoder.""" + + def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False): + super().__init__() + self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True) + # Implementation of Feedforward model + self.fc1 = nn.Linear(c1, cm) + self.fc2 = nn.Linear(cm, c1) + + self.norm1 = nn.LayerNorm(c1) + self.norm2 = nn.LayerNorm(c1) + self.dropout = nn.Dropout(dropout) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.act = act + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos=None): + """Add position embeddings if given.""" + return tensor if pos is None else tensor + pos + + def forward_post(self, src, src_mask=None, src_key_padding_mask=None, pos=None): + q = k = self.with_pos_embed(src, pos) + src2 = self.ma(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.fc2(self.dropout(self.act(self.fc1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre(self, src, src_mask=None, src_key_padding_mask=None, pos=None): + src2 = self.norm1(src) + q = k = self.with_pos_embed(src2, pos) + 
src2 = self.ma(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0] + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.fc2(self.dropout(self.act(self.fc1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward(self, src, src_mask=None, src_key_padding_mask=None, pos=None): + """Forward propagates the input through the encoder module.""" + if self.normalize_before: + return self.forward_pre(src, src_mask, src_key_padding_mask, pos) + return self.forward_post(src, src_mask, src_key_padding_mask, pos) + + +class AIFI(TransformerEncoderLayer): + + def __init__(self, c1, cm=2048, num_heads=8, dropout=0, act=nn.GELU(), normalize_before=False): + super().__init__(c1, cm, num_heads, dropout, act, normalize_before) + + def forward(self, x): + c, h, w = x.shape[1:] + pos_embed = self.build_2d_sincos_position_embedding(w, h, c) + # flatten [B, C, H, W] to [B, HxW, C] + x = super().forward(x.flatten(2).permute(0, 2, 1), pos=pos_embed.to(device=x.device, dtype=x.dtype)) + return x.permute((0, 2, 1)).view([-1, c, h, w]) + + @staticmethod + def build_2d_sincos_position_embedding(w, h, embed_dim=256, temperature=10000.): + grid_w = torch.arange(int(w), dtype=torch.float32) + grid_h = torch.arange(int(h), dtype=torch.float32) + grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing='ij') + assert embed_dim % 4 == 0, \ + 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = embed_dim // 4 + omega = torch.arange(pos_dim, dtype=torch.float32) / pos_dim + omega = 1. / (temperature ** omega) + + out_w = grid_w.flatten()[..., None] @ omega[None] + out_h = grid_h.flatten()[..., None] @ omega[None] + + return torch.concat([torch.sin(out_w), torch.cos(out_w), + torch.sin(out_h), torch.cos(out_h)], axis=1)[None, :, :] + + +class TransformerLayer(nn.Module): + """Transformer layer https://arxiv.org/abs/2010.11929 (LayerNorm layers removed for better performance).""" + + def __init__(self, c, num_heads): + """Initializes a self-attention mechanism using linear transformations and multi-head attention.""" + super().__init__() + self.q = nn.Linear(c, c, bias=False) + self.k = nn.Linear(c, c, bias=False) + self.v = nn.Linear(c, c, bias=False) + self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads) + self.fc1 = nn.Linear(c, c, bias=False) + self.fc2 = nn.Linear(c, c, bias=False) + + def forward(self, x): + """Apply a transformer block to the input x and return the output.""" + x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x + x = self.fc2(self.fc1(x)) + x + return x + + +class TransformerBlock(nn.Module): + """Vision Transformer https://arxiv.org/abs/2010.11929.""" + + def __init__(self, c1, c2, num_heads, num_layers): + """Initialize a Transformer module with position embedding and specified number of heads and layers.""" + super().__init__() + self.conv = None + if c1 != c2: + self.conv = Conv(c1, c2) + self.linear = nn.Linear(c2, c2) # learnable position embedding + self.tr = nn.Sequential(*(TransformerLayer(c2, num_heads) for _ in range(num_layers))) + self.c2 = c2 + + def forward(self, x): + """Forward propagates the input through the bottleneck module.""" + if self.conv is not None: + x = self.conv(x) + b, _, w, h = x.shape + p = x.flatten(2).permute(2, 0, 1) + return self.tr(p + self.linear(p)).permute(1, 2, 0).reshape(b, self.c2, w, h) + + +class MLPBlock(nn.Module): + + def __init__(self, embedding_dim, mlp_dim, act=nn.GELU): + super().__init__() + self.lin1 = nn.Linear(embedding_dim, mlp_dim) + 
self.lin2 = nn.Linear(mlp_dim, embedding_dim) + self.act = act() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.lin2(self.act(self.lin1(x))) + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa +# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa +class LayerNorm2d(nn.Module): + + def __init__(self, num_channels, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(num_channels)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.eps = eps + + def forward(self, x): + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class MSDeformAttn(nn.Module): + """ + Original Multi-Scale Deformable Attention Module. + https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py + """ + + def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): + super().__init__() + if d_model % n_heads != 0: + raise ValueError(f'd_model must be divisible by n_heads, but got {d_model} and {n_heads}') + _d_per_head = d_model // n_heads + # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation + assert _d_per_head * n_heads == d_model, '`d_model` must be divisible by `n_heads`' + + self.im2col_step = 64 + + self.d_model = d_model + self.n_levels = n_levels + self.n_heads = n_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) + self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) + self.value_proj = nn.Linear(d_model, d_model) + self.output_proj = nn.Linear(d_model, d_model) + + self._reset_parameters() + + def _reset_parameters(self): + constant_(self.sampling_offsets.weight.data, 0.) + thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat( + 1, self.n_levels, self.n_points, 1) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + constant_(self.attention_weights.weight.data, 0.) + constant_(self.attention_weights.bias.data, 0.) + xavier_uniform_(self.value_proj.weight.data) + constant_(self.value_proj.bias.data, 0.) + xavier_uniform_(self.output_proj.weight.data) + constant_(self.output_proj.bias.data, 0.) 
+ + def forward(self, query, reference_points, value, value_spatial_shapes, value_mask=None): + """ + https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py + Args: + query (Tensor): [bs, query_length, C] + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area + value (Tensor): [bs, value_length, C] + value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements + + Returns: + output (Tensor): [bs, Length_{query}, C] + """ + bs, len_q = query.shape[:2] + _, len_v = value.shape[:2] + assert sum(s[0] * s[1] for s in value_spatial_shapes) == len_v + + value = self.value_proj(value) + if value_mask is not None: + value = value.masked_fill(value_mask[..., None], float(0)) + value = value.view(bs, len_v, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(query).view(bs, len_q, self.n_heads, self.n_levels, self.n_points, 2) + attention_weights = self.attention_weights(query).view(bs, len_q, self.n_heads, self.n_levels * self.n_points) + attention_weights = F.softmax(attention_weights, -1).view(bs, len_q, self.n_heads, self.n_levels, self.n_points) + # N, Len_q, n_heads, n_levels, n_points, 2 + n = reference_points.shape[-1] + if n == 2: + offset_normalizer = torch.as_tensor(value_spatial_shapes, dtype=query.dtype, device=query.device).flip(-1) + add = sampling_offsets / offset_normalizer[None, None, None, :, None, :] + sampling_locations = reference_points[:, :, None, :, None, :] + add + + elif n == 4: + add = sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + sampling_locations = reference_points[:, :, None, :, None, :2] + add + else: + raise ValueError(f'Last dim of reference_points must be 2 or 4, but got {n}.') + output = multi_scale_deformable_attn_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights) + output = self.output_proj(output) + return output + + +class DeformableTransformerDecoderLayer(nn.Module): + """ + https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py + https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/deformable_transformer.py + """ + + def __init__(self, d_model=256, n_heads=8, d_ffn=1024, dropout=0., act=nn.ReLU(), n_levels=4, n_points=4): + super().__init__() + + # self attention + self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # cross attention + self.cross_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) + self.dropout2 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.act = act + self.dropout3 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout4 = nn.Dropout(dropout) + self.norm3 = nn.LayerNorm(d_model) + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + tgt2 = self.linear2(self.dropout3(self.act(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward(self, + tgt, + reference_points, + src, + src_spatial_shapes, + src_padding_mask=None, + attn_mask=None, + query_pos=None): + # self 
attention + q = k = self.with_pos_embed(tgt, query_pos) + if attn_mask is not None: + attn_mask = torch.where(attn_mask.astype('bool'), torch.zeros(attn_mask.shape, tgt.dtype), + torch.full(attn_mask.shape, float('-inf'), tgt.dtype)) + tgt2 = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1))[0].transpose(0, 1) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # cross attention + tgt2 = self.cross_attn(self.with_pos_embed(tgt, query_pos), reference_points, src, src_spatial_shapes, + src_padding_mask) + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + # ffn + tgt = self.forward_ffn(tgt) + + return tgt + + +class DeformableTransformerDecoder(nn.Module): + """ + https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/transformers/deformable_transformer.py + """ + + def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1): + super().__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.hidden_dim = hidden_dim + self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx + + def forward(self, + tgt, + reference_points, + src, + src_spatial_shapes, + bbox_head, + score_head, + query_pos_head, + attn_mask=None, + src_padding_mask=None): + output = tgt + dec_out_bboxes = [] + dec_out_logits = [] + ref_points = None + ref_points_detach = torch.sigmoid(reference_points) + for i, layer in enumerate(self.layers): + ref_points_input = ref_points_detach.unsqueeze(2) + query_pos_embed = query_pos_head(ref_points_detach) + output = layer(output, ref_points_input, src, src_spatial_shapes, src_padding_mask, attn_mask, + query_pos_embed) + + inter_ref_bbox = torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points_detach)) + + if self.training: + dec_out_logits.append(score_head[i](output)) + if i == 0: + dec_out_bboxes.append(inter_ref_bbox) + else: + dec_out_bboxes.append(torch.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points))) + elif i == self.eval_idx: + dec_out_logits.append(score_head[i](output)) + dec_out_bboxes.append(inter_ref_bbox) + break + + ref_points = inter_ref_bbox + ref_points_detach = inter_ref_bbox.detach() if self.training else inter_ref_bbox + + return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits) diff --git a/ultralytics/nn/modules/utils.py b/ultralytics/nn/modules/utils.py new file mode 100644 index 0000000..35cf30c --- /dev/null +++ b/ultralytics/nn/modules/utils.py @@ -0,0 +1,78 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +""" +Module utils +""" + +import copy +import math + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.init import uniform_ + +__all__ = ['multi_scale_deformable_attn_pytorch', 'inverse_sigmoid'] + + +def _get_clones(module, n): + return nn.ModuleList([copy.deepcopy(module) for _ in range(n)]) + + +def bias_init_with_prob(prior_prob=0.01): + """initialize conv/fc bias value according to a given probability value.""" + return float(-np.log((1 - prior_prob) / prior_prob)) # return bias_init + + +def linear_init_(module): + bound = 1 / math.sqrt(module.weight.shape[0]) + uniform_(module.weight, -bound, bound) + if hasattr(module, 'bias') and module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def inverse_sigmoid(x, eps=1e-5): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + + +def multi_scale_deformable_attn_pytorch(value: torch.Tensor, value_spatial_shapes: torch.Tensor, 
+ sampling_locations: torch.Tensor, + attention_weights: torch.Tensor) -> torch.Tensor: + """ + Multi-scale deformable attention. + https://github.com/IDEA-Research/detrex/blob/main/detrex/layers/multi_scale_deform_attn.py + """ + + bs, _, num_heads, embed_dims = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (H_, W_) in enumerate(value_spatial_shapes): + # bs, H_*W_, num_heads, embed_dims -> + # bs, H_*W_, num_heads*embed_dims -> + # bs, num_heads*embed_dims, H_*W_ -> + # bs*num_heads, embed_dims, H_, W_ + value_l_ = (value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_)) + # bs, num_queries, num_heads, num_points, 2 -> + # bs, num_heads, num_queries, num_points, 2 -> + # bs*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1) + # bs*num_heads, embed_dims, num_queries, num_points + sampling_value_l_ = F.grid_sample(value_l_, + sampling_grid_l_, + mode='bilinear', + padding_mode='zeros', + align_corners=False) + sampling_value_list.append(sampling_value_l_) + # (bs, num_queries, num_heads, num_levels, num_points) -> + # (bs, num_heads, num_queries, num_levels, num_points) -> + # (bs, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape(bs * num_heads, 1, num_queries, + num_levels * num_points) + output = ((torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view( + bs, num_heads * embed_dims, num_queries)) + return output.transpose(1, 2).contiguous() diff --git a/ultralytics/nn/tasks.py b/ultralytics/nn/tasks.py index b4d6050..c174272 100644 --- a/ultralytics/nn/tasks.py +++ b/ultralytics/nn/tasks.py @@ -8,9 +8,10 @@ import thop import torch import torch.nn as nn -from ultralytics.nn.modules import (C1, C2, C3, C3TR, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost, C3x, Classify, - Concat, Conv, ConvTranspose, Detect, DWConv, DWConvTranspose2d, Ensemble, Focus, - GhostBottleneck, GhostConv, Pose, Segment) +from ultralytics.nn.modules import (AIFI, C1, C2, C3, C3TR, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost, C3x, + Classify, Concat, Conv, ConvTranspose, Detect, DWConv, DWConvTranspose2d, Focus, + GhostBottleneck, GhostConv, HGBlock, HGStem, Pose, RepC3, RepConv, RTDETRDecoder, + Segment) from ultralytics.yolo.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load from ultralytics.yolo.utils.checks import check_requirements, check_suffix, check_yaml from ultralytics.yolo.utils.plotting import feature_visualization @@ -105,6 +106,9 @@ class BaseModel(nn.Module): m.conv_transpose = fuse_deconv_and_bn(m.conv_transpose, m.bn) delattr(m, 'bn') # remove batchnorm m.forward = m.forward_fuse # update forward + if isinstance(m, RepConv): + m.fuse_convs() + m.forward = m.forward_fuse # update forward self.info(verbose=verbose) return self @@ -334,6 +338,22 @@ class ClassificationModel(BaseModel): m[i] = nn.Conv2d(m[i].in_channels, nc, m[i].kernel_size, m[i].stride, bias=m[i].bias is not None) +class Ensemble(nn.ModuleList): + """Ensemble of models.""" + + def __init__(self): + """Initialize an ensemble of models.""" + super().__init__() + + def forward(self, x, augment=False, profile=False, visualize=False): + """Function generates the YOLOv5 network's final 
layer.""" + y = [module(x, augment, profile, visualize)[0] for module in self] + # y = torch.stack(y).max(0)[0] # max ensemble + # y = torch.stack(y).mean(0) # mean ensemble + y = torch.cat(y, 2) # nms ensemble, y shape(B, HW, C) + return y, None # inference, train output + + # Functions ------------------------------------------------------------------------------------------------------------ @@ -415,7 +435,7 @@ def attempt_load_weights(weights, device=None, inplace=True, fuse=False): def attempt_load_one_weight(weight, device=None, inplace=True, fuse=False): """Loads a single model weights.""" ckpt, weight = torch_safe_load(weight) # load ckpt - args = {**DEFAULT_CFG_DICT, **ckpt['train_args']} # combine model and default args, preferring model args + args = {**DEFAULT_CFG_DICT, **(ckpt.get('train_args', {}))} # combine model and default args, preferring model args model = (ckpt.get('ema') or ckpt['model']).to(device).float() # FP32 model # Model compatibility updates @@ -472,20 +492,29 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3) n = n_ = max(round(n * depth), 1) if n > 1 else n # depth gain if m in (Classify, Conv, ConvTranspose, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, Focus, - BottleneckCSP, C1, C2, C2f, C3, C3TR, C3Ghost, nn.ConvTranspose2d, DWConvTranspose2d, C3x): + BottleneckCSP, C1, C2, C2f, C3, C3TR, C3Ghost, nn.ConvTranspose2d, DWConvTranspose2d, C3x, RepC3): c1, c2 = ch[f], args[0] if c2 != nc: # if c2 not equal to number of classes (i.e. for Classify() output) c2 = make_divisible(min(c2, max_channels) * width, 8) args = [c1, c2, *args[1:]] - if m in (BottleneckCSP, C1, C2, C2f, C3, C3TR, C3Ghost, C3x): + if m in (BottleneckCSP, C1, C2, C2f, C3, C3TR, C3Ghost, C3x, RepC3): args.insert(2, n) # number of repeats n = 1 + elif m is AIFI: + args = [ch[f], *args] + elif m in (HGStem, HGBlock): + c1, cm, c2 = ch[f], args[0], args[1] + args = [c1, cm, c2, *args[2:]] + if m is HGBlock: + args.insert(4, n) # number of repeats + n = 1 + elif m is nn.BatchNorm2d: args = [ch[f]] elif m is Concat: c2 = sum(ch[x] for x in f) - elif m in (Detect, Segment, Pose): + elif m in (Detect, Segment, Pose, RTDETRDecoder): args.append([ch[x] for x in f]) if m is Segment: args[2] = make_divisible(min(args[2], max_channels) * width, 8) diff --git a/ultralytics/vit/__init__.py b/ultralytics/vit/__init__.py index 32cd34f..e142705 100644 --- a/ultralytics/vit/__init__.py +++ b/ultralytics/vit/__init__.py @@ -1 +1,6 @@ -from .sam import SAM # noqa +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from .rtdetr import RTDETR +from .sam import SAM + +__all__ = 'RTDETR', 'SAM', 'SAM' # allow simpler import diff --git a/ultralytics/vit/rtdetr/__init__.py b/ultralytics/vit/rtdetr/__init__.py new file mode 100644 index 0000000..4d12115 --- /dev/null +++ b/ultralytics/vit/rtdetr/__init__.py @@ -0,0 +1,7 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from .model import RTDETR +from .predict import RTDETRPredictor +from .val import RTDETRValidator + +__all__ = 'RTDETRPredictor', 'RTDETRValidator', 'RTDETR' diff --git a/ultralytics/vit/rtdetr/model.py b/ultralytics/vit/rtdetr/model.py new file mode 100644 index 0000000..fc81b7c --- /dev/null +++ b/ultralytics/vit/rtdetr/model.py @@ -0,0 +1,104 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +""" +# RT-DETR model interface +""" + +from pathlib import Path + +from ultralytics.nn.tasks import DetectionModel, attempt_load_one_weight, yaml_model_load +from ultralytics.yolo.cfg import get_cfg +from ultralytics.yolo.engine.exporter import 
Exporter +from ultralytics.yolo.utils import DEFAULT_CFG, DEFAULT_CFG_DICT +from ultralytics.yolo.utils.checks import check_imgsz + +from ...yolo.utils.torch_utils import smart_inference_mode +from .predict import RTDETRPredictor +from .val import RTDETRValidator + + +class RTDETR: + + def __init__(self, model='rtdetr-l.pt') -> None: + if model and not model.endswith('.pt') and not model.endswith('.yaml'): + raise NotImplementedError('RT-DETR only supports creating from pt file or yaml file.') + # Load or create new YOLO model + self.predictor = None + suffix = Path(model).suffix + if suffix == '.yaml': + self._new(model) + else: + self._load(model) + + def _new(self, cfg: str, verbose=True): + cfg_dict = yaml_model_load(cfg) + self.cfg = cfg + self.task = 'detect' + self.model = DetectionModel(cfg_dict, verbose=verbose) # build model + + # Below added to allow export from yamls + self.model.args = DEFAULT_CFG_DICT # attach args to model + self.model.task = self.task + + @smart_inference_mode() + def _load(self, weights: str): + self.model, _ = attempt_load_one_weight(weights) + self.model.args = DEFAULT_CFG_DICT # attach args to model + self.task = self.model.args['task'] + + @smart_inference_mode() + def predict(self, source, stream=False, **kwargs): + """ + Perform prediction using the YOLO model. + + Args: + source (str | int | PIL | np.ndarray): The source of the image to make predictions on. + Accepts all source types accepted by the YOLO model. + stream (bool): Whether to stream the predictions or not. Defaults to False. + **kwargs : Additional keyword arguments passed to the predictor. + Check the 'configuration' section in the documentation for all available options. + + Returns: + (List[ultralytics.yolo.engine.results.Results]): The prediction results. + """ + overrides = dict(conf=0.25, task='detect', mode='predict') + overrides.update(kwargs) # prefer kwargs + if not self.predictor: + self.predictor = RTDETRPredictor(overrides=overrides) + self.predictor.setup_model(model=self.model) + else: # only update args if predictor is already setup + self.predictor.args = get_cfg(self.predictor.args, overrides) + return self.predictor(source, stream=stream) + + def train(self, **kwargs): + """Function trains models but raises an error as RTDETR models do not support training.""" + raise NotImplementedError("RTDETR models don't support training") + + def val(self, **kwargs): + """Run validation given dataset.""" + overrides = dict(task='detect', mode='val') + overrides.update(kwargs) # prefer kwargs + args = get_cfg(cfg=DEFAULT_CFG, overrides=overrides) + args.imgsz = check_imgsz(args.imgsz, max_dim=1) + validator = RTDETRValidator(args=args) + validator(model=self.model) + self.metrics = validator.metrics + return validator.metrics + + @smart_inference_mode() + def export(self, **kwargs): + """ + Export model. + + Args: + **kwargs : Any other args accepted by the predictors. 
To see all args check 'configuration' section in docs + """ + overrides = dict(task='detect') + overrides.update(kwargs) + overrides['mode'] = 'export' + args = get_cfg(cfg=DEFAULT_CFG, overrides=overrides) + args.task = self.task + if args.imgsz == DEFAULT_CFG.imgsz: + args.imgsz = self.model.args['imgsz'] # use trained imgsz unless custom value is passed + if args.batch == DEFAULT_CFG.batch: + args.batch = 1 # default to 1 if not modified + return Exporter(overrides=args)(model=self.model) diff --git a/ultralytics/vit/rtdetr/predict.py b/ultralytics/vit/rtdetr/predict.py new file mode 100644 index 0000000..ee47b37 --- /dev/null +++ b/ultralytics/vit/rtdetr/predict.py @@ -0,0 +1,42 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +import torch + +from ultralytics.yolo.data.augment import LetterBox +from ultralytics.yolo.engine.predictor import BasePredictor +from ultralytics.yolo.engine.results import Results +from ultralytics.yolo.utils import ops + + +class RTDETRPredictor(BasePredictor): + + def postprocess(self, preds, img, orig_imgs): + """Postprocess predictions and returns a list of Results objects.""" + bboxes, scores = preds[:2] # (1, bs, 300, 4), (1, bs, 300, nc) + bboxes, scores = bboxes.squeeze_(0), scores.squeeze_(0) + results = [] + for i, bbox in enumerate(bboxes): # (300, 4) + bbox = ops.xywh2xyxy(bbox) + score, cls = scores[i].max(-1) # (300, ) + idx = score > self.args.conf + pred = torch.cat([bbox, score[..., None], cls[..., None]], dim=-1)[idx] # filter + orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs + oh, ow = orig_img.shape[:2] + if not isinstance(orig_imgs, torch.Tensor): + pred[..., [0, 2]] *= ow + pred[..., [1, 3]] *= oh + path = self.batch[0] + img_path = path[i] if isinstance(path, list) else path + results.append(Results(orig_img=orig_img, path=img_path, names=self.model.names, boxes=pred)) + return results + + def pre_transform(self, im): + """Pre-transform input image before inference. + + Args: + im (List(np.ndarray)): (N, 3, h, w) for tensor, [(h, w, 3) x N] for list. + + Return: A list of transformed imgs. + """ + # The size must be square(640) and scaleFilled. + return [LetterBox(self.imgsz, auto=False, scaleFill=True)(image=x) for x in im] diff --git a/ultralytics/vit/rtdetr/val.py b/ultralytics/vit/rtdetr/val.py new file mode 100644 index 0000000..41d4000 --- /dev/null +++ b/ultralytics/vit/rtdetr/val.py @@ -0,0 +1,114 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from pathlib import Path + +import torch + +from ultralytics.yolo.data import YOLODataset +from ultralytics.yolo.data.augment import Compose, Format, LetterBox +from ultralytics.yolo.utils import colorstr, ops +from ultralytics.yolo.v8.detect import DetectionValidator + +__all__ = ['RTDETRValidator'] + + +# TODO: Temporarily, RT-DETR does not need padding. 
+class RTDETRDataset(YOLODataset): + + def __init__(self, *args, data=None, **kwargs): + super().__init__(*args, data=data, use_segments=False, use_keypoints=False, **kwargs) + + def build_transforms(self, hyp=None): + """Temporarily, only for evaluation.""" + transforms = Compose([LetterBox(new_shape=(self.imgsz, self.imgsz), auto=False, scaleFill=True)]) + transforms.append( + Format(bbox_format='xywh', + normalize=True, + return_mask=self.use_segments, + return_keypoint=self.use_keypoints, + batch_idx=True, + mask_ratio=hyp.mask_ratio, + mask_overlap=hyp.overlap_mask)) + return transforms + + +class RTDETRValidator(DetectionValidator): + + def build_dataset(self, img_path, mode='val', batch=None): + """Build YOLO Dataset + + Args: + img_path (str): Path to the folder containing images. + mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode. + batch (int, optional): Size of batches, this is for `rect`. Defaults to None. + """ + return RTDETRDataset( + img_path=img_path, + imgsz=self.args.imgsz, + batch_size=batch, + augment=False, # no augmentation + hyp=self.args, + rect=False, # no rect + cache=self.args.cache or None, + prefix=colorstr(f'{mode}: '), + data=self.data) + + def postprocess(self, preds): + """Apply Non-maximum suppression to prediction outputs.""" + bboxes, scores = preds[:2] # (1, bs, 300, 4), (1, bs, 300, nc) + bboxes, scores = bboxes.squeeze_(0), scores.squeeze_(0) # (bs, 300, 4) + bs = len(bboxes) + outputs = [torch.zeros((0, 6), device=bboxes.device)] * bs + for i, bbox in enumerate(bboxes): # (300, 4) + bbox = ops.xywh2xyxy(bbox) + score, cls = scores[i].max(-1) # (300, ) + # Do not need threshold for evaluation as only got 300 boxes here. + # idx = score > self.args.conf + pred = torch.cat([bbox, score[..., None], cls[..., None]], dim=-1) # filter + outputs[i] = pred # [idx] + + return outputs + + def update_metrics(self, preds, batch): + """Metrics.""" + for si, pred in enumerate(preds): + idx = batch['batch_idx'] == si + cls = batch['cls'][idx] + bbox = batch['bboxes'][idx] + nl, npr = cls.shape[0], pred.shape[0] # number of labels, predictions + shape = batch['ori_shape'][si] + correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init + self.seen += 1 + + if npr == 0: + if nl: + self.stats.append((correct_bboxes, *torch.zeros((2, 0), device=self.device), cls.squeeze(-1))) + if self.args.plots: + self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1)) + continue + + # Predictions + if self.args.single_cls: + pred[:, 5] = 0 + predn = pred.clone() + predn[..., [0, 2]] *= shape[1] # native-space pred + predn[..., [1, 3]] *= shape[0] # native-space pred + + # Evaluate + if nl: + tbox = ops.xywh2xyxy(bbox) # target boxes + tbox[..., [0, 2]] *= shape[1] # native-space pred + tbox[..., [1, 3]] *= shape[0] # native-space pred + labelsn = torch.cat((cls, tbox), 1) # native-space labels + correct_bboxes = self._process_batch(predn, labelsn) + # TODO: maybe remove these `self.` arguments as they already are member variable + if self.args.plots: + self.confusion_matrix.process_batch(predn, labelsn) + self.stats.append((correct_bboxes, pred[:, 4], pred[:, 5], cls.squeeze(-1))) # (conf, pcls, tcls) + + # Save + if self.args.save_json: + self.pred_to_json(predn, batch['im_file'][si]) + if self.args.save_txt: + file = self.save_dir / 'labels' / f'{Path(batch["im_file"][si]).stem}.txt' + self.save_one_txt(predn, self.args.save_conf, shape, file) diff --git 
diff --git a/ultralytics/vit/sam/modules/decoders.py b/ultralytics/vit/sam/modules/decoders.py
index 47acca8..743dcb4 100644
--- a/ultralytics/vit/sam/modules/decoders.py
+++ b/ultralytics/vit/sam/modules/decoders.py
@@ -157,5 +157,5 @@ class MLP(nn.Module):
         for i, layer in enumerate(self.layers):
             x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
         if self.sigmoid_output:
-            x = F.sigmoid(x)
+            x = torch.sigmoid(x)
         return x
diff --git a/ultralytics/yolo/engine/predictor.py b/ultralytics/yolo/engine/predictor.py
index ed36989..b3d7192 100644
--- a/ultralytics/yolo/engine/predictor.py
+++ b/ultralytics/yolo/engine/predictor.py
@@ -115,9 +115,7 @@ class BasePredictor:
             im (torch.Tensor | List(np.ndarray)): (N, 3, h, w) for tensor, [(h, w, 3) x N] for list.
         """
         if not isinstance(im, torch.Tensor):
-            same_shapes = all(x.shape == im[0].shape for x in im)
-            auto = same_shapes and self.model.pt
-            im = np.stack([LetterBox(self.imgsz, auto=auto, stride=self.model.stride)(image=x) for x in im])
+            im = np.stack(self.pre_transform(im))
             im = im[..., ::-1].transpose((0, 3, 1, 2))  # BGR to RGB, BHWC to BCHW, (n, 3, h, w)
             im = np.ascontiguousarray(im)  # contiguous
             im = torch.from_numpy(im)
@@ -127,6 +125,18 @@ class BasePredictor:
             img /= 255  # 0 - 255 to 0.0 - 1.0
         return img
 
+    def pre_transform(self, im):
+        """Pre-transform input image before inference.
+
+        Args:
+            im (List(np.ndarray)): (N, 3, h, w) for tensor, [(h, w, 3) x N] for list.
+
+        Return: A list of transformed imgs.
+        """
+        same_shapes = all(x.shape == im[0].shape for x in im)
+        auto = same_shapes and self.model.pt
+        return [LetterBox(self.imgsz, auto=auto, stride=self.model.stride)(image=x) for x in im]
+
     def write_results(self, idx, results, batch):
         """Write inference results to a file or directory."""
         p, im, _ = batch
diff --git a/ultralytics/yolo/utils/downloads.py b/ultralytics/yolo/utils/downloads.py
index fc7462e..a3f661d 100644
--- a/ultralytics/yolo/utils/downloads.py
+++ b/ultralytics/yolo/utils/downloads.py
@@ -18,7 +18,8 @@ from ultralytics.yolo.utils import LOGGER, checks, clean_url, emojis, is_online,
 GITHUB_ASSET_NAMES = [f'yolov8{k}{suffix}.pt' for k in 'nsmlx' for suffix in ('', '6', '-cls', '-seg', '-pose')] + \
                      [f'yolov5{k}u.pt' for k in 'nsmlx'] + \
                      [f'yolov3{k}u.pt' for k in ('', '-spp', '-tiny')] + \
-                     [f'sam_{k}.pt' for k in 'bl']
+                     [f'sam_{k}.pt' for k in 'bl'] + \
+                     [f'rtdetr-{k}.pt' for k in 'lx']
 GITHUB_ASSET_STEMS = [Path(k).stem for k in GITHUB_ASSET_NAMES]
 
 
diff --git a/ultralytics/yolo/utils/files.py b/ultralytics/yolo/utils/files.py
index 7eee0f0..2a13c4e 100644
--- a/ultralytics/yolo/utils/files.py
+++ b/ultralytics/yolo/utils/files.py
@@ -49,7 +49,7 @@ def increment_path(path, exist_ok=False, sep='', mkdir=False):
 
     # Method 1
    for n in range(2, 9999):
-        p = f'{path}{sep}{str(n).zfill(4)}{suffix}'  # increment path
+        p = f'{path}{sep}{n}{suffix}'  # increment path
         if not os.path.exists(p):  #
             break
     path = Path(p)
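
The BasePredictor change turns letterboxing into an overridable pre_transform hook, so RTDETRPredictor can force a square scaleFill resize while the rest of preprocess (BGR-to-RGB conversion, normalization, batching) stays shared. A rough sketch of another predictor built on the same hook (hypothetical example; the FixedStretchPredictor name and behaviour are not part of the patch):

from ultralytics.yolo.data.augment import LetterBox
from ultralytics.yolo.engine.predictor import BasePredictor


class FixedStretchPredictor(BasePredictor):
    """Hypothetical predictor that stretches every input to a square imgsz x imgsz canvas."""

    def pre_transform(self, im):
        # Skip stride-aligned padding and stretch to a fixed square, as RTDETRPredictor does
        return [LetterBox(self.imgsz, auto=False, scaleFill=True)(image=x) for x in im]
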
diff --git a/ultralytics/yolo/utils/torch_utils.py b/ultralytics/yolo/utils/torch_utils.py
index 1ca3205..ef17b31 100644
--- a/ultralytics/yolo/utils/torch_utils.py
+++ b/ultralytics/yolo/utils/torch_utils.py
@@ -25,7 +25,7 @@ TORCHVISION_0_10 = check_version(torchvision.__version__, '0.10.0')
 TORCH_1_9 = check_version(torch.__version__, '1.9.0')
 TORCH_1_11 = check_version(torch.__version__, '1.11.0')
 TORCH_1_12 = check_version(torch.__version__, '1.12.0')
-TORCH_2_X = check_version(torch.__version__, minimum='2.0')
+TORCH_2_0 = check_version(torch.__version__, minimum='2.0')
 
 
 @contextmanager
@@ -85,7 +85,7 @@ def select_device(device='', batch=0, newline=False, verbose=True):
             p = torch.cuda.get_device_properties(i)
             s += f"{'' if i == 0 else space}CUDA:{d} ({p.name}, {p.total_memory / (1 << 20):.0f}MiB)\n"  # bytes to MB
         arg = 'cuda:0'
-    elif mps and getattr(torch, 'has_mps', False) and torch.backends.mps.is_available() and TORCH_2_X:
+    elif mps and getattr(torch, 'has_mps', False) and torch.backends.mps.is_available() and TORCH_2_0:
         # Prefer MPS if available
         s += 'MPS\n'
         arg = 'mps'
@@ -274,11 +274,14 @@ def init_seeds(seed=0, deterministic=False):
     torch.cuda.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)  # for Multi-GPU, exception safe
     # torch.backends.cudnn.benchmark = True  # AutoBatch problem https://github.com/ultralytics/yolov5/issues/9287
-    if deterministic and TORCH_1_12:  # https://github.com/ultralytics/yolov5/pull/8213
-        torch.use_deterministic_algorithms(True)
-        torch.backends.cudnn.deterministic = True
-        os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
-        os.environ['PYTHONHASHSEED'] = str(seed)
+    if deterministic:  # https://github.com/ultralytics/yolov5/pull/8213
+        if TORCH_2_0:
+            torch.use_deterministic_algorithms(True)
+            torch.backends.cudnn.deterministic = True
+            os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
+            os.environ['PYTHONHASHSEED'] = str(seed)
+        else:
+            LOGGER.warning('WARNING ⚠️ Upgrade to torch>=2.0.0 for deterministic training.')
 
 
 class ModelEMA:
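
Renaming TORCH_2_X to TORCH_2_0 and gating init_seeds on it means fully deterministic training is only configured on torch>=2.0.0; older installs now get an explicit warning instead of a partially applied setup. A quick way to check what a given environment ends up with (illustrative snippet, not part of the patch):

import torch

from ultralytics.yolo.utils.torch_utils import init_seeds

init_seeds(seed=0, deterministic=True)  # warns on torch<2.0, enables deterministic algorithms on torch>=2.0
print(torch.are_deterministic_algorithms_enabled())
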