diff --git a/rocAL/include/api/rocal_api_augmentation.h b/rocAL/include/api/rocal_api_augmentation.h
index e9c9a68b0..ee8a02d79 100644
--- a/rocAL/include/api/rocal_api_augmentation.h
+++ b/rocAL/include/api/rocal_api_augmentation.h
@@ -203,12 +203,13 @@ extern "C" RocalTensor ROCAL_API_CALL rocalRotateFixed(RocalContext context, Roc
  * \param [in] is_output is the output tensor part of the graph output
  * \param [in] alpha controls contrast of the image
  * \param [in] beta controls brightness of the image
+ * \param [in] conditional_execution controls the execution of the augmentation
  * \param [in] output_layout the layout of the output tensor
  * \param [in] output_datatype the data type of the output tensor
  * \return RocalTensor
  */
 extern "C" RocalTensor ROCAL_API_CALL rocalBrightness(RocalContext context, RocalTensor input, bool is_output,
-                                                      RocalFloatParam alpha = NULL, RocalFloatParam beta = NULL,
+                                                      RocalFloatParam alpha = NULL, RocalFloatParam beta = NULL, RocalIntParam conditional_execution = NULL,
                                                       RocalTensorLayout output_layout = ROCAL_NONE,
                                                       RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
@@ -219,13 +220,14 @@ extern "C" RocalTensor ROCAL_API_CALL rocalBrightness(RocalContext context, Roca
  * \param [in] is_output is the output tensor part of the graph output
  * \param [in] alpha controls contrast of the image
  * \param [in] beta controls brightness of the image
+ * \param [in] conditional_execution controls the execution of the augmentation
  * \param [in] output_layout the layout of the output tensor
  * \param [in] output_datatype the data type of the output tensor
  * \return RocalTensor
  */
 extern "C" RocalTensor ROCAL_API_CALL rocalBrightnessFixed(RocalContext context, RocalTensor input,
                                                            float alpha, float beta,
-                                                           bool is_output,
+                                                           bool is_output, int conditional_execution = 1,
                                                            RocalTensorLayout output_layout = ROCAL_NONE,
                                                            RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
@@ -307,7 +309,7 @@ extern "C" RocalTensor ROCAL_API_CALL rocalContrastFixed(RocalContext context, R
  * \return RocalTensor
  */
 extern "C" RocalTensor ROCAL_API_CALL rocalFlip(RocalContext context, RocalTensor input, bool is_output,
-                                                RocalIntParam horizonal_flag = NULL, RocalIntParam vertical_flag = NULL,
+                                                RocalIntParam horizonal_flag = NULL, RocalIntParam vertical_flag = NULL, RocalIntParam depth_flag = NULL,
                                                 RocalTensorLayout output_layout = ROCAL_NONE,
                                                 RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
@@ -323,7 +325,7 @@ extern "C" RocalTensor ROCAL_API_CALL rocalFlip(RocalContext context, RocalTenso
  * \return RocalTensor
  */
 extern "C" RocalTensor ROCAL_API_CALL rocalFlipFixed(RocalContext context, RocalTensor input,
-                                                     int horizonal_flag, int vertical_flag, bool is_output,
+                                                     int horizonal_flag, int vertical_flag, int depth_flag, bool is_output,
                                                      RocalTensorLayout output_layout = ROCAL_NONE,
                                                      RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
@@ -565,6 +567,68 @@ extern "C" RocalTensor ROCAL_API_CALL rocalSnPNoiseFixed(RocalContext context, R
                                                          RocalTensorLayout output_layout = ROCAL_NONE,
                                                          RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
+/*! \brief Applies gaussian noise effect on images.
+ * \ingroup group_rocal_augmentations
+ * \param [in] context Rocal context
+ * \param [in] input Input Rocal tensor
+ * \param [in] is_output is the output tensor part of the graph output
+ * \param [in] mean Mean of the distribution
+ * \param [in] std_dev Standard deviation of the distribution
+ * \param [in] seed seed value for the random number generator
+ * \param [in] conditional_execution controls the execution of the augmentation
+ * \param [in] output_layout the layout of the output tensor
+ * \param [in] output_datatype the data type of the output tensor
+ * \return RocalTensor
+ */
+extern "C" RocalTensor ROCAL_API_CALL rocalGaussianNoise(RocalContext context, RocalTensor input,
+                                                         bool is_output,
+                                                         RocalFloatParam mean = NULL, RocalFloatParam std_dev = NULL,
+                                                         int seed = 0, RocalIntParam conditional_execution = NULL,
+                                                         RocalTensorLayout output_layout = ROCAL_NONE,
+                                                         RocalTensorOutputType output_datatype = ROCAL_UINT8);
+
+/*! \brief Applies gaussian noise effect on images with fixed parameters.
+ * \ingroup group_rocal_augmentations
+ * \param [in] context Rocal context
+ * \param [in] input Input Rocal tensor
+ * \param [in] is_output is the output tensor part of the graph output
+ * \param [in] mean Mean of the distribution
+ * \param [in] std_dev Standard deviation of the distribution
+ * \param [in] seed seed value for the random number generator
+ * \param [in] conditional_execution controls the execution of the augmentation
+ * \param [in] output_layout the layout of the output tensor
+ * \param [in] output_datatype the data type of the output tensor
+ * \return RocalTensor
+ */
+extern "C" RocalTensor ROCAL_API_CALL rocalGaussianNoiseFixed(RocalContext context, RocalTensor input,
+                                                              float mean, float std_dev,
+                                                              bool is_output, int seed = 0, int conditional_execution = 1,
+                                                              RocalTensorLayout output_layout = ROCAL_NONE,
+                                                              RocalTensorOutputType output_datatype = ROCAL_UINT8);
+
+/*! \brief Applies slice augmentation on images.
+ * \ingroup group_rocal_augmentations
+ * \param [in] context Rocal context
+ * \param [in] input Input Rocal tensor
+ * \param [in] is_output is the output tensor part of the graph output
+ * \param [in] anchor_tensor Anchor used for slice
+ * \param [in] shape_tensor Shape of the output slice
+ * \param [in] fill_values Fill value for the slice padding
+ * \param [in] policy Padding policy used for slice augmentation
+ * \param [in] output_layout the layout of the output tensor
+ * \param [in] output_datatype the data type of the output tensor
+ * \return RocalTensor
+ */
+extern "C" RocalTensor ROCAL_API_CALL rocalSlice(RocalContext context,
+                                                 RocalTensor input,
+                                                 bool is_output,
+                                                 RocalTensor anchor_tensor,
+                                                 std::vector<int> shape_tensor,
+                                                 std::vector<float> fill_values,
+                                                 RocalOutOfBoundsPolicy policy,
+                                                 RocalTensorLayout output_layout = ROCAL_NONE,
+                                                 RocalTensorOutputType output_datatype = ROCAL_UINT8);
+
 /*! \brief Applies snow effect on images.
  * \ingroup group_rocal_augmentations
  * \param [in] context Rocal context
@@ -1098,4 +1162,6 @@ extern "C" RocalTensor ROCAL_API_CALL rocalSSDRandomCrop(RocalContext context, R
                                                          RocalTensorLayout output_layout = ROCAL_NONE,
                                                          RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
+extern "C" RocalTensor ROCAL_API_CALL rocalSetLayout(RocalContext context, RocalTensor input,
+                                                     RocalTensorLayout output_layout = ROCAL_NONE);
 #endif  // MIVISIONX_ROCAL_API_AUGMENTATION_H
diff --git a/rocAL/include/api/rocal_api_data_loaders.h b/rocAL/include/api/rocal_api_data_loaders.h
index 62e3a6e66..a1f477f72 100644
--- a/rocAL/include/api/rocal_api_data_loaders.h
+++ b/rocAL/include/api/rocal_api_data_loaders.h
@@ -576,6 +576,52 @@ extern "C" RocalTensor ROCAL_API_CALL rocalRawTFRecordSourceSingleShard(RocalCon
                                                                         unsigned out_width = 0, unsigned out_height = 0,
                                                                         const char* record_name_prefix = "");
 
+/*! \brief Creates Numpy raw data reader and loader. It allocates the resources and objects required to read raw data stored on the numpy arrays.
+ * \ingroup group_rocal_data_loaders
+ * \param [in] context Rocal context
+ * \param [in] source_path A NULL terminated char string pointing to the location on the disk
+ * \param [in] internal_shard_count Defines the parallelism level by internally sharding the input dataset and load/decode using multiple decoder/loader instances. Using shard counts bigger than 1 improves the load/decode performance if compute resources (CPU cores) are available.
+ * \param [in] is_output Determines if the user wants the loaded images to be part of the output or not.
+ * \param [in] shuffle Determines if the user wants to shuffle the dataset or not.
+ * \param [in] loop Determines if the user wants to indefinitely loops through images or not.
+ * \param [in] decode_size_policy Decode size policy used for the loader
+ * \return Reference to the output tensor
+ */
+extern "C"  RocalTensor  ROCAL_API_CALL rocalNumpyFileSource(
+                 RocalContext p_context,
+                 const char* source_path,
+                 unsigned internal_shard_count,
+                 std::vector<std::string> files = {},
+                 bool is_output = false,
+                 bool shuffle = false,
+                 bool loop = false,
+                 RocalImageSizeEvaluationPolicy decode_size_policy = ROCAL_USE_MAX_SIZE,
+                 unsigned seed = 0);
+
+/*! \brief Creates Numpy raw data reader and loader. It allocates the resources and objects required to read raw data stored on the numpy arrays.
+ * \ingroup group_rocal_data_loaders
+ * \param [in] context Rocal context
+ * \param [in] source_path A NULL terminated char string pointing to the location on the disk
+ * \param [in] is_output Determines if the user wants the loaded images to be part of the output or not.
+ * \param [in] shuffle Determines if the user wants to shuffle the dataset or not.
+ * \param [in] loop Determines if the user wants to indefinitely loops through images or not.
+ * \param [in] decode_size_policy Decode size policy used for the loader
+ * \param [in] shard_id Shard id for this loader
+ * \param [in] shard_count Total shard count
+ * \return Reference to the output tensor
+ */
+extern "C"  RocalTensor  rocalNumpyFileSourceSingleShard(
+                 RocalContext p_context,
+                 const char* source_path,
+                 std::vector<std::string> files = {},
+                 bool is_output = false,
+                 bool shuffle = false,
+                 bool loop = false,
+                 RocalImageSizeEvaluationPolicy decode_size_policy = ROCAL_USE_MAX_SIZE,
+                 unsigned shard_id = 0,
+                 unsigned shard_count = 1,
+                 unsigned seed = 0);
+
 /*!
  * \brief Creates a video reader and decoder as a source. It allocates the resources and objects required to read and decode mp4 videos stored on the file systems.
  * \ingroup group_rocal_data_loaders
diff --git a/rocAL/include/api/rocal_api_meta_data.h b/rocAL/include/api/rocal_api_meta_data.h
index 17407dbb8..17845535d 100644
--- a/rocAL/include/api/rocal_api_meta_data.h
+++ b/rocAL/include/api/rocal_api_meta_data.h
@@ -315,4 +315,23 @@ extern "C" void ROCAL_API_CALL rocalBoxIouMatcher(RocalContext p_context, std::v
  */
 extern "C" RocalTensorList ROCAL_API_CALL rocalGetMatchedIndices(RocalContext p_context);
 
+/*! \brief initialize the values required for ROI Random crop
+ * \ingroup group_rocal_meta_data
+ * \param [in] rocal_context rocal context
+ * \param [in] crop_shape_batch
+ * \param [in] roi_begin_batch
+ * \param [in] input_shape_batch
+ * \param [in] roi_end_batch
+ * \param [out] anchor The generated anchor tensor
+ */
+extern "C" RocalTensor ROCAL_API_CALL rocalROIRandomCrop(RocalContext p_context, RocalTensor p_input, RocalTensor roi_start, RocalTensor roi_end, std::vector<int> crop_shape);
+
+/*! \brief initialize the values required for ROI Random crop
+ * \ingroup group_rocal_meta_data
+ * \param [in] rocal_context rocal context
+ * \param [in] p_input
+ * \param [out] anchor The generated anchor tensor
+ */
+extern "C" RocalTensorList ROCAL_API_CALL rocalRandomObjectBbox(RocalContext p_context, RocalTensor p_input, std::string output_format="anchor_shape", int k_largest = -1, float foreground_prob = 1.0);
+
 #endif  // MIVISIONX_ROCAL_API_META_DATA_H
diff --git a/rocAL/include/api/rocal_api_types.h b/rocAL/include/api/rocal_api_types.h
index 929ab5892..eed98519a 100644
--- a/rocAL/include/api/rocal_api_types.h
+++ b/rocAL/include/api/rocal_api_types.h
@@ -218,9 +218,15 @@ enum RocalTensorLayout {
     /*! \brief AMD ROCAL_NFCHW
      */
     ROCAL_NFCHW = 3,
+    /*! \brief AMD ROCAL_NDHWC
+     */
+    ROCAL_NDHWC = 4,
+    /*! \brief AMD ROCAL_NCDHW
+     */
+    ROCAL_NCDHW = 5,
     /*! \brief AMD ROCAL_NONE
      */
-    ROCAL_NONE = 4  // Layout for generic tensors (Non-Image or Non-Video)
+    ROCAL_NONE = 6  // Layout for generic tensors (Non-Image or Non-Video)
 };
 
 /*! \brief rocAL Tensor Output Type enum
@@ -312,8 +318,7 @@ enum RocalResizeScalingMode {
 /*! \brief rocAL Resize Interpolation Type enum
  * \ingroup group_rocal_types
  */
-enum RocalResizeInterpolationType
-{
+enum RocalResizeInterpolationType {
     /*! \brief AMD ROCAL_NEAREST_NEIGHBOR_INTERPOLATION
      */
     ROCAL_NEAREST_NEIGHBOR_INTERPOLATION = 0,
@@ -373,4 +378,16 @@ enum RocalExternalSourceMode {
     ROCAL_EXTSOURCE_RAW_UNCOMPRESSED = 2,
 };
 
+/*! \brief Tensor padding types
+ * \ingroup group_rocal_types
+ */
+enum RocalOutOfBoundsPolicy {
+    /*! \brief TRIM_TO_SHAPE
+     */
+    TRIMTOSHAPE = 0,
+    /*! \brief PAD
+     */
+    PAD,
+};
+
 #endif  // MIVISIONX_ROCAL_API_TYPES_H
diff --git a/rocAL/include/augmentations/augmentations_nodes.h b/rocAL/include/augmentations/augmentations_nodes.h
index 34bc1d6a8..ef6beff32 100644
--- a/rocAL/include/augmentations/augmentations_nodes.h
+++ b/rocAL/include/augmentations/augmentations_nodes.h
@@ -55,3 +55,5 @@ THE SOFTWARE.
 #include "node_copy.h"
 #include "node_nop.h"
 #include "node_sequence_rearrange.h"
+#include "node_gaussian_noise.h"
+#include "node_slice.h"
diff --git a/rocAL/include/augmentations/color_augmentations/node_brightness.h b/rocAL/include/augmentations/color_augmentations/node_brightness.h
index 21369651c..16cd7a36b 100644
--- a/rocAL/include/augmentations/color_augmentations/node_brightness.h
+++ b/rocAL/include/augmentations/color_augmentations/node_brightness.h
@@ -31,8 +31,8 @@ class BrightnessNode : public Node {
     BrightnessNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
     BrightnessNode() = delete;
 
-    void init(float alpha, float beta);
-    void init(FloatParam *alpha_param, FloatParam *beta_param);
+    void init(float alpha, float beta, int conditional_execution);
+    void init(FloatParam *alpha_param, FloatParam *beta_param, IntParam *conditional_execution);
 
    protected:
     void create_node() override;
@@ -41,6 +41,8 @@ class BrightnessNode : public Node {
    private:
     ParameterVX<float> _alpha;
     ParameterVX<float> _beta;
+    ParameterVX<int> _conditional_execution;
     constexpr static float ALPHA_RANGE[2] = {0.1, 1.95};
     constexpr static float BETA_RANGE[2] = {0, 25};
+    constexpr static int CONDITIONAL_EXECUTION_RANGE[2] = {0, 1};
 };
\ No newline at end of file
diff --git a/rocAL/include/augmentations/effects_augmentations/node_gaussian_noise.h b/rocAL/include/augmentations/effects_augmentations/node_gaussian_noise.h
new file mode 100644
index 000000000..bdfad5e03
--- /dev/null
+++ b/rocAL/include/augmentations/effects_augmentations/node_gaussian_noise.h
@@ -0,0 +1,48 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "graph.h"
+#include "node.h"
+#include "parameter_factory.h"
+#include "parameter_vx.h"
+
+class GaussianNoiseNode : public Node {
+   public:
+    GaussianNoiseNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
+    GaussianNoiseNode() = delete;
+    void init(float mean, float std_dev, int seed, int conditional_execution);
+    void init(FloatParam *mean_param, FloatParam *stddev_param, int seed, IntParam *condition_execution_param);
+
+   protected:
+    void create_node() override;
+    void update_node() override;
+
+   private:
+    ParameterVX<float> _mean, _stddev;
+    ParameterVX<int> _conditional_execution;
+    constexpr static float MEAN_RANGE[2] = {0, 5};
+    constexpr static float STDDEV_RANGE[2] = {1, 5};
+    constexpr static int CONDITIONAL_EXECUTION_RANGE[2] = {0, 1};
+    int _seed;
+};
diff --git a/rocAL/include/augmentations/geometry_augmentations/node_flip.h b/rocAL/include/augmentations/geometry_augmentations/node_flip.h
index c2168adac..7873cd843 100644
--- a/rocAL/include/augmentations/geometry_augmentations/node_flip.h
+++ b/rocAL/include/augmentations/geometry_augmentations/node_flip.h
@@ -29,17 +29,19 @@ class FlipNode : public Node {
    public:
     FlipNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
     FlipNode() = delete;
-    void init(int h_flag, int v_flag);
-    void init(IntParam *h_flag_param, IntParam *v_flag_param);
+    void init(int h_flag, int v_flag, int d_flag);
+    void init(IntParam *h_flag_param, IntParam *v_flag_param, IntParam *d_flag_param);
     vx_array get_horizontal_flip() { return _horizontal.default_array(); }
     vx_array get_vertical_flip() { return _vertical.default_array(); }
+    vx_array get_depth_flip() { return _depth.default_array(); }
 
    protected:
     void create_node() override;
     void update_node() override;
 
    private:
-    ParameterVX<int> _horizontal, _vertical;
+    ParameterVX<int> _horizontal, _vertical, _depth;
     constexpr static int HORIZONTAL_RANGE[2] = {0, 1};
     constexpr static int VERTICAL_RANGE[2] = {0, 1};
+    constexpr static int DEPTH_RANGE[2] = {0, 1};
 };
diff --git a/rocAL/include/augmentations/geometry_augmentations/node_slice.h b/rocAL/include/augmentations/geometry_augmentations/node_slice.h
new file mode 100644
index 000000000..08f5cd0bc
--- /dev/null
+++ b/rocAL/include/augmentations/geometry_augmentations/node_slice.h
@@ -0,0 +1,51 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include "graph.h"
+#include "node.h"
+#include "parameter_factory.h"
+#include "parameter_vx.h"
+#include "rocal_api_types.h"
+
+class SliceNode : public Node {
+   public:
+    SliceNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
+    SliceNode() = delete;
+    ~SliceNode();
+    void init(Tensor *anchor_param, std::vector<int> shape_param, std::vector<float> &fill_values_param, RocalOutOfBoundsPolicy policy);
+
+   protected:
+    void create_node() override;
+    void update_node() override;
+    void create_shape_tensor();
+
+   private:
+    vx_array _fill_values_array;
+    void *_shape_array;
+    Tensor *_anchor;
+    vx_tensor _shape = nullptr;
+    std::vector<float> _fill_values, _fill_values_vec;
+    std::vector<int> _anchor_vec, _shape_vec;
+    std::vector<std::vector<uint32_t>> _slice_roi;
+    RocalOutOfBoundsPolicy _policy = RocalOutOfBoundsPolicy::PAD;
+};
\ No newline at end of file
diff --git a/rocAL/include/loaders/image/node_numpy_loader.h b/rocAL/include/loaders/image/node_numpy_loader.h
new file mode 100644
index 000000000..49918e4f5
--- /dev/null
+++ b/rocAL/include/loaders/image/node_numpy_loader.h
@@ -0,0 +1,53 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include "graph.h"
+#include "node.h"
+#include "numpy_loader_sharded.h"
+
+class NumpyLoaderNode : public Node {
+   public:
+    /// \param device_resources shard count from user
+
+    /// internal_shard_count number of loader/decoders are created and each shard is loaded and decoded using separate and independent resources increasing the parallelism and performance.
+    NumpyLoaderNode(Tensor *output, void *device_resources);
+    ~NumpyLoaderNode() override;
+    NumpyLoaderNode() = delete;
+    ///
+    /// \param internal_shard_count Defines the amount of parallelism user wants for the load and decode process to be handled internally.
+    /// \param source_path Defines the path that includes the image dataset
+    /// \param load_batch_count Defines the quantum count of the images to be loaded. It's usually equal to the user's batch size.
+    /// The loader will repeat images if necessary to be able to have images in multiples of the load_batch_count,
+    /// for example if there are 10 images in the dataset and load_batch_count is 3, the loader repeats 2 images as if there are 12 images available.
+    void init(unsigned internal_shard_count, const std::string &source_path, const std::vector<std::string> &files, StorageType storage_type, DecoderType decoder_type, bool shuffle, bool loop,
+              size_t load_batch_count, RocalMemType mem_type, unsigned seed = 0, bool decoder_keep_orig = false, const std::map<std::string, std::string> feature_key_map = std::map<std::string, std::string>(), const char *prefix = "", unsigned sequence_length = 0, unsigned step = 0, unsigned stride = 0);
+
+    std::shared_ptr<LoaderModule> get_loader_module();
+
+   protected:
+    void create_node() override{};
+    void update_node() override{};
+
+   private:
+    std::shared_ptr<NumpyLoaderSharded> _loader_module = nullptr;
+};
diff --git a/rocAL/include/loaders/image/node_numpy_loader_single_shard.h b/rocAL/include/loaders/image/node_numpy_loader_single_shard.h
new file mode 100644
index 000000000..cd3b464e7
--- /dev/null
+++ b/rocAL/include/loaders/image/node_numpy_loader_single_shard.h
@@ -0,0 +1,51 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include "graph.h"
+#include "node.h"
+#include "numpy_loader_sharded.h"
+
+class NumpyLoaderSingleShardNode : public Node {
+   public:
+    NumpyLoaderSingleShardNode(Tensor *output, void *device_resources);
+    ~NumpyLoaderSingleShardNode() override;
+
+    /// \param user_shard_count shard count from user
+    /// \param  user_shard_id shard id from user
+    /// \param source_path Defines the path that includes the image dataset
+    /// \param load_batch_count Defines the quantum count of the images to be loaded. It's usually equal to the user's batch size.
+    /// The loader will repeat images if necessary to be able to have images in multiples of the load_batch_count,
+    /// for example if there are 10 images in the dataset and load_batch_count is 3, the loader repeats 2 images as if there are 12 images available.
+    void init(unsigned shard_id, unsigned shard_count, const std::string &source_path, const std::vector<std::string> &files,
+              StorageType storage_type, DecoderType decoder_type, bool shuffle, bool loop,
+              size_t load_batch_count, RocalMemType mem_type, unsigned seed = 0, bool decoder_keep_orig = false, const std::map<std::string, std::string> feature_key_map = std::map<std::string, std::string>(), unsigned sequence_length = 0, unsigned step = 0, unsigned stride = 0);
+
+    std::shared_ptr<LoaderModule> get_loader_module();
+
+   protected:
+    void create_node() override{};
+    void update_node() override{};
+
+   private:
+    std::shared_ptr<NumpyLoader> _loader_module = nullptr;
+};
\ No newline at end of file
diff --git a/rocAL/include/loaders/image/numpy_loader.h b/rocAL/include/loaders/image/numpy_loader.h
new file mode 100644
index 000000000..0ff053da2
--- /dev/null
+++ b/rocAL/include/loaders/image/numpy_loader.h
@@ -0,0 +1,91 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "circular_buffer.h"
+#include "commons.h"
+#include "image_read_and_decode.h"
+//
+// NumpyLoader runs an internal thread for loading an decoding of numpy arrays asynchronously
+// it uses a circular buffer to store decoded numpy arrays for the user
+class NumpyLoader : public LoaderModule {
+   public:
+    explicit NumpyLoader(void* dev_resources);
+    ~NumpyLoader() override;
+    LoaderModuleStatus load_next() override;
+    void initialize(ReaderConfig reader_cfg, DecoderConfig decoder_cfg, RocalMemType mem_type, unsigned batch_size, bool keep_orig_size = false) override;
+    void set_output(Tensor* output_image) override;
+    void set_random_bbox_data_reader(std::shared_ptr<RandomBBoxCrop_MetaDataReader> randombboxcrop_meta_data_reader) override;
+    size_t remaining_count() override;  // returns number of remaining items to be loaded
+    void reset() override;              // Resets the loader to load from the beginning of the media
+    Timing timing() override;
+    void start_loading() override;
+    LoaderModuleStatus set_cpu_affinity(cpu_set_t cpu_mask);
+    LoaderModuleStatus set_cpu_sched_policy(struct sched_param sched_policy);
+    void set_gpu_device_id(int device_id);
+    std::vector<std::string> get_id() override;
+    decoded_image_info get_decode_image_info() override;
+    crop_image_info get_crop_image_info() override;
+    void set_prefetch_queue_depth(size_t prefetch_queue_depth) override;
+    void shut_down() override;
+    void feed_external_input(const std::vector<std::string>& input_images_names, const std::vector<unsigned char*>& input_buffer,
+                             const std::vector<ROIxywh>& roi_xywh, unsigned int max_width, unsigned int max_height, unsigned int channels, ExternalSourceFileMode mode, bool eos) override {}
+
+   private:
+    bool is_out_of_data();
+    void de_init();
+    void stop_internal_thread();
+    LoaderModuleStatus update_output_image();
+    LoaderModuleStatus load_routine();
+    std::shared_ptr<Reader> _reader;
+    std::shared_ptr<RandomBBoxCrop_MetaDataReader> _randombboxcrop_meta_data_reader = nullptr;
+    Tensor* _output_tensor;
+    std::vector<std::string> _output_names;  //!< image name/ids that are stores in the _output_image
+    size_t _output_mem_size;
+    MetaDataBatch* _meta_data = nullptr;  //!< The output of the meta_data_graph,
+    std::vector<std::vector<float>> _bbox_coords;
+    bool _internal_thread_running;
+    size_t _batch_size;
+    size_t _image_size;
+    std::thread _load_thread;
+    RocalMemType _mem_type;
+    decoded_image_info _decoded_img_info;
+    crop_image_info _crop_image_info;
+    decoded_image_info _output_decoded_img_info;
+    crop_image_info _output_cropped_img_info;
+    CircularBuffer _circ_buff;
+    TimingDBG _file_load_time, _swap_handle_time;
+    bool _is_initialized;
+    bool _stopped = false;
+    bool _loop;                     //<! If true the reader will wrap around at the end of the media (files/images/...) and wouldn't stop
+    size_t _prefetch_queue_depth;   // Used for circular buffer's internal buffer
+    size_t _image_counter = 0;      //!< How many images have been loaded already
+    size_t _remaining_image_count;  //!< How many images are there yet to be loaded
+    bool _decoder_keep_original = false;
+    int _device_id;
+    std::vector<std::vector<unsigned>> _tensor_roi;
+};
diff --git a/rocAL/include/loaders/image/numpy_loader_sharded.h b/rocAL/include/loaders/image/numpy_loader_sharded.h
new file mode 100644
index 000000000..744cfc716
--- /dev/null
+++ b/rocAL/include/loaders/image/numpy_loader_sharded.h
@@ -0,0 +1,62 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "numpy_loader.h"
+//
+// NumpyLoaderSharded Can be used to run load and decode in multiple shards, each shard by a single loader instance,
+// It improves load and decode performance since each loader loads the images in parallel using an internal thread
+//
+class NumpyLoaderSharded : public LoaderModule {
+   public:
+    explicit NumpyLoaderSharded(void *dev_resources);
+    ~NumpyLoaderSharded() override;
+    LoaderModuleStatus load_next() override;
+    void initialize(ReaderConfig reader_cfg, DecoderConfig decoder_cfg, RocalMemType mem_type, unsigned batch_size, bool keep_orig_size = false) override;
+    void set_output(Tensor *output_image) override;
+    void set_random_bbox_data_reader(std::shared_ptr<RandomBBoxCrop_MetaDataReader> randombboxcrop_meta_data_reader) override;
+    size_t remaining_count() override;
+    void reset() override;
+    void start_loading() override;
+    std::vector<std::string> get_id() override;
+    decoded_image_info get_decode_image_info() override;
+    crop_image_info get_crop_image_info() override;
+    Timing timing() override;
+    void set_prefetch_queue_depth(size_t prefetch_queue_depth) override;
+    void shut_down() override;
+    void feed_external_input(const std::vector<std::string>& input_images_names, const std::vector<unsigned char*>& input_buffer,
+                             const std::vector<ROIxywh>& roi_xywh, unsigned int max_width, unsigned int max_height, unsigned int channels, ExternalSourceFileMode mode, bool eos) override {}
+
+   private:
+    void increment_loader_idx();
+    void *_dev_resources;
+    bool _initialized = false;
+    std::vector<std::shared_ptr<NumpyLoader>> _loaders;
+    size_t _loader_idx;
+    size_t _shard_count = 1;
+    void fast_forward_through_empty_loaders();
+    size_t _prefetch_queue_depth;
+
+    Tensor *_output_tensor;
+    std::shared_ptr<RandomBBoxCrop_MetaDataReader> _randombboxcrop_meta_data_reader = nullptr;
+};
\ No newline at end of file
diff --git a/rocAL/include/loaders/image_source_evaluator.h b/rocAL/include/loaders/image_source_evaluator.h
index 7f1cdd7b0..1b85deebe 100644
--- a/rocAL/include/loaders/image_source_evaluator.h
+++ b/rocAL/include/loaders/image_source_evaluator.h
@@ -41,10 +41,14 @@ enum class MaxSizeEvaluationPolicy {
 class ImageSourceEvaluator {
    public:
     ImageSourceEvaluatorStatus create(ReaderConfig reader_cfg, DecoderConfig decoder_cfg);
+    ImageSourceEvaluatorStatus create(ReaderConfig reader_cfg);
     void find_max_dimension();
+    void find_max_numpy_dimensions();
     void set_size_evaluation_policy(MaxSizeEvaluationPolicy arg);
     size_t max_width();
     size_t max_height();
+    std::vector<size_t> max_numpy_dims() { return _max_numpy_dims; };
+    RocalTensorDataType get_numpy_dtype() { return _numpy_dtype; };
 
    private:
     class FindMaxSize {
@@ -61,6 +65,9 @@ class ImageSourceEvaluator {
     };
     FindMaxSize _width_max;
     FindMaxSize _height_max;
+    FindMaxSize _channel_max;
+    std::vector<size_t> _max_numpy_dims;
+    RocalTensorDataType _numpy_dtype;
     DecoderConfig _decoder_cfg_cv;
     std::shared_ptr<Decoder> _decoder;
     std::shared_ptr<Reader> _reader;
diff --git a/rocAL/include/meta_data/meta_data_reader.h b/rocAL/include/meta_data/meta_data_reader.h
index 035fe5411..9a9410f93 100644
--- a/rocAL/include/meta_data/meta_data_reader.h
+++ b/rocAL/include/meta_data/meta_data_reader.h
@@ -60,7 +60,7 @@ struct MetaDataConfig {
     bool _aspect_ratio_grouping;
 
    public:
-    MetaDataConfig(const MetaDataType& type, const MetaDataReaderType& reader_type, const std::string& path, const std::map<std::string, std::string>& feature_key_map = std::map<std::string, std::string>(), const std::string file_prefix = std::string(), const unsigned& sequence_length = 3, const unsigned& frame_step = 3, const unsigned& frame_stride = 1)
+    MetaDataConfig(const MetaDataType& type, const MetaDataReaderType& reader_type, const std::string& path = std::string(), const std::map<std::string, std::string>& feature_key_map = std::map<std::string, std::string>(), const std::string file_prefix = std::string(), const unsigned& sequence_length = 3, const unsigned& frame_step = 3, const unsigned& frame_stride = 1)
         : _type(type), _reader_type(reader_type), _path(path), _feature_key_map(feature_key_map), _file_prefix(file_prefix), _sequence_length(sequence_length), _frame_step(frame_step), _frame_stride(frame_stride) {}
     MetaDataConfig() = delete;
     MetaDataType type() const { return _type; }
diff --git a/rocAL/include/parameters/parameter.h b/rocAL/include/parameters/parameter.h
index 723c3dbd7..1bec7b334 100644
--- a/rocAL/include/parameters/parameter.h
+++ b/rocAL/include/parameters/parameter.h
@@ -33,6 +33,10 @@ class Parameter {
     /// used to internally renew state of the parameter if needed (for random parameters)
     virtual void renew(){};
 
+    virtual void create_array(unsigned batch_size){};
+
+    virtual std::vector<T> get_array() { return {}; };
+
     virtual ~Parameter() {}
     ///
     /// \return returns if this parameter takes a single value (vs a range of values or many values)
diff --git a/rocAL/include/parameters/parameter_factory.h b/rocAL/include/parameters/parameter_factory.h
index ccd3b4d2c..582d51fb5 100644
--- a/rocAL/include/parameters/parameter_factory.h
+++ b/rocAL/include/parameters/parameter_factory.h
@@ -29,6 +29,8 @@ THE SOFTWARE.
 #include "parameter_random.h"
 #include "parameter_simple.h"
 
+const int MAX_SEEDS = 1024;
+
 enum class RocalParameterType {
     DETERMINISTIC = 0,
     RANDOM_UNIFORM,
@@ -72,6 +74,8 @@ class ParameterFactory {
     void set_seed(unsigned seed);
     unsigned get_seed();
     void generate_seed();
+    int64_t get_seed_from_seedsequence();
+    void increment_seed_sequence_idx();
 
     template <typename T>
     Parameter<T>* create_uniform_rand_param(T start, T end) {
@@ -104,4 +108,6 @@ class ParameterFactory {
     static ParameterFactory* _instance;
     static std::mutex _mutex;
     ParameterFactory();
+    std::vector<int64_t> _seed_vector;
+    int _seed_sequence_idx = 0;
 };
diff --git a/rocAL/include/parameters/parameter_random.h b/rocAL/include/parameters/parameter_random.h
index c379a894f..54414ae07 100644
--- a/rocAL/include/parameters/parameter_random.h
+++ b/rocAL/include/parameters/parameter_random.h
@@ -51,7 +51,12 @@ class UniformRand : public Parameter<T> {
     T get() override {
         return _updated_val;
     };
-    void renew() override {
+
+    std::vector<T> get_array() override {
+        return _array;
+    }
+
+    void renew_value() {
         std::unique_lock<std::mutex> lock(_lock);
         auto val = _generator();
 
@@ -64,6 +69,21 @@ class UniformRand : public Parameter<T> {
                 ((double)val / (double)_generator.max()) * ((double)_end - (double)_start) + (double)_start);
         }
     }
+
+    void renew_array() {
+        for (uint i = 0; i < _batch_size; i++) {
+            renew_value();
+            _array[i] = _updated_val;
+        }
+    }
+
+    void renew() override {
+        if (_array.size() > 0) {
+            renew_array();
+        } else {
+            renew_value();
+        }
+    }
     int update(T start, T end) {
         std::unique_lock<std::mutex> lock(_lock);
         if (end < start)
@@ -73,6 +93,13 @@ class UniformRand : public Parameter<T> {
         _end = end;
         return 0;
     }
+
+    void create_array(unsigned batch_size) override {
+        if (_array.size() == 0)
+            _array.resize(batch_size);
+        _batch_size = batch_size;
+    }
+
     bool single_value() const override {
         return (_start == _end);
     }
@@ -81,8 +108,10 @@ class UniformRand : public Parameter<T> {
     T _start;
     T _end;
     T _updated_val;
+    std::vector<T> _array;
     std::mt19937 _generator;
     std::mutex _lock;
+    unsigned _batch_size;
 };
 
 template <typename T>
@@ -142,7 +171,8 @@ struct CustomRand : public Parameter<T> {
     T default_value() const override {
         return static_cast<T>(_mean);
     }
-    void renew() override {
+
+    void renew_value() {
         std::unique_lock<std::mutex> lock(_lock);
         if (single_value()) {
             // If there is only a single value possible for the random variable
@@ -150,7 +180,7 @@ struct CustomRand : public Parameter<T> {
             _updated_val = _values[0];
         } else {
             // Generate a value between [0 1]
-            double rand_val = (double)_generator() / (double)_generator.max();
+            double rand_val = (double) _generator() / (double) _generator.max();
 
             // Find the iterators pointing to the first element bigger than idx
             auto it = std::upper_bound(_comltv_dist.begin(), _comltv_dist.end(), rand_val);
@@ -161,10 +191,35 @@ struct CustomRand : public Parameter<T> {
             _updated_val = _values[idx];
         }
     }
+
+    void renew_array() {
+        for (uint i = 0; i < _batch_size; i++) {
+            renew_value();
+            _array[i] = _updated_val;
+        }
+    }
+
+    void renew() override {
+        if (_array.size() > 0) {
+            renew_array();
+        } else {
+            renew_value();
+        }
+    }
     T get() override {
         return _updated_val;
     };
 
+    std::vector<T> get_array() override {
+        return _array;
+    }
+
+    void create_array(unsigned batch_size) override {
+        if (_array.size() == 0)
+            _array.resize(batch_size);
+        _batch_size = batch_size;
+    }
+
     bool single_value() const override {
         return (_values.size() == 1);
     }
@@ -175,6 +230,8 @@ struct CustomRand : public Parameter<T> {
     std::vector<double> _comltv_dist;  //!< commulative probabilities
     double _mean;
     T _updated_val;
+    std::vector<T> _array;
     std::mt19937 _generator;
     std::mutex _lock;
+    unsigned _batch_size;
 };
\ No newline at end of file
diff --git a/rocAL/include/parameters/parameter_simple.h b/rocAL/include/parameters/parameter_simple.h
index d3fb0dc3f..c1ee1d5a2 100644
--- a/rocAL/include/parameters/parameter_simple.h
+++ b/rocAL/include/parameters/parameter_simple.h
@@ -35,11 +35,37 @@ class SimpleParameter : public Parameter<T> {
     T get() override {
         return _val;
     }
-    int update(T new_val) {
+
+    std::vector<T> get_array() override {
+        return _array;
+    }
+
+    void update_single_value(T new_val) {
         _val = new_val;
+    }
+
+    void update_array(T new_val) {
+        for (uint i = 0; i < _batch_size; i++) {
+            update_single_value(new_val);
+            _array[i] = _val;
+        }
+    }
+
+    int update(T new_val) {
+        if (_array.size() > 0)
+            update_array(new_val);
+        else
+            update_single_value(new_val);
         return 0;
     }
 
+    void create_array(unsigned batch_size) override {
+        if (_array.size() == 0)
+            _array.resize(batch_size);
+        _batch_size = batch_size;
+        update(_val);
+    }
+
     ~SimpleParameter() = default;
 
     bool single_value() const override {
@@ -48,6 +74,8 @@ class SimpleParameter : public Parameter<T> {
 
    private:
     T _val;
+    std::vector<T> _array;
+    unsigned _batch_size;
 };
 using pIntParam = std::shared_ptr<SimpleParameter<int>>;
 using pFloatParam = std::shared_ptr<SimpleParameter<float>>;
diff --git a/rocAL/include/parameters/parameter_vx.h b/rocAL/include/parameters/parameter_vx.h
index e63da998f..e71cd48ee 100644
--- a/rocAL/include/parameters/parameter_vx.h
+++ b/rocAL/include/parameters/parameter_vx.h
@@ -52,11 +52,12 @@ class ParameterVX {
             THROW("Reading vx scalar failed" + TOSTR(status));
     }
     void create_array(std::shared_ptr<Graph> graph, vx_enum data_type, unsigned batch_size) {
-        // _arrVal = (T*)malloc(sizeof(T) * _batch_size);
         _batch_size = batch_size;
-        _arrVal.resize(_batch_size);
+        _param->create_array(_batch_size);
         _array = vxCreateArray(vxGetContext((vx_reference)graph->get()), data_type, _batch_size);
-        vxAddArrayItems(_array, _batch_size, _arrVal.data(), sizeof(T));
+        auto status = vxAddArrayItems(_array, _batch_size, get_array().data(), sizeof(T));
+        if (status != 0)
+            THROW(" vxAddArrayItems failed in create_array (ParameterVX): " + TOSTR(status))
         update_array();
     }
     void set_param(Parameter<T>* param) {
@@ -96,11 +97,7 @@ class ParameterVX {
     }
     void update_array() {
         vx_status status;
-        for (uint i = 0; i < _batch_size; i++) {
-            _arrVal[i] = renew();
-            // INFO("update_array: " + TOSTR(i) + "," + TOSTR(_arrVal[i]));
-        }
-        status = vxCopyArrayRange((vx_array)_array, 0, _batch_size, sizeof(T), _arrVal.data(), VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST);
+        status = vxCopyArrayRange((vx_array)_array, 0, _batch_size, sizeof(T), get_array().data(), VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST);
         if (status != 0)
             THROW(" vxCopyArrayRange failed in update_array (ParameterVX): " + TOSTR(status))
     }
@@ -109,12 +106,15 @@ class ParameterVX {
         return _param->get();
     }
 
+    std::vector<T> get_array() {
+        return _param->get_array();
+    }
+
    private:
     vx_scalar _scalar;
-    vx_array _array;
+    vx_array _array = nullptr;
     Parameter<T>* _param;
     T _val;
-    std::vector<T> _arrVal;
     unsigned _batch_size;
     unsigned OVX_PARAM_IDX;
     const T _DEFAULT_RANGE_START;
diff --git a/rocAL/include/pipeline/commons.h b/rocAL/include/pipeline/commons.h
index d3687b582..03bdd5f12 100644
--- a/rocAL/include/pipeline/commons.h
+++ b/rocAL/include/pipeline/commons.h
@@ -46,6 +46,8 @@ enum class RocalTensorlayout {
     NCHW,
     NFHWC,
     NFCHW,
+    NDHWC,
+    NCDHW,
     NONE
 };
 
@@ -105,6 +107,15 @@ enum class RocalROIType {
     XYWH
 };
 
+/*! \brief Tensor 3D ROI type
+ *
+ * currently supports following formats
+ */
+enum class Rocal3DROIType {
+    LTFRBB = 0,
+    XYZWHD
+};
+
 /*! \brief Tensor ROI in LTRB format
  *
  */
diff --git a/rocAL/include/pipeline/graph.h b/rocAL/include/pipeline/graph.h
index ca0553ead..dc02652f3 100644
--- a/rocAL/include/pipeline/graph.h
+++ b/rocAL/include/pipeline/graph.h
@@ -32,6 +32,8 @@ class Graph {
     Graph(vx_context context, RocalAffinity affinity, int cpu_id = 0, size_t cpu_num_threads = 1, int gpu_id = 0);
     Status verify();
     Status process();
+    Status schedule();
+    Status wait();
     Status release();
     vx_graph get() { return _graph; }
 
diff --git a/rocAL/include/pipeline/master_graph.h b/rocAL/include/pipeline/master_graph.h
index dfe663d96..d0cf50ab1 100644
--- a/rocAL/include/pipeline/master_graph.h
+++ b/rocAL/include/pipeline/master_graph.h
@@ -37,6 +37,8 @@ THE SOFTWARE.
 #include "node_image_loader_single_shard.h"
 #include "node_video_loader.h"
 #include "node_video_loader_single_shard.h"
+#include "node_numpy_loader.h"
+#include "node_numpy_loader_single_shard.h"
 #include "ring_buffer.h"
 #include "timing_debug.h"
 #if ENABLE_HIP
@@ -139,20 +141,41 @@ class MasterGraph {
                              RocalTensorlayout layout, bool eos);
     void set_external_source_reader_flag() { _external_source_reader = true; }
     size_t bounding_box_batch_count(pMetaDataBatch meta_data_batch);
+    Tensor* roi_random_crop(Tensor *input, Tensor *roi_start, Tensor *roi_end, int *crop_shape);
+    TensorList* random_object_bbox(Tensor *input, std::string output_format, int k_largest = -1, float foreground_prob=1.0);
+    void update_roi_random_crop();
+    void update_random_object_bbox();
+    void findLabels(const u_int8_t *input, std::set<int> &labels, std::vector<int> roi_size, std::vector<size_t> max_size);
+    void filterByLabel(const u_int8_t *input, std::vector<int> &output, std::vector<int> roi_size, std::vector<size_t> max_size, int label);
+    void labelRow(const int *label_base, const int *in_row, int *out_row, unsigned length);
+    int disjointGetGroup(const int &x) { return x; }
+    int disjointSetGroup(int &x, int new_id);
+    int disjointFind(int *items, int x);
+    int disjointMerge(int *items, int x, int y);
+    void mergeRow(int *label_base, const int *in1, const int *in2, int *out1, int *out2, unsigned n);
+    int labelMergeFunc(const u_int8_t *input, std::vector<int> &size, std::vector<size_t> &max_size, std::vector<int> &output_compact, std::mt19937 &rng);
+    bool hit(std::vector<unsigned>& hits, unsigned idx);
+    void get_label_boundingboxes(std::vector<std::vector<std::vector<unsigned>>> &boxes, std::vector<std::pair<unsigned, unsigned>> ranges, std::vector<unsigned> hits, int *in, std::vector<int> origin, unsigned width);
+    int pick_box(std::vector<std::vector<std::vector<unsigned>>> boxes, std::mt19937 &rng, int k_largest = -1);
 #if ENABLE_OPENCL
-    cl_command_queue get_ocl_cmd_q() { return _device.resources()->cmd_queue; }
+        cl_command_queue get_ocl_cmd_q() {
+        return _device.resources()->cmd_queue;
+    }
 #endif
    private:
     Status update_node_parameters();
     void create_single_graph();
+    void create_multiple_graphs();
     void start_processing();
     void stop_processing();
     void output_routine();
+    void output_routine_multiple_loaders();
     void decrease_image_count();
     /// notify_user_thread() is called when the internal processing thread is done with processing all available tensors
     void notify_user_thread();
     /// no_more_processed_data() is logically linked to the notify_user_thread() and is used to tell the user they've already consumed all the processed tensors
     bool no_more_processed_data();
+    bool is_out_of_data();
     RingBuffer _ring_buffer;                                                      //!< The queue that keeps the tensors that have benn processed by the internal thread (_output_thread) asynchronous to the user's thread
     pMetaDataBatch _augmented_meta_data = nullptr;                                //!< The output of the meta_data_graph,
     std::shared_ptr<CropCordBatch> _random_bbox_crop_cords_data = nullptr;
@@ -179,10 +202,12 @@ class MasterGraph {
     DeviceManager _device;                                                        //!< Keeps the device related constructs needed for running on GPU
 #endif
     std::shared_ptr<Graph> _graph = nullptr;
+    std::vector<std::shared_ptr<Graph>> _graphs;
     RocalAffinity _affinity;
     size_t _cpu_num_threads;                                                      //!< Defines the number of CPU threads used for processing
     const int _gpu_id;                                                            //!< Defines the device id used for processing
     pLoaderModule _loader_module;                                                 //!< Keeps the loader module used to feed the input the tensors of the graph
+    std::vector<pLoaderModule> _loader_modules;                                   //!< Keeps the list of loader modules used to feed the input the tensors of the graph
     TimingDBG _convert_time, _process_time, _bencode_time;
     const size_t _user_batch_size;                                                //!< Batch size provided by the user
     vx_context _context;
@@ -193,7 +218,7 @@ class MasterGraph {
     bool _first_run = true;
     bool _processing;                                                             //!< Indicates if internal processing thread should keep processing or not
     const static unsigned SAMPLE_SIZE = sizeof(unsigned char);
-    int _remaining_count;                                                         //!< Keeps the count of remaining tensors yet to be processed for the user,
+    int _remaining_count = INT_MAX;                                                         //!< Keeps the count of remaining tensors yet to be processed for the user,
     bool _loop;                                                                   //!< Indicates if user wants to indefinitely loops through tensors or not
     size_t _prefetch_queue_depth;
     bool _output_routine_finished_processing = false;
@@ -217,6 +242,23 @@ class MasterGraph {
     // box IoU matcher variables
     bool _is_box_iou_matcher = false;                                             // bool variable to set the box iou matcher
     BoxIouMatcherInfo _iou_matcher_info;
+    bool _is_roi_random_crop = false;
+    bool _is_random_object_bbox = false;
+    int *_crop_shape_batch = nullptr;
+    int *_roi_batch = nullptr;
+    Tensor *_roi_random_crop_tensor = nullptr;
+    Tensor *_roi_start_tensor = nullptr;
+    Tensor *_roi_end_tensor = nullptr;
+    Tensor *_random_object_bbox_label_tensor = nullptr;
+    Tensor *_random_object_bbox_box1_tensor = nullptr;
+    Tensor *_random_object_bbox_box2_tensor = nullptr;
+    void *_roi_random_crop_buf = nullptr;
+    void *_random_object_bbox_box1_buf = nullptr;
+    void *_random_object_bbox_box2_buf = nullptr;
+    TensorList _random_object_bbox_tensor_list;
+    std::string _random_object_bbox_output_format;
+    int _k_largest;
+    float _foreground_prob;
 #if ENABLE_HIP
     BoxEncoderGpu *_box_encoder_gpu = nullptr;
 #endif
@@ -258,15 +300,17 @@ std::shared_ptr<T> MasterGraph::meta_add_node(std::shared_ptr<M> node) {
  */
 template <>
 inline std::shared_ptr<ImageLoaderNode> MasterGraph::add_node(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
-    if (_loader_module)
-        THROW("A loader already exists, cannot have more than one loader")
+    // if (_loader_module)
+    //     THROW("A loader already exists, cannot have more than one loader")
 #if ENABLE_HIP || ENABLE_OPENCL
     auto node = std::make_shared<ImageLoaderNode>(outputs[0], (void *)_device.resources());
 #else
     auto node = std::make_shared<ImageLoaderNode>(outputs[0], nullptr);
 #endif
-    _loader_module = node->get_loader_module();
-    _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    auto loader_module = node->get_loader_module();
+    loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    _loader_modules.emplace_back(loader_module);
+    node->set_id(_loader_modules.size() - 1);
     _root_nodes.push_back(node);
     for (auto &output : outputs)
         _tensor_map.insert(std::make_pair(output, node));
@@ -276,15 +320,17 @@ inline std::shared_ptr<ImageLoaderNode> MasterGraph::add_node(const std::vector<
 
 template <>
 inline std::shared_ptr<ImageLoaderSingleShardNode> MasterGraph::add_node(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
-    if (_loader_module)
-        THROW("A loader already exists, cannot have more than one loader")
+    // if (_loader_module)
+    //     THROW("A loader already exists, cannot have more than one loader")
 #if ENABLE_HIP || ENABLE_OPENCL
     auto node = std::make_shared<ImageLoaderSingleShardNode>(outputs[0], (void *)_device.resources());
 #else
     auto node = std::make_shared<ImageLoaderSingleShardNode>(outputs[0], nullptr);
 #endif
-    _loader_module = node->get_loader_module();
-    _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    auto loader_module = node->get_loader_module();
+    loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    _loader_modules.emplace_back(loader_module);
+    node->set_id(_loader_modules.size() - 1);
     _root_nodes.push_back(node);
     for (auto &output : outputs)
         _tensor_map.insert(std::make_pair(output, node));
@@ -294,16 +340,18 @@ inline std::shared_ptr<ImageLoaderSingleShardNode> MasterGraph::add_node(const s
 
 template <>
 inline std::shared_ptr<FusedJpegCropNode> MasterGraph::add_node(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
-    if (_loader_module)
-        THROW("A loader already exists, cannot have more than one loader")
+    // if (_loader_module)
+    //     THROW("A loader already exists, cannot have more than one loader")
 #if ENABLE_HIP || ENABLE_OPENCL
     auto node = std::make_shared<FusedJpegCropNode>(outputs[0], (void *)_device.resources());
 #else
     auto node = std::make_shared<FusedJpegCropNode>(outputs[0], nullptr);
 #endif
-    _loader_module = node->get_loader_module();
-    _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
-    _loader_module->set_random_bbox_data_reader(_randombboxcrop_meta_data_reader);
+    auto loader_module = node->get_loader_module();
+    loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    loader_module->set_random_bbox_data_reader(_randombboxcrop_meta_data_reader);
+    _loader_modules.emplace_back(loader_module);
+    node->set_id(_loader_modules.size() - 1);
     _root_nodes.push_back(node);
     for (auto &output : outputs)
         _tensor_map.insert(std::make_pair(output, node));
@@ -313,16 +361,18 @@ inline std::shared_ptr<FusedJpegCropNode> MasterGraph::add_node(const std::vecto
 
 template <>
 inline std::shared_ptr<FusedJpegCropSingleShardNode> MasterGraph::add_node(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
-    if (_loader_module)
-        THROW("A loader already exists, cannot have more than one loader")
+    // if (_loader_module)
+    //     THROW("A loader already exists, cannot have more than one loader")
 #if ENABLE_HIP || ENABLE_OPENCL
     auto node = std::make_shared<FusedJpegCropSingleShardNode>(outputs[0], (void *)_device.resources());
 #else
     auto node = std::make_shared<FusedJpegCropSingleShardNode>(outputs[0], nullptr);
 #endif
-    _loader_module = node->get_loader_module();
-    _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
-    _loader_module->set_random_bbox_data_reader(_randombboxcrop_meta_data_reader);
+    auto loader_module = node->get_loader_module();
+    loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    loader_module->set_random_bbox_data_reader(_randombboxcrop_meta_data_reader);
+    _loader_modules.emplace_back(loader_module);
+    node->set_id(_loader_modules.size() - 1);
     _root_nodes.push_back(node);
     for (auto &output : outputs)
         _tensor_map.insert(std::make_pair(output, node));
@@ -335,15 +385,17 @@ inline std::shared_ptr<FusedJpegCropSingleShardNode> MasterGraph::add_node(const
  */
 template <>
 inline std::shared_ptr<Cifar10LoaderNode> MasterGraph::add_node(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
-    if (_loader_module)
-        THROW("A loader already exists, cannot have more than one loader")
+    // if (_loader_module)
+    //     THROW("A loader already exists, cannot have more than one loader")
 #if ENABLE_HIP || ENABLE_OPENCL
     auto node = std::make_shared<Cifar10LoaderNode>(outputs[0], (void *)_device.resources());
 #else
     auto node = std::make_shared<Cifar10LoaderNode>(outputs[0], nullptr);
 #endif
-    _loader_module = node->get_loader_module();
-    _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    auto loader_module = node->get_loader_module();
+    loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    _loader_modules.emplace_back(loader_module);
+    node->set_id(_loader_modules.size() - 1);
     _root_nodes.push_back(node);
     for (auto &output : outputs)
         _tensor_map.insert(std::make_pair(output, node));
@@ -356,15 +408,17 @@ inline std::shared_ptr<Cifar10LoaderNode> MasterGraph::add_node(const std::vecto
  */
 template <>
 inline std::shared_ptr<VideoLoaderNode> MasterGraph::add_node(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
-    if (_loader_module)
-        THROW("A loader already exists, cannot have more than one loader")
+    // if (_loader_module)
+    //     THROW("A loader already exists, cannot have more than one loader")
 #if ENABLE_HIP || ENABLE_OPENCL
     auto node = std::make_shared<VideoLoaderNode>(outputs[0], (void *)_device.resources());
 #else
     auto node = std::make_shared<VideoLoaderNode>(outputs[0], nullptr);
 #endif
-    _loader_module = node->get_loader_module();
-    _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    auto loader_module = node->get_loader_module();
+    loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    _loader_modules.emplace_back(loader_module);
+    node->set_id(_loader_modules.size() - 1);
     _root_nodes.push_back(node);
     for (auto &output : outputs)
         _tensor_map.insert(std::make_pair(output, node));
@@ -374,18 +428,60 @@ inline std::shared_ptr<VideoLoaderNode> MasterGraph::add_node(const std::vector<
 
 template <>
 inline std::shared_ptr<VideoLoaderSingleShardNode> MasterGraph::add_node(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
-    if (_loader_module)
-        THROW("A loader already exists, cannot have more than one loader")
+    // if (_loader_module)
+    //     THROW("A loader already exists, cannot have more than one loader")
 #if ENABLE_HIP || ENABLE_OPENCL
     auto node = std::make_shared<VideoLoaderSingleShardNode>(outputs[0], (void *)_device.resources());
 #else
     auto node = std::make_shared<VideoLoaderSingleShardNode>(outputs[0], nullptr);
 #endif
-    _loader_module = node->get_loader_module();
-    _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    auto loader_module = node->get_loader_module();
+    loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    _loader_modules.emplace_back(loader_module);
+    node->set_id(_loader_modules.size() - 1);
+    _root_nodes.push_back(node);
+    for (auto &output : outputs)
+        _tensor_map.insert(std::make_pair(output, node));
+
+    return node;
+}
+
+template <>
+inline std::shared_ptr<NumpyLoaderNode> MasterGraph::add_node(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
+    // if (_loader_module)
+    //     THROW("A loader already exists, cannot have more than one loader")
+#if ENABLE_HIP || ENABLE_OPENCL
+    auto node = std::make_shared<NumpyLoaderNode>(outputs[0], (void *)_device.resources());
+#else
+    auto node = std::make_shared<NumpyLoaderNode>(outputs[0], nullptr);
+#endif
+    auto loader_module = node->get_loader_module();
+    loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    _loader_modules.emplace_back(loader_module);
+    node->set_id(_loader_modules.size() - 1);
     _root_nodes.push_back(node);
     for (auto &output : outputs)
         _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
+
+template <>
+inline std::shared_ptr<NumpyLoaderSingleShardNode> MasterGraph::add_node(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
+    // if (_loader_module)
+    //     THROW("A loader already exists, cannot have more than one loader")
+#if ENABLE_HIP || ENABLE_OPENCL
+    auto node = std::make_shared<NumpyLoaderSingleShardNode>(outputs[0], (void *)_device.resources());
+#else
+    auto node = std::make_shared<NumpyLoaderSingleShardNode>(outputs[0], nullptr);
+#endif
+    auto loader_module = node->get_loader_module();
+    loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    _loader_modules.emplace_back(loader_module);
+    node->set_id(_loader_modules.size() - 1);
+    _root_nodes.push_back(node);
+    for (auto &output : outputs)
+        _tensor_map.insert(std::make_pair(output, node));
+
+    return node;
+}
\ No newline at end of file
diff --git a/rocAL/include/pipeline/node.h b/rocAL/include/pipeline/node.h
index 9a729879f..b192b06ad 100644
--- a/rocAL/include/pipeline/node.h
+++ b/rocAL/include/pipeline/node.h
@@ -37,13 +37,15 @@ class Node {
     void update_parameters();
     std::vector<Tensor *> input() { return _inputs; };
     std::vector<Tensor *> output() { return _outputs; };
-    void add_next(const std::shared_ptr<Node> &node) {}      // To be implemented
-    void add_previous(const std::shared_ptr<Node> &node) {}  // To be implemented
+    void add_next(const std::shared_ptr<Node> &node);    // To be implemented
+    void add_previous(const std::shared_ptr<Node> &node); // To be implemented
     std::shared_ptr<Graph> graph() { return _graph; }
     void set_meta_data(pMetaDataBatch meta_data_info) { _meta_data_info = meta_data_info; }
     bool _is_ssd = false;
     const Roi2DCords *get_src_roi() { return _inputs[0]->info().roi().get_2D_roi(); }
     const Roi2DCords *get_dst_roi() { return _outputs[0]->info().roi().get_2D_roi(); }
+    void set_id(int id) { _graph_id = id; }
+    int get_id() { return _graph_id; }
 
    protected:
     virtual void create_node() = 0;
@@ -54,4 +56,7 @@ class Node {
     vx_node _node = nullptr;
     size_t _batch_size;
     pMetaDataBatch _meta_data_info;
+    std::vector<std::shared_ptr<Node>> _next;
+    std::vector<std::shared_ptr<Node>> _prev;
+    int _graph_id = -1;
 };
diff --git a/rocAL/include/pipeline/tensor.h b/rocAL/include/pipeline/tensor.h
index 63a639c6b..9c300702b 100644
--- a/rocAL/include/pipeline/tensor.h
+++ b/rocAL/include/pipeline/tensor.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -150,6 +150,10 @@ class TensorInfo {
             dims_mapping = {0, 1, 4, 2, 3};
         } else if (input_layout == RocalTensorlayout::NFCHW && output_layout == RocalTensorlayout::NFHWC) {
             dims_mapping = {0, 1, 3, 4, 2};
+        } else if (input_layout == RocalTensorlayout::NCDHW && output_layout == RocalTensorlayout::NDHWC) {
+            dims_mapping = {0, 2, 3, 4, 1};
+        } else if (input_layout == RocalTensorlayout::NDHWC && output_layout == RocalTensorlayout::NCDHW) {
+            dims_mapping = {0, 4, 1, 2, 3};
         } else {
             THROW("Invalid layout conversion")
         }
@@ -177,9 +181,19 @@ class TensorInfo {
                 _max_shape[0] = _dims.at(4);
                 _max_shape[1] = _dims.at(3);
                 _channels = _dims.at(2);
+            } else if (_layout == RocalTensorlayout::NDHWC) {
+                _is_image = false;
+                _max_shape.resize(4);
+                _max_shape.assign(_dims.begin() + 1, _dims.end());
+                _channels = _dims.at(4);
+            } else if (_layout == RocalTensorlayout::NCDHW) {
+                _is_image = false;
+                _max_shape.resize(4);
+                _max_shape.assign(_dims.begin() + 1, _dims.end());
+                _channels = _dims.at(1);
             }
-        } else {                                                             // For other tensors
-            if (!_max_shape.size()) _max_shape.resize(_num_of_dims - 1, 0);  // Since 2 values will be stored in the vector
+        } else {
+            if (!_max_shape.size()) _max_shape.resize(_num_of_dims - 1, 0);
             _max_shape.assign(_dims.begin() + 1, _dims.end());
         }
         reset_tensor_roi_buffers();
@@ -193,6 +207,11 @@ class TensorInfo {
             modify_strides();
         }
         _layout = layout;
+        if (_layout == RocalTensorlayout::NHWC || _layout == RocalTensorlayout::NDHWC) {
+            _channels = _dims.back();
+        } else if (_layout == RocalTensorlayout::NCHW || _layout == RocalTensorlayout::NCDHW) {
+            _channels = _dims.at(1);
+        }
     }
     void set_dims(std::vector<size_t>& new_dims) {
         if (_num_of_dims == new_dims.size()) {
@@ -231,6 +250,32 @@ class TensorInfo {
         set_tensor_layout(layout);            // Modify the layout and dims based on the layout input
         reset_tensor_roi_buffers();           // Reset ROI buffers to reflect the modified width and height
     }
+    void modify_dims(RocalTensorlayout layout, std::vector<int> new_dims) {
+        switch (_layout) {
+            case RocalTensorlayout::NHWC:
+            case RocalTensorlayout::NCHW: {
+                _max_shape[0] = _dims[1] = new_dims[0];
+                _max_shape[1] = _dims[2] = new_dims[1];
+                _max_shape[2] = _dims[3] = new_dims[2];
+                break;
+            }
+            case RocalTensorlayout::NDHWC:
+            case RocalTensorlayout::NCDHW: {
+                _max_shape[0] = _dims[1] = new_dims[0];
+                _max_shape[1] = _dims[2] = new_dims[1];
+                _max_shape[2] = _dims[3] = new_dims[2];
+                _max_shape[3] = _dims[4] = new_dims[3];
+                break;
+            }
+            default: {
+                THROW("Invalid layout type specified")
+            }
+        }
+        modify_strides();
+        _data_size = _strides[0] * _dims[0];  // Modify data size wrt latest width and height
+        set_tensor_layout(layout);            // Modify the layout and dims based on the layout input
+        reset_tensor_roi_buffers();           // Reset ROI buffers to reflect the modified width and height
+    }
     void modify_strides() {
         _strides[_num_of_dims - 1] = _data_type_size;
         for (int i = _num_of_dims - 2; i >= 0; i--) {
@@ -335,9 +380,11 @@ class Tensor : public rocalTensor {
     // create_from_handle() no internal memory allocation is done here since
     // tensor's handle should be swapped with external buffers before usage
     int create_from_handle(vx_context context);
+    int create_from_ptr(vx_context context, void *ptr);
     int create_virtual(vx_context context, vx_graph graph);
     bool is_handle_set() { return (_vx_handle != 0); }
     void set_dims(std::vector<size_t> dims) override { _info.set_dims(dims); }
+    void set_layout(RocalTensorlayout layout) { _info.set_tensor_layout(layout); }
     unsigned num_of_dims() override { return _info.num_of_dims(); }
     unsigned batch_size() override { return _info.batch_size(); }
     std::vector<size_t> dims() override { return _info.dims(); }
diff --git a/rocAL/include/readers/image/image_reader.h b/rocAL/include/readers/image/image_reader.h
index d7f13b4d6..1e2491a5c 100644
--- a/rocAL/include/readers/image/image_reader.h
+++ b/rocAL/include/readers/image/image_reader.h
@@ -29,6 +29,7 @@ THE SOFTWARE.
 #include <lmdb.h>
 #include "meta_data_reader.h"
 #include "video_properties.h"
+#include "tensor.h"
 
 #define CHECK_LMDB_RETURN_STATUS(status)                                                          \
     do {                                                                                          \
@@ -48,6 +49,7 @@ enum class StorageType {
     MXNET_RECORDIO = 7,
     VIDEO_FILE_SYSTEM = 8,
     EXTERNAL_FILE_SOURCE = 9,      // to support reading from external source
+    NUMPY_DATA = 10
 };
 
 enum class ExternalSourceFileMode {
@@ -80,6 +82,8 @@ struct ReaderConfig {
     void set_frame_step(unsigned step) { _sequence_frame_step = step; }
     void set_frame_stride(unsigned stride) { _sequence_frame_stride = stride; }
     void set_external_filemode(ExternalSourceFileMode mode) { _file_mode = mode; }
+    void set_files(const std::vector<std::string> &files) { _files = files; }
+    void set_seed(unsigned seed) { _seed = seed; }
     size_t get_shard_count() { return _shard_count; }
     size_t get_shard_id() { return _shard_id; }
     size_t get_cpu_num_threads() { return _cpu_num_threads; }
@@ -87,7 +91,9 @@ struct ReaderConfig {
     size_t get_sequence_length() { return _sequence_length; }
     size_t get_frame_step() { return _sequence_frame_step; }
     size_t get_frame_stride() { return _sequence_frame_stride; }
+    std::vector<std::string> get_files() { return _files; }
     std::string path() { return _path; }
+    unsigned seed() { return _seed; }
 #ifdef ROCAL_VIDEO
     void set_video_properties(VideoProperties video_prop) { _video_prop = video_prop; }
     VideoProperties get_video_properties() { return _video_prop; }
@@ -116,6 +122,8 @@ struct ReaderConfig {
     std::string _file_prefix = "";  //!< to read only files with prefix. supported only for cifar10_data_reader and tf_record_reader
     std::shared_ptr<MetaDataReader> _meta_data_reader = nullptr;
     ExternalSourceFileMode _file_mode = ExternalSourceFileMode::NONE;
+    std::vector<std::string> _files;
+    unsigned _seed = 0;
 #ifdef ROCAL_VIDEO
     VideoProperties _video_prop;
 #endif
@@ -132,6 +140,25 @@ struct ImageRecordIOHeader {
                            */
 };
 
+struct NumpyHeaderData {
+    public:
+    std::vector<unsigned> _shape;
+    RocalTensorDataType _type_info;
+    bool _fortran_order = false;
+    int64_t _data_offset = 0;
+
+    RocalTensorDataType type() const { return _type_info; };
+
+    size_t size() const {
+        size_t num_elements = 1;
+        for (const auto& dim: _shape)
+            num_elements *= dim;
+        return num_elements;
+    };
+
+    size_t nbytes() const { return tensor_data_size(_type_info) * size(); }
+    std::vector<unsigned> shape() const { return _shape; }
+};
 
 class Reader {
    public:
@@ -162,6 +189,10 @@ class Reader {
     //! Copies the data of the opened item to the buf
     virtual size_t read_data(unsigned char *buf, size_t read_size) = 0;
 
+    virtual const NumpyHeaderData get_numpy_header_data() { return {}; }
+
+    virtual size_t read_numpy_data(void *buf, size_t read_size, std::vector<size_t> max_shape) { return 0; }
+
     //! Closes the opened item
     virtual int close() = 0;
 
diff --git a/rocAL/include/readers/image/numpy_data_reader.h b/rocAL/include/readers/image/numpy_data_reader.h
new file mode 100644
index 000000000..48115c165
--- /dev/null
+++ b/rocAL/include/readers/image/numpy_data_reader.h
@@ -0,0 +1,130 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include <dirent.h>
+
+#include <memory>
+#include <mutex>
+#include <string>
+#include <vector>
+#include <mutex>
+
+#include "commons.h"
+#include "image_reader.h"
+#include "timing_debug.h"
+
+class NumpyDataReader : public Reader {
+   public:
+    //! Looks up the folder which contains the files, amd loads the image names
+    /*!
+     \param desc  User provided descriptor containing the files' path.
+    */
+    Reader::Status initialize(ReaderConfig desc) override;
+    //! Reads the next resource item
+    /*!
+     \param buf User's provided buffer to receive the loaded images
+     \return Size of the loaded resource
+    */
+    size_t read_data(unsigned char* buf, size_t max_size) override;
+    //! Opens the next file in the folder
+    /*!
+     \return The size of the next file, 0 if couldn't access it
+    */
+    size_t open() override;
+
+    const NumpyHeaderData get_numpy_header_data() override;
+
+    size_t read_numpy_data(void* buf, size_t read_size, std::vector<size_t> max_shape) override;
+
+    //! Resets the object's state to read from the first file in the folder
+    void reset() override;
+
+    //! Returns the name of the latest file opened
+    std::string id() override { return _last_id; };
+
+    unsigned count_items() override;
+
+    ~NumpyDataReader() override;
+
+    int close() override;
+
+    NumpyDataReader();
+
+   private:
+    //! opens the folder containnig the images
+    Reader::Status open_folder();
+    Reader::Status subfolder_reading();
+    std::string _folder_path;
+    DIR* _src_dir;
+    DIR* _sub_dir;
+    struct dirent* _entity;
+    std::vector<std::string> _file_names;
+    std::vector<std::string> _files;
+    std::vector<NumpyHeaderData> _file_headers;
+    unsigned _curr_file_idx;
+    FILE* _current_fPtr;
+    unsigned _current_file_size;
+    NumpyHeaderData _curr_file_header;
+    std::string _last_id;
+    std::string _last_file_name;
+    size_t _shard_id = 0;
+    size_t _shard_count = 1;  // equivalent of batch size
+    //!< _batch_count Defines the quantum count of the images to be read. It's usually equal to the user's batch size.
+    /// The loader will repeat images if necessary to be able to have images available in multiples of the load_batch_count,
+    /// for instance if there are 10 images in the dataset and _batch_count is 3, the loader repeats 2 images as if there are 12 images available.
+    size_t _batch_count = 1;
+    size_t _file_id = 0;
+    size_t _in_batch_read_count = 0;
+    bool _loop;
+    bool _shuffle;
+    int _read_counter = 0;
+    unsigned _seed = 0;
+    //!< _file_count_all_shards total_number of files in to figure out the max_batch_size (usually needed for distributed training).
+    size_t _file_count_all_shards;
+    std::mutex _cache_mutex_;
+    std::map<std::string, NumpyHeaderData> _header_cache_;
+    const RocalTensorDataType get_numpy_dtype(const std::string& format);
+    inline void ignore_spaces(const char*& ptr);
+    void decode_header(NumpyHeaderData& target, const std::string& header);
+    template <size_t N>
+    void skip_string(const char*& ptr, const char (&what)[N]);
+    template <size_t N>
+    bool check_and_skip_string(const char*& ptr, const char (&what)[N]);
+    template <size_t N>
+    void skip_field(const char*& ptr, const char (&name)[N]);
+    template <typename T = int64_t>
+    T parse_int(const char*& ptr);
+    std::string read_dtype_string(const char*& input, char delim_start = '\'', char delim_end = '\'');
+    void read_header(NumpyHeaderData& parsed_header, std::string file_path);
+    template <typename T>
+    size_t copy_array_data(T* buf, std::vector<unsigned> strides, std::vector<unsigned> shapes, unsigned dim = 0);
+    bool get_cached_header(const std::string& file_name, NumpyHeaderData& target);
+    void update_header_cache(const std::string& file_name, const NumpyHeaderData& value);
+    void incremenet_read_ptr();
+    int release();
+    size_t get_file_shard_id();
+    void incremenet_file_id() { _file_id++; }
+    void replicate_last_image_to_fill_last_shard();
+    void replicate_last_batch_to_pad_partial_shard();
+    TimingDBG _shuffle_time;
+};
diff --git a/rocAL/source/api/rocal_api_augmentation.cpp b/rocAL/source/api/rocal_api_augmentation.cpp
index 33fb5b57a..c740eadc5 100644
--- a/rocAL/source/api/rocal_api_augmentation.cpp
+++ b/rocAL/source/api/rocal_api_augmentation.cpp
@@ -646,6 +646,7 @@ rocalBrightness(
     bool is_output,
     RocalFloatParam p_alpha,
     RocalFloatParam p_beta,
+    RocalIntParam conditional_execution,
     RocalTensorLayout output_layout,
     RocalTensorOutputType output_datatype) {
     Tensor* output = nullptr;
@@ -658,6 +659,7 @@ rocalBrightness(
     auto input = static_cast<Tensor*>(p_input);
     auto alpha = static_cast<FloatParam*>(p_alpha);
     auto beta = static_cast<FloatParam*>(p_beta);
+    auto conditional_execution_value = static_cast<IntParam*>(conditional_execution);
     try {
         RocalTensorlayout op_tensor_layout = static_cast<RocalTensorlayout>(output_layout);
         RocalTensorDataType op_tensor_datatype = static_cast<RocalTensorDataType>(output_datatype);
@@ -665,7 +667,7 @@ rocalBrightness(
         output_info.set_tensor_layout(op_tensor_layout);
         output_info.set_data_type(op_tensor_datatype);
         output = context->master_graph->create_tensor(output_info, is_output);
-        context->master_graph->add_node<BrightnessNode>({input}, {output})->init(alpha, beta);
+        context->master_graph->add_node<BrightnessNode>({input}, {output})->init(alpha, beta, conditional_execution_value);
     } catch (const std::exception& e) {
         context->capture_error(e.what());
         ERR(e.what())
@@ -680,6 +682,7 @@ rocalBrightnessFixed(
     float alpha,
     float beta,
     bool is_output,
+    int conditional_execution,
     RocalTensorLayout output_layout,
     RocalTensorOutputType output_datatype) {
     Tensor* output = nullptr;
@@ -697,7 +700,7 @@ rocalBrightnessFixed(
         output_info.set_tensor_layout(op_tensor_layout);
         output_info.set_data_type(op_tensor_datatype);
         output = context->master_graph->create_tensor(output_info, is_output);
-        context->master_graph->add_node<BrightnessNode>({input}, {output})->init(alpha, beta);
+        context->master_graph->add_node<BrightnessNode>({input}, {output})->init(alpha, beta, conditional_execution);
     } catch (const std::exception& e) {
         context->capture_error(e.what());
         ERR(e.what())
@@ -1155,6 +1158,110 @@ rocalSnPNoiseFixed(
     return output;
 }
 
+RocalTensor ROCAL_API_CALL
+rocalGaussianNoise(
+    RocalContext p_context,
+    RocalTensor p_input,
+    bool is_output,
+    RocalFloatParam mean,
+    RocalFloatParam std_dev,
+    int seed,
+    RocalIntParam conditional_execution,
+    RocalTensorLayout output_layout,
+    RocalTensorOutputType output_datatype) {
+    Tensor* output = nullptr;
+    if ((p_context == nullptr) || (p_input == nullptr)) {
+        ERR("Invalid ROCAL context or invalid input tensor")
+        return output;
+    }
+
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    auto mean_value = static_cast<FloatParam*>(mean);
+    auto stddev_value = static_cast<FloatParam*>(std_dev);
+    auto conditional_execution_value = static_cast<IntParam*>(conditional_execution);
+    try {
+        RocalTensorlayout op_tensor_layout = static_cast<RocalTensorlayout>(output_layout);
+        RocalTensorDataType op_tensor_datatype = static_cast<RocalTensorDataType>(output_datatype);
+        TensorInfo output_info = input->info();
+        output_info.set_tensor_layout(op_tensor_layout);
+        output_info.set_data_type(op_tensor_datatype);
+        output = context->master_graph->create_tensor(output_info, is_output);
+        context->master_graph->add_node<GaussianNoiseNode>({input}, {output})->init(mean_value, stddev_value, seed, conditional_execution_value);
+    } catch (const std::exception& e) {
+        context->capture_error(e.what());
+        ERR(e.what())
+    }
+    return output;
+}
+
+RocalTensor ROCAL_API_CALL
+rocalGaussianNoiseFixed(
+    RocalContext p_context,
+    RocalTensor p_input,
+    float mean,
+    float std_dev,
+    bool is_output,
+    int seed,
+    int conditional_execution,
+    RocalTensorLayout output_layout,
+    RocalTensorOutputType output_datatype) {
+    Tensor* output = nullptr;
+    if ((p_context == nullptr) || (p_input == nullptr)) {
+        ERR("Invalid ROCAL context or invalid input tensor")
+        return output;
+    }
+
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    try {
+        RocalTensorlayout op_tensor_layout = static_cast<RocalTensorlayout>(output_layout);
+        RocalTensorDataType op_tensor_datatype = static_cast<RocalTensorDataType>(output_datatype);
+        TensorInfo output_info = input->info();
+        output_info.set_tensor_layout(op_tensor_layout);
+        output_info.set_data_type(op_tensor_datatype);
+        output = context->master_graph->create_tensor(output_info, is_output);
+        context->master_graph->add_node<GaussianNoiseNode>({input}, {output})->init(mean, std_dev, seed, conditional_execution);
+    } catch (const std::exception& e) {
+        context->capture_error(e.what());
+        ERR(e.what())
+    }
+    return output;
+}
+
+RocalTensor ROCAL_API_CALL
+rocalSlice(
+    RocalContext p_context,
+    RocalTensor p_input,
+    bool is_output,
+    RocalTensor anchor_tensor,
+    std::vector<int> shape,
+    std::vector<float> fill_values,
+    RocalOutOfBoundsPolicy policy,
+    RocalTensorLayout output_layout,
+    RocalTensorOutputType output_datatype) {
+    Tensor* output = nullptr;
+    if ((p_context == nullptr) || (p_input == nullptr))
+        ERR("Invalid ROCAL context or invalid input tensor")
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    auto anchor = static_cast<Tensor*>(anchor_tensor);
+    try {
+        RocalTensorlayout op_tensor_layout = static_cast<RocalTensorlayout>(output_layout);
+        RocalTensorDataType op_tensor_datatype = static_cast<RocalTensorDataType>(output_datatype);
+        TensorInfo output_info = input->info();
+        output_info.set_tensor_layout(op_tensor_layout);
+        output_info.set_data_type(op_tensor_datatype);
+        output_info.modify_dims(op_tensor_layout, shape);
+        output = context->master_graph->create_tensor(output_info, is_output);
+        context->master_graph->add_node<SliceNode>({input}, {output})->init(anchor, shape, fill_values, policy);
+    } catch (const std::exception& e) {
+        context->capture_error(e.what());
+        ERR(e.what())
+    }
+    return output;
+}
+
 RocalTensor ROCAL_API_CALL
 rocalFlip(
     RocalContext p_context,
@@ -1162,6 +1269,7 @@ rocalFlip(
     bool is_output,
     RocalIntParam p_horizontal_flag,
     RocalIntParam p_vertical_flag,
+    RocalIntParam p_depth_flag,
     RocalTensorLayout output_layout,
     RocalTensorOutputType output_datatype) {
     Tensor* output = nullptr;
@@ -1173,6 +1281,7 @@ rocalFlip(
     auto input = static_cast<Tensor*>(p_input);
     auto horizontal_flag = static_cast<IntParam*>(p_horizontal_flag);
     auto vertical_flag = static_cast<IntParam*>(p_vertical_flag);
+    auto depth_flag = static_cast<IntParam*>(p_depth_flag);
     try {
         RocalTensorlayout op_tensor_layout = static_cast<RocalTensorlayout>(output_layout);
         RocalTensorDataType op_tensor_datatype = static_cast<RocalTensorDataType>(output_datatype);
@@ -1181,7 +1290,7 @@ rocalFlip(
         output_info.set_data_type(op_tensor_datatype);
         output = context->master_graph->create_tensor(output_info, is_output);
         std::shared_ptr<FlipNode> flip_node = context->master_graph->add_node<FlipNode>({input}, {output});
-        flip_node->init(horizontal_flag, vertical_flag);
+        flip_node->init(horizontal_flag, vertical_flag, depth_flag);
         if (context->master_graph->meta_data_graph())
             context->master_graph->meta_add_node<FlipMetaNode, FlipNode>(flip_node);
     } catch (const std::exception& e) {
@@ -1197,6 +1306,7 @@ rocalFlipFixed(
     RocalTensor p_input,
     int horizontal_flag,
     int vertical_flag,
+    int depth_flag,
     bool is_output,
     RocalTensorLayout output_layout,
     RocalTensorOutputType output_datatype) {
@@ -1215,7 +1325,7 @@ rocalFlipFixed(
         output_info.set_data_type(op_tensor_datatype);
         output = context->master_graph->create_tensor(output_info, is_output);
         std::shared_ptr<FlipNode> flip_node = context->master_graph->add_node<FlipNode>({input}, {output});
-        flip_node->init(horizontal_flag, vertical_flag);
+        flip_node->init(horizontal_flag, vertical_flag, depth_flag);
         if (context->master_graph->meta_data_graph())
             context->master_graph->meta_add_node<FlipMetaNode, FlipNode>(flip_node);
     } catch (const std::exception& e) {
@@ -2155,3 +2265,26 @@ rocalNop(
     }
     return output;
 }
+
+RocalTensor ROCAL_API_CALL
+rocalSetLayout(
+    RocalContext p_context,
+    RocalTensor p_input,
+    RocalTensorLayout output_layout) {
+    Tensor* output = nullptr;
+    if ((p_context == nullptr) || (p_input == nullptr)) {
+        ERR("Invalid ROCAL context or invalid input tensor")
+        return output;
+    }
+
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    try {
+        RocalTensorlayout op_tensor_layout = static_cast<RocalTensorlayout>(output_layout);
+        input->set_layout(op_tensor_layout);
+    } catch (const std::exception& e) {
+        context->capture_error(e.what());
+        ERR(e.what())
+    }
+    return input;
+}
diff --git a/rocAL/source/api/rocal_api_data_loaders.cpp b/rocAL/source/api/rocal_api_data_loaders.cpp
index e28fdc290..51bc92892 100644
--- a/rocAL/source/api/rocal_api_data_loaders.cpp
+++ b/rocAL/source/api/rocal_api_data_loaders.cpp
@@ -34,6 +34,8 @@ THE SOFTWARE.
 #include "node_fused_jpeg_crop_single_shard.h"
 #include "node_image_loader.h"
 #include "node_image_loader_single_shard.h"
+#include "node_numpy_loader.h"
+#include "node_numpy_loader_single_shard.h"
 #include "node_resize.h"
 #include "rocal_api.h"
 
@@ -65,6 +67,38 @@ evaluate_image_data_set(RocalImageSizeEvaluationPolicy decode_size_policy, Stora
     return std::make_tuple(max_width, max_height);
 };
 
+std::vector<size_t>
+evaluate_numpy_data_set(RocalImageSizeEvaluationPolicy decode_size_policy, StorageType storage_type,
+                        DecoderType decoder_type, const std::string &source_path, const std::vector<std::string> &files)
+{
+    auto translate_image_size_policy = [](RocalImageSizeEvaluationPolicy decode_size_policy)
+    {
+        switch(decode_size_policy)
+        {
+            case ROCAL_USE_MAX_SIZE:
+            case ROCAL_USE_MAX_SIZE_RESTRICTED:
+                return MaxSizeEvaluationPolicy::MAXIMUM_FOUND_SIZE;
+            case ROCAL_USE_MOST_FREQUENT_SIZE:
+                return MaxSizeEvaluationPolicy::MOST_FREQUENT_SIZE;
+            default:
+                return MaxSizeEvaluationPolicy::MAXIMUM_FOUND_SIZE;
+        }
+    };
+
+    ImageSourceEvaluator source_evaluator;
+    source_evaluator.set_size_evaluation_policy(translate_image_size_policy(decode_size_policy));
+    auto reader_cfg = ReaderConfig(storage_type, source_path);
+    if (!files.empty())
+        reader_cfg.set_files(files);
+    if (source_evaluator.create(reader_cfg) != ImageSourceEvaluatorStatus::OK)
+        THROW("Initializing file source input evaluator failed ")
+    auto max_dims = source_evaluator.max_numpy_dims();
+    int data_type = (int)source_evaluator.get_numpy_dtype();
+    max_dims.push_back(data_type);
+
+    return max_dims;
+};
+
 auto convert_color_format = [](RocalImageColor color_format, size_t n, size_t h, size_t w) {
     switch (color_format) {
         case ROCAL_COLOR_RGB24: {
@@ -297,7 +331,7 @@ rocalSequenceReader(
         output = context->master_graph->create_loader_output_tensor(info);
         auto cpu_num_threads = context->master_graph->calculate_cpu_num_threads(1);
 
-        context->master_graph->add_node<ImageLoaderNode>({}, {output})->init(internal_shard_count, cpu_num_threads, source_path, "", std::map<std::string, std::string>(), StorageType::SEQUENCE_FILE_SYSTEM, DecoderType::TURBO_JPEG, shuffle, loop, context->master_graph->sequence_batch_size(), context->master_graph->mem_type(), context->master_graph->meta_data_reader(), decoder_keep_original, "", sequence_length, step, stride);
+        context->master_graph->add_node<ImageLoaderNode>({}, {output})->init(internal_shard_count, cpu_num_threads, source_path, "", std::map<std::string, std::string>(), StorageType::SEQUENCE_FILE_SYSTEM, DecoderType::TURBO_JPEG, shuffle, loop, context->master_graph->sequence_batch_size(), context->master_graph->mem_type(), context->master_graph->meta_data_reader(), decoder_keep_original, "", sequence_length, step, stride, ExternalSourceFileMode::NONE);
         context->master_graph->set_loop(loop);
 
         if (is_output) {
@@ -367,7 +401,7 @@ rocalSequenceReaderSingleShard(
         output = context->master_graph->create_loader_output_tensor(info);
         auto cpu_num_threads = context->master_graph->calculate_cpu_num_threads(shard_count);
 
-        context->master_graph->add_node<ImageLoaderSingleShardNode>({}, {output})->init(shard_id, shard_count, cpu_num_threads, source_path, "", StorageType::SEQUENCE_FILE_SYSTEM, DecoderType::TURBO_JPEG, shuffle, loop, context->master_graph->sequence_batch_size(), context->master_graph->mem_type(), context->master_graph->meta_data_reader(), decoder_keep_original, std::map<std::string, std::string>(), sequence_length, step, stride);
+        context->master_graph->add_node<ImageLoaderSingleShardNode>({}, {output})->init(shard_id, shard_count, cpu_num_threads, source_path, "", StorageType::SEQUENCE_FILE_SYSTEM, DecoderType::TURBO_JPEG, shuffle, loop, context->master_graph->sequence_batch_size(), context->master_graph->mem_type(), context->master_graph->meta_data_reader(), decoder_keep_original, std::map<std::string, std::string>(), sequence_length, step, stride, ExternalSourceFileMode::NONE);
         context->master_graph->set_loop(loop);
 
         if (is_output) {
@@ -1596,6 +1630,129 @@ rocalVideoFileSource(
 }
 
 RocalTensor ROCAL_API_CALL
+rocalNumpyFileSource(
+    RocalContext p_context,
+    const char* source_path,
+    unsigned internal_shard_count,
+    std::vector<std::string> files,
+    bool is_output,
+    bool shuffle,
+    bool loop,
+    RocalImageSizeEvaluationPolicy decode_size_policy,
+    unsigned seed) {
+    Tensor* output = nullptr;
+    auto context = static_cast<Context*>(p_context);
+    try {
+        auto max_dimensions = evaluate_numpy_data_set(decode_size_policy, StorageType::NUMPY_DATA, DecoderType::SKIP_DECODE,
+                                                      source_path, files);
+
+        RocalTensorlayout tensor_format = RocalTensorlayout::NONE;
+        RocalTensorDataType tensor_data_type;
+        std::unordered_map<int, RocalTensorDataType> data_type_map = {
+            {0, RocalTensorDataType::FP32},
+            {1, RocalTensorDataType::FP16},
+            {2, RocalTensorDataType::UINT8},
+            {3, RocalTensorDataType::INT8},
+            {4, RocalTensorDataType::UINT32},
+            {5, RocalTensorDataType::INT32},
+        };
+        auto dtype = max_dimensions.at(max_dimensions.size() - 1);
+        max_dimensions.pop_back();
+        tensor_data_type = data_type_map[dtype];
+        unsigned num_of_dims = max_dimensions.size() + 1;
+        std::vector<size_t> dims;
+        dims.resize(num_of_dims);
+        dims[0] = context->user_batch_size();
+        for (uint i = 0; i < max_dimensions.size(); i++)
+            dims[i + 1] = max_dimensions[i];
+        auto info = TensorInfo(std::vector<size_t>(std::move(dims)),
+                               context->master_graph->mem_type(),
+                               tensor_data_type);
+        info.set_tensor_layout(tensor_format);
+        info.set_max_shape();
+        output = context->master_graph->create_loader_output_tensor(info);
+
+        context->master_graph->add_node<NumpyLoaderNode>({}, {output})->init(internal_shard_count, source_path, files, StorageType::NUMPY_DATA, DecoderType::SKIP_DECODE, shuffle, loop, context->user_batch_size(), context->master_graph->mem_type(), seed);
+        context->master_graph->set_loop(loop);
+
+        if (is_output) {
+            auto actual_output = context->master_graph->create_tensor(info, is_output);
+            context->master_graph->add_node<CopyNode>({output}, {actual_output});
+        }
+
+    } catch (const std::exception& e) {
+        context->capture_error(e.what());
+        std::cerr << e.what() << '\n';
+    }
+    return output;
+}
+
+RocalTensor ROCAL_API_CALL
+rocalNumpyFileSourceSingleShard(
+    RocalContext p_context,
+    const char* source_path,
+    std::vector<std::string> files,
+    bool is_output,
+    bool shuffle,
+    bool loop,
+    RocalImageSizeEvaluationPolicy decode_size_policy,
+    unsigned shard_id,
+    unsigned shard_count,
+    unsigned seed) {
+    Tensor* output = nullptr;
+    auto context = static_cast<Context*>(p_context);
+    try {
+        if (shard_count < 1)
+            THROW("Shard count should be bigger than 0")
+
+        if (shard_id >= shard_count)
+            THROW("Shard id should be smaller than shard count")
+
+        auto max_dimensions = evaluate_numpy_data_set(decode_size_policy, StorageType::NUMPY_DATA, DecoderType::SKIP_DECODE,
+                                                      source_path, files);
+
+        RocalTensorlayout tensor_format = RocalTensorlayout::NONE;
+        RocalTensorDataType tensor_data_type;
+        std::unordered_map<int, RocalTensorDataType> data_type_map = {
+            {0, RocalTensorDataType::FP32},
+            {1, RocalTensorDataType::FP16},
+            {2, RocalTensorDataType::UINT8},
+            {3, RocalTensorDataType::INT8},
+            {4, RocalTensorDataType::UINT32},
+            {5, RocalTensorDataType::INT32},
+        };
+        auto dtype = max_dimensions.at(max_dimensions.size() - 1);
+        max_dimensions.pop_back();
+        tensor_data_type = data_type_map[dtype];
+        unsigned num_of_dims = max_dimensions.size() + 1;
+        std::vector<size_t> dims;
+        dims.resize(num_of_dims);
+        dims[0] = context->user_batch_size();
+        for (uint i = 0; i < max_dimensions.size(); i++)
+            dims[i + 1] = max_dimensions[i];
+        auto info = TensorInfo(std::vector<size_t>(std::move(dims)),
+                               context->master_graph->mem_type(),
+                               tensor_data_type);
+        info.set_tensor_layout(tensor_format);
+        info.set_max_shape();
+        output = context->master_graph->create_loader_output_tensor(info);
+
+        context->master_graph->add_node<NumpyLoaderSingleShardNode>({}, {output})->init(shard_id, shard_count, source_path, files, StorageType::NUMPY_DATA, DecoderType::SKIP_DECODE, shuffle, loop, context->user_batch_size(), context->master_graph->mem_type(), seed);
+        context->master_graph->set_loop(loop);
+
+        if (is_output) {
+            auto actual_output = context->master_graph->create_tensor(info, is_output);
+            context->master_graph->add_node<CopyNode>({output}, {actual_output});
+        }
+
+    } catch (const std::exception& e) {
+        context->capture_error(e.what());
+        std::cerr << e.what() << '\n';
+    }
+    return output;
+}
+
+RocalTensor  ROCAL_API_CALL
 rocalVideoFileSourceSingleShard(
     RocalContext p_context,
     const char* source_path,
diff --git a/rocAL/source/api/rocal_api_meta_data.cpp b/rocAL/source/api/rocal_api_meta_data.cpp
index 313553507..76b985586 100644
--- a/rocAL/source/api/rocal_api_meta_data.cpp
+++ b/rocAL/source/api/rocal_api_meta_data.cpp
@@ -513,3 +513,27 @@ RocalTensorList
     auto context = static_cast<Context*>(p_context);
     return context->master_graph->matched_index_meta_data();
 }
+
+RocalTensor
+    ROCAL_API_CALL
+    rocalROIRandomCrop(RocalContext p_context, RocalTensor p_input, RocalTensor roi_start, RocalTensor roi_end, std::vector<int> crop_shape) {
+     if ((p_context == nullptr) || (p_input == nullptr)) {
+        ERR("Invalid ROCAL context or invalid input tensor")
+    }
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    auto roi_start_tensor = static_cast<Tensor*>(roi_start);
+    auto roi_end_tensor = static_cast<Tensor*>(roi_end);
+    return context->master_graph->roi_random_crop(input, roi_start_tensor, roi_end_tensor, crop_shape.data());
+}
+
+RocalTensorList
+    ROCAL_API_CALL
+    rocalRandomObjectBbox(RocalContext p_context, RocalTensor p_input, std::string output_format, int k_largest, float foreground_prob) {
+     if ((p_context == nullptr) || (p_input == nullptr)) {
+        ERR("Invalid ROCAL context or invalid input tensor")
+    }
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    return context->master_graph->random_object_bbox(input, output_format, k_largest, foreground_prob);
+}
diff --git a/rocAL/source/augmentations/color_augmentations/node_brightness.cpp b/rocAL/source/augmentations/color_augmentations/node_brightness.cpp
index c1bba946d..4570ee316 100644
--- a/rocAL/source/augmentations/color_augmentations/node_brightness.cpp
+++ b/rocAL/source/augmentations/color_augmentations/node_brightness.cpp
@@ -26,7 +26,8 @@ THE SOFTWARE.
 
 BrightnessNode::BrightnessNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) : Node(inputs, outputs),
                                                                                                             _alpha(ALPHA_RANGE[0], ALPHA_RANGE[1]),
-                                                                                                            _beta(BETA_RANGE[0], BETA_RANGE[1]) {}
+                                                                                                            _beta(BETA_RANGE[0], BETA_RANGE[1]),
+                                                                                                            _conditional_execution(CONDITIONAL_EXECUTION_RANGE[0], CONDITIONAL_EXECUTION_RANGE[1]) {}
 
 void BrightnessNode::create_node() {
     if (_node)
@@ -34,6 +35,7 @@ void BrightnessNode::create_node() {
 
     _alpha.create_array(_graph, VX_TYPE_FLOAT32, _batch_size);
     _beta.create_array(_graph, VX_TYPE_FLOAT32, _batch_size);
+    _conditional_execution.create_array(_graph, VX_TYPE_INT32, _batch_size);
     int input_layout = static_cast<int>(_inputs[0]->info().layout());
     int output_layout = static_cast<int>(_outputs[0]->info().layout());
     int roi_type = static_cast<int>(_inputs[0]->info().roi_type());
@@ -41,23 +43,26 @@ void BrightnessNode::create_node() {
     vx_scalar output_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &output_layout);
     vx_scalar roi_type_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &roi_type);
 
-    _node = vxExtRppBrightness(_graph->get(), _inputs[0]->handle(), _inputs[0]->get_roi_tensor(), _outputs[0]->handle(), _alpha.default_array(), _beta.default_array(), input_layout_vx, output_layout_vx,roi_type_vx);
+    _node = vxExtRppBrightness(_graph->get(), _inputs[0]->handle(), _inputs[0]->get_roi_tensor(), _outputs[0]->handle(), _alpha.default_array(), _beta.default_array(), _conditional_execution.default_array(), input_layout_vx, output_layout_vx,roi_type_vx);
     vx_status status;
     if ((status = vxGetStatus((vx_reference)_node)) != VX_SUCCESS)
         THROW("Adding the brightness (vxExtRppBrightness) node failed: " + TOSTR(status))
 }
 
-void BrightnessNode::init(float alpha, float beta) {
+void BrightnessNode::init(float alpha, float beta, int conditional_execution) {
     _alpha.set_param(alpha);
     _beta.set_param(beta);
+    _conditional_execution.set_param(conditional_execution);
 }
 
-void BrightnessNode::init(FloatParam *alpha, FloatParam *beta) {
+void BrightnessNode::init(FloatParam *alpha, FloatParam *beta, IntParam *conditional_execution) {
     _alpha.set_param(core(alpha));
     _beta.set_param(core(beta));
+    _conditional_execution.set_param(core(conditional_execution));
 }
 
 void BrightnessNode::update_node() {
     _alpha.update_array();
     _beta.update_array();
+    _conditional_execution.update_array();
 }
diff --git a/rocAL/source/augmentations/effects_augmentations/node_gaussian_noise.cpp b/rocAL/source/augmentations/effects_augmentations/node_gaussian_noise.cpp
new file mode 100644
index 000000000..5d3d82f55
--- /dev/null
+++ b/rocAL/source/augmentations/effects_augmentations/node_gaussian_noise.cpp
@@ -0,0 +1,72 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <vx_ext_rpp.h>
+#include "node_gaussian_noise.h"
+#include "exception.h"
+
+GaussianNoiseNode::GaussianNoiseNode(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) : Node(inputs, outputs),
+                                                                                                      _mean(MEAN_RANGE[0], MEAN_RANGE[1]),
+                                                                                                      _stddev(STDDEV_RANGE[0], STDDEV_RANGE[1]),
+                                                                                                      _conditional_execution(CONDITIONAL_EXECUTION_RANGE[0], CONDITIONAL_EXECUTION_RANGE[1]) {}
+
+void GaussianNoiseNode::create_node() {
+    if (_node)
+        return;
+
+    _mean.create_array(_graph, VX_TYPE_FLOAT32, _batch_size);
+    _stddev.create_array(_graph, VX_TYPE_FLOAT32, _batch_size);
+    _conditional_execution.create_array(_graph, VX_TYPE_INT32, _batch_size);
+    vx_scalar seed = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_UINT32, &_seed);
+    int input_layout = static_cast<int>(_inputs[0]->info().layout());
+    int output_layout = static_cast<int>(_outputs[0]->info().layout());
+    int roi_type = static_cast<int>(_inputs[0]->info().roi_type());
+    vx_scalar input_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &input_layout);
+    vx_scalar output_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &output_layout);
+    vx_scalar roi_type_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &roi_type);
+
+    _node = vxExtRppGaussianNoise(_graph->get(), _inputs[0]->handle(), _inputs[0]->get_roi_tensor(), _outputs[0]->handle(), _mean.default_array(),
+                          _stddev.default_array(), _conditional_execution.default_array(), seed, input_layout_vx, output_layout_vx, roi_type_vx);
+    vx_status status;
+    if ((status = vxGetStatus((vx_reference)_node)) != VX_SUCCESS)
+        THROW("Adding the Noise (vxExtRppGaussianNoise) node failed: " + TOSTR(status))
+}
+
+void GaussianNoiseNode::init(float mean, float stddev, int seed, int conditional_execution) {
+    _mean.set_param(mean);
+    _stddev.set_param(stddev);
+    _seed = seed;
+    _conditional_execution.set_param(conditional_execution);
+}
+
+void GaussianNoiseNode::init(FloatParam* mean_param, FloatParam* stddev_param, int seed, IntParam* condition_execution_param) {
+    _mean.set_param(core(mean_param));
+    _stddev.set_param(core(stddev_param));
+    _seed = seed;
+    _conditional_execution.set_param(core(condition_execution_param));
+}
+
+void GaussianNoiseNode::update_node() {
+    _mean.update_array();
+    _stddev.update_array();
+    _conditional_execution.update_array();
+}
diff --git a/rocAL/source/augmentations/geometry_augmentations/node_flip.cpp b/rocAL/source/augmentations/geometry_augmentations/node_flip.cpp
index 86be1dcb3..0e22a468b 100644
--- a/rocAL/source/augmentations/geometry_augmentations/node_flip.cpp
+++ b/rocAL/source/augmentations/geometry_augmentations/node_flip.cpp
@@ -26,7 +26,8 @@ THE SOFTWARE.
 
 FlipNode::FlipNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) : Node(inputs, outputs),
                                                                                                 _horizontal(HORIZONTAL_RANGE[0], HORIZONTAL_RANGE[1]),
-                                                                                                _vertical(VERTICAL_RANGE[0], VERTICAL_RANGE[1]) {}
+                                                                                                _vertical(VERTICAL_RANGE[0], VERTICAL_RANGE[1]),
+                                                                                                _depth(DEPTH_RANGE[0], DEPTH_RANGE[1]) {}
 
 void FlipNode::create_node() {
     if (_node)
@@ -34,6 +35,7 @@ void FlipNode::create_node() {
 
     _horizontal.create_array(_graph, VX_TYPE_UINT32, _batch_size);
     _vertical.create_array(_graph, VX_TYPE_UINT32, _batch_size);
+    _depth.create_array(_graph, VX_TYPE_UINT32, _batch_size);
     int input_layout = static_cast<int>(_inputs[0]->info().layout());
     int output_layout = static_cast<int>(_outputs[0]->info().layout());
     int roi_type = static_cast<int>(_inputs[0]->info().roi_type());
@@ -42,23 +44,26 @@ void FlipNode::create_node() {
     vx_scalar roi_type_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &roi_type);
 
     _node = vxExtRppFlip(_graph->get(), _inputs[0]->handle(), _inputs[0]->get_roi_tensor(), _outputs[0]->handle(),
-                         _horizontal.default_array(), _vertical.default_array(), input_layout_vx, output_layout_vx,roi_type_vx);
+                         _horizontal.default_array(), _vertical.default_array(), _depth.default_array(), input_layout_vx, output_layout_vx,roi_type_vx);
     vx_status status;
     if ((status = vxGetStatus((vx_reference)_node)) != VX_SUCCESS)
         THROW("Adding the flip (vxExtRppFlip) node failed: " + TOSTR(status))
 }
 
-void FlipNode::init(int h_flag, int v_flag) {
+void FlipNode::init(int h_flag, int v_flag, int d_flag) {
     _horizontal.set_param(h_flag);
     _vertical.set_param(v_flag);
+    _depth.set_param(d_flag);
 }
 
-void FlipNode::init(IntParam *h_flag, IntParam *v_flag) {
+void FlipNode::init(IntParam *h_flag, IntParam *v_flag, IntParam *d_flag) {
     _horizontal.set_param(core(h_flag));
     _vertical.set_param(core(v_flag));
+    _depth.set_param(core(d_flag));
 }
 
 void FlipNode::update_node() {
     _horizontal.update_array();
     _vertical.update_array();
+    _depth.update_array();
 }
diff --git a/rocAL/source/augmentations/geometry_augmentations/node_slice.cpp b/rocAL/source/augmentations/geometry_augmentations/node_slice.cpp
new file mode 100644
index 000000000..8c757ccd7
--- /dev/null
+++ b/rocAL/source/augmentations/geometry_augmentations/node_slice.cpp
@@ -0,0 +1,114 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "node_slice.h"
+
+#include <vx_ext_rpp.h>
+
+#include "exception.h"
+
+SliceNode::SliceNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) : Node(inputs, outputs) {}
+
+void SliceNode::create_node() {
+    if (_node)
+        return;
+
+    create_shape_tensor();
+    auto max_shape = _outputs[0]->info().max_shape();
+    _slice_roi.resize(_batch_size, std::vector<uint32_t>(max_shape.size()));
+    for (uint i = 0; i < _batch_size; i++)
+        for (uint j = 0; j < max_shape.size(); j++)
+            _slice_roi[i][j] = max_shape[j];
+    const int buffer_size = _batch_size;
+    int input_layout = static_cast<int>(_inputs[0]->info().layout());
+    int roi_type = static_cast<int>(_inputs[0]->info().roi_type());
+    vx_scalar input_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &input_layout);
+    vx_scalar roi_type_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &roi_type);
+    _fill_values_array = vxCreateArray(vxGetContext((vx_reference)_graph->get()), VX_TYPE_FLOAT32, buffer_size);
+    vx_status status = vxAddArrayItems(_fill_values_array, buffer_size, _fill_values_vec.data(), sizeof(vx_float32));
+    if (status != 0)
+        THROW(" vxAddArrayItems failed in the slice (vxExtRppSlice) node: " + TOSTR(status));
+    vx_scalar policy = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_UINT32, &_policy);
+    _node = vxExtRppSlice(_graph->get(), _inputs[0]->handle(), _inputs[0]->get_roi_tensor(), _outputs[0]->handle(), _anchor->handle(),
+                          _shape, _fill_values_array, policy, input_layout_vx, roi_type_vx);
+
+    if ((status = vxGetStatus((vx_reference)_node)) != VX_SUCCESS)
+        THROW("Adding the slice node (vxRppSlice) failed: " + TOSTR(status))
+}
+
+void SliceNode::update_node() {
+    // if fill values passed by user less than what is required, replicate the values
+    if (_fill_values.size() == 1) {
+        std::fill(_fill_values_vec.begin(), _fill_values_vec.end(), _fill_values[0]);
+    }
+    vx_status status = VX_SUCCESS;
+    _outputs[0]->update_tensor_roi(_slice_roi);
+    status = vxCopyArrayRange((vx_array)_fill_values_array, 0, _batch_size, sizeof(vx_float32), _fill_values_vec.data(), VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST);
+    if (status != 0)
+        WRN("ERROR: vxCopyArrayRange failed in the slice node (vxExtRppSlice) node: " + TOSTR(status))
+    int* shape_arr = (int *) _shape_array;
+    // replicate shape values for all samples in a batch
+    for (uint i = 0; i < _batch_size; i++) {
+        int sample_idx = i * _shape_vec.size();
+        memcpy(&(shape_arr[sample_idx]), _shape_vec.data(), _shape_vec.size() * sizeof(int));
+    }
+}
+
+void SliceNode::init(Tensor *anchor, std::vector<int> shape, std::vector<float> &fill_values, RocalOutOfBoundsPolicy policy) {
+    _policy = policy;
+    _anchor = anchor;
+    _shape_vec = shape;
+    _fill_values = fill_values;
+    _fill_values_vec.resize(_batch_size);
+}
+
+// Create vx_tensor for the shape coordinates
+void SliceNode::create_shape_tensor() {
+    vx_size num_of_dims = 2;
+    vx_size stride[num_of_dims];
+    std::vector<size_t> _shape_tensor_dims = {_batch_size, _shape_vec.size()};
+    stride[0] = sizeof(vx_int32);
+    stride[1] = stride[0] * _shape_tensor_dims[0];
+    vx_enum mem_type = VX_MEMORY_TYPE_HOST;
+    if (_inputs[0]->info().mem_type() == RocalMemType::HIP)
+        mem_type = VX_MEMORY_TYPE_HIP;
+    allocate_host_or_pinned_mem(&_shape_array, stride[1] * _shape_vec.size(), _inputs[0]->info().mem_type());
+
+    _shape = vxCreateTensorFromHandle(vxGetContext((vx_reference)_graph->get()), num_of_dims, _shape_tensor_dims.data(), VX_TYPE_INT32, 0,
+                                      stride, reinterpret_cast<void *>(_shape_array), mem_type);
+    vx_status status;
+    if ((status = vxGetStatus((vx_reference)_shape)) != VX_SUCCESS)
+        THROW("Error: vxCreateTensorFromHandle(_shape: failed " + TOSTR(status))
+}
+
+SliceNode::~SliceNode() {
+    if (_inputs[0]->info().mem_type() == RocalMemType::HIP) {
+#if ENABLE_HIP
+        hipError_t err = hipHostFree(_shape_array);
+        if (err != hipSuccess)
+            std::cerr << "\n[ERR] hipFree failed  " << std::to_string(err) << "\n";
+#endif
+    } else {
+        if (_shape_array) free(_shape_array);
+    }
+    if (_shape) vxReleaseTensor(&_shape);
+}
diff --git a/rocAL/source/loaders/image/node_numpy_loader.cpp b/rocAL/source/loaders/image/node_numpy_loader.cpp
new file mode 100644
index 000000000..3f5319490
--- /dev/null
+++ b/rocAL/source/loaders/image/node_numpy_loader.cpp
@@ -0,0 +1,63 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "node_numpy_loader.h"
+
+#include "exception.h"
+
+NumpyLoaderNode::NumpyLoaderNode(Tensor *output, void *device_resources) : Node({}, {output}) {
+    _loader_module = std::make_shared<NumpyLoaderSharded>(device_resources);
+}
+
+void NumpyLoaderNode::init(unsigned internal_shard_count, const std::string &source_path, const std::vector<std::string> &files, StorageType storage_type, DecoderType decoder_type, bool shuffle, bool loop,
+                           size_t load_batch_count, RocalMemType mem_type, unsigned seed, bool decoder_keep_orig, const std::map<std::string, std::string> feature_key_map, const char *file_prefix, unsigned sequence_length, unsigned step, unsigned stride) {
+    if (!_loader_module)
+        THROW("ERROR: loader module is not set for NumpyLoaderNode, cannot initialize")
+    if (internal_shard_count < 1)
+        THROW("Shard count should be greater than or equal to one")
+    _loader_module->set_output(_outputs[0]);
+    // Set reader and decoder config accordingly for the NumpyLoaderNode
+    auto reader_cfg = ReaderConfig(storage_type, source_path, "", feature_key_map, shuffle, loop);
+    reader_cfg.set_shard_count(internal_shard_count);
+    reader_cfg.set_batch_count(load_batch_count);
+    reader_cfg.set_file_prefix(file_prefix);
+    reader_cfg.set_files(files);
+    reader_cfg.set_seed(seed);
+    //  sequence_length, step and stride parameters used only for SequenceReader
+    reader_cfg.set_sequence_length(sequence_length);
+    reader_cfg.set_frame_step(step);
+    reader_cfg.set_frame_stride(stride);
+    _loader_module->initialize(reader_cfg, DecoderConfig(DecoderType::SKIP_DECODE),
+                               mem_type,
+                               _batch_size, decoder_keep_orig);
+    _loader_module->start_loading();
+}
+
+std::shared_ptr<LoaderModule> NumpyLoaderNode::get_loader_module() {
+    if (!_loader_module)
+        WRN("NumpyLoaderNode's loader module is null, not initialized")
+    return _loader_module;
+}
+
+NumpyLoaderNode::~NumpyLoaderNode() {
+    _loader_module = nullptr;
+}
diff --git a/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp b/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp
new file mode 100644
index 000000000..ed9d3730a
--- /dev/null
+++ b/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp
@@ -0,0 +1,66 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "node_numpy_loader_single_shard.h"
+
+#include "exception.h"
+
+NumpyLoaderSingleShardNode::NumpyLoaderSingleShardNode(Tensor *output, void *device_resources) : Node({}, {output}) {
+    _loader_module = std::make_shared<NumpyLoader>(device_resources);
+}
+
+void NumpyLoaderSingleShardNode::init(unsigned shard_id, unsigned shard_count, const std::string &source_path, const std::vector<std::string> &files, StorageType storage_type, DecoderType decoder_type,
+                                      bool shuffle, bool loop, size_t load_batch_count, RocalMemType mem_type, unsigned seed,
+                                      bool decoder_keep_original, const std::map<std::string, std::string> feature_key_map, unsigned sequence_length, unsigned step, unsigned stride) {
+    if (!_loader_module)
+        THROW("ERROR: loader module is not set for NumpyLoaderNode, cannot initialize")
+    if (shard_count < 1)
+        THROW("Shard count should be greater than or equal to one")
+    if (shard_id >= shard_count)
+        THROW("Shard is should be smaller than shard count")
+    _loader_module->set_output(_outputs[0]);
+    // Set reader and decoder config accordingly for the NumpyLoaderNode
+    auto reader_cfg = ReaderConfig(storage_type, source_path, "", feature_key_map, shuffle, loop);
+    reader_cfg.set_shard_count(shard_count);
+    reader_cfg.set_shard_id(shard_id);
+    reader_cfg.set_batch_count(load_batch_count);
+    reader_cfg.set_files(files);
+    reader_cfg.set_seed(seed);
+    //  sequence_length, step and stride parameters used only for SequenceReader
+    reader_cfg.set_sequence_length(sequence_length);
+    reader_cfg.set_frame_step(step);
+    reader_cfg.set_frame_stride(stride);
+    _loader_module->initialize(reader_cfg, DecoderConfig(DecoderType::SKIP_DECODE),
+                               mem_type,
+                               _batch_size, decoder_keep_original);
+    _loader_module->start_loading();
+}
+
+std::shared_ptr<LoaderModule> NumpyLoaderSingleShardNode::get_loader_module() {
+    if (!_loader_module)
+        WRN("NumpyLoaderSingleShardNode's loader module is null, not initialized")
+    return _loader_module;
+}
+
+NumpyLoaderSingleShardNode::~NumpyLoaderSingleShardNode() {
+    _loader_module = nullptr;
+}
diff --git a/rocAL/source/loaders/image/numpy_loader.cpp b/rocAL/source/loaders/image/numpy_loader.cpp
new file mode 100644
index 000000000..4e614dca3
--- /dev/null
+++ b/rocAL/source/loaders/image/numpy_loader.cpp
@@ -0,0 +1,306 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "numpy_loader.h"
+
+#include <chrono>
+#include <thread>
+
+#include "vx_ext_amd.h"
+
+NumpyLoader::NumpyLoader(void *dev_resources) : _circ_buff(dev_resources),
+                                                _file_load_time("file load time", DBG_TIMING),
+                                                _swap_handle_time("Swap_handle_time", DBG_TIMING) {
+    _output_tensor = nullptr;
+    _mem_type = RocalMemType::HOST;
+    _internal_thread_running = false;
+    _output_mem_size = 0;
+    _batch_size = 1;
+    _is_initialized = false;
+    _remaining_image_count = 0;
+    _device_id = 0;
+}
+
+NumpyLoader::~NumpyLoader() {
+    de_init();
+}
+
+void NumpyLoader::shut_down() {
+    if (_internal_thread_running)
+        stop_internal_thread();
+    _circ_buff.release();
+}
+
+void NumpyLoader::set_prefetch_queue_depth(size_t prefetch_queue_depth) {
+    if (prefetch_queue_depth <= 0)
+        THROW("Prefetch quque depth value cannot be zero or negative");
+    _prefetch_queue_depth = prefetch_queue_depth;
+}
+
+void NumpyLoader::set_gpu_device_id(int device_id) {
+    if (device_id < 0)
+        THROW("invalid device_id passed to loader");
+    _device_id = device_id;
+}
+
+size_t
+NumpyLoader::remaining_count() {
+    return _remaining_image_count;
+}
+
+void NumpyLoader::reset() {
+    // stop the writer thread and empty the internal circular buffer
+    _internal_thread_running = false;
+    _circ_buff.unblock_writer();
+
+    if (_load_thread.joinable())
+        _load_thread.join();
+
+    // Emptying the internal circular buffer
+    _circ_buff.reset();
+
+    // resetting the reader thread to the start of the media
+    _image_counter = 0;
+    _reader->reset();
+
+    // Start loading (writer thread) again
+    start_loading();
+}
+
+void NumpyLoader::de_init() {
+    // Set running to 0 and wait for the internal thread to join
+    stop_internal_thread();
+    _output_mem_size = 0;
+    _batch_size = 1;
+    _is_initialized = false;
+    _remaining_image_count = 0;
+}
+
+LoaderModuleStatus
+NumpyLoader::load_next() {
+    return update_output_image();
+}
+
+void NumpyLoader::set_output(Tensor *output_tensor) {
+    _output_tensor = output_tensor;
+    _output_mem_size = ((_output_tensor->info().data_size() / 8) * 8 + 8);
+}
+
+void NumpyLoader::set_random_bbox_data_reader(std::shared_ptr<RandomBBoxCrop_MetaDataReader> randombboxcrop_meta_data_reader) {
+    _randombboxcrop_meta_data_reader = randombboxcrop_meta_data_reader;
+    _circ_buff.random_bbox_crop_flag = true;
+}
+
+void NumpyLoader::stop_internal_thread() {
+    _internal_thread_running = false;
+    _stopped = true;
+    _circ_buff.unblock_reader();
+    _circ_buff.unblock_writer();
+    _circ_buff.reset();
+    if (_load_thread.joinable())
+        _load_thread.join();
+}
+
+void NumpyLoader::initialize(ReaderConfig reader_cfg, DecoderConfig decoder_cfg, RocalMemType mem_type, unsigned batch_size, bool decoder_keep_original) {
+    if (_is_initialized)
+        WRN("initialize() function is already called and loader module is initialized")
+
+    if (_output_mem_size == 0)
+        THROW("output image size is 0, set_output() should be called before initialize for loader modules")
+
+    _mem_type = mem_type;
+    _batch_size = batch_size;
+    _loop = reader_cfg.loop();
+    _image_size = _output_tensor->info().data_size() / batch_size;
+    _output_names.resize(batch_size);
+    try {
+        _reader = create_reader(reader_cfg);
+    } catch (const std::exception &e) {
+        de_init();
+        throw;
+    }
+    _decoded_img_info._image_names.resize(_batch_size);
+    _crop_image_info._crop_image_coords.resize(_batch_size);
+    _tensor_roi.resize(_batch_size);
+    _circ_buff.init(_mem_type, _output_mem_size, _prefetch_queue_depth);
+    _is_initialized = true;
+    LOG("Loader module initialized");
+}
+
+void NumpyLoader::start_loading() {
+    if (!_is_initialized)
+        THROW("start_loading() should be called after initialize() function is called")
+
+    _remaining_image_count = _reader->count_items();
+    _internal_thread_running = true;
+    _load_thread = std::thread(&NumpyLoader::load_routine, this);
+}
+
+LoaderModuleStatus
+NumpyLoader::load_routine() {
+    LOG("Started the internal loader thread");
+    LoaderModuleStatus last_load_status = LoaderModuleStatus::OK;
+    // Initially record number of all the images that are going to be loaded, this is used to know how many still there
+
+    while (_internal_thread_running) {
+        auto data = _circ_buff.get_write_buffer();
+        if (!_internal_thread_running)
+            break;
+
+        auto load_status = LoaderModuleStatus::NO_MORE_DATA_TO_READ;
+        {
+            unsigned file_counter = 0;
+            _file_load_time.start();  // Debug timing
+
+            while ((file_counter != _batch_size) && _reader->count_items() > 0) {
+                auto read_ptr = data + _image_size * file_counter;
+                auto max_shape = _output_tensor->info().max_shape();
+                size_t readSize = _reader->open();
+                if (readSize == 0) {
+                    WRN("Opened file " + _reader->id() + " of size 0");
+                    continue;
+                }
+                auto fsize = _reader->read_numpy_data(read_ptr, readSize, max_shape);
+                if (fsize == 0)
+                    THROW("Numpy arrays must contain readable data")
+                _decoded_img_info._image_names[file_counter] = _reader->id();
+                _tensor_roi[file_counter] = _reader->get_numpy_header_data().shape();
+                _reader->close();
+                file_counter++;
+            }
+            _file_load_time.end();  // Debug timing
+            _circ_buff.set_image_info(_decoded_img_info);
+            _circ_buff.push();
+            _image_counter += _output_tensor->info().batch_size();
+            load_status = LoaderModuleStatus::OK;
+        }
+        if (load_status != LoaderModuleStatus::OK) {
+            if (last_load_status != load_status) {
+                if (load_status == LoaderModuleStatus::NO_MORE_DATA_TO_READ ||
+                    load_status == LoaderModuleStatus::NO_FILES_TO_READ) {
+                    LOG("Cycled through all images, count " + TOSTR(_image_counter));
+                } else {
+                    ERR("ERROR: Detected error in reading the images");
+                }
+                last_load_status = load_status;
+            }
+
+            // Here it sets the out-of-data flag and signal the circular buffer's internal
+            // read semaphore using release() call
+            // , and calls the release() allows the reader thread to wake up and handle
+            // the out-of-data case properly
+            // It also slows down the reader thread since there is no more data to read,
+            // till program ends or till reset is called
+            _circ_buff.unblock_reader();
+            std::this_thread::sleep_for(std::chrono::seconds(1));
+        }
+    }
+    return LoaderModuleStatus::OK;
+}
+
+bool NumpyLoader::is_out_of_data() {
+    return (remaining_count() < _batch_size);
+}
+LoaderModuleStatus
+NumpyLoader::update_output_image() {
+    LoaderModuleStatus status = LoaderModuleStatus::OK;
+
+    if (is_out_of_data())
+        return LoaderModuleStatus::NO_MORE_DATA_TO_READ;
+    if (_stopped)
+        return LoaderModuleStatus::OK;
+
+    // _circ_buff.get_read_buffer_x() is blocking and puts the caller on sleep until new images are written to the _circ_buff
+    if ((_mem_type == RocalMemType::OCL) || (_mem_type == RocalMemType::HIP)) {
+        auto data_buffer = _circ_buff.get_read_buffer_dev();
+        _swap_handle_time.start();
+        if (_output_tensor->swap_handle(data_buffer) != 0)
+            return LoaderModuleStatus ::DEVICE_BUFFER_SWAP_FAILED;
+        _swap_handle_time.end();
+    } else {
+        auto data_buffer = _circ_buff.get_read_buffer_host();
+        _swap_handle_time.start();
+        if (_output_tensor->swap_handle(data_buffer) != 0)
+            return LoaderModuleStatus::HOST_BUFFER_SWAP_FAILED;
+        _swap_handle_time.end();
+    }
+    if (_stopped)
+        return LoaderModuleStatus::OK;
+
+    _output_decoded_img_info = _circ_buff.get_image_info();
+    if (_randombboxcrop_meta_data_reader) {
+        _output_cropped_img_info = _circ_buff.get_cropped_image_info();
+    }
+    _output_names = _output_decoded_img_info._image_names;
+    _output_tensor->update_tensor_roi(_tensor_roi);
+    // _output_tensor->update_tensor_roi(_output_decoded_img_info._roi_width, _output_decoded_img_info._roi_height);
+    // _output_tensor->update_tensor_orig_roi(_output_decoded_img_info._original_width, _output_decoded_img_info._original_height);
+    _circ_buff.pop();
+    if (!_loop)
+        _remaining_image_count -= _batch_size;
+
+    return status;
+}
+
+Timing NumpyLoader::timing() {
+    Timing t;
+    t.read_time = _file_load_time.get_timing();
+    t.process_time = _swap_handle_time.get_timing();
+    return t;
+}
+
+LoaderModuleStatus NumpyLoader::set_cpu_affinity(cpu_set_t cpu_mask) {
+    if (!_internal_thread_running)
+        THROW("set_cpu_affinity() should be called after start_loading function is called")
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
+#else
+    int ret = pthread_setaffinity_np(_load_thread.native_handle(),
+                                     sizeof(cpu_set_t), &cpu_mask);
+    if (ret != 0)
+        WRN("Error calling pthread_setaffinity_np: " + TOSTR(ret));
+#endif
+    return LoaderModuleStatus::OK;
+}
+
+LoaderModuleStatus NumpyLoader::set_cpu_sched_policy(struct sched_param sched_policy) {
+    if (!_internal_thread_running)
+        THROW("set_cpu_sched_policy() should be called after start_loading function is called")
+#if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
+#else
+    auto ret = pthread_setschedparam(_load_thread.native_handle(), SCHED_FIFO, &sched_policy);
+    if (ret != 0)
+        WRN("Unsuccessful in setting thread realtime priority for loader thread err = " + TOSTR(ret))
+#endif
+    return LoaderModuleStatus::OK;
+}
+
+std::vector<std::string> NumpyLoader::get_id() {
+    return _output_names;
+}
+
+decoded_image_info NumpyLoader::get_decode_image_info() {
+    return _output_decoded_img_info;
+}
+
+crop_image_info NumpyLoader::get_crop_image_info() {
+    return _output_cropped_img_info;
+}
diff --git a/rocAL/source/loaders/image/numpy_loader_sharded.cpp b/rocAL/source/loaders/image/numpy_loader_sharded.cpp
new file mode 100644
index 000000000..b514baf91
--- /dev/null
+++ b/rocAL/source/loaders/image/numpy_loader_sharded.cpp
@@ -0,0 +1,163 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "numpy_loader_sharded.h"
+
+NumpyLoaderSharded::NumpyLoaderSharded(void* dev_resources) : _dev_resources(dev_resources) {
+    _loader_idx = 0;
+}
+
+void NumpyLoaderSharded::set_prefetch_queue_depth(size_t prefetch_queue_depth) {
+    if (prefetch_queue_depth <= 0)
+        THROW("Prefetch quque depth value cannot be zero or negative");
+    _prefetch_queue_depth = prefetch_queue_depth;
+}
+
+std::vector<std::string> NumpyLoaderSharded::get_id() {
+    if (!_initialized)
+        THROW("get_id() should be called after initialize() function");
+    return _loaders[_loader_idx]->get_id();
+}
+
+decoded_image_info NumpyLoaderSharded::get_decode_image_info() {
+    return _loaders[_loader_idx]->get_decode_image_info();
+}
+
+crop_image_info NumpyLoaderSharded::get_crop_image_info() {
+    return _loaders[_loader_idx]->get_crop_image_info();
+}
+
+NumpyLoaderSharded::~NumpyLoaderSharded() {
+    _loaders.clear();
+}
+
+void NumpyLoaderSharded::fast_forward_through_empty_loaders() {
+    int loaders_count = _loaders.size();
+    // reject empty loaders and get to a loader that still has images to play
+    while (_loaders[_loader_idx]->remaining_count() == 0 && loaders_count-- > 0)
+        increment_loader_idx();
+}
+
+LoaderModuleStatus NumpyLoaderSharded::load_next() {
+    if (!_initialized)
+        return LoaderModuleStatus::NOT_INITIALIZED;
+
+    increment_loader_idx();
+
+    // Since loaders may have different number of images loaded, some run out earlier than other.
+    // Fast forward through loaders that are empty to get to a loader that is not empty.
+    fast_forward_through_empty_loaders();
+
+    auto ret = _loaders[_loader_idx]->load_next();
+
+    return ret;
+}
+void NumpyLoaderSharded::initialize(ReaderConfig reader_cfg, DecoderConfig decoder_cfg, RocalMemType mem_type,
+                                    unsigned batch_size, bool keep_orig_size) {
+    if (_initialized)
+        return;
+    _shard_count = reader_cfg.get_shard_count();
+    // Create loader modules
+    for (size_t i = 0; i < _shard_count; i++) {
+        std::shared_ptr loader = std::make_shared<NumpyLoader>(_dev_resources);
+        loader->set_prefetch_queue_depth(_prefetch_queue_depth);
+        _loaders.push_back(loader);
+    }
+    // Initialize loader modules
+    for (size_t idx = 0; idx < _shard_count; idx++) {
+        _loaders[idx]->set_output(_output_tensor);
+        _loaders[idx]->set_random_bbox_data_reader(_randombboxcrop_meta_data_reader);
+        _loaders[idx]->set_gpu_device_id(idx);
+        reader_cfg.set_shard_count(_shard_count);
+        reader_cfg.set_shard_id(idx);
+        _loaders[idx]->initialize(reader_cfg, decoder_cfg, mem_type, batch_size, keep_orig_size);
+    }
+    _initialized = true;
+}
+void NumpyLoaderSharded::start_loading() {
+    for (unsigned i = 0; i < _loaders.size(); i++) {
+        _loaders[i]->start_loading();
+        //  Changing thread scheduling policy and it's priority does not help on latest Ubuntu builds
+        //  and needs tweaking the Linux security settings , can be turned on for experimentation
+#if 0
+        // Set thread scheduling policy
+        struct sched_param params;
+        params.sched_priority = sched_get_priority_max(SCHED_FIFO);
+        _loaders[i]->set_cpu_sched_policy(params);
+#endif
+        // Setting cpu affinity for threads works and can be activated below for experimentation
+#if 0
+        // Set thread affinity thread 0 to core 0 , 1 toc core 1 , ...
+        cpu_set_t cpuset;
+        CPU_ZERO(&cpuset);
+        CPU_SET(i, &cpuset);
+        _loaders[i]->set_cpu_affinity(cpuset);
+#endif
+    }
+}
+
+void NumpyLoaderSharded::shut_down() {
+    for (unsigned i = 0; i < _loaders.size(); i++)
+        _loaders[i]->shut_down();
+}
+
+void NumpyLoaderSharded::set_output(Tensor* output_tensor) {
+    _output_tensor = output_tensor;
+}
+
+void NumpyLoaderSharded::set_random_bbox_data_reader(std::shared_ptr<RandomBBoxCrop_MetaDataReader> randombboxcrop_meta_data_reader) {
+    _randombboxcrop_meta_data_reader = randombboxcrop_meta_data_reader;
+}
+
+size_t NumpyLoaderSharded::remaining_count() {
+    int sum = 0;
+    for (auto& loader : _loaders)
+        sum += loader->remaining_count();
+    return sum;
+}
+void NumpyLoaderSharded::reset() {
+    for (auto& loader : _loaders)
+        loader->reset();
+}
+void NumpyLoaderSharded::increment_loader_idx() {
+    _loader_idx = (_loader_idx + 1) % _shard_count;
+}
+
+Timing NumpyLoaderSharded::timing() {
+    Timing t;
+    long long unsigned max_decode_time = 0;
+    long long unsigned max_read_time = 0;
+    long long unsigned swap_handle_time = 0;
+
+    // image read and decode runs in parallel using multiple loaders, and the observable latency that the NumpyLoaderSharded user
+    // is experiences on the load_next() call due to read and decode time is the maximum of all
+    for (auto& loader : _loaders) {
+        auto info = loader->timing();
+        max_read_time = (info.read_time > max_read_time) ? info.read_time : max_read_time;
+        max_decode_time = (info.decode_time > max_decode_time) ? info.decode_time : max_decode_time;
+        swap_handle_time += info.process_time;
+    }
+    t.decode_time = max_decode_time;
+    t.read_time = max_read_time;
+    t.process_time = swap_handle_time;
+    return t;
+}
diff --git a/rocAL/source/loaders/image_source_evaluator.cpp b/rocAL/source/loaders/image_source_evaluator.cpp
index 96a39b726..55aafac07 100644
--- a/rocAL/source/loaders/image_source_evaluator.cpp
+++ b/rocAL/source/loaders/image_source_evaluator.cpp
@@ -50,6 +50,14 @@ ImageSourceEvaluator::create(ReaderConfig reader_cfg, DecoderConfig decoder_cfg)
     return status;
 }
 
+ImageSourceEvaluatorStatus
+ImageSourceEvaluator::create(ReaderConfig reader_cfg) {
+    ImageSourceEvaluatorStatus status = ImageSourceEvaluatorStatus::OK;
+    _reader = create_reader(std::move(reader_cfg));
+    find_max_numpy_dimensions();
+    return status;
+}
+
 void ImageSourceEvaluator::find_max_dimension() {
     _reader->reset();
 
@@ -77,6 +85,39 @@ void ImageSourceEvaluator::find_max_dimension() {
     _reader->reset();
 }
 
+void ImageSourceEvaluator::find_max_numpy_dimensions() {
+    _reader->reset();
+
+    while (_reader->count_items()) {
+        size_t fsize = _reader->open();
+        if (fsize == 0)
+            THROW("Numpy arrays must contain readable data")
+        const NumpyHeaderData numpy_header = _reader->get_numpy_header_data();
+        _reader->close();
+
+        if (_max_numpy_dims.size() == 0) {
+            _max_numpy_dims.resize(numpy_header._shape.size());
+            _numpy_dtype = numpy_header._type_info;
+        }
+
+        if (_max_numpy_dims.size() != numpy_header._shape.size()) {
+            THROW("All numpy arrays must have the same number of dimensions")
+        }
+
+        if (_numpy_dtype != numpy_header._type_info) {
+            THROW("All numpy arrays must have the same data type")
+        }
+
+        for (uint i = 0; i < _max_numpy_dims.size(); i++) {
+            if (numpy_header._shape[i] > _max_numpy_dims[i]) {
+                _max_numpy_dims[i] = numpy_header._shape[i];
+            }
+        }
+    }
+    // return the reader read pointer to the begining of the resource
+    _reader->reset();
+}
+
 void ImageSourceEvaluator::FindMaxSize::process_sample(unsigned val) {
     if (_policy == MaxSizeEvaluationPolicy::MAXIMUM_FOUND_SIZE) {
         _max = (val > _max) ? val : _max;
diff --git a/rocAL/source/parameters/parameter_factory.cpp b/rocAL/source/parameters/parameter_factory.cpp
index 6f3800bb4..cb31a55b8 100644
--- a/rocAL/source/parameters/parameter_factory.cpp
+++ b/rocAL/source/parameters/parameter_factory.cpp
@@ -104,33 +104,46 @@ void ParameterFactory::generate_seed() {
     _seed = rd();
 }
 
+int64_t
+ParameterFactory::get_seed_from_seedsequence() {
+    increment_seed_sequence_idx();
+    return _seed_vector[_seed_sequence_idx];
+}
+
+void ParameterFactory::increment_seed_sequence_idx() {
+    _seed_sequence_idx = (_seed_sequence_idx + 1) % MAX_SEEDS;
+}
+
 void ParameterFactory::set_seed(unsigned seed) {
     _seed = seed;
+    _seed_vector.resize(MAX_SEEDS);
+    std::seed_seq ss{seed};
+    ss.generate(_seed_vector.begin(), _seed_vector.end());
 }
 
 IntParam* ParameterFactory::create_uniform_int_rand_param(int start, int end) {
-    auto gen = new UniformRand<int>(start, end, _seed);
+    auto gen = new UniformRand<int>(start, end, get_seed_from_seedsequence());
     auto ret = new IntParam(gen, RocalParameterType::RANDOM_UNIFORM);
     _parameters.insert(gen);
     return ret;
 }
 
 FloatParam* ParameterFactory::create_uniform_float_rand_param(float start, float end) {
-    auto gen = new UniformRand<float>(start, end, _seed);
+    auto gen = new UniformRand<float>(start, end, get_seed_from_seedsequence());
     auto ret = new FloatParam(gen, RocalParameterType::RANDOM_UNIFORM);
     _parameters.insert(gen);
     return ret;
 }
 
 IntParam* ParameterFactory::create_custom_int_rand_param(const int* value, const double* frequencies, size_t size) {
-    auto gen = new CustomRand<int>(value, frequencies, size, _seed);
+    auto gen = new CustomRand<int>(value, frequencies, size, get_seed_from_seedsequence());
     auto ret = new IntParam(gen, RocalParameterType::RANDOM_CUSTOM);
     _parameters.insert(gen);
     return ret;
 }
 
 FloatParam* ParameterFactory::create_custom_float_rand_param(const float* value, const double* frequencies, size_t size) {
-    auto gen = new CustomRand<float>(value, frequencies, size, _seed);
+    auto gen = new CustomRand<float>(value, frequencies, size, get_seed_from_seedsequence());
     auto ret = new FloatParam(gen, RocalParameterType::RANDOM_CUSTOM);
     _parameters.insert(gen);
     return ret;
diff --git a/rocAL/source/pipeline/graph.cpp b/rocAL/source/pipeline/graph.cpp
index 93111d173..094f2a8ed 100644
--- a/rocAL/source/pipeline/graph.cpp
+++ b/rocAL/source/pipeline/graph.cpp
@@ -101,6 +101,24 @@ Graph::process() {
     return Status::OK;
 }
 
+Graph::Status
+Graph::schedule() {
+    vx_status status;
+    if ((status = vxScheduleGraph(_graph)) != VX_SUCCESS)
+        THROW("ERROR: vxScheduleGraph failed " + TOSTR(status))
+
+    return Status::OK;
+}
+
+Graph::Status
+Graph::wait() {
+    vx_status status;
+    if ((status = vxWaitGraph(_graph)) != VX_SUCCESS)
+        THROW("ERROR: vxScheduleGraph failed " + TOSTR(status))
+
+    return Status::OK;
+}
+
 Graph::Status
 Graph::release() {
     vx_status status = VX_SUCCESS;
diff --git a/rocAL/source/pipeline/master_graph.cpp b/rocAL/source/pipeline/master_graph.cpp
index 917913338..e63100f05 100644
--- a/rocAL/source/pipeline/master_graph.cpp
+++ b/rocAL/source/pipeline/master_graph.cpp
@@ -257,6 +257,26 @@ void MasterGraph::create_single_graph() {
     _graph->verify();
 }
 
+void MasterGraph::create_multiple_graphs() {
+    // Actual graph creating and calls into adding nodes to graph is deferred and is happening here to enable potential future optimizations
+    int num_of_graphs = _loader_modules.size();
+    for (int n = 0; n < num_of_graphs; n++) {
+        _graphs.emplace_back(std::make_shared<Graph>(_context, _affinity, 0, _cpu_num_threads, _gpu_id));
+    }
+    for (auto &node : _nodes) {
+        // Any tensor not yet created can be created as virtual tensor
+        for (auto &tensor : node->output())
+            if (tensor->info().type() == TensorInfo::Type::UNKNOWN) {
+                tensor->create_virtual(_context, _graphs[node->get_id()]->get());
+                _internal_tensors.push_back(tensor);
+            }
+        node->create(_graphs[node->get_id()]);
+    }
+    
+    for (auto& graph : _graphs)
+        graph->verify();
+}
+
 MasterGraph::Status
 MasterGraph::build() {
     if (_internal_tensor_list.empty())
@@ -268,7 +288,12 @@ MasterGraph::build() {
     _ring_buffer.init(_mem_type, nullptr, _internal_tensor_list.data_size(), _internal_tensor_list.roi_size());
 #endif
     if (_is_box_encoder) _ring_buffer.initBoxEncoderMetaData(_mem_type, _user_batch_size * _num_anchors * 4 * sizeof(float), _user_batch_size * _num_anchors * sizeof(int));
-    create_single_graph();
+    if (_loader_modules.size() > 1) {
+        create_multiple_graphs();
+    } else {
+        _loader_module = _loader_modules[0];
+        create_single_graph();
+    }
     start_processing();
     return Status::OK;
 }
@@ -324,7 +349,8 @@ void MasterGraph::release() {
     _tensor_map.clear();
     _ring_buffer.release_gpu_res();
     // shut_down loader:: required for releasing any allocated resourses
-    _loader_module->shut_down();
+    for (auto loader_module : _loader_modules)
+        loader_module->shut_down();
     // release output buffer if allocated
     if (_output_tensor_buffer != nullptr) {
 #if ENABLE_OPENCL
@@ -346,9 +372,38 @@ void MasterGraph::release() {
     _output_tensor_list.release();    // It will call the vxReleaseTensor internally in the destructor for each tensor in the list
     for (auto tensor_list : _metadata_output_tensor_list)
         dynamic_cast<TensorList *>(tensor_list)->release();  // It will call the vxReleaseTensor internally in the destructor for each tensor in the list
+    if(_is_roi_random_crop)
+    {
+        if(_crop_shape_batch != nullptr)
+            delete[] _crop_shape_batch;
+        if(_roi_random_crop_buf != nullptr) {
+            if (_affinity == RocalAffinity::GPU) {
+    #if ENABLE_HIP
+                hipError_t err = hipHostFree(_roi_random_crop_buf);
+                if (err != hipSuccess)
+                    std::cerr << "\n[ERR] hipFree failed  " << std::to_string(err) << "\n";
+    #endif
+            } else { free(_roi_random_crop_buf); }
+        delete _roi_random_crop_tensor;
+        }
+    }
+    if(_is_random_object_bbox)
+    {
+        if(_random_object_bbox_box1_buf != nullptr) {
+            free(_random_object_bbox_box1_buf);
+        }
+        if(_random_object_bbox_box2_buf != nullptr) {
+            free(_random_object_bbox_box2_buf);
+        }
+        // _random_object_bbox_tensor_list.release();
+    }
 
     if (_graph != nullptr)
         _graph->release();
+    for (auto& graph : _graphs) {
+        if (graph != nullptr)
+            graph->release();
+    }
     if (_meta_data_reader != nullptr)
         _meta_data_reader->release();
 
@@ -416,7 +471,8 @@ MasterGraph::reset() {
     if (_randombboxcrop_meta_data_reader != nullptr)
         _randombboxcrop_meta_data_reader->release();
     // resetting loader module to start from the beginning of the media and clear it's internal state/buffers
-    _loader_module->reset();
+    for (auto loader_module : _loader_modules)
+        loader_module->reset();
     // restart processing of the images
     _first_run = true;
     _output_routine_finished_processing = false;
@@ -438,10 +494,13 @@ MasterGraph::mem_type() {
 
 Timing
 MasterGraph::timing() {
-    Timing t = _loader_module->timing();
-    t.process_time += _process_time.get_timing();
-    t.copy_to_output += _convert_time.get_timing();
-    t.bb_process_time += _bencode_time.get_timing();
+    Timing t;
+    for (auto loader_module : _loader_modules) {
+        t = loader_module->timing();
+        t.process_time += _process_time.get_timing();
+        t.copy_to_output += _convert_time.get_timing();
+        t.bb_process_time += _bencode_time.get_timing();
+    }
     return t;
 }
 
@@ -878,6 +937,15 @@ MasterGraph::get_output_tensors() {
     return &_output_tensor_list;
 }
 
+bool MasterGraph::is_out_of_data() {
+    for (auto loader_module : _loader_modules) {
+        if (loader_module->remaining_count() < (_is_sequence_reader_output ? _sequence_batch_size : _user_batch_size)) {
+            return true;
+        }
+    }
+    return false;
+}
+
 void MasterGraph::output_routine() {
     INFO("Output routine started with " + TOSTR(_remaining_count) + " to load");
     try {
@@ -943,6 +1011,8 @@ void MasterGraph::output_routine() {
                     _meta_data_graph->process(_augmented_meta_data, output_meta_data);
                 }
             }
+            if(_is_random_object_bbox) { update_random_object_bbox(); }
+            if(_is_roi_random_crop) { update_roi_random_crop(); }
             _process_time.start();
             _graph->process();
             _process_time.end();
@@ -980,10 +1050,125 @@ void MasterGraph::output_routine() {
     }
 }
 
+void MasterGraph::output_routine_multiple_loaders() {
+    INFO("Output routine started with " + TOSTR(_remaining_count) + " to load");
+    try {
+        while (_processing) {
+            if (is_out_of_data()) {
+                // If the internal process routine ,output_routine(), has finished processing all the images, and last
+                // processed images stored in the _ring_buffer will be consumed by the user when it calls the run() func
+                notify_user_thread();
+                // the following call is required in case the ring buffer is waiting for more data to be loaded and there is no more data to process.
+                _ring_buffer.release_if_empty();
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+                continue;
+            }
+            _rb_block_if_full_time.start();
+            // _ring_buffer.get_write_buffers() is blocking and blocks here until user uses processed image by calling run() and frees space in the ring_buffer
+            auto write_buffers = _ring_buffer.get_write_buffers();
+            auto write_output_buffers = write_buffers.first;
+            _rb_block_if_full_time.end();
+
+            // Swap handles on the input tensor, so that new tensor is loaded to be processed
+            for (auto loader_module : _loader_modules) {
+                auto load_ret = loader_module->load_next();
+                if (load_ret != LoaderModuleStatus::OK)
+                    THROW("Loader module failed to load next batch of images, status " + TOSTR(load_ret))                
+            }
+
+            if (!_processing)
+                break;
+            auto full_batch_image_names = _loader_modules[0]->get_id(); // Temp change
+            auto decode_image_info = _loader_modules[0]->get_decode_image_info();   // Temp change
+            auto crop_image_info = _loader_modules[0]->get_crop_image_info();   // Temp change
+
+            if (full_batch_image_names.size() != _user_batch_size)
+                WRN("Internal problem: names count " + TOSTR(full_batch_image_names.size()))
+            
+            /*
+            // meta_data lookup is done before _meta_data_graph->process() is called to have the new meta_data ready for processing
+            if (_meta_data_reader)
+                _meta_data_reader->lookup(full_batch_image_names);
+            */
+
+            if (!_processing)
+                break;
+
+            // Swap handles on the output tensor, so that new processed tensor will be written to the a new buffer
+            for (size_t idx = 0; idx < _internal_tensor_list.size(); idx++)
+                _internal_tensor_list[idx]->swap_handle(write_output_buffers[idx]);
+
+            if (!_processing)
+                break;
+
+            for (auto node : _nodes) {
+                if (node->_is_ssd) {
+                    node->set_meta_data(_augmented_meta_data);
+                }
+            }
+
+            update_node_parameters();
+            pMetaDataBatch output_meta_data = nullptr;
+            /* if (_augmented_meta_data) {
+                output_meta_data = _augmented_meta_data->clone(!_augmentation_metanode);  // copy the data if metadata is not processed by the nodes, else create an empty instance
+                if (_meta_data_graph) {
+                    if (_is_random_bbox_crop) {
+                        _meta_data_graph->update_random_bbox_meta_data(_augmented_meta_data, output_meta_data, decode_image_info, crop_image_info);
+                    } else {
+                        _meta_data_graph->update_meta_data(_augmented_meta_data, decode_image_info);
+                    }
+                    _meta_data_graph->process(_augmented_meta_data, output_meta_data);
+                }
+            }*/
+            if(_is_random_object_bbox) { update_random_object_bbox(); }
+            if(_is_roi_random_crop) { update_roi_random_crop(); }
+            _process_time.start();
+            for (auto& graph : _graphs) {
+                graph->process();
+            }
+            _process_time.end();
+
+            auto write_roi_buffers = write_buffers.second;   // Obtain ROI buffers from ring buffer
+            for (size_t idx = 0; idx < _internal_tensor_list.size(); idx++)
+                _internal_tensor_list[idx]->copy_roi(write_roi_buffers[idx]);   // Copy ROI from internal tensor's buffer to ring buffer
+
+            /*_bencode_time.start();
+            if (_is_box_encoder) {
+                auto bbox_encode_write_buffers = _ring_buffer.get_box_encode_write_buffers();
+#if ENABLE_HIP
+                if (_mem_type == RocalMemType::HIP) {
+                    // get bbox encoder read buffers
+                    if (_box_encoder_gpu) _box_encoder_gpu->Run(output_meta_data, (float *)bbox_encode_write_buffers.first, (int *)bbox_encode_write_buffers.second);
+                } else
+#endif
+                    _meta_data_graph->update_box_encoder_meta_data(&_anchors, output_meta_data, _criteria, _offset, _scale, _means, _stds, (float *)bbox_encode_write_buffers.first, (int *)bbox_encode_write_buffers.second);
+            }
+            _bencode_time.end();
+#ifdef ROCAL_VIDEO
+            // _sequence_start_framenum_vec.insert(_sequence_start_framenum_vec.begin(), _loader_module->get_sequence_start_frame_number());
+            // _sequence_frame_timestamps_vec.insert(_sequence_frame_timestamps_vec.begin(), _loader_module->get_sequence_frame_timestamps());
+#endif
+            */
+            _ring_buffer.set_meta_data(full_batch_image_names, output_meta_data);
+            _ring_buffer.push();  // Image data and metadata is now stored in output the ring_buffer, increases it's level by 1
+        }
+    } catch (const std::exception &e) {
+        ERR("Exception thrown in the process routine: " + STR(e.what()) + STR("\n"));
+        _processing = false;
+        _ring_buffer.release_all_blocked_calls();
+    }
+}
+
 void MasterGraph::start_processing() {
     _processing = true;
-    _remaining_count = _loader_module->remaining_count();
-    _output_thread = std::thread(&MasterGraph::output_routine, this);
+    for (auto loader_module : _loader_modules) {
+        _remaining_count = std::min(_remaining_count, static_cast<int>(loader_module->remaining_count()));
+    }
+    if (_loader_modules.size() == 1) {
+        _output_thread = std::thread(&MasterGraph::output_routine, this);
+    } else {
+        _output_thread = std::thread(&MasterGraph::output_routine_multiple_loaders, this);
+    }
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
 #else
 //  Changing thread scheduling policy and it's priority does not help on latest Ubuntu builds
@@ -1436,6 +1621,590 @@ TensorList *MasterGraph::matched_index_meta_data() {
     }
     return &_matches_tensor_list;
 }
+class BatchRNG {
+ public:
+  /**
+   * @brief Used to keep batch of RNGs, so Operators can be immune to order of sample processing
+   * while using randomness
+   *
+   * @param seed Used to generate seed_seq to initialize batch of RNGs
+   * @param batch_size How many RNGs to store
+   * @param state_size How many seed are used to initialize one RNG. Used to lower probablity of
+   * collisions between seeds used to initialize RNGs in different operators.
+   */
+  BatchRNG(int64_t seed, int batch_size, int state_size = 4)
+  : seed_(seed) {
+    std::seed_seq seq{seed_};
+    std::vector<uint32_t> seeds(batch_size * state_size);
+    seq.generate(seeds.begin(), seeds.end());
+    rngs_.reserve(batch_size);
+    for (int i = 0; i < batch_size * state_size; i += state_size) {
+      std::seed_seq s(seeds.begin() + i, seeds.begin() + i + state_size);
+      rngs_.emplace_back(s);
+    }
+  }
+
+
+  /**
+   * Returns engine corresponding to given sample ID
+   */
+  std::mt19937 &operator[](int sample) noexcept {
+    return rngs_[sample];
+  }
+
+ private:
+  int64_t seed_;
+  std::vector<std::mt19937> rngs_;
+};
+
+TensorList *MasterGraph::random_object_bbox(Tensor *input, std::string output_format, int k_largest, float foreground_prob) {
+    _random_object_bbox_label_tensor = input;
+    _is_random_object_bbox = true;
+    _k_largest = k_largest;
+    _foreground_prob = foreground_prob;
+    auto output_dims = _random_object_bbox_label_tensor->num_of_dims() - 1;
+    _random_object_bbox_output_format = output_format;
+    if(output_format == "start_end" || output_format == "anchor_shape") {        
+        // create new instance of tensor class
+        std::vector<size_t> box1_dims = {_user_batch_size, output_dims};
+        auto box1_info = TensorInfo(std::move(box1_dims), RocalMemType::HOST, RocalTensorDataType::INT32);
+        _random_object_bbox_box1_tensor = new Tensor(box1_info);
+
+        // allocate memory for the raw buffer pointer in tensor object
+        allocate_host_or_pinned_mem(&_random_object_bbox_box1_buf, _user_batch_size * output_dims * sizeof(int), RocalMemType::HOST);
+        _random_object_bbox_box1_tensor->create_from_ptr(_context, _random_object_bbox_box1_buf);
+
+        // create new instance of tensor class
+        std::vector<size_t> box2_dims = {_user_batch_size, output_dims};
+        auto box2_info = TensorInfo(std::move(box2_dims), RocalMemType::HOST, RocalTensorDataType::INT32);
+        _random_object_bbox_box2_tensor = new Tensor(box2_info);
+
+        // allocate memory for the raw buffer pointer in tensor object
+        allocate_host_or_pinned_mem(&_random_object_bbox_box2_buf, _user_batch_size * output_dims * sizeof(int), RocalMemType::HOST);
+        _random_object_bbox_box2_tensor->create_from_ptr(_context, _random_object_bbox_box2_buf);
+        _random_object_bbox_tensor_list.push_back(_random_object_bbox_box1_tensor);
+        _random_object_bbox_tensor_list.push_back(_random_object_bbox_box2_tensor);
+    } else if(output_format == "box") {
+        // create new instance of tensor class
+        std::vector<size_t> box1_dims = {_user_batch_size, output_dims * 2};
+        auto box1_info = TensorInfo(std::move(box1_dims), RocalMemType::HOST, RocalTensorDataType::INT32);
+        _random_object_bbox_box1_tensor = new Tensor(box1_info);
+
+        // allocate memory for the raw buffer pointer in tensor object
+        allocate_host_or_pinned_mem(&_random_object_bbox_box1_buf, _user_batch_size * output_dims * 2 * sizeof(int), RocalMemType::HOST);
+        _random_object_bbox_box1_tensor->create_from_ptr(_context, _random_object_bbox_box1_buf);
+        _random_object_bbox_tensor_list.push_back(_random_object_bbox_box1_tensor);
+    }
+    return &_random_object_bbox_tensor_list;
+}
+
+void MasterGraph::update_random_object_bbox() {
+    u_int8_t *input = static_cast<u_int8_t *>(_random_object_bbox_label_tensor->buffer());
+    auto roi_dims = reinterpret_cast<int *>(_random_object_bbox_label_tensor->info().roi().get_ptr());
+    std::vector<size_t> max_size = _random_object_bbox_label_tensor->info().max_shape();
+    auto single_image_size = _random_object_bbox_label_tensor->data_size() / _user_batch_size;
+    auto input_dims = _random_object_bbox_label_tensor->num_of_dims() - 1;
+    uint seed = std::time(0);
+    BatchRNG _rng = {seed, static_cast<int>(_user_batch_size)};
+    std::uniform_real_distribution<> foreground(0, 1);
+    int *box1_buf = static_cast<int *>(_random_object_bbox_box1_buf);
+    int *box2_buf = static_cast<int *>(_random_object_bbox_box2_buf);
+#pragma omp parallel for num_threads(_user_batch_size)
+    for (uint i = 0; i < _user_batch_size; i++) {
+        auto sample_idx = i * input_dims;
+        int *input_shape = &roi_dims[sample_idx * 2 + input_dims];
+        std::vector<int> roi_size;
+        for (uint j = 0; j < input_dims; j++) {
+            roi_size.push_back(input_shape[j]);
+        }
+        std::vector<int> output_compact;
+        auto label = input + i * single_image_size;
+        int total_box = 0;
+        bool fg = foreground(_rng[i]) < _foreground_prob;
+        if (fg) total_box = labelMergeFunc(label, roi_size, max_size, output_compact, _rng[i]);
+        if (total_box) {
+            std::vector<std::vector<std::vector<unsigned>>> boxes;  // total - lo,hi - 4d
+            std::vector<std::pair<unsigned, unsigned>> ranges;      // totalbox - lo,hi
+            std::vector<unsigned> hits;
+            boxes.resize(total_box);
+            ranges.resize(total_box);
+            hits.resize((total_box / 32 + !!(total_box % 32)));
+            auto out_row = output_compact.data();
+            for (int d1 = 0; d1 < roi_size[0]; d1++) {
+                for (int d2 = 0; d2 < roi_size[1]; d2++) {
+                    for (int d3 = 0; d3 < roi_size[2]; d3++) {
+                        std::vector<int> origin{d1, d2, d3, 0};
+                        get_label_boundingboxes(boxes, ranges, hits, out_row, origin, roi_size[3]);
+                        out_row += roi_size[3];
+                    }
+                }
+            }
+            int chosen_box_idx = pick_box(boxes, _rng[i], _k_largest);
+            if(chosen_box_idx == -1) { ERR("No ROI regions found in input. Setting input shape as ROI region"); }
+            if(_random_object_bbox_output_format == "box") {
+                for (uint j = 0; j < input_dims; j++) {
+                    if(chosen_box_idx >= 0) {
+                        box1_buf[sample_idx + j] = boxes[chosen_box_idx][0][j];
+                        box1_buf[sample_idx + j + input_dims] = boxes[chosen_box_idx][1][j];
+                    }
+                    else {
+                        box1_buf[sample_idx + j] = 0;
+                        box1_buf[sample_idx + j + input_dims] = input_shape[j];
+                    }
+                }
+            } else if(_random_object_bbox_output_format == "anchor_shape") {
+                for (uint j = 0; j < input_dims; j++) {
+                    if(chosen_box_idx >= 0) {
+                        box1_buf[sample_idx + j] = boxes[chosen_box_idx][0][j];
+                        box2_buf[sample_idx + j] = boxes[chosen_box_idx][0][j] - boxes[chosen_box_idx][1][j];
+                    }
+                    else {
+                        box1_buf[sample_idx + j] = 0;
+                        box2_buf[sample_idx + j] = input_shape[j];
+                    }
+                }
+            } else if(_random_object_bbox_output_format == "start_end") {
+                for (uint j = 0; j < input_dims; j++) {
+                    if(chosen_box_idx >= 0) {
+                        box1_buf[sample_idx + j] = boxes[chosen_box_idx][0][j];
+                        box2_buf[sample_idx + j] = boxes[chosen_box_idx][1][j];
+                    }
+                    else {
+                        box1_buf[sample_idx + j] = 0;
+                        box2_buf[sample_idx + j] = input_shape[j];
+                    }
+                }
+            }
+        } else {
+            if(_random_object_bbox_output_format == "box") {
+                for (uint j = 0; j < input_dims; j++) {
+                    box1_buf[sample_idx + j] = 0;
+                    box1_buf[sample_idx + j + input_dims] = input_shape[j];
+                }
+            } else {
+                for (uint j = 0; j < input_dims; j++) {
+                    box1_buf[sample_idx + j] = 0;
+                    box2_buf[sample_idx + j] = input_shape[j];
+                }
+            }
+        }
+    }
+}
+
+int MasterGraph::pick_box(std::vector<std::vector<std::vector<unsigned>>> boxes, std::mt19937 &rng, int k_largest) {
+    auto beg = boxes.begin();
+    auto end = boxes.end();
+    int n = end - beg;
+    if (n <= 0)
+        return -1;
+    if (k_largest > 0 && k_largest < n) {
+        std::vector<std::pair<int64_t, int>> vol_idx;
+        vol_idx.resize(n);
+        for (int i = 0; i < n; i++) {
+            std::vector<unsigned> crop_region;
+            std::transform(boxes[i][1].begin(),boxes[i][1].end(), boxes[i][0].begin(),
+               std::back_inserter(crop_region),
+               [](const auto& hi, const auto& lo)
+               {
+                   return hi - lo;
+               });
+            auto volume_val = 1;
+            for (auto val : crop_region) {
+                volume_val *= val;
+            }
+            vol_idx[i] = {-volume_val, i};
+        }
+        std::sort(vol_idx.begin(), vol_idx.end());
+        std::uniform_int_distribution<int> dist(0, std::min(n, k_largest) - 1);
+        return vol_idx[dist(rng)].second;
+    } else {
+        std::uniform_int_distribution<int> dist(0, n - 1);
+        return dist(rng);
+    }
+}
+
+void MasterGraph::findLabels(const u_int8_t *input, std::set<int> &labels, std::vector<int> roi_size, std::vector<size_t> max_size) {
+    if (!roi_size.size() || !max_size.size())
+        return;
+    int prev = input[0];
+    labels.insert(prev);
+    int num_dims = roi_size.size();
+    std::vector<unsigned> strides(num_dims + 1);
+    strides[num_dims] = 1;
+    for (int i = num_dims - 1; i >= 0; i--) {
+        strides[i] = strides[i + 1] * max_size[i];
+    }
+    auto index = 0;
+    for (int c = 0; c < roi_size[0]; c++) {
+        int outerDim1 = index;
+        for (int d = 0; d < roi_size[1]; d++) {
+            int outerDim2 = outerDim1;
+            for (int h = 0; h < roi_size[2]; h++) {
+                int outerDim3 = outerDim2;
+                for (int w = 0; w < roi_size[3]; w++) {
+                    auto value = input[outerDim3++];
+                    if (value == prev)
+                        continue;  // skip runs of equal labels
+                    labels.insert(value);
+                    prev = value;
+                }
+                outerDim2 += strides[3];
+            }
+            outerDim1 += strides[2];
+        }
+        index += strides[1];
+    }
+}
+
+void MasterGraph::filterByLabel(const u_int8_t *input, std::vector<int> &output, std::vector<int> roi_size, std::vector<size_t> max_size, int label) {
+    int num_dims = roi_size.size();
+    std::vector<unsigned> strides(num_dims + 1);
+    strides[num_dims] = 1;
+    for (int i = num_dims - 1; i >= 0; i--) {
+        strides[i] = strides[i + 1] * max_size[i];
+    }
+    int index = 0;
+    int out_index = 0;
+    for (int c = 0; c < roi_size[0]; c++) {
+        int outerDim1 = index;
+        for (int d = 0; d < roi_size[1]; d++) {
+            int outerDim2 = outerDim1;
+            for (int h = 0; h < roi_size[2]; h++) {
+                int outerDim3 = outerDim2;
+                for (int w = 0; w < roi_size[3]; w++) {
+                    output[out_index++] = input[outerDim3++] == label;
+                }
+                outerDim2 += strides[3];
+            }
+            outerDim1 += strides[2];
+        }
+        index += strides[1];
+    }
+}
+
+void MasterGraph::labelRow(const int *label_base, const int *in_row, int *out_row, unsigned length) {
+    int curr_label = -1;
+    int bg_label = -1;
+    int prev = 0;
+    for (unsigned i = 0; i < length; i++) {
+        if (in_row[i] != prev) {
+            if (in_row[i] != 0) {
+                curr_label = out_row + i - label_base;
+            } else {
+                curr_label = bg_label;
+            }
+        }
+        out_row[i] = curr_label;
+        prev = in_row[i];
+    }
+}
+
+int MasterGraph::disjointSetGroup(int &x, int new_id) {
+    int old = x;
+    x = new_id;
+    return old;
+}
+
+int MasterGraph::disjointFind(int *items, int x) {
+    int x0 = x;
+
+    // find the label
+    for (;;) {
+        int g = disjointGetGroup(items[x]);
+        if (g == x)
+            break;
+        x = g;
+    }
+
+    int r = x;
+
+    // assign all intermediate labels to save time in subsequent calls
+    x = x0;
+    while (x != disjointGetGroup(items[x])) {
+        x0 = disjointSetGroup(items[x], r);
+        x = x0;
+    }
+
+    return r;
+}
+
+int MasterGraph::disjointMerge(int *items, int x, int y) {
+    y = disjointFind(items, y);
+    x = disjointFind(items, x);
+    if (x < y) {
+        disjointSetGroup(items[y], x);
+        return x;
+    } else if (y < x) {
+        disjointSetGroup(items[x], y);
+        return y;
+    } else {
+        // already merged
+        return x;
+    }
+}
+
+void MasterGraph::mergeRow(int *label_base, const int *in1, const int *in2, int *out1, int *out2, unsigned n) {
+    int bg_label = -1;
+    int prev1 = bg_label;
+    int prev2 = bg_label;
+    for (unsigned i = 0, in_offset = 0, out_offset = 0; i < n; i++, in_offset += 1, out_offset += 1) {
+        int &o1 = out1[out_offset];
+        int &o2 = out2[out_offset];
+        if (o1 != prev1 || o2 != prev2) {
+            if (o1 != bg_label) {
+                if (in1[in_offset] == in2[in_offset]) {
+                    disjointMerge(label_base, o1, o2);
+                }
+            }
+            prev1 = o1;
+            prev2 = o2;
+        }
+    }
+}
+
+int MasterGraph::labelMergeFunc(const u_int8_t *input, std::vector<int> &size, std::vector<size_t> &max_size, std::vector<int> &output_compact, std::mt19937 &rng) {
+    int64_t total_buf_size = 1;
+    for (auto val : size)
+        total_buf_size *= val;
+    std::vector<int> output_filtered;
+    output_filtered.resize(total_buf_size);
+    output_compact.resize(total_buf_size);
+    std::fill(output_filtered.begin(), output_filtered.end(), 0);
+    std::fill(output_compact.begin(), output_compact.end(), 0);
+    std::set<int> labels_found;
+    findLabels(input, labels_found, size, max_size);
+    labels_found.erase(0); // Removing background class
+    int selected_label;
+    if (!labels_found.size()) return 0;   // All labels belongs to background
+    if(labels_found.size() == 1) { selected_label = *labels_found.begin(); }
+    else {
+        std::uniform_int_distribution<int> class_dist{1, *labels_found.rbegin()};
+        selected_label = class_dist(rng);
+    }
+    filterByLabel(input, output_filtered, size, max_size, selected_label);
+    for (int i = 0; i < size[0]; i++) {
+        for (int j = 0; j < size[1]; j++) {
+            for (int k = 0; k < size[2]; k++) {
+                labelRow(output_compact.data(),
+                         output_filtered.data() + (i * size[1] * size[2] * size[3]) + (j * (size[2] * size[3])) + (k * size[3]),
+                         output_compact.data() + (i * size[1] * size[2] * size[3]) + (j * (size[2] * size[3])) + (k * size[3]),
+                         size[3]);
+                if (k > 0) {
+                    mergeRow(output_compact.data(),
+                             output_filtered.data() + (i * size[1] * size[2] * size[3]) + (j * (size[2] * size[3])) + (k - 1) * size[3],
+                             output_filtered.data() + (i * size[1] * size[2] * size[3]) + (j * (size[2] * size[3])) + (k)*size[3],
+                             output_compact.data() + (i * size[1] * size[2] * size[3]) + (j * (size[2] * size[3])) + ((k - 1) * size[3]),
+                             output_compact.data() + (i * size[1] * size[2] * size[3]) + (j * (size[2] * size[3])) + (k * size[3]),
+                             size[3]);
+                }
+            }
+        }
+    }
+    for (int k = 0; k < size[0]; k++) {
+        for (int stride = 1; stride <= size[1]; stride *= 2) {
+            for (int i = stride; i < size[1]; i += 2 * stride) {
+                auto out_slice = output_compact.data() + (i * size[2] * size[3]);
+                auto in_slice = output_filtered.data() + (i * size[2] * size[3]);
+                auto prev_out = output_compact.data() + ((i - 1) * size[2] * size[3]);
+                auto prev_in = output_filtered.data() + ((i - 1) * size[2] * size[3]);
+                mergeRow(output_compact.data(),
+                         prev_in, in_slice, prev_out, out_slice, size[2] * size[3]);
+            }
+        }
+    }
+    std::set<int> label_set;
+    int bg_label = 0;
+    int old_bg_label = -1;
+    int prev = old_bg_label;
+    int remapped = old_bg_label;
+    for (int64_t i = 0; i < total_buf_size; i++) {
+        if (output_compact[i] != old_bg_label) {
+            if (output_compact[i] != prev) {
+                prev = output_compact[i];
+                // look up `ds` only when the value changes - this saves a lot of lookups
+                remapped = disjointFind(output_compact.data(), i);
+                // no need to assign labels[i] = remapped; find did it
+                label_set.insert(remapped);
+            } else {
+                output_compact[i] = remapped;
+            }
+        }
+    }
+    std::map<int, int> label_map;
+    int next_label = 0;
+    for (auto old : label_set) {
+        if (next_label == bg_label)
+            next_label++;
+        label_map[old] = next_label++;
+    }
+    label_map[old_bg_label] = bg_label;
+    prev = output_compact[0];
+    remapped = label_map.find(prev)->second;
+    for (auto &label : output_compact) {
+        if (label != prev) {
+            prev = label;
+            remapped = label_map.find(prev)->second;
+        }
+        label = remapped;
+    }
+    return label_set.size();
+}
+
+bool MasterGraph::hit(std::vector<unsigned> &hits, unsigned idx) {
+    unsigned flag = (1u << (idx & 31));
+    unsigned &h = hits[idx >> 5];
+    bool ret = h & flag;
+    h |= flag;
+    return ret;
+}
+
+void MasterGraph::get_label_boundingboxes(std::vector<std::vector<std::vector<unsigned>>> &boxes,
+                                          std::vector<std::pair<unsigned, unsigned>> ranges,
+                                          std::vector<unsigned> hits,
+                                          int *in,
+                                          std::vector<int> origin,
+                                          unsigned width) {
+    for (auto &mask : hits) {
+        mask = 0u;  // mark all labels as not found in this row
+    }
+
+    int ndim = 4;
+
+    const unsigned nboxes = ranges.size();
+    int background = -1;
+    for (unsigned i = 0; i < width; i++) {
+        if (in[i] != background) {
+            // We make a "hole" in the label indices for the background.
+            int skip_bg = (background >= 0 && in[i] >= background);
+            unsigned idx = static_cast<unsigned>(in[i]) - skip_bg;
+            // deliberate use of unsigned overflow to detect negative labels as out-of-range
+            if (idx < nboxes) {
+                if (!hit(hits, idx)) {
+                    ranges[idx].first = i;
+                }
+                ranges[idx].second = i;
+            }
+        }
+    }
+
+    std::vector<unsigned> lo(4, 0);
+    std::vector<unsigned> hi(4, 0);
+
+    for (int i = 0; i < ndim; i++) {
+        lo[i] = origin[i];
+        hi[i] = origin[i] + 1;  // one past
+    }
+    const int d = 3;
+
+    for (uint word = 0; word < hits.size(); word++) {
+        unsigned mask = hits[word];
+        unsigned i = 32 * word;
+        while (mask) {
+            if ((mask & 0xffu) == 0) {  // skip 8 labels if not set
+                mask >>= 8;
+                i += 8;
+                continue;
+            }
+            if (mask & 1) {  // label found? mark it
+                lo[d] = ranges[i].first + origin[d];
+                hi[d] = (ranges[i].second + origin[d] + 1);  // one past the index found in this function
+                if (boxes[i].empty()) {
+                    // empty box - create a new one
+                    boxes[i].push_back(lo);
+                    boxes[i].push_back(hi);
+                } else {
+                    // expand existing
+                    std::transform(boxes[i][0].begin(), boxes[i][0].end(), lo.begin(), boxes[i][0].begin(),
+                                   [](const auto &val1, const auto &val2) {
+                                       return val1 < val2 ? val1 : val2;
+                                   });
+                    std::transform(boxes[i][1].begin(), boxes[i][1].end(), hi.begin(), boxes[i][1].begin(),
+                                   [](const auto &val1, const auto &val2) {
+                                       return val1 > val2 ? val1 : val2;
+                                   });
+                }
+            }
+            mask >>= 1;
+            i++;  // skip one label
+        }
+    }
+}
+
+Tensor* MasterGraph::roi_random_crop(Tensor *input, Tensor *roi_start, Tensor *roi_end, int *crop_shape)
+{
+    _is_roi_random_crop = true;
+    _roi_start_tensor = roi_start;
+    _roi_end_tensor = roi_end;
+    auto input_dims = input->info().is_image() ? input->num_of_dims() - 2 : input->num_of_dims() - 1;
+
+    _roi_batch = reinterpret_cast<int *>(input->info().roi().get_ptr());
+    _crop_shape_batch = new int[input_dims * _user_batch_size]; // TODO handle this case later when different crop_shape is given for each tensor
+
+    // replicate crop_shape values for all samples in a batch
+    for(uint i = 0; i < _user_batch_size; i++)
+    {
+        int sample_idx = i * input_dims;
+        memcpy(&(_crop_shape_batch[sample_idx]), crop_shape, input_dims * sizeof(int));
+    }
+
+    // create new instance of tensor class
+    std::vector<size_t> dims = {_user_batch_size, input_dims};
+    auto info = TensorInfo(std::move(dims), input->info().mem_type(), RocalTensorDataType::INT32);
+    _roi_random_crop_tensor = new Tensor(info);
+
+    // allocate memory for the raw buffer pointer in tensor object
+    allocate_host_or_pinned_mem(&_roi_random_crop_buf, _user_batch_size * input_dims * sizeof(int), input->info().mem_type());
+    _roi_random_crop_tensor->create_from_ptr(_context, _roi_random_crop_buf);
+    return _roi_random_crop_tensor;
+}
+
+void MasterGraph::update_roi_random_crop() {
+    int *crop_begin_batch = static_cast<int *>(_roi_random_crop_buf);
+    uint seed = std::time(0);
+    auto input_dims = _roi_random_crop_tensor->info().dims()[1];
+    // get the roi_begin and roi_end values from random_object_bbox
+    int *roi_begin_batch = static_cast<int *>(_random_object_bbox_box1_buf);
+    int *roi_end_batch = static_cast<int *>(_random_object_bbox_box2_buf);
+    BatchRNG _rng = {seed, static_cast<int>(_user_batch_size)};
+    for(uint i = 0; i < _user_batch_size; i++) {
+        int sample_idx = i * input_dims;
+        int *crop_shape = &_crop_shape_batch[sample_idx];
+        int *roi_begin = &roi_begin_batch[sample_idx];
+        int *input_shape = &_roi_batch[sample_idx * 2 + input_dims];
+        int *roi_end = &roi_end_batch[sample_idx];
+        int *crop_begin = &crop_begin_batch[sample_idx];
+
+        for(uint j = 0; j < input_dims; j++) {
+            // check if crop_shape, roi_end is greater than input_shape
+            if(crop_shape[j] > input_shape[j])
+                THROW("crop shape cannot be greater than input shape");
+            if (roi_end[j] > input_shape[j])
+                THROW("ROI shape cannot be greater than input shape");
+
+            int roi_length = roi_end[j] - roi_begin[j];
+            int crop_length = crop_shape[j];
+            if (roi_length == crop_length) {
+                crop_begin[j] = roi_begin[j];
+            } else {
+                int64_t start_range[2] = {roi_begin[j], roi_end[j] - crop_length};
+
+                // swap range values if start_range[0] > start_range[1]
+                if (start_range[0] > start_range[1]) {
+                    int64_t temp = start_range[0];
+                    start_range[0] = start_range[1];
+                    start_range[1] = temp;
+                }
+
+                // check if range is within the bounds of input
+                start_range[0] = std::max<int64_t>(0, start_range[0]);
+                start_range[1] = std::min<int64_t>(input_shape[j] - crop_length, start_range[1]);
+
+                auto dist = std::uniform_int_distribution<int64_t>(start_range[0], start_range[1]);
+                crop_begin[j] = dist(_rng[i]);
+            }
+        }
+    }
+}
 
 void MasterGraph::notify_user_thread() {
     if (_output_routine_finished_processing)
diff --git a/rocAL/source/pipeline/node.cpp b/rocAL/source/pipeline/node.cpp
index 410d351ac..e6940f870 100644
--- a/rocAL/source/pipeline/node.cpp
+++ b/rocAL/source/pipeline/node.cpp
@@ -38,3 +38,18 @@ void Node::create(std::shared_ptr<Graph> graph) {
 void Node::update_parameters() {
     update_node();
 }
+
+void Node::add_next(const std::shared_ptr<Node> &node) {
+    
+    if (node->get_id() < 0)
+        node->set_id(_graph_id);
+    _next.emplace_back(node);
+}
+
+void Node::add_previous(const std::shared_ptr<Node> &node) {
+    for (auto prev_node : _prev) {
+        if (prev_node->get_id() != node->get_id())
+            THROW("The nodes are interdependent between 2 graphs")
+    }
+    _prev.emplace_back(node);
+}
diff --git a/rocAL/source/pipeline/tensor.cpp b/rocAL/source/pipeline/tensor.cpp
index bc234e813..2cab4ce79 100644
--- a/rocAL/source/pipeline/tensor.cpp
+++ b/rocAL/source/pipeline/tensor.cpp
@@ -77,6 +77,10 @@ vx_enum interpret_tensor_data_type(RocalTensorDataType data_type) {
             return VX_TYPE_FLOAT16;
         case RocalTensorDataType::UINT8:
             return VX_TYPE_UINT8;
+        case RocalTensorDataType::UINT32:
+            return VX_TYPE_UINT32;
+        case RocalTensorDataType::INT32:
+            return VX_TYPE_INT32;
         default:
             THROW("Unsupported Tensor type " + TOSTR(data_type))
     }
@@ -110,7 +114,7 @@ void TensorInfo::reset_tensor_roi_buffers() {
     unsigned *roi_buf;
     auto roi_no_of_dims = _is_image ? 2 : (_num_of_dims - 1);
     auto roi_size = (_layout == RocalTensorlayout::NFCHW || _layout == RocalTensorlayout::NFHWC) ? _dims[0] * _dims[1] : _batch_size;  // For Sequences pre allocating the ROI to N * F to replicate in OpenVX extensions
-    allocate_host_or_pinned_mem((void **)&roi_buf, roi_size * roi_no_of_dims * 2 * sizeof(unsigned), _mem_type);                       // 2 denotes, one coordinate each for begin and end
+    allocate_host_or_pinned_mem((void **)&roi_buf, roi_size * roi_no_of_dims * 2 * sizeof(unsigned), _mem_type);
     _roi.set_ptr(roi_buf, _mem_type, roi_size, roi_no_of_dims);
     if (_is_image) {
         Roi2DCords *roi = _roi.get_2D_roi();
@@ -119,7 +123,10 @@ void TensorInfo::reset_tensor_roi_buffers() {
             roi[i].xywh.h = _max_shape.at(1);
         }
     } else {
-        // TODO - For other tensor types
+        for (unsigned i = 0; i < _batch_size; i++) {
+            unsigned *tensor_shape = _roi[i].end;
+            tensor_shape[i] = _max_shape[i];
+        }
     }
 }
 
@@ -212,15 +219,10 @@ void Tensor::update_tensor_roi(const std::vector<std::vector<uint32_t>> &shape)
     for (unsigned i = 0; i < info().batch_size(); i++) {
         if (shape[i].size() != (info().num_of_dims() - 1))
             THROW("The number of dims to be updated and the num of dims of tensor info does not match")
-
+        
         unsigned *tensor_shape = _info.roi()[i].end;
-        for (unsigned d = 0; d < shape[i].size(); d++) {
-            if (shape[i][d] > max_shape[d]) {
-                WRN("Given ROI shape is larger than buffer shape for tensor[" + TOSTR(i) + "] " + TOSTR(shape[i][d]) + " > " + TOSTR(max_shape[d]))
-                tensor_shape[d] = max_shape[d];
-            } else {
-                tensor_shape[d] = shape[i][d];
-            }
+        for (unsigned j = 0; j < max_shape.size(); j++) {
+            tensor_shape[j] = shape[i][j] > max_shape[j] ? max_shape[j] : shape[i][j];
         }
     }
 }
@@ -281,6 +283,30 @@ int Tensor::create_from_handle(vx_context context) {
     return 0;
 }
 
+int Tensor::create_from_ptr(vx_context context, void *ptr) {
+    if (_vx_handle) {
+        WRN("Tensor object create method is already called ")
+        return -1;
+    }
+
+    _context = context;
+    vx_enum tensor_data_type = interpret_tensor_data_type(_info.data_type());
+    unsigned num_of_dims = _info.num_of_dims();
+    vx_size stride[num_of_dims];
+
+    stride[0] = tensor_data_size(_info.data_type());
+    for (unsigned i = 1; i < num_of_dims; i++)
+        stride[i] = stride[i - 1] * _info.dims().at(i - 1);
+
+    _vx_handle = vxCreateTensorFromHandle(_context, _info.num_of_dims(), _info.dims().data(), tensor_data_type, 0, stride, ptr, vx_mem_type(_info._mem_type));
+    vx_status status;
+    if ((status = vxGetStatus((vx_reference)_vx_handle)) != VX_SUCCESS)
+        THROW("Error: vxCreateTensorFromHandle(input: failed " + TOSTR(status))
+    _info._type = TensorInfo::Type::HANDLE;
+    _mem_handle = ptr;
+    return 0;
+}
+
 int Tensor::create(vx_context context) {
     if (_vx_handle) {
         WRN("Tensor object create method is already called ")
diff --git a/rocAL/source/readers/image/numpy_data_reader.cpp b/rocAL/source/readers/image/numpy_data_reader.cpp
new file mode 100644
index 000000000..2f2171509
--- /dev/null
+++ b/rocAL/source/readers/image/numpy_data_reader.cpp
@@ -0,0 +1,527 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "numpy_data_reader.h"
+
+#include <commons.h>
+
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+#include <random>
+
+#include "filesystem.h"
+
+NumpyDataReader::NumpyDataReader() : _shuffle_time("shuffle_time", DBG_TIMING) {
+    _src_dir = nullptr;
+    _sub_dir = nullptr;
+    _entity = nullptr;
+    _curr_file_idx = 0;
+    _current_file_size = 0;
+    _current_fPtr = nullptr;
+    _loop = false;
+    _file_id = 0;
+    _shuffle = false;
+    _file_count_all_shards = 0;
+}
+
+unsigned NumpyDataReader::count_items() {
+    if (_loop)
+        return _file_names.size();
+
+    int ret = ((int)_file_names.size() - _read_counter);
+    return ((ret < 0) ? 0 : ret);
+}
+
+Reader::Status NumpyDataReader::initialize(ReaderConfig desc) {
+    auto ret = Reader::Status::OK;
+    _file_id = 0;
+    _folder_path = desc.path();
+    _shard_id = desc.get_shard_id();
+    _shard_count = desc.get_shard_count();
+    _batch_count = desc.get_batch_size();
+    _shuffle = desc.shuffle();
+    _loop = desc.loop();
+    _files = desc.get_files();
+    _seed = desc.seed();
+    ret = subfolder_reading();
+    // the following code is required to make every shard the same size:: required for multi-gpu training
+    if (_shard_count > 1 && _batch_count > 1 && _files.empty()) {
+        int _num_batches = _file_names.size() / _batch_count;
+        int max_batches_per_shard = (_file_count_all_shards + _shard_count - 1) / _shard_count;
+        max_batches_per_shard = (max_batches_per_shard + _batch_count - 1) / _batch_count;
+        if (_num_batches < max_batches_per_shard) {
+            replicate_last_batch_to_pad_partial_shard();
+        }
+    }
+    _file_headers.resize(_file_names.size());
+    // shuffle dataset if set
+    _shuffle_time.start();
+    if (ret == Reader::Status::OK && _shuffle) {
+        std::mt19937 rng(_seed);
+        std::shuffle(_file_names.begin(), _file_names.end(), rng);
+    }
+    _shuffle_time.end();
+    return ret;
+}
+
+void NumpyDataReader::incremenet_read_ptr() {
+    _read_counter++;
+    _curr_file_idx = (_curr_file_idx + 1) % _file_names.size();
+}
+
+size_t NumpyDataReader::open() {
+    auto file_path = _file_names[_curr_file_idx];       // Get current file name
+    _curr_file_header = _file_headers[_curr_file_idx];  // Get current file header
+    incremenet_read_ptr();
+    _last_id = file_path;
+    auto last_slash_idx = _last_id.find_last_of("\\/");
+    if (std::string::npos != last_slash_idx) {
+        _last_id.erase(0, last_slash_idx + 1);
+    }
+
+    auto ret = get_cached_header(file_path, _curr_file_header);
+    if (!ret) {
+        read_header(_curr_file_header, file_path);
+        update_header_cache(file_path, _curr_file_header);
+    } else {
+        _current_fPtr = std::fopen(file_path.c_str(), "rb");
+        if (_current_fPtr == nullptr)
+            THROW("Could not open file " + file_path + ": " + std::strerror(errno));
+    }
+    fseek(_current_fPtr, 0, SEEK_SET);  // Take the file pointer back to the start
+
+    return _curr_file_header.nbytes();
+}
+
+bool NumpyDataReader::get_cached_header(const std::string& file_name, NumpyHeaderData& header) {
+    std::unique_lock<std::mutex> cache_lock(_cache_mutex_);
+    auto it = _header_cache_.find(file_name);
+    if (it == _header_cache_.end()) {
+        return false;
+    } else {
+        header = it->second;
+        return true;
+    }
+}
+
+void NumpyDataReader::update_header_cache(const std::string& file_name, const NumpyHeaderData& value) {
+    std::unique_lock<std::mutex> cache_lock(_cache_mutex_);
+    _header_cache_[file_name] = value;
+}
+
+const RocalTensorDataType NumpyDataReader::get_numpy_dtype(const std::string& format) {
+    if (format == "u1") return RocalTensorDataType::UINT8;
+    // if (format == "u2") return TypeTable::GetTypeInfo<uint16_t>();   // Currently not supported in rocAL
+    if (format == "u4") return RocalTensorDataType::UINT32;
+    // if (format == "u8") return TypeTable::GetTypeInfo<uint64_t>();   // Currently not supported in rocAL
+    if (format == "i1") return RocalTensorDataType::INT8;
+    // if (format == "i2") return TypeTable::GetTypeInfo<int16_t>();    // Currently not supported in rocAL
+    if (format == "i4") return RocalTensorDataType::INT32;
+    // if (format == "i8") return TypeTable::GetTypeInfo<int64_t>();    // Currently not supported in rocAL
+    if (format == "f2")
+#if defined(AMD_FP16_SUPPORT)
+        return RocalTensorDataType::FP16;
+#else
+        THROW("FLOAT16 type tensor not supported")
+#endif
+    if (format == "f4") return RocalTensorDataType::FP32;
+    // if (format == "f8") return TypeTable::GetTypeInfo<double>();     // Currently not supported in rocAL
+    THROW("Unknown Numpy type string");
+}
+
+inline void NumpyDataReader::ignore_spaces(const char*& ptr) {
+    while (::isspace(*ptr))
+        ptr++;
+}
+
+template <size_t N>
+void NumpyDataReader::skip_string(const char*& ptr, const char (&what)[N]) {
+    if (strncmp(ptr, what, N - 1))
+        THROW("Found wrong symbol during parsing");
+    ptr += N - 1;
+}
+
+template <size_t N>
+bool NumpyDataReader::check_and_skip_string(const char*& ptr, const char (&what)[N]) {
+    if (!strncmp(ptr, what, N - 1)) {
+        ptr += N - 1;
+        return true;
+    } else {
+        return false;
+    }
+}
+
+template <size_t N>
+void NumpyDataReader::skip_field(const char*& ptr, const char (&name)[N]) {
+    ignore_spaces(ptr);
+    skip_string(ptr, "'");
+    skip_string(ptr, name);
+    skip_string(ptr, "'");
+    ignore_spaces(ptr);
+    skip_string(ptr, ":");
+    ignore_spaces(ptr);
+}
+
+template <typename T = int64_t>
+T NumpyDataReader::parse_int(const char*& ptr) {
+    char* out_ptr = const_cast<char*>(ptr);  // strtol takes a non-const pointer
+    T value = static_cast<T>(strtol(ptr, &out_ptr, 10));
+    if (out_ptr == ptr)
+        THROW("Parse error: expected a number.");
+    ptr = out_ptr;
+    return value;
+}
+
+std::string NumpyDataReader::read_dtype_string(const char*& input, char delim_start, char delim_end) {
+    if (*input++ != delim_start)
+        THROW("Expected \'" + std::to_string(delim_start) + "\'");
+    std::string out;
+    for (; *input != '\0'; input++) {
+        if (*input == '\\') {
+            switch (*++input) {
+                case '\\':
+                    out += '\\';
+                    break;
+                case '\'':
+                    out += '\'';
+                    break;
+                case '\t':
+                    out += '\t';
+                    break;
+                case '\n':
+                    out += '\n';
+                    break;
+                case '\"':
+                    out += '\"';
+                    break;
+                default:
+                    out += '\\';
+                    out += *input;
+                    break;
+            }
+        } else if (*input == delim_end) {
+            break;
+        } else {
+            out += *input;
+        }
+    }
+    if (*input++ != delim_end)
+        THROW("Expected \'" + std::to_string(delim_end) + "\'");
+    return out;
+}
+
+void NumpyDataReader::decode_header(NumpyHeaderData& target, const std::string& header) {
+    const char* hdr = header.c_str();
+    ignore_spaces(hdr);
+    skip_string(hdr, "{");
+    skip_field(hdr, "descr");
+    auto typestr = read_dtype_string(hdr);
+    // < means LE, | means N/A, = means native. In all those cases, we can read
+    bool little_endian = (typestr[0] == '<' || typestr[0] == '|' || typestr[0] == '=');
+    if (!little_endian)
+        THROW("Big Endian files are not supported.");
+    target._type_info = get_numpy_dtype(typestr.substr(1));
+
+    ignore_spaces(hdr);
+    skip_string(hdr, ",");
+    skip_field(hdr, "fortran_order");
+    if (check_and_skip_string(hdr, "True")) {
+        target._fortran_order = true;
+    } else if (check_and_skip_string(hdr, "False")) {
+        target._fortran_order = false;
+    } else {
+        THROW("Failed to parse fortran_order field.");
+    }
+    ignore_spaces(hdr);
+    skip_string(hdr, ",");
+    skip_field(hdr, "shape");
+    skip_string(hdr, "(");
+    ignore_spaces(hdr);
+    target._shape.clear();
+    while (*hdr != ')') {
+        // parse_int already skips the leading spaces (strtol does).
+        target._shape.push_back(static_cast<unsigned>(parse_int<int64_t>(hdr)));
+        ignore_spaces(hdr);
+        if (!(check_and_skip_string(hdr, ",")) && (target._shape.size() <= 1))
+            THROW("The first number in a tuple must be followed by a comma.");
+    }
+    if (target._fortran_order) {
+        // cheapest thing to do is to define the tensor in an reversed way
+        std::reverse(target._shape.begin(), target._shape.end());
+    }
+}
+
+void NumpyDataReader::read_header(NumpyHeaderData& parsed_header, std::string file_path) {
+    // check if the file is actually a numpy file
+    std::vector<char> token(128);
+    _current_fPtr = std::fopen(file_path.c_str(), "rb");
+    if (_current_fPtr == nullptr)
+        THROW("Could not open file " + file_path + ": " + std::strerror(errno));
+    int64_t n_read = std::fread(token.data(), 1, 10, _current_fPtr);
+    if (n_read != 10)
+        THROW("Can not read header.");
+    token[n_read] = '\0';
+
+    // check if heqder is too short
+    std::string header = std::string(token.data());
+    if (header.find_first_of("NUMPY") == std::string::npos)
+        THROW("File is not a numpy file.");
+
+    // extract header length
+    uint16_t header_len = 0;
+    memcpy(&header_len, &token[8], 2);
+    if ((header_len + 10) % 16 != 0)
+        THROW("Error extracting header length.");
+
+    // read header: the offset is a magic number
+    int64_t offset = 6 + 1 + 1 + 2;
+    // the header_len can be 4GiB according to the NPYv2 file format
+    // specification: https://numpy.org/neps/nep-0001-npy-format.html
+    // while this allocation could be sizable, it is performed on the host.
+    token.resize(header_len + 1);
+    if (std::fseek(_current_fPtr, offset, SEEK_SET))
+        THROW("Seek operation failed: " + std::strerror(errno));
+    n_read = std::fread(token.data(), 1, header_len, _current_fPtr);
+    if (n_read != header_len)
+        THROW("Can not read header.");
+    token[header_len] = '\0';
+    header = std::string(token.data());
+    if (header.find('{') == std::string::npos)
+        THROW("Header is corrupted.");
+    offset += header_len;
+    if (std::fseek(_current_fPtr, offset, SEEK_SET))
+        THROW("Seek operation failed: " + std::strerror(errno));
+
+    decode_header(parsed_header, header);
+    parsed_header._data_offset = offset;
+}
+
+size_t NumpyDataReader::read_numpy_data(void* buf, size_t read_size, std::vector<size_t> max_shape) {
+    if (!_current_fPtr)
+        THROW("Null file pointer");
+
+    // Requested read size bigger than the file size? just read as many bytes as the file size
+    read_size = (read_size > _current_file_size) ? _current_file_size : read_size;
+
+    if (std::fseek(_current_fPtr, _curr_file_header._data_offset, SEEK_SET))
+        THROW("Seek operation failed: " + std::strerror(errno));
+
+    auto shape = _curr_file_header.shape();
+    auto num_dims = max_shape.size();
+    std::vector<unsigned> strides(num_dims + 1);
+    strides[num_dims] = 1;
+    for (int i = num_dims - 1; i >= 0; i--) {
+        strides[i] = strides[i + 1] * max_shape[i];
+    }
+
+    size_t actual_read_size = 0;
+    if (_curr_file_header.type() == RocalTensorDataType::UINT8)
+        actual_read_size = copy_array_data<u_int8_t>((u_int8_t*)buf, strides, shape);
+    if (_curr_file_header.type() == RocalTensorDataType::UINT32)
+        actual_read_size = copy_array_data<u_int32_t>((u_int32_t*)buf, strides, shape);
+    if (_curr_file_header.type() == RocalTensorDataType::INT8)
+        actual_read_size = copy_array_data<int8_t>((int8_t*)buf, strides, shape);
+    if (_curr_file_header.type() == RocalTensorDataType::INT32)
+        actual_read_size = copy_array_data<int32_t>((int32_t*)buf, strides, shape);
+    if (_curr_file_header.type() == RocalTensorDataType::FP16)
+#if defined(AMD_FP16_SUPPORT)
+        actual_read_size = copy_array_data<half>((half*)buf, strides, shape);
+#else
+        THROW("FLOAT16 type tensor not supported")
+#endif
+    if (_curr_file_header.type() == RocalTensorDataType::FP32)
+        actual_read_size = copy_array_data<float>((float*)buf, strides, shape);
+
+    return actual_read_size;
+}
+
+template <typename T>
+size_t NumpyDataReader::copy_array_data(T* buf, std::vector<unsigned> strides, std::vector<unsigned> shapes, unsigned dim) {
+    if (dim == (shapes.size() - 1)) {
+        auto actual_read_size = std::fread(buf, sizeof(T), shapes[dim], _current_fPtr);
+        return actual_read_size;
+    }
+    T* startPtr = buf;
+    size_t read_size = 0;
+    for (unsigned d = 0; d < shapes[dim]; d++) {
+        read_size += copy_array_data<T>(startPtr, strides, shapes, dim + 1);
+        startPtr += strides[dim + 1];
+    }
+    return read_size;
+}
+
+const NumpyHeaderData NumpyDataReader::get_numpy_header_data() {
+    return _curr_file_header;
+}
+
+size_t NumpyDataReader::read_data(unsigned char* buf, size_t read_size) {
+    if (!_current_fPtr)
+        return 0;
+
+    // Requested read size bigger than the file size? just read as many bytes as the file size
+    read_size = (read_size > _current_file_size) ? _current_file_size : read_size;
+
+    if (std::fseek(_current_fPtr, _curr_file_header._data_offset, SEEK_SET))
+        THROW("Seek operation failed: " + std::strerror(errno));
+
+    size_t actual_read_size = std::fread(buf, 1, _curr_file_header.nbytes(), _current_fPtr);
+    return actual_read_size;
+}
+
+int NumpyDataReader::close() {
+    return release();
+}
+
+NumpyDataReader::~NumpyDataReader() {
+    release();
+}
+
+int NumpyDataReader::release() {
+    if (!_current_fPtr)
+        return 0;
+    fclose(_current_fPtr);
+    _current_fPtr = nullptr;
+    return 0;
+}
+
+void NumpyDataReader::reset() {
+    _shuffle_time.start();
+    if (_shuffle) {
+        std::mt19937 rng(_seed);
+        std::shuffle(_file_names.begin(), _file_names.end(), rng);
+    }
+    _shuffle_time.end();
+    _read_counter = 0;
+    _curr_file_idx = 0;
+}
+
+Reader::Status NumpyDataReader::subfolder_reading() {
+    auto ret = Reader::Status::OK;
+    if (!_files.empty()) {
+        for (unsigned file_count = 0; file_count < _files.size(); file_count++) {
+            std::string file_path = _files[file_count];
+            filesys::path pathObj(file_path);
+            if (filesys::exists(pathObj) && filesys::is_regular_file(pathObj)) {
+                // ignore files with extensions .tar, .zip, .7z
+                auto file_extension_idx = file_path.find_last_of(".");
+                if (file_extension_idx != std::string::npos) {
+                    std::string file_extension = file_path.substr(file_extension_idx + 1);
+                    if (file_extension != "npy")
+                        continue;
+                    else
+                        _file_names.push_back(file_path);
+                }
+            }
+        }
+    } else {
+        if ((_sub_dir = opendir(_folder_path.c_str())) == nullptr)
+            THROW("NumpyDataReader ShardID [" + TOSTR(_shard_id) + "] ERROR: Failed opening the directory at " + _folder_path);
+
+        std::vector<std::string> entry_name_list;
+        std::string _full_path = _folder_path;
+
+        while ((_entity = readdir(_sub_dir)) != nullptr) {
+            std::string entry_name(_entity->d_name);
+            if (strcmp(_entity->d_name, ".") == 0 || strcmp(_entity->d_name, "..") == 0) continue;
+            entry_name_list.push_back(entry_name);
+        }
+        closedir(_sub_dir);
+        std::sort(entry_name_list.begin(), entry_name_list.end());
+
+        for (unsigned dir_count = 0; dir_count < entry_name_list.size(); ++dir_count) {
+            std::string subfolder_path = _full_path + "/" + entry_name_list[dir_count];
+            filesys::path pathObj(subfolder_path);
+            if (filesys::exists(pathObj) && filesys::is_regular_file(pathObj)) {
+                // ignore files with extensions .tar, .zip, .7z
+                auto file_extension_idx = subfolder_path.find_last_of(".");
+                if (file_extension_idx != std::string::npos) {
+                    std::string file_extension = subfolder_path.substr(file_extension_idx + 1);
+                    if (file_extension != "npy")
+                        continue;
+                }
+                ret = open_folder();
+                break;  // assume directory has only files.
+            } else if (filesys::exists(pathObj) && filesys::is_directory(pathObj)) {
+                _folder_path = subfolder_path;
+                if (open_folder() != Reader::Status::OK)
+                    WRN("NumpyDataReader ShardID [" + TOSTR(_shard_id) + "] File reader cannot access the storage at " + _folder_path);
+            }
+        }
+    }
+    if (_in_batch_read_count > 0 && _in_batch_read_count < _batch_count) {
+        replicate_last_image_to_fill_last_shard();
+        LOG("NumpyDataReader ShardID [" + TOSTR(_shard_id) + "] Replicated " + _folder_path + _last_file_name + " " + TOSTR((_batch_count - _in_batch_read_count)) + " times to fill the last batch")
+    }
+    if (!_file_names.empty())
+        LOG("NumpyDataReader ShardID [" + TOSTR(_shard_id) + "] Total of " + TOSTR(_file_names.size()) + " images loaded from " + _full_path)
+    return ret;
+}
+
+void NumpyDataReader::replicate_last_image_to_fill_last_shard() {
+    for (size_t i = _in_batch_read_count; i < _batch_count; i++)
+        _file_names.push_back(_last_file_name);
+}
+
+void NumpyDataReader::replicate_last_batch_to_pad_partial_shard() {
+    if (_file_names.size() >= _batch_count) {
+        for (size_t i = 0; i < _batch_count; i++)
+            _file_names.push_back(_file_names[i - _batch_count]);
+    }
+}
+
+Reader::Status NumpyDataReader::open_folder() {
+    if ((_src_dir = opendir(_folder_path.c_str())) == nullptr)
+        THROW("NumpyDataReader ShardID [" + TOSTR(_shard_id) + "] ERROR: Failed opening the directory at " + _folder_path);
+
+    while ((_entity = readdir(_src_dir)) != nullptr) {
+        if (_entity->d_type != DT_REG)
+            continue;
+
+        if (get_file_shard_id() != _shard_id) {
+            _file_count_all_shards++;
+            incremenet_file_id();
+            continue;
+        }
+        _in_batch_read_count++;
+        _in_batch_read_count = (_in_batch_read_count % _batch_count == 0) ? 0 : _in_batch_read_count;
+        std::string file_path = _folder_path;
+        file_path.append("/");
+        file_path.append(_entity->d_name);
+        _last_file_name = file_path;
+        _file_names.push_back(file_path);
+        _file_count_all_shards++;
+        incremenet_file_id();
+    }
+    if (_file_names.empty())
+        WRN("NumpyDataReader ShardID [" + TOSTR(_shard_id) + "] Did not load any file from " + _folder_path)
+
+    closedir(_src_dir);
+    return Reader::Status::OK;
+}
+
+size_t NumpyDataReader::get_file_shard_id() {
+    if (_batch_count == 0 || _shard_count == 0)
+        THROW("Shard (Batch) size cannot be set to 0")
+    // return (_file_id / (_batch_count)) % _shard_count;
+    return _file_id % _shard_count;
+}
diff --git a/rocAL/source/readers/image/reader_factory.cpp b/rocAL/source/readers/image/reader_factory.cpp
index add3295f6..74b395bab 100644
--- a/rocAL/source/readers/image/reader_factory.cpp
+++ b/rocAL/source/readers/image/reader_factory.cpp
@@ -32,6 +32,7 @@ THE SOFTWARE.
 #include "external_source_reader.h"
 #include "file_source_reader.h"
 #include "mxnet_recordio_reader.h"
+#include "numpy_data_reader.h"
 #include "sequence_file_source_reader.h"
 #include "tf_record_reader.h"
 
@@ -91,6 +92,12 @@ std::shared_ptr<Reader> create_reader(ReaderConfig config) {
                 throw std::runtime_error("ExternalSourceReader cannot access the storage");
             return ret;
         } break;
+        case StorageType::NUMPY_DATA: {
+            auto ret = std::make_shared<NumpyDataReader>();
+            if (ret->initialize(config) != Reader::Status::OK)
+                throw std::runtime_error("NumpyDataReader cannot access the storage");
+            return ret;
+        } break;
         default:
             throw std::runtime_error("Reader type is unsupported");
     }
diff --git a/rocAL_pybind/amd/rocal/fn.py b/rocAL_pybind/amd/rocal/fn.py
index 314a2996b..1ec5289c2 100644
--- a/rocAL_pybind/amd/rocal/fn.py
+++ b/rocAL_pybind/amd/rocal/fn.py
@@ -135,12 +135,13 @@ def fog(*inputs, fog=0.5, device=None, output_layout=types.NHWC, output_dtype=ty
     return (fog_image)
 
 
-def brightness(*inputs, brightness=None, brightness_shift=None, device=None, output_layout=types.NHWC, output_dtype=types.UINT8):
+def brightness(*inputs, brightness=None, brightness_shift=None, conditional_execution=1, device=None, output_layout=types.NHWC, output_dtype=types.UINT8):
     """!Adjusts brightness of the image.
 
         @param inputs                                                                 the input image passed to the augmentation
         @param brightness (float, optional, default = None):                          brightness multiplier. Values >= 0 are accepted. For example: 0 - black image, 1 - no change, 2 - increase brightness twice
         @param brightness_shift (float, optional, default = None)                     brightness shift
+        @param conditional_execution (int, optional, default = None)                  controls the execution of the augmentation
         @param device (string, optional, default = None)                              Parameter unused for augmentation
         @param output_layout (int, optional, default = types.NHWC)                    tensor layout for the augmentation output
         @param output_dtype (int, optional, default = types.UINT8)                    tensor dtype for the augmentation output
@@ -151,22 +152,25 @@ def brightness(*inputs, brightness=None, brightness_shift=None, device=None, out
         brightness, float) else brightness
     brightness_shift = b.createFloatParameter(brightness_shift) if isinstance(
         brightness_shift, float) else brightness_shift
+    conditional_execution = b.createIntParameter(conditional_execution) if isinstance(
+            conditional_execution, int) else conditional_execution
 
     # pybind call arguments
     kwargs_pybind = {"input_image": inputs[0], "is_output": False, "brightness": brightness, "brightness_shift": brightness_shift,
-                     "output_layout": output_layout, "output_dtype": output_dtype}
+                     "conditional_execution": conditional_execution, "output_layout": output_layout, "output_dtype": output_dtype}
     brightness_image = b.brightness(
         Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
     return (brightness_image)
 
 
-def brightness_fixed(*inputs, brightness=1.0, brightness_shift=0.0, device=None,
+def brightness_fixed(*inputs, brightness=1.0, brightness_shift=0.0, conditional_execution=1, device=None,
                      output_layout=types.NHWC, output_dtype=types.UINT8):
     """!Adjusts brightness of the image with fixed parameters.
 
         @param inputs                                                                 the input image passed to the augmentation
         @param brightness (float, optional, default = 1.0)                            brightness multiplier. Values >= 0 are accepted. For example: 0 - black image, 1 - no change, 2 - increase brightness twice
         @param brightness_shift (float, optional, default = 0.0)                      brightness shift
+        @param conditional_execution (int, optional, default = None)                  controls the execution of the augmentation
         @param device (string, optional, default = None)                              Parameter unused for augmentation
         @param output_layout (int, optional, default = types.NHWC)                    tensor layout for the augmentation output
         @param output_dtype (int, optional, default = types.UINT8)                    tensor dtype for the augmentation output
@@ -175,7 +179,7 @@ def brightness_fixed(*inputs, brightness=1.0, brightness_shift=0.0, device=None,
     """
     # pybind call arguments
     kwargs_pybind = {"input_image": inputs[0], "is_output": False, "brightness": brightness, "brightness_shift": brightness_shift,
-                     "output_layout": output_layout, "output_dtype": output_dtype}
+                     "conditional_execution": conditional_execution, "output_layout": output_layout, "output_dtype": output_dtype}
     brightness_image = b.brightnessFixed(
         Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
     return (brightness_image)
@@ -252,12 +256,13 @@ def contrast(*inputs, contrast=None, contrast_center=None, device=None, output_l
     return (contrast_image)
 
 
-def flip(*inputs, horizontal=0, vertical=0, device=None, output_layout=types.NHWC, output_dtype=types.UINT8):
+def flip(*inputs, horizontal=0, vertical=0, depth=0, device=None, output_layout=types.NHWC, output_dtype=types.UINT8):
     """!Flip images horizontally and/or vertically based on inputs.
 
         @param inputs                                                                 the input image passed to the augmentation
         @param horizontal (int, optional, default = 0)                                flip the horizontal dimension
         @param vertical (int, optional, default = 0)                                  flip the vertical dimension
+        @param depth (int, optional, default = 0)                                     flip the depth dimension
         @param device (string, optional, default = None)                              Parameter unused for augmentation
         @param output_layout (int, optional, default = types.NHWC)                    tensor layout for the augmentation output
         @param output_dtype (int, optional, default = types.UINT8)                    tensor dtype for the augmentation output
@@ -268,10 +273,12 @@ def flip(*inputs, horizontal=0, vertical=0, device=None, output_layout=types.NHW
         horizontal, int) else horizontal
     vertical = b.createIntParameter(
         vertical) if isinstance(vertical, int) else vertical
+    depth = b.createIntParameter(
+        depth) if isinstance(depth, int) else depth
 
     # pybind call arguments
     kwargs_pybind = {"input_image": inputs[0],
-                     "is_output": False, "horizontal": horizontal, "vertical": vertical, "output_layout": output_layout, "output_dtype": output_dtype}
+                     "is_output": False, "horizontal": horizontal, "vertical": vertical, "depth": depth, "output_layout": output_layout, "output_dtype": output_dtype}
     flip_image = b.flip(Pipeline._current_pipeline._handle,
                         *(kwargs_pybind.values()))
     return (flip_image)
@@ -822,6 +829,32 @@ def crop(*inputs, crop=[0, 0], crop_pos_x=0.5, crop_pos_y=0.5, crop_pos_z=0.5,
     return (cropped_image)
 
 
+def slice(*inputs, anchor = [], shape = [], dtype = types.FLOAT, end = [], fill_values = [0.0],  out_of_bounds_policy = types.PAD, device=None, output_layout=types.NHWC, output_dtype=types.UINT8):
+    """
+    The slice can be specified by proving the start and end coordinates, or start coordinates and shape of the slice. Both coordinates and shapes can be provided in absolute or relative terms.
+
+    The slice arguments can be specified by the following named arguments:
+
+    start: Slice start coordinates (absolute)
+
+    rel_start: Slice start coordinates (relative)
+
+    end: Slice end coordinates (absolute)
+
+    rel_end: Slice end coordinates (relative)
+
+    shape: Slice shape (absolute)
+
+    rel_shape: Slice shape (relative)
+
+    """
+
+    kwargs_pybind = {"input": inputs[0], "is_output": False, "anchor": anchor, "shape": shape, "fill_values": fill_values,
+                     "out_of_bounds_policy": out_of_bounds_policy, "output_layout": output_layout, "output_dtype": output_dtype}
+    slice_output = b.slice(Pipeline._current_pipeline._handle ,*(kwargs_pybind.values()))
+    return slice_output
+
+
 def color_twist(*inputs, brightness=1.0, contrast=1.0, hue=0.0,
                 saturation=1.0, device=None, output_layout=types.NHWC, output_dtype=types.UINT8):
     """!Adjusts the brightness, hue and saturation of the images.
@@ -1068,3 +1101,63 @@ def external_source(*inputs, source, device=None, color_format=types.RGB, random
     external_source_operator = b.externalFileSource(
         Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
     return (external_source_operator, [])  # Labels is Empty
+
+def set_layout(*inputs, output_layout=types.NHWC):
+    """!Adjusts brightness of the image.
+
+        @param inputs                                                                 the input image passed to the augmentation
+        @param output_layout (int, optional, default = types.NHWC)                    tensor layout for the augmentation output
+
+        @return    Tensor with required output layout
+    """
+    # pybind call arguments
+    kwargs_pybind = {"input_image": inputs[0], "output_layout": output_layout}
+    new_output = b.setLayout(
+        Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
+    return (new_output)
+
+def gaussian_noise(*inputs, mean=0.0, std_dev=1.0, seed=0, conditional_execution=1, device=None, output_layout=types.NHWC, output_dtype=types.UINT8):
+    """!Applies Gaussian noise to the input image.
+
+        @param inputs (list)                                                          The input image to which salt-and-pepper noise is applied.
+        @param mean (float, optional, default = 0.0)                                  Mean used for noise generation. Default is 0.0.
+        @param std_dev (float, optional, default = 1.0)                               Standard deviation used for noise generation. Default is 1.0.
+        @param seed (int, optional, default = 0)                                      Random seed. Default is 0.
+        @param conditional_execution (int, optional, default = None)                  controls the execution of the augmentation
+        @param device (string, optional, default = None)                              Parameter unused for augmentation
+        @param output_layout (int, optional, default = types.NHWC)                    Tensor layout for the augmentation output. Default is types.NHWC.
+        @param output_dtype (int, optional, default = types.UINT*)                    Tensor dtype for the augmentation output. Default is types.UINT8.
+
+        @return    images with Gaussian noise added.
+    """
+    mean = b.createFloatParameter(
+        mean) if isinstance(mean, float) else mean
+    std_dev = b.createFloatParameter(
+        std_dev) if isinstance(std_dev, float) else std_dev
+    conditional_execution = b.createIntParameter(conditional_execution) if isinstance(
+                conditional_execution, int) else conditional_execution
+
+    # pybind call arguments
+    kwargs_pybind = {"input_image": inputs[0], "is_output": False, "mean": mean, "std_dev": std_dev,
+                     "seed": seed, "conditional_execution": conditional_execution, "output_layout": output_layout, "output_dtype": output_dtype}
+    noise_added_image = b.gaussianNoise(
+        Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
+    return (noise_added_image)
+
+def roi_random_crop(*inputs, roi_start, roi_end, crop_shape):
+    # pybind call arguments
+    kwargs_pybind = {"input_image": inputs[0], "roi_start": roi_start, "roi_end": roi_end, "crop_shape": crop_shape}
+    anchor = b.roiRandomCrop(Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
+    return (anchor)
+
+def random_object_bbox(*inputs, format='anchor_shape', background=0, cache_objects=False, classes=[], foreground_prob=1.0, ignore_class=False, k_largest=-1, seed=0, threshold=[]):
+    # pybind call arguments
+    kwargs_pybind = {"input_image": inputs[0], "format": format, "k_largest": k_largest, "foreground_prob": foreground_prob}
+    selected_roi = b.randomObjectBbox(Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
+    if format == "box":
+        return (selected_roi)
+    elif format == "anchor_shape" or format == "start_end":
+        return (selected_roi[0], selected_roi[1])
+    else:
+        print('Wrong format passed to random_object_bbox')
+        return ()
diff --git a/rocAL_pybind/amd/rocal/pipeline.py b/rocAL_pybind/amd/rocal/pipeline.py
index 506c7b969..a95c283a4 100644
--- a/rocAL_pybind/amd/rocal/pipeline.py
+++ b/rocAL_pybind/amd/rocal/pipeline.py
@@ -84,7 +84,7 @@ def __init__(self, batch_size=-1, num_threads=0, device_id=-1, seed=1,
         self._check_ops_decoder = [
             "ImageDecoder", "ImageDecoderSlice", "ImageDecoderRandomCrop", "ImageDecoderRaw"]
         self._check_ops_reader = ["labelReader", "TFRecordReaderClassification", "TFRecordReaderDetection",
-                                  "COCOReader", "Caffe2Reader", "Caffe2ReaderDetection", "CaffeReader", "CaffeReaderDetection"]
+                                  "COCOReader", "Caffe2Reader", "Caffe2ReaderDetection", "CaffeReader", "CaffeReaderDetection", "NumpyReader"]
         self._batch_size = batch_size
         self._num_threads = num_threads
         self._device_id = device_id
diff --git a/rocAL_pybind/amd/rocal/plugin/pytorch.py b/rocAL_pybind/amd/rocal/plugin/pytorch.py
index bfc888ad0..ca2ad6724 100644
--- a/rocAL_pybind/amd/rocal/plugin/pytorch.py
+++ b/rocAL_pybind/amd/rocal/plugin/pytorch.py
@@ -29,6 +29,84 @@
 import ctypes
 
 
+class ROCALNumpyIterator(object):
+    def __init__(self, pipeline, tensor_dtype=types.FLOAT, device="cpu", device_id=0, return_roi=False):
+        self.loader = pipeline
+        self.tensor_dtype = tensor_dtype
+        self.device = device
+        self.device_id = device_id
+        self.output_memory_type = self.loader._output_memory_type
+        self.output_list = None
+        self.batch_size = self.loader._batch_size
+        self.return_roi = return_roi
+        print("self.device", self.device)
+        self.len = b.getRemainingImages(self.loader._handle)
+
+    def next(self):
+        return self.__next__()
+
+    def __next__(self):
+        if self.loader.rocal_run() != 0:
+            raise StopIteration
+        self.output_tensor_list = self.loader.get_output_tensors()
+
+        if self.output_list is None:
+            # Output list used to store pipeline outputs - can support multiple augmentation outputs
+            self.output_list = []
+            for i in range(len(self.output_tensor_list)):
+                dimensions = self.output_tensor_list[i].dimensions()
+                if self.return_roi:
+                    self.num_dims = len(dimensions) - 1
+                    self.roi_array = np.zeros(self.batch_size * self.num_dims * 2, dtype=np.uint32)
+                    self.output_tensor_list[i].copy_roi(self.roi_array)
+                    self.max_roi_size = np.zeros(self.num_dims, dtype=np.uint32)
+                    for j in range(self.batch_size):
+                        index = j * self.num_dims * 2
+                        roi_size = self.roi_array[index + self.num_dims : index + self.num_dims * 2] - self.roi_array[index : index + self.num_dims]
+                        self.max_roi_size = np.maximum(roi_size, self.max_roi_size)
+                if self.device == "cpu":
+                    torch_dtype = self.output_tensor_list[i].dtype()
+                    output = torch.empty(
+                        dimensions, dtype=getattr(torch, torch_dtype))
+                else:
+                    torch_gpu_device = torch.device('cuda', self.device_id)
+                    torch_dtype = self.output_tensor_list[i].dtype()
+                    output = torch.empty(dimensions, dtype=getattr(
+                        torch, torch_dtype), device=torch_gpu_device)
+
+                self.output_tensor_list[i].copy_data(ctypes.c_void_p(
+                    output.data_ptr()), self.output_memory_type)
+                self.output_list.append(output)
+        else:
+            for i in range(len(self.output_tensor_list)):
+                if self.return_roi:
+                    self.output_tensor_list[i].copy_roi(self.roi_array)
+                    self.max_roi_size = np.zeros(self.num_dims, dtype=np.uint32)
+                    for j in range(self.batch_size):
+                        index = j * self.num_dims * 2
+                        roi_size = self.roi_array[index + self.num_dims : index + self.num_dims * 2] - self.roi_array[index : index + self.num_dims]
+                        self.max_roi_size = np.maximum(roi_size, self.max_roi_size)
+                self.output_tensor_list[i].copy_data(ctypes.c_void_p(
+                    self.output_list[i].data_ptr()), self.output_memory_type)
+        if self.return_roi:
+            roi_output_list = []
+            for i in range(len(self.output_list)):
+                roi_output_list.append(self.output_list[i][:, :self.max_roi_size[0], :self.max_roi_size[1], :self.max_roi_size[2], :self.max_roi_size[3]])
+            return roi_output_list
+        return self.output_list
+
+    def reset(self):
+        b.rocalResetLoaders(self.loader._handle)
+
+    def __iter__(self):
+        return self
+
+    def __len__(self):
+        return self.len
+
+    def __del__(self):
+        b.rocalRelease(self.loader._handle)
+
 class ROCALGenericIterator(object):
     """!Iterator for processing data
 
diff --git a/rocAL_pybind/amd/rocal/readers.py b/rocAL_pybind/amd/rocal/readers.py
index 70e5a25f3..b115a3d92 100644
--- a/rocAL_pybind/amd/rocal/readers.py
+++ b/rocAL_pybind/amd/rocal/readers.py
@@ -350,3 +350,15 @@ def mxnet(path, stick_to_shard=False, pad_last_batch=False):
     mxnet_metadata = b.mxnetReader(
         Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
     return mxnet_metadata
+
+
+def numpy(*inputs, file_root='', files=[], num_shards=1,
+          random_shuffle=False, shard_id=0, stick_to_shard=False, pad_last_batch=False, seed=0):
+
+    Pipeline._current_pipeline._reader = "NumpyReader"
+    # Output
+    kwargs_pybind = {"source_path": file_root, "files": files, "is_output": False, "shuffle": random_shuffle,
+                     "loop": False, "decode_size_policy": types.MAX_SIZE, "shard_id": shard_id, "shard_count": num_shards, "seed": seed}
+    numpy_reader_output = b.numpyReaderSourceShard(
+        Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
+    return (numpy_reader_output)
diff --git a/rocAL_pybind/amd/rocal/types.py b/rocAL_pybind/amd/rocal/types.py
index 6cbfa3496..cb0778c2c 100644
--- a/rocAL_pybind/amd/rocal/types.py
+++ b/rocAL_pybind/amd/rocal/types.py
@@ -62,6 +62,8 @@
 from rocal_pybind.types import NCHW
 from rocal_pybind.types import NFHWC
 from rocal_pybind.types import NFCHW
+from rocal_pybind.types import NDHWC
+from rocal_pybind.types import NCDHW
 
 #     RocalDecodeDevice
 from rocal_pybind.types import HARDWARE_DECODE
@@ -94,6 +96,9 @@
 from rocal_pybind.types import EXTSOURCE_RAW_COMPRESSED
 from rocal_pybind.types import EXTSOURCE_RAW_UNCOMPRESSED
 
+from rocal_pybind.types import TRIMTOSHAPE
+from rocal_pybind.types import PAD
+
 _known_types = {
 
     OK: ("OK", OK),
@@ -122,6 +127,8 @@
     NCHW: ("NCHW", NCHW),
     NFHWC: ("NFHWC", NFHWC),
     NFCHW: ("NFCHW", NFCHW),
+    NDHWC: ("NDHWC", NDHWC),
+    NCDHW: ("NCDHW", NCDHW),
     BGR: ("BGR", BGR),
     RGB: ("RGB", RGB),
     GRAY: ("GRAY", GRAY),
@@ -152,6 +159,9 @@
     EXTSOURCE_FNAME: ("EXTSOURCE_FNAME", EXTSOURCE_FNAME),
     EXTSOURCE_RAW_COMPRESSED: ("EXTSOURCE_RAW_COMPRESSED", EXTSOURCE_RAW_COMPRESSED),
     EXTSOURCE_RAW_UNCOMPRESSED: ("EXTSOURCE_RAW_UNCOMPRESSED", EXTSOURCE_RAW_UNCOMPRESSED),
+    TRIMTOSHAPE: ("TRIMTOSHAPE", TRIMTOSHAPE),
+    PAD: ("PAD", PAD),
+
 }
 
 def data_type_function(dtype):
diff --git a/rocAL_pybind/examples/rocAL_api_numpy_reader.py b/rocAL_pybind/examples/rocAL_api_numpy_reader.py
new file mode 100644
index 000000000..e2961eddc
--- /dev/null
+++ b/rocAL_pybind/examples/rocAL_api_numpy_reader.py
@@ -0,0 +1,124 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import random
+
+from amd.rocal.pipeline import Pipeline
+from amd.rocal.plugin.pytorch import ROCALNumpyIterator
+import amd.rocal.fn as fn
+import amd.rocal.types as types
+import sys
+import os, glob
+
+val_cases_list = ['00000', '00003', '00005', '00006', '00012', '00024', '00034', '00041', '00044', '00049', '00052', '00056', '00061', '00065', '00066', '00070', '00076', '00078', '00080', '00084',
+                  '00086', '00087', '00092', '00111', '00112', '00125', '00128', '00138', '00157', '00160', '00161', '00162', '00169', '00171', '00176', '00185', '00187', '00189', '00198', '00203', '00206', '00207']
+
+def load_data(path, files_pattern):
+    data = sorted(glob.glob(os.path.join(path, files_pattern)))
+    assert len(data) > 0, f"Found no data at {path}"
+    return data
+
+def get_data_split(path: str):
+    imgs = load_data(path, "*_x.npy")
+    lbls = load_data(path, "*_y.npy")
+    assert len(imgs) == len(lbls), f"Found {len(imgs)} volumes but {len(lbls)} corresponding masks"
+    imgs_train, lbls_train, imgs_val, lbls_val = [], [], [], []
+    for (case_img, case_lbl) in zip(imgs, lbls):
+        if case_img.split("_")[-2] in val_cases_list:
+            imgs_val.append(case_img)
+            lbls_val.append(case_lbl)
+        else:
+            imgs_train.append(case_img)
+            lbls_train.append(case_lbl)
+
+    return imgs_train, imgs_val, lbls_train, lbls_val
+
+def main():
+    if  len(sys.argv) < 3:
+        print ('Please pass numpy_folder cpu/gpu batch_size')
+        exit(0)
+    try:
+        path= "OUTPUT_IMAGES_PYTHON/NEW_API/NUMPY_READER/"
+        isExist = os.path.exists(path)
+        if not isExist:
+            os.makedirs(path)
+    except OSError as error:
+        print(error)
+    data_path = sys.argv[1]
+    if(sys.argv[2] == "cpu"):
+        rocal_cpu = True
+    else:
+        rocal_cpu = False
+    batch_size = int(sys.argv[3])
+    num_threads = 8
+    device_id = 0
+    local_rank = 0
+    world_size = 1
+    random_seed = random.SystemRandom().randint(0, 2**32 - 1)
+    x_train, x_val, y_train, y_val = get_data_split(data_path)
+
+    import time
+    start = time.time()
+    pipeline = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id, seed=random_seed, rocal_cpu=rocal_cpu, prefetch_queue_depth=2)
+
+    with pipeline:
+        numpy_reader_output = fn.readers.numpy(file_root=data_path, files=x_train, shard_id=local_rank, num_shards=world_size, random_shuffle=True, seed=random_seed+local_rank)
+        numpy_reader_output1 = fn.readers.numpy(file_root=data_path, files=y_train, shard_id=local_rank, num_shards=world_size, random_shuffle=True, seed=random_seed+local_rank)
+        data_output = fn.set_layout(numpy_reader_output, output_layout=types.NCDHW)
+        label_output = fn.set_layout(numpy_reader_output1, output_layout=types.NCDHW)
+        [roi_start, roi_end] = fn.random_object_bbox(label_output, format="start_end", k_largest=2, foreground_prob=0.4)
+        anchor = fn.roi_random_crop(label_output, roi_start=roi_start, roi_end=roi_end, crop_shape=(1, 128, 128, 128))
+        data_sliced_output = fn.slice(data_output, anchor=anchor, shape=(1,128,128,128), output_layout=types.NCDHW, output_dtype=types.FLOAT)
+        label_sliced_output = fn.slice(label_output, anchor=anchor, shape=(1,128,128,128), output_layout=types.NCDHW, output_dtype=types.UINT8)       
+        hflip = fn.random.coin_flip(probability=0.33)
+        vflip = fn.random.coin_flip(probability=0.33)
+        dflip = fn.random.coin_flip(probability=0.33)
+        data_flip_output = fn.flip(data_sliced_output, horizontal=hflip, vertical=vflip, depth=dflip, output_layout=types.NCDHW, output_dtype=types.FLOAT)
+        label_flip_output = fn.flip(label_sliced_output, horizontal=hflip, vertical=vflip, depth=dflip, output_layout=types.NCDHW, output_dtype=types.UINT8)
+        brightness = fn.random.uniform(range=[0.7, 1.3])
+        add_brightness = fn.random.coin_flip(probability=0.1)
+        brightness_output = fn.brightness(data_flip_output, brightness=brightness, brightness_shift=0.0, conditional_execution=add_brightness, output_layout=types.NCDHW, output_dtype=types.FLOAT)
+        add_noise = fn.random.coin_flip(probability=0.5)
+        std_dev = fn.random.uniform(range=[0.0, 0.1])
+        noise_output = fn.gaussian_noise(brightness_output, mean=0.0, std_dev=std_dev, conditional_execution=add_noise, output_layout=types.NCDHW, output_dtype=types.FLOAT)
+        pipeline.set_outputs(noise_output, label_flip_output)
+
+    pipeline.build()
+
+    pipeline1 = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id, seed=random_seed, rocal_cpu=rocal_cpu, prefetch_queue_depth=6)
+
+    with pipeline1:
+        numpy_reader_output = fn.readers.numpy(file_root=data_path, files=x_val, shard_id=local_rank, num_shards=world_size)
+        numpy_reader_output1 = fn.readers.numpy(file_root=data_path, files=y_val, shard_id=local_rank, num_shards=world_size)
+        data_output = fn.set_layout(numpy_reader_output, output_layout=types.NCDHW)
+        label_output = fn.set_layout(numpy_reader_output1, output_layout=types.NCDHW)
+        pipeline1.set_outputs(data_output, label_output)
+
+    pipeline1.build()
+    
+    numpyIteratorPipeline = ROCALNumpyIterator(pipeline, device='cpu' if rocal_cpu else 'gpu')
+    print(len(numpyIteratorPipeline))
+    valNumpyIteratorPipeline = ROCALNumpyIterator(pipeline1, device='cpu' if rocal_cpu else 'gpu', return_roi=True)
+    print(len(valNumpyIteratorPipeline))
+    cnt = 0
+    for epoch in range(100):
+        print("+++++++++++++++++++++++++++++EPOCH+++++++++++++++++++++++++++++++++++++",epoch)
+        for i , it in enumerate(numpyIteratorPipeline):
+            print(i, it[0].shape, it[1].shape)
+            for j in range(batch_size):
+                print(it[0][j].cpu().numpy().shape, it[1][j].cpu().numpy().shape)
+                cnt += 1
+            print("************************************** i *************************************",i)
+        numpyIteratorPipeline.reset()
+        for i , it in enumerate(valNumpyIteratorPipeline):
+            print(i, it[0].shape, it[1].shape)
+            for j in range(batch_size):
+                print(it[0][j].cpu().numpy().shape, it[1][j].cpu().numpy().shape)
+                cnt += 1
+            print("************************************** i *************************************",i)
+        valNumpyIteratorPipeline.reset()
+    print("*********************************************************************")
+    print(f'Took {time.time() - start} seconds')
+
+if __name__ == '__main__':
+    main()
diff --git a/rocAL_pybind/rocal_pybind.cpp b/rocAL_pybind/rocal_pybind.cpp
index 06e226dce..1562575dd 100644
--- a/rocAL_pybind/rocal_pybind.cpp
+++ b/rocAL_pybind/rocal_pybind.cpp
@@ -134,6 +134,8 @@ std::unordered_map<int, std::string> rocalToPybindLayout = {
     {1, "NCHW"},
     {2, "NFHWC"},
     {3, "NFCHW"},
+    {4, "NDHWC"},
+    {5, "NCDHW"},
 };
 
 std::unordered_map<int, std::string> rocalToPybindOutputDtype = {
@@ -379,6 +381,8 @@ PYBIND11_MODULE(rocal_pybind, m) {
         .value("NCHW", ROCAL_NCHW)
         .value("NFHWC", ROCAL_NFHWC)
         .value("NFCHW", ROCAL_NFCHW)
+        .value("NDHWC", ROCAL_NDHWC)
+        .value("NCDHW", ROCAL_NCDHW)
         .export_values();
     py::enum_<RocalDecodeDevice>(types_m, "RocalDecodeDevice", "Decode device type")
         .value("HARDWARE_DECODE", ROCAL_HW_DECODE)
@@ -402,6 +406,10 @@ PYBIND11_MODULE(rocal_pybind, m) {
         .def_readwrite("y", &ROIxywh::y)
         .def_readwrite("w", &ROIxywh::w)
         .def_readwrite("h", &ROIxywh::h);
+    py::enum_<RocalOutOfBoundsPolicy>(types_m, "RocalOutOfBoundsPolicy", "Rocal Out of Bounds Policy Type")
+        .value("TRIMTOSHAPE", TRIMTOSHAPE)
+        .value("PAD", PAD)
+        .export_values();
     // rocal_api_info.h
     m.def("getRemainingImages", &rocalGetRemainingImages);
     m.def("getImageName", &wrapper_image_name);
@@ -435,6 +443,8 @@ PYBIND11_MODULE(rocal_pybind, m) {
         int *ptr = static_cast<int *>(buf.ptr);
         rocalGetImageSizes(context, ptr);
     });
+    m.def("roiRandomCrop", &rocalROIRandomCrop, py::return_value_policy::reference);
+    m.def("randomObjectBbox", &rocalRandomObjectBbox, py::return_value_policy::reference);
     m.def("getROIImgSizes", [](RocalContext context, py::array_t<int> array) {
         auto buf = array.request();
         int *ptr = static_cast<int *>(buf.ptr);
@@ -630,9 +640,15 @@ PYBIND11_MODULE(rocal_pybind, m) {
           py::return_value_policy::reference);
     m.def("externalSourceFeedInput", &wrapperRocalExternalSourceFeedInput,
           py::return_value_policy::reference);
+    m.def("numpyReaderSource", &rocalNumpyFileSource, "Reads file from the source given and decodes it according to the policy",
+          py::return_value_policy::reference);
+    m.def("numpyReaderSourceShard", &rocalNumpyFileSourceSingleShard, "Reads file from the source given and decodes it according to the shard id and number of shards",
+          py::return_value_policy::reference);
     m.def("rocalResetLoaders", &rocalResetLoaders);
     m.def("videoMetaDataReader", &rocalCreateVideoLabelReader, py::return_value_policy::reference);
     // rocal_api_augmentation.h
+    m.def("setLayout", &rocalSetLayout,
+          py::return_value_policy::reference);
     m.def("ssdRandomCrop", &rocalSSDRandomCrop,
           py::return_value_policy::reference);
     m.def("resize", &rocalResize,
@@ -705,5 +721,9 @@ PYBIND11_MODULE(rocal_pybind, m) {
           py::return_value_policy::reference);
     m.def("lensCorrection", &rocalLensCorrection,
           py::return_value_policy::reference);
+    m.def("gaussianNoise", &rocalGaussianNoise,
+          py::return_value_policy::reference);
+    m.def("slice", &rocalSlice,
+          py::return_value_policy::reference);
 }
 }  // namespace rocal
diff --git a/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp b/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp
index e48fe1d78..51265859f 100644
--- a/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp
+++ b/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp
@@ -319,6 +319,12 @@ int test(int test_case, int reader_type, const char *path, const char *outName,
             rocalCreateMXNetReader(handle, path, true);
             decoded_output = rocalMXNetRecordSource(handle, path, color_format, num_threads, false, false, false, ROCAL_USE_USER_GIVEN_SIZE_RESTRICTED, decode_max_width, decode_max_height);
         } break;
+        case 12:  // Numpy reader
+        {
+            std::cout << ">>>>>>> Running Numpy reader" << std::endl;
+            pipeline_type = 4;
+            decoded_output = rocalNumpyFileSource(handle, path, num_threads, {}, false, false, false, ROCAL_USE_MAX_SIZE);
+        } break;
         default: {
             std::cout << ">>>>>>> Running IMAGE READER" << std::endl;
             pipeline_type = 1;
@@ -766,6 +772,53 @@ int test(int test_case, int reader_type, const char *path, const char *outName,
                     }
                 }
             } break;
+            case 4: {  // numpy reader pipeline
+                RocalTensorList output_tensor_list;
+                output_tensor_list = rocalGetOutputTensors(handle);
+                for (int idx = 0; idx < output_tensor_list->size(); idx++) {
+                    unsigned char *out_buffer;
+                    if (output_tensor_list->at(idx)->data_type() == RocalTensorOutputType::ROCAL_FP32) {
+                        float *out_f_buffer;
+                        std::cout << "Creating float buffer of ";
+                        for (auto x : output_tensor_list->at(idx)->shape())
+                            std::cout << x << " x ";
+                        std::cout << "shape\n";
+                        if (output_tensor_list->at(idx)->backend() == RocalTensorBackend::ROCAL_GPU) {
+                            out_f_buffer = (float *)malloc(output_tensor_list->at(idx)->data_size());
+                            output_tensor_list->at(idx)->copy_data(out_f_buffer);
+                        } else if (output_tensor_list->at(idx)->backend() == RocalTensorBackend::ROCAL_CPU)
+                            out_f_buffer = (float *)output_tensor_list->at(idx)->buffer();
+
+                        out_buffer = (unsigned char *)malloc(output_tensor_list->at(idx)->data_size() / 4);
+                        // convert_float_to_uchar_buffer(out_f_buffer, out_buffer, output_tensor_list->at(idx)->data_size() / 4);
+                    } else if (output_tensor_list->at(idx)->data_type() == RocalTensorOutputType::ROCAL_FP16) {
+                        half *out_f16_buffer;
+                        std::cout << "Creating float16 buffer of ";
+                        for (auto x : output_tensor_list->at(idx)->shape())
+                            std::cout << x << " x ";
+                        std::cout << "shape\n";
+                        if (output_tensor_list->at(idx)->backend() == RocalTensorBackend::ROCAL_GPU) {
+                            out_f16_buffer = (half *)malloc(output_tensor_list->at(idx)->data_size());
+                            output_tensor_list->at(idx)->copy_data(out_f16_buffer);
+                        } else if (output_tensor_list->at(idx)->backend() == RocalTensorBackend::ROCAL_CPU)
+                            out_f16_buffer = (half *)output_tensor_list->at(idx)->buffer();
+
+                        out_buffer = (unsigned char *)malloc(output_tensor_list->at(idx)->data_size() / 2);
+                        // convert_float_to_uchar_buffer(out_f16_buffer, out_buffer, output_tensor_list->at(idx)->data_size() / 2);
+                    } else {
+                        std::cout << "Creating uchar buffer of ";
+                        for (auto x : output_tensor_list->at(idx)->shape())
+                            std::cout << x << " x ";
+                        std::cout << "shape\n";
+                        if (output_tensor_list->at(idx)->backend() == RocalTensorBackend::ROCAL_GPU) {
+                            out_buffer = (unsigned char *)malloc(output_tensor_list->at(idx)->data_size());
+                            output_tensor_list->at(idx)->copy_data(out_buffer);
+                        } else if (output_tensor_list->at(idx)->backend() == RocalTensorBackend::ROCAL_CPU)
+                            out_buffer = (unsigned char *)(output_tensor_list->at(idx)->buffer());
+                    }
+                }
+                std::cout << "Copied numpy data to buffers\n";
+            } break;
             default: {
                 std::cout << "Not a valid pipeline type ! Exiting!\n";
                 return -1;